
Python - dedupe problem: TypeError: unhashable type: 'numpy.ndarray'


I am having a problem running dedupe. I am trying to use this library to remove duplicates from a large set of addresses. Here is my code:

import collections
import csv
import logging
import optparse
import os
import re

from numpy import nan

import dedupe
from unidecode import unidecode

optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
log_level = logging.WARNING
if opts.verbose == 1:
    log_level = logging.INFO
elif opts.verbose >= 2:
    log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)

input_file = 'H:/My Documents/Python Scripts/Dedupe/DupeTester.csv'
output_file = 'csv_example_output.csv'
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'

def preProcess(column):
    import unidecode
    column = column.decode("utf8")
    column = unidecode.unidecode(column)
    column = re.sub(' +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column

def readData(filename):
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row[''])
            data_d[row_id] = dict(clean_row)

    return data_d


print 'importing data ...'
data_d = readData(input_file)

if os.path.exists(settings_file):
    print 'reading from', settings_file
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)

else:
    fields = [
        {"field" : "fulladdr", "type" : "Address"},
        {"field" : "zip", "type" : "ShortString"},
        ]

    deduper = dedupe.Dedupe(fields)

    deduper.sample(data_d, 200)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    print 'starting active labeling...'

    dedupe.consoleLabel(deduper)

    deduper.train()

    with open(training_file, 'w') as tf :
        deduper.writeTraining(tf)

    with open(settings_file, 'w') as sf :
        deduper.writeSettings(sf)

print 'blocking...'



threshold = deduper.threshold(data_d, recall_weight=2)



print 'clustering...'
clustered_dupes = deduper.match(data_d, threshold)

print '# duplicate sets', len(clustered_dupes)




cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores) :
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output:
    writer = csv.writer(f_output)

    with open(input_file) as f_input :
        reader = csv.reader(f_input)

        heading_row = reader.next()
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership :
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key].encode('utf8'))
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)

Specifically, this is what I get when I run it:

C:\Anaconda\lib\site-packages\dedupe\core.py:18: UserWarning: There may be duplicates in the sample
  warnings.warn("There may be duplicates in the sample")
Traceback (most recent call last):

  File "<ipython-input-1-33e46d604c5f>", line 1, in <module>
    runfile('H:/My Documents/Python Scripts/Dedupe/dupetestscript.py', wdir='H:/My Documents/Python Scripts/Dedupe')

  File "C:\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
    execfile(filename, namespace)

  File "H:/My Documents/Python Scripts/Dedupe/dupetestscript.py", line 67, in <module>
    deduper.sample(data_d, 200)

  File "C:\Anaconda\lib\site-packages\dedupe\api.py", line 924, in sample
    random_sample_size))

TypeError: unhashable type: 'numpy.ndarray'

Best Answer

A numpy array can be modified (it is "mutable"). Python speeds up dictionary access by using the hash of the key rather than the key itself.

Therefore only hashable objects, such as numbers, strings, or tuples, can be used as dictionary keys. From the definition of hashable in the Python glossary:

An object is hashable if it has a hash value which never changes during its lifetime (it needs a __hash__() method), and can be compared to other objects (it needs an __eq__() method). Hashable objects which compare equal must have the same hash value.

Hashability makes an object usable as a dictionary key and a set member, because these data structures use the hash value internally.

All of Python’s immutable built-in objects are hashable, while no mutable containers (such as lists or dictionaries) are. Objects which are instances of user-defined classes are hashable by default; they all compare unequal (except with themselves), and their hash value is derived from their id().
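As a minimal standalone sketch (not taken from the question or the answer), this is the same TypeError in isolation: a numpy array cannot be used as a dictionary key or set member, while an immutable equivalent such as a tuple can be.

import numpy as np

d = {}
arr = np.array([1, 2, 3])

try:
    d[arr] = 'value'        # raises TypeError: unhashable type: 'numpy.ndarray'
except TypeError as e:
    print(e)

d[tuple(arr)] = 'value'     # a tuple of the same values is immutable and hashable
print(d)                    # {(1, 2, 3): 'value'}

In the question's traceback the same message appears inside deduper.sample, which indicates that a numpy array is being used somewhere a hashable value (a dictionary key or set member) is required.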

Regarding Python - dedupe problem: TypeError: unhashable type: 'numpy.ndarray', a similar question was found on Stack Overflow: https://stackoverflow.com/questions/27992436/
