gpt4 book ai didi

python - 在 Python 中快速计算频率

转载 作者:太空宇宙 更新时间:2023-11-04 03:38:41 28 4
gpt4 key购买 nike

我需要计算语料库中单词的出现频率。通常我使用 collections 包中的 Counter 类。

from collections import Counter

# Sample corpus: note that "three" occurs twice.
list_of_words = ["one", "two", "three", "three"]
# Counter tallies the occurrences of each distinct word in a single pass.
freqs = Counter(list_of_words)

但是,我正在分析的语料库由几百万个单词组成,所以我想知道有没有更快的方法来计算这些词频?

下面是读入单词的代码:

from read_cg3 import read_cg3

# Parse one tagged corpus file into a list of sentences
# (each sentence is a list of word forms).
test = read_cg3('/Users/arashsaidi/Work/Corpus/DUO_Corpus/Bokmaal-tagged-random/DUO_BM_0.txt')

# Flatten the sentences into a single list of words.
word_list = [word for sentence in test for word in sentence]
count = len(word_list)
print(count)

read_cg3 是一个读取已解析文件并返回句子列表的模块。这是模块:

import re


def is_number(s):
    """Return True if *s* can be parsed as a float, False otherwise.

    :param s: string to test (e.g. a token stripped of quotes)
    :return: bool
    """
    try:
        float(s)
    except ValueError:
        return False
    return True


def read_cg3(cg3_file):
    """
    Read a cg3-parsed file and return its sentences.

    Each sentence is a list of lower-cased word forms, one per token.
    Tokens whose word form contains '$' (punctuation markers), parses as a
    number (dots ignored), or is 30+ characters long are skipped.

    :param cg3_file: path to the cg3 file
    :return: list of sentences, each a list of word strings
    """
    # Raw strings so the regex escapes are unambiguous.
    rx_token = re.compile(r'^"<(.+?)>"$')
    rx_attributes = re.compile(r'^\s+".+?"\s+.+$')
    rx_eos = re.compile(r'^\s*$')  # blank line = end of sentence

    curr_token = None
    curr_word = []
    curr_sentence = []
    result = []

    # NOTE: use a distinct name for the handle so the cleanup messages
    # below can still report the path (the original shadowed cg3_file).
    with open(cg3_file) as infile:
        for line in infile:
            token_match = rx_token.match(line)
            if token_match:
                curr_token = '"%s"' % token_match.group(1)

            if rx_attributes.match(line):
                curr_word = line.split()
                if curr_token and curr_word:
                    word = curr_word[0]
                    if ('$' not in word
                            and not is_number(word.strip('"').replace('.', ''))
                            and len(word) < 30):
                        curr_sentence.append(word.lower().strip('"'))
                    curr_token = None
                    curr_word = []

            if rx_eos.match(line):
                if curr_sentence:
                    result.append(curr_sentence)
                    curr_sentence = []
                    curr_token = None
                    curr_word = []

    # Cleanup if the file does not end with a blank line.
    if curr_token and curr_word:
        print('cg3 reached end of file and did some cleanup on file {}'.format(cg3_file))
        # Append the word in the same format as the normal path above
        # (the original appended a nested [token, attrs...] list here).
        curr_sentence.append(curr_word[0].lower().strip('"'))

    if curr_sentence:
        print('cg3 reached end of file and did some cleanup on file {}'.format(cg3_file))
        # Bug fix: append the sentence as one list; the original used
        # `result += curr_sentence`, which spliced the words into result.
        result.append(curr_sentence)

    return result

这是 read_cg3 读取文件的方式:

"<TEKNOLOGI>"
"teknologi" subst appell mask ub ent
"<OG>"
"og" konj <*>
"<UNDERVISNING>"
"undervisning" subst appell fem ub ent <*>
"<|>"
"$|" clb <overskrift> <<<

"<En>"
"en" det mask ent kvant
"<intervjuundersøkelse>"
"intervjuundersøkelse" subst appell mask ub ent
"<av>"
"av" prep
"<musikklæreres>"
"musikklærer" subst appell mask ub fl gen
"<didaktiske>"
"didaktisk" adj fl pos
"<bruk>"
"bruk" subst appell mask ub ent
"<av>"
"av" prep
"<digitale>"
"digital" adj fl pos
"<verktøy>"
"verktøy" subst appell nøyt ub fl <*øy>
"<i>"
"i" prep
"<undervisningsfaget>"
"undervisningsfag" subst appell nøyt be ent
"<komposisjon>"
"komposisjon" subst appell mask ub ent
"<i>"
"i" prep
"<videregående>"
"videregående" adj ub m/f ent pos
"<skole>"
"skole" subst appell mask ub ent
"<|>"
"$|" clb <overskrift> <<<

"<Markus>"
"Markus" subst prop mask
"<A.>"
"A." subst prop fork <*>
"<Skjærstad>"
"Skjærstad" subst prop <*stad> <*>
"<|>"
"$|" clb <overskrift> <<<

我的方法只读入一个文件,这是为了测试,语料库由大约30000个文件组成。

最佳答案

看起来您不需要使用词法标记,而且可以省去正则表达式。下面的代码会统计每个单词在每个文件中出现的次数:

import multiprocessing as mp
import os
import itertools

def wordCounter(qIn, qOut):
    """Worker: consume (fname, words) jobs from qIn until a None sentinel,
    tally word frequencies per file, then put the whole result on qOut.

    qOut receives one mapping of the form {fname: {word: count}}.
    """
    answer = {}
    for fname, words in iter(qIn.get, None):
        # setdefault/get avoid the repeated membership checks of the
        # original while producing the identical nested-dict result.
        counts = answer.setdefault(fname, {})
        for word in words:
            counts[word] = counts.get(word, 0) + 1
    qOut.put(answer)


def getLines(corpusPath, qIn, numProcs):
    """Producer: scan every file in corpusPath, put (fname, words) jobs on
    qIn for each run of consecutive word lines, then put numProcs None
    sentinels so every consumer knows to stop.

    A word line is a non-blank line that is not a token line ('"<...>"')
    and whose first field contains no '$' (punctuation marker).
    """
    def is_word_line(line):
        # Short-circuit keeps split() from raising on blank lines.
        return (bool(line)
                and not line.startswith('"<')
                and '$' not in line.split(None, 1)[0])

    for fname in os.listdir(corpusPath):
        with open(os.path.join(corpusPath, fname)) as infile:
            stripped = (l.strip() for l in infile)
            for is_word, lines in itertools.groupby(stripped, is_word_line):
                if is_word:
                    words = [line.split(None, 1)[0].strip('"').strip().lower()
                             for line in lines]
                    qIn.put((fname, words))

    # One sentinel per consumer process.
    for _ in range(numProcs):
        qIn.put(None)


def main(corpusPath):
    """Count word frequencies for every file under corpusPath in parallel
    and print '<word> appeared in <fname> <n> times' for each pair.
    """
    qIn, qOut = [mp.Queue() for _ in range(2)]
    # max(1, ...) guards the single-core case, where cpu_count() - 1 == 0
    # would have left no workers and deadlocked the qOut.get() loop below.
    procs = [mp.Process(target=wordCounter, args=(qIn, qOut))
             for _ in range(max(1, mp.cpu_count() - 1))]

    lineGetter = mp.Process(target=getLines, args=(corpusPath, qIn, len(procs)))
    lineGetter.start()

    for p in procs:
        p.start()

    # Merge the per-worker partial counts into one nested dict.
    answer = {}
    for _ in range(len(procs)):
        for fname, wdict in qOut.get().items():
            file_counts = answer.setdefault(fname, {})
            for word, count in wdict.items():
                file_counts[word] = file_counts.get(word, 0) + count

    for fname in sorted(answer):
        for word in sorted(answer[fname]):
            print("{} appeared in {} {} times".format(word, fname, answer[fname][word]))

    # Every worker has already put its answer and getLines has sent all
    # sentinels, so the processes exit on their own; join() is the clean
    # shutdown (the original terminate() killed them mid-exit).
    for p in procs:
        p.join()
    lineGetter.join()

整个过程只用了不到一秒钟的时间来处理您的测试文件。请注意,其中一些是由于设置子进程的开销造成的,因此这应该可以更好地扩展到大型语料库。

希望对你有帮助

关于python - 在 Python 中快速计算频率,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/27649310/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com