gpt4 book ai didi

python - 搜索 csv 文件最快的方法是什么?

转载 作者:行者123 更新时间:2023-12-01 00:22:17 24 4
gpt4 key购买 nike

任务:检查文件中序列号和护照号码的可用性。

我的决定如下:

def check_passport(filename, series: str, number: str) -> dict:
    """
    Find a passport by series and number in a CSV database.

    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return: dict with 'result' (bool, True when found) and 'message' keys
    """
    print(f'series={series}, number={number}')
    found = False
    start = datetime.datetime.now()
    with open(filename, 'r', encoding='utf_8_sig') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        try:
            for row in reader:
                # BUG FIX: the original compared against the undefined name
                # `num` instead of the `number` parameter (NameError at runtime).
                if row[0] == series and row[1] == number:
                    print(row[0])
                    print(row[1])
                    found = True
                    break
        except Exception as e:
            # Malformed CSV rows abort the scan; report and fall through.
            print(e)
    print(datetime.datetime.now() - start)
    if found:
        # BUG FIX: the original returned {'result': False} even on a match.
        return {'result': True, 'message': 'Passport found'}
    return {'result': False, 'message': 'Passport not found in Database'}

这是 csv 文件的一部分

PASSP_SERIES,PASSP_NUMBER
3604,015558
6003,711925
6004,461914
6001,789369

如果您的文件中没有护照,时间会更糟,因为您需要检查所有行。我的最好成绩是 53 秒。

最佳答案

检查了三种解决方案

  1. 原始帖子的 CSV 逐行方法
  2. 将文件作为原始文本逐行处理，而不是用 CSV 阅读器解析 CSV 字段
  3. 使用 Pandas 读取和处理数据 block

结果：使用 1000 万到 3000 万行的数据执行测试。（原文此处附有基准测试结果图表，图片未能随文本保留。）

摘要：使用 Pandas 是最慢的方法。考虑到相关文章中的观察结论（即由于其开销，Pandas 是读取 CSV 文件较慢的方法之一），这并不奇怪。最快的方法是将文件作为原始文本处理并在原始文本中查找号码（比最初发布的使用 CSV 阅读器的方法快约 2 倍）。Pandas 比原始方法慢约 30%。

测试代码

import timeit
import time
import random
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import math
import itertools

def wrapper(func, *args, **kwargs):
    """Bind *args/**kwargs to func and return a zero-argument callable for timeit."""
    # Reference https://www.pythoncentral.io/time-a-python-function/
    def _call():
        return func(*args, **kwargs)

    return _call

def method_line_by_line(filename, series: str, number: str) -> dict:
    """
    Find passport number and series by reading the CSV row by row.

    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return: dict with 'result' (bool, True when found) and 'message' keys
    """
    found = False
    with open(filename, 'r', encoding='utf_8_sig') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        try:
            for row in reader:
                # BUG FIX: the original compared against the undefined name
                # `num` instead of the `number` parameter (NameError at runtime).
                if row[0] == series and row[1] == number:
                    found = True
                    break
        except Exception:
            # A malformed row aborts the scan; deliberately treated as "not found".
            pass

    if found:
        return {'result': True, 'message': 'Passport found'}
    return {'result': False, 'message': 'Passport not found in Database'}

def method_raw_text(filename, series: str, number: str) -> dict:
    """
    Find passport number and series by iterating through raw text records.

    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return: dict with 'result' (bool, True when found) and 'message' keys
    """
    target = series + "," + number
    with open(filename, 'r', encoding='utf_8_sig') as csvfile:
        # Scan the raw lines directly; stop at the first exact record match.
        for line in csvfile:
            if line.rstrip() == target:
                return {'result': True, 'message': 'Passport found'}
    return {'result': False, 'message': 'Passport not found in Database'}

def method_pandas_chunks(filename, series: str, number: str) -> dict:
    """
    Find passport number and series using Pandas, reading the CSV in chunks.

    :param filename: csv filename path
    :param series: passport series
    :param number: passport number
    :return: dict with 'result' (bool, True when found) and 'message' keys
    """
    chunksize = 10 ** 5
    # BUG FIX: initialise df_search so a file that yields no chunks
    # (empty / header-only) does not raise NameError after the loop.
    df_search = pd.DataFrame()
    for df in pd.read_csv(filename, chunksize=chunksize,
                          dtype={'PASSP_SERIES': str, 'PASSP_NUMBER': str}):

        df_search = df[(df['PASSP_SERIES'] == series) & (df['PASSP_NUMBER'] == number)]

        if not df_search.empty:
            break

    if not df_search.empty:
        return {'result': True, 'message': 'Passport found'}
    return {'result': False, 'message': 'Passport not found in Database'}

def generate_data(filename, number_records):
    """Write number_records rows of random passport test data to filename; return the frame."""
    values = np.random.randint(0, 1e6, size=(number_records, 2))
    df = pd.DataFrame(values, columns=['PASSP_SERIES', 'PASSP_NUMBER'])
    df.to_csv(filename, index=None, header=True)
    return df

def profile():
    """Benchmark the three search methods over growing CSV sizes and plot log(time)."""
    row_counts = list(range(10000000, 30000001, 5000000))  # number of test rows per run
    number_iterations = 3  # repeats per test
    methods = [method_line_by_line, method_raw_text, method_pandas_chunks]
    time_methods = [[] for _ in methods]

    for n_rows in row_counts:
        # Generate a CSV file with n_rows rows of random data.
        generate_data('test.csv', n_rows)

        for i, func in enumerate(methods):
            # Search for 'x'/'y' so the whole file is scanned without a match.
            timed = wrapper(func, 'test.csv', 'x', 'y')
            time_methods[i].append(math.log(timeit.timeit(timed, number=number_iterations)))

    markers = itertools.cycle(('o', '+', '.'))
    colors = itertools.cycle(('r', 'b', 'g'))
    labels = itertools.cycle(('line-by-line', 'raw-text', 'pandas'))
    print(time_methods)
    for timings in time_methods:
        plt.plot(row_counts, timings, marker=next(markers),
                 color=next(colors), linestyle='-', label=next(labels))

    plt.xlabel('list size', fontsize=18)
    plt.ylabel('log(time)', fontsize=18)
    plt.legend(loc='upper left')
    plt.show()

# Run Test
# NOTE(review): runs the full multi-million-row benchmark at module import time;
# consider guarding with `if __name__ == "__main__":` before reusing this file.
profile()

关于python - 搜索 csv 文件最快的方法是什么?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58876624/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com