
python - Keep the first and remove the following duplicate rows


I want to keep the first row of each set of duplicates. I am also trying to append to the current input file. My question is whether it is possible to drop duplicates and append to the same file at the same time. If so, the code below, adapted from the pandas documentation, does not work.

Input:

,id_merged,time_1,time_2,gps_1or3,gps_2or4
0,00022d9064bc,1073260801,1073260803,819251,440006 #duplicate_keep
1,00022d9064bc,1073260801,1073260803,819251,440006 #duplicate_remove
2,00022d9064bc,1073260801,1073260803,819251,440006 #duplicate_remove
3,00022d9064bc,1073260801,1073260803,819251,440006 #duplicate_remove
4,00022d9064bc,1073260803,1073260810,819213,439954
5,00904b4557d3,1073260803,1073261920,817526,439458
6,00022de73863,1073260804,1073265410,817558,439525
7,00904b14b494,1073260804,1073262625,817558,439525
8,00904b14b494,1073260804,1073265163,817558,439525
9,00904b14b494,1073260804,1073263786,817558,439525
10,00022d1406df,1073260807,1073260809,820428,438735
0,00022d9064bc,1073260801,1073260803,819251,440006
1,00022dba8f51,1073260801,1073260803,819251,440006
2,00022de1c6c1,1073260801,1073260803,819251,440006
3,003065f30f37,1073260801,1073260803,819251,440006
4,00904b48a3b6,1073260801,1073260803,819251,440006
5,00904b83a0ea,1073260803,1073260810,819213,439954
6,00904b85d3cf,1073260803,1073261920,817526,439458
7,00904b14b494,1073260804,1073265410,817558,439525
8,00904b99499c,1073260804,1073262625,817558,439525
9,00904bb96e83,1073260804,1073265163,817558,439525
10,00904bf91b75,1073260804,1073263786,817558,439525

Expected output: index=None, header=None

00022d9064bc,1073260801,1073260803,819251,440006
00022d9064bc,1073260803,1073260810,819213,439954
00904b4557d3,1073260803,1073261920,817526,439458
00022de73863,1073260804,1073265410,817558,439525
00904b14b494,1073260804,1073262625,817558,439525
00022d1406df,1073260807,1073260809,820428,438735
00022d9064bc,1073260801,1073260803,819251,440006
00022dba8f51,1073260801,1073260803,819251,440006
00022de1c6c1,1073260801,1073260803,819251,440006
003065f30f37,1073260801,1073260803,819251,440006
00904b48a3b6,1073260801,1073260803,819251,440006
00904b83a0ea,1073260803,1073260810,819213,439954
00904b85d3cf,1073260803,1073261920,817526,439458
00904b14b494,1073260804,1073265410,817558,439525
00904b99499c,1073260804,1073262625,817558,439525
00904bb96e83,1073260804,1073265163,817558,439525
00904bf91b75,1073260804,1073263786,817558,439525

Every element of a row should be compared; if the entire row is a duplicate, keep the first occurrence and remove the remaining duplicates.

Code:

from StringIO import StringIO
import pandas as pd

df = pd.read_csv(StringIO('input.csv'), index_col=[0], header=[' ','id_merged','time_1','time_2','gps_1or3','gps_2or4'])


df.drop_duplicates(keep='first')

df.to_csv('dart_small_final.csv',mode = 'a',header=False, index=False)

Edit 1:

import csv
import pandas as pd

df = pd.read_csv('dart_small_final.csv', index_col=[0], header=[' ','id_merged','time_1','time_2','gps_1or3','gps_2or4'])


df.drop_duplicates(keep=first, inplace=True)
df.reset_index(drop=True, inplace=True)

df.to_csv('dart_final.csv', header=None, index=None)

Error:

Traceback (most recent call last):
File "remove_dup.py", line 4, in <module>
df = pd.read_csv('dart_small_final.csv', index_col=[0], header=[' ','id_merged','time_1','time_2','gps_1or3','gps_2or4'])
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 562, in parser_f
return _read(filepath_or_buffer, kwds)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 315, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 645, in __init__
self._make_engine(self.engine)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 799, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 1213, in __init__
self._reader = _parser.TextReader(src, **kwds)
File "pandas/parser.pyx", line 504, in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4950)
TypeError: cannot concatenate 'str' and 'int' objects
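
A side note on the traceback: in pandas.read_csv the header argument expects row numbers (integers), not column names, which is most likely what raises this TypeError under Python 2.7; a list of column names would go into names= instead. A minimal sketch of a read that should work for this file, assuming it still contains its header row:

import pandas as pd

# The first line of the CSV already holds the column names, so the default
# header=0 is enough; index_col=0 treats the unnamed first column as the index.
df = pd.read_csv('dart_small_final.csv', index_col=0)

# Hypothetical variant for a file without a header row: supply the names
# explicitly ('row' is just a placeholder for the unnamed first column).
# df = pd.read_csv('dart_small_final.csv', header=None, index_col=0,
#                  names=['row', 'id_merged', 'time_1', 'time_2', 'gps_1or3', 'gps_2or4'])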

Best Answer

You need to add the parameter inplace=True:

import pandas as pd
import io

temp=u""",id_merged,time_1,time_2,gps_1or3,gps_2or4
0,00022d9064bc,1073260801,1073260803,819251,440006
1,00022d9064bc,1073260801,1073260803,819251,440006
2,00022d9064bc,1073260801,1073260803,819251,440006
3,00022d9064bc,1073260801,1073260803,819251,440006
4,00022d9064bc,1073260803,1073260810,819213,439954
5,00904b4557d3,1073260803,1073261920,817526,439458
6,00022de73863,1073260804,1073265410,817558,439525
7,00904b14b494,1073260804,1073262625,817558,439525
8,00904b14b494,1073260804,1073265163,817558,439525
9,00904b14b494,1073260804,1073263786,817558,439525
10,00022d1406df,1073260807,1073260809,820428,438735
0,00022d9064bc,1073260801,1073260803,819251,440006
1,00022dba8f51,1073260801,1073260803,819251,440006
2,00022de1c6c1,1073260801,1073260803,819251,440006
3,003065f30f37,1073260801,1073260803,819251,440006
4,00904b48a3b6,1073260801,1073260803,819251,440006
5,00904b83a0ea,1073260803,1073260810,819213,439954
6,00904b85d3cf,1073260803,1073261920,817526,439458
7,00904b14b494,1073260804,1073265410,817558,439525
8,00904b99499c,1073260804,1073262625,817558,439525
9,00904bb96e83,1073260804,1073265163,817558,439525
10,00904bf91b75,1073260804,1073263786,817558,439525"""
#after testing, replace io.StringIO(temp) with the filename
df = pd.read_csv(io.StringIO(temp), index_col=0)
print (df)
id_merged time_1 time_2 gps_1or3 gps_2or4
0 00022d9064bc 1073260801 1073260803 819251 440006
1 00022d9064bc 1073260801 1073260803 819251 440006
2 00022d9064bc 1073260801 1073260803 819251 440006
3 00022d9064bc 1073260801 1073260803 819251 440006
4 00022d9064bc 1073260803 1073260810 819213 439954
5 00904b4557d3 1073260803 1073261920 817526 439458
6 00022de73863 1073260804 1073265410 817558 439525
7 00904b14b494 1073260804 1073262625 817558 439525
8 00904b14b494 1073260804 1073265163 817558 439525
9 00904b14b494 1073260804 1073263786 817558 439525
10 00022d1406df 1073260807 1073260809 820428 438735
0 00022d9064bc 1073260801 1073260803 819251 440006
1 00022dba8f51 1073260801 1073260803 819251 440006
2 00022de1c6c1 1073260801 1073260803 819251 440006
3 003065f30f37 1073260801 1073260803 819251 440006
4 00904b48a3b6 1073260801 1073260803 819251 440006
5 00904b83a0ea 1073260803 1073260810 819213 439954
6 00904b85d3cf 1073260803 1073261920 817526 439458
7 00904b14b494 1073260804 1073265410 817558 439525
8 00904b99499c 1073260804 1073262625 817558 439525
9 00904bb96e83 1073260804 1073265163 817558 439525
10 00904bf91b75 1073260804 1073263786 817558 439525
df.drop_duplicates(keep='first', inplace=True)
#or assign output to df
#df = df.drop_duplicates(keep='first')
df.reset_index(drop=True, inplace=True)
print (df)
id_merged time_1 time_2 gps_1or3 gps_2or4
0 00022d9064bc 1073260801 1073260803 819251 440006
1 00022d9064bc 1073260803 1073260810 819213 439954
2 00904b4557d3 1073260803 1073261920 817526 439458
3 00022de73863 1073260804 1073265410 817558 439525
4 00904b14b494 1073260804 1073262625 817558 439525
5 00904b14b494 1073260804 1073265163 817558 439525
6 00904b14b494 1073260804 1073263786 817558 439525
7 00022d1406df 1073260807 1073260809 820428 438735
8 00022dba8f51 1073260801 1073260803 819251 440006
9 00022de1c6c1 1073260801 1073260803 819251 440006
10 003065f30f37 1073260801 1073260803 819251 440006
11 00904b48a3b6 1073260801 1073260803 819251 440006
12 00904b83a0ea 1073260803 1073260810 819213 439954
13 00904b85d3cf 1073260803 1073261920 817526 439458
14 00904b14b494 1073260804 1073265410 817558 439525
15 00904b99499c 1073260804 1073262625 817558 439525
16 00904bb96e83 1073260804 1073265163 817558 439525
17 00904bf91b75 1073260804 1073263786 817558 439525
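
To also produce the output format asked for in the question (no index, no header), a minimal end-to-end sketch; the file names are taken from the question, and writing to a separate output file rather than back into the input is an assumption:

import pandas as pd

# Read the merged file; the unnamed first column becomes the index.
df = pd.read_csv('dart_small_final.csv', index_col=0)

# A row counts as a duplicate only when every column matches; keep the
# first occurrence, drop the rest, then renumber the rows.
df = df.drop_duplicates(keep='first').reset_index(drop=True)

# Write without index and header, matching the expected output. Since the
# data is already in memory, writing back to the same path would also work;
# appending with mode='a' to the file that was just read would only re-add
# its original rows.
df.to_csv('dart_final.csv', header=False, index=False)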

Regarding python - keep the first and remove the following duplicate rows, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/38040929/
