gpt4 book ai didi

python - 如何获取2个连续记录之间的 "time difference"?

转载 作者:行者123 更新时间:2023-12-02 08:08:35 26 4
gpt4 key购买 nike

鉴于我有一个数据集如下:

import pandas as pd
import numpy as np

dt = {
"facility":["Ann Arbor","Ann Arbor","Detriot","Detriot","Detriot"],
"patient_ID":[4388,4388,9086,9086,9086],
"year":[2004,2007,2007,2008,2011],
"month":[8,9,9,6,2],
"Nr_Small":[0,0,5,12,10],
"Nr_Medium":[3,1,1,4,3],
"Nr_Large":[2,0,0,0,0]
}

dt = pd.DataFrame(dt)
dt.head()

对于每组用户(将其视为 groupby Patient_ID ),我希望获得 year 之间的差异月 每两个连续行之间。这是我的代码:

patients = dt['patient_ID'].unique()


for patient in patients:
print(patient)
patientDT = dt[ dt.patient_ID == patient] # Get group of records for each paitent
patientDT['NumberOfVisits'] = np.shape(patientDT)[0] # Add number of records for each paitent as a new column

patientDT.sort_values(['year', 'month'], ascending=[True, True],inplace=True) # sort by year and month
patientDT = addPeriodBetween2Visits(patientDT)

print(patientDT)

print("------------------------------")

然后必须获取差异的方法是addPeriodBetween2Visits:

def addPeriodBetween2Visits(patientDT):

for i in range(0,np.shape(patientDT)[0]):

if(i == 0):
patientDT['PeriodBetween2Visits'] = 0
else:
lastVisit = patientDT.loc[i-1,'year']*12 + patientDT.loc[i-1,'month']
recentVisit = patientDT.loc[i,'year']*12 + patientDT.loc[i,'month']
patientDT.loc[i,'PeriodBetween2Visits'] = recentVisit - lastVisit


return patientDT

不幸的是,它失败了,但我不清楚这个错误。这是jupyter笔记本中的错误日志:

KeyError                                  Traceback (most recent call last)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2656 try:
-> 2657 return self._engine.get_loc(key)
2658 except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError Traceback (most recent call last)
<ipython-input-4-eda12d36a355> in <module>
8
9 patientDT.sort_values(['year', 'month'], ascending=[True, True],inplace=True)
---> 10 patientDT = addPeriodBetween2Visits(patientDT)
11
12 print(patientDT)

<ipython-input-2-c8b1e6851452> in addPeriodBetween2Visits(patientDT)
7 else:
8 #print(patientDT.loc[i-1,'year'])
----> 9 lastVisit = patientDT.loc[i-1,'year']*12 + patientDT.loc[i-1,'month']
10 recentVisit = patientDT.loc[i,'year']*12 + patientDT.loc[i,'month']
11 patientDT.loc[i,'PeriodBetween2Visits'] = recentVisit - lastVisit

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1492 except (KeyError, IndexError, AttributeError):
1493 pass
-> 1494 return self._getitem_tuple(key)
1495 else:
1496 # we by definition only have the 0th axis

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_tuple(self, tup)
866 def _getitem_tuple(self, tup):
867 try:
--> 868 return self._getitem_lowerdim(tup)
869 except IndexingError:
870 pass

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_lowerdim(self, tup)
986 for i, key in enumerate(tup):
987 if is_label_like(key) or isinstance(key, tuple):
--> 988 section = self._getitem_axis(key, axis=i)
989
990 # we have yielded a scalar ?

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1911 # fall thru to straight lookup
1912 self._validate_key(key, axis)
-> 1913 return self._get_label(key, axis=axis)
1914
1915

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
139 raise IndexingError('no slices here, handle elsewhere')
140
--> 141 return self.obj._xs(label, axis=axis)
142
143 def _get_loc(self, key, axis=None):

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3583 drop_level=drop_level)
3584 else:
-> 3585 loc = self.index.get_loc(key)
3586
3587 if isinstance(loc, np.ndarray):

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2657 return self._engine.get_loc(key)
2658 except KeyError:
-> 2659 return self._engine.get_loc(self._maybe_cast_indexer(key))
2660 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2661 if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 0

这是最喜欢的输出:

enter image description here

如果您具体告诉我为什么我的代码不起作用。

最佳答案

对于计数,请使用 Series.mapSeries.value_counts .

使用DataFrameGroupBy.diff使用 Series,因此按列 dt['patent_ID'] 分组,然后替换缺失值并转换为整数:

dt['NumberOfVisits'] = dt['patient_ID'].map(dt['patient_ID'].value_counts())

dt["PeriodBetween2Visits"] = (dt["year"].mul(12)
.add(dt["month"])
.groupby(dt['patient_ID'])
.diff()
.fillna(0)
.astype(int))


print (dt)
facility patient_ID year month Nr_Small Nr_Medium Nr_Large \
0 Ann Arbor 4388 2004 8 0 3 2
1 Ann Arbor 4388 2007 9 0 1 0
2 Detriot 9086 2007 9 5 1 0
3 Detriot 9086 2008 6 12 4 0
4 Detriot 9086 2011 2 10 3 0

NumberOfVisits PeriodBetween2Visits
0 2 0
1 2 37
2 3 0
3 3 9
4 3 32

关于python - 如何获取2个连续记录之间的 "time difference"?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59702732/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com