gpt4 book ai didi

python - 对大型数据集进行操作

转载 作者:太空狗 更新时间:2023-10-30 01:37:56 25 4
gpt4 key购买 nike

我必须对包含 DNA 序列片段信息的 PSL 记录进行一些分析。基本上我必须在相同的重叠群中找到来自相同读取的条目(这些都是 PSL 条目中的值)。问题是 PSL 记录很大(10-30 Mb 文本文档)。我写了一个程序,在给定足够时间的情况下,它可以处理短记录和长记录,但它花费的时间比指定的时间长。有人告诉我该程序不应超过 15 秒。我的花了超过 15 分钟。

PSL 记录如下所示:

275 11  0   0   0   0   0   0   -   M02034:35:000000000-A7UU0:1:1101:19443:1992/2   286 0   286 NODE_406138_length_13407_cov_13.425076  13465   408 694 1   286,    0,  408,

171 5 0 0 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:13497:2001/2 294 0 176 NODE_500869_length_34598_cov_30.643419 34656 34334 34510 1 176, 0, 34334,

188 14 0 10 0 0 0 0 + M02034:35:000000000-A7UU0:1:1101:18225:2002/1 257 45 257 NODE_455027_length_12018_cov_13.759444 12076 11322 11534 1 212, 45, 11322,

我的代码是这样的:

import sys
class PSLreader :
'''
Class to provide reading of a file containing psl alignments
formatted sequences:
object instantiation:
myPSLreader = PSLreader(<file name>):

object attributes:
fname: the initial file name

methods:
readPSL() : reads psl file, yielding those alignments that are within the first or last
1000 nt

readPSLpairs() : yields psl pairs that support a circular hypothesis

Author: David Bernick
Date: May 12, 2013
'''

def __init__ (self, fname=''):
'''contructor: saves attribute fname '''

self.fname = fname

def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)

def readPSL (self):
'''
using filename given in init, returns each filtered psl records
that contain alignments that are within the terminal 1000nt of
the target. Incomplete psl records are discarded.
If filename was not provided, stdin is used.

This method selects for alignments that could may be part of a
circle.

Illumina pairs aligned to the top strand would have read1(+) and read2(-).
For the bottoms trand, read1(-) and read2(+).

For potential circularity,
these are the conditions that can support circularity:
read1(+) near the 3' terminus
read1(-) near the 5' terminus
read2(-) near the 5' terminus
read2(+) near the 3' terminus

so...
any read(+) near the 3', or
any read(-) near the 5'

'''

nearEnd = 1000 # this constant determines "near the end"
with self.doOpen() as fileH:

for line in fileH:
pslList = line.split()
if len(pslList) < 17:
continue
tSize = int(pslList[14])
tStart = int(pslList[15])
strand = str(pslList[8])

if strand.startswith('+') and (tSize - tStart > nearEnd):
continue
elif strand.startswith('-') and (tStart > nearEnd):
continue

yield line

def readPSLpairs (self):
read1 = []
read2 = []

for psl in self.readPSL():
parsed_psl = psl.split()
strand = parsed_psl[9][-1]
if strand == '1':
read1.append(parsed_psl)
elif strand == '2':
read2.append(parsed_psl)

output = {}
for psl1 in read1:
name1 = psl1[9][:-1]
contig1 = psl1[13]
for psl2 in read2:
name2 = psl2[9][:-1]
contig2 = psl2[13]
if name1 == name2 and contig1 == contig2:
try:
output[contig1] += 1
break
except:
output[contig1] = 1
break

print(output)


PSL_obj = PSLreader('EEV14-Vf.filtered.psl')
PSL_obj.readPSLpairs()

我得到了一些示例代码,如下所示:

def doSomethingPairwise (a):
for leftItem in a[1]:
for rightItem in a[2]:
if leftItem[1] is rightItem[1]:
print (a)
thisStream = [['David', 'guitar', 1], ['David', 'guitar', 2],
['John', 'violin', 1], ['John', 'oboe', 2],
['Patrick', 'theremin', 1], ['Patrick', 'lute',2] ]
thisGroup = None
thisGroupList = [ [], [], [] ]

for name, instrument, num in thisStream:
if name != thisGroup:

doSomethingPairwise(thisGroupList)

thisGroup = name
thisGroupList = [ [], [], [] ]

thisGroupList[num].append([name, instrument, num])
doSomethingPairwise(thisGroupList)

但是当我尝试实现它时,我的程序仍然花费了很长时间。我在想这个错误的方式吗?我意识到嵌套循环很慢,但我看不到替代方法。

编辑:我想通了,数据是预先分类的,这使得我的蛮力解决方案非常不切实际和不必要。

最佳答案

希望能帮到你,因为这个问题需要一个最好的输入示例文件

#is better create PSLRecord class
class PSLRecord:
def __init__(self, line):
pslList = line.split()
properties = ("matches", "misMatches", "repMatches", "nCount",
"qNumInsert", "qBaseInsert", "tNumInsert",
"tBaseInsert", "strand", "qName", "qSize", "qStart",
"qEnd", "tName", "tSize", "tStart", "tEnd", "blockCount",
"blockSizes", "qStarts", "tStarts")
self.__dict__.update(dict(zip(properties, pslList)))

class PSLreader :
def __init__ (self, fname=''):
self.fname = fname

def doOpen (self):
if self.fname is '':
return sys.stdin
else:
return open(self.fname)

def readPSL (self):
with self.doOpen() as fileH:
for line in fileH:
pslrc = PSLRecord(line)
yield pslrc

#return a dictionary with all psl records group by qName and tName
def readPSLpairs (self):
dictpsl = {}
for pslrc in self.readPSL():
#OP requirement, remove '1' or '2' char, in pslrc.qName[:-1]
key = (pslrc.qName[:-1], pslrc.tName)
if not key in dictpsl:
dictpsl[key] = []
dictpsl[key].append(pslrc)
return dictpsl

#Function filter .... is better out and self-contained
def f_filter(pslrec, nearEnd = 1000):
if (pslrec.strand.startswith('+') and
(int(pslrec.tSize) - int(pslrec.tStart) > nearEnd)):
return False
if (pslrec.strand.startswith('-') and
(int(pslrec.tStart) > nearEnd)):
return False
return True

PSL_obj = PSLreader('EEV14-Vf.filtered.psl')

#read dictionary of pairs
dictpsl = PSL_obj.readPSLpairs()

from itertools import product
#product from itertools
#(1) x (2,3) = (1,2),(1,3)

output = {}
for key, v in dictpsl.items():
name, contig = key
#i get filters aligns in principal strand
strand_princ = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '1']
#i get filters aligns in secondary strand
strand_sec = [pslrec for pslrec in v if f_filter(pslrec) and
pslrec.qName[-1] == '2']
for pslrec_princ, pslrec_sec in product(strand_princ, strand_sec):
#This For has fewer comparisons, since I was grouped before
if not contig in output:
output[contig] = 1
output[contig] += 1

注意:如果你问我,10-30 Mb 不是大文件

关于python - 对大型数据集进行操作,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30448434/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com