gpt4 book ai didi

python - 在 Python 中解析大型 XML 文件时出现内存错误

转载 作者:行者123 更新时间:2023-12-03 21:40:34 46 4
gpt4 key购买 nike

我的 XML 文件如下所示:

<root>
<group from="1" to="100">
<link target="1"/>
...
<link target="100"/>
</group>
...
</root>

文件中大约有 6000 个 <group> 元素和 500 万个 <link> 元素。我想得到一个字典:以元组 (from, to) 为键,以该组内所有 <link> 元素的 target 属性组成的列表为值。但运行下面的代码时出现了内存错误:

from lxml import etree
from gzip import open as gopen

def extractTargets(fin):
    """Map each <group>'s (from, to) pair to the targets of its <link> children.

    NOTE: this is the iterparse approach from the question; it is reported
    to run out of memory on the input with about 5M <link> elements.

    Args:
        fin: The gzip-compressed XML file to read (handed to ``gzip.open``).

    Returns:
        dict keyed by ``(from, to)`` attribute tuples; each value is the
        result of ``elem.xpath("link/@target")`` for that group.
    """
    targets = {}

    with gopen(fin) as stream:
        # Restrict the iterator to <group> elements.
        context = etree.iterparse(stream, tag="group")

        for _event, elem in context:
            key = (elem.get("from"), elem.get("to"))
            targets[key] = elem.xpath("link/@target")

            # Empty the processed group, then remove the siblings that
            # precede it from the parent element.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
        del context

    return targets

最佳答案

尝试以下代码:

lxml

import lxml.etree
from gzip import open as gopen

class GroupDictTarget(object):
    """Event handler that collects <link> targets per <group> into a dict.

    Only opening tags are inspected: a ``group`` tag opens a new list keyed
    by its ``(from, to)`` attributes, and every following ``link`` tag
    appends its ``target`` attribute to that list.
    """

    def __init__(self, d):
        # d: mapping that is filled in place with {(from, to): [targets]}.
        self.d = d

    def start(self, tag, attrib):
        """Handle one opening tag.

        Args:
            tag: Element name.
            attrib: Mapping of the element's attribute names to values.

        Raises:
            KeyError: If a ``group`` lacks ``from``/``to`` or a ``link``
                lacks ``target``.
            AttributeError: If a ``link`` is seen before any ``group``.
        """
        if tag == 'group':
            # Register a fresh list for this group and remember it so that
            # subsequent <link> tags are appended to the right entry.
            links = []
            self.d[attrib['from'], attrib['to']] = links
            self.group = links
        elif tag == 'link':
            self.group.append(attrib['target'])

    def close(self):
        """End-of-parse hook; nothing to finalize, results live in ``self.d``."""
        pass

def extractTargets(fin):
    """Parse the gzip-compressed XML at *fin* into {(from, to): [targets]}.

    The lxml parser is given a ``GroupDictTarget`` as its ``target``, so
    the start-tag events are delivered to that object and it fills the
    dictionary directly.

    Args:
        fin: The gzip-compressed XML file to read (handed to ``gzip.open``).

    Returns:
        dict mapping ``(from, to)`` tuples to lists of ``target`` values.
    """
    targets = {}
    parser = lxml.etree.XMLParser(target=GroupDictTarget(targets))
    with gopen(fin) as stream:
        lxml.etree.parse(stream, parser)
    return targets

xml.parsers.expat
import xml.parsers.expat
from gzip import open as gopen

class GroupDictTarget(object):
# Same as above

def extractTargets(fin):
    """Parse the gzip-compressed XML at *fin* into {(from, to): [targets]}.

    Uses the standard-library expat parser; only the start-element callback
    is installed, and it forwards each tag to ``GroupDictTarget.start``.

    Args:
        fin: The gzip-compressed XML file to read (handed to ``gzip.open``).

    Returns:
        dict mapping ``(from, to)`` tuples to lists of ``target`` values.
    """
    targets = {}
    parser = xml.parsers.expat.ParserCreate()
    # The bound method is called for every opening tag with the element
    # name and its attributes.
    parser.StartElementHandler = GroupDictTarget(targets).start
    with gopen(fin) as f:
        parser.ParseFile(f)
    return targets

xml.sax
import xml.sax
from gzip import open as gopen

class GroupDictTarget(object):
# Same as above

def extractTargets(fin):
    """Parse the gzip-compressed XML at *fin* into {(from, to): [targets]}.

    Uses the standard-library SAX parser with a plain ``ContentHandler``
    whose ``startElement`` is replaced by ``GroupDictTarget.start``.

    Args:
        fin: The gzip-compressed XML file to read (handed to ``gzip.open``).

    Returns:
        dict mapping ``(from, to)`` tuples to lists of ``target`` values.
    """
    targets = {}
    handler = xml.sax.handler.ContentHandler()
    # Override only the start-tag callback on this instance; all other
    # SAX events keep the base handler's methods.
    handler.startElement = GroupDictTarget(targets).start
    with gopen(fin) as f:
        xml.sax.parse(f, handler)
    return targets

关于python - 在 Python 中解析大型 XML 文件时出现内存错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/17252010/

46 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com