
Python file seek + write produces strange "NUL" output in the file


I am writing a downloader that splits a URL into several ranges and downloads them with threads. I probably won't use "join", because join means no streaming (nothing can be written to the file until all threads have finished).

The problem is that f.seek and write produce a very strange file: its content is always full of "NUL" characters (as shown in Notepad++), and the actual text is only about 1/3 of the whole file.

Hi everyone, thanks for helping me. Here is version 2.0 of my code; thanks to Padraic Cunningham for the suggestions and explanation, I fixed my code almost exactly as you suggested. Please review it; I would also like help converting it to an http.server file-streaming approach:

import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
countsize = 0
#if os.path.exists(fileName):
#    os.remove(fileName)

def defwrite(filename, data, offset):
    f = open(filename, 'wb')
    f.seek(offset)
    f.write(data)
    f.close()

def buildRange(url, numsplits):
    global pool
    value = int(requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None))
    print("Fullsize: ", value)
    print("Try divide with 3 :", value / 3)
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (i * value//numsplits + 1, i * value//numsplits + 1 + (value - (i * value//numsplits + 1))))
        if i == 0:
            lst.append('%s-%s' % (0, value//numsplits))
        else:
            lst.append('%s-%s' % (i * value//numsplits + 1, (i + 1) * value//numsplits))
    return lst

def main(url=None, splitBy=3):
    global fileName, pool, countsize
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return

    #fileName = "1.jpg"

    #print("%s bytes to download." % sizeInBytes)
    #if not sizeInBytes:
    #    print("Size cannot be determined.")
    #    return
    #sinzeInBytes = buildRange(url,
    dataDict = {}
    f = open(fileName, 'wb')

    # split total num bytes into ranges
    #ranges = buildRange(url, int(sizeInBytes), splitBy)
    ranges = buildRange(url, splitBy)
    print(ranges)

    def downloadChunk(idx, irange):
        global countsize  # needed here too: the assignment below would otherwise make the name local
        print(idx)
        #time.sleep(1*idx)
        #req = urllib.request.Request(url)
        #req.headers['Range'] = 'bytes={}'.format(irange)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', URL, headers=headers).data
        #print(data)
        #print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        print(offset)
        #print(irange)
        f.seek(offset, 0)
        #f.truncate(0)
        #print(f.tell())
        f.write(data)
        #f.read()
        #f.close()
        countsize = countsize + offset

        #defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
        #th.isAlive()
    #for th in downloaders:
    #    th.join()
    #    print(th.join)
    print(countsize)
    #print('done: got {} chunks, total {} bytes'.format(
    #    len(dataDict), sum((
    #        len(chunk) for chunk in list(dataDict.values())
    #    ))
    #))

    #print("--- %s seconds ---" % str(time.time() - start_time))

    #if os.path.exists(fileName):
    #    os.remove(fileName)
    #reassemble file in correct order
    #with open(fileName, 'wb') as fh:
    #    for _idx, chunk in sorted(dataDict.items()):
    #        fh.write(chunk)
    #stream_chunk = 16 * 1024
    #with open(fileName, 'wb') as fp:
    #    while True:
    #        for _idx, chunk in sorted(dataDict.items()):
    #            #fh.write(chunk)
    #            chunking = chunk.read(stream_chunk)
    #            if not chunk:
    #                break
    #            fp.write(chunking)

    #print("Finished Writing file %s" % fileName)
    #print('file size {} bytes'.format(os.path.getsize(fileName)))

if __name__ == '__main__':
    if os.path.exists(fileName):
        os.remove(fileName)
    main(URL, splitBy=16)

This was my code, version 1.0, which I originally asked for help fixing; ignore it, version 2.0 is above:

import os, requests
import threading
import urllib3
import urllib.request, urllib.error, urllib.parse
import time
import re

pool = urllib3.PoolManager(maxsize=10)
URL = "https://raw.githubusercontent.com/langpavel/tampermonkey/master/src/emulation.js"
fileName = "1.js"
#if os.path.exists(fileName):
#    os.remove(fileName)

def defwrite(filename, data, offset):
    f = open(filename, 'wb')
    f.seek(offset)
    f.write(data)
    f.close()

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == range(numsplits):
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(value - round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0) - 1, 0))))
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0) - 1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0), 0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0) - 1, 0))))
    return lst

def main(url=None, splitBy=3):
    global fileName, pool
    start_time = time.time()
    if not url:
        print("Please Enter some url to begin download.")
        return

    #fileName = "1.jpg"
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print("%s bytes to download." % sizeInBytes)
    if not sizeInBytes:
        print("Size cannot be determined.")
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        print(idx)
        #req = urllib.request.Request(url)
        #req.headers['Range'] = 'bytes={}'.format(irange)
        headers = urllib3._collections.HTTPHeaderDict()
        headers.add('Range', 'bytes=' + str(irange))
        data = pool.urlopen('GET', URL, headers=headers).data
        print(data)
        print("finish: " + str(irange))
        offset = int(re.sub("(^.*?)-(.*?)$", "\\1", irange))
        #print(offset)
        #print(irange)
        f = open(fileName, 'wb')
        f.seek(offset)
        #f.truncate(0)
        #print(f.tell())
        f.write(data)
        #f.read()
        #f.close()

        #defwrite("1.txt", req, re.sub("(^.*?)-", "\\1", str(irange)))

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
        #th.isAlive()
    #for th in downloaders:
    #    th.join()
    #    print(th.join)

    #print('done: got {} chunks, total {} bytes'.format(
    #    len(dataDict), sum((
    #        len(chunk) for chunk in list(dataDict.values())
    #    ))
    #))

    #print("--- %s seconds ---" % str(time.time() - start_time))

    #if os.path.exists(fileName):
    #    os.remove(fileName)
    #reassemble file in correct order
    #with open(fileName, 'wb') as fh:
    #    for _idx, chunk in sorted(dataDict.items()):
    #        fh.write(chunk)
    #stream_chunk = 16 * 1024
    #with open(fileName, 'wb') as fp:
    #    while True:
    #        for _idx, chunk in sorted(dataDict.items()):
    #            #fh.write(chunk)
    #            chunking = chunk.read(stream_chunk)
    #            if not chunk:
    #                break
    #            fp.write(chunking)

    #print("Finished Writing file %s" % fileName)
    #print('file size {} bytes'.format(os.path.getsize(fileName)))

if __name__ == '__main__':
    main(URL, splitBy=3)

Best Answer

You use three threads whose target function is downloadChunk, and you open the file three times with wb, which overwrites it each time, so you end up with 1/3 of the content. You also call seek for no reason. If you wanted to append to the file you would open it with a each time, or just open the file once outside the function. You are seeking on an empty file and then writing the data, and that is where the null bytes come from.
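To see exactly where the null bytes come from, here is a minimal standalone demonstration (the file name demo.bin is made up for the example): every open(..., 'wb') truncates the file to zero bytes, and writing after seeking past the current end fills the gap with NUL (b'\x00') bytes.

# Opening with 'wb' truncates the file to 0 bytes.
with open("demo.bin", "wb") as f:
    f.seek(10)            # seek past the end of the now-empty file
    f.write(b"hello")     # the 10-byte gap is filled with NUL bytes

with open("demo.bin", "rb") as f:
    print(f.read())       # b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00hello'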

If you want to open a file for reading and writing so you can seek, with line buffering:

with open("whatever.file", "r+b", buffering=1) as f:

Then use that file object for the writes; don't keep opening it inside the function and overwriting it. The file must also already exist.
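As a rough sketch of this suggestion (not the poster's exact code; the file name out.bin, the chunk data, and the helper names are invented for illustration): create the file once at its final size, open it once outside the worker function, and serialize each seek+write pair with a lock, since interleaved seek and write calls on a single shared file object can race between threads.

import threading

lock = threading.Lock()

def prepare(filename, size):
    # Create the file once at its final size; opening with "r+b" requires it to exist.
    with open(filename, "wb") as f:
        f.truncate(size)

def write_chunk(f, offset, data):
    # The lock keeps each seek+write pair atomic across threads.
    with lock:
        f.seek(offset)
        f.write(data)

chunks = [(0, b"AAAAA"), (5, b"BBBBB"), (10, b"CCCCC")]
prepare("out.bin", 15)
with open("out.bin", "r+b") as f:
    threads = [threading.Thread(target=write_chunk, args=(f, off, data))
               for off, data in chunks]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

with open("out.bin", "rb") as f:
    print(f.read())  # b'AAAAABBBBBCCCCC', every range landed at its offset

On POSIX systems, os.pwrite(f.fileno(), data, offset) is an alternative that writes at an absolute offset without touching the shared file position, which removes the need for the lock.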

For this question about Python file seek + write producing strange "NUL" output, there is a similar question on Stack Overflow: https://stackoverflow.com/questions/30556258/
