gpt4 book ai didi

python - 请求和多处理

转载 作者:太空宇宙 更新时间:2023-11-03 14:38:53 25 4
gpt4 key购买 nike

我试图同时对多个网站使用 requests 和 BeautifulSoup,但由于某种原因无法让它正常工作。下面是一个完整的示例:

import multiprocessing as mp
import requests
from bs4 import BeautifulSoup
from random import randint

# Define an output queue


class Spider(object):
    """Fetch the titles of random Stack Overflow questions in worker processes.

    Each worker requests one randomly numbered question page, extracts the
    title link with BeautifulSoup and puts the result on a shared queue.
    """

    def __init__(self):
        super(Spider, self).__init__()

    @staticmethod
    def rand_string(length, output):
        """Fetch one random question page and queue its title.

        Args:
            length: Worker index supplied by ``run``; not used by the body.
            output: Queue on which the title (or ``"not found"``) is put.
        """
        random_post = randint(1000000, 9999999)
        response = requests.get(
            'https://stackoverflow.com/questions/' + str(random_post))
        soup = BeautifulSoup(response.content, 'lxml')
        try:
            # NOTE(review): ``.string`` is a bs4 NavigableString, not a plain
            # ``str``; that object is what gets queued below. Wrap it in
            # ``str()`` if only plain Python objects should cross the queue.
            title = soup.find('a', {'class': 'question-hyperlink'}).string
        except AttributeError:
            # ``find`` returned None: the page has no question title link.
            title = "not found"

        output.put(title)

    def run(self):
        """Start ten workers, wait for all of them, then print their results."""
        output = mp.Queue()
        processes = [
            mp.Process(target=Spider.rand_string, args=(x, output))
            for x in range(10)
        ]

        for p in processes:
            p.start()

        # Wait for every worker to exit.
        for p in processes:
            p.join()

        # Collect exactly one result per worker from the output queue.
        results = [output.get() for p in processes]
        print(results)

# Run processes

if __name__ == '__main__':
    # The guard keeps child processes from re-running this on import.
    spider = Spider()
    spider.run()

最佳答案

我添加了一堆调试打印语句来遵循您的流程并得出了一些结论......

  1. 有时您可能会遇到 bs4 的递归深度限制...
  2. 您之前链接的答案(在评论中)确实与您的问题相关。
  3. Windows 没有 fork(),这一点非常麻烦。

您的主要错误位于 rand_string() 中的这一行:

title=soup.find('a',{'class':'question-hyperlink'}).string

这一行返回的是 <class 'bs4.element.NavigableString'>,而不是 <class 'str'>。当它被传给 mp.Queue.put() 时,为了通过内部管道发送,需要先对它进行 pickle 序列化,而这一尝试会因递归错误而失败,从而导致队列停滞。我不确定是否真的可以通过管道以 pickle 方式发送 bs4 元素(也许可以把循环引用转换为弱引用?),但始终只发送简单的 Python 对象要容易得多。我还把队列的创建移到了主上下文中(在 spider.run() 之外),不过这并非必需,只要它只由主线程执行即可。下面是我最终迭代时使用的调试代码,您可以据此了解我的测试方法:

from multiprocessing import Process, Queue, current_process
import requests
from bs4 import BeautifulSoup
from random import randint
import sys
#sys.setrecursionlimit(1000)

class Spider(object):
    """Debug-instrumented spider that fetches random question titles.

    Every step prints a trace line tagged with the current process name so
    the interleaving of the worker processes can be followed.
    """

    @staticmethod
    def rand_string(length, output):
        """Fetch one random question page and queue ``[title, process_name]``.

        Args:
            length: Worker index supplied by ``run``; not used by the body.
            output: Queue on which the ``[title, process_name]`` list is put.
        """
        print("{} entry point".format(current_process().name))
        random_post = randint(1000000, 9999999)
        response = requests.get(
            'https://stackoverflow.com/questions/' + str(random_post))
        print("{} got request response".format(current_process().name))
        soup = BeautifulSoup(response.content, 'lxml')
        try:
            title = soup.find('a', {'class': 'question-hyperlink'}).string
        except AttributeError:
            # ``find`` returned None: the page has no question title link.
            title = "not found"

        print("{} got title: '{}' of type: {}".format(
            current_process().name, title, type(title)))

        # This conversion is the fix: queue a plain ``str`` rather than the
        # bs4 NavigableString that ``.string`` returns.
        title = str(title)

        output.put([title, current_process().name])
        output.close()
        print("{} exit point".format(current_process().name))

    def run(self, outq):
        """Run five workers and return the results they queued.

        Args:
            outq: Queue shared with the workers; created by the caller.

        Returns:
            A list of ``[title, process_name]`` lists, in queue order.
        """
        processes = []
        for x in range(5):
            processes.append(Process(
                target=self.rand_string,
                name="process_{}".format(x),
                args=(x, outq),
            ))
            print("creating process_{}".format(x))

        for p in processes:
            p.start()
            print("{} started".format(p.name))

        # Wait for every worker to exit.
        # NOTE(review): workers are joined before the queue is drained. The
        # multiprocessing docs warn this can block when a worker still has
        # buffered queue data; the items queued here are short lists.
        for p in processes:
            p.join()
            print("successuflly joined {}".format(p.name))

        print("joined all workers")

        # Drain whatever the workers managed to queue.
        out = []
        while not outq.empty():
            result = outq.get()
            print("got {}".format(result))
            out.append(result)
        return out

# Run processes
if __name__ == '__main__':
    # The queue is created here, in the main context, and handed to run().
    outq = Queue()
    spider = Spider()
    out = spider.run(outq)
    print("done")

以及运行所述代码的输出:

creating process_0
creating process_1
creating process_2
creating process_3
creating process_4
process_0 started
process_1 started
process_2 started
process_3 started
process_4 started
process_2 entry point
process_2 got request response
process_2 got title: 'not found' of type: <class 'str'>
process_2 exit point
process_0 entry point
process_0 got request response
process_0 got title: 'Starting Activity when video is finished playing' of type: <class 'bs4.element.NavigableString'>
process_0 exit point
successuflly joined process_0
process_3 entry point
process_3 got request response
process_3 got title: 'Just don't understand the point of these typedefs' of type: <class 'bs4.element.NavigableString'>
process_3 exit point
process_1 entry point
process_1 got request response
process_1 got title: 'Import button + File browse field in admin product grid in magento' of type: <class 'bs4.element.NavigableString'>
process_1 exit point
process_4 entry point
process_4 got request response
process_4 got title: 'How can I do a query with subselect' of type: <class 'bs4.element.NavigableString'>
process_4 exit point
successuflly joined process_1
successuflly joined process_2
successuflly joined process_3
successuflly joined process_4
joined all workers
got ['not found', 'process_2']
got ['Starting Activity when video is finished playing', 'process_0']
got ["Just don't understand the point of these typedefs", 'process_3']
got ['Import button + File browse field in admin product grid in magento', 'process_1']
got ['How can I do a query with subselect', 'process_4']
done

关于python - 请求和多处理,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/46710707/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com