gpt4 book ai didi

Python基于多线程实现抓取数据存入数据库的方法

转载 作者:qq735679552 更新时间:2022-09-27 22:32:09 24 4
gpt4 key购买 nike

CFSDN坚持开源创造价值,我们致力于搭建一个资源共享平台,让每一个IT人在这里找到属于你的精彩世界.

这篇CFSDN的博客文章《Python基于多线程实现抓取数据存入数据库的方法》由作者收集整理,如果你对这篇文章有兴趣,记得点赞哟。

本文实例讲述了Python基于多线程实现抓取数据存入数据库的方法。分享给大家供大家参考,具体如下:

1. 数据库类

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""
使用须知:
代码中数据表名 aces ,需要更改该数据表名称的注意更改
"""
import pymysql
class Database():
    """Thin wrapper around one pymysql connection for the ``aces`` table.

    Connection settings are class attributes; edit them (or override them in
    a subclass) to point at a different MySQL server. The table name ``aces``
    is hard-coded in every SQL statement of this class.
    """

    # Local database connection settings.
    host = "localhost"
    user = "root"
    password = ""
    database = "test"
    port = 3306
    charset = "utf8"
    # Placeholders; replaced by live objects in __init__.
    cursor = ''
    connet = ''

    def __init__(self):
        """Connect to the database and create the cursor used by all methods."""
        self.connet = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            port=self.port,  # previously declared on the class but never used
            charset=self.charset,
        )
        self.cursor = self.connet.cursor()

    def dropTables(self):
        """Drop the ``aces`` table if it exists."""
        self.cursor.execute('drop table if exists aces')
        print("删表")

    def createTables(self):
        """Create the ``aces`` table (``asin`` primary key, ``checked``) if missing."""
        self.cursor.execute(
            '''create table if not exists aces
             (
               asin  varchar(11) primary key not null,
               checked varchar(200));'''
        )
        print("建表")

    def save(self, aceslist):
        """Insert one row and commit.

        Args:
            aceslist: Indexable pair; element 0 is stored in ``asin`` and
                element 1 in ``checked``.
        """
        self.cursor.execute(
            "insert into aces ( asin, checked) values(%s,%s)",
            (aceslist[0], aceslist[1]),
        )
        self.connet.commit()

    def is_exists_asin(self, asin):
        """Return True if a row with this ``asin`` is already stored, else False."""
        # Parameters are passed as a tuple so the driver always sees a sequence.
        self.cursor.execute('select * from aces where asin = %s', (asin,))
        return self.cursor.fetchone() is not None

    def close(self):
        """Close the cursor and the underlying connection."""
        self.cursor.close()
        self.connet.close()
# db =Database()

2. 多线程任务类

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import urllib.parse
import urllib.parse
import urllib.request
from queue import Queue
import time
import random
import threading
import logging
import pymysql
from bs4 import BeautifulSoup
from local_data import Database
# This module holds several classes: AmazonSpider, ThreadCrawl(threading.Thread), AmazonSpiderJob
class AmazonSpider():
    """Fetch the Amazon page for an id and store one field of it in the database.

    Each instance owns its own ``Database`` connection.
    """

    # Pool of User-Agent strings; one is picked at random per request.
    _USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    )

    def __init__(self):
        """Open the database connection this spider writes to."""
        self.db = Database()

    def randHeader(self):
        """Return request headers with a randomly chosen User-Agent.

        ``Connection``, ``Accept`` and ``Accept-Language`` are fixed values;
        only ``User-Agent`` varies between calls.
        """
        return {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': random.choice(self._USER_AGENTS),
        }

    def getDataById(self, queryId, timeout=30):
        """Fetch ``https://www.amazon.com/dp/<queryId>`` and save its state.

        Does nothing when the id is already stored. Otherwise the text of the
        first ``<span id="asTitle">`` is saved next to the id, or ``"other"``
        when the page has no such span (or the span has no single text node).

        Args:
            queryId: Id appended to the product URL and used as the row key.
            timeout: Seconds to wait on the HTTP request before it raises.

        Raises:
            urllib.error.URLError: On network/HTTP failure (includes HTTPError).
        """
        # Already stored: skip the network round trip entirely.
        if self.db.is_exists_asin(queryId):
            return
        req = urllib.request.Request(
            url="https://www.amazon.com/dp/" + str(queryId),
            headers=self.randHeader(),
        )
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as webpage:
            html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find("span", id="asTitle")
        text = title.string if title is not None else None
        # Store a plain str so the row does not keep the parse tree alive.
        state = str(text) if text is not None else "other"
        print(queryId)
        print(state)
        self.db.save([queryId, state])
class ThreadCrawl(threading.Thread):
    """Worker thread that takes ids from a shared queue and crawls each one.

    Args:
        queue: Queue-like object providing ``get``, ``put``, ``qsize`` and
            ``task_done``; its items are passed to ``AmazonSpider.getDataById``.
    """

    def __init__(self, queue):
        # %(asctime)s is rendered per log record. The previous format string
        # embedded a timestamp computed once here, so every line showed the
        # same time. basicConfig only takes effect on its first call.
        logging.basicConfig(
            level=logging.INFO,
            format="[%(asctime)s][AmazonSpider]-----%(message)s------",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        super().__init__()
        self.queue = queue
        # Each worker owns its own spider instance.
        self.spider = AmazonSpider()

    def run(self):
        """Process queue items forever; a failed item is put back for retry."""
        while True:
            item = self.queue.get()
            try:
                self.spider.getDataById(item)
            except Exception:
                # Log instead of swallowing silently, then retry later.
                # NOTE(review): an item that always fails is re-queued
                # indefinitely, so Queue.join() never returns for it.
                logging.exception("crawl failed for %s, re-queueing", item)
                self.queue.put(item)
            logging.info("now queue size is: %d", self.queue.qsize())
            # Pairs with the get() above; the put() on failure adds its own
            # pending task, so the queue's unfinished count stays balanced.
            self.queue.task_done()
class AmazonSpiderJob():
    """Run a crawl of many ids on a fixed number of worker threads.

    Args:
        size: Number of ``ThreadCrawl`` workers to start.
        qs: Iterable of ids to crawl.
    """

    def __init__(self, size, qs):
        self.size = size
        self.qs = qs

    def work(self):
        """Queue every id, start the workers and block until all are done.

        Raises:
            ValueError: If there are ids to crawl but ``size`` is below 1;
                without any worker the final ``join()`` would never return.
        """
        toSpiderQueue = Queue()
        for q in self.qs:
            toSpiderQueue.put(q)
        if self.size < 1 and not toSpiderQueue.empty():
            raise ValueError("size must be at least 1 when there are items to crawl")
        for _ in range(self.size):
            t = ThreadCrawl(toSpiderQueue)
            # Daemon threads: the workers loop forever, so they must not keep
            # the process alive. The attribute replaces deprecated setDaemon().
            t.daemon = True
            t.start()
        # Returns once task_done() has been called for every queued item.
        toSpiderQueue.join()

3. 主线程(程序入口)

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from amazon_s import AmazonSpiderJob #从一个模块中导入类
import pymysql
import pandas as pd
from local_data import Database
if __name__ == '__main__':
    # First run only: drop the old table and create a fresh one. Comment this
    # section out when restarting a crawl so the rows already saved are kept.
    # ----------------------
    db = Database()
    try:
        db.dropTables()
        db.createTables()
    finally:
        # Each crawler opens its own connection; this one is only for setup.
        db.connet.close()
    # ---------------------------
    df = pd.read_excel("ASIN检查_viogico_1108.xlsx")
    # The ids to crawl come from the "asin1" column of the spreadsheet.
    qs = df["asin1"].values
    print(qs)
    print(len(qs))
    # Crawl with 8 worker threads; work() blocks until the queue is drained.
    amazonJob = AmazonSpiderJob(8, qs)
    amazonJob.work()

希望本文所述对大家Python程序设计有所帮助。

原文链接:https://blog.csdn.net/zn505119020/article/details/78590416

最后,这篇关于《Python基于多线程实现抓取数据存入数据库的方法》的文章就讲到这里了。如果你想了解更多相关内容,请搜索CFSDN的文章或继续浏览相关文章,希望大家以后支持我的博客!

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com