Python implementation for crawling this site's ebook information and storing it in a database

Reposted. Author: qq735679552. Updated: 2022-09-29 22:32:09

This is an entry-level crawler: it scrapes only book names, descriptions, and download links, and stores them in a database.

Database utility class: DBUtil.py

import pymysql

class DBUtils(object):
    def connDB(self):                     # connect to the database
        conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                               passwd='b6f3g2', db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):  # update or insert
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):  # delete; not used in this demo
        sta = 0
        for eachID in IDs.split(' '):
            sta += cur.execute("delete from students where Id=%d" % (int(eachID)))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):         # query
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):       # close the connection and release resources
        cur.close()
        conn.close()

if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
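
One caveat on DBUtils as written: exeUpdate interpolates values directly into the SQL string, so a quote inside a book title breaks the statement (and opens the door to SQL injection). A minimal sketch of a parameterized alternative, assuming the same pymysql cursor; the method name exeUpdateParams is mine, not part of the original class:

# Hypothetical addition to DBUtils: let pymysql escape the values itself.
def exeUpdateParams(self, conn, cur, sql, params):
    sta = cur.execute(sql, params)  # sql uses %s placeholders, params is a tuple
    conn.commit()
    return sta

# Usage sketch:
# dbUtil.exeUpdateParams(conn, cur,
#     "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
#     (bookName, bookUrl, bookInfo))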

Book operations file: bookOpe.py

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

logging.basicConfig(
    level=logging.INFO
)

class BookOperator(object):
    def __addBook(self, book):
        logging.info("add book:%s" % book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) values ('%s','%s','%s');"
                         % (book.bookName, book.downLoadUrl, book.mainInfo))
        dbUtil.exeUpdate(conn, cur, insertBookSql)
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):
        # Fetch the id of the most recently inserted book.
        logging.info("selectLastBookId")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):
        logging.info("add bookId:%s" % bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for downLoadinfo in downLoadInfos:
            insertBookDownLoadInfo = ("insert into book_down_url (bookId,downName,downUrl) values ('%s','%s','%s');"
                                      % (bookId, downLoadinfo.downName, downLoadinfo.downUrl))
            dbUtil.exeUpdate(conn, cur, insertBookDownLoadInfo)
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):
        logging.info("add bookInfo:%s" % book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)

if __name__ == '__main__':
    bookope = BookOperator()
    book = Book("aaa", "yang", "cccc")
    book.addDownLoadUrl(DownLoadInfo("aaa.html", "book"))
    bookope.addBookInfo(book)
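
A note on __selectLastBookId: recovering the new id with `select id from book order by id desc limit 1` can return another writer's row if two inserts interleave. A safer sketch, assuming the same pymysql connection, reads the cursor's lastrowid right after the insert (the values below are placeholders):

# Sketch: pymysql exposes the AUTO_INCREMENT id of the last insert on this connection.
dbUtil = DBUtils()
conn, cur = dbUtil.connDB()
cur.execute("insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s)",
            ("demo title", "demo.html", "demo info"))  # placeholder values
conn.commit()
bookId = cur.lastrowid  # id of the row just inserted; no extra query needed
dbUtil.connClose(conn, cur)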

Book info file: bookInfo.py

class Book(object):
    # Book information.
    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo
        self.downLoadUrl = downLoadUrl
        self.bookName = bookName
        self.downLoadInfos = []

    def addDownLoadUrl(self, downloadInfo):
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        print("bookName :%s" % (self.bookName))

class DownLoadInfo(object):
    # Download-link information.
    def __init__(self, downUrl, downName):
        self.downUrl = downUrl
        self.downName = downName

    def print_down_info(self):
        print("downLoad %s - %s" % (self.downUrl, self.downName))

Page-parsing file: FiveOneJobFetch.py (named after 51job by the author, but it parses this site's book listings)

import requests
from bs4 import BeautifulSoup
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

class PageFetch(object):
    host = "http://www.zzvips.com/"  # site domain (scheme added so requests can fetch it)
    category = "books/"              # category path under the domain

    def __init__(self, pageUrl):
        self.pageUrl = pageUrl  # short page name, e.g. list45_2.html
        self.url = PageFetch.host + PageFetch.category + pageUrl  # full URL

    @staticmethod
    def getPageContent(url):
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"
            return req.text
        else:
            return ""

    def __getMaxPageNumAndUrl(self):
        # Find the highest page number and the paging URL pattern.
        # Paging URLs look like list45_2.html, where 2 is the page number.
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                print("data")  # debug output
                print(ul)
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                if alink[-1]['href'] == "#":   # already on the last page
                    maxLink = alink[1]['href']
                else:                          # keep following the "last page" link
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        # Build the listing URL for page pageNum+1, e.g. list45_2.html.
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        # Collect the URL of every listing page in the category.
        shortPageList = []
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        for i in range(int(maxPageNum)):
            shortPageList.append(self.host + self.category + self.__formatPage(i))
        return shortPageList

    @staticmethod
    def getDownloadPage(url):
        # Collect the detail-page URL of every book on one listing page.
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        # Parse one detail page into a Book plus its download links.
        logging.info("fetching book info url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        mainInfo = soup.select("#soft-intro")[0].text.replace("截图:", "").replace("'", "")
        title = soup.select("dl dt h1")[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                downLoadInfo = DownLoadInfo(li.select("a")[0]['href'], li.select("a")[0].text)
                book.addDownLoadUrl(downLoadInfo)
        return book

if __name__ == '__main__':
    p = PageFetch("list152_1.html")
    shortPageList = p.getBookPageList()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    print("================ summary ===============================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s - %s" % (d.downUrl, d.downName))
    # p = PageFetch("list977_1.html")
    # p = p.getMaxPageNumAndUrl()
    # print(p)
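
To make the pagination logic concrete: listing URLs have the form list45_2.html, where 2 is the page number, and __formatPage swaps a new number in between the underscore and the extension. The same string logic as a standalone sketch:

# Standalone version of the page-number rewrite used by __formatPage.
def format_page(page_url, page_num):
    line_begin = page_url.index("_") + 1  # position just past the underscore
    doc_begin = page_url.index(".")       # position of the extension dot
    return page_url[:line_begin] + str(page_num + 1) + page_url[doc_begin:]

print(format_page("list152_1.html", 0))   # list152_1.html
print(format_page("list152_1.html", 4))   # list152_5.html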

Entry script: 51Job.py. Copy all of the files above into the same folder and run this file.

from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
    p = PageFetch(url)
    shortPageList = p.getBookPageList()
    bookOperator = BookOperator()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        bookOperator.addBookInfo(book)
    print("data fetched successfully: " + url)

if __name__ == '__main__':
    urls = ["list152_35.html", "list300_2.html", "list476_6.html", "list977_2.html",
            "list572_5.html", "list509_2.html", "list481_1.html", "list576_1.html",
            "list482_1.html", "list483_1.html", "list484_1.html"]
    for url in urls:
        main(url)

Database tables: the book info table and the download-link table.

CREATE TABLE `book` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `bookName` VARCHAR(200) NULL DEFAULT NULL,
  `bookUrl` VARCHAR(500) NULL DEFAULT NULL,
  `bookInfo` TEXT NULL,
  PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
CREATE TABLE `book_down_url` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `bookId` INT(11) NOT NULL DEFAULT '0',
  `downName` VARCHAR(200) NOT NULL DEFAULT '0',
  `downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;
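
The two tables join on book_down_url.bookId = book.id, so each book row fans out to its download links. A sketch of reading them back together in Python, reusing the connection settings from DBUtil.py:

import pymysql

# Sketch: list every book together with its download links via a join on bookId.
conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                       passwd='b6f3g2', db='yangsj', charset='utf8')
cur = conn.cursor()
cur.execute(
    "select b.bookName, u.downName, u.downUrl "
    "from book b join book_down_url u on u.bookId = b.id "
    "order by b.id")
for bookName, downName, downUrl in cur.fetchall():
    print("%s: %s - %s" % (bookName, downName, downUrl))
cur.close()
conn.close()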

Git repository: https://git.oschina.net/yangsj/BookFetch/tree/master

That concludes this article on crawling this site's ebook information with Python and storing it in a database. For more on the topic, search CFSDN's articles or keep browsing the related posts, and I hope you'll keep supporting my blog!
