An entry-level crawler: it only grabs book titles, descriptions, and download links, and stores them in a database.

Database utility class: DBUtil.py
```python
import pymysql


class DBUtils(object):

    def connDB(self):
        # open a database connection
        conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                               passwd='b6f3g2', db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):
        # execute an update or insert statement
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):
        # delete operation (not used in this demo)
        sta = 0
        for eachID in IDs.split(' '):
            sta += cur.execute("delete from students where Id=%d" % (int(eachID)))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):
        # run a select statement
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):
        # close the connection and release resources
        cur.close()
        conn.close()


if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
```
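One caveat with this utility class: callers must build complete SQL strings themselves, and the scraper below does so with `%` interpolation, which breaks on quote characters (the parser strips `'` from titles to compensate) and is open to SQL injection. pymysql can escape values itself when `execute` is given a parameter tuple; a minimal sketch, reusing the connection settings above (the `title`/`url` values are hypothetical):

```python
import pymysql

# Same connection settings as DBUtils.connDB above.
conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                       passwd='b6f3g2', db='yangsj', charset='utf8')
cur = conn.cursor()

# Hypothetical values; the embedded quote would break a %-formatted statement.
title = "It's a Book"
url = "http://example.com/book.html"

# %s placeholders are filled in and escaped by the driver, not by Python.
cur.execute("insert into book (bookName, bookUrl) values (%s, %s)", (title, url))
conn.commit()

cur.close()
conn.close()
```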
Book operations file: bookOpe.py
```python
import logging

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo

logging.basicConfig(level=logging.INFO)


class BookOperator(object):

    def __addBook(self, book):
        logging.info("add book:%s" % book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) values ('%s','%s','%s');"
                         % (book.bookName, book.downLoadUrl, book.mainInfo))
        dbUtil.exeUpdate(conn, cur, insertBookSql)
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):
        # fetch the id of the most recently inserted book
        logging.info("selectLastBookId")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):
        logging.info("add bookId:%s" % bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for downLoadinfo in downLoadInfos:
            insertBookDownLoadInfo = ("insert into book_down_url (bookId,downName,downUrl) values ('%s','%s','%s');"
                                      % (bookId, downLoadinfo.downName, downLoadinfo.downUrl))
            dbUtil.exeUpdate(conn, cur, insertBookDownLoadInfo)
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):
        # insert the book, look up its id, then insert its download links
        logging.info("add bookInfo:%s" % book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)


if __name__ == '__main__':
    bookope = BookOperator()
    book = Book("aaa", "yang", "cccc")
    book.addDownLoadUrl(DownLoadInfo("aaa.html", "book"))
    bookope.addBookInfo(book)
```
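Note that `__selectLastBookId` finds the new book's id by re-querying for the highest id, which can pick up the wrong row if two inserts happen concurrently. pymysql exposes the AUTO_INCREMENT id generated by the last insert on the cursor itself; a minimal sketch of that alternative, assuming the `DBUtils` class above:

```python
from DBUtil import DBUtils

dbUtil = DBUtils()
conn, cur = dbUtil.connDB()

# Hypothetical demo row.
cur.execute("insert into book (bookName, bookUrl, bookInfo) values (%s, %s, %s)",
            ("demo", "demo.html", "demo info"))
conn.commit()

# lastrowid is the AUTO_INCREMENT id of the row this cursor just inserted:
# no second query, no race against other writers.
bookId = cur.lastrowid
print(bookId)

dbUtil.connClose(conn, cur)
```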
Book information file: bookInfo.py
```python
class Book(object):
    # book information

    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo
        self.downLoadUrl = downLoadUrl
        self.bookName = bookName
        self.downLoadInfos = []

    def addDownLoadUrl(self, downloadInfo):
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        print("bookName :%s" % (self.bookName))


class DownLoadInfo(object):
    # download link information

    def __init__(self, downUrl, downName):
        self.downUrl = downUrl
        self.downName = downName

    def print_down_info(self):
        print("downLoad %s - %s" % (self.downUrl, self.downName))
```
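Since `Book` takes its constructor arguments in the order `(mainInfo, downLoadUrl, bookName)`, which is easy to mix up, here is a short usage sketch with hypothetical values:

```python
from bookInfo import Book, DownLoadInfo

# Argument order is (mainInfo, downLoadUrl, bookName).
book = Book("A short description.", "http://example.com/book.html", "Example Book")
book.addDownLoadUrl(DownLoadInfo("http://example.com/mirror1.html", "mirror 1"))

book.print_book_info()        # bookName :Example Book
for info in book.downLoadInfos:
    info.print_down_info()    # downLoad http://example.com/mirror1.html - mirror 1
```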
51job page-parsing file: FiveOneJobFetch.py
```python
import logging

import requests
from bs4 import BeautifulSoup

from bookInfo import Book
from bookInfo import DownLoadInfo


class PageFetch(object):

    # site root; the blog platform stripped the scheme, so prepend
    # "http:" (or "https:") before running this against the live site
    host = "//www.zzvips.com/"
    # category path appended to the host
    category = "books/"

    def __init__(self, pageUrl):
        # the listing page to request, e.g. "list152_1.html"
        self.pageUrl = pageUrl
        # full URL
        self.url = PageFetch.host + PageFetch.category + pageUrl

    def __getPageContent(self):
        req = requests.get(self.url)
        if req.status_code == 200:
            req.encoding = "gb2312"
            return req.text
        else:
            return ""

    @staticmethod
    def getPageContent(url):
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"
            return req.text
        else:
            return ""

    def __getMaxPageNumAndUrl(self):
        # find the highest page number and its link;
        # paging URLs look like list45_2.html, where 2 is the page number
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                if alink[-1]['href'] == "#":
                    maxLink = alink[1]['href']
                else:
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        # build a paging URL of the form list45_2.html
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        # collect the URL of every listing page in this category
        shortPageList = []
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        for i in range(int(maxPageNum)):
            shortPageList.append(self.host + self.category + self.__formatPage(i))
        return shortPageList

    @staticmethod
    def getDownloadPage(url):
        # collect the detail-page URL of every book on one listing page
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        # parse one detail page into a Book with its download links
        logging.info("fetching book info from url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        mainInfo = soup.select("#soft-intro")[0].text.replace("截图:", "").replace("'", "")
        title = soup.select("dl dt h1")[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                downLoadInfo = DownLoadInfo(li.select("a")[0]['href'], li.select("a")[0].text)
                book.addDownLoadUrl(downLoadInfo)
        return book


if __name__ == '__main__':
    p = PageFetch("list152_1.html")
    shortPageList = p.getBookPageList()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    print("================ summary ===============================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s - %s" % (d.downUrl, d.downName))
    # p = PageFetch("list977_1.html")
    # p = p.getMaxPageNumAndUrl()
    # print(p)
```
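The paging arithmetic in `__formatPage` is plain string slicing around the `_` and `.` in the listing filename; a standalone sketch of the same idea, with example filenames:

```python
def format_page(page_url, page_num):
    # Keep everything up to and including the "_", insert the 1-based
    # page number, then keep the extension from the "." onward.
    line_begin = page_url.index("_") + 1
    doc_begin = page_url.index(".")
    return page_url[:line_begin] + str(page_num + 1) + page_url[doc_begin:]

print(format_page("list45_2.html", 0))  # list45_1.html
print(format_page("list45_2.html", 4))  # list45_5.html
```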
Entry script (copy the files above into the same folder and run this one): 51Job.py
```python
from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator


def main(url):
    # gather every listing page, then every book detail page, and store each book
    p = PageFetch(url)
    shortPageList = p.getBookPageList()
    bookOperator = BookOperator()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        bookOperator.addBookInfo(book)
    print("data fetched successfully: " + url)


if __name__ == '__main__':
    urls = ["list152_35.html", "list300_2.html", "list476_6.html",
            "list977_2.html", "list572_5.html", "list509_2.html",
            "list481_1.html", "list576_1.html", "list482_1.html",
            "list483_1.html", "list484_1.html"]
    for url in urls:
        main(url)
```
Database tables: the book information table and the download-link table.
```sql
CREATE TABLE `book` (
    `id` INT(11) NOT NULL AUTO_INCREMENT,
    `bookName` VARCHAR(200) NULL DEFAULT NULL,
    `bookUrl` VARCHAR(500) NULL DEFAULT NULL,
    `bookInfo` TEXT NULL,
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
```
```sql
CREATE TABLE `book_down_url` (
    `id` INT(11) NOT NULL AUTO_INCREMENT,
    `bookId` INT(11) NOT NULL DEFAULT '0',
    `downName` VARCHAR(200) NOT NULL DEFAULT '0',
    `downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;
```
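With both tables in place, a join on `bookId` reads the stored data back out, one row per (book, download link) pair; a minimal sketch, assuming the same connection settings as `DBUtils.connDB`:

```python
import pymysql

conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                       passwd='b6f3g2', db='yangsj', charset='utf8')
cur = conn.cursor()

# One row per (book, download link) pair.
cur.execute("""
    select b.bookName, u.downName, u.downUrl
    from book b
    join book_down_url u on u.bookId = b.id
    order by b.id
""")
for bookName, downName, downUrl in cur.fetchall():
    print("%s: %s - %s" % (bookName, downName, downUrl))

cur.close()
conn.close()
```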
Git repository: https://git.oschina.net/yangsj/BookFetch/tree/master