A complete Scrapy crawler example
This article demonstrates how to use the Scrapy framework through two complete example projects: douban, which scrapes text from Douban, and douban_imgs, which downloads images. The details are as follows.
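A note that is not stated in the original article but is visible in the code: both projects are written in Python 2 style (print statements, raw_input, the urlparse module) and rely on Scrapy plus the third-party faker package, which generates fake User-Agent headers. A minimal check that the dependency is in place, assuming scrapy and faker have been installed with pip:

# Python 2 style, matching the article's code.
from faker import Factory

f = Factory.create()
print f.user_agent()   # prints a randomly generated browser User-Agent string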
Directory tree

douban
--douban
----spiders
------__init__.py
------bookspider.py
------douban_comment_spider.py
------doumailspider.py
----__init__.py
----items.py
----pipelines.py
----settings.py
--scrapy.cfg
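This layout is what Scrapy's project generator produces, with the three spider files added by hand afterwards. A sketch of generating the skeleton from Python, in the same style as the run_spider.py script shown in the second project (an assumption for illustration; the usual way is simply to run the scrapy command in a shell):

# Creates the "douban" project skeleton shown above.
from scrapy import cmdline

cmdline.execute('scrapy startproject douban'.split(' '))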
spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
bookspider.py

# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from douban.items import DoubanBookItem


class BookSpider(scrapy.Spider):
    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = ['https://book.douban.com/top250']

    def parse(self, response):
        # Request the first page
        yield scrapy.Request(response.url, callback=self.parse_next)

        # Request the remaining pages
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = page.xpath('@href').extract()[0]
            yield scrapy.Request(link, callback=self.parse_next)

    def parse_next(self, response):
        for item in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
            book['content'] = item.xpath('td[2]/p/text()').extract()[0]
            book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
            yield book
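To try this spider, a runner in the same style as the run_spider.py script from the second project can be used; the -o option is Scrapy's standard feed export and is an addition here, not part of the original project. Run it from the project root (the directory containing scrapy.cfg):

# Hypothetical runner: crawl the Top 250 book list and export the yielded
# DoubanBookItem objects to a JSON file.
from scrapy import cmdline

cmdline.execute('scrapy crawl douban-book -o books.json'.split(' '))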
douban_comment_spider.py

# -*- coding:utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = ['https://www.douban.com/']

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': 'your email',
        'form_password': 'your password',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',  # the login button value expected by Douban's form
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # If a captcha appears it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url)
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # do not deduplicate this request

    def parse_next_page(self, response):
        print response.status
        try:
            next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0])
            print "next page"
            print next_url
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment_url,
                                 dont_filter=True)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_next_page,
                                 dont_filter=True)
        except:
            print "Next page Error"
            return

    def parse_comment_url(self, response):
        print response.status
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
            print comment_title
            print comment_url
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        print response.status
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]

            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print "data_type: " + data_type
            if data_type == '0':
                comment['comment'] = "\t#####\t".join(
                    map(lambda x: x.strip(),
                        item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
            elif data_type == '1':
                comment['comment'] = "\t#####\t".join(
                    map(lambda x: x.strip(),
                        item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))
            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            # print comment
            yield comment
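One fragility worth noting: the spiders read every field with .extract()[0], which raises IndexError as soon as an XPath matches nothing (for example when Douban changes its markup). Newer Scrapy versions expose the same idea as extract_first(); a small defensive wrapper, offered here only as a suggestion and not part of the original code, looks like this:

# Hypothetical helper: return a default instead of crashing when an XPath has no match.
def first_or_default(selector, xpath, default=''):
    values = selector.xpath(xpath).extract()
    return values[0] if values else default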
doumailspider.py

# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from faker import Factory
from douban.items import DoubanMailItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = ['https://www.douban.com/']

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    formdata = {
        'form_email': 'your email',
        'form_password': 'your password',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',  # the login button value expected by Douban's form
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # If a captcha appears it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        return scrapy.Request(url='https://www.douban.com/doumail/',
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_mail)

    def parse_mail(self, response):
        print response.status
        for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
            mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
            mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
            mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
            print mail
            yield mail
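Both login-based spiders rely on the same pattern: the cookiejar meta key set in start_requests is copied onto every follow-up request so the logged-in session survives across callbacks. A stripped-down sketch of that pattern (a hypothetical helper, not part of the original code):

import scrapy

# Build a follow-up request that stays in the same login session as "response".
def follow_with_session(response, url, callback):
    return scrapy.Request(url=url,
                          meta={'cookiejar': response.meta['cookiejar']},
                          callback=callback)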
__init__.py

(This file contains no code.)
items.py

# -*- coding: utf-8 -*-
import scrapy


class DoubanBookItem(scrapy.Item):
    name = scrapy.Field()            # book title
    price = scrapy.Field()           # price
    edition_year = scrapy.Field()    # year of publication
    publisher = scrapy.Field()       # publisher
    ratings = scrapy.Field()         # rating
    author = scrapy.Field()          # author
    content = scrapy.Field()


class DoubanMailItem(scrapy.Item):
    sender_time = scrapy.Field()     # time the mail was sent
    sender_from = scrapy.Field()     # sender
    url = scrapy.Field()             # URL of the mail detail page
    title = scrapy.Field()           # mail title


class DoubanMovieCommentItem(scrapy.Item):
    useful_num = scrapy.Field()        # number of "useful" votes
    no_help_num = scrapy.Field()       # number of "not helpful" votes
    people = scrapy.Field()            # reviewer
    people_url = scrapy.Field()        # reviewer's page
    star = scrapy.Field()              # rating
    comment = scrapy.Field()           # review text
    title = scrapy.Field()             # review title
    comment_page_url = scrapy.Field()  # URL of the current review page
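These item classes behave like dictionaries restricted to the declared fields; assigning to an undeclared key raises KeyError. A quick illustration (not from the article, runnable inside the project):

from douban.items import DoubanBookItem

book = DoubanBookItem()
book['name'] = 'The Little Prince'   # fine: "name" is a declared Field
print book
# book['isbn'] = '123'               # would raise KeyError: unsupported field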
pipelines.py

# -*- coding: utf-8 -*-


class DoubanBookPipeline(object):
    def process_item(self, item, spider):
        # e.g. "[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元"
        info = item['content'].split(' / ')
        item['name'] = item['name']
        item['price'] = info[-1]
        item['edition_year'] = info[-2]
        item['publisher'] = info[-3]
        return item


class DoubanMailPipeline(object):
    def process_item(self, item, spider):
        item['title'] = item['title'].replace(' ', '').replace('\\n', '')
        return item


class DoubanMovieCommentPipeline(object):
    def process_item(self, item, spider):
        return item
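The book pipeline depends on the position of fields inside the content string, which is why it indexes from the end. A worked example using the sample string from the comment in DoubanBookPipeline:

# -*- coding: utf-8 -*-
info = u'[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'.split(' / ')
print info[-1]   # "22.00元"         -> price
print info[-2]   # "2003-8"          -> edition_year
print info[-3]   # "人民文学出版社"   -> publisher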
settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'book.douban.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'douban.pipelines.DoubanBookPipeline': 300,
    #'douban.pipelines.DoubanMailPipeline': 600,
    'douban.pipelines.DoubanMovieCommentPipeline': 900,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
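As configured, only DoubanMovieCommentPipeline is active, so the book and mail spiders will emit raw items without the extra post-processing. To run the book spider with its own pipeline, swap the active entry in ITEM_PIPELINES (a sketch; the numbers are only relative priorities within the 0-1000 range):

ITEM_PIPELINES = {
    'douban.pipelines.DoubanBookPipeline': 300,
    # 'douban.pipelines.DoubanMailPipeline': 600,
    # 'douban.pipelines.DoubanMovieCommentPipeline': 900,
}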
scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban.settings

[deploy]
#url = http://localhost:6800/
project = douban
Directory tree (the inner package is named douban_imgs, as the imports and settings below require)

douban_imgs
--douban_imgs
----spiders
------__init__.py
------download_douban.py
----__init__.py
----items.py
----pipelines.py
----run_spider.py
----settings.py
--scrapy.cfg
spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
download_douban.py

# coding=utf-8
from scrapy.spiders import Spider
import re
from scrapy import Request
from douban_imgs.items import DoubanImgsItem


class download_douban(Spider):
    name = 'download_douban'

    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        self.allowed_domains = ['douban.com']
        self.start_urls = ['http://www.douban.com/photos/album/%s/' % (url)]
        self.url = url
        # call the father base function
        # super(download_douban, self).__init__(*args, **kwargs)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item
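Because __init__ takes the album id as a url argument (defaulting to 1638835355), a different Douban album can be selected from the command line with Scrapy's standard -a spider-argument option. A runner sketch in the style of run_spider.py below, run from the project root:

from scrapy import cmdline

# Crawl a specific album; "-a url=<album id>" overrides the default set in __init__.
cmdline.execute('scrapy crawl download_douban -a url=1638835355'.split(' '))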
__init__.py

(This file contains no code.)
items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Item, Field


class DoubanImgsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = Field()
    images = Field()
    image_paths = Field()
pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
from scrapy import log


class DoubanImgsPipeline(object):
    def process_item(self, item, spider):
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
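item_completed receives one (success, info) pair per requested image, where info['path'] is the location of the stored file relative to IMAGES_STORE. The list comprehension in the pipeline is equivalent to this small helper, restated here only for clarity; note that ImagesPipeline also requires the Pillow library to be installed:

# Keep only the storage paths of images that downloaded successfully.
def collect_paths(results):
    return [info['path'] for ok, info in results if ok]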
run_spider.py

from scrapy import cmdline

cmd_str = 'scrapy crawl download_douban'
cmdline.execute(cmd_str.split(' '))
settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for douban_imgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban_imgs'

SPIDER_MODULES = ['douban_imgs.spiders']
NEWSPIDER_MODULE = 'douban_imgs.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
# COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
}

IMAGES_STORE = 'D:\\doubanimgs'
#IMAGES_STORE = '/tmp'

IMAGES_EXPIRES = 90

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban_imgs.settings

[deploy]
#url = http://localhost:6800/
project = douban_imgs
Summary

That is all this article has to cover on complete Scrapy crawler examples; I hope it is helpful. If anything is missing or incorrect, feel free to point it out in a comment.

Original article: http://blog.csdn.net/nnnnnnnnnnnny/article/details/54426779