Complete Scrapy Crawler Examples


This article introduces the Scrapy framework through working examples. It shares two projects: douban, which scrapes text from Douban, and douban_imgs, which downloads images. The details follow.

Example 1: douban

Directory tree

douban
-- douban
   -- spiders
      -- __init__.py
      -- bookspider.py
      -- douban_comment_spider.py
      -- doumailspider.py
   -- __init__.py
   -- items.py
   -- pipelines.py
   -- settings.py
-- scrapy.cfg
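
The layout above is what scrapy startproject douban generates, plus the three spider files. As a minimal sketch (this runner file is not part of the original project; it simply mirrors the run_spider.py approach used in example 2), the book spider can be started and its items exported like this:

from scrapy import cmdline

# run the douban-book spider and export the scraped items to a JSON feed
cmdline.execute('scrapy crawl douban-book -o books.json'.split(' '))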

spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

bookspider.py

# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from douban.items import DoubanBookItem


class BookSpider(scrapy.Spider):
    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250'
    ]

    def parse(self, response):
        # request the first page
        yield scrapy.Request(response.url, callback=self.parse_next)

        # request the remaining pages
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = page.xpath('@href').extract()[0]
            yield scrapy.Request(link, callback=self.parse_next)

    def parse_next(self, response):
        for item in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
            book['content'] = item.xpath('td[2]/p/text()').extract()[0]
            book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
            yield book

douban_comment_spider.py

# -*- coding:utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    # fill in your own Douban account below
    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # if a captcha appears, it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url)
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # do not deduplicate this request

    def parse_next_page(self, response):
        print response.status
        try:
            next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0])
            print "next page"
            print next_url
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment_url,
                                 dont_filter=True)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_next_page,
                                 dont_filter=True)
        except:
            print "Next page Error"
            return

    def parse_comment_url(self, response):
        print response.status
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
            print comment_title
            print comment_url
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        print response.status
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]

            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print "data_type: " + data_type
            if data_type == '0':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
            elif data_type == '1':
                comment['comment'] = "\t#####\t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))
            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            # print comment
            yield comment

doumailspider.py

# -*- coding:utf-8 -*-
'''by sudo rm -rf  http://imchenkun.com'''
import scrapy
from faker import Factory
from douban.items import DoubanMailItem
import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    # fill in your own Douban account below
    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        # if a captcha appears, it has to be solved manually
        if 'captcha_image' in response.body:
            print 'Copy the link:'
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print link
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(response,
                                                 formdata=self.formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login
                                                 )]

    def after_login(self, response):
        print response.status
        self.headers['Host'] = "www.douban.com"
        return scrapy.Request(url='https://www.douban.com/doumail/',
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_mail)

    def parse_mail(self, response):
        print response.status
        for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
            mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
            mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
            mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
            print mail
            yield mail

__init__.py

(This file contains no code.)

items.py

# -*- coding: utf-8 -*-
import scrapy


class DoubanBookItem(scrapy.Item):
    name = scrapy.Field()              # book title
    price = scrapy.Field()             # price
    edition_year = scrapy.Field()      # year of publication
    publisher = scrapy.Field()         # publisher
    ratings = scrapy.Field()           # rating
    author = scrapy.Field()            # author
    content = scrapy.Field()


class DoubanMailItem(scrapy.Item):
    sender_time = scrapy.Field()       # time sent
    sender_from = scrapy.Field()       # sender
    url = scrapy.Field()               # URL of the Douban mail
    title = scrapy.Field()             # mail subject


class DoubanMovieCommentItem(scrapy.Item):
    useful_num = scrapy.Field()        # how many people found the review useful
    no_help_num = scrapy.Field()       # how many people found the review not helpful
    people = scrapy.Field()            # reviewer
    people_url = scrapy.Field()        # reviewer's page
    star = scrapy.Field()              # rating
    comment = scrapy.Field()           # review text
    title = scrapy.Field()             # review title
    comment_page_url = scrapy.Field()  # URL of the current page

pipelines.py

# -*- coding: utf-8 -*-


class DoubanBookPipeline(object):
    def process_item(self, item, spider):
        # e.g. content = '[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'
        info = item['content'].split(' / ')
        item['name'] = item['name']
        item['price'] = info[-1]
        item['edition_year'] = info[-2]
        item['publisher'] = info[-3]
        return item


class DoubanMailPipeline(object):
    def process_item(self, item, spider):
        # strip spaces and newlines from the mail subject
        item['title'] = item['title'].replace(' ', '').replace('\n', '')
        return item


class DoubanMovieCommentPipeline(object):
    def process_item(self, item, spider):
        return item
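
For reference, here is a minimal sketch (not part of the project) of the split that DoubanBookPipeline performs on the content string shown in the comment above:

# worked example of DoubanBookPipeline's split logic
content = '[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元'
info = content.split(' / ')
price = info[-1]         # '22.00元'
edition_year = info[-2]  # '2003-8'
publisher = info[-3]     # '人民文学出版社'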

settings.py

# -*- coding: utf-8 -*-
 
# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#   http://doc.scrapy.org/en/latest/topics/settings.html
#   http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#   http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 
BOT_NAME = 'douban'
 
SPIDER_MODULES = [ 'douban.spiders' ]
NEWSPIDER_MODULE = 'douban.spiders'
 
 
# Crawl responsibly by identifying yourself (and your website) on the user-agent
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()
 
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
 
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
 
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
 
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
 
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
 
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Host': 'book.douban.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}
#DEFAULT_REQUEST_HEADERS = {
#  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#  'Accept-Language': 'en',
#}
 
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#  'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}
 
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#  'douban.middlewares.MyCustomDownloaderMiddleware': 543,
#}
 
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#  'scrapy.extensions.telnet.TelnetConsole': None,
#}
 
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'douban.pipelines.DoubanBookPipeline': 300,
    #'douban.pipelines.DoubanMailPipeline': 600,
    'douban.pipelines.DoubanMovieCommentPipeline': 900,
}
 
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
 
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
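
Note that ITEM_PIPELINES above only enables DoubanMovieCommentPipeline. When running the douban-book or douban-mail spiders, you would presumably enable the matching pipeline instead, for example:

# enable the book pipeline when crawling with douban-book
ITEM_PIPELINES = {
    'douban.pipelines.DoubanBookPipeline': 300,
}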

scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
 
[settings]
default = douban.settings
 
[deploy]
#url = http://localhost:6800/
project = douban

Example 2: douban_imgs

Directory tree

douban_imgs
-- douban
   -- spiders
      -- __init__.py
      -- download_douban.py
   -- __init__.py
   -- items.py
   -- pipelines.py
   -- run_spider.py
   -- settings.py
-- scrapy.cfg
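
This project hands image downloading to Scrapy's built-in ImagesPipeline (subclassed in pipelines.py below), which requires the Pillow library for image processing. The settings that drive it, shown in full in settings.py later, boil down to roughly this sketch:

# pip install Pillow  (needed by scrapy.pipelines.images.ImagesPipeline)
ITEM_PIPELINES = {
    'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
}
IMAGES_STORE = 'D:\\doubanimgs'  # directory where downloaded images are saved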

spiders/__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

download_douban.py

# coding=utf-8
from scrapy.spiders import Spider
import re
from scrapy import Request
from douban_imgs.items import DoubanImgsItem


class download_douban(Spider):
    name = 'download_douban'

    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        self.allowed_domains = ['douban.com']
        self.start_urls = [
            'http://www.douban.com/photos/album/%s/' % (url)]
        self.url = url
        # call the parent constructor if needed
        # super(download_douban, self).__init__(*args, **kwargs)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item
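
Because __init__ accepts a url argument (the album id, defaulting to 1638835355), a different album can be crawled by passing the id on the command line with -a. A minimal sketch, mirroring run_spider.py below:

from scrapy import cmdline

# crawl a specific album by overriding the spider's url argument (album id)
cmdline.execute('scrapy crawl download_douban -a url=1638835355'.split(' '))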

__init__.py

(This file contains no code.)

items.py

# -*- coding: utf-8 -*-
 
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
 
import scrapy
from scrapy import Item, Field
 
 
class DoubanImgsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = Field()
    images = Field()
    image_paths = Field()

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
from scrapy import log


class DoubanImgsPipeline(object):
    def process_item(self, item, spider):
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
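
For reference, the results argument that ImagesPipeline passes to item_completed is a list of (success, info) tuples, roughly like the illustrative sketch below (the values are made up):

# illustrative shape of results in item_completed:
# [(True,  {'url': 'https://img3.doubanio.com/.../p2370443040.jpg',
#           'path': 'full/0a79c461a4062c35d9d00aef2dd5ed3a.jpg',
#           'checksum': 'b9628c4ab9b595f72f280b90aec4b69e'}),
#  (False, Failure(...))]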

run_spider.py

from scrapy import cmdline

cmd_str = 'scrapy crawl download_douban'
cmdline.execute(cmd_str.split(' '))

settings.py

# -*- coding: utf-8 -*-
 
# Scrapy settings for douban_imgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#   http://doc.scrapy.org/en/latest/topics/settings.html
#   http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#   http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 
BOT_NAME = 'douban_imgs'
 
SPIDER_MODULES = [ 'douban_imgs.spiders' ]
NEWSPIDER_MODULE = 'douban_imgs.spiders'
 
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'
 
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS=32
 
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16
 
# Disable cookies (enabled by default)
# COOKIES_ENABLED=False
 
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False
 
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#  'Accept-Language': 'en',
# }
 
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#  'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,
# }
 
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#  'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,
# }
 
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#  'scrapy.telnet.TelnetConsole': None,
# }
 
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
}
 
IMAGES_STORE = 'D:\\doubanimgs'
#IMAGES_STORE = '/tmp'
 
IMAGES_EXPIRES = 90
 
 
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False
 
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'

scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
 
[settings]
default = douban_imgs.settings
 
[deploy]
#url = http://localhost:6800/
project = douban_imgs

Summary

That covers the complete Scrapy crawler examples presented in this article. Hopefully it is helpful; if anything is missing or wrong, corrections are welcome.

Original article: http://blog.csdn.net/nnnnnnnnnnnny/article/details/54426779
