Our university web system has roughly 1,200 sites comprising several million pages. We have installed and configured StormCrawler on a machine that runs Apache locally, with a drive mapped to the web environment's file system. This means StormCrawler can crawl as fast as it is able to without generating any network traffic or affecting our public web presence. We also run the Tika parser so that .doc, .pdf and similar files get indexed. The Flux topology, crawler configuration, Elasticsearch configuration, and pom.xml we are using are shown below.
name: "www-all-crawler"
includes:
- resource: true
file: "/crawler-default.yaml"
override: false
- resource: false
file: "crawler-conf.yaml"
override: true
- resource: false
file: "es-conf.yaml"
override: true
spouts:
- id: "spout"
className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout"
parallelism: 10
bolts:
- id: "partitioner"
className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt"
parallelism: 1
- id: "fetcher"
className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt"
parallelism: 2
- id: "sitemap"
className: "com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt"
parallelism: 1
- id: "parse"
className: "com.digitalpebble.stormcrawler.bolt.JSoupParserBolt"
parallelism: 1
- id: "index"
className: "com.digitalpebble.stormcrawler.elasticsearch.bolt.IndexerBolt"
parallelism: 1
- id: "status"
className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.StatusUpdaterBolt"
parallelism: 1
- id: "status_metrics"
className: "com.digitalpebble.stormcrawler.elasticsearch.metrics.StatusMetricsBolt"
parallelism: 1
- id: "redirection_bolt"
className: "com.digitalpebble.stormcrawler.tika.RedirectionBolt"
parallelism: 1
- id: "parser_bolt"
className: "com.digitalpebble.stormcrawler.tika.ParserBolt"
parallelism: 1
streams:
- from: "spout"
to: "partitioner"
grouping:
type: SHUFFLE
- from: "spout"
to: "status_metrics"
grouping:
type: SHUFFLE
- from: "partitioner"
to: "fetcher"
grouping:
type: FIELDS
args: ["key"]
- from: "fetcher"
to: "sitemap"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "sitemap"
to: "parse"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "parse"
to: "index"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "fetcher"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "sitemap"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "parse"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "index"
to: "status"
grouping:
type: FIELDS
args: ["url"]
streamId: "status"
- from: "parse"
to: "redirection_bolt"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "redirection_bolt"
to: "parser_bolt"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "redirection_bolt"
to: "index"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "parser_bolt"
to: "index"
grouping:
type: LOCAL_OR_SHUFFLE
- from: "redirection_bolt"
to: "parser_bolt"
grouping:
type: LOCAL_OR_SHUFFLE
streamId: "tika"
# Custom configuration for StormCrawler
# This is used to override the default values from crawler-default.yaml and provide additional ones
# for your custom components.
# Use this file with the parameter -conf when launching your extension of ConfigurableTopology.
# This file does not contain all the key values but only the most frequently used ones.
# See crawler-default.yaml for an extensive list.

config:
  topology.workers: 2
  topology.message.timeout.secs: 300
  topology.max.spout.pending: 100
  topology.debug: false

  fetcher.threads.number: 50

  # give 2 GB to the workers
  worker.heap.memory.mb: 2048

  # mandatory when using Flux
  topology.kryo.register:
    - com.digitalpebble.stormcrawler.Metadata

  # metadata to transfer to the outlinks
  # used by Fetcher for redirections, sitemapparser, etc...
  # these are also persisted for the parent document (see below)
  # metadata.transfer:
  #  - customMetadataName

  # lists the metadata to persist to storage
  # these are not transferred to the outlinks
  metadata.persist:
    - _redirTo
    - error.cause
    - error.source
    - isSitemap
    - isFeed

  http.agent.name: "Storm Crawler"
  http.agent.version: "1.0"
  http.agent.description: "built with StormCrawler Archetype 1.13"
  http.agent.url: "http://example.com/"
  http.agent.email: "noreply@example"

  # The maximum number of bytes for returned HTTP response bodies.
  # The fetched page will be trimmed to roughly 2 MB in this case.
  # Set -1 to disable the limit.
  http.content.limit: 2000000

  jsoup.treat.non.html.as.error: false

  # FetcherBolt queue dump => comment out to activate
  # if a file exists on the worker machine with the corresponding port number
  # the FetcherBolt will log the content of its internal queues to the logs
  # fetcherbolt.queue.debug.filepath: "/tmp/fetcher-dump-{port}"

  parsefilters.config.file: "parsefilters.json"
  urlfilters.config.file: "urlfilters.json"

  # revisit a successfully fetched page every two days (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.default: 2880

  # revisit a page with a fetch error after 2 hours (value in minutes)
  # set it to -1 to never refetch a page
  fetchInterval.fetch.error: 120

  # revisit a page with an error (value in minutes), or set to -1 to never refetch it
  ### Currently set to check back in roughly one month.
  fetchInterval.error: 40320

  # text extraction for JSoupParserBolt
  textextractor.include.pattern:
    - DIV[id="block-edu-bootstrap-subtheme-content" class="block block-system block-system-main-block"]
    - MAIN[role="main"]
    - DIV[id="content--news"]
    - DIV[id="content--person"]
    - ARTICLE[class="node container node--type-facility facility-full node-101895 node--promoted node--view-mode-full py-5"]
    - ARTICLE[class="node container node--type-spotlight spotlight-full node-90543 node--promoted node--view-mode-full py-5"]
    - DIV[class="field field--name-field-content field--type-entity-reference-revisions field--label-hidden field__items"]
    - ARTICLE
    - BODY
  # - DIV[id="maincontent"]
  # - DIV[itemprop="articleBody"]
  # - ARTICLE

  textextractor.exclude.tags:
    - STYLE
    - SCRIPT
    - FOOTER

  # custom fetch interval to be used when a document has the key/value in its metadata
  # and has been fetched successfully (value in minutes)
  # fetchInterval.FETCH_ERROR.isFeed=true: 30
  # fetchInterval.isFeed=true: 10

  # configuration for the classes extending AbstractIndexerBolt
  # indexer.md.filter: "someKey=aValue"
  indexer.url.fieldname: "url"
  indexer.text.fieldname: "content"
  indexer.canonical.name: "canonical"
  indexer.md.mapping:
    - parse.title=title
    - parse.keywords=keywords
    - parse.description=description
    - domain=domain

  # Metrics consumers:
  topology.metrics.consumer.register:
    - class: "org.apache.storm.metric.LoggingMetricsConsumer"
      parallelism.hint: 1
# configuration for Elasticsearch resources
config:
  # ES indexer bolt
  # addresses can be specified as a full URL
  # if not we assume that the protocol is http and the port 9200
  es.indexer.addresses: "https://example.com:9200"
  es.indexer.index.name: "www-all-index"
  # es.indexer.pipeline: "_PIPELINE_"
  #### Check the document type thoroughly: it needs to match the Elasticsearch index mapping ####
  es.indexer.doc.type: "doc"
  es.indexer.user: "{username}"
  es.indexer.password: "{password}"
  es.indexer.create: false
  #### Change the cluster name ####
  es.indexer.settings:
    cluster.name: "edu-web"

  # ES metricsConsumer
  es.metrics.addresses: "https://example.com:9200"
  es.metrics.index.name: "www-all-metrics"
  #### Check the document type thoroughly: it needs to match the Elasticsearch index mapping ####
  es.metrics.doc.type: "datapoint"
  es.metrics.user: "{username}"
  es.metrics.password: "{password}"
  #### Change the cluster name ####
  es.metrics.settings:
    cluster.name: "edu-web"

  # ES spout and persistence bolt
  es.status.addresses: "https://example.com:9200"
  es.status.index.name: "www-all-status"
  #### Check the document type thoroughly: it needs to match the Elasticsearch index mapping ####
  es.status.doc.type: "status"
  es.status.user: "{username}"
  es.status.password: "{password}"
  # the routing is done on the value of 'partition.url.mode'
  es.status.routing: true
  # stores the value used for the routing as a separate field
  # needed by the spout implementations
  es.status.routing.fieldname: "metadata.hostname"
  es.status.bulkActions: 500
  es.status.flushInterval: "5s"
  es.status.concurrentRequests: 1
  #### Change the cluster name ####
  es.status.settings:
    cluster.name: "edu-web"

  ################
  # spout config #
  ################

  # positive or negative filter parsable by the Lucene Query Parser
  # es.status.filterQuery: "-(metadata.hostname:stormcrawler.net)"

  # time in secs for which the URLs will be considered for fetching after an ack or a fail
  spout.ttl.purgatory: 30

  # Min time (in msecs) to allow between 2 successive queries to ES
  spout.min.delay.queries: 1000

  # Delay since previous query date (in secs) after which the nextFetchDate value will be reset to the current time
  # Setting this to -1 or a large value means that ES will cache the results, but also that fewer and fewer results
  # might be returned.
  spout.reset.fetchdate.after: 120

  es.status.max.buckets: 50
  es.status.max.urls.per.bucket: 20
  # field to group the URLs into buckets
  es.status.bucket.field: "metadata.hostname"
  # field to sort the URLs within a bucket
  es.status.bucket.sort.field: "nextFetchDate"
  # field to sort the buckets
  es.status.global.sort.field: "nextFetchDate"

  # CollapsingSpout : limits the deep paging by resetting the start offset for the ES query
  es.status.max.start.offset: 500

  # AggregationSpout : sampling improves the performance on large crawls
  es.status.sample: false

  # AggregationSpout (expert): adds this value in mins to the latest date returned in the results and
  # uses it as nextFetchDate
  es.status.recentDate.increase: -1
  es.status.recentDate.min.gap: -1

  topology.metrics.consumer.register:
    - class: "com.digitalpebble.stormcrawler.elasticsearch.metrics.MetricsConsumer"
      parallelism.hint: 1
      #whitelist:
      #  - "fetcher_counter"
      #  - "fetcher_average.bytes_fetched"
      #blacklist:
      #  - "__receive.*"
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>www.all.edu</groupId>
<artifactId>www-all</artifactId>
<version>1.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<stormcrawler.version>1.13</stormcrawler.version>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.2</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.3.2</version>
<executions>
<execution>
<goals>
<goal>exec</goal>
</goals>
</execution>
</executions>
<configuration>
<executable>java</executable>
<includeProjectDependencies>true</includeProjectDependencies>
<includePluginDependencies>false</includePluginDependencies>
<classpathScope>compile</classpathScope>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>1.3.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.apache.storm.flux.Flux</mainClass>
<manifestEntries>
<Change></Change>
<Build-Date></Build-Date>
</manifestEntries>
</transformer>
</transformers>
<!-- The filters below are necessary if you want to include the Tika
module -->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
<filter>
<!-- https://issues.apache.org/jira/browse/STORM-2428 -->
<artifact>org.apache.storm:flux-core</artifact>
<excludes>
<exclude>org/apache/commons/**</exclude>
<exclude>org/apache/http/**</exclude>
<exclude>org/yaml/**</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>com.digitalpebble.stormcrawler</groupId>
<artifactId>storm-crawler-core</artifactId>
<version>${stormcrawler.version}</version>
</dependency>
<dependency>
<groupId>com.digitalpebble.stormcrawler</groupId>
<artifactId>storm-crawler-tika</artifactId>
<version>${stormcrawler.version}</version>
</dependency>
<dependency>
<dependency>
<groupId>com.digitalpebble.stormcrawler</groupId>
<artifactId>storm-crawler-elasticsearch</artifactId>
<version>${stormcrawler.version}</version>
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.2.2</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>flux-core</artifactId>
<version>1.2.2</version>
</dependency>
</dependencies>
</project>
Best answer
OK, so you are effectively dealing with a small number of distinct hostnames. You could really keep everything on a single ES shard with a single ES spout. The key point is that the fetcher enforces politeness per hostname, so by default the crawl will be relatively slow. You probably don't need more than one FetcherBolt instance either.
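Applied to the Flux file above, that advice would look roughly like this (a sketch, assuming the status index is kept on a single shard):

spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.elasticsearch.persistence.AggregationSpout"
    parallelism: 1   # one spout instance per status shard; a single shard needs only one

bolts:
  - id: "fetcher"
    className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt"
    parallelism: 1   # a single FetcherBolt is usually enough for a handful of hostnames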
Since you are crawling your own sites, you can afford to be more aggressive with the crawler and allow several fetch threads to pull from the same hostname at the same time. Try setting

fetcher.threads.per.queue: 25

and also retrieve more URLs from ES with each query:

es.status.max.urls.per.bucket: 200
This should make your crawl go much faster.
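In terms of the configuration files above, that amounts to overrides along these lines (a sketch; which file each key lives in is conventional since the included files are merged, and 25 and 200 are starting points to tune, not fixed recommendations):

# crawler-conf.yaml: allow several fetch threads per hostname queue
fetcher.threads.per.queue: 25

# es-conf.yaml: pull more URLs from the status index in each spout query
es.status.max.urls.per.bucket: 200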
By the way: if you can get your university listed at https://github.com/DigitalPebble/storm-crawler/wiki/Powered-By, please drop me an email.
Note for other readers: this is advisable only when you are crawling your own websites. Being aggressive towards third-party sites is impolite and counterproductive, as you risk getting blacklisted.
Regarding "elasticsearch - Optimal settings for StormCrawler -> Elasticsearch when crawl politeness is not an issue?", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/55281184/