gpt4 book ai didi

javascript - 使用 Nodejs 进行网页抓取

转载 作者:太空宇宙 更新时间:2023-11-04 03:06:49 27 4
gpt4 key购买 nike

我创建了一个简单的网络抓取工具，可以从该网站提取文章标题和 URL：http://espn.go.com/college-football/。但是，抓取工具仅返回 46-50 篇文章，而不是网站上的所有文章。我尝试更改 cheerio 使用的 CSS 选择器，但它抓取的文章数量没有任何变化。这是我正在使用的代码：

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var mongo = require('mongoskin');
// Handle to the local MongoDB "test" database used to persist scraped articles.
var db = mongo.db("mongodb://localhost:27017/test", { native_parser: true });


// Page to scrape. Declared with `var` so it is a module-local binding rather
// than an implicit global (which would also throw in strict mode).
var url = 'http://espn.go.com/college-football/';

/**
 * Record describing one scraped article. Intended to be called with `new`.
 *
 * @constructor
 * @param {string} title - Headline text; stored under the `Title` key.
 * @param {string} link - Article URL; stored under the `link` key.
 */
function Headline(title, link) {
    // Key order (Title, then link) is kept so serialized output is unchanged.
    Object.assign(this, { Title: title, link: link });
}

/**
 * Download the page at `url`, collect every headline link found inside
 * `#news-feed-content`, then write the list to `espn_articles.json` and
 * insert it into the `articles` collection.
 *
 * Only the HTML returned by the initial request is parsed; anything the page
 * adds later via client-side scripts is not part of `html`.
 */
request(url, function (error, response, html) {
    // Without a response body there is nothing to parse, and `result` would
    // be undefined for both the file write and the database insert.
    if (error) {
        console.error('Request for ' + url + ' failed:', error);
        return;
    }

    var $ = cheerio.load(html);
    var result = [];

    // Grab the articles titles/url
    $('.text-container h1 a.realStory', '#news-feed-content').each(function (i, elem) {
        var title = $(elem).text();
        var href = elem.attribs.href;
        console.log(title, href);
        result.push(new Headline(title, href));
    });

    fs.writeFile('espn_articles.json', JSON.stringify(result, null, 4), function (writeError) {
        // Report a failed write instead of claiming success unconditionally.
        if (writeError) {
            console.error('Could not write espn_articles.json:', writeError);
            return;
        }

        console.log('File successfully written! - Check your project directory for the espn_articles.json file');
    });

    // Nothing matched the selector: skip the database round trip entirely.
    if (result.length === 0) {
        console.log('No articles found; nothing to save.');
        return;
    }

    db.collection('articles').insert(result, function (insertError, record) {
        if (insertError) throw insertError;
        console.log("data saved");
    });
});

最佳答案

这是一个使用 Osmosis 的示例。

// Scrape each news-feed entry, follow its link to the article page, and store
// the merged record in the `articles` collection.
// NOTE(review): this snippet assumes `osmosis` has been required and `db` is
// an open database handle — neither is defined here.
osmosis('http://espn.go.com/college-football/')
    // One context per teaser block in the news feed.
    .find('#news-feed-content .text-container')
    // Fields read from the teaser itself.
    .set({
        author: '.author',
        category: '.category-link',
        title: '.realStory',
        link: '.realStory@href',
        blurb: 'p'
    })
    // Load the full article page to pick up the remaining fields.
    .follow('.realStory@href')
    .set({
        date: '.article-meta @data-date',
        images: [ 'picture @srcset' ],
        content: '.article-body'
    })
    .data(function (article) {
        /*
        { author: '...',
        category: '...',
        title: 'Harbaugh, Michigan reel in Florida OL Herbert',
        link: '...',
        blurb: 'Jim Harbaugh and Michigan have landed another recruit from SEC country in Kai-Leon Herbert of Florida.',
        date: '2016-07-06T17:25:09Z',
        images: [ '...', '...' ],
        content: '...'
        }
        */

        db.collection('articles').insert(article, function (error, record) {
            // Surface failed inserts instead of dropping them silently.
            if (error) {
                console.error('Failed to save article:', error);
                return;
            }
            // ...
        });
    })
    .log(console.log)
    .error(console.log)
    .debug(console.log);

关于javascript - 使用 Nodejs 进行网页抓取,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/38061278/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com