gpt4 book ai didi

javascript - 迭代node.js请求函数

转载 作者:行者123 更新时间:2023-11-30 16:08:36 26 4
gpt4 key购买 nike

这个问题是关于 node.js 中的爬虫的。爬虫从给定的 start_url 开始抓取页面上的 URL,并将它们“推送”到一个 .json 文件 (output.json) 中。目前,它只用 start_url 运行一次请求函数,并把收集到的 URL 保存到 output.json。我希望它能继续使用这些已保存的 URL:用收集到的第一个 URL 替换 start_url,再次收集链接……如此循环下去。

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

// Seed URLs; each entry is fetched once by the loop at the bottom.
var start_url = ["http://stackoverflow.com/"];

/**
 * Fetches `url`, collects the href attribute of every <a> element in the
 * returned HTML and writes the collected list to "output.json", replacing
 * the file's previous contents.
 *
 * @param {string} url - Address of the page to fetch.
 */
var req = function(url){
    request(url, function(error, response, html){
        // A failed request gives no usable document to parse; report it and
        // leave output.json untouched.
        if (error) {
            console.log(error);
            return;
        }

        var $ = cheerio.load(html);
        var data = [];

        $("a").each(function() {
            var link = $(this);
            // One record per anchor, shaped as {exurl: [href]}.
            var exurls = {exurl: [link.attr("href")]};

            data.push(exurls);

            // TODO: Queue "exurls" for "start_url" and call the same function with the new URL (endless loop)
            // TODO: save to "output.json" from time to time, so you can stop it anytime
        });

        fs.writeFile("output.json", JSON.stringify(data, null, 4), function(err){
            if (err) {
                console.log(err);
            } else {
                console.log("File successfully written!");
            }
        });
    });
};

for (var i = 0; i < start_url.length; i++){
    req(start_url[i]);
}

最佳答案

所以你可以做的是递归地调用函数。下面的例子应该有效:

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

// Crawl queue: starts with the seed URL and grows as links are discovered.
var start_url = ["http://stackoverflow.com/"];

// Index in start_url of the page currently being crawled. It lives outside
// req so that it persists across the chained calls; declared inside req it
// would restart at 0 each time and the crawler would never advance.
var count = 0;

/**
 * Fetches `url`, appends the href of every <a> element on the page to
 * `start_url`, saves the whole queue to "output.json", then moves on to the
 * next queued URL. The chain ends only when the queue has no further entry.
 *
 * hrefs are queued verbatim; they are not resolved against the page URL and
 * not de-duplicated.
 *
 * @param {string} url - Address of the page to fetch.
 */
var req = function(url){
    request(url, function(error, response, html){
        if (error) {
            // Report the failure but keep crawling: one bad URL must not
            // stop the whole chain.
            console.log(error);
        } else {
            var $ = cheerio.load(html);

            $("a").each(function() {
                var href = $(this).attr("href");

                // Queue the URL string itself (not a wrapper object), since
                // queue entries are later handed to request() as the URL.
                if (href) {
                    start_url.push(href);
                }
            });

            // Persist after every page so the crawl can be stopped anytime.
            try {
                fs.writeFileSync("output.json", JSON.stringify(start_url, null, 4));
                console.log("File successfully written!");
            } catch(err) {
                console.log(err);
            }
        }

        ++count;

        if (start_url.length > count) {
            req(start_url[count]);
        }
    });
};

req(start_url[0]);

上面这种做法的问题在于,每次都会完全重写整个文件。如果爬虫持续运行一段时间,您将耗尽内存。另一种选择是创建写入流:

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

// Crawl queue: the head is the page being crawled; discovered links are
// appended to the tail.
var start_url = ["http://stackoverflow.com/"];

// Links are appended to the stream as they are found instead of rewriting
// the whole file after every page.
var wstream = fs.createWriteStream("output.json");

/**
 * Fetches `url`, then for every <a> element on the page appends its href to
 * the `start_url` queue and writes it to the output stream as a quoted,
 * comma-terminated string. Afterwards the crawled URL is removed from the
 * head of the queue and the next one is fetched; the stream is closed once
 * the queue is empty.
 *
 * hrefs are queued verbatim; they are not resolved against the page URL and
 * not de-duplicated.
 *
 * @param {string} url - Address of the page to fetch.
 */
var req = function(url){
    request(url, function(error, response, html){
        if (error) {
            // Report the failure but fall through so the queue still advances.
            console.log(error);
        } else {
            var $ = cheerio.load(html);

            $("a").each(function() {
                var href = $(this).attr("href");

                if (href) {
                    // Queue the URL string itself, since queue entries are
                    // later handed to request() as the URL.
                    start_url.push(href);

                    // JSON.stringify quotes and escapes the URL; concatenating
                    // a wrapper object would write "[object Object]" instead.
                    wstream.write(JSON.stringify(href) + ",");
                }
            });
        }

        // Drop the URL that was just processed; the new head is next.
        start_url.shift();
        if (start_url.length > 0) {
            return req(start_url[0]);
        }

        wstream.end();
    });
};

req(start_url[0]);

编辑:切换到基本队列以解决内存问题



关于javascript - 迭代node.js请求函数,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36625295/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com