gpt4 book ai didi

javascript - NodeJS 的 Promise 爬虫的递归循环

转载 作者:行者123 更新时间:2023-12-01 01:28:52 27 4
gpt4 key购买 nike

我正在尝试使用递归循环和 promise 来抓取网站。但它失败了..它只对第一页发出请求,在第二页时程序停止向我发出未处理的 promise 拒绝警告

我有这三个 JS 文件:

  1. scrapeAll.js(是调用 scrapePage.js 的递归循环)
  2. scrapePage.js
  3. scrapeComponents.js

scrapeAll.js:

var indexPage = 0;

scrapePage(indexPage).then((json)=>{
console.log(JSON.stringify(json, null, 4));
if(indexPage === Number.MAX_SAFE_INTEGER){
console.log("MAX SAFE INTEGER");
return;
}
save(json);
indexpage++;
scrapePage(indexPage);
}).catch((data)=>{
console.log(data);
if(indexPage === Number.MAX_SAFE_INTEGER){
console.log("MAX SAFE INTEGER");
return;
}
indexPage++;
scrapePage(indexPage);
});

ScrapePage.JS

let makeRequestCounter = 0;


function scrapePage(number) {
return new Promise((resolve, reject) => {

let url = URL + number;

let options = {
url: url,
headers: {
Host: SITE,
Connection: "keep-alive",
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Language": "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7",
"Cache-Control": "max-age=0",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
"Cookie": restoreCookieToString()
}
};

makeRequest(options).then((jsonData) => {
resolve(jsonData);
}).catch((error) => {
//REQUEST_LIMIT_EXCEEDED
if (error === CONSTANTS.REQUEST_LIMIT_EXCEEDED) {
reject(CONSTANTS.REQUEST_LIMIT_EXCEEDED);
}

//ALREADY_EXIST
else if (error === CONSTANTS.ALREADY_EXIST) {
reject(CONSTANTS.ALREADY_EXIST);
}

else if (error === 404) {
reject("no data found at this page");
}

//error can beeconnrefused or econnreset
else if (error.code !== undefined) {

//econnrefused
if (error.code === CONSTANTS.ECONNREFUSED) {
reject("WRONG_URL", url);
}

//econnreset
else if (error.code === CONSTANTS.ECONNRESET) {
console.log("\neconnreset error\n");
makeRequest(options);
}

}
}
);
});
}

function makeRequest(options) {
return new Promise((resolve, reject) => {

let json = {
category: [],
imgs: [],
title: "",
description: "",
url: ""
};


if (makeRequestCounter === CONSTANTS.REQUEST_LIMIT) {
reject(CONSTANTS.REQUEST_LIMIT_EXCEEDED);
}

makeRequestCounter++;

console.log("request to: ", options.url);


request(options, function (error, response, html) {
if (error) {
//error: possible econnreset econnrefused
reject(error);

} else {

if (response.statusCode === 200) {
cookieSave(response.headers);


//---------- check if in db the url is already saved -------------//

check(response.request.uri.href, (err) => {
if (!err) {
reject(CONSTANTS.ALREADY_EXIST);
}
});

//----------finish checking, is new -------------------//


//GETTING TITLE

title(html, json_recipe).then((json) => {

//GETTING category

category(html, json).then((json) => {

//GETTING images

imgs(html, json).then((json) => {


description(html, json).then((json) => {


json.url = response.request.uri.href;

resolve(json);

//description error
}).catch((error) => {
console.log(error);
});

//images error
}).catch((error) => {
console.log(error);
});

//category error
}).catch((error) => {
console.log(error);
});
//title error
}
).catch((error) => {
console.log(error);
});
}

//no data in this page
if (response.statusCode === 404) {
reject(response.statusCode);
}
}

});
});
}

scrapeComponents.js

...

function description(html, json) {
return new Promise((resolve, reject) => {

const $ = cheerio.load(html);

let description = $('.submitter__description').text().trim();

json.description = JSON.parse(description);

resolve(json);

});

}
...

错误:

UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 1): no data found at this page

程序发出第一个请求并在 scrapeAll.js 正确返回 scrapePage(indexPage = 1)。第二次我的程序与第一次完全相同,但何时返回 scrapeAll.js (reject("no data found at this page"); 在 ScrapePage.js 中)程序以错误结束。两个页面都没有数据,但程序也失败,好的页面仅保存第一个。我认为我在 promise 方面犯了一个很大的错误。非常感谢你们。

最佳答案

您对 scrapPage 函数的调用仅运行一次,并且您不会迭代调用它。您可能必须使用函数在迭代中调用它。更新您的 scrapeAll.js:

    function callScrapPage() {
var indexPage = 0;
while (indexPage < Number.MAX_SAFE_INTEGER) {
scrapePage(indexPage).then((json) => {
console.log(JSON.stringify(json, null, 4));
save(json);
indexpage++;
}
}
}

关于javascript - NodeJS 的 Promise 爬虫的递归循环,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/53490980/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com