gpt4 book ai didi

javascript - 获取数据时如何迭代不同的URL?

转载 作者:行者123 更新时间:2023-12-02 22:24:35 29 4
gpt4 key购买 nike

我在一些评论者的帮助下更新了代码。简短摘要:- 我想从 800 多个页面中抓取包含有关产品信息的 HTML,并将该数据解析为 JSON 并将其保存在 JSON 文件中。当我确实喜欢一次 20 页时,该代码可以工作,但是当我尝试完成所有这些时,我收到以下错误:

Error: Max redirects exceeded.

这是完整的代码:

// Import required modules
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');

const url = "http://johndevisser.marktplaza.nl/?p=";

/**
 * Extract the inner HTML of every product card (div.item) from a page.
 * @param {string} data - Raw HTML of one listing page.
 * @returns {Promise<string[]>} Array of HTML strings, one per product.
 */
async function getProductsHtml(data) {
  // cheerio.load is synchronous — awaiting it was a no-op in the original.
  const $ = cheerio.load(data);
  const productsHTML = [];
  $("div.item").each((i, prod) => {
    productsHTML.push($(prod).html());
  });
  return productsHTML;
}

/**
 * Parse each product's HTML snippet into a metadata object.
 * @param {string[]} html - Per-product HTML strings from getProductsHtml.
 * @returns {Promise<Array<{title, mpUrl, imgUrl, price}>>}
 */
async function parseProducts(html) {
  const products = [];
  // for...of over values — the original `for (item in html)` iterated
  // indices AND leaked `item` as an implicit global.
  for (const itemHtml of html) {
    const $ = cheerio.load(itemHtml); // synchronous; no await needed
    const product = {};
    product["title"] = $("a").attr("title");
    product["mpUrl"] = $("a").attr("href");
    product["imgUrl"] = $("img").attr("src");
    // Price text looks like "€\xa012,34"; the part after the nbsp is the
    // number. Guard against a missing price so one odd card can't crash
    // the whole page parse (original threw on undefined.replace).
    const pricePart = $("span.subtext").text().split("\xa0")[1];
    product["price"] = pricePart ? parseFloat(pricePart.replace(",", ".")) : NaN;
    products.push(product);
  }
  return products;
}

/**
 * Fetch each product's own page and attach its description text.
 * Mutates and returns the same array.
 * @param {Array<{mpUrl: string}>} prods
 * @returns {Promise<Array>} The input array with `descr` added to each item.
 */
async function addDescriptionToProducts(prods) {
  for (const prod of prods) {
    // Catch per product: a single bad URL (e.g. a redirect loop raising
    // "Max redirects exceeded") must not abort the entire scrape.
    try {
      const response = await axios.get(prod["mpUrl"]);
      const $ = cheerio.load(response.data);
      // Original leaked `description` (and loop var `i`) as implicit globals.
      prod["descr"] = $("div.description p").text();
    } catch (e) {
      console.log(`Could not fetch description for ${prod["mpUrl"]}: ${e.message}`);
      prod["descr"] = "";
    }
  }
  return prods;
}

/**
 * Scrape one listing page: fetch it, extract product HTML, parse metadata,
 * and enrich each product with its description.
 * @param {number} i - 1-based page number.
 * @returns {Promise<Array>} Products from page i; [] when the page fails.
 */
async function getProductsFromPage(i) {
  try {
    // Reuse the module-level `url` constant instead of duplicating it.
    const page = await axios.get(`${url}${i}`);
    console.log("GET request succeeded!");
    // Get the Array with HTML of each product
    const productsHTML = await getProductsHtml(page.data);
    console.log("HTML array obtained!");
    // Get the Array of objects with meta info
    const productsParsed = await parseProducts(productsHTML);
    console.log("Products parsed!");
    // Add description to each product
    const productsMeta = await addDescriptionToProducts(productsParsed);
    console.log("Descriptions added!");
    // Return the Array with all product information
    return productsMeta;
  } catch (e) {
    console.log(`Page ${i} failed: ${e.message}`);
    // Return an empty array — the original returned undefined here, which
    // crashed the caller's spread (`...productsFromPage`).
    return [];
  }
}

/**
 * Scrape every page and persist the combined result to products.json.
 * Errors are logged, never thrown (top-level entry point).
 */
async function saveAllProducts() {
  try {
    const allProducts = await getAllProducts();
    // JSON.stringify is synchronous — the original awaited it needlessly.
    const jsonProducts = JSON.stringify(allProducts);
    fs.writeFile("products.json", jsonProducts, "utf8", (e) => {
      if (e) {
        console.log(e);
      }
    });
  } catch (e) {
    console.log(e);
  }
}

/**
 * Sequentially scrape pages 1..854 and collect all products.
 * @returns {Promise<Array>} Every product found across all pages.
 */
async function getAllProducts() {
  try {
    const allProducts = [];
    for (let i = 1; i < 855; i++) {
      const productsFromPage = await getProductsFromPage(i);
      // Guard against a failed page yielding undefined, and push in place —
      // `allProducts = [...allProducts, ...page]` re-copied the whole array
      // on every iteration (accidental O(n²) over 850+ pages).
      if (Array.isArray(productsFromPage)) {
        allProducts.push(...productsFromPage);
      }
      console.log("Saved products from page " + i);
    }
    return allProducts;
  } catch (e) {
    console.log(e);
  }
}

// Entry point: kick off the full scrape-and-save run (errors are handled inside).
saveAllProducts();

最佳答案

在尝试获取所有 800 个产品之前,我强烈建议您退后一步,看看您当前的代码。有一些因素使得运行此脚本 800 次变得更加困难。

  1. getProducts 接收页面 html 并将产品的 html 放入全局变量中。这称为副作用,它使代码比需要的更加复杂。
  2. parseProducts 接收产品 html 数组,但不使用它。相反,它使用全局变量。
  3. parseProducts 解析每个产品 html 并将元数据放入另一个全局变量中。
  4. fetchAndUpdateProducts 正在做两件事;解析页面并写入 json

由于这些原因,fetchAndUpdateProducts 中的流程变得有点不清楚,调试变得困难。

所以我建议从一种新方法开始,就像这样

// Example: one page's scrape as a single self-contained function.
// (Fixed from the original snippet: `async getProductsFromPage` is only
// valid inside a class — a standalone definition needs the `function`
// keyword — and the body referenced an undefined `response` instead of
// `page`. The catch also swallowed errors silently; at minimum, log them.)
async function getProductsFromPage(i) {
  try {
    const page = await axios.get(`http://johndevisser.marktplaza.nl/?p=${i}`);

    // Get the Array with HTML of each product
    const productsHTML = getProductsHtml(page.data);

    // Get the Array of objects with meta info
    const productsParsed = parseProducts(productsHTML);

    // Add description to each product
    const productsMeta = await addDescriptionToProducts(productsParsed);

    // Return the Array with all product information
    return productsMeta;
  } catch (e) {
    console.log(e);
  }
}

之后您将能够执行以下操作:

// NOTE: top-level `await` only works inside an async function (or an ES module
// on modern Node) — wrap these calls accordingly.
const p1 = await getProductsFromPage(1);
const p2 = await getProductsFromPage(2);
const p3 = await getProductsFromPage(3);
// etc.

甚至将所有数据合并到一个数组中:

let allProducts = [];

// Pages on this site are numbered from 1 (the question's own code loops
// 1..854) — starting at 0 would request a non-existent page.
for (let i = 1; i <= 800; i++) {
  const productsFromPage = await getProductsFromPage(i);
  allProducts = [...allProducts, ...productsFromPage];
}

// Write to JSON

关于javascript - 获取数据时如何迭代不同的URL?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59108124/

29 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com