gpt4 book ai didi

node.js - 如何使用 Puppeteer 抓取无限滚动网站

转载 作者:行者123 更新时间:2023-12-05 02:04:42 29 4
gpt4 key购买 nike

<分区>

我正在尝试抓取一个无限滚动的网站。

我已经在代码中控制页面滚动,但脚本仍然会在到达网页末尾后直接退出。

这是我的代码:

const puppeteer = require("puppeteer");

/**
 * Scrape product cards from an infinitely scrolling product listing page.
 *
 * Launches a visible (non-headless) browser, opens the listing under `url`,
 * scrolls down one viewport at a time until the document stops growing, then
 * collects the image URL and brand name of every `.product-wrap` element.
 *
 * The document height is re-measured after every scroll step. Measuring it
 * only once up front makes the loop stop at the end of the initially rendered
 * page, before any lazily loaded content has been appended.
 *
 * @param {string} url - Base URL; the listing path and query string are appended to it.
 * @param {Function} callBack - Called as `callBack(products, true)` once scraping is done.
 * @param {object} [options] - Optional tuning knobs.
 * @param {number} [options.scrollPauseMs=1600] - Pause after each scroll step so new content can load.
 * @param {number} [options.maxIdleRounds=3] - Consecutive rounds at the bottom without growth before stopping.
 * @param {number} [options.maxScrolls=500] - Hard cap on scroll steps, guarding against truly endless feeds.
 * @returns {Promise<void>} Resolves after the callback has been invoked and the browser closed.
 */
module.exports.scraper = async (url, callBack, options) => {
  const { scrollPauseMs = 1600, maxIdleRounds = 3, maxScrolls = 500 } =
    options || {};

  // Resolve after `ms` milliseconds.
  function wait(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  // Read the current document height and whether the viewport touches its end.
  function measure(page) {
    return page.evaluate(() => {
      const height = document.documentElement.scrollHeight;
      return {
        height,
        // 1px tolerance for fractional scroll positions.
        atBottom: window.innerHeight + window.scrollY >= height - 1,
      };
    });
  }

  // Scroll viewport by viewport until the page stops growing.
  async function scrollUntilNoNewContent(page) {
    const viewportHeight = page.viewport().height;
    let { height: lastHeight } = await measure(page);
    let idleRounds = 0;

    for (
      let scrolls = 0;
      scrolls < maxScrolls && idleRounds < maxIdleRounds;
      scrolls += 1
    ) {
      await page.evaluate((step) => {
        window.scrollBy(0, step);
      }, viewportHeight);
      await wait(scrollPauseMs);

      const { height, atBottom } = await measure(page);
      if (height > lastHeight) {
        // New content was appended: keep going and forget earlier idle rounds.
        lastHeight = height;
        idleRounds = 0;
      } else if (atBottom) {
        // At the end and nothing new arrived; retry a few times in case the
        // next batch is merely slow to load.
        idleRounds += 1;
      }
    }
  }

  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();

    await page.setUserAgent(
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    );
    await page.setViewport({ width: 1200, height: 768 });

    await page.goto(`${url}/products/?department=men&l2_category=polos-t-shirts`, {
      waitUntil: "networkidle0",
    });

    await scrollUntilNoNewContent(page);

    const data = await page.evaluate(() => {
      window.scrollTo(0, 0);
      const products = [];

      for (const productElement of document.querySelectorAll(".product-wrap")) {
        const productJson = {};
        try {
          productJson.imageUrl = productElement.querySelector(".renderedImg").src;
          productJson.brandName = productElement.querySelector(
            ".brand-name",
          ).innerText;
        } catch (e) {
          // A card missing one of the elements is still reported, with
          // whatever fields could be read; the error goes to the page console.
          console.log(e);
        }
        products.push(productJson);
      }
      return products;
    });

    await wait(100);
    callBack(data, true);
  } finally {
    // Always release the browser, even when navigation, scraping or the
    // callback throws; otherwise the Chromium process is left running.
    await browser.close();
  }
};

遇到这种情况怎么抓取?

29 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com