gpt4 book ai didi

javascript - puppeteer:迭代 CSV 文件和每行的屏幕截图?

转载 作者:行者123 更新时间:2023-12-02 21:56:51 25 4
gpt4 key购买 nike

我想迭代 CSV 文件并使用 puppeteer 截取 CSV 文件中每一行的 URL。

我有以下代码,工作正常,但每个请求都会等待前一个请求完成,因此需要很长时间才能运行:

const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

// Sequential version: reads ids.csv row by row and takes one screenshot per
// row. The CSV stream is paused while a screenshot is in progress, so only one
// page is open at any time.
(async () => {
  const browser = await puppeteer.launch();

  /**
   * Opens the ad-library page for `rowId` and saves a screenshot of <body>.
   *
   * @param {string} rowId - value appended to the ad-library URL as `id`.
   * @param {string} path - file path the PNG is written to.
   * @returns {Promise<void>} rejects if navigation or the screenshot fails.
   */
  const getFile = async function (rowId, path) {
    const page = await browser.newPage();
    try {
      // setViewport is asynchronous; await it so the size is applied before navigation.
      await page.setViewport({ width: 1000, height: 1500, deviceScaleFactor: 1 });
      const url = `https://www.facebook.com/ads/library/?id=${rowId}`;
      await page.goto(url, { waitUntil: 'networkidle2' });
      await page.waitFor(3000);
      const body = await page.$('body');
      await body.screenshot({ path });
    } finally {
      // Always release the page, even when navigation or the screenshot throws.
      await page.close();
    }
  };

  const fname = 'ids.csv';
  const csvPipe = fs.createReadStream(fname).pipe(csv());

  // Promise of the row currently being processed; awaited before shutdown.
  let current = Promise.resolve();

  // Handles a single CSV row. pause() runs synchronously (before the first
  // await), so no further 'data' events are delivered until resume().
  const handleRow = async (row) => {
    const id = row.ad_id;
    console.log(id);
    const path = `./images/${id}.png`;
    csvPipe.pause();
    try {
      await getFile(id, path);
    } catch (err) {
      // Log and continue: one bad row must not leave the stream paused forever.
      console.log(err);
    } finally {
      csvPipe.resume();
    }
  };

  csvPipe.on('data', (row) => {
    current = handleRow(row);
  }).on('end', async () => {
    await current;
    console.log('CSV file successfully processed');
    // Close the browser so the Node process can exit.
    try {
      await browser.close();
    } catch (err) {
      console.log(err);
    }
  });
})();

如何使请求并行运行,以加快速度?

如果我删除 pause() 和 resume() 这两行,那么每次函数运行时我都会收到此错误:

(node:18610) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 14)
(node:18610) UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'screenshot' of null
at getFile (/Users/me/Dropbox/Projects/scrape/index.js:29:12)
at <anonymous>
at process._tickCallback (internal/process/next_tick.js:189:7)

最佳答案

这是一个并行运行用户可控数量的 getFile() 操作的方案。您将 maxInFlight 变量设置为要并行运行的页面数量(这可能只是您的内存使用情况或 facebook 可能应用的任何速率限制的问题)。您必须通过实验来决定将其设置为什么。我最初将其设置为 10,以允许 10 个页面同时“运行”。

这里的总体思路是:getFile() 通过递增/递减 inFlightCntr 来记录当前同时打开的页面数量,然后根据该计数器暂停或恢复 csvPipe。

const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

// Throttled version: rows are processed as they stream in, with at most
// `maxInFlight` pages open at once. The CSV stream is paused when the limit is
// reached and resumed as soon as a slot frees up.
(async () => {
  const browser = await puppeteer.launch();

  const maxInFlight = 10; // set this value to control how many pages run in parallel
  let inFlightCntr = 0; // number of getFile() calls currently running
  let paused = false; // true while csvPipe is paused because the limit was hit
  let ended = false; // true once the CSV stream has emitted 'end'
  let closing = false; // guards against closing the browser twice

  /**
   * Opens the ad-library page for `rowId` and saves a screenshot of <body>.
   * Errors are logged rather than thrown, so the returned promise never rejects.
   *
   * @param {string} rowId - value appended to the ad-library URL as `id`.
   * @param {string} path - file path the PNG is written to.
   * @returns {Promise<void>}
   */
  async function getFile(rowId, path) {
    // Incremented synchronously so the caller sees the new count right after the call.
    ++inFlightCntr;
    // Declared outside `try` so the cleanup code below can see it.
    let page;
    try {
      page = await browser.newPage();
      await page.setViewport({ width: 1000, height: 1500, deviceScaleFactor: 1 });
      const url = `https://www.facebook.com/ads/library/?id=${rowId}`;
      await page.goto(url, { waitUntil: 'networkidle2' });
      await page.waitFor(3000);
      const body = await page.$('body');
      await body.screenshot({ path });
    } catch (e) {
      console.log(e);
    } finally {
      try {
        // `page` is undefined when browser.newPage() itself failed.
        if (page) {
          await page.close();
        }
      } catch (e) {
        console.log(e);
      }
      --inFlightCntr;
    }
  }

  // Closes the browser once the CSV is fully read and no page is still open,
  // so the Node process can exit.
  async function closeBrowserIfDone() {
    if (!ended || inFlightCntr !== 0 || closing) {
      return;
    }
    closing = true;
    try {
      await browser.close();
    } catch (e) {
      console.log(e);
    }
  }

  const fname = 'ids.csv';
  const csvPipe = fs.createReadStream(fname).pipe(csv());
  csvPipe.on('data', (row) => {
    const id = row.ad_id;
    console.log(id);
    const path = `./images/${id}.png`;
    // getFile() never rejects, so .then() runs after every attempt.
    getFile(id, path).then(() => {
      if (paused && inFlightCntr < maxInFlight) {
        csvPipe.resume();
        paused = false;
      }
      return closeBrowserIfDone();
    });
    if (!paused && inFlightCntr >= maxInFlight) {
      csvPipe.pause();
      paused = true;
    }
  }).on('end', () => {
    ended = true;
    console.log('CSV file successfully processed');
    // Pages may still be in flight here; this only closes when none are.
    closeBrowserIfDone();
  });
})();
<小时/>

如果您先运行 csvPipe 将所有行收集到一个数组中(在处理任何行之前),代码可能会更简单一些。然后,您可以使用任意一种 Promise 并发函数来处理该数组,同时控制并行运行的数量。请参阅昨天的 this answer,其中介绍了许多可让您在并行处理数组时管理并发性的函数。该实现如下所示:

const csv = require('csv-parser');
const fs = require('fs');
const puppeteer = require('puppeteer');

// Batch version: first collects every ad_id from ids.csv into an array, then
// processes the array with pMap() so that at most `maxInFlight` pages are open
// at the same time.
(async () => {
  const browser = await puppeteer.launch();

  const maxInFlight = 10; // set this value to control how many pages run in parallel
  const fname = 'ids.csv';
  const csvPipe = fs.createReadStream(fname).pipe(csv());
  const rowIDs = [];

  /**
   * Opens the ad-library page for `rowId` and saves a screenshot of <body>.
   * Errors are logged rather than thrown, so one failing row does not stop
   * the remaining rows.
   *
   * @param {string} rowId - value appended to the ad-library URL as `id`.
   * @param {string} path - file path the PNG is written to.
   * @returns {Promise<void>}
   */
  async function getFile(rowId, path) {
    // Declared outside `try` so the `finally` clause can see it.
    let page;
    try {
      page = await browser.newPage();
      await page.setViewport({ width: 1000, height: 1500, deviceScaleFactor: 1 });
      const url = `https://www.facebook.com/ads/library/?id=${rowId}`;
      await page.goto(url, { waitUntil: 'networkidle2' });
      await page.waitFor(3000);
      const body = await page.$('body');
      await body.screenshot({ path });
    } catch (e) {
      console.log(e);
    } finally {
      // `page` is undefined when browser.newPage() itself failed.
      if (page) {
        try {
          await page.close();
        } catch (e) {
          console.log(e);
        }
      }
    }
  }

  csvPipe.on('data', (row) => {
    rowIDs.push(row.ad_id);
  }).on('end', async () => {
    // all rowIDs in the array now
    try {
      await pMap(rowIDs, (id) => getFile(id, `./images/${id}.png`), maxInFlight);
      console.log("all items processed"); // all done now
    } catch (err) {
      console.log(err);
    } finally {
      // Close the browser so the Node process can exit.
      try {
        await browser.close();
      } catch (err) {
        console.log(err);
      }
    }
  });
})();


/**
 * Processes an array asynchronously with no more than `limit` items
 * "in flight" at the same time.
 *
 * @param {Array} array - items to process.
 * @param {Function} fn - called as fn(item); may return a promise or a plain
 *   value. A synchronous throw is treated like a rejection.
 * @param {number} limit - maximum number of concurrent fn() calls; must be >= 1.
 * @returns {Promise<Array>} resolves with the results in input order. Rejects
 *   with the first error, after which no further items are started. Rejects
 *   with a RangeError when `limit` is not >= 1.
 */
function pMap(array, fn, limit) {
  return new Promise((resolve, reject) => {
    const results = new Array(array.length);
    let index = 0; // next array position to start
    let cnt = 0; // number of fn() calls currently pending
    let stop = false; // set after the first failure

    // A limit below 1 (or NaN/undefined) would never start any work and the
    // promise would stay pending forever.
    if (!(limit >= 1)) {
      reject(new RangeError('pMap: limit must be >= 1'));
      return;
    }

    // Nothing to do: settle now, because no completion callback will ever run.
    if (array.length === 0) {
      resolve(results);
      return;
    }

    // Starts fn() for position i and wires up its completion handlers.
    function launch(i) {
      ++cnt;
      let pending;
      try {
        pending = Promise.resolve(fn(array[i]));
      } catch (err) {
        // Route synchronous throws through the same rejection path.
        pending = Promise.reject(err);
      }
      pending.then((data) => {
        results[i] = data;
        --cnt;
        // see if we are done or should run more requests
        if (cnt === 0 && index === array.length) {
          resolve(results);
        } else {
          run();
        }
      }, (err) => {
        // set stop flag so no more requests will be sent
        stop = true;
        --cnt;
        reject(err);
      });
    }

    // Starts items until the limit is reached or the input is exhausted.
    function run() {
      while (!stop && index < array.length && cnt < limit) {
        launch(index++);
      }
    }

    run();
  });
}

关于javascript - puppeteer:迭代 CSV 文件和每行的屏幕截图?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59981135/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com