gpt4 book ai didi

javascript - 使用 session 生成的 url 使用 puppeteer 获取 pdf 缓冲区

转载 作者:行者123 更新时间:2023-11-30 20:44:51 29 4
gpt4 key购买 nike

我一直在尝试使用 puppeteer 从一个网站获取 PDF(或其响应缓冲区)。该网站在单击文档链接(在新选项卡中打开)后会依次执行两个请求:

  1. 第一个请求 ( http://epicdocs.planningni.gov.uk/ViewDocument.pa?uri=4157826&ext=PDF ) 检索 session guid 以访问文档
  2. 第二个请求 ( http://epicdocs.planningni.gov.uk/ViewDocument.aspx?guid=4ecd1fe5-43c6-4202-96e3-66b393fb819c) 使用该 guid 访问文档并在浏览器上呈现 pdf。

我的尝试只生成了一个空白的 PDF,尽管它是在页面加载完成之后才创建的(已用 Fiddler 确认)。

我试过了

  • 拦截targetcreated事件获取页面
  • 获取第二个请求url并使用page.goto获取pdf
  • 等待页面响应以获取缓冲区
  • 设置 Page.setDownloadBehaviour 以允许下载而不是在浏览器中呈现

感谢任何指导和帮助。尝试的代码如下:

const puppeteer = require("puppeteer");

// Shared browser handle: assigned by run() and read by getDocument(), which
// listens on it for the "targetcreated" event fired when a new tab opens.
let browser;

/**
 * Opens one entry of the document list in a new tab and prints that tab to
 * "<title>.pdf".
 *
 * The function clicks the row's "view document" link, waits until the
 * module-level `browser` reports a new target whose URL contains
 * "ViewDocument.aspx?", and then calls `pdf()` on that target's page.
 *
 * NOTE(review): `pdf()` prints what the tab renders; the question this code
 * comes from reports that the result is a blank PDF. Fetching the document
 * bytes directly (e.g. with `fetch` inside the page) is presumably what is
 * needed instead - confirm before relying on the returned buffer.
 *
 * @param {number} index - 1-based row number used to build the link selector.
 * @param {string} title - Document title; used as the output file name.
 * @param {object} page - Puppeteer page that shows the document list.
 * @returns {Promise<*>} "" when the row is skipped, otherwise the value
 *   resolved by `pdfPage.pdf()`.
 * @throws {Error} When no ViewDocument tab is reported within the timeout.
 */
async function getDocument(index, title, page) {
  // Debugging filter kept from the original: only row 19 is processed.
  if (Number(index) !== 19) return "";

  console.log("getDocument START");
  console.log("#repDocuments__ctl" + index + "_lnkViewDoc\ntitle: " + title);

  const TARGET_TIMEOUT_MS = 30000;

  // Keep listening until the matching target shows up. A one-shot listener
  // would be consumed by any unrelated target created first and the promise
  // would then never settle.
  let resolveDocTarget;
  const docTargetPromise = new Promise(resolve => {
    resolveDocTarget = resolve;
  });
  const onTargetCreated = target => {
    const targetUrl = target.url();
    if (!targetUrl.includes("ViewDocument.aspx?")) {
      console.log("Ignoring unrelated target: " + targetUrl);
      return;
    }
    console.log(targetUrl);
    browser.removeListener("targetcreated", onTargetCreated);
    resolveDocTarget(target);
  };
  // Registered before the click so the event cannot be missed.
  browser.on("targetcreated", onTargetCreated);

  // NOTE(review): an earlier attempt sent "Page.setDownloadBehaviour" over the
  // DevTools session; the protocol method is presumably spelled
  // "Page.setDownloadBehavior" (with a "behavior" key) - confirm against the
  // protocol reference before retrying that approach.

  let timer;
  let docTarget;
  try {
    await page.click(`#repDocuments__ctl${index}_lnkViewDoc`);
    docTarget = await Promise.race([
      docTargetPromise,
      new Promise((resolve, reject) => {
        timer = setTimeout(
          reject,
          TARGET_TIMEOUT_MS,
          new Error("Failed to detect the ViewDocument page")
        );
      })
    ]);
  } finally {
    // Always release the timer and the listener, whether we succeeded or not.
    clearTimeout(timer);
    browser.removeListener("targetcreated", onTargetCreated);
  }

  let pdfPage = await docTarget.page();

  // Pages emit "console" (not "console.log") for messages logged in the tab.
  pdfPage.on("console", msg => console.log(msg));

  // Registering a listener is synchronous, so there is nothing to await here.
  // The listener is attached after the tab was created, so responses that
  // already arrived are not reported.
  pdfPage.on("response", async response => {
    console.log("PDF PAGE Response");
    try {
      const responseBuffer = await response.buffer();
      console.log(
        "PDF PAGE Response Header: " + JSON.stringify(response.headers())
      );
      console.log(
        "PDF PAGE Response Buffer: " + responseBuffer.length + " bytes"
      );
    } catch (err) {
      // Reading a body can fail; do not let that become an unhandled rejection.
      console.log("PDF PAGE Response could not be read: " + err);
    }
  });

  const pdfTitle = await pdfPage.title();
  console.log("PDFPage URL: " + pdfPage.url());
  console.log("PDFPage Title: " + pdfTitle);

  const pdfTarget = pdfPage.target();
  console.log("PDFTarget URL: " + pdfTarget.url());
  console.log("PDFTarget Type: " + pdfTarget.type());
  pdfPage = await pdfTarget.page();
  console.log("PDFPage URL: " + pdfPage.url());

  // Fixed grace period before printing, kept from the original.
  await pdfPage.waitFor(3000);
  const pdf = await pdfPage.pdf({ path: title + ".pdf" });
  console.log(pdf);
  return pdf;
}

/**
 * Expands the first document group on the case-file page, reads every row of
 * the document table and attaches to each row the result of getDocument().
 *
 * @param {object} page - Puppeteer page showing the case file.
 * @returns {Promise<Array<object>>} One record per table row with the keys
 *   type, datePublished, dateReceived, docType, description and docBuffer.
 */
async function getAdditionalDocumentation(page) {
  const groupLinkSelector = "#repGroupSummary__ctl1_lnkGroupName";
  const rowSelector = "#pnlDocumentList > table > tbody > tr";

  console.log("getAdditionalDocumentation START");

  await page.waitForSelector(groupLinkSelector);
  await page.click(groupLinkSelector);
  await page.waitForSelector(rowSelector);

  // Fixed pause after the rows appear, before they are read.
  await page.waitFor(2000);

  // This callback is evaluated inside the page, so it must not reference
  // anything from the surrounding Node scope.
  const documents = await page.$$eval(rowSelector, rows => {
    const textOf = (row, selector) => row.querySelector(selector).innerText;
    return rows.map(row => ({
      type: textOf(row, ".tdl-subgroup > span"),
      datePublished: textOf(row, ".tdl-date > span[id*='DatePublished']"),
      dateReceived: textOf(row, ".tdl-date > span[id*='DateReceived']"),
      docType: textOf(row, ".tdl-doctype > span"),
      description: textOf(row, ".tdl-description > span")
    }));
  });

  // Rows are handled one after another; link ids on the page are 1-based.
  let rowNumber = 0;
  for (const entry of documents) {
    rowNumber += 1;
    entry.docBuffer = await getDocument(rowNumber, entry.description, page);
  }

  await page.click("#btnSummary");
  console.log("getAdditionalDocumentation FINISH");

  return documents;
}

/**
 * Opens the case-file tab from the application details page and collects its
 * additional documentation.
 *
 * Clicks the external-documents tab and then the document link, waits until
 * `browser` reports a new target whose URL contains "ShowCaseFile.aspx?", and
 * passes that target's page to getAdditionalDocumentation().
 *
 * @param {object} page - Puppeteer page showing the application details.
 * @param {object} browser - Puppeteer browser that emits "targetcreated".
 * @returns {Promise<{additionalDocumentation: *}>} Wrapper around the value
 *   returned by getAdditionalDocumentation().
 * @throws {Error} When no ShowCaseFile tab is reported within the timeout.
 */
async function getDocuments(page, browser) {
  console.log("getDocuments");

  const TARGET_TIMEOUT_MS = 30000;

  // Keep listening until the matching target shows up. A one-shot listener
  // would be consumed by any unrelated target created first and the promise
  // would then never settle.
  let resolveCaseTarget;
  const caseTargetPromise = new Promise(resolve => {
    resolveCaseTarget = resolve;
  });
  const onTargetCreated = target => {
    const targetUrl = target.url();
    if (!targetUrl.includes("ShowCaseFile.aspx?")) {
      console.log("Ignoring unrelated target: " + targetUrl);
      return;
    }
    console.log(targetUrl);
    browser.removeListener("targetcreated", onTargetCreated);
    resolveCaseTarget(target);
  };
  // Registered before any click so the event cannot be missed.
  browser.on("targetcreated", onTargetCreated);

  let timer;
  let caseTarget;
  try {
    await page.click("#tab_externalDocuments > span");
    await page.waitForSelector("#hp-doc-link");

    await page.click("#hp-doc-link");
    // The timeout only starts once the link has been clicked.
    caseTarget = await Promise.race([
      caseTargetPromise,
      new Promise((resolve, reject) => {
        timer = setTimeout(
          reject,
          TARGET_TIMEOUT_MS,
          new Error("Failed to detect the ShowCaseFile page")
        );
      })
    ]);
  } finally {
    // Always release the timer and the listener, whether we succeeded or not.
    clearTimeout(timer);
    browser.removeListener("targetcreated", onTargetCreated);
  }

  const newPage = await caseTarget.page();
  const additionalDocumentation = await getAdditionalDocumentation(newPage);

  return {
    additionalDocumentation
  };
}




/**
 * Entry point: launches the browser, searches the planning portal for a fixed
 * planning reference, collects the documents of the resulting application and
 * logs them.
 *
 * The launched browser is stored in the module-level `browser` variable and is
 * closed again before the returned promise settles.
 *
 * @returns {Promise<void>}
 */
async function run() {
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();

    // Forward messages logged inside the page to the Node console.
    page.on("console", msg => console.log("PAGE LOG:", ...msg.args));

    const planningReference = "LA04/2017/1388/F";
    await page.goto(
      "http://epicpublic.planningni.gov.uk/publicaccess/search.do?action=simple&searchType=Application"
    );
    await page.waitForSelector("#simpleSearchString");
    await page.type("#simpleSearchString", planningReference);
    await page.click("#simpleSearchForm > div.row3 > input.button.primary");

    await page.waitForSelector("#simpleDetailsTable");

    console.log("getDocuments START");
    const documents = await getDocuments(page, browser);
    console.log("getDocuments FINISH");

    console.log(documents);
    console.log(documents.additionalDocumentation.length);
  } finally {
    // If launch() itself failed there is nothing to close; calling close() on
    // undefined would throw and hide the original error. Awaiting close()
    // keeps shutdown errors inside this promise chain.
    if (browser) {
      await browser.close();
    }
  }
}

// Start the scrape. The promise is handled explicitly so that a failure is
// reported and reflected in the exit code instead of becoming an unhandled
// rejection.
run().catch(err => {
  console.error(err);
  process.exitCode = 1;
});

最佳答案

使用 page.exposeFunction 向页面暴露一个函数,用来把缓冲区数据写入磁盘:

// Expose a Node-side function to the page as window.writeABString. The page
// passes the file content as a "binary string" (one character per byte) and
// the function writes those bytes to targetFile, resolving with the file name.
//
// NOTE: this snippet relies on `fs` being in scope (const fs = require("fs")),
// which the snippet itself does not show.
page.exposeFunction("writeABString", async (strbuf, targetFile) => {
  console.log("In 'writeABString' function...");

  // "latin1" turns every character into exactly one byte (its low 8 bits),
  // which is what the hand-written charCodeAt -> Uint8Array loop did, without
  // the intermediate ArrayBuffer copy.
  const buf = Buffer.from(strbuf, "latin1");

  // Try saving the file.
  return new Promise((resolve, reject) => {
    fs.writeFile(targetFile, buf, err => {
      if (err) reject(err);
      else resolve(targetFile);
    });
  });
});

拿到下载链接后,您可以在页面中用 fetch API 以 ArrayBuffer 的形式获取文件内容,把它转换成字符串,再交给上面暴露的函数写入磁盘:

// Runs inside the page: downloads geturl with the page's own cookies and hands
// the bytes to the exposed window.writeABString as a binary string. Everything
// the callback needs is defined inside it because it executes in the browser.
page.evaluate(async () => {
  // Convert an ArrayBuffer to a binary string (one character per byte).
  function arrayBufferToString(buffer) {
    // Convert in chunks so that apply() never receives too many arguments.
    const CHUNK_SIZE = 0x8000;
    const bytes = new Uint8Array(buffer);
    let result = "";

    for (let offset = 0; offset < bytes.length; offset += CHUNK_SIZE) {
      // subarray() clamps the end index, so the last chunk needs no special case.
      result += String.fromCharCode.apply(
        null,
        bytes.subarray(offset, offset + CHUNK_SIZE)
      );
    }
    return result;
  }

  const geturl = "https://whateverurl.example.com";

  try {
    const response = await fetch(geturl, {
      // Send cookies, useful when we are logged into the website.
      // (fetch has no "responseType" option; the body is read as an
      // ArrayBuffer below.)
      credentials: "same-origin"
    });
    if (!response.ok) {
      // Do not write an error page to disk as if it were the document.
      throw new Error("HTTP " + response.status + " for " + geturl);
    }
    const bufstring = arrayBufferToString(await response.arrayBuffer());
    return await window.writeABString(bufstring, "/tmp/downloadtest.pdf");
  } catch (error) {
    // Best effort, as in the original: log in the page and resolve with undefined.
    console.log("Request failed: ", error);
  }
});

有关更多信息,请查看 github puppeteer 页面上的这个问题。问题中也提出了上述解决方案。 Source

关于javascript - 使用 session 生成的 url 使用 puppeteer 获取 pdf 缓冲区,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/48830285/

29 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com