gpt4 book ai didi

javascript - Casperjs 使用 casper.each 迭代链接列表

转载 作者:数据小太阳 更新时间:2023-10-29 03:51:49 24 4
gpt4 key购买 nike

我正在尝试使用 Casperjs 从页面获取链接列表,然后打开每个链接,并将来自这些页面的特定类型的数据添加到数组对象。

我遇到的问题是对每个列表项执行的循环。

首先,我从原始页面得到一个 listOfLinks。这部分可以正常工作,通过它的 length 我可以确认这个列表已被填充。

但是,使用如下所示的循环语句 this.each 时,没有任何控制台输出出现,casperjs 似乎跳过了这个代码块。

用标准的 for 循环替换 this.each,执行仅通过第一个链接的一部分,因为语句“Creating new array in object for x.html”出现一次,然后代码停止执行。使用 IIFE 不会改变这一点。

编辑:在详细 Debug模式下会发生以下情况:

Creating new array object for https://example.com 
[debug] [phantom] Navigation requested: url=about:blank, type=Other, willNavigate=true, isMainFrame=true

因此由于某种原因,传递给 thenOpen 函数的 URL 被更改为空白...

我觉得 Casperjs 的异步特性中有些东西我在这里没有掌握,如果能指出一个工作示例,我将不胜感激。

// Question's original (non-working) step: collect links from the current page,
// open each one, and store the items scraped from it under object[date][href].
// The review notes below mark the spots the accepted answer changes.
casper.then(function () {

// Timestamp used as the top-level key of the result object.
var date = Date.now();
console.log(date);

var object = {};
object[date] = {}; // new object for date

// NOTE(review): this returns the raw result of getElementsByClassName (a DOM
// collection of elements), not a plain array of strings. The accepted answer
// returns an array of href strings from evaluate instead, stating that a
// simple array is required for casper.each() to iterate correctly.
var listOfLinks = this.evaluate(function(){
console.log("getting links");
return document.getElementsByClassName('importantLink');
});

console.log(listOfLinks.length);

this.each(listOfLinks, function(self, link) {

// Reads href from the value handed back by evaluate; the answer extracts
// href inside evaluate rather than here.
var eachPageHref = link.href;

console.log("Creating new array in object for " + eachPageHref);

object[date][eachPageHref] = []; // array for page to store names

self.thenOpen(eachPageHref, function () {

// listOfItems receives the array returned from the evaluate callback, but
// it is never stored anywhere in this snippet.
var listOfItems = this.evaluate(function() {
var items = [];
// Perform DOM manipulation to get items
return items;
});
});

// NOTE(review): `items` is declared only inside the evaluate callback above,
// so it is not defined in this scope. This assignment also sits outside the
// thenOpen callback, where listOfItems is not visible either.
object[date][eachPageHref] = items;

});
// NOTE(review): presumably this runs before the thenOpen steps queued above
// have executed; the accepted answer moves the output into a later
// this.then() step — confirm against CasperJS step-scheduling docs.
console.log(JSON.stringify(object));

});

最佳答案

我决定使用我们自己的 Stackoverflow.com 作为演示站点来运行您的脚本。我在您的代码中纠正了一些小问题,最终得到下面这个示例:从带有悬赏的 PhantomJS 问题页面中抓取评论。

var casper = require('casper').create();

/**
 * Evaluated inside the page: gathers the href of every question link.
 * Returns a primitive array of strings so that it can be handed back
 * from page.evaluate without trouble.
 */
function collectQuestionLinks() {
    var anchors = document.querySelectorAll("#questions .question-hyperlink");
    return Array.prototype.map.call(anchors, function (anchor) {
        return anchor.href;
    });
}

/**
 * Evaluated inside the page: gathers the text of every comment,
 * also as a plain array of strings.
 */
function collectCommentTexts() {
    var commentNodes = document.getElementsByClassName("comment-text");
    return Array.prototype.map.call(commentNodes, function (node) {
        return node.innerText;
    });
}

casper.start();
casper.open('http://stackoverflow.com/questions/tagged/phantomjs?sort=featured&pageSize=30');

casper.then(function () {

    // Results are grouped under the timestamp of this run.
    var runStamp = Date.now();
    var results = {};
    results[runStamp] = {};

    // Links to the other pages that are going to be scraped.
    var pageUrls = this.evaluate(collectQuestionLinks);

    // Queue one navigation step per link.
    this.each(pageUrls, function (self, pageUrl) {

        // Placeholder array for the page, replaced once the page is scraped.
        results[runStamp][pageUrl] = [];

        self.thenOpen(pageUrl, function () {
            // Store the comments found on the page that was just opened.
            var comments = this.evaluate(collectCommentTexts);
            results[runStamp][pageUrl] = comments;
        });
    });

    // Added as a further step so the output happens after every link
    // has been scraped.
    this.then(function () {
        console.log(JSON.stringify(results));
    });
});

casper.run();

更改内容:page.evaluate 现在返回简单数组,这是 casper.each() 能够正确迭代所必需的;href 属性直接在 page.evaluate 内部提取。此外还有下面这处更正:

 object[date][eachPageHref] = listOfItems; // the original assigned `items` here, which was undefined in this scope

脚本运行结果为

{"1478596579898":{"http://stackoverflow.com/questions/40410927/phantomjs-from-node-on-windows":["en.wikipedia.org/wiki/File_URI_scheme – Igor 2 days ago\n","@Igor is there something in particular you see wrong, or are you suggesting the phantom module has an incorrect URI? – Danny Buonocore 2 days ago\n","Probably windows security issue not allowing to run an unsigned program. – Vaviloff yesterday\n"],"http://stackoverflow.com/questions/40412726/casperjs-iterating-over-a-list-of-links-using-casper-each":["Thanks, this looked really promising. I made the changes but it didn't solve the problem. And I just realised that in debug mode the following happens: Creating new array object for https://example.com [debug] [phantom] Navigation requested: url=about:blank, type=Other, willNavigate=true, isMainFrame=true and then Casperjs silently fails. It seems that the correct link that gets passed into thenOpen gets changed to about:blank... – cyc665 yesterday\n"]}}

关于javascript - Casperjs 使用 casper.each 迭代链接列表,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/40412726/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com