gpt4 book ai didi

javascript - 用 Browserify 打包以 CasperJS 编写的抓取工具

转载 作者:太空宇宙 更新时间:2023-11-04 02:16:46 25 4
gpt4 key购买 nike

我正在尝试使此代码在浏览器中运行。

scrape.html

<!doctype html>

<html>
<head>
<title> </title>
<style>
label {
margin-bottom: 2%;
}

div {
margin-bottom: 2%;
}
</style>
<script src = "../../AppData/Roaming/npm/node_modules/phantomjs/lib/phantomjs.js"></script>
<script src = "../../AppData/Roaming/npm/node_modules/casperjs/modules/casper.js"></script>
</head>

<body>
<form action="#" id = "form" method="get">
<label for="start">Start Page</label>
<div>
<input type = "number" name = "number1" value = "start"></input>
</div>
<label for="end">End Page</label>
<div>
<input type = "number" name = "number2" value = "end"></input>
</div>
<button onclick="myFunction()"> Submit </button>
</form>
<script>
function myFunction() {
var x = document.getElementById("form");
var number = [];
var i;
for (i = 0; i < x.length-1 ;i++) {
number.push(x.elements[i].value);
}
console.log(number);
//var casper = require('casper').create();
casper.then(function(){
console.log(this.fetchText('div.info-list-text'));

var startUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number1*3';
var endUrl = 'http://www.bedbathandbeyond.com/comm/c/Michigan/p/number2*3'
});
}
</script>
</body>
</html>

它会产生以下错误:

casper.js:32 Uncaught ReferenceError: patchRequire is not defined

我认为导致该错误的原因是我们无法像在 Node.js 中那样使用 require 在浏览器中导入模块。为了使此功能在浏览器中可用,我在项目文件夹中安装了 browserify 并创建了以下 JS 文件。

browserReq.js

// Create the CasperJS instance that every step below is registered on.
var casper = require('casper').create();

// Path fragment appended to the listing URL (presumably a location slug — confirm).
var url = 'ok,-MI'
var baseUrl = 'http://www.bedandbeyond.com/comm/c/'+url;
console.log(baseUrl);

// CSS selector of the pagination link that leads to the next listing page.
var nextBtn = "a.navigation-button.next";

// Accumulates what getPageData() returns for every page that is visited.
var allLinks = [];

// Open the first listing page.
casper.start(baseUrl);

// Once the pagination link is present, hand control to processPage.
casper.waitForSelector(nextBtn, processPage);

// NOTE(review): run() is called here, before the final casper.then(...) step
// further down has been registered; CasperJS scripts usually call run() last —
// confirm this ordering is intended.
casper.run();

/**
 * Step callback: collects the links of the page currently loaded and, while
 * a "next" link exists, clicks it and schedules itself again.
 *
 * Must be called with `this` bound to the Casper instance. Appends to the
 * module-level `allLinks` array and returns nothing.
 */
function processPage() {
    var linksOnPage = this.evaluate(getPageData);
    allLinks = allLinks.concat(linksOnPage);

    if (this.exists(nextBtn)) {
        var afterClick = this.thenClick(nextBtn);
        var afterPause = afterClick.then(function() {
            // Intentionally empty step between the click and the next pass.
        });
        afterPause.then(processPage);
    }
}

/**
 * Returns the `href` attribute of every element with class 'pro-title' in
 * the current document, in document order.
 *
 * It is handed to casper.evaluate(), so it is written in plain ES5 and
 * touches nothing but `document`.
 */
function getPageData(){
    var titleNodes = document.getElementsByClassName('pro-title');
    var hrefs = [];
    for (var i = 0; i < titleNodes.length; i += 1) {
        hrefs.push(titleNodes[i].getAttribute('href'));
    }
    return hrefs;
}

// Final step: open every collected link and dump the profile fields found on
// that page (title, services, location, description).
casper.then(function(){
    this.each(allLinks, function(self, link){
        this.thenOpen(link, function(){
            // Declared with `var`: the previous code assigned to an undeclared
            // `jsonObj`, which created an implicit global shared by every
            // callback (and is a ReferenceError in strict mode).
            var jsonObj = {};
            jsonObj.title = this.fetchText('a.profile-full-name');

            // getHTML returns markup, so ampersands arrive as "&amp;";
            // spell them out as "and".
            jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
            jsonObj.services = jsonObj.services.replace(/&amp;/g,"and");

            jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
            jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');

            require('utils').dump(jsonObj);
        });
    });
});

我使用命令 browserify browserReq.js -o browserReqOut.js -d 来打包此文件。

它报出如下错误:无法从项目文件夹所在位置找到模块“casper”。而我已经在项目文件夹中以及全局都安装了 CasperJS。

更新1:

我将 scrape.html 中表单元素的值通过 POST 提交给以下代码:

scrape.php

<?php
// Everything read from $_POST is untrusted input. Each field falls back to an
// empty string when absent so a missing field raises no "undefined index" notice.
$url = isset($_POST["urlToScrape"]) ? $_POST["urlToScrape"] : ""; ?><br>
<?php $page1 = isset($_POST["number1"]) ? $_POST["number1"] : ""; ?> <br>
<?php $page2 = isset($_POST["number2"]) ? $_POST["number2"] : ""; ?><br>
<?php
// Presumably one proxy per line. explode() on an empty string still yields
// [""], so index 0 always exists below.
$newProxyList = explode(PHP_EOL, isset($_POST['proxy']) ? $_POST['proxy'] : ""); ?> <br>

<?php
// Escape every value before printing it: echoing raw request data into the
// page would let a submitted value inject markup or script (reflected XSS).
echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8') ?> <br>
<?php echo htmlspecialchars($page1, ENT_QUOTES, 'UTF-8') ?> <br>
<?php echo htmlspecialchars($page2, ENT_QUOTES, 'UTF-8') ?> <br>
<?php echo htmlspecialchars($newProxyList[0], ENT_QUOTES, 'UTF-8') ?> <br>

<?php
// PHP only prints the text below into the response; nothing in it is executed
// on the server. Inside the emitted script, jsonObj is now declared with
// `var` instead of being assigned as an implicit global.
echo "<script>

var casper = require('casper').create();

var baseUrl = 'http://www.houzz.com/professionals/c/Nashville,-TN';
console.log(baseUrl);

var nextBtn = 'a.navigation-button.next';

var allLinks = [];

casper.start(baseUrl);

casper.waitForSelector(nextBtn, processPage);

casper.run();

function processPage() {
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);

if (!this.exists(nextBtn)) {
return;
}

this.thenClick(nextBtn).then(function() {
this.echo(this.getCurrentUrl());
//this.wait(1000);
}).then(processPage);
}

function getPageData(){
//return document.title;
var links = document.getElementsByClassName('pro-title');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
}

casper.then(function(){
//require('utils').dump(allLinks);
this.each(allLinks,function(self,link){
this.thenOpen(link,function(a){
var jsonObj = {};
jsonObj.title = this.fetchText('a.profile-full-name');

jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
jsonObj.services = jsonObj.services.replace(/&amp;/g,'and');

jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
//jsonObj.contact = this.fetchText('span.pro-contact-text');
jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');
//jsonObj.description.replace(/\s/g, '');

//require('utils').dump(jsonObj);

//jsonObj = JSON.stringify(jsonObj, null, '\t');
require('utils').dump(jsonObj);
});
});
});

</script>"
?>

</body>
</html>

它仍然报同样的错误:Uncaught ReferenceError: require is not defined。既然 PHP 是在服务器上执行的,而且服务器上也有 require 模块,为什么我还会收到这个错误?

最佳答案

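PhantomJS 是一个完整的浏览器,拥有自己的 API。CasperJS 正是基于该 API 来完成各项工作的。除非您在浏览器中用纯 JavaScript 实现完整的 PhantomJS API,否则无法用 Browserify 将 CasperJS 打包到浏览器中运行。(同理,PHP 的 echo 只是把这段脚本作为文本输出到页面,真正执行它的仍然是浏览器,而浏览器中并没有 require,因此会出现同样的 ReferenceError。)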

关于“javascript - 用 Browserify 打包以 CasperJS 编写的抓取工具”,我们在 Stack Overflow 上找到一个类似的问题:https://stackoverflow.com/questions/35370471/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com