java - 爬行amazon.com-6ren

java - 爬行amazon.com

转载作者：行者123 更新时间：2023-11-29 07:32:20

我正在爬取亚马逊的商品页面，基本上运行得还不错。

我从这个不错的教程中获得了三个类：

http://www.netinstructions.com/how-to-make-a-simple-web-crawler-in-java/

我将文件添加到以下代码（Spider类）中：

import java.io.FileNotFoundException;
import java.util.*;


/**
 * Crawl driver: starting from a seed URL it repeatedly hands one page to a fresh
 * {@link SpiderLeg}, queues the links that leg reports, and finally prints the
 * adjacency matrix computed by {@link SpiderLeg#calcAdjMatrix()}.
 *
 * <p>Not thread-safe; intended to be driven from a single thread.
 */
public class Spider {
    /** Upper bound on the number of distinct pages recorded as visited. */
    public static final int MAX_PAGES_TO_SEARCH = 10000;

    /** Pause before every request, in milliseconds, to keep the request rate low. */
    private static final long CRAWL_DELAY_MILLIS = 10000;

    /** Page that is tried (at most once) when the queue of pending pages runs dry. */
    private static final String FALLBACK_URL =
            "https://www.amazon.de/Proband-Thriller-Guido-Kniesel/dp/1535287004/ref=sr_1_1?s=books&ie=UTF8&qid=1478247246&sr=1-1&keywords=%5B%5D";

    /** URLs that have already been handed to a {@link SpiderLeg}. */
    private final Set<String> pagesVisited = new HashSet<String>();

    /** Pending URLs in discovery order; a set so the same URL is never queued twice. */
    private final Set<String> pagesToVisit = new LinkedHashSet<String>();

    /**
     * Crawls pages starting at {@code url} until {@link #MAX_PAGES_TO_SEARCH} pages
     * have been visited, no unvisited page is left, or the thread is interrupted.
     * Afterwards prints a summary and the adjacency matrix.
     *
     * @param url the seed page; a {@code null} seed skips crawling entirely
     */
    public void search(String url) {
        String currentUrl = url;
        while (currentUrl != null && this.pagesVisited.size() < MAX_PAGES_TO_SEARCH) {
            this.pagesVisited.add(currentUrl);
            SpiderLeg leg = new SpiderLeg();
            try {
                Thread.sleep(CRAWL_DELAY_MILLIS);
                // Fetches the page and collects its outgoing links; see SpiderLeg.crawl.
                leg.crawl(currentUrl);
            } catch (FileNotFoundException e) {
                System.out.println("Oops, FileNotFoundException caught");
            } catch (InterruptedException e) {
                // Restore the flag so callers can see the interruption, and stop crawling.
                Thread.currentThread().interrupt();
                break;
            }

            // Queue only links that still need a visit; the set drops repeats.
            for (String link : leg.getLinks()) {
                if (!this.pagesVisited.contains(link)) {
                    this.pagesToVisit.add(link);
                }
            }
            currentUrl = this.nextUrl();
        }

        System.out.println("\n**Done** Visited " + this.pagesVisited.size() + " web page(s)");
        SpiderLeg.calcAdjMatrix();
        for (int[] row : SpiderLeg.adjMatrix) {
            System.out.println(Arrays.toString(row));
        }
    }

    /**
     * Removes and returns the oldest queued URL that has not been visited yet.
     *
     * @return the next URL to crawl; {@link #FALLBACK_URL} if the queue is exhausted and
     *         the fallback has not been visited; otherwise {@code null}, which ends the crawl
     */
    private String nextUrl() {
        Iterator<String> queued = this.pagesToVisit.iterator();
        while (queued.hasNext()) {
            String candidate = queued.next();
            queued.remove();
            if (!this.pagesVisited.contains(candidate)) {
                return candidate;
            }
        }
        // Returning the fallback unconditionally would re-crawl it forever.
        return this.pagesVisited.contains(FALLBACK_URL) ? null : FALLBACK_URL;
    }
}

SpiderLeg类：

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.util.*;

/**
 * Fetches one page with jsoup, extracts its product links, sales rank and category,
 * appends a CSV row for new "Horror" titles and records the title's outgoing links
 * in a link graph shared by all instances.
 *
 * <p>The graph ({@link #adjMap}, {@link #adjMatrix}, {@link #mapping}) and the progress
 * counter are static and unsynchronized; the links returned by {@link #getLinks()}
 * belong to the individual instance.
 */
public class SpiderLeg {
    // We'll use a fake USER_AGENT so the web server thinks the robot is a normal web browser.
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36";

    /** File that one row per recorded product is appended to. */
    private static final String OUTPUT_FILE = "Horror.csv";

    /** Marker that ends the host part of a product URL. */
    private static final String HOST_MARKER = ".de/";

    /** Marker that follows the title segment of a product URL. */
    private static final String PRODUCT_MARKER = "/dp";

    /** Value returned by {@link #cutTitle(String)} when no title can be extracted. */
    private static final String INVALID_TITLE = "wrong url";

    /** Links containing any of these fragments are neither followed nor recorded. */
    private static final String[] EXCLUDED_LINK_MARKERS = {"one-click", "Kindle", "unsticky"};

    /** Number of pages received with status 200 across all instances. */
    private static double counter = 0;

    /** Link graph: product title -> titles it links to. */
    static Map<String, Set<String>> adjMap = new HashMap<String, Set<String>>();

    /** Adjacency matrix built by {@link #calcAdjMatrix()}; {@code null} before the first call. */
    static int[][] adjMatrix;

    /** Node title for each row/column index of {@link #adjMatrix}; set by {@link #calcAdjMatrix()}. */
    static List<String> mapping;

    /** Links collected by this instance only, so each leg reports just its own page. */
    private final List<String> links = new LinkedList<String>();

    /** Most recently fetched document, or {@code null} before a successful fetch. */
    private Document htmlDocument;

    /**
     * Downloads {@code url} and, when it is a not-yet-recorded product page whose
     * category contains "Horror", records it in the CSV file and the link graph.
     *
     * @param url absolute URL of the page to fetch
     * @return {@code true} if an HTML page was retrieved and processed; {@code false}
     *         for a null/empty URL, a non-HTML response, or an I/O failure
     * @throws FileNotFoundException kept for source compatibility with existing callers;
     *         I/O failures are reported through the return value
     */
    public boolean crawl(String url) throws FileNotFoundException {
        if (url == null || url.isEmpty()) {
            return false;
        }
        try {
            Connection connection = Jsoup.connect(url).ignoreContentType(true).userAgent(USER_AGENT);
            Document page = connection.get();
            this.htmlDocument = page;
            if (connection.response().statusCode() == 200) {
                // 200 is the HTTP OK status code.
                counter++;
                double progress = (counter / Spider.MAX_PAGES_TO_SEARCH) * 100;
                System.out.println("\n**Visiting** Received web page at " + url);
                System.out.println("\n**Progress** " + progress + "%");
            }
            // The server may omit the Content-Type header entirely.
            String contentType = connection.response().contentType();
            if (contentType == null || !contentType.contains("text/html")) {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }

            Elements linksOnPage = page.select("a[href*=/dp/]");
            String salesRankString = page.select("span.zg_hrsr_rank").html().replace("\n", " ");
            String categoryString = page.select("span.zg_hrsr_ladder a").html().replace("\n", " ");
            System.out.println("Found (" + linksOnPage.size() + ") links");

            // Pages whose URL has no extractable title are fetched but not recorded.
            String title = cutTitle(url);
            if (!INVALID_TITLE.equals(title)
                    && !adjMap.containsKey(title)
                    && categoryString.contains("Horror")) {
                recordProduct(title, salesRankString, categoryString, linksOnPage);
            }

            System.out.println("done!");
            return true;
        } catch (IOException ioe) {
            // We were not successful in our HTTP request (or could not write the CSV row).
            System.out.println("Error in out HTTP request " + ioe);
            return false;
        }
    }

    /**
     * Appends one CSV row for {@code title}, registers it in {@link #adjMap} and
     * collects the page's followable links.
     */
    private void recordProduct(String title, String salesRank, String category, Elements linksOnPage)
            throws IOException {
        // Open the file first so nothing is recorded in memory if it cannot be written;
        // try-with-resources guarantees the handle is released on every path.
        try (PrintWriter out = new PrintWriter(new FileWriter(OUTPUT_FILE, true))) {
            Set<String> neighbours = new HashSet<String>();
            adjMap.put(title, neighbours);

            StringBuilder row = new StringBuilder();
            row.append(title).append(',')
               .append(salesRank).append(',')
               .append(category).append(',');
            for (Element link : linksOnPage) {
                String target = link.attr("abs:href");
                if (isExcluded(target)) {
                    continue;
                }
                this.links.add(target);
                String targetTitle = cutTitle(target);
                // Keep the placeholder title out of the graph; it is not a real node.
                if (!INVALID_TITLE.equals(targetTitle)) {
                    neighbours.add(targetTitle);
                }
                row.append(target).append(',');
            }
            row.append('\n');
            out.write(row.toString());
        }
    }

    /** Returns whether {@code link} contains one of the {@link #EXCLUDED_LINK_MARKERS}. */
    private static boolean isExcluded(String link) {
        for (String marker : EXCLUDED_LINK_MARKERS) {
            if (link.contains(marker)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Rebuilds {@link #adjMatrix} from {@link #adjMap} and stores the node order in
     * {@link #mapping}: {@code adjMatrix[i][j] == 1} means {@code mapping.get(i)} links
     * to {@code mapping.get(j)}. Also prints every node with its outgoing links.
     */
    public static void calcAdjMatrix() {
        Set<String> allNodes = new HashSet<String>(adjMap.keySet());
        for (Map.Entry<String, Set<String>> entry : adjMap.entrySet()) {
            allNodes.addAll(entry.getValue());
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }

        List<String> nodeList = new ArrayList<String>(allNodes);
        // Title -> index lookup avoids a linear indexOf scan for every edge.
        Map<String, Integer> indexByNode = new HashMap<String, Integer>();
        for (int i = 0; i < nodeList.size(); i++) {
            indexByNode.put(nodeList.get(i), i);
        }

        int dim = nodeList.size();
        int[][] matrix = new int[dim][dim];
        for (Map.Entry<String, Set<String>> entry : adjMap.entrySet()) {
            int row = indexByNode.get(entry.getKey());
            for (String target : entry.getValue()) {
                matrix[row][indexByNode.get(target)] = 1;
            }
        }

        mapping = nodeList;
        adjMatrix = matrix;
    }

    /**
     * Extracts the title segment between {@code ".de/"} and the following {@code "/dp"}.
     *
     * @param url product URL, e.g. {@code https://www.amazon.de/Some-Title/dp/123}
     * @return the title segment, or {@code "wrong url"} when {@code url} is null or
     *         does not contain both markers in that order
     * @throws FileNotFoundException never thrown; kept for source compatibility
     */
    public String cutTitle(String url) throws FileNotFoundException {
        if (url == null) {
            return INVALID_TITLE;
        }
        int hostEnd = url.indexOf(HOST_MARKER);
        if (hostEnd < 0) {
            return INVALID_TITLE;
        }
        int titleStart = hostEnd + HOST_MARKER.length();
        // Search after the host so the end index can never precede the start index.
        int titleEnd = url.indexOf(PRODUCT_MARKER, titleStart);
        if (titleEnd < 0) {
            return INVALID_TITLE;
        }
        return url.substring(titleStart, titleEnd);
    }

    /**
     * Case-insensitively checks whether the body text of the last fetched page
     * contains {@code searchWord}.
     *
     * @param searchWord the word to look for
     * @return {@code true} if the word occurs; {@code false} if it does not or if
     *         no page has been fetched yet
     */
    public boolean searchForWord(String searchWord) {
        if (this.htmlDocument == null) {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    /**
     * @return the links collected by this instance's {@link #crawl(String)} calls
     *         (the live list, not a copy)
     */
    public List<String> getLinks() {
        return this.links;
    }

}

SpiderTest类：

public class SpiderTest {
    public static void main(String[] args) {
        Spider spider = new Spider();
        spider.search("https://www.amazon.de/Wille-geschehe-Psychothriller-Guido-Kniesel/dp/1537455389/ref=pd_sim_14_1?_encoding=UTF8&psc=1&refRID=CQPDDGY4BJ4D8THNNSZ6");
    }
}

现在的问题是：在爬取了大约100个URL之后，亚马逊就禁止我访问其服务器，程序再也找不到任何URL了。

有谁知道我该如何解决？

最佳答案

那么，就不要这样无礼地去爬取他们的网站。

先检查他们的robots.txt（wiki），看看他们允许您做什么。如果您访问了他们不希望您去的地方而被封禁，请不要感到惊讶。

关于java - 爬行amazon.com，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/40448325/

文章推荐： java - JFrame 上的 JPanel 数组

文章推荐： java - 使用 int[] 数组通过嵌套循环相互操作

文章推荐： java - 错误 :Compatible type:java. util.String 无法转换为 String

vba - 爬行 ActiveX 按钮
再会! 我有一个 ActiveX 按钮(根据从顶部开始的行数锚定在位置上)，它运行 VBA 代码以在特定点插入指定数量的复制行。代码本身工作正常，但按钮“克隆”自身并将自身覆盖在新行的位置，即使我设置
hadoop - 无法启动 Nutch 爬行
我正在尝试在 Ubuntu 14.04 上部署 Nutch 2.3 + ElasticSearch 1.4 + HBase 0.94 以下 tutorial .当我尝试开始爬行注入(inject)网址
vba - 爬行 Zip 文件
我正在尝试爬行某个驱动器并从埋藏在子目录中的某些 .xls 文件中获取数据。该驱动器超过 1 TB，并且文件夹并不都具有相同的层次结构，因此我正在遍历所有文件夹。到目前为止，该脚本运行良好。问题是，
c# - 在 c++/c# 中从哪里开始抓取/爬行？
首先，我希望抓取是从桌面 .exe 向网站发送请求并获取数据的正确词。如果是，我应该使用什么库或插件？我是否应该使用另一种语言来执行此操作(如 Java 或其他语言？)。我需要一些“提示”，因为我真的
python - 如何通过命令生成 url 让 scrapy 爬行
这是我的代码: def parse(self, response): selector = Selector(response) sites = selector.xpath("//
node.js - 使用 Node.js 爬行
完整的 Node.js 菜鸟，所以不要评判我...... 我有一个简单的要求。爬取网站，查找所有产品页面，并保存产品页面中的一些数据。说的更简单，做的更简单。查看 Node.js 示例，我找不到类
scrapy - 广泛的 Scrapy 爬行 : sgmlLinkextractor rule does not work
我花了很多时间玩弄和使用谷歌，但我无法解决我的问题。我是 Scrapy 的新手，希望你能帮助我。部分有效的爬虫:我从 MySQL 数据库中定义我的 start_requests url。使用“par
Ajax 爬行 : old way vs new way (#! )
老方法当我以前在需要内容被搜索引擎索引的项目中异步加载页面时，我使用了一种非常简单的技术，那就是 Page $('#example').click(function(){
json - 爬行 : Difference between "query string parameter" and "request payload"
我正在尝试使用 Scrapy 抓取 ajax 站点，网址是 http://www.target.com/p/bounty-select-a-size-white-paper-towels-12-meg
java - 无法让 apache nutch 爬行 - 权限和 JAVA_HOME 可疑
我正在尝试按照 NutchTutorial 运行基本爬网: bin/nutch crawl urls -dir crawl -depth 3 -topN 5 所以我已经安装了 Nutch，并使用 So
python - 从 Python 库的角度来看，爬行、解析、索引、搜索之间有什么区别
很难说出这里要问什么。这个问题模棱两可、含糊不清、不完整、过于宽泛或夸夸其谈，无法以目前的形式得到合理的回答。如需帮助澄清此问题以便重新打开，visit the help center . 关闭 1
javascript - 使用 GAS : Error - Argument too large:value 进行巨大的网站抓取/爬行
我做了一个抓取脚本，通过爬行逐一抓取任何网站(要输入的url)的内部页面，获取其他内部url并处理它们以获取所有页面并提取其纯文本(剥离的html)。请参阅my previous回答。该脚本运行良好，
java - 使用 selenium : How to keep logged in after close Driver in java 爬行
无论如何，驱动程序是否可以记住登录 session ，所以它不会带我回到登录页面(例如google-chrome)? 这就是我现在正在做的事情 public static void main(Stri
node.js - NodeJS 使用 node-crawler 或 simplecrawler 进行 Web 爬行
我是网络爬虫新手，我需要一些关于这两个 Node JS 爬虫的指导。目标:我的目标是抓取网站并仅获取该域内的内部(本地)URL。我对任何页面数据或抓取不感兴趣。只是 URL。我的困惑:使用 nod

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

java - 爬行amazon.com