gpt4 book ai didi

java - 如何提取网页的主要内容?

转载 作者:行者123 更新时间:2023-12-02 09:27:02 42 4
gpt4 key购买 nike

我正在尝试编写网页内容的摘要。为此,我需要从网页中提取所有不相关的文本和数据。

我用过boilerpipe,但是文本提取不好。结果是here ,您可以在其中看到很多不相关的文字。

还尝试 JSoup 通过删除页眉、页脚、外部链接等来清除不相关的数据。但结果还是不符合标准。

    // Fetch and parse the page. Jsoup requires an absolute URL including the
    // protocol; a bare host like "www.anyurl.com" throws IllegalArgumentException.
    Document doc = Jsoup.connect("http://www.anyurl.com").get();
// Drop structural/boilerplate elements that never carry main content.
doc.head().remove();
doc.getElementsByTag("header").remove();
doc.getElementsByTag("footer").remove();
doc.getElementsByTag("form").remove();
doc.getElementsByTag("table").remove();
doc.getElementsByTag("meta").remove();
doc.getElementsByTag("img").remove();
doc.getElementsByTag("a").remove();
doc.getElementsByTag("br").remove();

// Drop elements whose class names mark them as chrome, not content.
doc.getElementsByClass("tags").remove();
doc.getElementsByClass("copyright").remove();
doc.getElementsByClass("widget").remove();

// Attribute selectors must be closed with ']' — unbalanced selectors such as
// "div[class*=foot" make Jsoup's parser throw a SelectorParseException.
doc.select("div[class*=foot]").remove();
doc.select("div[class*=tag]").remove();
doc.select("div[class*=Loading]").remove();
doc.select("div[class*=Widget]").remove();
doc.select("div[class*=Head]").remove();
doc.select("div[class*=menu]").remove();
doc.select("p[class*=link]").remove();

// Collect what is left of paragraph and div text as the "summary" input.
Elements paragraphs = doc.select("p");
Elements divs = doc.select("div");

formattedOutput = paragraphs.text() + divs.text();

有人可以建议我如何完成这件事吗?除了 boilerpipe 之外，还有什么 Java 库可以满足我的需要吗?

最佳答案

我不懂 Java，但你可以使用下面这个 PHP 类从网页中提取主要内容：

<?php

class ContentExtractor {

// Tag names treated as layout containers: in the second pass these become
// candidates for removal when their link/word ratio is too high or they
// hold too little text.
var $container_tags = array(
'div', 'table', 'td', 'th', 'tr', 'tbody', 'thead', 'tfoot', 'col',
'colgroup', 'ul', 'ol', 'html', 'center', 'span'
);
// Tag names removed unconditionally in the first (heuristic) pass.
var $removed_tags = array(
'script', 'noscript', 'style', 'form', 'meta', 'input', 'iframe', 'embed', 'hr', 'img',
'#comment', 'link', 'label'
);
// Container tags exempt from the minimum-text-length check.
var $ignore_len_tags = array(
'span'
);

// Threshold: containers whose links-per-word ratio exceeds this are dropped.
// Overwritten in extract() when derived from page statistics.
var $link_text_ratio = 0.04;
// Minimum characters a container must hold to survive (unless exempt).
var $min_text_len = 20;
var $min_words = 0;

// Document-wide statistics accumulated by HeuristicRemove() when
// $do_stats is true.
var $total_links = 0;
var $total_unlinked_words = 0;
var $total_unlinked_text='';
var $text_blocks = 0;

// DOMDocument built by extract().
var $tree = null;
// NOTE(review): never written or read in this class — appears unused.
var $unremoved=array();

// Normalize extracted text: decode entities and map several UTF-8 space
// characters to a plain ASCII space, then trim.
function sanitize_text($text){
$text = str_ireplace('&nbsp;', ' ', $text);
$text = html_entity_decode($text, ENT_QUOTES);

// UTF-8 byte sequences for NBSP and various Unicode space characters,
// plus a lone 0xA0 byte (Latin-1 NBSP).
$utf_spaces = array("\xC2\xA0", "\xE1\x9A\x80", "\xE2\x80\x83",
"\xE2\x80\x82", "\xE2\x80\x84", "\xE2\x80\xAF", "\xA0");
$text = str_replace($utf_spaces, ' ', $text);

return trim($text);
}

// Extract the main content of an HTML document.
//
// $text    - raw HTML markup.
// $ratio   - link/word ratio threshold; null = derive from page statistics.
// $min_len - minimum text length per container; null = derive from stats.
// Returns the cleaned HTML string, or false if the markup failed to parse.
function extract($text, $ratio = null, $min_len = null){
$this->tree = new DOMDocument();

$start = microtime(true);
// '@' silences libxml warnings, which are routine on real-world HTML.
if (!@$this->tree->loadHTML($text)) return false;

$root = $this->tree->documentElement;
$start = microtime(true);
// First pass: strip blacklisted tags; collect statistics only when at
// least one threshold must be derived automatically.
$this->HeuristicRemove($root, ( ($ratio == null) || ($min_len == null) ));

if ($ratio == null) {
$this->total_unlinked_text = $this->sanitize_text($this->total_unlinked_text);

// Count words that occur outside of links to compute this page's own
// link density, used as the removal threshold.
$words = preg_split('/[\s\r\n\t\|?!.,]+/', $this->total_unlinked_text);
$words = array_filter($words);
$this->total_unlinked_words = count($words);
unset($words);
if ($this->total_unlinked_words>0) {
$this->link_text_ratio = $this->total_links / $this->total_unlinked_words;// + 0.01;
// Widen the threshold a little so average-density blocks survive.
$this->link_text_ratio *= 1.3;
}

} else {
$this->link_text_ratio = $ratio;
};

if ($min_len == null) {
// Average text length per text block seen in the stats pass.
// NOTE(review): divides by zero when the page produced no text blocks
// (or when stats were skipped because $ratio was given) — verify.
$this->min_text_len = strlen($this->total_unlinked_text)/$this->text_blocks;
} else {
$this->min_text_len = $min_len;
}

$start = microtime(true);
// Second pass: drop containers that look like navigation/boilerplate.
$this->ContainerRemove($root);

return $this->tree->saveHTML();
}

// Depth-first pass that deletes blacklisted tags and, when $do_stats is
// true, accumulates link and unlinked-text statistics on $this.
// Returns true when the CALLER should remove $node from its parent.
function HeuristicRemove($node, $do_stats = false){
if (in_array($node->nodeName, $this->removed_tags)){
return true;
};

if ($do_stats) {
if ($node->nodeName == 'a') {
$this->total_links++;
}
// Tracks whether this node contributed any direct text, so each node
// counts at most one "text block".
$found_text = false;
};

// Children are collected first and removed after the loop, because
// removing while iterating a live DOM childNodes list skips siblings.
$nodes_to_remove = array();

if ($node->hasChildNodes()){
foreach($node->childNodes as $child){
if ($this->HeuristicRemove($child, $do_stats)) {
$nodes_to_remove[] = $child;
} else if ( $do_stats && ($node->nodeName != 'a') && ($child->nodeName == '#text') ) {
// Direct text not inside a link: count it toward "unlinked" totals.
$this->total_unlinked_text .= $child->wholeText;
if (!$found_text){
$this->text_blocks++;
$found_text=true;
}
};
}
foreach ($nodes_to_remove as $child){
$node->removeChild($child);
}
}

return false;
}

// Bottom-up pass over the DOM deciding, for each container tag, whether
// to delete it based on its link/word ratio and the amount of text it
// holds. Returns array(link_cnt, word_cnt, text_len, 'delete' => bool);
// returns 0 for a null node.
function ContainerRemove($node){
if (is_null($node)) return 0;
$link_cnt = 0;
$word_cnt = 0;
$text_len = 0;
$delete = false;
$my_text = '';

// Default ratio of 1 means "all links" when no words were counted.
$ratio = 1;

$nodes_to_remove = array();
if ($node->hasChildNodes()){
foreach($node->childNodes as $child){
$data = $this->ContainerRemove($child);

if ($data['delete']) {
$nodes_to_remove[]=$child;
} else {
// Only surviving children contribute their text length.
$text_len += $data[2];
}

$link_cnt += $data[0];

if ($child->nodeName == 'a') {
$link_cnt++;
} else {
// Direct text of this node, plus word counts from non-link children.
if ($child->nodeName == '#text') $my_text .= $child->wholeText;
$word_cnt += $data[1];
}
}

// Deferred removal, same reasoning as in HeuristicRemove().
foreach ($nodes_to_remove as $child){
$node->removeChild($child);
}

$my_text = $this->sanitize_text($my_text);

$words = preg_split('/[\s\r\n\t\|?!.,\[\]]+/', $my_text);
$words = array_filter($words);

$word_cnt += count($words);
$text_len += strlen($my_text);

};

// Only container tags are ever deleted; everything else just reports
// its counts upward.
if (in_array($node->nodeName, $this->container_tags)){
if ($word_cnt>0) $ratio = $link_cnt/$word_cnt;

if ($ratio > $this->link_text_ratio){
$delete = true;
}

if ( !in_array($node->nodeName, $this->ignore_len_tags) ) {
if ( ($text_len < $this->min_text_len) || ($word_cnt<$this->min_words) ) {
$delete = true;
}
}

}

return array($link_cnt, $word_cnt, $text_len, 'delete' => $delete);
}

}

/*
 * Simple usage example: fetch a page and print its extracted main content.
 */

$pageMarkup = file_get_contents('http://en.wikipedia.org/wiki/Shannon_index');

$pageExtractor = new ContentExtractor();
echo $pageExtractor->extract($pageMarkup);

?>

关于java - 如何提取网页的主要内容?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/40168806/

42 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com