gpt4 book ai didi

c++ - 无法使用 htmlcxx 单独提取文本

转载 作者:行者123 更新时间:2023-11-27 22:30:11 25 4
gpt4 key购买 nike

包括

#include <iostream>
#include <sstream>
#include <curl/curl.h>
#include <htmlcxx/html/ParserDom.h>
#include <iostream>
using namespace std;
using namespace htmlcxx;

static size_t http_write(void* buf, size_t size, size_t nmemb, void* userp)
{
if(userp)
{
ostringstream* oss = static_cast<ostringstream*>(userp);
streamsize len = size * nmemb;
oss->write(static_cast<char*>(buf), len);
return nmemb;
}

return 0;
}

string get_html_page(const string& url, long timeout = 0)
{
CURL* curl = curl_easy_init();

ostringstream oss;

curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &http_write);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_FILE, &oss);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());

curl_easy_perform(curl);
curl_easy_cleanup(curl);

return oss.str();
}

int main()
{
string html = get_html_page("http://www.google.co.in");

//cout << html << endl;
HTML::ParserDom parser;
tree<HTML::Node> dom = parser.parseTree(html);

//Print whole DOM tree
//cout <<dom <<endl;

//Dump all links in the tree
tree<HTML::Node>::iterator it = dom.begin();
tree<HTML::Node>::iterator end = dom.end();
for (; it !=end; ++it)
{
if (strcasecmp(it->tagName().c_str(), "A") == 0)
{
it->parseAttributes();
//cout << it->attribute("href").second << endl;
}
}

//Dump all text of the document
it = dom.begin();
end = dom.end();
for (; it != end; ++it)
{
if ((!it->isTag()) && (!it->isComment()))
{
cout << it->text();
}
}
// cout << endl;
return 0;
}

我使用这段代码单独从 html 页面中提取文本,它提取了 java 脚本代码,我的代码有什么问题吗??

输出:

 Googlewindow.google={kEI:"0a97TLvcFMS7rAe5htz9Ag",kEXPI:"25901,26119,26325",kCSI:{e:"25901,26119,26325",ei:"0a97TLvcFMS7rAe5htz9Ag",expi:"25901,26119,26325"},ml:function(){},kHL:"en",time:function(){return(new Date).getTime()},log:function(b,d,c){var a=new Image,e=google,g=e.lc,f=e.li;a.onerror=(a.onload=(a.onabort=function(){delete g[f]}));g[f]=a;c=c||"/gen_204?atyp=i&ct="+b+"&cad="+d+"&zx="+google.time();a.src=c;e.li=f+1},lc:[],li:0,Toolbelt:{}};
window.google.sn="webhp";window.google.timers={load:{t:{start:(new Date).getTime()}}};try{}catch(u){}window.google.jsrt_kill=1;
var _gjwl=location;function _gjuc(){var e=_gjwl.href.indexOf("#");if(e>=0){var a=_gjwl.href.substring(e);if(a.indexOf("&q=")>0||a.indexOf("#q=")>=0){a=a.substring(1);if(a.indexOf("#")==-1){for(var c=0;c<a.length;){var d=c;if(a.charAt(d)=="&")++d;var b=a.indexOf("&",d);if(b==-1)b=a.length;var f=a.substring(d,b);if(f.indexOf("fp=")==0){a=a.substring(0,c)+a.substring(b,a.length);b=c}else if(f=="cad=h")return 0;c=b}_gjwl.href="/search?"+a+"&cad=h";return 1}}}return 0}function _gjp(){!(window._gjwl.hash&&
window._gjuc())&&setTimeout(_gjp,500)};
window._gjp && _gjp()body{margin:0}#gog{padding:3px 8px 0}td{line-height:.8em}.gac_m td{line-height:17px}form{margin-bottom:20px}body,td,a,p,.h{font-family:arial,sans-serif}.h{color:#36c;font-size:20px}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}em{font-weight:bold;font-style:normal}.lst{width:496px}.tiah{width:458px}input{font-family:inherit}a.gb1,a.gb2,a.gb3,a.gb4{color:#11c !important}#gog{background:#fff}#gbar,#guser{font-size:13px;padding-top:1px !important}#gbar{float:left;height:22px}#guser{padding-bottom:7px !important;text-align:right}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}#gbs,.gbm{background:#fff;left:0;position:absolute;text-align:left;visibility:hidden;z-index:1000}.gbm{border:1px solid;border-color:#c9d7f1 #36c #36c #a2bae7;z-index:1001}.gb1{margin-right:.5em}.gb1,.gb3{zoom:1}.gb2{display:block;padding:.2em .5em}.gb2,.gb3{text-decoration:none;border-bottom:none}a.gb1,a.gb2,a.gb3,a.gb4{color:#00c !important}a.gb2:hover{background:#36c;color:#fff !important}body{background:#fff;color:black}input{-moz-box-sizing:content-box}a{color:#11c;text-decoration:none}a:hover,a:active{text-decoration:underline}.fl a{color:#4272db}a:visited{color:#551a8b}a.gb1,a.gb4{text-decoration:underline}a.gb3:hover{text-decoration:none}#ghead a.gb2:hover{color:#fff!important}.ds{display:-moz-inline-box}.ds{border-bottom:solid 1px #e7e7e7;border-right:solid 1px #e7e7e7;display:inline-block;margin:3px 0 4px;margin-left:4px}.sblc{padding-top:5px}.sblc a{display:block;margin:2px 0;margin-left:13px;font-size:11px;}.lsbb{background:#eee;border:solid 1px;border-color:#ccc #999 #999 #ccc;height:30px;display:block}.lsb{background:url(/images/srpr/nav_logo14.png) bottom;font:15px arial,sans-serif;border:none;color:#000;cursor:pointer;height:30px;margin:0;outline:0;vertical-align:top}.lsb:active{background:#ccc}.lst:focus{outline:none}.ftl,#fll a{margin:0 12px}#addlang a{padding:0 3px}.gac_v div{display:none}.gac_v .gac_v2,.gac_bt{display:block!important}google.y={};google.x=function(e,g){google.y[e.id]=[e,g];return false};window.gbar={qs:function(){},tg:function(e){var o={id:'gbar'};for(i in e)o[i]=e[i];google.x(o,function(){gbar.tg(o)})}};Web Images Maps News Orkut Books Gmail more &#9660;Translate Scholar Blogs YouTube Calendar Photos Documents Reader Sites Groups even more &raquo; iGoogle | Search settings | Sign in India&nbsp;Advanced SearchLanguage ToolsGoogle.co.in offered in: Hindi Bengali Telugu Marathi Tamil Gujarati Kannada Malayalam PunjabiAdvertising&nbsp;ProgramsAbout GoogleGo to Google.com&copy; 2010 - Privacy if(google.y)google.y.first=[];if(google.y)google.y.first=[];google.dstr=[];google.rein=[];window.setTimeout(function(){var a=document.createElement("script");a.src="/extern_js/f/CgJlbhICaW4gACswRTgBLCswWjgDLCswDjgALCswFzgHLCswJzgELCswPDgDLCswUTgDLCswCjhzQB0sKzAWOB0sKzAZOCAsKzAlOMqIASwrMDU4BCwrMEA4EiwrMEE4BSwrME44BiwrMFQ4ASwrMBg4BSwrMCY4DSyAAheQAhg/x2R96GGjycQ.js";(document.getElementById("xjsd")||document.body).appendChild(a);if(google.timers&&google.timers.load.t)google.timers.load.t.xjsls=(new Date).getTime();},0);
;google.neegg=1;google.y.first.push(function(){var form=document.f||document.f||document.gs;google.ac.i(form,form.q,'','','',{o:1,sw:1});google.History&&google.History.initialize('/')});if(google.j&&google.j.en&&google.j.xi){window.setTimeout(google.j.xi,0);google.fade=null;}(function(){
var b,d,e,f;function g(a,c){if(a.removeEventListener){a.removeEventListener("load",c,false);a.removeEventListener("error",c,false)}else{a.detachEvent("onload",c);a.detachEvent("onerror",c)}}function h(a){f=(new Date).getTime();++d;a=a||window.event;var c=a.target||a.srcElement;g(c,h)}var i=document.getElementsByTagName("img");b=i.length;d=0;for(var j=0,k;j<b;++j){k=i[j];if(k.complete||typeof k.src!="string"||!k.src)++d;else if(k.addEventListener){k.addEventListener("load",h,false);k.addEventListener("error",
h,false)}else{k.attachEvent("onload",h);k.attachEvent("onerror",h)}}e=b-d;function l(){if(!google.timers.load.t)return;google.timers.load.t.ol=(new Date).getTime();google.timers.load.t.iml=f;google.kCSI.imc=d;google.kCSI.imn=b;google.kCSI.imp=e;google.timers.load.t.xjs&&google.report&&google.report(google.timers.load,google.kCSI)}if(window.addEventListener)window.addEventListener("load",l,false);else if(window.attachEvent)window.attachEvent("onload",l);google.timers.load.t.prt=(f=(new Date).getTime());
})();

最佳答案

这是预期的行为(我是图书馆的作者之一)。如果您不想打印 javascript 负载,则需要跳过代码中的 javascript 标记。您可以简单地将以下内容添加为 for 循环中的第一行。

if (it->isTag() && strcasecasecmp(it->tagName(), "javascript") == 0) 继续;

关于c++ - 无法使用 htmlcxx 单独提取文本,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/3600718/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com