gpt4 book ai didi

java - 解析来自 Bing 的 XHTML 结果

转载 作者:行者123 更新时间:2023-12-01 16:05:06 25 4
gpt4 key购买 nike

我正在尝试解析从 bing 搜索引擎接收到的搜索查询,这些搜索查询是在 java 的 xhtml 中接收的。我正在使用 sax XmlReader 来读取结果,但我不断收到错误。这是我的代码 - 这是供读者使用的代码:

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;


public class XHTMLHandler extends DefaultHandler{
public XHTMLHandler()
{
super();
}
public void startDocument ()
{
System.out.println("Start document");
}
public void endDocument ()
{
System.out.println("End document");
}
public void startElement (String uri, String name,String qName, Attributes atts)
{
if ("".equals (uri))
System.out.println("Start element: " + qName);
else
System.out.println("Start element: {" + uri + "}" + name);
}

public void endElement (String uri, String name, String qName)
{
if ("".equals (uri))
System.out.println("End element: " + qName);
else
System.out.println("End element: {" + uri + "}" + name);
}
public void startPrefixMapping (String prefix, String uri)
throws SAXException {
}
public void endPrefixMapping (String prefix)
throws SAXException {
}



public void characters (char ch[], int start, int length)
{
System.out.print("Characters: \"");
for (int i = start; i < start + length; i++) {
switch (ch[i]) {
case '\\':
System.out.print("\\\\");
break;
case '"':
System.out.print("\\\"");
break;
case '\n':
System.out.print("\\n");
break;
case '\r':
System.out.print("\\r");
break;
case '\t':
System.out.print("\\t");
break;
default:
System.out.print(ch[i]);
break;
}
}
System.out.print("\"\n");
}

}

这是程序本身:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpRetryException;
import java.net.HttpURLConnection;
import java.net.URL;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;


public class Searching {
private String m_urlBingSearch = "http://www.bing.com/search?q=";
private HttpURLConnection m_httpCon;
private OutputStreamWriter m_streamWriter;
//private BufferedReader m_bufferReader;
private URL m_serverAdress;
private StringBuilder sb;
private String m_line;
private InputSource m_inputSrc;
public Searching()
{

m_httpCon = null;
m_streamWriter = null;
//m_bufferReader = null;
m_serverAdress = null;
sb = null;
m_line = new String();
}
public void SearchBing(String searchPrms) throws SAXException,IOException
{


//set up connection
sb = new StringBuilder();
sb.append(m_urlBingSearch);
sb.append(searchPrms);
m_serverAdress = new URL(sb.toString());
m_httpCon = (HttpURLConnection)m_serverAdress.openConnection();
m_httpCon.setRequestMethod("GET");
m_httpCon.setDoOutput(true);
m_httpCon.setConnectTimeout(10000);
m_httpCon.connect();
//m_streamWriter = new OutputStreamWriter(m_httpCon.getOutputStream());
//m_bufferReader = new BufferedReader(new InputStreamReader(m_httpCon.getInputStream()));
XMLReader reader = XMLReaderFactory.createXMLReader();
XHTMLHandler handle = new XHTMLHandler();
reader.setContentHandler(handle);
reader.setErrorHandler(handle);
//reader.startPrefixMapping("html", "http://www.w3.org/1999/xhtml");
handle.startPrefixMapping("html", "http://www.w3.org/1999/xhtml");
m_inputSrc = new InputSource(m_httpCon.getInputStream());
reader.parse(m_inputSrc);
m_httpCon.disconnect();


}
public static void main(String [] args) throws SAXException,IOException
{
Searching s = new Searching();
s.SearchBing("beatles");
}
}

这是我的错误消息:

Exception in thread "main" java.io.IOException: Server returned HTTP response code: 503 for URL: http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd    at sun.net.www.protocol.http.HttpURLConnection.getInputStream(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.setupCurrentEntity(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.startEntity(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLEntityManager.startDTDEntity(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLDTDScannerImpl.setInputSource(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.dispatch(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$DTDDriver.next(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$PrologDriver.next(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLNSDocumentScannerImpl.next(Unknown Source)    at com.sun.org.apache.xerces.internal.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)    at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(Unknown Source)    at com.sun.org.apache.xerces.internal.parsers.XML11Configuration.parse(Unknown Source)    at com.sun.org.apache.xerces.internal.parsers.XMLParser.parse(Unknown Source)    at com.sun.org.apache.xerces.internal.parsers.AbstractSAXParser.parse(Unknown Source)    at Searching.SearchBing(Searching.java:57)    at Searching.main(Searching.java:65)

有人可以帮忙吗?我认为这与 dtd 有关,但我不知道修复它

最佳答案

Server returned HTTP response code: 503 for URL: http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd

显然您正在尝试使用外部实体获取解析器来解析 XHTML 文档。它拖入 DTD 外部子集,以便它可以读取 HTML 特定实体的任何声明,例如  é

此时,您从托管该 DTD 外部子集的 w3.org 服务器收到 HTTP 503,但即使您没有收到,每次都用 DTD 请求轰炸该服务器仍然是非常不礼貌的行为你刮一下。 (也许他们就因为这个原因阻止了你?)

您可以创建一个EntityResolver返回您自己的 DTD 本地副本,或仅包含实体定义的精简版本。或者,如果您拥有的 XMLReader 实现支持该功能,您可以通过使用 setFeature 关闭该选项,要求读取器根本不获取 DTD。 (例如 for Xerxes 。)不过,如果文档包含非内置实体引用(例如  ),您可能会遇到麻烦。

此外,由于这是一个以 text/html 形式提供的实时网页,特别是因为它来自 Microsoft,因此可以非常乐观地假设它将保持格式良好!屏幕抓取通常最好使用能够容忍 HTML 怪异的解析器来完成。但正如上面的评论所述,无论如何,使用 API 都比屏幕抓取要好得多。

关于java - 解析来自 Bing 的 XHTML 结果,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/2775409/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com