gpt4 book ai didi

java - 元素到 HTMLDocument 中的字符串

转载 作者:塔克拉玛干 更新时间:2023-11-01 22:57:35 29 4
gpt4 key购买 nike

我有一个来自 HTMLDocument 的 Element 对象,我想把这个元素的文本内容转换成字符串。

我想要这个结果

Christina Toth, Pharm. D.

=======================

请看下面的代码。

/**
 * Parses the sample HTML page into an {@link HTMLDocument} and prints, for every
 * DIV, H2 and H3 element, the tag name followed by the text covered by its children.
 *
 * @param args unused
 * @throws Exception if reading, parsing or text extraction fails
 */
public static void main(String args[]) throws Exception {
    HTMLEditorKit htmlKit = new HTMLEditorKit();
    HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();

    // Decode explicitly as UTF-8 instead of relying on the platform default charset.
    // NOTE(review): assumes Nullsoft.getInputStream() yields UTF-8 bytes, as the
    // getInputStream() shown with this snippet does - confirm.
    BufferedReader br = new BufferedReader(
            new InputStreamReader(Nullsoft.getInputStream(), "UTF-8"));
    try {
        HTMLEditorKit.Parser parser = new ParserDelegator();
        HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
        // Third argument: ignore any charset directive found in the page.
        parser.parse(br, callback, true);
        // The reader obtained from getReader() buffers parsed elements; flush() is the
        // final call that pushes pending changes into the document. Without it a small
        // page may never reach the document and the iteration below finds no DIV.
        callback.flush();
    } finally {
        br.close();
    }

    // Walk every element and report the block-level tags of interest.
    ElementIterator iterator = new ElementIterator(htmlDoc);
    Element element;
    while ((element = iterator.next()) != null) {
        AttributeSet attributes = element.getAttributes();
        Object name = attributes.getAttribute(StyleConstants.NameAttribute);
        if ((name instanceof HTML.Tag)
                && ((name == HTML.Tag.DIV) || (name == HTML.Tag.H2) || (name == HTML.Tag.H3))) {
            StringBuilder text = new StringBuilder();
            int count = element.getElementCount();
            for (int i = 0; i < count; i++) {
                // Each child covers a contiguous range of the document's text.
                Element child = element.getElement(i);
                int startOffset = child.getStartOffset();
                int length = child.getEndOffset() - startOffset;
                text.append(htmlDoc.getText(startOffset, length));
            }
            System.out.println(name + ": " + text.toString());
        }
    }
    System.exit(0);
}

/**
 * Supplies the sample HTML page as an in-memory stream of UTF-8 encoded bytes.
 *
 * @return a new stream over the encoded page; never {@code null}
 */
public static InputStream getInputStream() {
    String text = "<html>\n" +
            "<head>\n" +
            "<title>pg_0001</title>\n" +
            "\n" +
            "<style type=\"text/css\">\n" +
            ".ft3{font-style:normal;font-weight:bold;font-size:11px;font-family:Helvetica;color:#000000;}\n" +
            "</style>\n" +
            "</head>\n" +
            "<body vlink=\"#FFFFFF\" link=\"#FFFFFF\" bgcolor=\"#ffffff\">\n" +
            "\n" +
            "\n" +
            "<div style=\"position:absolute;top:597;left:252\"><nobr><span class=\"ft3\">Christina Toth, Pharm. D.</span></nobr></div>\n" +
            "\n" +
            "\n" +
            "</body>\n" +
            "</html>";
    // The Charset overload of getBytes declares no checked exception, so there is no
    // failure path on which the method could print a stack trace and return null.
    return new ByteArrayInputStream(text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
}

最佳答案

试试这个。

编辑:改用 HTMLEditorKit 的 read() 方法。

import java.io.StringReader;
import javax.swing.text.AttributeSet;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Loads a small HTML page into an {@link HTMLDocument} via
 * {@link HTMLEditorKit#read} and prints the text found under each DIV element.
 */
public class NewMain {

    /** The sample page that gets parsed. */
    private static final String PAGE
            = "<html>\n"
            + "<head>\n"
            + "<title>pg_0001</title>\n"
            + "\n"
            + "<style type=\"text/css\">\n"
            + ".ft3{font-style:normal;font-weight:bold;font-size:11px;"
            + "font-family:Helvetica;color:#000000;}\n"
            + "</style>\n"
            + "</head>\n"
            + "<body vlink=\"#FFFFFF\" link=\"#FFFFFF\" bgcolor=\"#ffffff\">\n"
            + "\n"
            + "\n"
            + "<div style=\"position:absolute;top:597;left:252\"><nobr><span "
            + "class=\"ft3\">Christina Toth, Pharm. D.</span></nobr></div>\n"
            + "\n"
            + "\n"
            + "</body>\n"
            + "</html>";

    /**
     * Parses the sample page and writes one line per DIV element to standard output,
     * in the form {@code <tag>: <text>}.
     *
     * @param args unused
     * @throws Exception if reading the page or extracting text fails
     */
    public static void main(String args[]) throws Exception {
        HTMLEditorKit kit = new HTMLEditorKit();
        HTMLDocument document = (HTMLDocument) kit.createDefaultDocument();
        kit.read(new StringReader(PAGE), document, 0);

        ElementIterator walker = new ElementIterator(document);
        for (Element current = walker.next(); current != null; current = walker.next()) {
            Object tag = current.getAttributes().getAttribute(StyleConstants.NameAttribute);
            if (tag != HTML.Tag.DIV) {
                continue;
            }
            StringBuilder line = new StringBuilder();
            line.append(tag).append(": ");
            // Concatenate the document text spanned by each direct child of the DIV.
            int children = current.getElementCount();
            for (int index = 0; index < children; index++) {
                Element part = current.getElement(index);
                int from = part.getStartOffset();
                line.append(document.getText(from, part.getEndOffset() - from));
            }
            System.out.println(line);
        }
    }
}

控制台:

div: Christina Toth, Pharm. D.

关于java - 元素到 HTMLDocument 中的字符串,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/2882782/

29 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com