gpt4 book ai didi

javascript - 将 html 元素插入文本字符串以匹配另一个 html 字符串

转载 作者:技术小花猫 更新时间:2023-10-29 12:39:01 25 4
gpt4 key购买 nike

有两个文件 pdf 和 html,以纯文本字符串(从 pdf 中提取文本后)和 html 读取文件,现在试图使纯文本具有与 html 字符串相同的 html 标签。然后比较它们,找出不同之处

当前无法正常工作的简单示例的最终编辑

// Plain-text input (per the question, the text extracted from the PDF).
var text1="here is example text";

// HTML input whose tags should be re-applied to the plain text.
var text2="<html><body><div>here is another <span>example</span> text</div></body></html>";

// Load the markup into a <div> that is never attached to the page, then read
// back its text content so the HTML can be compared without its tags.
var div = document.createElement("div");
div.innerHTML = text2;
var text = div.textContent || div.innerText || "";

// content: space-separated words of the HTML text
// alltags: every "<...>" substring of the markup (non-greedy match)
// pdfwords: space-separated words of the plain text
var content= text.split(" ");
var alltags=text2.match(/<.+?>/g);
var pdfwords=text1.split(" ");
var output="";
// NOTE(review): the indices look swapped. `j` runs over the tags but is used
// to index `content` (words), while `i` runs over the words but is used to
// index `alltags`. `i` is also never declared, so it becomes an implicit
// global. This is the part the question reports as not working.
for(var j=0; j<alltags.length; j++){
for(i=0; i<pdfwords.length; i++){
if(pdfwords[i]===content[j]){

output+=alltags[i]+pdfwords[i];
}
}
}

// Write the assembled string into the document.
document.write(output);

输出应该是

"<html><body><div>here is another<span>example</span> text</div></body></html>"

比较输出和 text2 这两个字符串,应当显示出差异,因为 text2 中多了单词 "another"

最佳答案

这是您想要的简单解决方案,它是一个动态解决方案,因为它将处理找到的任何标签并仅比较文本内容。 findDiff()将找到差异并以输出和不同单词的数组作为参数调用回调函数。

JSFiddle:https://jsfiddle.net/9svuc7om/18/

/**
 * Parse PDF text into an array of word tokens.
 *
 * The text is split on single space characters. Empty strings produced by
 * leading, trailing or consecutive spaces are dropped, so the result only
 * contains non-empty words.
 * @param {string} text The PDF text to be parsed
 * @return {string[]} The non-empty tokens, in their original order
 */
function parsePDFText(text) {
  // Filtering into a new array avoids removing elements from the array that
  // is being iterated, which would skip the element following each removal
  // and leave empty tokens behind for runs of several spaces.
  return text.split(' ').filter(function (word) {
    return word !== '';
  });
}

/**
 * Return the smallest index among the arguments, ignoring "not found" values.
 *
 * Each argument is expected to be the result of an `indexOf()` call, where
 * `-1` means "not found".
 * @param {...number} index An `indexOf()` result
 * @return {number} The smallest argument that is not `-1`, or `-1` if there
 *     are no arguments or every argument is `-1`
 */
function findMinIndex() {
  var min;
  for (var i = 0, l = arguments.length; i < l; i++) {
    // indexOf() returns -1 if not found
    if (arguments[i] === -1) {
      continue;
    }
    if (typeof min === 'undefined' || arguments[i] < min) {
      min = arguments[i];
    }
  }
  // Compare against undefined explicitly: 0 is a valid index and must not be
  // turned into -1 by a truthiness check.
  return typeof min === 'undefined' ? -1 : min;
}

/**
 * Split HTML text into a flat array of tokens.
 *
 * Three token types are produced:
 *   - 'tag':   everything from a '<' up to and including the next '>'
 *   - 'space': a single space character
 *   - 'text':  a run of characters delimited by a space, a '<' or the end of
 *              the input
 * Every token starts with `valid: true`.
 * @param {string} text The HTML text to be parsed
 * @return {Array<{type: string, content: string, valid: boolean}>} The tokens
 *     in input order; concatenating their `content` reproduces `text`
 */
function parseHTMLText(text) {
  var currentIndex = 0,
      tl = text.length,
      tokens = [],
      token, firstChar, endPos, closePos;
  while (currentIndex < tl) {
    // determine the next token type from its first character
    firstChar = text.charAt(currentIndex);
    if (firstChar === '<') {
      // a tag: runs up to and including the next '>'
      closePos = text.indexOf('>', currentIndex + 1);
      // An unterminated tag has no '>'. Consume the rest of the input as the
      // tag so that currentIndex always advances; otherwise endPos would be 0
      // and the loop would restart from the beginning forever.
      endPos = closePos === -1 ? tl : closePos + 1;
      token = {
        type: 'tag',
        content: text.slice(currentIndex, endPos),
        valid: true
      };
      currentIndex = endPos;
    } else if (firstChar === ' ') {
      // a single space
      token = {
        type: 'space',
        content: ' ',
        valid: true
      };
      currentIndex++;
    } else {
      // a word: ends at the nearest following '<' or space
      endPos = findMinIndex(text.indexOf('<', currentIndex), text.indexOf(' ', currentIndex));
      // -1 means no delimiter is left, so the word runs to the end of input
      if (endPos === -1) {
        endPos = tl;
      }
      token = {
        type: 'text',
        content: text.slice(currentIndex, endPos),
        valid: true
      };
      currentIndex = endPos;
    }
    tokens.push(token);
  }
  return tokens;
}

/**
 * Compare PDF text with HTML text word by word and report the result through
 * a callback.
 *
 * The PDF words are the reference: they are matched, in order, against the
 * 'text' tokens of the HTML. Every HTML word that is skipped over while
 * looking for the next PDF word, and every HTML word left after the last PDF
 * word has been matched, is removed from the output and recorded in `diff`.
 * Tags and spaces are always kept.
 * @param {string} pdfText The pdf text
 * @param {string} htmlText The html text
 * @param {function(string, string[])} callback Called synchronously with the
 *     HTML rebuilt from the kept tokens and the array of removed words
 */
function findDiff(pdfText, htmlText, callback) {
  var output = '', // the final output
      diff = [], // the words present in the html but not matched by the pdf
      pdfTokens = parsePDFText(pdfText),
      htmlTokens = parseHTMLText(htmlText),
      j = 0,
      hl = htmlTokens.length;
  // j is shared by all iterations so each pdf word is searched for only after
  // the position where the previous pdf word matched
  for (var i = 0, pl = pdfTokens.length; i < pl; i++) {
    for (; j < hl; j++) {
      if (htmlTokens[j].type !== 'text') {
        // tags and spaces take no part in the comparison
        continue;
      }
      if (htmlTokens[j].content === pdfTokens[i]) {
        // match found: step past it and move on to the next pdf word
        j++;
        break;
      }
      // html word that is not the expected pdf word: report and drop it
      diff.push(htmlTokens[j].content);
      htmlTokens[j].valid = false;
    }
  }
  // html words remaining after the last match are differences as well, so
  // they are reported in `diff` in addition to being dropped from the output
  for (; j < hl; j++) {
    if (htmlTokens[j].type === 'text') {
      diff.push(htmlTokens[j].content);
      htmlTokens[j].valid = false;
    }
  }
  // rebuild the html from the tokens that are still valid
  for (j = 0; j < hl; j++) {
    if (htmlTokens[j].valid) {
      output += htmlTokens[j].content;
    }
  }
  callback(output, diff);
}

您可以按如下方式调用该函数:

// Example call: log the rebuilt HTML first, then the array of differing words.
findDiff(text1, text2, function (output, diff) {
  [output, diff].forEach(function (value) {
    console.log(value);
  });
});

但是,这个方案也有一些局限性

  1. 它假定 pdf 中的所有内容都存在于 HTML 文本中
  2. 它只处理 <> 和空格,如果还有其他可能的分隔符(例如制表符),则需要额外的代码
  3. 它假定所有标签都是格式正确的,并且文本内容中不会出现 < 或 > 字符(如果需要,您应该使用 &gt; &lt; 代替)
  4. 该功能是一个简化的解决方案,未经过全面测试。您不能指望它有任何保证,需要进行一些调整。我建议只提供 body 中的内容甚至更窄的范围而不是整个 HTML 文件(如果在您的情况下可能),因为 HTML 文件的内容会有太多变化。

关于javascript - 将 html 元素插入文本字符串以匹配另一个 html 字符串,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/37264902/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com