gpt4 book ai didi

ifilter - 用于 docx 解析器错误的 word ifilter

转载 作者:行者123 更新时间:2023-12-02 18:01:44 24 4
gpt4 key购买 nike

.Docx 文档似乎没有被索引。

我在 .docx 中使用了唯一的字符串,但当我搜索“one”时,未返回 .docx。

例如,以下文本:

“This is text of line one
and here is text of line two.”

将通过 iFilter 提取为:

“This is text of line oneand here is text of line two.”

因此,当 IFilter 解析 .docx 时,它会删除换行分隔符,并尝试解析“oneand here”……

看来 .docx 的 Word ifilter 将一行的最后一个单词与下一行的第一个单词连接起来。

任何人都可以提供一些如何解决此问题的想法吗?

提前致谢。

最佳答案

好的,我现在弄明白了。基本上是 64 位 IFilter 无法正常工作:它会把由换行符分隔的单词合并在一起,而不是把它们作为单独的词来解析。我改用 Ionic.zip 访问 docx 的 zip 存档,并用稍作修改的 DocxToText 解析其中重要的 xml 文件。现在效果很好。

下面是修改后的代码,原始版本由 Jevgenij Pankov 编写:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Ionic.Zip;
using System.IO;
using System.Xml;

/// <summary>
/// Extracts plain text from a .docx (WordprocessingML) package by opening the
/// zip archive, locating the main document part through "[Content_Types].xml"
/// and walking the XML of the document body.
/// </summary>
public class DocxToText
{
    private const string ContentTypeNamespace =
        @"http://schemas.openxmlformats.org/package/2006/content-types";

    private const string WordprocessingMlNamespace =
        @"http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Selects the <Override> element that declares the main document part.
    private const string DocumentXmlXPath =
        "/t:Types/t:Override[@ContentType=\"" +
        "application/vnd.openxmlformats-officedocument." +
        "wordprocessingml.document.main+xml\"]";

    private const string BodyXPath = "/w:document/w:body";

    private const string ContentTypesEntryName = "[Content_Types].xml";

    private readonly string docxFile;
    private string docxFileLocation = "";

    /// <summary>
    /// Creates an extractor for the given .docx file. The file is not opened
    /// until <see cref="ExtractText"/> is called.
    /// </summary>
    /// <param name="fileName">Path of the .docx file to read.</param>
    public DocxToText(string fileName)
    {
        docxFile = fileName;
    }

    #region ExtractText()
    /// <summary>
    /// Extracts text from the Docx file.
    /// </summary>
    /// <returns>Extracted text.</returns>
    /// <exception cref="InvalidOperationException">
    /// No input file name was supplied to the constructor.
    /// </exception>
    /// <exception cref="InvalidDataException">
    /// The archive does not declare a main document part.
    /// </exception>
    public string ExtractText()
    {
        if (string.IsNullOrEmpty(docxFile))
        {
            throw new InvalidOperationException("Input file not specified.");
        }

        // Usually it is "word/document.xml"
        docxFileLocation = FindDocumentXmlLocation();

        if (string.IsNullOrEmpty(docxFileLocation))
        {
            throw new InvalidDataException("It is not a valid Docx file.");
        }

        return ReadDocumentXml();
    }
    #endregion

    #region FindDocumentXmlLocation()
    /// <summary>
    /// Gets location of the "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Location of the "document.xml" without a leading slash, or
    /// <c>null</c> when "[Content_Types].xml" is missing or does not declare
    /// a main document part.
    /// </returns>
    private string FindDocumentXmlLocation()
    {
        XmlDocument xmlDoc = LoadXmlEntry(ContentTypesEntryName);
        if (xmlDoc == null)
        {
            return null;
        }

        // Create an XmlNamespaceManager for resolving namespaces
        XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
        nsmgr.AddNamespace("t", ContentTypeNamespace);

        // Find location of "document.xml"
        XmlNode node = xmlDoc.DocumentElement.SelectSingleNode(
            DocumentXmlXPath, nsmgr);
        if (node == null)
        {
            return null;
        }

        // PartName is written with a leading slash ("/word/document.xml"),
        // while the entry names it is compared against have none.
        string location = ((XmlElement)node).GetAttribute("PartName");
        return location.TrimStart(new char[] { '/' });
    }
    #endregion

    #region ReadDocumentXml()
    /// <summary>
    /// Reads "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Text contained in the document body; an empty string when the entry
    /// or the body element cannot be found.
    /// </returns>
    private string ReadDocumentXml()
    {
        XmlDocument xmlDoc = LoadXmlEntry(docxFileLocation);
        if (xmlDoc == null)
        {
            return string.Empty;
        }

        XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
        nsmgr.AddNamespace("w", WordprocessingMlNamespace);

        XmlNode body = xmlDoc.DocumentElement.SelectSingleNode(BodyXPath, nsmgr);
        if (body == null)
        {
            return string.Empty;
        }

        return ReadNode(body);
    }
    #endregion

    #region LoadXmlEntry()
    /// <summary>
    /// Loads one zip entry of the package as an XML document.
    /// </summary>
    /// <param name="entryName">
    /// Entry name to look for; compared case-insensitively (ordinal).
    /// </param>
    /// <returns>
    /// The parsed document, or <c>null</c> when the archive has no such entry.
    /// </returns>
    private XmlDocument LoadXmlEntry(string entryName)
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            foreach (ZipEntry entry in zip)
            {
                // Entry names are identifiers, not linguistic text, so the
                // comparison must not depend on the current culture.
                if (!string.Equals(entry.FileName, entryName,
                        StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                using (var stream = new MemoryStream())
                {
                    entry.Extract(stream);
                    stream.Position = 0;

                    XmlDocument xmlDoc = new XmlDocument();
                    // The package content is untrusted input: never resolve
                    // external DTDs or entities while loading it.
                    xmlDoc.XmlResolver = null;
                    xmlDoc.PreserveWhitespace = true;
                    xmlDoc.Load(stream);
                    return xmlDoc;
                }
            }
        }
        return null;
    }
    #endregion

    #region ReadNode()
    /// <summary>
    /// Reads content of the node and its nested children.
    /// </summary>
    /// <param name="node">XmlNode.</param>
    /// <returns>
    /// Text contained in the node; an empty string when the node is
    /// <c>null</c> or is not an element.
    /// </returns>
    private string ReadNode(XmlNode node)
    {
        StringBuilder sb = new StringBuilder();
        AppendNodeText(node, sb);
        return sb.ToString();
    }

    /// <summary>
    /// Appends the text of <paramref name="node"/> and its descendants to
    /// <paramref name="sb"/>. A single builder is shared by the whole
    /// recursion so nested text is not copied once per nesting level.
    /// </summary>
    /// <param name="node">Element whose children are read.</param>
    /// <param name="sb">Builder that receives the text.</param>
    private static void AppendNodeText(XmlNode node, StringBuilder sb)
    {
        if (node == null || node.NodeType != XmlNodeType.Element)
        {
            return;
        }

        foreach (XmlNode child in node.ChildNodes)
        {
            if (child.NodeType != XmlNodeType.Element)
            {
                continue;
            }

            switch (child.LocalName)
            {
                case "t": // Text
                {
                    string text = child.InnerText;
                    string space =
                        ((XmlElement)child).GetAttribute("xml:space");

                    // With xml:space="preserve" the whitespace is part of the
                    // text and is copied as written. Adding or collapsing
                    // spaces here would split or join words across runs.
                    sb.Append(space == "preserve" ? text : text.TrimEnd());
                    break;
                }

                case "cr": // Carriage return
                case "br": // Break (line, column or page)
                    sb.Append(Environment.NewLine);
                    break;

                case "tabs": // Tab stop definitions in paragraph properties
                    // The <tab> children here describe tab stop positions,
                    // not tab characters, so they contribute no text.
                    break;

                case "tab": // Tab
                    sb.Append('\t');
                    break;

                case "p": // Paragraph
                    AppendNodeText(child, sb);
                    sb.Append(Environment.NewLine);
                    sb.Append(Environment.NewLine);
                    break;

                default:
                    AppendNodeText(child, sb);
                    break;
            }
        }
    }
    #endregion
}

这是这段代码的用法...

// Build an extractor for the .docx file at filepath and read its plain text.
var extractor = new DocxToText(filepath);
string docxText = extractor.ExtractText();

关于ifilter - 用于 docx 解析器错误的 word ifilter,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/1939187/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com