gpt4 book ai didi

c# - 如何使用 iTextSharp 获取文本格式

转载 作者:IT王子 更新时间:2023-10-29 04:37:47 26 4
gpt4 key购买 nike

我正在使用 iTextSharp 从 PDF 中读取文本内容。我也能读懂。但是我丢失了文本格式,如字体、颜色等。有什么方法也可以获得这种格式。

下面是我用来精确文本的代码段 -

PdfReader reader = new PdfReader("F:\\EBooks\\AspectsOfAjax.pdf");
textBox1.Text = ExtractTextFromPDFBytes(reader.GetPageContent(1));

private string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal e.g. '\\' to get a '\' character or '\(' to get '('
bool nextLiteral = false;
// () Bracket nesting level. Text appears inside ()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "\n\r";
}
else
{
if (CheckToken(new string[] {"'", "T*", "\""}, previousCharacters))
{
resultString += "\n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 && CheckToken( new string[]{"ET"}, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '\\' && !nextLiteral)
{
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) || ((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;

// Start of a text object
if (!inTextObject && CheckToken(new string[]{"BT"}, previousCharacters))
{
inTextObject = true;
}
}
return resultString;
}
catch
{
return "";
}
}

private bool CheckToken(string[] tokens, char[] recent)
{
foreach(string token in tokens)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a))
)
{
return true;
}
}
return false;
}

最佳答案

让我尝试为您指出不同的方向。 iTextSharp 有一个非常漂亮和简单的文本提取系统,可以处理一些基本的标记。不幸的是它不处理颜色信息但是 according to @Mark Storer it might not be too hard to implement yourself .

开始编辑

我开始致力于实现颜色信息。参见 my blog post here更多细节。 (抱歉格式不好,现在去吃晚饭。)

结束编辑

下面的代码结合了这里的几个问题和答案,包括 this one to get the font height (虽然它不准确)以及另一个(在我的生活中我似乎再也找不到了)展示了如何检测人造粗体。

PostscriptFontName 返回字体名称前面的一些额外字符,我认为这与嵌入字体子集的时间有关。

下面是一个完整的 WinForms 应用程序,它以 iTextSharp 5.1.1.0 为目标并将文本提取为 HTML。

示例 PDF 的屏幕截图

Screenshot of sample PDF

提取为 HTML 的示例文本

<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Hello </span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">w</span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:37.87201">o</span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">rl</span>
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">d </span>
<br />
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Test </span>

代码

using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text.pdf.parser;
using iTextSharp.text.pdf;

namespace WindowsFormsApplication2
{
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
}

private void Form1_Load(object sender, EventArgs e)
{
PdfReader reader = new PdfReader(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Document.pdf"));
TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
string F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
Console.WriteLine(F);

this.Close();
}

public class TextWithFontExtractionStategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{
//HTML buffer
private StringBuilder result = new StringBuilder();

//Store last used properties
private Vector lastBaseLine;
private string lastFont;
private float lastFontSize;

//http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html
private enum TextRenderMode
{
FillText = 0,
StrokeText = 1,
FillThenStrokeText = 2,
Invisible = 3,
FillTextAndAddToPathForClipping = 4,
StrokeTextAndAddToPathForClipping = 5,
FillThenStrokeTextAndAddToPathForClipping = 6,
AddTextToPaddForClipping = 7
}



public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
{
string curFont = renderInfo.GetFont().PostscriptFontName;
//Check if faux bold is used
if ((renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText))
{
curFont += "-Bold";
}

//This code assumes that if the baseline changes then we're on a newline
Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
Single curFontSize = rect.Height;

//See if something has changed, either the baseline, the font or the font size
if ((this.lastBaseLine == null) || (curBaseline[Vector.I2] != lastBaseLine[Vector.I2]) || (curFontSize != lastFontSize) || (curFont != lastFont))
{
//if we've put down at least one span tag close it
if ((this.lastBaseLine != null))
{
this.result.AppendLine("</span>");
}
//If the baseline has changed then insert a line break
if ((this.lastBaseLine != null) && curBaseline[Vector.I2] != lastBaseLine[Vector.I2])
{
this.result.AppendLine("<br />");
}
//Create an HTML tag with appropriate styles
this.result.AppendFormat("<span style=\"font-family:{0};font-size:{1}\">", curFont, curFontSize);
}

//Append the current text
this.result.Append(renderInfo.GetText());

//Set currently used properties
this.lastBaseLine = curBaseline;
this.lastFontSize = curFontSize;
this.lastFont = curFont;
}

public string GetResultantText()
{
//If we wrote anything then we'll always have a missing closing tag so close it here
if (result.Length > 0)
{
result.Append("</span>");
}
return result.ToString();
}

//Not needed
public void BeginTextBlock() { }
public void EndTextBlock() { }
public void RenderImage(ImageRenderInfo renderInfo) { }
}
}
}

关于c# - 如何使用 iTextSharp 获取文本格式,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/6882098/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com