gpt4 book ai didi

java - IText 像 pdftotext -layout 一样读取 PDF?

转载 作者:行者123 更新时间:2023-12-02 05:36:52 24 4
gpt4 key购买 nike

我正在寻找实现 java 解决方案的最简单方法,该解决方案的输出应与在 Linux 机器上运行

pdftotext -layout FILE

所得到的输出非常相似。(当然它也应该便宜)

我刚刚尝试了 IText、PDFBox 和 PDFTextStream 的一些代码片段。迄今为止最准确的解决方案是 PDFTextStream,它使用 VisualOutputTarget 来获得我的文件的良好表示。

这样我的列布局就被正确识别了,我可以直接使用它。但是 IText 也应该有相应的解决方案,不是吗?

我发现的每个简单的片段都只会生成简单的顺序字符串,结果一团糟(行、列和文本行都被打乱)。有没有更简单、并且最好不需要自己编写策略的解决方案?或者有我可以使用的开源策略吗?

//我按照 mkl 的说明编写了自己的策略对象,如下所示:

package com.test.pdfextractiontest.itext;

import ...


public class MyLocationTextExtractionStrategy implements TextExtractionStrategy {

/** set to true for debugging */
static boolean DUMP_STATE = false;

/** a summary of all found text */
private final List<TextChunk> locationalResult = new ArrayList<TextChunk>();


public MyLocationTextExtractionStrategy() {
}


@Override
public void beginTextBlock() {
}


@Override
public void endTextBlock() {
}

private boolean startsWithSpace(final String str) {
if (str.length() == 0) {
return false;
}
return str.charAt(0) == ' ';
}


private boolean endsWithSpace(final String str) {
if (str.length() == 0) {
return false;
}
return str.charAt(str.length() - 1) == ' ';
}

private List<TextChunk> filterTextChunks(final List<TextChunk> textChunks, final TextChunkFilter filter) {
if (filter == null) {
return textChunks;
}

final List<TextChunk> filtered = new ArrayList<TextChunk>();
for (final TextChunk textChunk : textChunks) {
if (filter.accept(textChunk)) {
filtered.add(textChunk);
}
}
return filtered;
}


protected boolean isChunkAtWordBoundary(final TextChunk chunk, final TextChunk previousChunk) {
final float dist = chunk.distanceFromEndOf(previousChunk);

if (dist < -chunk.getCharSpaceWidth() || dist > chunk.getCharSpaceWidth() / 2.0f) {
return true;
}

return false;
}

public String getResultantText(final TextChunkFilter chunkFilter) {
if (DUMP_STATE) {
dumpState();
}

final List<TextChunk> filteredTextChunks = filterTextChunks(this.locationalResult, chunkFilter);
Collections.sort(filteredTextChunks);

final StringBuffer sb = new StringBuffer();
TextChunk lastChunk = null;
for (final TextChunk chunk : filteredTextChunks) {

if (lastChunk == null) {
sb.append(chunk.text);
} else {
if (chunk.sameLine(lastChunk)) {

if (isChunkAtWordBoundary(chunk, lastChunk) && !startsWithSpace(chunk.text)
&& !endsWithSpace(lastChunk.text)) {
sb.append(' ');
}
final Float dist = chunk.distanceFromEndOf(lastChunk)/3;
for(int i = 0; i<Math.round(dist); i++) {
sb.append(' ');
}
sb.append(chunk.text);
} else {
sb.append('\n');
sb.append(chunk.text);
}
}
lastChunk = chunk;
}

return sb.toString();
}

返回一个带有结果文本的字符串。 */ @覆盖 公共(public)字符串 getResultantText() {

        return getResultantText(null);

}

private void dumpState() {
for (final TextChunk location : this.locationalResult) {
location.printDiagnostics();

System.out.println();
}

}


@Override
public void renderText(final TextRenderInfo renderInfo) {
LineSegment segment = renderInfo.getBaseline();
if (renderInfo.getRise() != 0) {

final Matrix riseOffsetTransform = new Matrix(0, -renderInfo.getRise());
segment = segment.transformBy(riseOffsetTransform);
}
final TextChunk location =
new TextChunk(renderInfo.getText(), segment.getStartPoint(), segment.getEndPoint(),
renderInfo.getSingleSpaceWidth(),renderInfo);
this.locationalResult.add(location);
}

public static class TextChunk implements Comparable<TextChunk> {
/** the text of the chunk */
private final String text;
/** the starting location of the chunk */
private final Vector startLocation;
/** the ending location of the chunk */
private final Vector endLocation;
/** unit vector in the orientation of the chunk */
private final Vector orientationVector;
/** the orientation as a scalar for quick sorting */
private final int orientationMagnitude;

private final TextRenderInfo info;

private final int distPerpendicular;

private final float distParallelStart;

private final float distParallelEnd;
/** the width of a single space character in the font of the chunk */
private final float charSpaceWidth;

public TextChunk(final String string, final Vector startLocation, final Vector endLocation,
final float charSpaceWidth,final TextRenderInfo ri) {
this.text = string;
this.startLocation = startLocation;
this.endLocation = endLocation;
this.charSpaceWidth = charSpaceWidth;

this.info = ri;

Vector oVector = endLocation.subtract(startLocation);
if (oVector.length() == 0) {
oVector = new Vector(1, 0, 0);
}
this.orientationVector = oVector.normalize();
this.orientationMagnitude =
(int) (Math.atan2(this.orientationVector.get(Vector.I2), this.orientationVector.get(Vector.I1)) * 1000);

final Vector origin = new Vector(0, 0, 1);
this.distPerpendicular = (int) startLocation.subtract(origin).cross(this.orientationVector).get(Vector.I3);

this.distParallelStart = this.orientationVector.dot(startLocation);
this.distParallelEnd = this.orientationVector.dot(endLocation);
}

public Vector getStartLocation() {
return this.startLocation;
}


public Vector getEndLocation() {
return this.endLocation;
}


public String getText() {
return this.text;
}

public float getCharSpaceWidth() {
return this.charSpaceWidth;
}

private void printDiagnostics() {
System.out.println("Text (@" + this.startLocation + " -> " + this.endLocation + "): " + this.text);
System.out.println("orientationMagnitude: " + this.orientationMagnitude);
System.out.println("distPerpendicular: " + this.distPerpendicular);
System.out.println("distParallel: " + this.distParallelStart);
}


public boolean sameLine(final TextChunk as) {
if (this.orientationMagnitude != as.orientationMagnitude) {
return false;
}
if (this.distPerpendicular != as.distPerpendicular) {
return false;
}
return true;
}


public float distanceFromEndOf(final TextChunk other) {
final float distance = this.distParallelStart - other.distParallelEnd;
return distance;
}

public float myDistanceFromEndOf(final TextChunk other) {
final float distance = this.distParallelStart - other.distParallelEnd;
return distance;
}


@Override
public int compareTo(final TextChunk rhs) {
if (this == rhs) {
return 0; // not really needed, but just in case
}

int rslt;
rslt = compareInts(this.orientationMagnitude, rhs.orientationMagnitude);
if (rslt != 0) {
return rslt;
}

rslt = compareInts(this.distPerpendicular, rhs.distPerpendicular);
if (rslt != 0) {
return rslt;
}

return Float.compare(this.distParallelStart, rhs.distParallelStart);
}

private static int compareInts(final int int1, final int int2) {
return int1 == int2 ? 0 : int1 < int2 ? -1 : 1;
}


public TextRenderInfo getInfo() {
return this.info;
}

}


@Override
public void renderImage(final ImageRenderInfo renderInfo) {
// do nothing
}


public static interface TextChunkFilter {

public boolean accept(TextChunk textChunk);
}


}

正如你所看到的,大部分与原始类相同。我刚刚添加了这个:

                // One padding space per three units of gap between the two chunks, rounded.
                final float gap = chunk.distanceFromEndOf(lastChunk) / 3;
                for (int remaining = Math.round(gap); remaining > 0; remaining--) {
                    sb.append(' ');
                }

到 getResultantText 方法以用空格扩展间隙。但问题是:

距离似乎不准确或不精确。结果看起来像这样(原帖中的示例输出此处缺失):

有人知道如何为这个距离计算出更好的值吗?我认为这是因为原始字体是 ArialMT,而我的编辑器使用的是 Courier;但要处理这张表,我需要能在正确的位置拆分表格以取出数据。由于各个值的起始和结束位置都是浮动的等原因,这很困难。

:-/

最佳答案

像这样插入空格的方法存在问题

            // One padding space per three units of gap between the two chunks, rounded.
            final float gap = chunk.distanceFromEndOf(lastChunk) / 3;
            for (int remaining = Math.round(gap); remaining > 0; remaining--) {
                sb.append(' ');
            }

问题在于:它假设 StringBuffer 中的当前位置恰好对应于 lastChunk 的末尾,并且假设每个字符的宽度为 3 个用户空间单位。情况不一定如此,通常每添加一个字符都会破坏之前的对应关系。例如,使用比例字体时,下面这两行的宽度并不相同:

ililili

MWMWMWM

但在 StringBuffer 中,它们占据相同的长度。

因此,您必须查看文本块(chunk)相对于左页面边框的起始位置,并相应地向缓冲区添加空格。

此外,您的代码完全忽略了行首的空白区域。

如果您用以下代码替换原始方法 getResultantText(TextChunkFilter),您的结果应该会有所改善:

/**
 * Builds the text from all chunks accepted by the filter, padding with spaces so that each
 * chunk starts at the column insertSpaces derives from its parallel start position.
 * A new output line is started whenever a chunk is not on the same line as its predecessor.
 *
 * @param chunkFilter filter passed on to filterTextChunks to select the chunks to include
 * @return the assembled text
 */
public String getResultantText(TextChunkFilter chunkFilter) {
    if (DUMP_STATE) {
        dumpState();
    }

    List<TextChunk> chunks = filterTextChunks(locationalResult, chunkFilter);
    Collections.sort(chunks);

    StringBuffer out = new StringBuffer();
    // Buffer index at which the current output line begins.
    int lineStart = 0;
    TextChunk previous = null;
    for (TextChunk current : chunks) {
        boolean continuesLine = previous != null && current.sameLine(previous);
        if (continuesLine) {
            // Within a line, spaces are only added where a word boundary was detected.
            if (isChunkAtWordBoundary(current, previous)) {
                boolean needsSeparator = !startsWithSpace(current.text) && !endsWithSpace(previous.text);
                insertSpaces(out, lineStart, current.distParallelStart, needsSeparator);
            }
        } else {
            // First chunk overall, or first chunk of a new line: indent from the line start.
            if (previous != null) {
                out.append('\n');
                lineStart = out.length();
            }
            insertSpaces(out, lineStart, current.distParallelStart, false);
        }
        out.append(current.text);
        previous = current;
    }

    return out.toString();
}

/**
 * Appends spaces to the buffer until the current line is as long as the column index
 * computed for {@code chunkStart}, i.e. {@code (chunkStart - pageLeft) / fixedCharWidth}
 * truncated to an int.
 *
 * @param sb the buffer to pad
 * @param startOfLinePosition index in {@code sb} at which the current line begins
 * @param chunkStart start position of the chunk about to be appended
 * @param spaceRequired if true, at least one space is appended even when the line is
 *        already at or beyond the target column
 * @throws IllegalStateException if {@code fixedCharWidth} is not a positive number
 */
void insertSpaces(StringBuffer sb, int startOfLinePosition, float chunkStart, boolean spaceRequired) {
    // A zero width makes the division infinite; the int cast then saturates and the loop
    // below would try to append billions of spaces. Reject meaningless widths up front
    // (the negated comparison also catches NaN).
    if (!(fixedCharWidth > 0)) {
        throw new IllegalStateException("fixedCharWidth must be positive but was " + fixedCharWidth);
    }

    int indexNow = sb.length() - startOfLinePosition;
    int indexToBe = (int) ((chunkStart - pageLeft) / fixedCharWidth);
    int spacesToInsert = indexToBe - indexNow;
    if (spacesToInsert < 1 && spaceRequired) {
        spacesToInsert = 1;
    }
    for (; spacesToInsert > 0; spacesToInsert--) {
        sb.append(' ');
    }
}

/**
 * Coordinate of the left page border; insertSpaces subtracts it from a chunk's start
 * position before converting that position to a column index.
 */
public float pageLeft = 0;
/**
 * Assumed width of one output character; insertSpaces divides the horizontal offset by
 * this value to obtain the target column index.
 */
public float fixedCharWidth = 6;

pageLeft 是左页面边框的坐标。策略不知道这一点,因此必须明确告知;但在许多情况下,0 是正确的值。

或者,也可以使用所有文本块(chunk)中最小的 distParallelStart 值。这会切掉左边距,但不需要您注入(inject)精确的左页面边框值。

fixedCharWidth 是假定的字符宽度。根据相关 PDF 中的书写情况,不同的值可能更合适。在你的例子中,值 3 似乎比我的 6 更好。

这段代码还有很大的改进空间。例如

  • 它假设没有跨越多个表列的文本块(chunk)。这种假设通常是正确的,但我见过一些奇怪的 PDF:其中正常的字间距是通过在某个偏移处使用单独的文本块来实现的,而列间距却由单个文本块中的一个空格字符表示(该文本块跨越一栏的结束和下一栏的开始)!该空格字符的宽度是由 PDF 图形状态的字间距设置控制的。

  • 它忽略了行与行之间大小不一的垂直间距。

关于java - IText 像 pdftotext -layout 一样读取 PDF?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/24887784/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com