gpt4 book ai didi

java - 使用 iText5 从 PDF 中提取文本和图像

转载 作者:搜寻专家 更新时间:2023-10-31 08:04:10 24 4
gpt4 key购买 nike

我需要使用 Java 库 iText5 从 PDF 文件中提取文本和图像,并在提取出的文本中映射或引用这些图像。如果 iText5 不适合这项任务,请推荐另一个具有相同功能的 Java 库。

这就是我目前所做的

import java.io.IOException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.Paragraph;

/**
 * Command-line demo that extracts the text and the embedded images of a PDF with iText 5.
 */
public class Iconverter {

    /** Page count of the PDF most recently opened by {@link #main(String[])}. */
    static int PAGE_NUMBER;

    /**
     * File name pattern handed to {@code MyImageRenderListener} for the extracted images.
     * It contains two {@code %s} placeholders that the listener fills via
     * {@code String.format}.
     */
    public static final String RESULT = "/home/sarah/Java for Dummies 4th Edition/Img%s.%s";

    /**
     * Extracts the text of every page, prints it, and then writes out all images.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String pdfName = "/home/sarah/Java for Dummies 4th Edition.pdf";
        // Accumulate the text of all pages; assigning per page would keep only the last one.
        StringBuilder docText = new StringBuilder();
        try {
            PdfReader reader = new PdfReader(pdfName);
            try {
                PAGE_NUMBER = reader.getNumberOfPages();
                for (int i = 1; i <= PAGE_NUMBER; i++) {
                    docText.append(PdfTextExtractor.getTextFromPage(reader, i)).append('\n');
                }
            } finally {
                // Release the underlying file even when extraction fails midway.
                reader.close();
            }
            // Emit the extracted text so it is not silently discarded.
            System.out.println(docText);
            extractImages(pdfName);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses a PDF and extracts all the images, writing them to files named after
     * {@link #RESULT}.
     *
     * @param filename path of the source PDF
     * @throws IOException if the PDF cannot be read or parsed
     * @throws DocumentException declared for callers; kept for interface compatibility
     */
    public static void extractImages(String filename) throws IOException, DocumentException {
        PdfReader reader = new PdfReader(filename);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            MyImageRenderListener listener = new MyImageRenderListener(RESULT);
            // Use this reader's own page count so the method also works when main() has not
            // populated PAGE_NUMBER.
            int pageCount = reader.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
                parser.processContent(i, listener);
            }
        } finally {
            reader.close();
        }
    }
}

import java.awt.image.BufferedImage;
import java.io.FileOutputStream;
import java.io.IOException;

import javax.imageio.ImageIO;

import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;

public class MyImageRenderListener implements RenderListener {

/** The new document to which we've added a border rectangle. */
protected String path = "";

/**
* Creates a RenderListener that will look for images.
*/
public MyImageRenderListener(String path) {
this.path = path;
}

/**
* @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
*/
public void beginTextBlock() {
}

/**
* @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
*/
public void endTextBlock() {
}

/**
* @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(
* com.itextpdf.text.pdf.parser.ImageRenderInfo)
*/
public void renderImage(ImageRenderInfo renderInfo) {
try {
String filename;
FileOutputStream os;
PdfImageObject image = renderInfo.getImage();
PdfName filter = (PdfName)image.get(PdfName.FILTER);
if (PdfName.DCTDECODE.equals(filter)) {
filename = String.format(path, renderInfo.getRef().getNumber(), "JPG");
os = new FileOutputStream(filename);
os.write(image.getStreamBytes());
os.flush();
os.close();
}
else if (PdfName.JPXDECODE.equals(filter)) {
filename = String.format(path, renderInfo.getRef().getNumber(), "jp2");
os = new FileOutputStream(filename);
os.write(image.getStreamBytes());
os.flush();
os.close();
}
else if (PdfName.JBIG2DECODE.equals(filter)) {
// ignore: filter not supported.
}
else {
BufferedImage awtimage = renderInfo.getImage().getBufferedImage();
if (awtimage != null) {
filename = String.format(path, renderInfo.getRef().getNumber(), "png");
ImageIO.write(awtimage, "png", new FileOutputStream(filename));
}
}
} catch (IOException e) {
e.printStackTrace();
}
}

/**
* @see com.itextpdf.text.pdf.parser.RenderListener#renderText(
* com.itextpdf.text.pdf.parser.TextRenderInfo)
*/
public void renderText(TextRenderInfo renderInfo) {
}
}

最佳答案

作为 iText 的替代品,Apache PDFBox可以帮助你。

看看这些类:

org.apache.pdfbox.ExtractImages

org.apache.pdfbox.ExtractText

关于java - 使用 iText5 从 PDF 中提取文本和图像,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/8700570/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com