gpt4 book ai didi

python - 从pdf转换为文本: lines and words are broken

转载 作者:行者123 更新时间:2023-12-02 04:25:09 25 4
gpt4 key购买 nike

我想用 PyPDF2 把 PDF 文件转换为文本,但转换后的文本与 PDF 原文的排版不一致。具体来说,PDF 中的一行在文本中被拆成了多行,单词也可能从中间被断开。附件是原始 PDF 以及我用下面的代码得到的文本文件。谁能帮我解决这个问题?

enter code here

import PyPDF2

def extractPdfText(filePath=''):
    """Return the concatenated text of every page in the PDF at *filePath*.

    Prints the total page count, then extracts the text of each page in
    order and joins the pieces with no separator between pages.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        A single string holding the extracted text of all pages.

    Raises:
        OSError: If *filePath* cannot be opened (e.g. it does not exist).
    """
    # The reader pulls data from the file object while pages are being
    # extracted, so all page access stays inside the ``with`` block; the
    # block guarantees the handle is closed even if extraction fails.
    with open(filePath, 'rb') as fileObject:
        # NOTE(review): PdfFileReader / numPages / getPage / extractText are
        # the legacy camelCase PyPDF2 names; newer releases appear to have
        # renamed them (PdfReader / pages / extract_text) -- confirm against
        # the installed PyPDF2 version.
        pdfFileReader = PyPDF2.PdfFileReader(fileObject)

        totalPageNumber = pdfFileReader.numPages
        print('This pdf file contains totally ' + str(totalPageNumber) + ' pages.')

        # Collect per-page text and join once instead of growing a string
        # with repeated concatenation.
        pageTexts = [
            pdfFileReader.getPage(pageIndex).extractText()
            for pageIndex in range(totalPageNumber)
        ]

    return ''.join(pageTexts)

# Placeholder value: replace with the path of the PDF to convert.
pdfFilePath = 'PDF file path'

# Extract the text of all pages of that PDF into one string.
pdfText = extractPdfText(pdfFilePath)

pdf file

converted text

最佳答案

这就是我要做的。

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt

#converts pdf, returns its text content as a string
def convert(fname, pages=None):
    """Convert a PDF to text and return the text as one string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When it is
            ``None`` or empty, an empty set is passed to
            ``PDFPage.get_pages`` (presumably meaning "all pages" --
            confirm against the pdfminer documentation).

    Returns:
        The text written by the ``TextConverter`` for the processed pages.

    Raises:
        OSError: If *fname* cannot be opened.
    """
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    # ``StringIO`` is imported by name at the top of the file, so it is
    # used directly; the ``io`` module itself is not in scope.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            # Release the converter even when a page fails to process.
            converter.close()
        # Read the buffer before the ``with`` block closes it.
        return output.getvalue()

#converts all pdfs in directory pdfDir, saves all resulting txt files to txtdir
def convertMultiple(pdfDir, txtDir):
    """Convert every PDF in *pdfDir* to a text file in *txtDir*.

    Each ``<name>.pdf`` found directly in *pdfDir* is converted with
    ``convert`` and written to ``<name>.pdf.txt`` in *txtDir*, replacing
    any existing file of that name.

    Args:
        pdfDir: Directory to scan for PDFs. An empty string means the
            current working directory.
        txtDir: Directory that receives the text files. A trailing path
            separator is optional.

    Returns:
        None.
    """
    if pdfDir == "":
        pdfDir = os.getcwd()

    for pdf in os.listdir(pdfDir):
        # Compare the real extension, case-insensitively, so "REPORT.PDF"
        # is picked up and an extension-less file named "pdf" is not.
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue

        # os.path.join works whether or not the directories were passed
        # with a trailing separator.
        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, pdf + ".txt")

        # The context manager flushes and closes the output file; UTF-8
        # avoids encode errors for characters outside the locale codec.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text)

# Placeholder paths: point pdfDir at the folder holding the PDFs and txtDir
# at the folder that should receive the generated .txt files.
pdfDir = "C://your_path_here/"
txtDir = "C://your_path_here/"
# Convert every PDF found in pdfDir and write the results into txtDir.
convertMultiple(pdfDir, txtDir)

关于python - 从pdf转换为文本: lines and words are broken,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/55220455/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com