
python - How to make PDFs searchable for a Flask search application?


I have been doing research for a very important personal project. I want to create a Flask search application that lets me search for content across 100+ PDF files. I found some information about an Elasticsearch library that works well with Flask.

#!/usr/bin/env python3
#-*- coding: utf-8 -*-

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import json
from flask import Flask, jsonify, request, render_template, json
from datetime import datetime
import pandas as pd

# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)

# create a new PDF object with FPDF
pdf = FPDF()

# use an iterator to create 10 pages
for page in range(10):
    pdf.add_page()
    pdf.set_font("Arial", size=14)
    pdf.cell(150, 12, txt="Object Rocket ROCKS!!", ln=1, align="C")

# output all of the data to a new PDF file
pdf.output("object_rocket.pdf")

'''
read_pdf = PyPDF2.PdfFileReader("object_rocket.pdf")
page = read_pdf.getPage(0)
page_mode = read_pdf.getPageMode()
page_text = page.extractText()
print (type(page_text))
'''
#with open(path, 'rb') as file:

# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)

# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()

# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)

# create a dictionary object for page data
all_pages = {}

# put meta data into a dict key
all_pages["meta"] = {}

# Use 'iteritems()' instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print(meta, value)
    all_pages["meta"][meta] = value

# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()

    # extract the page's text
    page_text = data.extractText()

    # put the text data into the dict
    all_pages[page] = page_text

# create a JSON string from the dictionary
json_data = json.dumps(all_pages)
#print ("\nJSON:", json_data)

# convert JSON string to bytes-like obj
bytes_string = bytes(json_data, 'utf-8')
#print ("\nbytes_string:", bytes_string)

# convert bytes to base64 encoded string
encoded_pdf = base64.b64encode(bytes_string)
encoded_pdf = str(encoded_pdf)
#print ("\nbase64:", encoded_pdf)

# put the PDF data into a dictionary body to pass to the API request
body_doc = {"data": encoded_pdf}

# call the index() method to index the data
result = elastic_client.index(index="pdf", doc_type="_doc", id="42", body=body_doc)

# print the returned results
#print ("\nindex result:", result['result'])

# make another Elasticsearch API request to get the indexed PDF
result = elastic_client.get(index="pdf", doc_type='_doc', id=42)

# print the data to terminal
result_data = result["_source"]["data"]
#print ("\nresult_data:", result_data, '-- type:', type(result_data))

# decode the base64 data (use [2:-1] to slice off
# the leading b' and trailing ' from the stringified bytes)
decoded_pdf = base64.b64decode(result_data[2:-1]).decode("utf-8")
#print ("\ndecoded_pdf:", decoded_pdf)

# take decoded string and make into JSON object
json_dict = json.loads(decoded_pdf)
#print ("\njson_str:", json_dict, "\n\ntype:", type(json_dict))
result2 = elastic_client.index(index="pdftext", doc_type="_doc", id="42", body=json_dict)

# create new FPDF object
pdf = FPDF()

# build the new PDF from the Elasticsearch dictionary
# Use 'iteritems()' instead of 'items()' for Python 2
""" for page, value in json_dict.items():
    if page != "meta":
        # create new page
        pdf.add_page()
        pdf.set_font("Arial", size=14)

        # add content to page
        output = value + " -- Page: " + str(int(page)+1)
        pdf.cell(150, 12, txt=output, ln=1, align="C")
    else:
        # create the meta data for the new PDF
        for meta, meta_val in json_dict["meta"].items():
            if "title" in meta.lower():
                pdf.set_title(meta_val)
            elif "producer" in meta.lower() or "creator" in meta.lower():
                pdf.set_creator(meta_val)
"""
# output the PDF object's data to a PDF file
#pdf.output("object_rocket_from_elaticsearch.pdf" )

@app.route('/', methods=['GET'])
def index():
    return jsonify(json_dict)

@app.route('/<id>', methods=['GET'])
def index_by_id(id):
    return jsonify(json_dict[id])


""" @app.route('/insert_data', methods=['PUT'])
def insert_data():
slug = request.form['slug']
title = request.form['title']
content = request.form['content']

body = {
'slug': slug,
'title': title,
'content': content,
'timestamp': datetime.now()
}

result = es.index(index='contents', doc_type='title', id=slug, body=body)

return jsonify(result) """



app.run(port=5003, debug=True)

------ Progress ------
I now have a working solution without frontend search capability:
# Load_single_PDF_BY_PAGE_TO_index.py
#!/usr/bin/env python3
#-*- coding: utf-8 -*-

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64

from flask import Flask, jsonify, request, render_template, json
from datetime import datetime
import pandas as pd

# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)


#with open(path, 'rb') as file:

# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)

# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()

# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)

# create a dictionary object for page data
all_pages = {}

# put meta data into a dict key
all_pages["meta"] = {}

# Use 'iteritems()' instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print(meta, value)
    all_pages["meta"][meta] = value

x = 44
# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()

    # extract the page's text
    page_text = data.extractText()

    # put the text data into the dict
    all_pages[page] = page_text

    body_doc2 = {"data": page_text}
    result3 = elastic_client.index(index="pdfclearn", doc_type="_doc", id=x, body=body_doc2)
    x += 1

The code above loads a single PDF into Elasticsearch page by page.
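To cover the 100+ PDFs mentioned at the top, the same per-page loader can be wrapped in a loop over a folder. The following is a minimal sketch, assuming a hypothetical pdfs/ directory and reusing the same PyPDF2 and Elasticsearch calls as above; the document id combines the file name and page number so each page remains individually retrievable.

import os
import PyPDF2
from elasticsearch import Elasticsearch

elastic_client = Elasticsearch(hosts=["localhost"])
PDF_DIR = "pdfs"  # hypothetical folder holding the 100+ PDF files

for file_name in os.listdir(PDF_DIR):
    if not file_name.lower().endswith(".pdf"):
        continue

    # open each PDF and walk its pages
    read_pdf = PyPDF2.PdfFileReader(os.path.join(PDF_DIR, file_name), strict=False)
    for page in range(read_pdf.getNumPages()):
        # extract the page's text and index it as its own document
        page_text = read_pdf.getPage(page).extractText()
        body_doc = {"file": file_name, "page": page + 1, "data": page_text}
        elastic_client.index(index="pdfclearn", doc_type="_doc",
                             id=file_name + "-" + str(page + 1), body=body_doc)

The next script then exposes search over these page documents through Flask routes: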
from flask import Flask, jsonify, request,render_template
from elasticsearch import Elasticsearch
from datetime import datetime
es = Elasticsearch("http://localhost:9200/")

app = Flask(__name__)

@app.route('/pdf', methods=['GET'])
def index():
    results = es.get(index='pdfclearn', doc_type='_doc', id='44')
    return jsonify(results['_source'])


@app.route('/pdf/<id>', methods=['GET'])
def index_by_id(id):
    results = es.get(index='pdfclearn', doc_type='_doc', id=id)
    return jsonify(results['_source'])



@app.route('/search/<keyword>', methods=['POST','GET'])
def search(keyword):
    body = {
        "query": {
            "multi_match": {
                "query": keyword,
                "fields": ["data"]
            }
        }
    }

    res = es.search(index="pdfclearn", doc_type="_doc", body=body)

    return jsonify(res['hits']['hits'])

@app.route("/searhbar")
def searhbar():
return render_template("index.html")

@app.route("/searhbar/<string:box>")
def process(box):
query = request.args.get('query')
if box == 'names':
keyword = box

body = {
"query": {
"multi_match": {
"query": keyword,
"fields": ["data"]
}
}
}

res = es.search(index="pdfclearn", doc_type="_doc", body=body)

return jsonify(res['hits']['hits'])

app.run(port=5003, debug=True)

With the code above, we can search for a keyword or phrase across all pages.
curl http://127.0.0.1:5003/search/test //it works!!
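To show where the keyword matched inside a page, Elasticsearch can also return highlighted snippets with each hit. Below is a hedged sketch of the same multi_match query with a highlight block added; the route name /search2/<keyword> and the fragment settings are assumptions, not part of the working code above.

@app.route('/search2/<keyword>', methods=['GET'])
def search_with_highlight(keyword):
    body = {
        "query": {
            "multi_match": {
                "query": keyword,
                "fields": ["data"]
            }
        },
        # ask Elasticsearch for short text fragments around each match
        "highlight": {
            "fields": {
                "data": {"fragment_size": 150, "number_of_fragments": 3}
            }
        }
    }

    res = es.search(index="pdfclearn", doc_type="_doc", body=body)

    # each hit now carries a 'highlight' key holding the matching fragments
    return jsonify(res['hits']['hits'])

A call such as curl http://127.0.0.1:5003/search2/test should then return the hits together with the highlighted fragments, which is useful for a frontend results page.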

I found a blog post about how to index a PDF file as Base64 in Elasticsearch. I have seen DocuSign's API do this for document templates. However, I don't understand how to JSON-ify a Base64 PDF in a way that Elasticsearch can search.
curl "http://localhost:9200/pdftext/_doc/42"

curl -X POST "http://localhost:9200/pdf/_search?q=*"
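For making the Base64-encoded PDF itself searchable, one common approach is Elasticsearch's ingest attachment processor (the ingest-attachment plugin), which extracts the text from the encoded file at index time. The sketch below assumes the plugin is installed on the node and uses a hypothetical pipeline name pdf_attachment; it is an alternative to the manual PyPDF2 extraction shown earlier, not something from the code above.

import base64
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200/")

# define an ingest pipeline that runs the attachment processor on the 'data' field
es.ingest.put_pipeline(
    id="pdf_attachment",
    body={
        "description": "Extract text from base64-encoded PDFs",
        "processors": [{"attachment": {"field": "data"}}],
    },
)

# read the raw PDF bytes and base64-encode them (no JSON or str() wrapping needed)
with open("Sheet3.pdf", "rb") as f:
    encoded_pdf = base64.b64encode(f.read()).decode("utf-8")

# index through the pipeline; the extracted text ends up in 'attachment.content'
es.index(index="pdf", doc_type="_doc", id="42",
         body={"data": encoded_pdf}, pipeline="pdf_attachment")

A match or multi_match query against the attachment.content field then searches the extracted text rather than the raw Base64 string.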

I can retrieve the Base64 of the 700-page document, but I think what I really need is to index and retrieve each page of the document.
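If the goal is one searchable document per page rather than one 700-page blob, the bulk helper in the Python client keeps the indexing fast. A minimal sketch, assuming the same PyPDF2 reader as above and a hypothetical pdfpages index; each action encodes the page number in its _id so a single page can be fetched back with es.get.

import PyPDF2
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch("http://localhost:9200/")
read_pdf = PyPDF2.PdfFileReader("Sheet3.pdf", strict=False)

def page_actions(pdf_name, reader):
    # yield one index action per page of the PDF
    for page in range(reader.getNumPages()):
        yield {
            "_index": "pdfpages",
            "_id": pdf_name + "-" + str(page + 1),
            "_source": {"file": pdf_name, "page": page + 1,
                        "data": reader.getPage(page).extractText()},
        }

# send all pages in batches instead of one index() call per page
helpers.bulk(es, page_actions("Sheet3.pdf", read_pdf))

# retrieve a single page later by its id
doc = es.get(index="pdfpages", id="Sheet3.pdf-1")
print(doc["_source"]["page"], doc["_source"]["data"][:100])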

The blogs I have researched have pulled me in different directions:
  • https://kb.objectrocket.com/elasticsearch/how-to-index-a-pdf-file-as-an-elasticsearch-index-267
  • https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-xvi-full-text-search
  • End goal: https://towardsdatascience.com/create-a-full-search-engine-via-flask-elasticsearch-javascript-d3js-and-bootstrap-275f9dc6efe1

I will keep researching Elasticsearch and Base64 encoding and decoding, but I need some help to reach my goal. Any detailed example would be greatly appreciated.

    Best Answer

    ------ Progress ------
    I now have a working solution without frontend search capability:

    # Load_single_PDF_BY_PAGE_TO_index.py
    #!/usr/bin/env python3
    #-*- coding: utf-8 -*-

    # import libraries to help read and create PDF
    import PyPDF2
    from fpdf import FPDF
    import base64

    from flask import Flask, jsonify, request, render_template, json
    from datetime import datetime
    import pandas as pd

    # import the Elasticsearch low-level client library
    from elasticsearch import Elasticsearch
    # create a new client instance of Elasticsearch
    elastic_client = Elasticsearch(hosts=["localhost"])
    es = Elasticsearch("http://localhost:9200/")
    app = Flask(__name__)


    #with open(path, 'rb') as file:

    # get the PDF path and read the file
    file = "Sheet3.pdf"
    read_pdf = PyPDF2.PdfFileReader(file, strict=False)
    #print (read_pdf)

    # get the read object's meta info
    pdf_meta = read_pdf.getDocumentInfo()

    # get the page numbers
    num = read_pdf.getNumPages()
    print ("PDF pages:", num)

    # create a dictionary object for page data
    all_pages = {}

    # put meta data into a dict key
    all_pages["meta"] = {}

    # Use 'iteritems()' instead of 'items()' for Python 2
    for meta, value in pdf_meta.items():
        print(meta, value)
        all_pages["meta"][meta] = value

    x = 44
    # iterate the page numbers
    for page in range(num):
        data = read_pdf.getPage(page)
        #page_mode = read_pdf.getPageMode()

        # extract the page's text
        page_text = data.extractText()

        # put the text data into the dict
        all_pages[page] = page_text

        body_doc2 = {"data": page_text}
        result3 = elastic_client.index(index="pdfclearn", doc_type="_doc", id=x, body=body_doc2)
        x += 1

    The code above loads a single PDF into Elasticsearch page by page.
    from flask import Flask, jsonify, request,render_template
    from elasticsearch import Elasticsearch
    from datetime import datetime
    es = Elasticsearch("http://localhost:9200/")

    app = Flask(__name__)

    @app.route('/pdf', methods=['GET'])
    def index():
        results = es.get(index='pdfclearn', doc_type='_doc', id='44')
        return jsonify(results['_source'])


    @app.route('/pdf/<id>', methods=['GET'])
    def index_by_id(id):
        results = es.get(index='pdfclearn', doc_type='_doc', id=id)
        return jsonify(results['_source'])



    @app.route('/search/<keyword>', methods=['POST','GET'])
    def search(keyword):
        body = {
            "query": {
                "multi_match": {
                    "query": keyword,
                    "fields": ["data"]
                }
            }
        }

        res = es.search(index="pdfclearn", doc_type="_doc", body=body)

        return jsonify(res['hits']['hits'])

    @app.route("/searhbar")
    def searhbar():
    return render_template("index.html")

    @app.route("/searhbar/<string:box>")
    def process(box):
    query = request.args.get('query')
    if box == 'names':
    keyword = box

    body = {
    "query": {
    "multi_match": {
    "query": keyword,
    "fields": ["data"]
    }
    }
    }

    res = es.search(index="pdfclearn", doc_type="_doc", body=body)

    return jsonify(res['hits']['hits'])

    app.run(port=5003, debug=True)

    With the code above, we can search for a keyword or phrase across all pages.
    curl http://127.0.0.1:5003/search/test //it works!!

    Regarding "python - How to make PDFs searchable for a Flask search application?", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/60031112/
