gpt4 book ai didi

openai-api - 如何使用 chatgpt dev api 增量构建索引

转载 作者:行者123 更新时间:2023-12-02 05:49:24 25 4
gpt4 key购买 nike

我正在使用 chatgpt dev api 在我的自定义数据上训练模型,但我需要增量训练它,因为每次添加一些新数据时在所有文档上创建索引并不理想,因为需要计算成本在完整的文档列表中,那么正确的方法是什么,以便我只为附加的新数据付费,并使用该新数据更新索引。

下面是我的实现

import hashlib

from llama_index import StorageContext, load_index_from_storage, GPTVectorStoreIndex, LLMPredictor, PromptHelper
from langchain import OpenAI
from typing import List
import gradio as gr
import os

os.environ["OPENAI_API_KEY"] = 'xxxxxxxx'

class Document:
def __init__(self,
text,
doc_id,
metadata=None,
extra_info_str: str = "",
embedding: List[float] = None,
extra_info=None):
self.text = text
self.doc_id = doc_id
self.metadata = metadata if metadata is not None else {}
self.extra_info_str = extra_info_str
self.extra_info = extra_info
self.embedding = embedding

def get_doc_id(self):
return self.doc_id

def get_doc_hash(self):
return hashlib.md5(self.text.encode('utf-8')).hexdigest()

def get_text(self):
return self.text


def construct_index(file_path, checkpoint_file):
max_input_size = 4096
num_outputs = 512
max_chunk_overlap = 20
chunk_size_limit = 600

prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.7, model_name="text-davinci-003", max_tokens=num_outputs))

# Load the checkpoint file
checkpoint = 0
if os.path.exists(checkpoint_file):
with open(checkpoint_file, "r") as f:
checkpoint = int(f.read().strip())

# Load the new data
with open(file_path, "r") as f:
new_entries = f.readlines()[checkpoint:]

if len(new_entries) == 0:
return

concatenated_text = ''.join(new_entries)
document = Document(text=concatenated_text, doc_id="123")

folder_path = "/Media/Disk1/sandbox/ml/chatgpt/index_storage/"
files = [file for file in os.listdir(folder_path)]

if len(files) > 0:

merged_document_list = []
# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/")

# load index
existing_index = load_index_from_storage(storage_context)

for doc_id in list(existing_index.docstore.to_dict().get("docstore/data").keys()):

# doc_id = list(existing_index.docstore.to_dict().get("docstore/metadata").keys())[1]
old_document_data = existing_index.docstore.get_document(doc_id)
old_document = Document(text=old_document_data.text, doc_id=doc_id)
merged_document_list.append(old_document)

merged_document_list.append(document)

new_index = GPTVectorStoreIndex.from_documents(merged_document_list,
llm_predictor=llm_predictor,
prompt_helper=prompt_helper)

new_index.storage_context.persist(persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/")

# Update the checkpoint file
with open(checkpoint_file, "w") as f:
f.write(str(len(new_entries) + checkpoint))

return new_index

else:
new_index = GPTVectorStoreIndex.from_documents([document],
llm_predictor=llm_predictor,
prompt_helper=prompt_helper)
new_index.storage_context.persist(persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/")

# Update the checkpoint file
with open(checkpoint_file, "w") as f:
f.write(str(len(new_entries) + checkpoint))

return new_index


def chatbot(input_text):
# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir="/Media/Disk1/sandbox/ml/chatgpt/index_storage/")

# load index
read_index = load_index_from_storage(storage_context)
query_engine = read_index.as_query_engine()
response = query_engine.query(input_text)
return response.response


checkpoint_path = "checkpoint.txt"
index = construct_index("docs/test.txt", checkpoint_path)
iface = gr.Interface(fn=chatbot,
inputs=gr.components.Textbox(lines=7, label="Enter your text"),
outputs="text",
title="My AI Chatbot")

iface.launch(share=True)

最佳答案

关于引用文档 https://gpt-index.readthedocs.io/en/latest/reference/indices/vector_store.html我发现我们可以使用 insert 函数将新文档添加到现有索引中,无需获取所有现有文档并在其中附加新文档。

existing_index.insert(document)

关于openai-api - 如何使用 chatgpt dev api 增量构建索引,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/76232902/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com