stivenDR14
committed on
Commit
·
4779173
1
Parent(s):
4fafabd
Keep the chroma_db vectorstore in Gradio state (gr.State) instead of on the PDFProcessor instance
Browse files
- app.py +16 -15
- pdf_processor.py +21 -24
app.py
CHANGED
@@ -57,17 +57,17 @@ class PDFProcessorUI:
|
|
57 |
else:
|
58 |
return gr.update(visible=False), gr.update(visible=False)
|
59 |
|
60 |
-
def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
|
61 |
-
return self.processor.process_pdf(pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx)
|
62 |
|
63 |
-
def qa_interface(self, message, history, ai_model, type_model, api_key, project_id_watsonx):
|
64 |
-
return self.processor.get_qa_response(message, history, ai_model, type_model, api_key, project_id_watsonx)
|
65 |
|
66 |
-
def summarize_interface(self, ai_model, type_model, api_key, project_id_watsonx):
|
67 |
-
return self.processor.get_summary(ai_model, type_model, api_key, project_id_watsonx)
|
68 |
|
69 |
-
def specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
|
70 |
-
return self.processor.get_specialist_opinion(ai_model, type_model, api_key, project_id_watsonx, specialist_prompt)
|
71 |
|
72 |
def upload_file(files):
|
73 |
file_paths = [file.name for file in files]
|
@@ -75,6 +75,7 @@ class PDFProcessorUI:
|
|
75 |
|
76 |
def create_ui(self):
|
77 |
with gr.Blocks() as demo:
|
|
|
78 |
title = gr.Markdown(TRANSLATIONS[self.current_language]["title"])
|
79 |
|
80 |
with gr.Row():
|
@@ -164,8 +165,8 @@ class PDFProcessorUI:
|
|
164 |
label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
|
165 |
lines=10
|
166 |
)
|
167 |
-
specialist_output = gr.Textbox(label=TRANSLATIONS[self.current_language]["specialist_output"], lines=20)
|
168 |
specialist_btn = gr.Button(TRANSLATIONS[self.current_language]["specialist_btn"])
|
|
|
169 |
|
170 |
|
171 |
language_dropdown.change(
|
@@ -210,31 +211,31 @@ class PDFProcessorUI:
|
|
210 |
|
211 |
chat_placeholder.submit(
|
212 |
fn=self.qa_interface,
|
213 |
-
inputs=[chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
214 |
outputs=[chatbot]
|
215 |
)
|
216 |
|
217 |
process_btn.click(
|
218 |
fn=self.process_pdf,
|
219 |
-
inputs=[pdf_file, chunk_size, chunk_overlap, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
220 |
-
outputs=[process_output]
|
221 |
)
|
222 |
|
223 |
summarize_btn.click(
|
224 |
fn=self.summarize_interface,
|
225 |
-
inputs=[ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
226 |
outputs=[summary_output]
|
227 |
)
|
228 |
|
229 |
specialist_btn.click(
|
230 |
fn=self.specialist_opinion,
|
231 |
-
inputs=[ai_model_dropdown, type_model, api_key_input, project_id_watsonx, specialist_placeholder],
|
232 |
outputs=[specialist_output]
|
233 |
)
|
234 |
|
235 |
chat_btn.click(
|
236 |
fn=self.qa_interface,
|
237 |
-
inputs=[chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
238 |
outputs=[chatbot]
|
239 |
)
|
240 |
|
|
|
57 |
else:
|
58 |
return gr.update(visible=False), gr.update(visible=False)
|
59 |
|
60 |
+
def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
|
61 |
+
return self.processor.process_pdf(vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx)
|
62 |
|
63 |
+
def qa_interface(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx):
|
64 |
+
return self.processor.get_qa_response(vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx)
|
65 |
|
66 |
+
def summarize_interface(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx):
|
67 |
+
return self.processor.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx)
|
68 |
|
69 |
+
def specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
|
70 |
+
return self.processor.get_specialist_opinion(vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt)
|
71 |
|
72 |
def upload_file(files):
|
73 |
file_paths = [file.name for file in files]
|
|
|
75 |
|
76 |
def create_ui(self):
|
77 |
with gr.Blocks() as demo:
|
78 |
+
vectorstore = gr.State()
|
79 |
title = gr.Markdown(TRANSLATIONS[self.current_language]["title"])
|
80 |
|
81 |
with gr.Row():
|
|
|
165 |
label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
|
166 |
lines=10
|
167 |
)
|
|
|
168 |
specialist_btn = gr.Button(TRANSLATIONS[self.current_language]["specialist_btn"])
|
169 |
+
specialist_output = gr.Textbox(label=TRANSLATIONS[self.current_language]["specialist_output"], lines=20)
|
170 |
|
171 |
|
172 |
language_dropdown.change(
|
|
|
211 |
|
212 |
chat_placeholder.submit(
|
213 |
fn=self.qa_interface,
|
214 |
+
inputs=[vectorstore, chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
215 |
outputs=[chatbot]
|
216 |
)
|
217 |
|
218 |
process_btn.click(
|
219 |
fn=self.process_pdf,
|
220 |
+
inputs=[vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
221 |
+
outputs=[process_output, vectorstore]
|
222 |
)
|
223 |
|
224 |
summarize_btn.click(
|
225 |
fn=self.summarize_interface,
|
226 |
+
inputs=[vectorstore, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
227 |
outputs=[summary_output]
|
228 |
)
|
229 |
|
230 |
specialist_btn.click(
|
231 |
fn=self.specialist_opinion,
|
232 |
+
inputs=[vectorstore, ai_model_dropdown, type_model, api_key_input, project_id_watsonx, specialist_placeholder],
|
233 |
outputs=[specialist_output]
|
234 |
)
|
235 |
|
236 |
chat_btn.click(
|
237 |
fn=self.qa_interface,
|
238 |
+
inputs=[vectorstore, chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
239 |
outputs=[chatbot]
|
240 |
)
|
241 |
|
pdf_processor.py
CHANGED
@@ -95,7 +95,6 @@ def authenticate_watsonx(api_key):
|
|
95 |
|
96 |
class PDFProcessor:
|
97 |
def __init__(self):
|
98 |
-
self.vectorstore = None
|
99 |
self.language = "English"
|
100 |
|
101 |
def set_language(self, language):
|
@@ -145,7 +144,7 @@ class PDFProcessor:
|
|
145 |
return current_llm, embeding_model
|
146 |
|
147 |
@spaces.GPU
|
148 |
-
def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
|
149 |
defined_chunk_size = 1000
|
150 |
defined_chunk_overlap = 150
|
151 |
if (ai_model == "Open AI / GPT-4o-mini" and (api_key == "")) : #or (ai_model == "IBM Granite3.1 dense / Ollama local" and type_model == "Api Key" and (api_key == "" or project_id_watsonx == "")
|
@@ -178,13 +177,13 @@ class PDFProcessor:
|
|
178 |
_, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
179 |
|
180 |
#delete all documents from the vectorstore
|
181 |
-
if
|
182 |
-
|
183 |
|
184 |
chromadb.api.client.SharedSystemClient.clear_system_cache()
|
185 |
new_client = chromadb.EphemeralClient()
|
186 |
|
187 |
-
|
188 |
documents=texts,
|
189 |
embedding=embeddings,
|
190 |
client=new_client,
|
@@ -192,19 +191,19 @@ class PDFProcessor:
|
|
192 |
#persist_directory="./chroma_db"
|
193 |
)
|
194 |
|
195 |
-
return TRANSLATIONS[self.language]["pdf_processed"] #+ f" ---- Chunks: {len(
|
196 |
|
197 |
else:
|
198 |
-
return TRANSLATIONS[self.language]["load_pdf_first"]
|
199 |
|
200 |
@spaces.GPU
|
201 |
-
def get_qa_response(self, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
|
202 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
203 |
|
204 |
-
if not
|
205 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
206 |
|
207 |
-
retriever =
|
208 |
|
209 |
qa_chain = RetrievalQA.from_chain_type(
|
210 |
llm=current_llm,
|
@@ -222,13 +221,14 @@ class PDFProcessor:
|
|
222 |
return result["result"] + "\n\nSources: " + page_labels_text
|
223 |
|
224 |
@spaces.GPU
|
225 |
-
def summarizer_by_k_top_n(self, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
|
226 |
-
|
|
|
227 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
228 |
|
229 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
230 |
# Get all documents from the vectorstore
|
231 |
-
retriever =
|
232 |
documents = retriever.invoke('Summary of the document and key points')
|
233 |
|
234 |
if just_get_documments:
|
@@ -239,7 +239,7 @@ class PDFProcessor:
|
|
239 |
return final_summary
|
240 |
|
241 |
# Get the top k documents by score
|
242 |
-
def get_summary(self, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
|
243 |
|
244 |
final_summary_prompt = PromptTemplate(
|
245 |
input_variables=["texts", "language"],
|
@@ -255,11 +255,11 @@ class PDFProcessor:
|
|
255 |
"""
|
256 |
)
|
257 |
|
258 |
-
return self.summarizer_by_k_top_n(ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
|
259 |
|
260 |
|
261 |
@spaces.GPU
|
262 |
-
def get_specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
|
263 |
questions_prompt = PromptTemplate(
|
264 |
input_variables=["text", "specialist_prompt", "language"],
|
265 |
template="""
|
@@ -303,22 +303,19 @@ class PDFProcessor:
|
|
303 |
Answer:
|
304 |
"""
|
305 |
)
|
306 |
-
if not
|
307 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
308 |
|
|
|
|
|
309 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
310 |
|
311 |
-
summary_text = self.get_summary(ai_model, type_model, api_key, project_id_watsonx, True, 10)
|
312 |
questions_chain = questions_prompt | current_llm
|
313 |
questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})
|
314 |
|
315 |
print(questions)
|
316 |
|
317 |
-
#clean the questions variable, delete all the text before the json and after the json
|
318 |
-
questions = questions.split("{")[1]
|
319 |
-
questions = questions.split("}")[0]
|
320 |
-
questions = questions.strip()
|
321 |
-
print(questions)
|
322 |
questions = json.loads(questions)
|
323 |
|
324 |
print(questions)
|
@@ -328,7 +325,7 @@ class PDFProcessor:
|
|
328 |
else:
|
329 |
questions["aspects"] = questions["aspects"]
|
330 |
|
331 |
-
aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])
|
332 |
|
333 |
return aspects_text
|
334 |
|
|
|
95 |
|
96 |
class PDFProcessor:
|
97 |
def __init__(self):
|
|
|
98 |
self.language = "English"
|
99 |
|
100 |
def set_language(self, language):
|
|
|
144 |
return current_llm, embeding_model
|
145 |
|
146 |
@spaces.GPU
|
147 |
+
def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
|
148 |
defined_chunk_size = 1000
|
149 |
defined_chunk_overlap = 150
|
150 |
if (ai_model == "Open AI / GPT-4o-mini" and (api_key == "")) : #or (ai_model == "IBM Granite3.1 dense / Ollama local" and type_model == "Api Key" and (api_key == "" or project_id_watsonx == "")
|
|
|
177 |
_, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
178 |
|
179 |
#delete all documents from the vectorstore
|
180 |
+
if vectorstore:
|
181 |
+
vectorstore.delete_collection()
|
182 |
|
183 |
chromadb.api.client.SharedSystemClient.clear_system_cache()
|
184 |
new_client = chromadb.EphemeralClient()
|
185 |
|
186 |
+
vectorstore = Chroma.from_documents(
|
187 |
documents=texts,
|
188 |
embedding=embeddings,
|
189 |
client=new_client,
|
|
|
191 |
#persist_directory="./chroma_db"
|
192 |
)
|
193 |
|
194 |
+
return TRANSLATIONS[self.language]["pdf_processed"], vectorstore #+ f" ---- Chunks: {len(vectorstore.get()["documents"])}"
|
195 |
|
196 |
else:
|
197 |
+
return TRANSLATIONS[self.language]["load_pdf_first"], None
|
198 |
|
199 |
@spaces.GPU
|
200 |
+
def get_qa_response(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
|
201 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
202 |
|
203 |
+
if not vectorstore:
|
204 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
205 |
|
206 |
+
retriever = vectorstore.as_retriever(search_kwargs={"k": k})
|
207 |
|
208 |
qa_chain = RetrievalQA.from_chain_type(
|
209 |
llm=current_llm,
|
|
|
221 |
return result["result"] + "\n\nSources: " + page_labels_text
|
222 |
|
223 |
@spaces.GPU
|
224 |
+
def summarizer_by_k_top_n(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
|
225 |
+
print("Summarizer by k top n in language: ", self.language)
|
226 |
+
if not vectorstore:
|
227 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
228 |
|
229 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
230 |
# Get all documents from the vectorstore
|
231 |
+
retriever = vectorstore.as_retriever(search_kwargs={"k": k})
|
232 |
documents = retriever.invoke('Summary of the document and key points')
|
233 |
|
234 |
if just_get_documments:
|
|
|
239 |
return final_summary
|
240 |
|
241 |
# Get the top k documents by score
|
242 |
+
def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
|
243 |
|
244 |
final_summary_prompt = PromptTemplate(
|
245 |
input_variables=["texts", "language"],
|
|
|
255 |
"""
|
256 |
)
|
257 |
|
258 |
+
return self.summarizer_by_k_top_n(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
|
259 |
|
260 |
|
261 |
@spaces.GPU
|
262 |
+
def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
|
263 |
questions_prompt = PromptTemplate(
|
264 |
input_variables=["text", "specialist_prompt", "language"],
|
265 |
template="""
|
|
|
303 |
Answer:
|
304 |
"""
|
305 |
)
|
306 |
+
if not vectorstore:
|
307 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
308 |
|
309 |
+
print(ai_model)
|
310 |
+
print(type_model)
|
311 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
312 |
|
313 |
+
summary_text = self.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx, True, 10)
|
314 |
questions_chain = questions_prompt | current_llm
|
315 |
questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})
|
316 |
|
317 |
print(questions)
|
318 |
|
|
|
|
|
|
|
|
|
|
|
319 |
questions = json.loads(questions)
|
320 |
|
321 |
print(questions)
|
|
|
325 |
else:
|
326 |
questions["aspects"] = questions["aspects"]
|
327 |
|
328 |
+
aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(vectorstore, aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])
|
329 |
|
330 |
return aspects_text
|
331 |
|