stivenDR14 committed on
Commit
4779173
·
1 Parent(s): 4fafabd

Store chroma_db in Gradio state

Browse files
Files changed (2) hide show
  1. app.py +16 -15
  2. pdf_processor.py +21 -24
app.py CHANGED
@@ -57,17 +57,17 @@ class PDFProcessorUI:
57
  else:
58
  return gr.update(visible=False), gr.update(visible=False)
59
 
60
- def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
61
- return self.processor.process_pdf(pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx)
62
 
63
- def qa_interface(self, message, history, ai_model, type_model, api_key, project_id_watsonx):
64
- return self.processor.get_qa_response(message, history, ai_model, type_model, api_key, project_id_watsonx)
65
 
66
- def summarize_interface(self, ai_model, type_model, api_key, project_id_watsonx):
67
- return self.processor.get_summary(ai_model, type_model, api_key, project_id_watsonx)
68
 
69
- def specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
70
- return self.processor.get_specialist_opinion(ai_model, type_model, api_key, project_id_watsonx, specialist_prompt)
71
 
72
  def upload_file(files):
73
  file_paths = [file.name for file in files]
@@ -75,6 +75,7 @@ class PDFProcessorUI:
75
 
76
  def create_ui(self):
77
  with gr.Blocks() as demo:
 
78
  title = gr.Markdown(TRANSLATIONS[self.current_language]["title"])
79
 
80
  with gr.Row():
@@ -164,8 +165,8 @@ class PDFProcessorUI:
164
  label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
165
  lines=10
166
  )
167
- specialist_output = gr.Textbox(label=TRANSLATIONS[self.current_language]["specialist_output"], lines=20)
168
  specialist_btn = gr.Button(TRANSLATIONS[self.current_language]["specialist_btn"])
 
169
 
170
 
171
  language_dropdown.change(
@@ -210,31 +211,31 @@ class PDFProcessorUI:
210
 
211
  chat_placeholder.submit(
212
  fn=self.qa_interface,
213
- inputs=[chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
214
  outputs=[chatbot]
215
  )
216
 
217
  process_btn.click(
218
  fn=self.process_pdf,
219
- inputs=[pdf_file, chunk_size, chunk_overlap, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
220
- outputs=[process_output]
221
  )
222
 
223
  summarize_btn.click(
224
  fn=self.summarize_interface,
225
- inputs=[ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
226
  outputs=[summary_output]
227
  )
228
 
229
  specialist_btn.click(
230
  fn=self.specialist_opinion,
231
- inputs=[ai_model_dropdown, type_model, api_key_input, project_id_watsonx, specialist_placeholder],
232
  outputs=[specialist_output]
233
  )
234
 
235
  chat_btn.click(
236
  fn=self.qa_interface,
237
- inputs=[chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
238
  outputs=[chatbot]
239
  )
240
 
 
57
  else:
58
  return gr.update(visible=False), gr.update(visible=False)
59
 
60
+ def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
61
+ return self.processor.process_pdf(vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx)
62
 
63
+ def qa_interface(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx):
64
+ return self.processor.get_qa_response(vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx)
65
 
66
+ def summarize_interface(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx):
67
+ return self.processor.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx)
68
 
69
+ def specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
70
+ return self.processor.get_specialist_opinion(vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt)
71
 
72
  def upload_file(files):
73
  file_paths = [file.name for file in files]
 
75
 
76
  def create_ui(self):
77
  with gr.Blocks() as demo:
78
+ vectorstore = gr.State()
79
  title = gr.Markdown(TRANSLATIONS[self.current_language]["title"])
80
 
81
  with gr.Row():
 
165
  label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
166
  lines=10
167
  )
 
168
  specialist_btn = gr.Button(TRANSLATIONS[self.current_language]["specialist_btn"])
169
+ specialist_output = gr.Textbox(label=TRANSLATIONS[self.current_language]["specialist_output"], lines=20)
170
 
171
 
172
  language_dropdown.change(
 
211
 
212
  chat_placeholder.submit(
213
  fn=self.qa_interface,
214
+ inputs=[vectorstore, chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
215
  outputs=[chatbot]
216
  )
217
 
218
  process_btn.click(
219
  fn=self.process_pdf,
220
+ inputs=[vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
221
+ outputs=[process_output, vectorstore]
222
  )
223
 
224
  summarize_btn.click(
225
  fn=self.summarize_interface,
226
+ inputs=[vectorstore, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
227
  outputs=[summary_output]
228
  )
229
 
230
  specialist_btn.click(
231
  fn=self.specialist_opinion,
232
+ inputs=[vectorstore, ai_model_dropdown, type_model, api_key_input, project_id_watsonx, specialist_placeholder],
233
  outputs=[specialist_output]
234
  )
235
 
236
  chat_btn.click(
237
  fn=self.qa_interface,
238
+ inputs=[vectorstore, chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
239
  outputs=[chatbot]
240
  )
241
 
pdf_processor.py CHANGED
@@ -95,7 +95,6 @@ def authenticate_watsonx(api_key):
95
 
96
  class PDFProcessor:
97
  def __init__(self):
98
- self.vectorstore = None
99
  self.language = "English"
100
 
101
  def set_language(self, language):
@@ -145,7 +144,7 @@ class PDFProcessor:
145
  return current_llm, embeding_model
146
 
147
  @spaces.GPU
148
- def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
149
  defined_chunk_size = 1000
150
  defined_chunk_overlap = 150
151
  if (ai_model == "Open AI / GPT-4o-mini" and (api_key == "")) : #or (ai_model == "IBM Granite3.1 dense / Ollama local" and type_model == "Api Key" and (api_key == "" or project_id_watsonx == "")
@@ -178,13 +177,13 @@ class PDFProcessor:
178
  _, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
179
 
180
  #delete all documents from the vectorstore
181
- if self.vectorstore:
182
- self.vectorstore.delete_collection()
183
 
184
  chromadb.api.client.SharedSystemClient.clear_system_cache()
185
  new_client = chromadb.EphemeralClient()
186
 
187
- self.vectorstore = Chroma.from_documents(
188
  documents=texts,
189
  embedding=embeddings,
190
  client=new_client,
@@ -192,19 +191,19 @@ class PDFProcessor:
192
  #persist_directory="./chroma_db"
193
  )
194
 
195
- return TRANSLATIONS[self.language]["pdf_processed"] #+ f" ---- Chunks: {len(self.vectorstore.get()["documents"])}"
196
 
197
  else:
198
- return TRANSLATIONS[self.language]["load_pdf_first"]
199
 
200
  @spaces.GPU
201
- def get_qa_response(self, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
202
  current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
203
 
204
- if not self.vectorstore:
205
  return TRANSLATIONS[self.language]["load_pdf_first"]
206
 
207
- retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
208
 
209
  qa_chain = RetrievalQA.from_chain_type(
210
  llm=current_llm,
@@ -222,13 +221,14 @@ class PDFProcessor:
222
  return result["result"] + "\n\nSources: " + page_labels_text
223
 
224
  @spaces.GPU
225
- def summarizer_by_k_top_n(self, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
226
- if not self.vectorstore:
 
227
  return TRANSLATIONS[self.language]["load_pdf_first"]
228
 
229
  current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
230
  # Get all documents from the vectorstore
231
- retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
232
  documents = retriever.invoke('Summary of the document and key points')
233
 
234
  if just_get_documments:
@@ -239,7 +239,7 @@ class PDFProcessor:
239
  return final_summary
240
 
241
  # Get the top k documents by score
242
- def get_summary(self, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
243
 
244
  final_summary_prompt = PromptTemplate(
245
  input_variables=["texts", "language"],
@@ -255,11 +255,11 @@ class PDFProcessor:
255
  """
256
  )
257
 
258
- return self.summarizer_by_k_top_n(ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
259
 
260
 
261
  @spaces.GPU
262
- def get_specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
263
  questions_prompt = PromptTemplate(
264
  input_variables=["text", "specialist_prompt", "language"],
265
  template="""
@@ -303,22 +303,19 @@ class PDFProcessor:
303
  Answer:
304
  """
305
  )
306
- if not self.vectorstore:
307
  return TRANSLATIONS[self.language]["load_pdf_first"]
308
 
 
 
309
  current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
310
 
311
- summary_text = self.get_summary(ai_model, type_model, api_key, project_id_watsonx, True, 10)
312
  questions_chain = questions_prompt | current_llm
313
  questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})
314
 
315
  print(questions)
316
 
317
- #clean the questions variable, delete all the text before the json and after the json
318
- questions = questions.split("{")[1]
319
- questions = questions.split("}")[0]
320
- questions = questions.strip()
321
- print(questions)
322
  questions = json.loads(questions)
323
 
324
  print(questions)
@@ -328,7 +325,7 @@ class PDFProcessor:
328
  else:
329
  questions["aspects"] = questions["aspects"]
330
 
331
- aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])
332
 
333
  return aspects_text
334
 
 
95
 
96
  class PDFProcessor:
97
  def __init__(self):
 
98
  self.language = "English"
99
 
100
  def set_language(self, language):
 
144
  return current_llm, embeding_model
145
 
146
  @spaces.GPU
147
+ def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
148
  defined_chunk_size = 1000
149
  defined_chunk_overlap = 150
150
  if (ai_model == "Open AI / GPT-4o-mini" and (api_key == "")) : #or (ai_model == "IBM Granite3.1 dense / Ollama local" and type_model == "Api Key" and (api_key == "" or project_id_watsonx == "")
 
177
  _, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
178
 
179
  #delete all documents from the vectorstore
180
+ if vectorstore:
181
+ vectorstore.delete_collection()
182
 
183
  chromadb.api.client.SharedSystemClient.clear_system_cache()
184
  new_client = chromadb.EphemeralClient()
185
 
186
+ vectorstore = Chroma.from_documents(
187
  documents=texts,
188
  embedding=embeddings,
189
  client=new_client,
 
191
  #persist_directory="./chroma_db"
192
  )
193
 
194
+ return TRANSLATIONS[self.language]["pdf_processed"], vectorstore #+ f" ---- Chunks: {len(vectorstore.get()["documents"])}"
195
 
196
  else:
197
+ return TRANSLATIONS[self.language]["load_pdf_first"], None
198
 
199
  @spaces.GPU
200
+ def get_qa_response(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
201
  current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
202
 
203
+ if not vectorstore:
204
  return TRANSLATIONS[self.language]["load_pdf_first"]
205
 
206
+ retriever = vectorstore.as_retriever(search_kwargs={"k": k})
207
 
208
  qa_chain = RetrievalQA.from_chain_type(
209
  llm=current_llm,
 
221
  return result["result"] + "\n\nSources: " + page_labels_text
222
 
223
  @spaces.GPU
224
+ def summarizer_by_k_top_n(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
225
+ print("Summarizer by k top n in language: ", self.language)
226
+ if not vectorstore:
227
  return TRANSLATIONS[self.language]["load_pdf_first"]
228
 
229
  current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
230
  # Get all documents from the vectorstore
231
+ retriever = vectorstore.as_retriever(search_kwargs={"k": k})
232
  documents = retriever.invoke('Summary of the document and key points')
233
 
234
  if just_get_documments:
 
239
  return final_summary
240
 
241
  # Get the top k documents by score
242
+ def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
243
 
244
  final_summary_prompt = PromptTemplate(
245
  input_variables=["texts", "language"],
 
255
  """
256
  )
257
 
258
+ return self.summarizer_by_k_top_n(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
259
 
260
 
261
  @spaces.GPU
262
+ def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
263
  questions_prompt = PromptTemplate(
264
  input_variables=["text", "specialist_prompt", "language"],
265
  template="""
 
303
  Answer:
304
  """
305
  )
306
+ if not vectorstore:
307
  return TRANSLATIONS[self.language]["load_pdf_first"]
308
 
309
+ print(ai_model)
310
+ print(type_model)
311
  current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
312
 
313
+ summary_text = self.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx, True, 10)
314
  questions_chain = questions_prompt | current_llm
315
  questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})
316
 
317
  print(questions)
318
 
 
 
 
 
 
319
  questions = json.loads(questions)
320
 
321
  print(questions)
 
325
  else:
326
  questions["aspects"] = questions["aspects"]
327
 
328
+ aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(vectorstore, aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])
329
 
330
  return aspects_text
331