stivenDR14 committed
Commit 07c0a81 · 1 Parent(s): c5332dd

update method for summary and models

Files changed (3):
  1. app.py +4 -4
  2. pdf_processor.py +67 -11
  3. utils.py +2 -2
app.py CHANGED
@@ -146,13 +146,13 @@ class PDFProcessorUI:
                     label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
                     lines=10
                 )
-                summary_output = gr.Textbox(
-                    label=TRANSLATIONS[self.current_language]["summary_label"],
-                    lines=10
-                )
                 summarize_btn = gr.Button(
                     TRANSLATIONS[self.current_language]["generate_summary"]
                 )
+                summary_output = gr.Markdown(
+                    label=TRANSLATIONS[self.current_language]["summary_label"],
+                    height=400
+                )
 
             with specialist_tab:
                 specialist_title = gr.Markdown(TRANSLATIONS[self.current_language]["specialist_title"])
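The swap from `gr.Textbox` to `gr.Markdown` means the summary is rendered as formatted markdown rather than shown as raw markup, matching the "output must be in markdown format" instruction added to the prompt below. A minimal sketch of the difference, assuming a recent Gradio release where `gr.Markdown` accepts `height` (the sample string is illustrative):

```python
import gradio as gr

SAMPLE = "## Key points\n\n- **Topic**: document summary\n- *Language*: English"

with gr.Blocks() as demo:
    # A Textbox displays the literal markup characters ("##", "**", ...)
    gr.Textbox(value=SAMPLE, label="Raw model output", lines=10)
    # A Markdown component renders them; height keeps long summaries scrollable
    gr.Markdown(value=SAMPLE, label="Rendered summary", height=400)

demo.launch()
```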
pdf_processor.py CHANGED
@@ -17,6 +17,8 @@ import requests
 import os
 from dotenv import load_dotenv
 import re
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
 
 OLLAMA_LLM = "granite3.1-dense"
 OLLAMA_EMBEDDINGS = "granite-embedding:278m"
@@ -228,21 +230,74 @@ class PDFProcessor:
 
         return result["result"] + "\n\nSources: " + page_labels_text
 
-    def summarizer_by_k_top_n(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
-        print("Summarizer by k top n in language: ", self.language)
+    def summarizer_by_k_means(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
+        print("Summarizer by k means in language: ", self.language)
         if not vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
 
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
-        # Get all documents from the vectorstore
-        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
-        documents = retriever.invoke('Summary of the document and key points')
 
-        if just_get_documments:
-            return "\n".join([doc.page_content for doc in documents])
+        # Get all the documents from the vectorstore
+        documents = vectorstore.get(include=["embeddings", "documents"])
+        documentsByIds = documents["ids"]
+        documentsByEmbeddings = documents["embeddings"]
+        documentsByDocuments = documents["documents"]
+
+        print("documents length: ", len(documentsByEmbeddings))
+
+        # Pick a divisor from the chunk count (<= 16 -> 2, <= 64 -> 4, <= 128 -> 8, else 12); the cluster count is len(chunks) // divisor
+        number_for_CreateClusters = 2
+        if len(documentsByEmbeddings) <= 16:
+            number_for_CreateClusters = 2
+        elif len(documentsByEmbeddings) <= 64:
+            number_for_CreateClusters = 4
+        elif len(documentsByEmbeddings) <= 128:
+            number_for_CreateClusters = 8
+        else:
+            number_for_CreateClusters = 12
 
+        num_clusters = max(1, len(documentsByEmbeddings) // number_for_CreateClusters)
+
+        print("num_clusters: ", num_clusters)
+        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+        kmeans.fit(documentsByEmbeddings)
+
+        summary_documents = []
+        map_ids_documents = {}
+        # For each cluster, choose the document whose embedding is most similar to the centroid (cosine similarity), and keep a map of the chosen ids
+        for i in range(num_clusters):
+            # Get the indices of the documents in the cluster
+            cluster_indices = [j for j, label in enumerate(kmeans.labels_) if label == i]
+
+            if not cluster_indices:  # If there are no documents in this cluster, continue
+                continue
+
+            # Get the embeddings of the documents in this cluster
+            cluster_embeddings = [documentsByEmbeddings[j] for j in cluster_indices]
+
+            # Calculate the similarity with the centroid
+            centroid = kmeans.cluster_centers_[i]
+            similarities = [cosine_similarity([embedding], [centroid])[0][0] for embedding in cluster_embeddings]
+
+            # Find the most similar document to the centroid
+            most_similar_index = cluster_indices[similarities.index(max(similarities))]
+
+            # Add the most similar document to the summary list
+            summary_documents.append(documentsByDocuments[most_similar_index])
+            map_ids_documents[most_similar_index] = documentsByIds[most_similar_index]
+
+        print("map_ids_documents: ", map_ids_documents)
+
+        # Join the summary documents into a single string
+        summary_text = "\n".join(summary_documents)
+        print("summary_documents: ", summary_text)
+
+        if just_get_documments:
+            return summary_text
+
         summary_chain = summary_prompt | current_llm
-        final_summary = summary_chain.invoke({"texts": "\n".join([doc.page_content for doc in documents]), "language": self.language})
+        final_summary = summary_chain.invoke({"texts": summary_text, "language": self.language})
+
         return final_summary
 
     def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
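The new `summarizer_by_k_means` replaces top-k retrieval with cluster-based selection: chunks are clustered by embedding, and the chunk nearest each centroid stands in for its cluster, so the summary input samples the whole document rather than whatever happens to match one query. A self-contained sketch of that selection step, using random vectors in place of Chroma's `vectorstore.get()` output (the names here are illustrative, not the app's own API):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

def pick_representatives(embeddings: np.ndarray, texts: list[str], divisor: int = 4) -> list[str]:
    """One chunk per cluster: the chunk whose embedding is most similar to the centroid."""
    num_clusters = max(1, len(embeddings) // divisor)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)
    picked = []
    for i in range(num_clusters):
        idx = np.where(kmeans.labels_ == i)[0]   # indices of chunks in cluster i
        if idx.size == 0:
            continue
        # cosine similarity of each member against the cluster centroid
        sims = cosine_similarity(embeddings[idx], kmeans.cluster_centers_[i][None, :]).ravel()
        picked.append(texts[idx[np.argmax(sims)]])
    return picked

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(40, 8))            # 40 fake chunk embeddings
texts = [f"chunk {i}" for i in range(40)]
print(pick_representatives(embeddings, texts))   # ~10 representative chunks
```

Selecting one representative per cluster keeps the prompt size bounded while still covering every region of the document's embedding space, which top-k retrieval against a single fixed query cannot guarantee.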
@@ -250,18 +305,19 @@
         final_summary_prompt = PromptTemplate(
             input_variables=["texts", "language"],
             template="""
-            Combine the following texts into a cohesive and structured final summary:
+            Combine the following texts into a cohesive and structured summary:
             ------------
            {texts}
             ------------
-            The final summary should be between 2 and 4 paragraphs.
             Preserve the original meaning without adding external information or interpretations.
             Ensure clarity, logical flow, and coherence between the combined points.
             The summary must be in {language}.
+            The output must be in markdown format.
+            Output:
             """
         )
 
-        return self.summarizer_by_k_top_n(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
+        return self.summarizer_by_k_means(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
 
 
     def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
 
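For reference, `summary_prompt | current_llm` is LangChain's LCEL pipe: the formatted `PromptTemplate` output feeds straight into the model, and one `.invoke()` call runs both stages with a dict matching `input_variables`. A hedged sketch with a stand-in model so it runs offline (`fake_llm` is illustrative; the real code uses the LLM returned by `set_llm`):

```python
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda

summary_prompt = PromptTemplate(
    input_variables=["texts", "language"],
    template="Summarize the texts below in {language}, in markdown:\n{texts}",
)

# Stand-in for current_llm: receives the formatted prompt value, returns a string
fake_llm = RunnableLambda(lambda p: "## Summary\n(model output for: " + p.text[:40] + "...)")

summary_chain = summary_prompt | fake_llm
print(summary_chain.invoke({"texts": "chunk A\nchunk B", "language": "English"}))
```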
utils.py CHANGED
@@ -10,7 +10,7 @@ if ENVIRONMENT == "dev":
     AI_MODELS = {
         "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
         "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
-        "Huggingface / Google Gemma 2 9B Instruct": "google/gemma-2-9b-it",
+        "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
         "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
         "IBM Granite3.1 dense / Ollama local": "ollama",
         "Open AI / GPT-4o-mini": "openai",
@@ -19,7 +19,7 @@ else:
     AI_MODELS = {
         "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
         "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
-        "Huggingface / Google Gemma 2 9B Instruct": "google/gemma-2-9b-it",
+        "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
         "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
         "Open AI / GPT-4o-mini": "openai",
     }
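The two catalogs differ only in the local Ollama entry, which is offered only in the `dev` branch of the `ENVIRONMENT` check; the Gemma 3 12B swap applies to both. A small sketch of that pattern (the `COMMON_MODELS` split is illustrative; utils.py writes both dicts out in full):

```python
import os

# Models offered in every environment; Gemma 3 12B replaces Gemma 2 9B here
COMMON_MODELS = {
    "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
    "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
    "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
    "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
    "Open AI / GPT-4o-mini": "openai",
}

ENVIRONMENT = os.getenv("ENVIRONMENT", "prod")
AI_MODELS = dict(COMMON_MODELS)
if ENVIRONMENT == "dev":
    # The local Ollama backend only makes sense on a machine running Ollama
    AI_MODELS["IBM Granite3.1 dense / Ollama local"] = "ollama"
```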