stivenDR14 committed
Commit · 07c0a81
1 Parent(s): c5332dd
update method for summary and models

Browse files:
- app.py +4 -4
- pdf_processor.py +67 -11
- utils.py +2 -2
app.py
CHANGED
@@ -146,13 +146,13 @@ class PDFProcessorUI:
                     label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
                     lines=10
                 )
-                summary_output = gr.Textbox(
-                    label=TRANSLATIONS[self.current_language]["summary_label"],
-                    lines=10
-                )
                 summarize_btn = gr.Button(
                     TRANSLATIONS[self.current_language]["generate_summary"]
                 )
+                summary_output = gr.Markdown(
+                    label=TRANSLATIONS[self.current_language]["summary_label"],
+                    height=400
+                )
 
             with specialist_tab:
                 specialist_title = gr.Markdown(TRANSLATIONS[self.current_language]["specialist_title"])
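The swap above replaces a plain gr.Textbox with gr.Markdown, so the summary's markdown (headings, lists, emphasis) renders instead of appearing as raw markup. A minimal sketch of the new wiring, assuming a recent Gradio release where gr.Markdown accepts height; the button label and the demo callback are stand-ins, not the app's real summarization chain:

import gradio as gr

DEMO_SUMMARY = "## Summary\n- key point one\n- key point two"  # stand-in for the model's output

with gr.Blocks() as demo:
    summarize_btn = gr.Button("Generate summary")
    # gr.Markdown renders the markup; gr.Textbox(lines=10) would show it as raw text
    summary_output = gr.Markdown(height=400)
    summarize_btn.click(fn=lambda: DEMO_SUMMARY, outputs=summary_output)

demo.launch()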
pdf_processor.py
CHANGED
@@ -17,6 +17,8 @@ import requests
 import os
 from dotenv import load_dotenv
 import re
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
 
 OLLAMA_LLM = "granite3.1-dense"
 OLLAMA_EMBEDDINGS = "granite-embedding:278m"
@@ -228,21 +230,74 @@ class PDFProcessor:
 
         return result["result"] + "\n\nSources: " + page_labels_text
 
-    def 
-        print("Summarizer by k
+    def summarizer_by_k_means(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
+        print("Summarizer by k means in language: ", self.language)
         if not vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
 
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
-        # Get all documents from the vectorstore
-        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
-        documents = retriever.invoke('Summary of the document and key points')
 
-
-
+        # Get all the documents from the vectorstore
+        documents = vectorstore.get(include=["embeddings", "documents"])
+        documentsByIds = documents["ids"]
+        documentsByEmbeddings = documents["embeddings"]
+        documentsByDocuments = documents["documents"]
+
+        print("documents length: ", len(documentsByEmbeddings))
+
+        # Choose a divisor from the number of chunks (up to 16 chunks: 2, up to 64: 4,
+        # up to 128: 8, otherwise 12); the cluster count is the chunk count // divisor
+        number_for_CreateClusters = 2
+        if len(documentsByEmbeddings) <= 16:
+            number_for_CreateClusters = 2
+        elif len(documentsByEmbeddings) <= 64:
+            number_for_CreateClusters = 4
+        elif len(documentsByEmbeddings) <= 128:
+            number_for_CreateClusters = 8
+        else:
+            number_for_CreateClusters = 12
 
+        num_clusters = max(1, len(documentsByEmbeddings) // number_for_CreateClusters)
+
+        print("num_clusters: ", num_clusters)
+        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+        kmeans.fit(documentsByEmbeddings)
+
+        summary_documents = []
+        map_ids_documents = {}
+        # For each cluster, choose the document whose embedding has the highest cosine
+        # similarity to the centroid, and keep a map of the chosen document ids
+        for i in range(num_clusters):
+            # Get the indices of the documents in the cluster
+            cluster_indices = [j for j, label in enumerate(kmeans.labels_) if label == i]
+
+            if not cluster_indices:  # If there are no documents in this cluster, continue
+                continue
+
+            # Get the embeddings of the documents in this cluster
+            cluster_embeddings = [documentsByEmbeddings[j] for j in cluster_indices]
+
+            # Calculate the similarity with the centroid
+            centroid = kmeans.cluster_centers_[i]
+            similarities = [cosine_similarity([embedding], [centroid])[0][0] for embedding in cluster_embeddings]
+
+            # Find the most similar document to the centroid
+            most_similar_index = cluster_indices[similarities.index(max(similarities))]
+
+            # Add the most similar document to the summary list
+            summary_documents.append(documentsByDocuments[most_similar_index])
+            map_ids_documents[most_similar_index] = documentsByIds[most_similar_index]
+
+        print("map_ids_documents: ", map_ids_documents)
+
+        # Join the summary documents into a single string
+        summary_text = "\n".join(summary_documents)
+        print("summary_documents: ", summary_text)
+
+        if just_get_documments:
+            return summary_text
+
         summary_chain = summary_prompt | current_llm
-        final_summary = summary_chain.invoke({"texts": 
+        final_summary = summary_chain.invoke({"texts": summary_text, "language": self.language})
+
         return final_summary
 
     def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
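The heart of the new method is the cluster-and-pick step: fit KMeans on the chunk embeddings, then keep only the chunk closest (by cosine similarity) to each centroid, so the prompt stays small while still covering the document's distinct topics. A self-contained sketch of that step, with random toy embeddings standing in for the vectorstore (assumes numpy and scikit-learn are installed):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(42)
embeddings = rng.normal(size=(32, 8))          # toy chunk embeddings
documents = [f"chunk {i}" for i in range(32)]  # toy chunk texts

kmeans = KMeans(n_clusters=4, random_state=42).fit(embeddings)

representatives = []
for i in range(kmeans.n_clusters):
    idx = np.where(kmeans.labels_ == i)[0]     # members of cluster i
    if idx.size == 0:
        continue
    # Cosine similarity of each member to its centroid; keep the closest chunk
    sims = cosine_similarity(embeddings[idx], kmeans.cluster_centers_[i:i + 1]).ravel()
    representatives.append(documents[idx[sims.argmax()]])

print(representatives)  # one representative chunk per cluster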
@@ -250,18 +305,19 @@ class PDFProcessor:
         final_summary_prompt = PromptTemplate(
             input_variables=["texts", "language"],
             template="""
-            Combine the following texts into a cohesive and structured 
+            Combine the following texts into a cohesive and structured summary:
             ------------
             {texts}
             ------------
-            The final summary should be between 2 and 4 paragraphs.
             Preserve the original meaning without adding external information or interpretations.
             Ensure clarity, logical flow, and coherence between the combined points.
             The summary must be in {language}.
+            The output must be in markdown format.
+            Output:
             """
         )
 
-        return self.
+        return self.summarizer_by_k_means(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
 
 
     def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
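Note the summary_chain = summary_prompt | current_llm line above: that is LangChain's runnable composition, where the PromptTemplate fills in {texts} and {language} and the formatted prompt flows into the model. A minimal sketch with a RunnableLambda standing in for current_llm (the stand-in model is hypothetical; a real chat model would slot in unchanged):

from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda

summary_prompt = PromptTemplate(
    input_variables=["texts", "language"],
    template="Combine the following texts into a cohesive summary in {language}:\n{texts}",
)

# Stand-in LLM: echoes the first characters of the formatted prompt
fake_llm = RunnableLambda(lambda pv: "[summary of: " + pv.to_string()[:40] + "...]")

summary_chain = summary_prompt | fake_llm
print(summary_chain.invoke({"texts": "chunk one\nchunk two", "language": "English"}))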
utils.py
CHANGED
@@ -10,7 +10,7 @@ if ENVIRONMENT == "dev":
     AI_MODELS = {
         "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
         "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
-        "Huggingface / Google Gemma 
+        "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
         "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
         "IBM Granite3.1 dense / Ollama local": "ollama",
         "Open AI / GPT-4o-mini": "openai",
@@ -19,7 +19,7 @@ else:
     AI_MODELS = {
         "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
         "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
-        "Huggingface / Google Gemma 
+        "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
         "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
         "Open AI / GPT-4o-mini": "openai",
     }
|