stivenDR14 committed
Commit 07c0a81 · 1 Parent(s): c5332dd

update method for summary and models

Files changed (3):
  1. app.py +4 -4
  2. pdf_processor.py +67 -11
  3. utils.py +2 -2
app.py CHANGED
@@ -146,13 +146,13 @@ class PDFProcessorUI:
                     label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
                     lines=10
                 )
-                summary_output = gr.Textbox(
-                    label=TRANSLATIONS[self.current_language]["summary_label"],
-                    lines=10
-                )
                 summarize_btn = gr.Button(
                     TRANSLATIONS[self.current_language]["generate_summary"]
                 )
+                summary_output = gr.Markdown(
+                    label=TRANSLATIONS[self.current_language]["summary_label"],
+                    height=400
+                )
 
             with specialist_tab:
                 specialist_title = gr.Markdown(TRANSLATIONS[self.current_language]["specialist_title"])
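The swap from `gr.Textbox` to `gr.Markdown` means the summary is rendered as formatted markdown rather than shown as raw markup, matching the "output must be in markdown format" instruction added to the prompt below. A minimal sketch of the difference, assuming a recent Gradio release where `gr.Markdown` accepts `height` (the sample string is illustrative):

```python
import gradio as gr

SAMPLE = "## Key points\n\n- **Topic**: document summary\n- *Language*: English"

with gr.Blocks() as demo:
    # A Textbox displays the literal markup characters ("##", "**", ...)
    gr.Textbox(value=SAMPLE, label="Raw model output", lines=10)
    # A Markdown component renders them; height keeps long summaries scrollable
    gr.Markdown(value=SAMPLE, label="Rendered summary", height=400)

demo.launch()
```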
pdf_processor.py CHANGED
@@ -17,6 +17,8 @@ import requests
 import os
 from dotenv import load_dotenv
 import re
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import cosine_similarity
 
 OLLAMA_LLM = "granite3.1-dense"
 OLLAMA_EMBEDDINGS = "granite-embedding:278m"
@@ -228,21 +230,74 @@ class PDFProcessor:
 
         return result["result"] + "\n\nSources: " + page_labels_text
 
-    def summarizer_by_k_top_n(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
-        print("Summarizer by k top n in language: ", self.language)
+    def summarizer_by_k_means(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
+        print("Summarizer by k means in language: ", self.language)
         if not vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
 
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
-        # Get all documents from the vectorstore
-        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
-        documents = retriever.invoke('Summary of the document and key points')
 
-        if just_get_documments:
-            return "\n".join([doc.page_content for doc in documents])
+        # Get all the documents from the vectorstore
+        documents = vectorstore.get(include=["embeddings", "documents"])
+        documentsByIds = documents["ids"]
+        documentsByEmbeddings = documents["embeddings"]
+        documentsByDocuments = documents["documents"]
+
+        print("documents length: ", len(documentsByEmbeddings))
+
+        # Pick a divisor from the chunk count (<= 16 -> 2, <= 64 -> 4, <= 128 -> 8, else 12); the cluster count is len(chunks) // divisor
+        number_for_CreateClusters = 2
+        if len(documentsByEmbeddings) <= 16:
+            number_for_CreateClusters = 2
+        elif len(documentsByEmbeddings) <= 64:
+            number_for_CreateClusters = 4
+        elif len(documentsByEmbeddings) <= 128:
+            number_for_CreateClusters = 8
+        else:
+            number_for_CreateClusters = 12
 
+        num_clusters = max(1, len(documentsByEmbeddings) // number_for_CreateClusters)
+
+        print("num_clusters: ", num_clusters)
+        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+        kmeans.fit(documentsByEmbeddings)
+
+        summary_documents = []
+        map_ids_documents = {}
+        # For each cluster, choose the document whose embedding is most similar to the centroid (cosine similarity), and keep a map of the chosen ids
+        for i in range(num_clusters):
+            # Get the indices of the documents in the cluster
+            cluster_indices = [j for j, label in enumerate(kmeans.labels_) if label == i]
+
+            if not cluster_indices:  # If there are no documents in this cluster, continue
+                continue
+
+            # Get the embeddings of the documents in this cluster
+            cluster_embeddings = [documentsByEmbeddings[j] for j in cluster_indices]
+
+            # Calculate the similarity with the centroid
+            centroid = kmeans.cluster_centers_[i]
+            similarities = [cosine_similarity([embedding], [centroid])[0][0] for embedding in cluster_embeddings]
+
+            # Find the most similar document to the centroid
+            most_similar_index = cluster_indices[similarities.index(max(similarities))]
+
+            # Add the most similar document to the summary list
+            summary_documents.append(documentsByDocuments[most_similar_index])
+            map_ids_documents[most_similar_index] = documentsByIds[most_similar_index]
+
+        print("map_ids_documents: ", map_ids_documents)
+
+        # Join the summary documents into a single string
+        summary_text = "\n".join(summary_documents)
+        print("summary_documents: ", summary_text)
+
+        if just_get_documments:
+            return summary_text
+
         summary_chain = summary_prompt | current_llm
-        final_summary = summary_chain.invoke({"texts": "\n".join([doc.page_content for doc in documents]), "language": self.language})
+        final_summary = summary_chain.invoke({"texts": summary_text, "language": self.language})
+
         return final_summary
 
     def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
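The new `summarizer_by_k_means` replaces top-k retrieval with cluster-based selection: chunks are clustered by embedding, and the chunk nearest each centroid stands in for its cluster, so the summary input samples the whole document rather than whatever happens to match one query. A self-contained sketch of that selection step, using random vectors in place of Chroma's `vectorstore.get()` output (the names here are illustrative, not the app's own API):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

def pick_representatives(embeddings: np.ndarray, texts: list[str], divisor: int = 4) -> list[str]:
    """One chunk per cluster: the chunk whose embedding is most similar to the centroid."""
    num_clusters = max(1, len(embeddings) // divisor)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)
    picked = []
    for i in range(num_clusters):
        idx = np.where(kmeans.labels_ == i)[0]   # indices of chunks in cluster i
        if idx.size == 0:
            continue
        # cosine similarity of each member against the cluster centroid
        sims = cosine_similarity(embeddings[idx], kmeans.cluster_centers_[i][None, :]).ravel()
        picked.append(texts[idx[np.argmax(sims)]])
    return picked

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(40, 8))            # 40 fake chunk embeddings
texts = [f"chunk {i}" for i in range(40)]
print(pick_representatives(embeddings, texts))   # ~10 representative chunks
```

Selecting one representative per cluster keeps the prompt size bounded while still covering every region of the document's embedding space, which top-k retrieval against a single fixed query cannot guarantee.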
@@ -250,18 +305,19 @@
         final_summary_prompt = PromptTemplate(
             input_variables=["texts", "language"],
             template="""
-            Combine the following texts into a cohesive and structured final summary:
+            Combine the following texts into a cohesive and structured summary:
             ------------
            {texts}
             ------------
-            The final summary should be between 2 and 4 paragraphs.
             Preserve the original meaning without adding external information or interpretations.
             Ensure clarity, logical flow, and coherence between the combined points.
             The summary must be in {language}.
+            The output must be in markdown format.
+            Output:
             """
         )
 
-        return self.summarizer_by_k_top_n(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
+        return self.summarizer_by_k_means(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
 
 
     def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
 
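For reference, `summary_prompt | current_llm` is LangChain's LCEL pipe: the formatted `PromptTemplate` output feeds straight into the model, and one `.invoke()` call runs both stages with a dict matching `input_variables`. A hedged sketch with a stand-in model so it runs offline (`fake_llm` is illustrative; the real code uses the LLM returned by `set_llm`):

```python
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda

summary_prompt = PromptTemplate(
    input_variables=["texts", "language"],
    template="Summarize the texts below in {language}, in markdown:\n{texts}",
)

# Stand-in for current_llm: receives the formatted prompt value, returns a string
fake_llm = RunnableLambda(lambda p: "## Summary\n(model output for: " + p.text[:40] + "...)")

summary_chain = summary_prompt | fake_llm
print(summary_chain.invoke({"texts": "chunk A\nchunk B", "language": "English"}))
```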
utils.py CHANGED
@@ -10,7 +10,7 @@ if ENVIRONMENT == "dev":
     AI_MODELS = {
         "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
         "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
-        "Huggingface / Google Gemma 2 9B Instruct": "google/gemma-2-9b-it",
+        "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
         "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
         "IBM Granite3.1 dense / Ollama local": "ollama",
         "Open AI / GPT-4o-mini": "openai",
@@ -19,7 +19,7 @@ else:
     AI_MODELS = {
         "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
         "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
-        "Huggingface / Google Gemma 2 9B Instruct": "google/gemma-2-9b-it",
+        "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
         "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
         "Open AI / GPT-4o-mini": "openai",
     }
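The two catalogs differ only in the local Ollama entry, which is offered only in the `dev` branch of the `ENVIRONMENT` check; the Gemma 3 12B swap applies to both. A small sketch of that pattern (the `COMMON_MODELS` split is illustrative; utils.py writes both dicts out in full):

```python
import os

# Models offered in every environment; Gemma 3 12B replaces Gemma 2 9B here
COMMON_MODELS = {
    "Huggingface / Mistral Nemo Instruct": "mistralai/Mistral-Nemo-Instruct-2407",
    "Huggingface / Microsoft Phi 3.5 Mini Instruct": "microsoft/Phi-3.5-mini-instruct",
    "Huggingface / Google Gemma 3 12B Instruct": "google/gemma-3-12b-it",
    "Huggingface / Meta Llama 3.1 8B Instruct": "meta-llama/Llama-3.1-8B-Instruct",
    "Open AI / GPT-4o-mini": "openai",
}

ENVIRONMENT = os.getenv("ENVIRONMENT", "prod")
AI_MODELS = dict(COMMON_MODELS)
if ENVIRONMENT == "dev":
    # The local Ollama backend only makes sense on a machine running Ollama
    AI_MODELS["IBM Granite3.1 dense / Ollama local"] = "ollama"
```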