import json
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from langchain_ibm import WatsonxLLM, WatsonxEmbeddings
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
from ibm_watsonx_ai import APIClient, Credentials
from utils import AI_MODELS, TRANSLATIONS
import chromadb
import requests
import os
from dotenv import load_dotenv
import re
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

OLLAMA_LLM = "granite3.1-dense"
OLLAMA_EMBEDDINGS = "granite-embedding:278m"


load_dotenv()

ENVIRONMENT = os.getenv("ENVIRONMENT")
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
api_key_watsonx = os.getenv('WATSONX_APIKEY')
projectid_watsonx = os.getenv('WATSONX_PROJECT_ID')
endpoint_watsonx = "https://us-south.ml.cloud.ibm.com"

def set_up_watsonx():
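    """Build the watsonx.ai LLM and embedding clients from environment
    credentials. Returns (watsonx_llm, watsonx_embedding) on success, or
    None when IAM authentication fails."""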
    token_watsonx = authenticate_watsonx(api_key_watsonx)
    if token_watsonx is None:
        return None
    parameters = {
        "max_new_tokens": 1500,
        "min_new_tokens": 1,
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 1,
    }

    embed_params = {
        EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 1,
        EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
    }

    credentials = Credentials(
        url=endpoint_watsonx,
        api_key=api_key_watsonx,
    )

    client = APIClient(credentials, project_id=projectid_watsonx)

    client.set_token(token_watsonx)

    watsonx_llm = WatsonxLLM(
        model_id="ibm/granite-3-2-8b-instruct",
        watsonx_client=client,
        params=parameters,
    )

    watsonx_embedding = WatsonxEmbeddings(
        model_id="ibm/granite-embedding-278m-multilingual",
        url=endpoint_watsonx,
        project_id=projectid_watsonx,
        params=embed_params,
    ) 

    return watsonx_llm, watsonx_embedding

def authenticate_watsonx(api_key):
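    """Exchange an IBM Cloud API key for an IAM bearer token.

    Returns the access token string, or None if the request fails."""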
    url = "https://iam.cloud.ibm.com/identity/token"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {
        "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
        "apikey": api_key
    }

    response = requests.post(url, headers=headers, data=data)
    
    if response.status_code == 200:
        token = response.json().get('access_token')
        os.environ["WATSONX_TOKEN"] = token
        return token
    else:
        print("Authentication failed. Status code:", response.status_code)
        print("Response:", response.text)
        return None

class PDFProcessor:
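    """Wraps PDF loading, chunking, retrieval QA, k-means summarization, and
    specialist analysis on top of interchangeable LLM backends (OpenAI,
    Ollama, watsonx.ai, Hugging Face)."""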
    def __init__(self):
        self.language = list(TRANSLATIONS.keys())[0]
    
    def set_language(self, language):
        self.language = language
        
    def set_llm(self, ai_model, type_model, api_key, project_id_watsonx):
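        """Instantiate the (llm, embedding_model) pair for the selected backend.

        ai_model is the UI label of the provider; type_model selects "Local"
        (Ollama) versus "Api Key" (watsonx.ai) for the Granite option."""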
        if ai_model == "Open AI / GPT-4o-mini":
            current_llm = ChatOpenAI(
                model="gpt-4o-mini",
                temperature=0.5,
                max_tokens=None,
                timeout=None,
                max_retries=2,
                api_key=api_key,
            )
            embedding_model = OpenAIEmbeddings(
                model="text-embedding-3-small",
                api_key=api_key,
            )

        elif ai_model == "IBM Granite3.1 dense / Ollama local":
            if type_model == "Local":
                try:
                    # Verify that Ollama is running and the models are available
                    current_llm = OllamaLLM(model=OLLAMA_LLM)
                    # Run a test embedding to fail fast if the model is missing
                    test_embedding = OllamaEmbeddings(model=OLLAMA_EMBEDDINGS)
                    test_embedding.embed_query("test")
                    embedding_model = test_embedding
                except Exception as e:
                    print(f"Error with Ollama: {e}")
                    # Surface a clear remediation message instead of the raw error
                    raise Exception("Please ensure Ollama is running and the models are pulled:\n"
                                    f"ollama pull {OLLAMA_LLM}\n"
                                    f"ollama pull {OLLAMA_EMBEDDINGS}")
            else:
                result = set_up_watsonx()
                if result is None:
                    raise Exception("watsonx.ai authentication failed; check WATSONX_APIKEY and WATSONX_PROJECT_ID")
                current_llm, embedding_model = result
        else:
            if ENVIRONMENT != "dev":
                print("HUGGINGFACE accessing")
                current_llm = HuggingFaceEndpoint(
                    repo_id=AI_MODELS[ai_model],
                    temperature=0.2,
                    huggingfacehub_api_token=HUGGINGFACE_TOKEN,
                )
            else:
                current_llm = HuggingFaceEndpoint(
                    repo_id=AI_MODELS[ai_model],
                    temperature=0.2,
                )
            embedding_model = HuggingFaceEmbeddings(
                model_name="ibm-granite/granite-embedding-278m-multilingual",
            )
        return current_llm, embedding_model
    
    def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
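        """Load a PDF, split it into chunks, embed them, and rebuild an
        in-memory Chroma collection. Returns (status_message, vectorstore).

        Note: the chunk_size and chunk_overlap arguments are currently
        overridden by the defaults defined below."""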
        defined_chunk_size = 1000
        defined_chunk_overlap = 150
        if ai_model == "Open AI / GPT-4o-mini" and api_key == "":
            return TRANSLATIONS[self.language]["api_key_required"], None
        if pdf_file is not None:
            loader = PyPDFLoader(file_path=pdf_file.name)
            documents = loader.load()
            # Drop pages whose page_content is empty
            documents = [doc for doc in documents if doc.page_content]
            if (ai_model in ("Open AI / GPT-4o-mini", "IBM Granite3.1 dense / Ollama local")
                    and type_model == "Api Key"):
                # Hosted models: split only on paragraph and line boundaries
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=defined_chunk_size,
                    chunk_overlap=defined_chunk_overlap,
                    separators=["\n\n", "\n"],
                )
            else:
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=defined_chunk_size,
                    chunk_overlap=defined_chunk_overlap,
                )

            texts = text_splitter.split_documents(documents)
            _, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
            print("vectorstore: ", vectorstore)
            # Drop any previous collection so each upload starts from a clean index
            if vectorstore:
                vectorstore.delete_collection()

            chromadb.api.client.SharedSystemClient.clear_system_cache()
            new_client = chromadb.EphemeralClient()

            vectorstore = Chroma.from_documents(
                documents=texts,
                embedding=embeddings,
                client=new_client,
                collection_name="pdf_collection",
                # persist_directory="./chroma_db"
            )

            print("vectorstore: ", vectorstore)

            return TRANSLATIONS[self.language]["pdf_processed"], vectorstore
        else:
            return TRANSLATIONS[self.language]["load_pdf_first"], None
        
    def get_qa_response(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
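        """Answer a question against the indexed PDF with a "stuff"
        RetrievalQA chain, appending the source page labels to the reply.
        history is accepted for the chat UI signature but not used."""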
        current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)

        if not vectorstore:
            return TRANSLATIONS[self.language]["load_pdf_first"]
        
        retriever = vectorstore.as_retriever(search_kwargs={"k": k})

        qa_chain = RetrievalQA.from_chain_type(
            llm=current_llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
        )
        
        result = qa_chain.invoke({"query": f"{message}.\nYou must answer in {self.language}. Do not mention or add anything that is not in the text."})

        # Collect the unique source pages for attribution
        unique_page_labels = {doc.metadata.get("page_label", "?") for doc in result["source_documents"]}

        page_labels_text = " & ".join(f"Page: {page}" for page in sorted(unique_page_labels))

        return result["result"] + "\n\nSources: " + page_labels_text
        
    def summarizer_by_k_means(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documents=False):
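        """Summarize the document by clustering chunk embeddings with k-means
        and keeping, for each cluster, the chunk closest to the centroid;
        summary_prompt then merges the selected chunks via the LLM. The k
        argument is accepted for signature compatibility but not used."""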
        print("Summarizer by k means in language: ", self.language)
        if not vectorstore:
            return TRANSLATIONS[self.language]["load_pdf_first"]
        
        current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)

        # Fetch all chunks (ids, embeddings, texts) from the vectorstore
        documents = vectorstore.get(include=["embeddings", "documents"])
        doc_ids = documents["ids"]
        doc_embeddings = documents["embeddings"]
        doc_texts = documents["documents"]

        print("documents length: ", len(doc_embeddings))

        # Scale the cluster count with document size: the divisor below is
        # the target number of chunks per cluster
        if len(doc_embeddings) <= 16:
            docs_per_cluster = 2
        elif len(doc_embeddings) <= 64:
            docs_per_cluster = 4
        elif len(doc_embeddings) <= 128:
            docs_per_cluster = 8
        else:
            docs_per_cluster = 12

        num_clusters = max(1, len(doc_embeddings) // docs_per_cluster)

        print("num_clusters: ", num_clusters)
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(doc_embeddings)

        summary_documents = []
        map_ids_documents = {}
        # For each cluster, keep the chunk whose embedding has the highest
        # cosine similarity to the centroid, and remember its id
        for i in range(num_clusters):
            # Indices of the chunks assigned to this cluster
            cluster_indices = [j for j, label in enumerate(kmeans.labels_) if label == i]

            if not cluster_indices:  # Skip empty clusters
                continue

            # Embeddings of the chunks in this cluster
            cluster_embeddings = [doc_embeddings[j] for j in cluster_indices]

            # Similarity of each chunk to the cluster centroid
            centroid = kmeans.cluster_centers_[i]
            similarities = [cosine_similarity([embedding], [centroid])[0][0] for embedding in cluster_embeddings]

            # Keep the chunk closest to the centroid as the cluster representative
            most_similar_index = cluster_indices[similarities.index(max(similarities))]

            summary_documents.append(doc_texts[most_similar_index])
            map_ids_documents[most_similar_index] = doc_ids[most_similar_index]

        print("map_ids_documents: ", map_ids_documents)

        # Join the representative chunks into a single string
        summary_text = "\n".join(summary_documents)
        print("summary_documents: ", summary_text)

        if just_get_documents:
            return summary_text

        summary_chain = summary_prompt | current_llm
        final_summary = summary_chain.invoke({"texts": summary_text, "language": self.language})
        
        return final_summary

    def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documents=False, k=10):
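        """Produce a markdown summary of the whole document via k-means chunk
        selection; with just_get_documents=True, return the selected chunks
        without calling the LLM."""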

        final_summary_prompt = PromptTemplate(
            input_variables=["texts", "language"],
            template="""
            Combine the following texts into a cohesive and structured summary:   
            ------------
            {texts}
            ------------
            Preserve the original meaning without adding external information or interpretations.
            Ensure clarity, logical flow, and coherence between the combined points.
            The summary must be in {language}.
            The output must be in markdown format.
            Summary:
            """
        )
        
        return self.summarizer_by_k_means(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documents)
    
    
    def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
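        """Run a specialist persona over the document: summarize it, ask the
        LLM to derive up to 10 key aspects as JSON, then answer each aspect
        against the vectorstore and return the combined report."""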
        questions_prompt = PromptTemplate(
            input_variables=["text", "specialist_prompt", "language"],
            template="""
            * Act as a specialist based on the following instructions and behaviour that you will follow:
            ------------
            {specialist_prompt}
            ------------
            * Based on your role as specialist, create several synthesized and concise aspects to ask the document's knowledge base about the following text:
            ------------
            {text}
            ------------
            * The key aspects and questions must be provided in JSON format with the following structure:
            {{
                "aspects": [
                    "Aspect 1",
                    "Aspect 2",
                    "Aspect 3",
                    "Aspect 4",
                    "Aspect 5",
                    "Aspect 6",
                    "Aspect 7",
                    "Aspect 8",
                    "Aspect 9",
                    "Aspect 10",
                ]
            }}
            ------------
            *Example of valid output:
            {{
                "aspects": [
                    "Finished date of the project",
                    "Payment of the project",
                    "Project extension"
                    ]
            }}
            ------------
            * The aspects must be written in {language}.
            * The given structure must be followed strictly for the keys: use only the "aspects" list, do not add any other key.
            * Generate up to 10 different aspects.
            ------------
            Answer: 
            """
        )
        if not vectorstore:
            return TRANSLATIONS[self.language]["load_pdf_first"]
        
        print(ai_model)
        print(type_model)
        current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)

        summary_text = self.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx, True, 10)
        questions_chain = questions_prompt | current_llm
        questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})

        print(questions)

        # Chat models return a message object; normalize to plain text first
        if hasattr(questions, "content"):
            questions = questions.content

        # Use a regular expression to extract the JSON object from the reply
        match = re.search(r'\{.*\}', questions, re.DOTALL)
        if match:
            questions = match.group(0)
        else:
            raise ValueError("No valid JSON found in the response")

        questions = json.loads(questions.strip())

        print(questions)

        # Defensive cap in case the model returns more aspects than requested
        questions["aspects"] = questions["aspects"][:15]

        aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(vectorstore, aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])

        return aspects_text
    
    
    """ Act煤a como un abogado altamente experimentado en derecho civil y contractual.

    Examina si existen cl谩usulas abusivas, desproporcionadas o contrarias a la normativa vigente, y expl铆calas con claridad.
    Basa tu an谩lisis en principios relevantes del derecho civil y contractual.
    Ofrece un argumento estructurado y recomendaciones pr谩cticas.
    Si hay m煤ltiples interpretaciones posibles, pres茅ntalas de manera objetiva.
    Mant茅n un tono profesional, preciso y fundamentado.

    Basado en lo que analices, proporciona una evaluaci贸n legal detallada """

    """ Eres profesional en gerencia de proyectos y tienes una amplia experiencia en la creaci贸n, direcci贸n y ejecuci贸n de proyectos de tecnologia.
    
    
    - Basa tu analisis en los objetivos el proyecto, el nicho en que se enfocan y su propuesta de valor.
    - Ofrece un argumento estructurado y recomendaciones pr谩cticas en base a otros posibles nichos y soluciones relacionadas.
    - Mant茅n un tono profesional, preciso y fundamentado.
    Basado en el documento y tu experiencia, proporciona una evaluaci贸n detallada de los proyectos y actividades que se analizaron.
    """

    """ Act煤a como un psicologo experto en recursos humanos, con amplia experiencia en el mejoramiento de hoas de vida de aspirantes a empleados.

    Basado en el siguiente texto que detalla una vacante de trabajo, proporciona una evaluaci贸n detallada de c贸mo esa persona puede mejorar su perfil para ser contratada.

    Descripci贸n de la vacante:

    """

    """ Act煤a como un asesor e ingeniero financiero experto en lectura de reportes y an谩lisis de datos.
    
    Basado en los datos y conclusiones del reporte, proporciona una evaluaci贸n financiera detallada y posibles escenarios tanto negativos como positivos que se puedan presentar.
    Establece el riesgo que se corre en cada escenario, la probabilidad de ocurrencia de cada uno y la magnitud del impacto en el recurso.
    Si hay m煤ltiples interpretaciones posibles, pres茅ntalas de manera objetiva.
    Realiza una hip贸tesis que pronostique el futuro de la situaci贸n o recurso analizado, teniendo en cuenta los datos y conclusiones del reporte.
    Presenta tus hipotesis en 3 aspectos, corto, mediano y largo plazo.
    Mant茅n un tono profesional, preciso y fundamentado.
    
    Basado en lo que analices, proporciona una evaluaci贸n en detalle sobre los activos, reportes y/o recursos que se analizaron"""
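

# A minimal usage sketch (assumptions: a local Ollama install with the models
# pulled, and a sample file "example.pdf"; SimpleNamespace stands in for the
# UI upload object, which only needs a .name attribute here).
if __name__ == "__main__":
    from types import SimpleNamespace

    processor = PDFProcessor()
    processor.set_language(list(TRANSLATIONS.keys())[0])

    pdf = SimpleNamespace(name="example.pdf")  # hypothetical sample file
    status, store = processor.process_pdf(
        None, pdf, 1000, 150,
        "IBM Granite3.1 dense / Ollama local", "Local", "", ""
    )
    print(status)

    answer = processor.get_qa_response(
        store, "What is this document about?", [],
        "IBM Granite3.1 dense / Ollama local", "Local", "", ""
    )
    print(answer)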