import json
import os

import chromadb
import requests
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import OllamaEmbeddings, OllamaLLM
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
from langchain_ibm import WatsonxLLM, WatsonxEmbeddings
from ibm_watsonx_ai import APIClient, Credentials
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames

from utils import AI_MODELS, TRANSLATIONS

# Ollama model tags used for the fully local path.
OLLAMA_LLM = "granite3.1-dense"
OLLAMA_EMBEDDINGS = "granite-embedding:278m"
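# Both models must already be pulled locally before the "Local" path works
# (this mirrors the error message raised in set_llm below), e.g.:
#   ollama pull granite3.1-dense
#   ollama pull granite-embedding:278m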

load_dotenv()

# watsonx.ai settings: the API key and project id are read from the
# environment (e.g. a .env file); the endpoint targets the us-south region.
api_key_watsonx = os.getenv('WATSONX_APIKEY')
projectid_watsonx = os.getenv('WATSONX_PROJECT_ID')
endpoint_watsonx = "https://us-south.ml.cloud.ibm.com"

def set_up_watsonx():
    """Build the watsonx.ai LLM and embedding model, or return None if IAM auth fails."""
    token_watsonx = authenticate_watsonx(api_key_watsonx)
    if token_watsonx is None:
        return None

    # Text-generation parameters for the Granite instruct model.
    parameters = {
        "max_new_tokens": 1500,
        "min_new_tokens": 1,
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 1,
    }

    embed_params = {
        EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 1,
        EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
    }

    credentials = Credentials(
        url=endpoint_watsonx,
        api_key=api_key_watsonx,
    )

    client = APIClient(credentials, project_id=projectid_watsonx)
    client.set_token(token_watsonx)

    watsonx_llm = WatsonxLLM(
        model_id="ibm/granite-3-2-8b-instruct",
        watsonx_client=client,
        params=parameters,
    )

    watsonx_embedding = WatsonxEmbeddings(
        model_id="ibm/granite-embedding-278m-multilingual",
        url=endpoint_watsonx,
        project_id=projectid_watsonx,
        params=embed_params,
    )

    return watsonx_llm, watsonx_embedding

def authenticate_watsonx(api_key):
    """Exchange an IBM Cloud API key for an IAM access token."""
    url = "https://iam.cloud.ibm.com/identity/token"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {
        "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
        "apikey": api_key
    }

    response = requests.post(url, headers=headers, data=data)

    if response.status_code == 200:
        token = response.json().get('access_token')
        os.environ["WATSONX_TOKEN"] = token
        return token
    else:
        print("Authentication failed. Status code:", response.status_code)
        print("Response:", response.text)
        return None

class PDFProcessor:
    def __init__(self):
        self.vectorstore = None
        self.language = "English"

    def set_language(self, language):
        self.language = language

    def set_llm(self, ai_model, type_model, api_key, project_id_watsonx):
        if ai_model == "Open AI / GPT-4o-mini":
            current_llm = ChatOpenAI(
                model="gpt-4o-mini",  # aligned with the model named in the UI label
                temperature=0.5,
                max_tokens=None,
                timeout=None,
                max_retries=2,
                api_key=api_key,
            )
            embedding_model = OpenAIEmbeddings(
                model="text-embedding-3-small",
                api_key=api_key,
            )

        elif ai_model == "IBM Granite3.1 dense / Ollama local":
            if type_model == "Local":
                try:
                    current_llm = OllamaLLM(model=OLLAMA_LLM)

                    # Embed a test string so a missing model fails fast here.
                    test_embedding = OllamaEmbeddings(model=OLLAMA_EMBEDDINGS)
                    test_embedding.embed_query("test")
                    embedding_model = test_embedding
                except Exception as e:
                    print(f"Error with Ollama: {e}")
                    raise Exception("Please ensure Ollama is running and the models are pulled: \n" +
                                    f"ollama pull {OLLAMA_LLM}\n" +
                                    f"ollama pull {OLLAMA_EMBEDDINGS}")
            else:
                # set_up_watsonx returns None when IAM authentication fails.
                watsonx_models = set_up_watsonx()
                if watsonx_models is None:
                    raise Exception("watsonx.ai authentication failed; check WATSONX_APIKEY.")
                current_llm, embedding_model = watsonx_models
        else:
            current_llm = HuggingFaceEndpoint(
                repo_id=AI_MODELS[ai_model],
                temperature=0.5,
            )
            embedding_model = HuggingFaceEmbeddings(
                model_name="ibm-granite/granite-embedding-278m-multilingual",
            )
        return current_llm, embedding_model

    def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
        # Fixed chunking values; the chunk_size/chunk_overlap arguments passed
        # in from the UI are currently ignored.
        defined_chunk_size = 1000
        defined_chunk_overlap = 100

        if ai_model == "Open AI / GPT-4o-mini" and api_key == "":
            return TRANSLATIONS[self.language]["api_key_required"]

        if pdf_file is not None:
            loader = PyPDFLoader(file_path=pdf_file.name)
            documents = loader.load()

            # Drop pages with no extractable text.
            documents = [doc for doc in documents if doc.page_content]

            # The OpenAI and watsonx "Api Key" paths get explicit separators;
            # every other path uses the splitter defaults.
            if ai_model in ("Open AI / GPT-4o-mini", "IBM Granite3.1 dense / Ollama local") and type_model == "Api Key":
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=defined_chunk_size,
                    chunk_overlap=defined_chunk_overlap,
                    separators=["\n\n", "\n"]
                )
            else:
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=defined_chunk_size,
                    chunk_overlap=defined_chunk_overlap,
                )

            texts = text_splitter.split_documents(documents)
            _, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)

            # Rebuild the collection from scratch for each new PDF.
            if self.vectorstore:
                self.vectorstore.delete_collection()

            new_client = chromadb.EphemeralClient()
            self.vectorstore = Chroma.from_documents(
                documents=texts,
                embedding=embeddings,
                client=new_client,
                collection_name="pdf_collection"
            )

            return TRANSLATIONS[self.language]["pdf_processed"] + f" ---- Chunks: {len(self.vectorstore.get()['documents'])}"
        else:
            return TRANSLATIONS[self.language]["load_pdf_first"]

    def get_qa_response(self, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
        # `history` is accepted but unused (chat-callback style signature).
        current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)

        if not self.vectorstore:
            return TRANSLATIONS[self.language]["load_pdf_first"]

        retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})

        qa_chain = RetrievalQA.from_chain_type(
            llm=current_llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
        )

        result = qa_chain.invoke({"query": f"{message}.\nYou must answer in {self.language}. Do not mention or add anything that is not in the text."})

        # Cite the distinct page labels of the retrieved chunks.
        unique_page_labels = {doc.metadata['page_label'] for doc in result["source_documents"]}
        page_labels_text = " & ".join([f"Page: {page}" for page in sorted(unique_page_labels)])

        return result["result"] + "\n\nSources: " + page_labels_text

    def summarizer_by_k_top_n(self, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documents=False):
        if not self.vectorstore:
            return TRANSLATIONS[self.language]["load_pdf_first"]

        current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)

        # Retrieve the k chunks most relevant to a generic summary query.
        retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
        documents = retriever.invoke('Summary of the document and key points')

        if just_get_documents:
            return "\n".join([doc.page_content for doc in documents])

        # StrOutputParser keeps the result a plain string even for chat backends.
        summary_chain = summary_prompt | current_llm | StrOutputParser()
        final_summary = summary_chain.invoke({"texts": "\n".join([doc.page_content for doc in documents]), "language": self.language})
        return final_summary

    def get_summary(self, ai_model, type_model, api_key, project_id_watsonx, just_get_documents=False, k=10):
        final_summary_prompt = PromptTemplate(
            input_variables=["texts", "language"],
            template="""
            Combine the following texts into a cohesive and structured final summary:
            ------------
            {texts}
            ------------
            The final summary should be between 2 and 4 paragraphs.
            Preserve the original meaning without adding external information or interpretations.
            Ensure clarity, logical flow, and coherence between the combined points.
            The summary must be in {language}.
            """
        )

        return self.summarizer_by_k_top_n(ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documents)

    def get_specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
        questions_prompt = PromptTemplate(
            input_variables=["text", "specialist_prompt", "language"],
            template="""
            * Act as a specialist, following the instructions and behaviour below:
            ------------
            {specialist_prompt}
            ------------
            * Based on your role as a specialist, create several synthesized and concise aspects to ask the document's knowledge base about the following text:
            ------------
            {text}
            ------------
            * The key aspects and questions must be provided in JSON format with the following structure:
            {{
                "aspects": [
                    "Aspect 1",
                    "Aspect 2",
                    "Aspect 3",
                    "Aspect 4",
                    "Aspect 5",
                    "Aspect 6",
                    "Aspect 7",
                    "Aspect 8",
                    "Aspect 9",
                    "Aspect 10"
                ]
            }}
            ------------
            * Example of valid output:
            {{
                "aspects": [
                    "Finished date of the project",
                    "Payment of the project",
                    "Project extension"
                ]
            }}
            ------------
            * The aspects must be written in {language}.
            * Follow the structure strictly: use only the "aspects" key with a list of strings, and do not add any other key.
            * Generate up to 10 different aspects.
            ------------
            Answer:
            """
        )

        if not self.vectorstore:
            return TRANSLATIONS[self.language]["load_pdf_first"]

        current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)

        # Summarize the top chunks, then ask the LLM for aspects to investigate.
        summary_text = self.get_summary(ai_model, type_model, api_key, project_id_watsonx, True, 10)
        questions_chain = questions_prompt | current_llm | StrOutputParser()
        questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})

        # Extract the JSON object from the raw completion: take the text between
        # the first "{" and the first "}", then restore the braces json.loads needs.
        questions = questions.split("{")[1]
        questions = questions.split("}")[0]
        questions = "{" + questions.strip() + "}"
        questions = json.loads(questions)

        # Cap the number of aspects to keep the follow-up QA calls bounded.
        if len(questions["aspects"]) > 15:
            questions["aspects"] = questions["aspects"][:15]

        # Answer each aspect against the vector store with a small k.
        aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])

        return aspects_text

""" Actúa como un abogado altamente experimentado en derecho civil y contractual. |
|
|
|
Examina si existen cláusulas abusivas, desproporcionadas o contrarias a la normativa vigente, y explícalas con claridad. |
|
Basa tu análisis en principios relevantes del derecho civil y contractual. |
|
Ofrece un argumento estructurado y recomendaciones prácticas. |
|
Si hay múltiples interpretaciones posibles, preséntalas de manera objetiva. |
|
Mantén un tono profesional, preciso y fundamentado. |
|
|
|
Basado en lo que analices, proporciona una evaluación legal detallada """ |
|
|
|
""" Actúa como un asesor e ingeniero financiero experto en lectura de reportes y análisis de datos. |
|
|
|
Basado en los datos y conclusiones del reporte, proporciona una evaluación financiera detallada y posibles escenarios tanto negativos como positivos que se puedan presentar. |
|
Establece el riesgo que se corre en cada escenario, la probabilidad de ocurrencia de cada uno y la magnitud del impacto en el recurso. |
|
Si hay múltiples interpretaciones posibles, preséntalas de manera objetiva. |
|
Realiza una hipótesis que pronostique el futuro de la situación o recurso analizado, teniendo en cuenta los datos y conclusiones del reporte. |
|
Presenta tus hipotesis en 3 aspectos, corto, mediano y largo plazo. |
|
Mantén un tono profesional, preciso y fundamentado. |
|
|
|
Basado en lo que analices, proporciona una evaluación en detalle sobre los activos, reportes y/o recursos que se analizaron""" |
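

# --- Usage sketch (an illustrative addition, not original app wiring) ---
# A minimal smoke test assuming Ollama is running locally with the two models
# above pulled, and that "sample.pdf" is a stand-in path for a real file.
if __name__ == "__main__":
    from types import SimpleNamespace

    processor = PDFProcessor()
    processor.set_language("English")

    # process_pdf reads the path from a Gradio-style file object's .name
    # attribute, so a SimpleNamespace stands in for an uploaded file here.
    sample_pdf = SimpleNamespace(name="sample.pdf")
    print(processor.process_pdf(sample_pdf, 1000, 100,
                                "IBM Granite3.1 dense / Ollama local", "Local", "", ""))

    print(processor.get_qa_response("What is this document about?", [],
                                    "IBM Granite3.1 dense / Ollama local", "Local", "", ""))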