manuelcozar55 committed on
Commit
c8ef645
verified
1 Parent(s): aeeb3c0

Update app.py

Files changed (1)
  1. app.py +135 -115
app.py CHANGED
@@ -1,83 +1,60 @@
  import streamlit as st
- from huggingface_hub import snapshot_download
- from pathlib import Path
- from mistral_inference.model import Transformer
- from mistral_inference.generate import generate
- from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
- from mistral_common.protocol.instruct.messages import UserMessage
- from mistral_common.protocol.instruct.request import ChatCompletionRequest
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
  from PyPDF2 import PdfReader
  from docx import Document
  import csv
  import json
- import os
  import torch
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.embeddings import HuggingFaceEmbeddings
  from langchain.vectorstores import FAISS

- # Download and configure the model
- mistral_models_path = Path.home().joinpath('mistral_models', '7B-Instruct-v0.3')
- mistral_models_path.mkdir(parents=True, exist_ok=True)
- snapshot_download(repo_id="mistralai/Mistral-7B-Instruct-v0.3", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)

- # Set up the model and the tokenizer
- tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
- model = Transformer.from_folder(mistral_models_path)

- # Classification model setup
- @st.cache_resource
- def load_classification_model():
-     tokenizer_cls = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
-     model_cls = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
-     return model_cls, tokenizer_cls

- classification_model, classification_tokenizer = load_classification_model()

- id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}

- def classify_text(text):
-     inputs = classification_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
-     classification_model.eval()
-     with torch.no_grad():
-         outputs = classification_model(**inputs)
-     logits = outputs.logits
-     predicted_class_id = logits.argmax(dim=-1).item()
-     predicted_label = id2label[predicted_class_id]
-     return predicted_label

- def load_json_documents(category):
-     with open(f"./{category}.json", "r", encoding="utf-8") as f:
-         data = json.load(f)["questions_and_answers"]
-     documents = [entry["question"] + " " + entry["answer"] for entry in data]
-     return documents

- def create_vector_store(docs):
-     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2", model_kwargs={"device": "cpu"})
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
-     split_docs = text_splitter.split_text(docs)
-     vector_store = FAISS.from_texts(split_docs, embeddings)
-     return vector_store

- def translate(text, target_language):
-     completion_request = ChatCompletionRequest(
-         messages=[UserMessage(content=f"Por favor, traduzca el siguiente documento al {target_language}:\n{text}\nAsegúrese de que la traducción sea precisa y conserve el significado original del documento.")]
-     )
-     tokens = tokenizer.encode_chat_completion(completion_request).tokens
-     out_tokens, _ = generate([tokens], model, max_tokens=512, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
-     translated_text = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])
-     return translated_text

- def summarize(text, length):
-     completion_request = ChatCompletionRequest(
-         messages=[UserMessage(content=f"Por favor, haga un resumen {length} del siguiente documento:\n{text}\nAsegúrese de que el resumen sea conciso y conserve el significado original del documento.")]
-     )
-     tokens = tokenizer.encode_chat_completion(completion_request).tokens
-     out_tokens, _ = generate([tokens], model, max_tokens=512, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
-     summarized_text = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])
-     return summarized_text

  def handle_uploaded_file(uploaded_file):
      try:
          if uploaded_file.name.endswith(".txt"):
@@ -104,66 +81,109 @@ def handle_uploaded_file(uploaded_file):
      except Exception as e:
          return str(e)

  def main():
      st.title("LexAIcon")
      st.write("Puedes conversar con este chatbot basado en Mistral-7B-Instruct y subir archivos para que el chatbot los procese.")

-     if "messages" not in st.session_state:
-         st.session_state["messages"] = []
-
      with st.sidebar:
-         st.text_input("HuggingFace Token", value=huggingface_token, type="password", key="huggingface_token")
          st.caption("[Consigue un HuggingFace Token](https://huggingface.co/settings/tokens)")

-     user_input = st.text_input("Introduce tu consulta:", "")
-
-     if user_input:
-         st.session_state.messages.append({"role": "user", "content": user_input})
-
-     operation = st.radio("Selecciona una operación", ["Resumir", "Traducir", "Explicar"])
-     target_language = None
-     summary_length = None
-
-     if operation == "Traducir":
-         target_language = st.selectbox("Selecciona el idioma de traducción", ["español", "inglés", "francés", "alemán"])
-
-     if operation == "Resumir":
-         summary_length = st.selectbox("Selecciona la longitud del resumen", ["corto", "medio", "largo"])
-
-     if uploaded_files := st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"], accept_multiple_files=True):
-         for uploaded_file in uploaded_files:
-             file_content = handle_uploaded_file(uploaded_file)
-             classification = classify_text(file_content)
-             docs = load_json_documents(classification)
-             vector_store = create_vector_store(docs)
-             search_docs = vector_store.similarity_search(user_input)
-             context = " ".join([doc.page_content for doc in search_docs])
-             completion_request = ChatCompletionRequest(
-                 messages=[UserMessage(content=f"Contexto: {context}\n\nPregunta: {user_input}")]
-             )
-             tokens = tokenizer.encode_chat_completion(completion_request).tokens
-             out_tokens, _ = generate([tokens], model, max_tokens=512, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
-             bot_response = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])
-     elif operation == "Resumir":
-         if summary_length == "corto":
-             length = "de aproximadamente 50 palabras"
-         elif summary_length == "medio":
-             length = "de aproximadamente 100 palabras"
-         elif summary_length == "largo":
-             length = "de aproximadamente 500 palabras"
-         bot_response = summarize(user_input, length)
-     elif operation == "Traducir":
-         bot_response = translate(user_input, target_language)
-     else:
-         completion_request = ChatCompletionRequest(
-             messages=[UserMessage(content=user_input)]
-         )
-         tokens = tokenizer.encode_chat_completion(completion_request).tokens
-         out_tokens, _ = generate([tokens], model, max_tokens=512, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
-         bot_response = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])
-
-     st.session_state.messages.append({"role": "assistant", "content": bot_response})
-     st.write(f"**Assistant:** {bot_response}")

  if __name__ == "__main__":
      main()
 
  import streamlit as st
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification
+ from langchain.llms import HuggingFacePipeline
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
  from PyPDF2 import PdfReader
  from docx import Document
  import csv
  import json
  import torch
  from langchain.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter

+ # Set up the model and tokenizer
+ model_name = 'mistralai/Mistral-7B-Instruct-v0.3'
+ model_config = AutoConfig.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "right"

+ model = AutoModelForCausalLM.from_pretrained(model_name)

+ text_generation_pipeline = pipeline(
+     model=model,
+     tokenizer=tokenizer,
+     task="text-generation",
+     temperature=0.2,
+     repetition_penalty=1.1,
+     return_full_text=True,
+     max_new_tokens=300,
+ )

+ prompt_template = """
+ ### [INST]
+ Instruction: Answer the question based on your knowledge. Here is context to help:
+
+ {context}
+
+ ### QUESTION:
+ {question}
+
+ [/INST]
+ """

+ mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

+ # Build the prompt from the prompt template
+ prompt = PromptTemplate(
+     input_variables=["context", "question"],
+     template=prompt_template,
+ )

+ # Build the LLM chain
+ llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

+ # Function to handle uploaded files
  def handle_uploaded_file(uploaded_file):
      try:
          if uploaded_file.name.endswith(".txt"):

      except Exception as e:
          return str(e)

+ # Function to translate text
+ def translate(text, target_language):
+     context = ""
+     question = f"Por favor, traduzca el siguiente documento al {target_language}:\n{text}\nAsegúrese de que la traducción sea precisa y conserve el significado original del documento."
+     response = llm_chain.run(context=context, question=question)
+     return response
+
+ # Function to summarize text
+ def summarize(text, length):
+     context = ""
+     question = f"Por favor, haga un resumen {length} del siguiente documento:\n{text}\nAsegúrese de que el resumen sea conciso y conserve el significado original del documento."
+     response = llm_chain.run(context=context, question=question)
+     return response
+
+ # Classification model setup
+ @st.cache_resource
+ def load_classification_model():
+     tokenizer_cls = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
+     model_cls = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
+     return model_cls, tokenizer_cls
+
+ classification_model, classification_tokenizer = load_classification_model()
+
+ id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
+
+ def classify_text(text):
+     inputs = classification_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
+     classification_model.eval()
+     with torch.no_grad():
+         outputs = classification_model(**inputs)
+     logits = outputs.logits
+     predicted_class_id = logits.argmax(dim=-1).item()
+     predicted_label = id2label[predicted_class_id]
+     return predicted_label
+
+ # Function to load JSON documents
+ def load_json_documents(category):
+     try:
+         with open(f"./{category}.json", "r", encoding="utf-8") as f:
+             data = json.load(f)["questions_and_answers"]
+         documents = [entry["question"] + " " + entry["answer"] for entry in data]
+         return documents
+     except FileNotFoundError:
+         return []
+
+ # FAISS and embeddings setup
+ @st.cache_resource
+ def create_vector_store(docs):
+     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2", model_kwargs={"device": "cpu"})
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+     split_docs = text_splitter.split_text(" ".join(docs))  # docs is a list of strings; split_text expects a single string
+     vector_store = FAISS.from_texts(split_docs, embeddings)
+     return vector_store
+
138
+ def explain_text(user_input, document_text):
139
+ classification = classify_text(document_text)
140
+ if classification in ["multas", "politicas_de_privacidad", "contratos", "denuncias"]:
141
+ docs = load_json_documents(classification)
142
+ if docs:
143
+ vector_store = create_vector_store(docs)
144
+ search_docs = vector_store.similarity_search(user_input)
145
+ context = " ".join([doc.page_content for doc in search_docs])
146
+ else:
147
+ context = ""
148
+ else:
149
+ context = ""
150
+ question = user_input
151
+ response = llm_chain.run(context=context, question=question)
152
+ return response
153
+
154
  def main():
155
  st.title("LexAIcon")
156
  st.write("Puedes conversar con este chatbot basado en Mistral-7B-Instruct y subir archivos para que el chatbot los procese.")
157
 
 
 
 
158
  with st.sidebar:
 
159
  st.caption("[Consigue un HuggingFace Token](https://huggingface.co/settings/tokens)")
160
 
161
+ operation = st.radio("Selecciona una operaci贸n", ["Resumir", "Traducir", "Explicar"])
162
+
163
+ if operation == "Explicar":
164
+ user_input = st.text_area("Introduce tu pregunta:", "")
165
+ uploaded_file = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"])
166
+ if uploaded_file and user_input:
167
+ document_text = handle_uploaded_file(uploaded_file)
168
+ bot_response = explain_text(user_input, document_text)
169
+ st.write(f"**Assistant:** {bot_response}")
170
+ else:
171
+ uploaded_file = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"])
172
+ if uploaded_file:
173
+ document_text = handle_uploaded_file(uploaded_file)
174
+ if operation == "Traducir":
175
+ target_language = st.selectbox("Selecciona el idioma de traducci贸n", ["espa帽ol", "ingl茅s", "franc茅s", "alem谩n"])
176
+ bot_response = translate(document_text, target_language)
177
+ elif operation == "Resumir":
178
+ summary_length = st.selectbox("Selecciona la longitud del resumen", ["corto", "medio", "largo"])
179
+ if summary_length == "corto":
180
+ length = "de aproximadamente 50 palabras"
181
+ elif summary_length == "medio":
182
+ length = "de aproximadamente 100 palabras"
183
+ elif summary_length == "largo":
184
+ length = "de aproximadamente 500 palabras"
185
+ bot_response = summarize(document_text, length)
186
+ st.write(f"**Assistant:** {bot_response}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
  if __name__ == "__main__":
189
  main()