Update app.py
app.py CHANGED
@@ -1,7 +1,6 @@
 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-import os
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
 from langchain_groq import ChatGroq
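A note on this first hunk: `import os` is removed, but the new file still calls `os.getenv("GROQ_API_KEY")` further down (new line 16), and no other `import os` appears in the diff, so the Space would likely fail with a NameError at startup unless `os` is imported somewhere not shown. A minimal sketch of the import block the new file seems to need:

```python
# Sketch, assuming nothing outside the diff re-imports os:
# os.getenv() is still called at module level, so keep the import.
import os

from dotenv import load_dotenv

load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # NameError here if "import os" is dropped
```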
@@ -11,74 +10,27 @@ from dotenv import load_dotenv
 import re
 
 load_dotenv()
-os.getenv("GROQ_API_KEY")
 
-css_style = """
+# Configuración inicial
+st.set_page_config(page_title="PDF Consultor 🔍", page_icon="🔍", layout="wide")
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+
+# CSS personalizado
+st.markdown("""
 <style>
-.step-number {
-    font-size: 24px;
-    font-weight: bold;
-}
-.response-box {
-    padding: 20px;
-    background-color: #f8f9fa;
-    border-radius: 10px;
-    border-left: 5px solid #252850;
-    margin: 20px 0;
-    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-}
-.metadata-box {
-    padding: 20px;
-    background-color: #f0f2f6;
-    border-radius: 10px;
-    margin-bottom: 20px;
-}
-.custom-input {
-    font-size: 16px;
-    padding: 10px;
-    border-radius: 5px;
-    border: 1px solid #ccc;
-}
-.suggestion-container {
-    border: 1px solid #e0e0e0;
-    border-radius: 8px;
-    padding: 15px;
-    margin: 10px 0;
-    background: #f8f9fa;
-}
-.suggestion-btn {
-    width: 100%;
-    margin: 3px 0;
-    padding: 8px;
-    border-radius: 5px;
-    border: 1px solid #252850;
-    background: white;
-    cursor: pointer;
-    transition: all 0.2s;
-}
-.suggestion-btn:hover {
-    background: #252850;
-    color: white;
-}
+.response-box { padding: 20px; background-color: #f8f9fa; border-radius: 10px; border-left: 5px solid #252850; margin: 20px 0; }
+.metadata-box { padding: 20px; background-color: #f0f2f6; border-radius: 10px; margin-bottom: 20px; }
+.step-number { font-size: 24px; font-weight: bold; }
 </style>
-"""
+""", unsafe_allow_html=True)
 
+# Funciones auxiliares
 def eliminar_proceso_pensamiento(texto):
-    lineas = [linea.strip() for linea in texto.split('\n') if linea.strip()]
-    return lineas[-1] if lineas else "Respuesta no disponible"
+    limpio = re.sub(r'<think>.*?</think>', '', texto, flags=re.DOTALL)
+    return limpio.strip(), re.search(r'<think>(.*?)</think>', texto, re.DOTALL).group(1) if "<think>" in texto else "No disponible"
 
 def get_pdf_text(pdf_docs):
-    text = ""
-    for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    return text
-
-def get_text_chunks(text):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
-    return text_splitter.split_text(text)
+    return "".join([page.extract_text() for pdf in pdf_docs for page in PdfReader(pdf).pages])
 
 def get_vector_store(text_chunks):
     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
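The rewritten `eliminar_proceso_pensamiento` changes its contract: instead of returning only the last non-empty line, it now returns a `(respuesta, pensamiento)` tuple and strips the `<think>…</think>` blocks that DeepSeek-R1-style models emit. A self-contained check of the new behaviour (the sample text is invented):

```python
import re

def eliminar_proceso_pensamiento(texto):
    # As added in the commit: remove <think> blocks, then return the clean
    # answer plus the captured reasoning (or a fallback when there is none).
    limpio = re.sub(r'<think>.*?</think>', '', texto, flags=re.DOTALL)
    return limpio.strip(), re.search(r'<think>(.*?)</think>', texto, re.DOTALL).group(1) if "<think>" in texto else "No disponible"

respuesta, pensamiento = eliminar_proceso_pensamiento(
    "<think>busco la entidad en el contexto</think>La entidad es el Ministerio."
)
print(respuesta)    # -> La entidad es el Ministerio.
print(pensamiento)  # -> busco la entidad en el contexto
```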
@@ -86,9 +38,8 @@ def get_vector_store(text_chunks):
 
 def get_conversational_chain():
     prompt_template = """
-    Responde en español exclusivamente con la información solicitada usando el contexto
-
-    Formato: Respuesta directa sin prefijos. Si no hay información, di "No disponible".
+    Responde en español exclusivamente con la información solicitada usando el contexto.
+    Si no hay información, di "No disponible".
 
     Contexto:
     {context}
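Also from the second hunk: the new one-line `get_pdf_text` joins `page.extract_text()` results directly, but in some PyPDF2 versions `extract_text()` can return None for pages with no extractable text (e.g. scanned images), which would make the join raise a TypeError. A defensive variant with the same shape, assuming that edge case matters for the target PDFs:

```python
from PyPDF2 import PdfReader

def get_pdf_text(pdf_docs):
    # Fall back to "" for pages where extract_text() yields nothing
    return "".join(
        (page.extract_text() or "")
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
```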
@@ -98,209 +49,89 @@ def get_conversational_chain():
 
     Respuesta:
     """
-    model = ChatGroq(
-        temperature=…,
-        model_name="deepseek-r1-distill-llama-70b",
-        groq_api_key=os.getenv("GROQ_API_KEY")
-    )
-    return load_qa_chain(model, chain_type="stuff",
-                         prompt=PromptTemplate(template=prompt_template,
-                                               input_variables=["context", "question"]))
+    model = ChatGroq(temperature=0.2, model_name="deepseek-r1-distill-llama-70b", groq_api_key=GROQ_API_KEY)
+    return load_qa_chain(model, chain_type="stuff", prompt=PromptTemplate(template=prompt_template, input_variables=["context", "question"]))
 
-def extract_metadata(vector_store):
-    preguntas = {
-        "title": "…",
-        "entity": "…",
-        "date": "¿A qué fecha corresponde el documento? Si existen indicios indica la fecha, sino di 'No disponible'"
-    }
+def procesar_consulta(pregunta):
+    if 'vector_store' not in st.session_state:
+        st.error("Por favor carga un documento primero")
+        return
 
-    metadata = {}
     chain = get_conversational_chain()
+    docs = st.session_state.vector_store.similarity_search(pregunta)
 
-    for key, question in preguntas.items():
-        docs = vector_store.similarity_search(question)
-        response = chain(
-            {"input_documents": docs, "question": question},
-            return_only_outputs=True
-        )
-        clean_response = eliminar_proceso_pensamiento(response['output_text'])
-        metadata[key] = clean_response if clean_response else "No disponible"
+    with st.spinner("Analizando documento..."):
+        response = chain({"input_documents": docs, "question": pregunta}, return_only_outputs=True)
 
-    return metadata
+    respuesta_final, pensamiento = eliminar_proceso_pensamiento(response['output_text'])
+    mostrar_respuesta(respuesta_final, pensamiento)
 
-def mostrar_respuesta(…):
-    …
+def mostrar_respuesta(respuesta, pensamiento):
+    st.markdown(f'<div class="response-box">{respuesta}</div>', unsafe_allow_html=True)
+    with st.expander("💭 Pensamiento del modelo"):
+        st.write(pensamiento)
 
 def generar_sugerencias():
-    """Genera preguntas sugeridas simples y generales"""
     if 'vector_store' not in st.session_state:
-        return
+        return []
 
-    try:
-        docs = st.session_state.vector_store.similarity_search(…)
-        context = "\n".join([doc.page_content for doc in docs])
+    docs = st.session_state.vector_store.similarity_search("", k=3)
+    context = "\n".join([doc.page_content for doc in docs])
 
-        prompt_template = """
-        Genera exactamente 3 preguntas en español basadas en el contexto.
-        Las preguntas deben ser en español, simples y sencillas de máximo 10 palabras.
-        Formato de respuesta:
-        1. [Pregunta completa en español]
-        2. [Pregunta completa en español]
-        3. [Pregunta completa en español]
-
-        Contexto:
-        {context}
-        """
-
-        model = ChatGroq(
-            temperature=0.4,
-            model_name="deepseek-r1-distill-llama-70b",
-            groq_api_key=os.getenv("GROQ_API_KEY")
-        )
-
-        response = model.invoke(prompt_template.format(context=context))
-
-        preguntas = []
-        for line in response.content.split("\n"):
-            line = line.strip()
-            if line and line[0].isdigit():
-                pregunta = line.split('. ', 1)[1] if '. ' in line else line[2:]
-                if pregunta:
-                    preguntas.append(pregunta)
-
-        return preguntas[:3]
-
-    except Exception as e:
-        st.error(f"Error generando sugerencias: {str(e)}")
-        return
+    prompt_template = """
+    Genera exactamente 3 preguntas simples en español basadas en este contexto.
+
+    Contexto:
+    {context}
+
+    Preguntas sugeridas:
+    """
+
+    model = ChatGroq(temperature=0.4, model_name="deepseek-r1-distill-llama-70b", groq_api_key=GROQ_API_KEY)
+    response = model.invoke(prompt_template.format(context=context))
+
+    preguntas = [line.split('. ', 1)[1] for line in response.content.split("\n") if line.strip() and line[0].isdigit()]
+    return preguntas[:3]
 
-def procesar_consulta(user_question):
-    if 'vector_store' not in st.session_state:
-        st.error("Por favor carga un documento primero")
-        return
-
-    …
-
+# Aplicación principal
 def main():
-    st.set_page_config(page_title="PDF Consultor 🔍", page_icon="🔍", layout="wide")
     st.title("PDF Consultor 🔍")
-    st.markdown(css_style, unsafe_allow_html=True)
 
-    estados = {
-        'documento_cargado': False,
-        'sugerencias': [],
-        'pregunta_actual': "",
-        'respuestas': []
-    }
-    for key, value in estados.items():
-        if key not in st.session_state:
-            st.session_state[key] = value
+    # Estados de sesión
+    if 'documento_cargado' not in st.session_state:
+        st.session_state.documento_cargado = False
+        st.session_state.sugerencias = []
+        st.session_state.pregunta_actual = ""
 
-    # Sidebar - Carga de documentos
-    with st.sidebar:
-        st.markdown('<p class="step-number">1 Subir archivos</p>', unsafe_allow_html=True)
-        pdf_docs = st.file_uploader(
-            "Subir PDF(s)",
-            accept_multiple_files=True,
-            type=["pdf"],
-            label_visibility="collapsed"
-        )
+    # Sidebar de carga de documentos
+    with st.sidebar:
+        st.markdown('<p class="step-number">1. Subir archivos</p>', unsafe_allow_html=True)
+        pdf_docs = st.file_uploader("Subir PDF(s)", accept_multiple_files=True, type=["pdf"])
 
-        # …
-        if pdf_docs:
-            try:
-                with st.spinner(…):
+        if pdf_docs and not st.session_state.documento_cargado:
+            with st.spinner("Procesando documento..."):
                 raw_text = get_pdf_text(pdf_docs)
-                text_chunks = get_text_chunks(raw_text)
+                text_chunks = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500).split_text(raw_text)
                 vector_store = get_vector_store(text_chunks)
-
-                st.session_state.metadata = extract_metadata(vector_store)
                 st.session_state.vector_store = vector_store
                 st.session_state.documento_cargado = True
                 st.session_state.sugerencias = generar_sugerencias()
-
-                st.…
-
-            except Exception as e:
-                st.error(f"Error procesando documento: {str(e)}")
+                st.success("Documento procesado exitosamente.")
+                st.experimental_rerun()
 
-    # …
-    if st.session_state.documento_cargado:
-        # Mostrar metadatos
-        st.markdown("---")
-        cols = st.columns(3)
-        campos_metadata = [
-            ("📄 Título", "title"),
-            ("🏛️ Entidad", "entity"),
-            ("📅 Fecha", "date")
-        ]
-
-        for col, (icono, key) in zip(cols, campos_metadata):
-            with col:
-                st.markdown(f"""
-                    <div class="metadata-box">
-                        <div style="font-size:16px; margin-bottom:10px;">{icono}</div>
-                        {st.session_state.metadata[key]}
-                    </div>
-                """, unsafe_allow_html=True)
-
-        # Sugerencias
+    # Mostrar sugerencias y formulario principal
+    if st.session_state.documento_cargado:
         if st.session_state.sugerencias:
-            st.markdown('<div class="suggestion-container">', unsafe_allow_html=True)
-            st.markdown("""
-                <div style="font-size:14px; color:#666; margin-bottom:8px;">💡 ¿Necesitas ideas?</div>
-            """, unsafe_allow_html=True)
-
-            cols_sugerencias = st.columns(3)
-            for i, (col, pregunta) in enumerate(zip(cols_sugerencias, st.session_state.sugerencias)):
-                with col:
-                    if st.button(
-                        pregunta,
-                        key=f"sug_{i}",
-                        help="Haz clic para usar esta pregunta",
-                        use_container_width=True
-                    ):
-                        st.session_state.pregunta_actual = pregunta
-
-            st.markdown("</div>", unsafe_allow_html=True)
+            st.subheader("💡 Preguntas sugeridas:")
+            for pregunta in st.session_state.sugerencias:
+                if st.button(pregunta):
+                    procesar_consulta(pregunta)
 
-        with st.form("consulta_form"):
-            col1, col2 = st.columns(…)
-            with col1:
-                pregunta_usuario = st.text_input(
-                    "Escribe tu pregunta:",
-                    value=st.session_state.get('pregunta_actual', ''),
-                    placeholder="Ej: ¿De qué trata este documento?",
-                    label_visibility="collapsed"
-                )
-            with col2:
-                st.markdown("<br>", unsafe_allow_html=True)
-                enviar = st.form_submit_button("Enviar ▶")
-
-            if enviar or st.session_state.pregunta_actual:
-                pregunta_final = pregunta_usuario or st.session_state.pregunta_actual
-                procesar_consulta(pregunta_final)
-                if 'pregunta_actual' in st.session_state:
-                    del st.session_state.pregunta_actual
+        with st.form("consulta_form"):
+            pregunta_usuario = st.text_input("Escribe tu pregunta:", placeholder="Ej: ¿Qué normativa regula este proceso?")
+            enviar = st.form_submit_button("Enviar ▶")
+            if enviar and pregunta_usuario:
+                procesar_consulta(pregunta_usuario)
 
-    elif not st.session_state.documento_cargado:
-        st.info("Por favor, sube un documento PDF para comenzar.")
 
 if __name__ == "__main__":
-    main()
+    main()
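One behavioural regression worth noting in the last hunk: the new one-line parser in `generar_sugerencias` indexes `split('. ', 1)[1]` unconditionally, so a model reply numbered like `1) ¿Qué trata el documento?` raises an IndexError, a case the deleted loop handled with its `else line[2:]` fallback. A sketch that restores the old guard (`parsear_preguntas` is a hypothetical helper name):

```python
def parsear_preguntas(contenido):
    # Mirrors the deleted loop: accept "1. ..." lines and fall back for
    # other numbered prefixes instead of assuming ". " is present.
    preguntas = []
    for line in contenido.split("\n"):
        line = line.strip()
        if line and line[0].isdigit():
            pregunta = line.split('. ', 1)[1] if '. ' in line else line[2:]
            if pregunta:
                preguntas.append(pregunta)
    return preguntas[:3]
```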
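Finally, `st.experimental_rerun()` was deprecated in newer Streamlit releases in favor of `st.rerun()`, so whether this call works depends on the version pinned in the Space's requirements. A version-tolerant sketch (an assumption, not part of the commit):

```python
import streamlit as st

# Prefer st.rerun (Streamlit >= 1.27) and fall back to the older API.
rerun = getattr(st, "rerun", None) or st.experimental_rerun
rerun()
```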