Update app.py
app.py CHANGED
@@ -1,4 +1,5 @@
 import streamlit as st
+from transformers import InferenceClient, AutoTokenizer, AutoModelForSequenceClassification
 from PyPDF2 import PdfReader
 from docx import Document
 import csv
@@ -8,8 +9,7 @@ import torch
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
-from huggingface_hub import login
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from huggingface_hub import login
 
 huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
 
@@ -36,32 +36,6 @@ classification_model, classification_tokenizer = load_classification_model()
 
 id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
 
-# Cargar documentos JSON para cada categoría
-@st.cache_resource
-def load_json_documents():
-    documents = {}
-    categories = ["multas", "politicas_de_privacidad", "contratos", "denuncias", "otros"]
-    for category in categories:
-        with open(f"./{category}.json", "r", encoding="utf-8") as f:
-            data = json.load(f)["questions_and_answers"]
-            documents[category] = [entry["question"] + " " + entry["answer"] for entry in data]
-    return documents
-
-json_documents = load_json_documents()
-
-# Configuración de Embeddings y Vector Stores
-@st.cache_resource
-def create_vector_store():
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2", model_kwargs={"device": "cpu"})
-    vector_stores = {}
-    for category, docs in json_documents.items():
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
-        split_docs = text_splitter.split_text(docs)
-        vector_stores[category] = FAISS.from_texts(split_docs, embeddings)
-    return vector_stores
-
-vector_stores = create_vector_store()
-
 def classify_text(text):
     inputs = classification_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
     classification_model.eval()
@@ -72,6 +46,19 @@ def classify_text(text):
     predicted_label = id2label[predicted_class_id]
     return predicted_label
 
+def load_json_documents(category):
+    with open(f"./{category}.json", "r", encoding="utf-8") as f:
+        data = json.load(f)["questions_and_answers"]
+    documents = [entry["question"] + " " + entry["answer"] for entry in data]
+    return documents
+
+def create_vector_store(docs):
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2", model_kwargs={"device": "cpu"})
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    split_docs = text_splitter.split_text(docs)
+    vector_store = FAISS.from_texts(split_docs, embeddings)
+    return vector_store
+
 def translate(text, target_language):
     template = f'''
     Por favor, traduzca el siguiente documento al {target_language}:
@@ -157,7 +144,8 @@ def main():
     for uploaded_file in uploaded_files:
         file_content = handle_uploaded_file(uploaded_file)
        classification = classify_text(file_content)
-
+        docs = load_json_documents(classification)
+        vector_store = create_vector_store(docs)
        search_docs = vector_store.similarity_search(user_input)
        context = " ".join([doc.page_content for doc in search_docs])
        prompt_with_context = f"Contexto: {context}\n\nPregunta: {user_input}"
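For reference, here is a minimal sketch of the per-category JSON file that the new `load_json_documents(category)` helper reads. The file name pattern and the `questions_and_answers` / `question` / `answer` keys come from the diff above; the sample entry itself is invented for illustration.

```python
import json

# Invented sample content; the real ./multas.json shipped with the Space will differ.
sample = {
    "questions_and_answers": [
        {"question": "Example question about a traffic fine", "answer": "Example answer."},
    ]
}
with open("./multas.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False)

# Mirrors what load_json_documents("multas") does: one string per Q&A pair.
with open("./multas.json", "r", encoding="utf-8") as f:
    data = json.load(f)["questions_and_answers"]
documents = [entry["question"] + " " + entry["answer"] for entry in data]
print(documents)  # ['Example question about a traffic fine Example answer.']
```

Each category listed in `id2label` is expected to have a matching `<category>.json` file next to app.py.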
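The removed module-level code cached every category index with `@st.cache_resource`, while the new flow rebuilds a FAISS index inside the upload loop. If the old caching behaviour is wanted together with the new per-category helpers, one possible arrangement is a cached wrapper keyed by the category string. This is only a sketch, not part of the commit; it assumes it would live in app.py next to the two helpers added above, and the wrapper name `get_vector_store` is hypothetical.

```python
@st.cache_resource
def get_vector_store(category):
    # Cached once per category string, so the JSON file is embedded
    # and indexed only the first time that category is requested.
    docs = load_json_documents(category)
    return create_vector_store(docs)
```

The loop in main() would then call `vector_store = get_vector_store(classification)` instead of building the store inline for every uploaded file.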