manuelcozar55 committed
Commit d7ccb12 (verified)
Parent(s): 9d7d60a

Update app.py

Files changed (1)
app.py +17 -29
app.py CHANGED
@@ -1,4 +1,5 @@
 import streamlit as st
+from transformers import InferenceClient, AutoTokenizer, AutoModelForSequenceClassification
 from PyPDF2 import PdfReader
 from docx import Document
 import csv
@@ -8,8 +9,7 @@ import torch
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import FAISS
-from huggingface_hub import login, InferenceClient
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from huggingface_hub import login
 
 huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
 
@@ -36,32 +36,6 @@ classification_model, classification_tokenizer = load_classification_model()
 
 id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
 
-# Load JSON documents for each category
-@st.cache_resource
-def load_json_documents():
-    documents = {}
-    categories = ["multas", "politicas_de_privacidad", "contratos", "denuncias", "otros"]
-    for category in categories:
-        with open(f"./{category}.json", "r", encoding="utf-8") as f:
-            data = json.load(f)["questions_and_answers"]
-            documents[category] = [entry["question"] + " " + entry["answer"] for entry in data]
-    return documents
-
-json_documents = load_json_documents()
-
-# Embeddings and vector store configuration
-@st.cache_resource
-def create_vector_store():
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2", model_kwargs={"device": "cpu"})
-    vector_stores = {}
-    for category, docs in json_documents.items():
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
-        split_docs = text_splitter.split_text(docs)
-        vector_stores[category] = FAISS.from_texts(split_docs, embeddings)
-    return vector_stores
-
-vector_stores = create_vector_store()
-
 def classify_text(text):
     inputs = classification_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
     classification_model.eval()
@@ -72,6 +46,19 @@ def classify_text(text):
     predicted_label = id2label[predicted_class_id]
     return predicted_label
 
+def load_json_documents(category):
+    with open(f"./{category}.json", "r", encoding="utf-8") as f:
+        data = json.load(f)["questions_and_answers"]
+    documents = [entry["question"] + " " + entry["answer"] for entry in data]
+    return documents
+
+def create_vector_store(docs):
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-l6-v2", model_kwargs={"device": "cpu"})
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+    split_docs = text_splitter.split_text(docs)
+    vector_store = FAISS.from_texts(split_docs, embeddings)
+    return vector_store
+
 def translate(text, target_language):
     template = f'''
     Por favor, traduzca el siguiente documento al {target_language}:
@@ -157,7 +144,8 @@ def main():
     for uploaded_file in uploaded_files:
        file_content = handle_uploaded_file(uploaded_file)
        classification = classify_text(file_content)
-       vector_store = vector_stores[classification]
+       docs = load_json_documents(classification)
+       vector_store = create_vector_store(docs)
        search_docs = vector_store.similarity_search(user_input)
        context = " ".join([doc.page_content for doc in search_docs])
        prompt_with_context = f"Contexto: {context}\n\nPregunta: {user_input}"
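For orientation, here is a minimal, self-contained sketch of the retrieval path after this commit: the category predicted by the classifier selects a JSON file of question/answer pairs, which is loaded, split, embedded, and indexed on demand for each uploaded file. The function names, model, and splitter parameters mirror the diff; the one assumption is that each question/answer entry is split individually, since RecursiveCharacterTextSplitter.split_text takes a single string rather than a list, and the example category and query are placeholders.

# Sketch of the per-upload retrieval flow (assumptions noted above; not the full app.py).
import json

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

def load_json_documents(category):
    # Each ./{category}.json holds a "questions_and_answers" list.
    with open(f"./{category}.json", "r", encoding="utf-8") as f:
        data = json.load(f)["questions_and_answers"]
    return [entry["question"] + " " + entry["answer"] for entry in data]

def create_vector_store(docs):
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        model_kwargs={"device": "cpu"},
    )
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    # Assumption: split each Q&A string separately, then index every chunk.
    chunks = [chunk for doc in docs for chunk in splitter.split_text(doc)]
    return FAISS.from_texts(chunks, embeddings)

# Per uploaded file: the classification result picks the category,
# the matching JSON is indexed, and context is retrieved for the question.
docs = load_json_documents("contratos")  # placeholder for classify_text(...) output
vector_store = create_vector_store(docs)
search_docs = vector_store.similarity_search("plazos de rescisión del contrato")
context = " ".join(doc.page_content for doc in search_docs)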