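"""LexAIcon: a Streamlit app for summarizing, translating, and explaining
legal documents.

Text generation uses a Qwen2-1.5B pipeline wrapped in LangChain; documents
are classified with a Spanish legal Longformer, and the "Explicar" mode
retrieves related Q&A context from a FAISS vector store. Launch with
`streamlit run app.py` (assuming this file is saved as app.py, the usual
Hugging Face Spaces convention).
"""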
import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.embeddings import HuggingFaceEmbeddings
from PyPDF2 import PdfReader
from docx import Document
import csv
import json
import torch
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from huggingface_hub import login
# Authenticate with Hugging Face
huggingface_token = st.secrets["HUGGINGFACE_TOKEN"]
login(huggingface_token)
# Configure the model and tokenizer
model_name = 'Qwen/Qwen2-1.5B'
model_config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
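# Some causal LM tokenizers ship without a pad token; fall back to EOS so
# padded batches work during generation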
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model_name,
    tokenizer=tokenizer,
    do_sample=True,  # temperature is ignored unless sampling is enabled
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)
prompt_template = """
### [INST]
Instruction: Answer the question based on your knowledge. Here is context to help:
{context}
### QUESTION:
{question}
[/INST]
"""
# Wrap the Hugging Face pipeline as a LangChain LLM
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
# Build the prompt from the template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
# Create the LLM chain
llm_chain = LLMChain(llm=llm, prompt=prompt)
# Extract plain text from an uploaded file
def handle_uploaded_file(uploaded_file):
    try:
        if uploaded_file.name.endswith(".txt"):
            text = uploaded_file.read().decode("utf-8")
        elif uploaded_file.name.endswith(".pdf"):
            reader = PdfReader(uploaded_file)
            # extract_text() can return None for pages without a text layer
            text = "".join(page.extract_text() or "" for page in reader.pages)
        elif uploaded_file.name.endswith(".docx"):
            doc = Document(uploaded_file)
            text = "\n".join(para.text for para in doc.paragraphs)
        elif uploaded_file.name.endswith(".csv"):
            content = uploaded_file.read().decode("utf-8").splitlines()
            reader = csv.reader(content)
            text = " ".join(" ".join(row) for row in reader)
        elif uploaded_file.name.endswith(".json"):
            data = json.load(uploaded_file)
            text = json.dumps(data, indent=4)
        else:
            text = "Tipo de archivo no soportado."
        return text
    except Exception as e:
        return str(e)
# Translate text into the target language via the LLM chain
def translate(text, target_language):
    context = ""
    question = f"Por favor, traduzca el siguiente documento al {target_language}:\n{text}\nAsegúrese de que la traducción sea precisa y conserve el significado original del documento."
    response = llm_chain.run(context=context, question=question)
    return response
# Summarize text at the requested length via the LLM chain
def summarize(text, length):
    context = ""
    question = f"Por favor, haga un resumen {length} del siguiente documento:\n{text}\nAsegúrese de que el resumen sea conciso y conserve el significado original del documento."
    response = llm_chain.run(context=context, question=question)
    return response
# Classification model setup (cached so it loads once per session)
@st.cache_resource
def load_classification_model():
    tokenizer_cls = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
    model_cls = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
    return model_cls, tokenizer_cls

classification_model, classification_tokenizer = load_classification_model()
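# Mapping from classifier output ids to document categories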
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
def classify_text(text):
    inputs = classification_tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
    classification_model.eval()
    with torch.no_grad():
        outputs = classification_model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax(dim=-1).item()
    predicted_label = id2label[predicted_class_id]
    return predicted_label
# Load the Q&A reference documents for a given category
def load_json_documents(category):
    try:
        with open(f"./{category}.json", "r", encoding="utf-8") as f:
            data = json.load(f)["questions_and_answers"]
        documents = [entry["question"] + " " + entry["answer"] for entry in data]
        return documents
    except FileNotFoundError:
        return []
# FAISS and embeddings setup (cached per docs list)
@st.cache_resource
def create_vector_store(docs):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    # split_text expects a single string, while docs is a list of Q&A strings,
    # so split each document and collect the chunks
    split_docs = []
    for doc in docs:
        split_docs.extend(text_splitter.split_text(doc))
    vector_store = FAISS.from_texts(split_docs, embeddings)
    return vector_store
# Answer a user question: classify the document, retrieve related Q&A
# context from the vector store, and query the LLM
def explain_text(user_input, document_text):
    classification = classify_text(document_text)
    if classification in ["multas", "politicas_de_privacidad", "contratos", "denuncias"]:
        docs = load_json_documents(classification)
        if docs:
            vector_store = create_vector_store(docs)
            search_docs = vector_store.similarity_search(user_input)
            context = " ".join([doc.page_content for doc in search_docs])
        else:
            context = ""
    else:
        context = ""
    question = user_input
    response = llm_chain.run(context=context, question=question)
    return response
def main():
    st.title("LexAIcon")
    st.write("Puedes conversar con este chatbot basado en Qwen2-1.5B y subir archivos para que el chatbot los procese.")
    with st.sidebar:
        st.caption("[Consigue un HuggingFace Token](https://huggingface.co/settings/tokens)")
        operation = st.radio("Selecciona una operación", ["Resumir", "Traducir", "Explicar"])
    if operation == "Explicar":
        user_input = st.text_area("Introduce tu pregunta:", "")
        uploaded_file = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"])
        if uploaded_file and user_input:
            document_text = handle_uploaded_file(uploaded_file)
            bot_response = explain_text(user_input, document_text)
            st.write(f"**Assistant:** {bot_response}")
    else:
        uploaded_file = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"])
        if uploaded_file:
            document_text = handle_uploaded_file(uploaded_file)
            if operation == "Traducir":
                target_language = st.selectbox("Selecciona el idioma de traducción", ["español", "inglés", "francés", "alemán"])
                if target_language:
                    bot_response = translate(document_text, target_language)
                    st.write(f"**Assistant:** {bot_response}")
            elif operation == "Resumir":
                summary_length = st.selectbox("Selecciona la longitud del resumen", ["corto", "medio", "largo"])
                if summary_length:
                    if summary_length == "corto":
                        length = "de aproximadamente 50 palabras"
                    elif summary_length == "medio":
                        length = "de aproximadamente 100 palabras"
                    else:  # largo
                        length = "de aproximadamente 500 palabras"
                    bot_response = summarize(document_text, length)
                    st.write(f"**Assistant:** {bot_response}")
if __name__ == "__main__":
    main()