Update app.py
app.py CHANGED

@@ -12,11 +12,7 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 import os
 
-
-
-# Realizar el inicio de sesión de Hugging Face solo si el token está disponible
-if huggingface_token:
-    login(token=huggingface_token)
+login(token=os.getenv('HUGGINGFACE_TOKEN'))
 
 # Configuración del modelo LLM
 llm = HuggingFaceEndpoint(
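
The new hunk logs in with a token read from the HUGGINGFACE_TOKEN environment variable. A minimal standalone sketch of that pattern; the guard around a missing variable is added here for illustration only and is not part of the commit:

import os
from huggingface_hub import login

# Read the access token from the environment (the same variable the commit uses).
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')

# Illustrative guard (not in the commit): skip login() when the variable is unset,
# so the script can still start against public models.
if huggingface_token:
    login(token=huggingface_token)
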
@@ -34,14 +30,6 @@ model = AutoModelForSequenceClassification.from_pretrained("mrm8488/legal-longfo
 
 id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}
 
-def read_file(file):
-    file_path = file.name
-    if file_path.endswith('.pdf'):
-        return read_pdf(file_path)
-    else:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read()
-
 def read_pdf(file_path):
     pdf_reader = PyPDF2.PdfReader(file_path)
     text = ""
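
read_pdf relies on PyPDF2's PdfReader; the page loop itself sits outside the hunks shown. A brief sketch of the same extraction pattern, written as a hypothetical extract_pdf_text helper for clarity:

import PyPDF2

def extract_pdf_text(file_path):
    # Hypothetical helper: iterate over every page and concatenate whatever text
    # PyPDF2 can recover, guarding against pages that return None.
    pdf_reader = PyPDF2.PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
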
@@ -49,20 +37,21 @@ def read_pdf(file_path):
         text += pdf_reader.pages[page].extract_text()
     return text
 
-def summarize(
-
-
-
-
+def summarize(file):
+    # Leer el contenido del archivo subido
+    file_path = file.name
+    if file_path.endswith('.pdf'):
+        text = read_pdf(file_path)
     else:
-
-
-
+        with open(file_path, 'r', encoding='utf-8') as f:
+            text = f.read()
+
+    template = '''
 Por favor, lea detenidamente el siguiente documento:
 <document>
-{
+{TEXT}
 </document>
-Después de leer el documento, identifique los puntos clave y las ideas principales cubiertas en el texto.
+Después de leer el documento, identifique los puntos clave y las ideas principales cubiertas en el texto. Organice estos puntos clave en una lista con viñetas concisa que resuma la información esencial del documento. El resumen debe tener un máximo de 10 puntos.
 Su objetivo es ser exhaustivo en la captura del contenido central del documento, mientras que también es conciso en la expresión de cada punto del resumen. Omita los detalles menores y concéntrese en los temas centrales y hechos importantes.
 '''
 
@@ -74,7 +63,7 @@ Su objetivo es ser exhaustivo en la captura del contenido central del documento,
     formatted_prompt = prompt.format(TEXT=text)
     output_summary = llm_engine_hf.invoke(formatted_prompt)
 
-    return output_summary.content
+    return f"Prompt:\n{formatted_prompt}\n\nResumen:\n{output_summary.content}"
 
 def classify_text(text):
     inputs = tokenizer(text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length")
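
The summarize() hunks format a {TEXT} placeholder via prompt.format(TEXT=text) and then call llm_engine_hf.invoke(...), but neither prompt nor llm_engine_hf is defined inside the hunks shown. A minimal sketch of one plausible wiring, assuming LangChain's PromptTemplate plus a ChatHuggingFace wrapper around the HuggingFaceEndpoint configured earlier; the repo_id and the exact imports are assumptions, not taken from the diff:

from langchain_core.prompts import PromptTemplate
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Assumed setup (not visible in the diff): a chat wrapper whose .invoke() returns
# a message object with a .content attribute, as summarize()/translate() expect.
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2",  # hypothetical model
                          task="text-generation")
llm_engine_hf = ChatHuggingFace(llm=llm)

# Same placeholder convention as the template in summarize().
template = "Por favor, resuma el siguiente documento:\n<document>\n{TEXT}\n</document>"
prompt = PromptTemplate(template=template, input_variables=["TEXT"])

formatted_prompt = prompt.format(TEXT="Texto de ejemplo.")
output_summary = llm_engine_hf.invoke(formatted_prompt)  # returns an AIMessage
print(output_summary.content)
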
@@ -84,9 +73,17 @@ def classify_text(text):
     logits = outputs.logits
     predicted_class_id = logits.argmax(dim=-1).item()
     predicted_label = id2label[predicted_class_id]
-    return
+    return predicted_label
+
+def translate(file, target_language):
+    # Leer el contenido del archivo subido
+    file_path = file.name
+    if file_path.endswith('.pdf'):
+        text = read_pdf(file_path)
+    else:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            text = f.read()
 
-def translate(text, target_language):
     template = '''
 Por favor, traduzca el siguiente documento al {LANGUAGE}:
 <document>
@@ -105,17 +102,35 @@ Asegúrese de que la traducción sea precisa y conserve el significado original
 
     return f"Prompt:\n{formatted_prompt}\n\nTraducción:\n{translated_text.content}"
 
-def process_file(file, action, target_language=None
-    text = read_file(file)
+def process_file(file, action, target_language=None):
     if action == "Resumen":
-        return summarize(
+        return summarize(file)
     elif action == "Clasificar":
+        file_path = file.name
+        if file_path.endswith('.pdf'):
+            text = read_pdf(file_path)
+        else:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                text = f.read()
         return classify_text(text)
     elif action == "Traducir":
-        return translate(
+        return translate(file, target_language)
     else:
         return "Acción no válida"
 
+def download_text(output_text, filename='output.txt'):
+    if output_text:
+        file_path = Path(filename)
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(output_text)
+        return file_path
+    else:
+        return None
+
+def create_download_file(output_text, filename='output.txt'):
+    file_path = download_text(output_text, filename)
+    return str(file_path) if file_path else None
+
 # Crear la interfaz de Gradio
 with gr.Blocks() as demo:
     gr.Markdown("## Procesador de Documentos")
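
process_file() dispatches on the action string and only ever touches file.name on the uploaded object, so it can be exercised outside the Gradio UI with any object exposing a .name path. A small illustrative sketch, assuming app.py's functions are in scope; the SimpleNamespace stand-in and the sample file are hypothetical:

from types import SimpleNamespace

# Hypothetical stand-in for the Gradio upload object: process_file only reads .name.
uploaded = SimpleNamespace(name="contrato_ejemplo.txt")  # assumes this file exists locally

print(process_file(uploaded, "Clasificar"))      # -> one of the id2label labels
print(process_file(uploaded, "Resumen"))         # -> prompt plus generated summary
print(process_file(uploaded, "Traducir", "en"))  # -> prompt plus translation
print(process_file(uploaded, "Otra"))            # -> "Acción no válida"
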
@@ -124,26 +139,34 @@ with gr.Blocks() as demo:
     with gr.Column():
         file = gr.File(label="Subir un archivo")
         action = gr.Radio(label="Seleccione una acción", choices=["Resumen", "Clasificar", "Traducir"])
-        summary_length = gr.Radio(label="Seleccione la longitud del resumen", choices=["Corto", "Medio", "Largo"], visible=False)
         target_language = gr.Dropdown(label="Seleccionar idioma de traducción", choices=["en", "fr", "de"], visible=False)
 
     with gr.Column():
-        output_text = gr.Textbox(label="Resultado", lines=
+        output_text = gr.Textbox(label="Resultado", lines=20)
 
-    def
+    def update_language_dropdown(action):
         if action == "Traducir":
-            return gr.update(visible=
-        elif action == "Resumen":
-            return gr.update(visible=True), gr.update(visible=False)
-        elif action == "Clasificar":
-            return gr.update(visible=False), gr.update(visible(False))
+            return gr.update(visible=True)
         else:
-            return gr.update(visible=False)
+            return gr.update(visible=False)
 
-    action.change(
+    action.change(update_language_dropdown, inputs=action, outputs=target_language)
 
     submit_button = gr.Button("Procesar")
-    submit_button.click(process_file, inputs=[file, action, target_language
-
+    submit_button.click(process_file, inputs=[file, action, target_language], outputs=output_text)
+
+    def generate_file():
+        summary_text = output_text.value
+        filename = 'translation.txt' if action.value == 'Traducir' else 'summary.txt'
+        file_path = download_text(summary_text, filename)
+        return file_path
+
+    download_button = gr.Button("Descargar Resultado")
+    download_button.click(
+        fn=generate_file,
+        inputs=[],
+        outputs=gr.File()
+    )
+
 # Ejecutar la aplicación Gradio
-demo.launch(share=True)
+demo.launch(share=True)
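
The download wiring returns a file path from generate_file() and binds it to a gr.File output. A self-contained sketch of that pattern; as a variant, the textbox is passed as an explicit input so its current value reaches the callback (all names here are illustrative, not from the commit):

import gradio as gr
from pathlib import Path

def save_result(text):
    # Write the textbox content to disk and return the path for a gr.File output.
    path = Path("resultado.txt")
    path.write_text(text or "", encoding="utf-8")
    return str(path)

with gr.Blocks() as sketch:
    box = gr.Textbox(label="Resultado")
    btn = gr.Button("Descargar Resultado")
    out = gr.File(label="Archivo generado")
    btn.click(fn=save_result, inputs=box, outputs=out)

# sketch.launch()  # launch left commented out in this sketch
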