Commit
·
1b4e9c9
1
Parent(s):
1f4cd81
Upload 2 files
Browse files- app.py +225 -0
- requirements.txt +11 -0
app.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import openai
|
3 |
+
import re
|
4 |
+
from os.path import splitext, exists
|
5 |
+
import nltk
|
6 |
+
from nltk.tokenize import word_tokenize
|
7 |
+
import gradio as gr
|
8 |
+
import backoff
|
9 |
+
import markdown
|
10 |
+
from docx import Document
|
11 |
+
from io import StringIO
|
12 |
+
from datetime import datetime
|
13 |
+
import tempfile
|
# Download the tokenizer data used by nltk.word_tokenize below.
nltk.download('punkt')

# SECURITY: an OpenAI API key was previously hard-coded here. A key committed
# to source control is leaked and must be revoked immediately. Read the key
# from the environment instead (export OPENAI_API_KEY before launching).
openai.api_key = os.getenv("OPENAI_API_KEY")
20 |
+
|
21 |
+
|
def clean_webvtt(filepath: str) -> str:
    """Clean up the content of a subtitle file (vtt) to a string.

    Strips the WEBVTT header, cue indexes, cue-identifier lines and
    timestamp lines, then joins the remaining caption text into one
    normalized string.

    Args:
        filepath (str): path to vtt file

    Returns:
        str: clean content
    """
    # read file content
    with open(filepath, "r", encoding="utf-8") as fp:
        content = fp.read()

    # remove header & empty lines; guard against an empty file before
    # peeking at lines[0] (the original raised IndexError here)
    lines = [line.strip() for line in content.split("\n") if line.strip()]
    if lines and lines[0].upper() == "WEBVTT":
        lines = lines[1:]

    # remove cue indexes (lines that are just a number)
    lines = [line for line in lines if not line.isdigit()]

    # remove cue identifiers (Teams-style "<uuid>/<n>-<n>" lines)
    cue_id = r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d'
    lines = [line for line in lines if not re.match(cue_id, line)]

    # remove timestamp lines ("HH:MM:SS.mmm --> HH:MM:SS.mmm");
    # the dots are escaped so '.' cannot match arbitrary characters
    timestamp = r"^\d{2}:\d{2}:\d{2}\.\d{3}.*\d{2}:\d{2}:\d{2}\.\d{3}$"
    lines = [line for line in lines if not re.match(timestamp, line)]

    content = " ".join(lines)

    # collapse runs of whitespace into a single space
    content = re.sub(r"\s+", " ", content)

    # add a space after punctuation marks if it doesn't exist
    content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)

    return content
64 |
+
|
65 |
+
|
def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
    """Save clean content of a subtitle file to a text file.

    Args:
        file_in (str): path to vtt file
        file_out (None, optional): path to text file
        **kwargs (optional): arguments for other parameters
            - no_message (bool): do not show message of result.
              Default is False

    Returns:
        str: path to text file
    """
    no_message = kwargs.get("no_message", False)

    # Derive an output name from the input when none was given, appending a
    # numeric suffix until the name does not collide with an existing file.
    if not file_out:
        base = splitext(file_in)[0]
        file_out = "%s.txt" % base
        suffix = 0
        while exists(file_out):
            suffix += 1
            file_out = "%s_%s.txt" % (base, suffix)

    cleaned = clean_webvtt(file_in)
    with open(file_out, "w+", encoding="utf-8") as fp:
        fp.write(cleaned)
    if not no_message:
        print("clean content is written to file: %s" % file_out)

    return file_out
96 |
+
|
97 |
+
|
def get_summary(filepath):
    """Clean a .vtt transcript and write it to a sibling .txt file.

    Thin wrapper around vtt_to_clean_file kept for API compatibility.
    (The original `filepath = filepath` self-assignment was a no-op and
    has been removed.)

    Args:
        filepath (str): path to vtt file
    """
    vtt_to_clean_file(filepath)
101 |
+
|
102 |
+
|
def count_tokens(filename):
    """Return the number of NLTK word tokens in a text file.

    Args:
        filename (str): path to a UTF-8 text file

    Returns:
        int: token count
    """
    # Explicit encoding for consistency with clean_webvtt; the platform
    # default (e.g. cp1252 on Windows) can fail on UTF-8 transcripts.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return len(tokens)
108 |
+
|
109 |
+
|
def break_up_file(tokens, chunk_size, overlap_size):
    """Yield overlapping chunks of a token list.

    Each chunk holds at most ``chunk_size`` tokens; consecutive chunks
    share ``overlap_size`` tokens so no context is lost at the boundaries.

    Args:
        tokens (list): token sequence to split
        chunk_size (int): maximum tokens per chunk
        overlap_size (int): tokens shared between consecutive chunks

    Yields:
        list: the next chunk of tokens

    Raises:
        ValueError: if overlap_size >= chunk_size (would never terminate)
    """
    # Iterative rewrite of the original recursive generator: delegating via
    # `yield from` once per chunk could exhaust the interpreter's recursion
    # limit on very long transcripts.
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError("overlap_size must be smaller than chunk_size")
    start = 0
    while True:
        remaining = tokens[start:]
        if len(remaining) <= chunk_size:
            yield remaining
            return
        yield remaining[:chunk_size]
        start += step
117 |
+
|
118 |
+
|
def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100):
    """Read a text file and split its tokens into overlapping chunks.

    Args:
        filename (str): path to a UTF-8 text file
        chunk_size (int, optional): maximum tokens per chunk. Default 4000.
        overlap_size (int, optional): tokens shared between consecutive
            chunks. Default 100.

    Returns:
        list: list of token-list chunks
    """
    # Explicit encoding for consistency with clean_webvtt / count_tokens.
    with open(filename, 'r', encoding="utf-8") as f:
        text = f.read()
    tokens = word_tokenize(text)
    return list(break_up_file(tokens, chunk_size, overlap_size))
124 |
+
|
125 |
+
|
def convert_to_prompt_text(tokenized_text):
    """Join a token list back into plain prompt text.

    Rejoins tokens with single spaces and re-attaches the possessive
    "'s" that word tokenization splits off.
    """
    joined = " ".join(tokenized_text)
    return joined.replace(" 's", "'s")
130 |
+
|
131 |
+
|
def markdown_to_docx(md_text, output_file):
    """Render Markdown text into a .docx file, one paragraph per <p> block.

    NOTE(review): only <p> content survives the conversion; other HTML
    elements produced by the Markdown renderer (headings, lists) are dropped.
    """
    # Markdown -> HTML first; paragraph bodies are then scraped out of it.
    html_text = markdown.markdown(md_text)

    document = Document()
    for fragment in html_text.split('</p>'):
        if '<p>' not in fragment:
            continue
        paragraph = fragment.replace('<p>', '').strip()
        if paragraph:
            document.add_paragraph(paragraph)

    # Save the document to the specified file
    document.save(output_file)
148 |
+
|
149 |
+
|
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
@backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
def summarize_meeting(filepath):
    """Summarize a meeting transcript file with the OpenAI chat API.

    The transcript is split into overlapping ~4000-token chunks, each chunk
    is summarized independently, and the partial summaries are then
    consolidated into a single markdown summary with action items.

    Args:
        filepath (str): path to a cleaned transcript text file

    Returns:
        str: markdown-formatted meeting summary
    """
    # Break the transcript into chunks small enough for the model context.
    # (The original also computed an unused token_count here.)
    chunks = break_up_file_to_chunks(filepath)

    # Summarize each chunk independently.
    chunk_summaries = []
    for chunk in chunks:
        prompt_request = convert_to_prompt_text(chunk)

        messages = [
            {"role": "system", "content": "Summarize this meeting transcript in the same language as the user's input."},
            {"role": "user", "content": prompt_request},
        ]

        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=.4,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        chunk_summaries.append(
            response["choices"][0]["message"]['content'].strip())

    # Consolidate the per-chunk summaries. Join them as plain text rather
    # than str(list) so the model does not see Python list repr syntax.
    prompt_request = "Consolidate these meeting summaries: " + \
        "\n\n".join(chunk_summaries)

    # Produce the final consolidated summary in markdown.
    messages = [{"role": "system", "content": "Summarize the text of the meeting transcripts. The output format should be markdown in the same language as the user's input. Start with a brief summary of the meeting, continue with bullets outlining the most important points of discussion. Finally, provide a list of action items with a due date from the provided meeting transcript text."}]
    messages.append({"role": "user", "content": prompt_request})
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        temperature=.4,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    return response["choices"][0]["message"]['content'].strip()
203 |
+
|
204 |
+
|
def summarize_meeting_vtt(file):
    """Gradio handler: summarize an uploaded transcript file.

    Args:
        file: gradio file object exposing the upload's path via ``.name``

    Returns:
        str: markdown-formatted meeting summary
    """
    return summarize_meeting(file.name)
210 |
+
|
211 |
+
|
# Gradio UI: a single file upload in, a markdown meeting summary out.
demo = gr.Interface(
    fn=summarize_meeting_vtt,
    # input: the .vtt subtitle file exported from the meeting recording
    inputs=gr.File(label="Archivo .vtt"),
    # output: the consolidated summary rendered as markdown
    outputs=[
        gr.Markdown(label="Resumen de la reunión")
    ],
    title="Hexagon Data - Resumen de reuniones con I.A.",
    description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión.")


# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
python-docx
nltk
openai
markdown
backoff
gradio