andreinigo committed on
Commit 1b4e9c9 · 1 Parent(s): 1f4cd81

Upload 2 files

Files changed (2)
  1. app.py +222 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,222 @@
+ import os
+ import openai
+ import re
+ from os.path import splitext, exists
+ import nltk
+ from nltk.tokenize import word_tokenize
+ import gradio as gr
+ import backoff
+ import markdown
+ from docx import Document
+ from io import StringIO
+ from datetime import datetime
+ import tempfile
+
+
+ nltk.download('punkt')
+
+ # Read the API key from the environment; never hardcode secrets in source.
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+ def clean_webvtt(filepath: str) -> str:
+     """Clean up the content of a subtitle file (vtt) to a string.
+
+     Args:
+         filepath (str): path to vtt file
+
+     Returns:
+         str: clean content
+     """
+     # read file content
+     with open(filepath, "r", encoding="utf-8") as fp:
+         content = fp.read()
+
+     # remove header & empty lines
+     lines = [line.strip() for line in content.split("\n") if line.strip()]
+     lines = lines[1:] if lines[0].upper() == "WEBVTT" else lines
+
+     # remove indexes
+     lines = [line for line in lines if not line.isdigit()]
+
+     # remove cue identifiers (UUID/sequence markers)
+     pattern = r'[a-f\d]{8}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{4}-[a-f\d]{12}\/\d+-\d'
+     lines = [line for line in lines if not re.match(pattern, line)]
+
+     # remove timestamps
+     pattern = r"^\d{2}:\d{2}:\d{2}\.\d{3}.*\d{2}:\d{2}:\d{2}\.\d{3}$"
+     lines = [line for line in lines if not re.match(pattern, line)]
+
+     content = " ".join(lines)
+
+     # collapse duplicate whitespace
+     content = re.sub(r"\s+", " ", content)
+
+     # add a space after punctuation marks if it is missing
+     content = re.sub(r"([\.!?])(\w)", r"\1 \2", content)
+
+     return content
+
+
+ def vtt_to_clean_file(file_in: str, file_out=None, **kwargs) -> str:
+     """Save clean content of a subtitle file to a text file.
+
+     Args:
+         file_in (str): path to vtt file
+         file_out (None, optional): path to text file
+         **kwargs (optional): arguments for other parameters
+             - no_message (bool): do not show message of result.
+               Default is False
+
+     Returns:
+         str: path to text file
+     """
+     # set default values
+     no_message = kwargs.get("no_message", False)
+     if not file_out:
+         filename = splitext(file_in)[0]
+         file_out = "%s.txt" % filename
+         i = 0
+         while exists(file_out):
+             i += 1
+             file_out = "%s_%s.txt" % (filename, i)
+
+     content = clean_webvtt(file_in)
+     with open(file_out, "w+", encoding="utf-8") as fp:
+         fp.write(content)
+     if not no_message:
+         print("clean content is written to file: %s" % file_out)
+
+     return file_out
+
+
+ def get_summary(filepath):
+     # Clean the transcript and write it next to the input file.
+     return vtt_to_clean_file(filepath)
+
+
+ def count_tokens(filename):
+     with open(filename, 'r') as f:
+         text = f.read()
+     tokens = word_tokenize(text)
+     return len(tokens)
+
+
+ def break_up_file(tokens, chunk_size, overlap_size):
+     # Yield chunks of at most chunk_size tokens; consecutive chunks
+     # share overlap_size tokens so no context is lost at the boundary.
+     if len(tokens) <= chunk_size:
+         yield tokens
+     else:
+         yield tokens[:chunk_size]
+         yield from break_up_file(tokens[chunk_size - overlap_size:], chunk_size, overlap_size)
+
+
+ def break_up_file_to_chunks(filename, chunk_size=4000, overlap_size=100):
+     with open(filename, 'r') as f:
+         text = f.read()
+     tokens = word_tokenize(text)
+     return list(break_up_file(tokens, chunk_size, overlap_size))
+
+
+ def convert_to_prompt_text(tokenized_text):
+     prompt_text = " ".join(tokenized_text)
+     prompt_text = prompt_text.replace(" 's", "'s")
+     return prompt_text
+
+
+ def markdown_to_docx(md_text, output_file):
+     # Convert the Markdown text to HTML
+     html_text = markdown.markdown(md_text)
+
+     # Create a new Document object
+     doc = Document()
+
+     # Parse the HTML and add its content to the .docx document
+     for p in html_text.split('</p>'):
+         if '<p>' in p:
+             clean_p = p.replace('<p>', '').strip()
+             if clean_p:
+                 doc.add_paragraph(clean_p)
+
+     # Save the document to the specified file
+     doc.save(output_file)
+
+
+ @backoff.on_exception(backoff.expo, openai.error.RateLimitError)
+ @backoff.on_exception(backoff.expo, openai.error.APIConnectionError)
+ def summarize_meeting(filepath):
+     prompt_response = []
+     # Break the text of the meeting transcript into chunks of 4000 tokens.
+     chunks = break_up_file_to_chunks(filepath)
+     # Summarize each chunk.
+     for chunk in chunks:
+         prompt_request = convert_to_prompt_text(chunk)
+
+         messages = [
+             {"role": "system", "content": "Summarize this meeting transcript in the same language as the user's input."},
+             {"role": "user", "content": prompt_request},
+         ]
+
+         response = openai.ChatCompletion.create(
+             model="gpt-4",
+             messages=messages,
+             temperature=.4,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0
+         )
+
+         prompt_response.append(
+             response["choices"][0]["message"]['content'].strip())
+
+     # Consolidate these meeting summaries.
+     prompt_request = "Consolidate these meeting summaries: " + \
+         str(prompt_response)
+
+     # Summarize the text of the meeting transcripts.
+     messages = [
+         {"role": "system", "content": "Summarize the text of the meeting transcripts. The output format should be markdown in the same language as the user's input. Start with a brief summary of the meeting, continue with bullets outlining the most important points of discussion. Finally, provide a list of action items with a due date from the provided meeting transcript text."},
+         {"role": "user", "content": prompt_request},
+     ]
+     response = openai.ChatCompletion.create(
+         model="gpt-4",
+         messages=messages,
+         temperature=.4,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+
+     summary_text = response["choices"][0]["message"]['content'].strip()
+
+     # Optionally convert the summary to a .docx file named
+     # "Resumen-Minuta-<download-date>.docx":
+     # outfilepath = "Resumen-Minuta-" + datetime.now().strftime("%d-%m-%Y-%H-%M") + ".docx"
+     # markdown_to_docx(summary_text, outfilepath)
+
+     return summary_text
+
+
+ def summarize_meeting_vtt(file):
+     # Gradio hands the upload over as a temporary file on disk.
+     temp_file_path = file.name
+     summary_text = summarize_meeting(temp_file_path)
+
+     return summary_text
+
+
+ demo = gr.Interface(
+     fn=summarize_meeting_vtt,
+     # input
+     inputs=gr.File(label="Archivo .vtt"),
+     # output
+     outputs=[
+         gr.Markdown(label="Resumen de la reunión")
+     ],
+     title="Hexagon Data - Resumen de reuniones con I.A.",
+     description="Descarga la transcripción de la reunión en formato .vtt y carga el archivo aquí para obtener el resumen de la reunión.")
+
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ python-docx
+ nltk
+ openai
+ markdown
+ backoff
+ gradio
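
Usage note: a minimal sketch (not part of the commit) of calling the summarizer outside the Gradio UI, assuming the file above is saved as app.py, the requirements.txt dependencies are installed, and OPENAI_API_KEY is exported in the environment; "meeting.vtt" is a hypothetical local transcript:

    # Summarize a local transcript without launching the web UI.
    # Importing app runs the module-level setup (nltk.download, API key)
    # but does not start the server thanks to the __main__ guard.
    from app import summarize_meeting

    print(summarize_meeting("meeting.vtt"))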