camparchimedes committed
Commit abc89d1 · verified · 1 Parent(s): 8723cb5

Update app.py

Files changed (1)
  1. app.py +179 -76
app.py CHANGED
@@ -1,6 +1,8 @@
- # app.py
- # Version: 1.07a (08.27.24), ALPHA
- #---------------------------------------------------------------------------------------------------------------------------------------------
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
@@ -12,11 +14,12 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- #---------------------------------------------------------------------------------------------------------------------------------------------
  import spaces
  import gradio as gr
  from PIL import Image
- from pydub import AudioSegment
  #from scipy.io import wavfile

  import os
@@ -24,6 +27,8 @@ import re
  import time
  import warnings
  #import datetime
  import subprocess
  from pathlib import Path
  import tempfile
@@ -31,8 +36,7 @@ from fpdf import FPDF

  import psutil
  from gpuinfo import GPUInfo
- #import pandas as pd
- #import csv
  import numpy as np
  import torch
  import torchaudio
@@ -58,38 +62,28 @@ SIDEBAR_INFO = f"""
  </div>
  """

- # ------------transcribe section------------
-

  pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, generate_kwargs={'task': 'transcribe', 'language': 'no'})

  @spaces.GPU()
- def transcribe_audio(audio, batch_size=16):
-
-     with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
-         # --copy contents of uploaded audio file to temporary file
-         temp_audio_file.write(open(audio, "rb").read())
-         temp_audio_file.flush()
-         # --use torchaudio to load it
-         waveform, sample_rate = torchaudio.load(temp_audio_file.name)
-         # --resample to 16kHz
-         resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-         waveform = resampler(waveform)
-
-         # --convert to mono
-         if waveform.ndim > 1:
-             waveform = waveform[0, :]
-         # Convert tensor@ndarray
-         waveform = waveform.numpy()

      start_time = time.time()
-
-     # --pipe it
-     with torch.no_grad():
-         outputs = pipe(waveform, sampling_rate=sample_rate, batch_size=batch_size, return_timestamps=False)

      end_time = time.time()
-
      output_time = end_time - start_time
      word_count = len(text.split())

@@ -111,49 +105,58 @@ def transcribe_audio(audio, batch_size=16):
  *CPU Usage: {cpu_usage}%*
  """

-     return text.strip(), system_info


- # ------------summaries section------------

- # [------------for app integration later------------]

  @spaces.GPU()
- # --btw, who is doing this...?
  def clean_text(text):
      text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
      text = re.sub(r'[^\w\s]', '', text)
      text = re.sub(r'\s+', ' ', text).strip()
      return text

- # --SpaCy params
- nlp = spacy.blank("nb") # ---==> codename ("norsk bokmål")
- nlp.add_pipe('sentencizer')
- spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS

- # --model (has tokenizer?)
- summarization_model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
- # pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large") -----hm..

- # --process text with SpaCy
- @spaces.GPU()
- def preprocess_text(text):
-     doc = nlp(text)
      stop_words = spacy_stop_words
      words = [token.text for token in doc if token.text.lower() not in stop_words]
      return ' '.join(words)

- # --model is called to summarize (need to be placed *after* the three styles and call them)
  @spaces.GPU()
- def summarize_text(text):
      preprocessed_text = preprocess_text(text)
      inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
      inputs = inputs.to(device)
      summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
      return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-
  @spaces.GPU()
- def build_similarity_matrix(sentences, stop_words):
      similarity_matrix = nx.Graph()
      for i, tokens_a in enumerate(sentences):
          for j, tokens_b in enumerate(sentences):
@@ -161,33 +164,38 @@ def build_similarity_matrix(sentences, stop_words):
              common_words = set(tokens_a) & set(tokens_b)
              similarity_matrix.add_edge(i, j, weight=len(common_words))
      return similarity_matrix
- # [------------model needs to be called for these------------]

-
- # --PageRank
  @spaces.GPU()
- def graph_based_summary(text, num_paragraphs=3):
-     doc = nlp(text)
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
-         return ' '.join(sentences)

      sentence_tokens = [nlp(sent) for sent in sentences]
      stop_words = spacy_stop_words
      filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens]
-     similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)

      scores = nx.pagerank(similarity_matrix)
      ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
      return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])

- # --LexRank
  @spaces.GPU()
- def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
-     doc = nlp(text)
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
-         return ' '.join(sentences) # Adjusted to return a single string

      stop_words = spacy_stop_words
      vectorizer = TfidfVectorizer(stop_words=list(stop_words))
@@ -201,13 +209,18 @@ def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
      return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

- # --TextRank
  @spaces.GPU()
- def text_rank_summary(text, num_paragraphs=3):
-     doc = nlp(text)
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
-         return ' '.join(sentences)

      stop_words = spacy_stop_words
      vectorizer = TfidfVectorizer(stop_words=list(stop_words))
@@ -219,21 +232,111 @@ def text_rank_summary(text, num_paragraphs=3):
      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
      return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

-

- # ------------interface section------------

  iface = gr.Blocks()
  with iface:
      gr.HTML(SIDEBAR_INFO)
      gr.Markdown(HEADER_INFO)
-
-     audio_input = gr.Audio(label="Upload Audio File")
-     transcribed_text = gr.Textbox(label="Transcribed Text")
-     system_info = gr.Textbox(label="System Info")
-
-     transcribe_button = gr.Button("Transcribe")
-     transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=[transcribed_text, system_info])
-
- iface.launch(share=True, debug=True)
+ ### -----------------------------------------------------------------------
+ ### (FULL, Revised) version_1.07ALPHA_app.py
+ ### -----------------------------------------------------------------------
+
+ # -------------------------------------------------------------------------
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at

  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ # -------------------------------------------------------------------------
+
  import spaces
  import gradio as gr
  from PIL import Image
+ #from pydub import AudioSegment
  #from scipy.io import wavfile

  import os

  import time
  import warnings
  #import datetime
+ #import pandas as pd
+ #import csv
  import subprocess
  from pathlib import Path
  import tempfile

  import psutil
  from gpuinfo import GPUInfo
+
  import numpy as np
  import torch
  import torchaudio

  </div>
  """

+ # ------------transcribe section------------

  pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, generate_kwargs={'task': 'transcribe', 'language': 'no'})

  @spaces.GPU()
+ def transcribe(microphone, file_upload, batch_size=15):
+     warn_output = ""
+     if (microphone is not None) and (file_upload is not None):
+         warn_output = (
+             "WARNING: You've uploaded an audio file and used the microphone. "
+             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         )
+
+     elif (microphone is None) and (file_upload is None):
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     file = microphone if microphone is not None else file_upload

      start_time = time.time()
+     text = pipe(file, batch_size=batch_size, return_timestamps=False)["text"]

      end_time = time.time()
      output_time = end_time - start_time
      word_count = len(text.split())

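(Aside: the NB-Whisper pipeline above can be sanity-checked outside Gradio. A minimal sketch, assuming a hypothetical local file example.wav; the batch_size and return_timestamps arguments are the same ones used in the function above.)

from transformers import pipeline

# Same ASR pipeline as in the diff, exercised on a local file.
asr = pipeline(
    "automatic-speech-recognition",
    model="NbAiLab/nb-whisper-large",
    chunk_length_s=30,
    generate_kwargs={"task": "transcribe", "language": "no"},
)
result = asr("example.wav", batch_size=15, return_timestamps=False)  # example.wav is a placeholder path
print(result["text"])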
  *CPU Usage: {cpu_usage}%*
  """

+     return warn_output + text.strip(), system_info
+

+ # ------------summary section------------
+
+ # ------------for app integration later------------
+
+ nlp = spacy.blank("nb")  # codename 'nb' = Norwegian Bokmål
+ nlp.add_pipe('sentencizer')
+ spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
+
+ summarization_model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
+ summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large")  # not sure this is needed; isn't the tokenizer already part of this model?
+ # pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large")

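(On the question in the comment above: AutoModel.from_pretrained loads only model weights, so loading the tokenizer separately with AutoTokenizer is the usual pattern; they are two distinct objects. A minimal sketch with an illustrative sentence:)

from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large")

# Encode a sample sentence and run the encoder; AutoModel returns hidden states,
# not generated text, and it does not bundle a tokenizer of its own.
enc = tokenizer("Dette er en test.", return_tensors="pt")
out = model(**enc)
print(out.last_hidden_state.shape)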
  @spaces.GPU()
  def clean_text(text):
      text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
      text = re.sub(r'[^\w\s]', '', text)
      text = re.sub(r'\s+', ' ', text).strip()
      return text

+ @spaces.GPU()
+ def preprocess_text(text, file_upload):
+     if (text is not None) and (file_upload is None):
+         doc = nlp(text)
+
+     elif (text is None) and (file_upload is not None):
+         doc = nlp(file_upload)
+
      stop_words = spacy_stop_words
      words = [token.text for token in doc if token.text.lower() not in stop_words]
      return ' '.join(words)

  @spaces.GPU()
+ def summarize_text(text, file_upload):
+     # the text/file_upload dispatch happens inside preprocess_text
+     preprocessed_text = preprocess_text(text, file_upload)
      inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
      inputs = inputs.to(device)
      summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
      return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
  @spaces.GPU()
+ def build_similarity_matrix(sentences):
      similarity_matrix = nx.Graph()
      for i, tokens_a in enumerate(sentences):
          for j, tokens_b in enumerate(sentences):

              common_words = set(tokens_a) & set(tokens_b)
              similarity_matrix.add_edge(i, j, weight=len(common_words))
      return similarity_matrix

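(To make the token-overlap scoring concrete, here is a small self-contained sketch on toy sentences, not from the app, showing how such a graph is ranked with nx.pagerank; connecting only sentences that actually share tokens is an assumption of this sketch.)

import networkx as nx

# Toy example: three pre-tokenized, stop-word-filtered "sentences".
tokenized = [
    ["oslo", "kommune", "budsjett"],
    ["budsjett", "vedtatt", "kommune"],
    ["katten", "sover", "ute"],
]

graph = nx.Graph()
graph.add_nodes_from(range(len(tokenized)))
for i, tokens_a in enumerate(tokenized):
    for j, tokens_b in enumerate(tokenized):
        if i < j:
            overlap = set(tokens_a) & set(tokens_b)
            if overlap:  # this sketch only connects sentences that share tokens
                graph.add_edge(i, j, weight=len(overlap))

scores = nx.pagerank(graph)                            # sentences 0 and 1 share two tokens and score highest
ranked = sorted(scores, key=scores.get, reverse=True)  # sentence indices, most central first
print(scores, ranked)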
+ # PageRank
  @spaces.GPU()
+ def graph_based_summary(text, file_upload, num_paragraphs=3):
+     if (text is not None) and (file_upload is None):
+         doc = nlp(text)
+
+     elif (text is None) and (file_upload is not None):
+         doc = nlp(file_upload)
+
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
+         return ' '.join(sentences)

      sentence_tokens = [nlp(sent) for sent in sentences]
      stop_words = spacy_stop_words
      filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens]
+     similarity_matrix = build_similarity_matrix(filtered_tokens)

      scores = nx.pagerank(similarity_matrix)
      ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
      return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])

  @spaces.GPU()
+ def lex_rank_summary(text, file_upload, num_paragraphs=3, threshold=0.1):
+     if (text is not None) and (file_upload is None):
+         doc = nlp(text)
+
+     elif (text is None) and (file_upload is not None):
+         doc = nlp(file_upload)
+
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
+         return ' '.join(sentences)

      stop_words = spacy_stop_words
      vectorizer = TfidfVectorizer(stop_words=list(stop_words))

      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
      return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

  @spaces.GPU()
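(The middle of lex_rank_summary lies outside the hunks shown above. As an illustration of the approach its label describes, TF-IDF vectors, a thresholded cosine-similarity graph and a per-sentence centrality score, here is a self-contained sketch on toy sentences; nx.pagerank stands in for the centrality step and is an assumption of this sketch, not necessarily the app's exact implementation.)

import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sentences = [
    "Kommunen vedtok budsjettet for neste år.",
    "Budsjettet for neste år ble vedtatt av kommunen.",
    "Katten sover ute i hagen.",
]

tfidf = TfidfVectorizer().fit_transform(sentences)
sim = cosine_similarity(tfidf)                 # pairwise cosine similarities between sentence vectors
threshold = 0.1
graph = nx.Graph()
graph.add_nodes_from(range(len(sentences)))
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        if sim[i, j] > threshold:              # keep only sufficiently similar sentence pairs
            graph.add_edge(i, j, weight=sim[i, j])

scores = nx.pagerank(graph)                    # centrality of each sentence in the similarity graph
top = max(scores, key=scores.get)
print(sentences[top])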
+ def text_rank_summary(text, file_upload, num_paragraphs=3):
+     if (text is not None) and (file_upload is None):
+         doc = nlp(text)
+
+     elif (text is None) and (file_upload is not None):
+         doc = nlp(file_upload)
+
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
+         return ' '.join(sentences)

      stop_words = spacy_stop_words
      vectorizer = TfidfVectorizer(stop_words=list(stop_words))

      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
      return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

+ def save_to_pdf(text, summary):
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font("Arial", size=12)
+
+     if text:
+         pdf.multi_cell(0, 10, "Text:\n" + text)
+
+     pdf.ln(10)  # vertical spacing between the text and summary blocks
+
+     if summary:
+         pdf.multi_cell(0, 10, "Summary:\n" + summary)
+
+     pdf_output_path = "transcription_.pdf"
+     pdf.output(pdf_output_path)
+     return pdf_output_path

  iface = gr.Blocks()
+
  with iface:
+
      gr.HTML(SIDEBAR_INFO)
      gr.Markdown(HEADER_INFO)

+     with gr.Row():
+         gr.Markdown('''
+         ##### Here you will get transcription output
+         ##### ''')
+         microphone = gr.Audio(sources="microphone", type="filepath")
+         upload = gr.Audio(sources="upload", type="filepath")
+
+     transcribe_btn = gr.Button("Transcribe Interview")
+     text_output = gr.Textbox()
+     system_info = gr.Textbox(label="System Info")
+
+     transcribe_btn.click(fn=transcribe,
+                          inputs=[microphone, upload],
+                          outputs=[text_output, system_info]
+                          )
+
+     with gr.Tabs():
+
+         with gr.TabItem("Summary | PageRank"):
+             text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
+             summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")
+
+             gr.Markdown("""
+             **token-based**: similarity-matrix edge weights represent token overlap;
+             sentences are ranked by their centrality in the graph (works well when inter-sentence relationships are dense)
+             """)
+             gr.Markdown("""
+             *Bjørn*: **gir sammendrag som fanger opp de mest relevante setninger i teksten**
+             """)
+
+             summarize_transcribed_button_graph = gr.Button("Summary of Transcribed Text, Click Here")
+             summarize_transcribed_button_graph.click(fn=lambda text: graph_based_summary(text, None), inputs=[text_output], outputs=[summary_output_graph])
+             summarize_uploaded_button_graph = gr.Button("Upload Text to Summarize, Click Here")
+             summarize_uploaded_button_graph.click(fn=lambda text: graph_based_summary(text, None), inputs=[text_input_graph], outputs=[summary_output_graph])
+
+ with gr.TabItem("Summary | LexRank"):with gr.Blocks():
298
+ text_output = gr.Textbox(label="Transcription Output")
299
+ text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
300
+ summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")
301
+
302
+ gr.Markdown("""
303
+ **semantic**: TF-IDF vectorization@cosine similarity matrix, ranked by eigenvector centrality.
304
+ (good for sparse graph structures with thresholding)
305
+ """)
306
+ gr.Markdown("""
307
+ *Bjørn*: **gir sammendrag som best fanger opp betydningen av hele teksten**
308
+ """)
309
+
310
+ summarize_transcribed_button_lex = gr.Button("Summary of Transcribed Text, Click Here")
311
+ summarize_transcribed_button_lex.click(fn=lambda text: lex_rank_summary(text), inputs=[transcribed_text], outputs=[summary_output_lex])
312
+ summarize_uploaded_button_lex = gr.Button("Upload Text to Summarize, Click Here")
313
+ summarize_uploaded_button_lex.click(fn=lex_rank_summary(file_upload), inputs=[text_input_lex], outputs=[summary_output_lex])
314
+
315
+ with gr.TabItem("Summary | TextRank"):
316
+ text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
317
+ summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")
318
+
319
+ gr.Markdown("""
320
+ **sentence**: graph with weighted edges based on lexical similarity. (i.e" "sentence similarity"word overlap)/sentence similarity
321
+ """)
322
+ gr.Markdown("""
323
+ *Bjørn*: **sammendrag basert på i de setningene som ligner mest på hverandre fra teksten**
324
+
325
+ """)
326
+
327
+ summarize_transcribed_button_text_rank = gr.Button("Summary of Transcribed Text, Click Here")
328
+ summarize_transcribed_button_text_rank.click(fn=lambda text: text_rank_summary(text), inputs=[transcribed_text], outputs=[summary_output_text_rank])
329
+ summarize_uploaded_button_text_rank = gr.Button("Upload Text to Summarize, Click Here")
330
+ summarize_uploaded_button_text_rank.click(fn=text_rank_summary(file_upload), inputs=[text_input_text_rank], outputs=[summary_output_text_rank])
331
+
332
+
333
+ with gr.TabItem("Download PDF"):
334
+ pdf_text_only = gr.Button("Download PDF with Transcribed Text Only")
335
+ pdf_summary_only = gr.Button("Download PDF with Summary-of-Transcribed-Text Only")
336
+ pdf_both = gr.Button("Download PDF with Both")
337
+
338
+ pdf_output = gr.File(label="Download PDF")
339
+
340
+ pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[transcribed_text], outputs=[pdf_output])
341
+ pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank], outputs=[pdf_output]) # Includes all summary outputs
342
+ pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[transcribed_text, summary_output_graph], outputs=[pdf_output])
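(The hunks above do not show a launch() call for the new Blocks app; the old version ended with iface.launch(share=True, debug=True). A minimal sketch of how the app would typically be started; the arguments here are assumptions, not from the diff.)

# Start the Gradio Blocks app locally or on Spaces.
iface.launch(debug=True)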