camparchimedes committed
Commit abc89d1 · verified · 1 Parent(s): 8723cb5

Update app.py

Files changed (1)
  1. app.py +179 -76
app.py CHANGED
@@ -1,6 +1,8 @@
- # app.py
- # Version: 1.07a (08.27.24), ALPHA
- #---------------------------------------------------------------------------------------------------------------------------------------------
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
@@ -12,11 +14,12 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- #---------------------------------------------------------------------------------------------------------------------------------------------
  import spaces
  import gradio as gr
  from PIL import Image
- from pydub import AudioSegment
  #from scipy.io import wavfile

  import os
@@ -24,6 +27,8 @@ import re
  import time
  import warnings
  #import datetime
  import subprocess
  from pathlib import Path
  import tempfile
@@ -31,8 +36,7 @@ from fpdf import FPDF

  import psutil
  from gpuinfo import GPUInfo
- #import pandas as pd
- #import csv
  import numpy as np
  import torch
  import torchaudio
@@ -58,38 +62,28 @@ SIDEBAR_INFO = f"""
  </div>
  """

- # ------------transcribe section------------
-

  pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, generate_kwargs={'task': 'transcribe', 'language': 'no'})

  @spaces.GPU()
- def transcribe_audio(audio, batch_size=16):
-
-     with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
-         # --copy contents of uploaded audio file to temporary file
-         temp_audio_file.write(open(audio, "rb").read())
-         temp_audio_file.flush()
-         # --use torchaudio to load it
-         waveform, sample_rate = torchaudio.load(temp_audio_file.name)
-         # --resample to 16kHz
-         resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-         waveform = resampler(waveform)
-
-         # --convert to mono
-         if waveform.ndim > 1:
-             waveform = waveform[0, :]
-         # Convert tensor@ndarray
-         waveform = waveform.numpy()

      start_time = time.time()
-
-     # --pipe it
-     with torch.no_grad():
-         outputs = pipe(waveform, sampling_rate=sample_rate, batch_size=batch_size, return_timestamps=False)

      end_time = time.time()
-
      output_time = end_time - start_time
      word_count = len(text.split())

@@ -111,49 +105,58 @@ def transcribe_audio(audio, batch_size=16):
  *CPU Usage: {cpu_usage}%*
  """

-     return text.strip(), system_info


- # ------------summaries section------------

- # [------------for app integration later------------]

  @spaces.GPU()
- # --btw, who is doing this...?
  def clean_text(text):
      text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
      text = re.sub(r'[^\w\s]', '', text)
      text = re.sub(r'\s+', ' ', text).strip()
      return text

- # --SpaCy params
- nlp = spacy.blank("nb") # ---==> codename ("norsk bokmål")
- nlp.add_pipe('sentencizer')
- spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS

- # --model (has tokenizer?)
- summarization_model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
- # pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large") -----hm..

- # --process text with SpaCy
- @spaces.GPU()
- def preprocess_text(text):
-     doc = nlp(text)
      stop_words = spacy_stop_words
      words = [token.text for token in doc if token.text.lower() not in stop_words]
      return ' '.join(words)

- # --model is called to summarize (need to be placed *after* the three styles and call them)
  @spaces.GPU()
- def summarize_text(text):
      preprocessed_text = preprocess_text(text)
      inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
      inputs = inputs.to(device)
      summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
      return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-
  @spaces.GPU()
- def build_similarity_matrix(sentences, stop_words):
      similarity_matrix = nx.Graph()
      for i, tokens_a in enumerate(sentences):
          for j, tokens_b in enumerate(sentences):
@@ -161,33 +164,38 @@ def build_similarity_matrix(sentences, stop_words):
              common_words = set(tokens_a) & set(tokens_b)
              similarity_matrix.add_edge(i, j, weight=len(common_words))
      return similarity_matrix
- # [------------model needs to be called for these------------]

-
- # --PageRank
  @spaces.GPU()
- def graph_based_summary(text, num_paragraphs=3):
-     doc = nlp(text)
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
-         return ' '.join(sentences)

      sentence_tokens = [nlp(sent) for sent in sentences]
      stop_words = spacy_stop_words
      filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens]
-     similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)

      scores = nx.pagerank(similarity_matrix)
      ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
      return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])

- # --LexRank
  @spaces.GPU()
- def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
-     doc = nlp(text)
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
-         return ' '.join(sentences) # Adjusted to return a single string

      stop_words = spacy_stop_words
      vectorizer = TfidfVectorizer(stop_words=list(stop_words))
@@ -201,13 +209,18 @@ def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
      return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

- # --TextRank
  @spaces.GPU()
- def text_rank_summary(text, num_paragraphs=3):
-     doc = nlp(text)
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
-         return ' '.join(sentences)

      stop_words = spacy_stop_words
      vectorizer = TfidfVectorizer(stop_words=list(stop_words))
@@ -219,21 +232,111 @@ def text_rank_summary(text, num_paragraphs=3):
      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
      return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

-

- # ------------interface section------------

  iface = gr.Blocks()
  with iface:
      gr.HTML(SIDEBAR_INFO)
      gr.Markdown(HEADER_INFO)
-
-     audio_input = gr.Audio(label="Upload Audio File")
-     transcribed_text = gr.Textbox(label="Transcribed Text")
-     system_info = gr.Textbox(label="System Info")
-
-     transcribe_button = gr.Button("Transcribe")
-     transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=[transcribed_text, system_info])
-
- iface.launch(share=True, debug=True)
+ ### -----------------------------------------------------------------------
+ ### (FULL, Revised) version_1.07ALPHA_app.py
+ ### -----------------------------------------------------------------------
+
+ # -------------------------------------------------------------------------
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at

  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ # -------------------------------------------------------------------------
+
  import spaces
  import gradio as gr
  from PIL import Image
+ #from pydub import AudioSegment
  #from scipy.io import wavfile

  import os

  import time
  import warnings
  #import datetime
+ #import pandas as pd
+ #import csv
  import subprocess
  from pathlib import Path
  import tempfile

  import psutil
  from gpuinfo import GPUInfo
+
  import numpy as np
  import torch
  import torchaudio

  </div>
  """

+ # ------------transcribe section------------

  pipe = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", chunk_length_s=30, generate_kwargs={'task': 'transcribe', 'language': 'no'})

  @spaces.GPU()
+ def transcribe(microphone, file_upload, batch_size=15):
+     warn_output = ""
+     if (microphone is not None) and (file_upload is not None):
+         warn_output = (
+             "WARNING: You've uploaded an audio file and used the microphone. "
+             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         )
+
+     elif (microphone is None) and (file_upload is None):
+         return "ERROR: You have to either use the microphone or upload an audio file"
+
+     file = microphone if microphone is not None else file_upload

      start_time = time.time()
+     text = pipe(file, batch_size=batch_size, return_timestamps=False)["text"]

      end_time = time.time()
      output_time = end_time - start_time
      word_count = len(text.split())

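(Aside: the NB-Whisper pipeline above can be sanity-checked outside Gradio. A minimal sketch, assuming a hypothetical local file example.wav; the batch_size and return_timestamps arguments are the same ones used in the function above.)

from transformers import pipeline

# Same ASR pipeline as in the diff, exercised on a local file.
asr = pipeline(
    "automatic-speech-recognition",
    model="NbAiLab/nb-whisper-large",
    chunk_length_s=30,
    generate_kwargs={"task": "transcribe", "language": "no"},
)
result = asr("example.wav", batch_size=15, return_timestamps=False)  # example.wav is a placeholder path
print(result["text"])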
  *CPU Usage: {cpu_usage}%*
  """

+     return warn_output + text.strip(), system_info
+

+ # ------------summary section------------
+
+ # ------------for app integration later------------
+
+ nlp = spacy.blank("nb")  # codename 'nb' = Norwegian Bokmål
+ nlp.add_pipe('sentencizer')
+ spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS
+
+ summarization_model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
+ summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large")  # not sure this is needed; isn't the tokenizer already part of this model?
+ # pipe = pipeline("fill-mask", model="NbAiLab/nb-bert-large")

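(On the question in the comment above: AutoModel.from_pretrained loads only model weights, so loading the tokenizer separately with AutoTokenizer is the usual pattern; they are two distinct objects. A minimal sketch with an illustrative sentence:)

from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("NbAiLab/nb-bert-large")
tokenizer = AutoTokenizer.from_pretrained("NbAiLab/nb-bert-large")

# Encode a sample sentence and run the encoder; AutoModel returns hidden states,
# not generated text, and it does not bundle a tokenizer of its own.
enc = tokenizer("Dette er en test.", return_tensors="pt")
out = model(**enc)
print(out.last_hidden_state.shape)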
  @spaces.GPU()
  def clean_text(text):
      text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
      text = re.sub(r'[^\w\s]', '', text)
      text = re.sub(r'\s+', ' ', text).strip()
      return text

+ @spaces.GPU()
+ def preprocess_text(text, file_upload):
+     if (text is not None) and (file_upload is None):
+         doc = nlp(text)
+
+     elif (text is None) and (file_upload is not None):
+         doc = nlp(file_upload)
+
      stop_words = spacy_stop_words
      words = [token.text for token in doc if token.text.lower() not in stop_words]
      return ' '.join(words)

  @spaces.GPU()
+ def summarize_text(text, file_upload):
+     # the text/file_upload dispatch happens inside preprocess_text
+     preprocessed_text = preprocess_text(text, file_upload)
      inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
      inputs = inputs.to(device)
      summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
      return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
  @spaces.GPU()
+ def build_similarity_matrix(sentences):
      similarity_matrix = nx.Graph()
      for i, tokens_a in enumerate(sentences):
          for j, tokens_b in enumerate(sentences):

              common_words = set(tokens_a) & set(tokens_b)
              similarity_matrix.add_edge(i, j, weight=len(common_words))
      return similarity_matrix

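(To make the token-overlap scoring concrete, here is a small self-contained sketch on toy sentences, not from the app, showing how such a graph is ranked with nx.pagerank; connecting only sentences that actually share tokens is an assumption of this sketch.)

import networkx as nx

# Toy example: three pre-tokenized, stop-word-filtered "sentences".
tokenized = [
    ["oslo", "kommune", "budsjett"],
    ["budsjett", "vedtatt", "kommune"],
    ["katten", "sover", "ute"],
]

graph = nx.Graph()
graph.add_nodes_from(range(len(tokenized)))
for i, tokens_a in enumerate(tokenized):
    for j, tokens_b in enumerate(tokenized):
        if i < j:
            overlap = set(tokens_a) & set(tokens_b)
            if overlap:  # this sketch only connects sentences that share tokens
                graph.add_edge(i, j, weight=len(overlap))

scores = nx.pagerank(graph)                            # sentences 0 and 1 share two tokens and score highest
ranked = sorted(scores, key=scores.get, reverse=True)  # sentence indices, most central first
print(scores, ranked)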
+ # PageRank
  @spaces.GPU()
+ def graph_based_summary(text, file_upload, num_paragraphs=3):
+     if (text is not None) and (file_upload is None):
+         doc = nlp(text)
+
+     elif (text is None) and (file_upload is not None):
+         doc = nlp(file_upload)
+
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
+         return ' '.join(sentences)

      sentence_tokens = [nlp(sent) for sent in sentences]
      stop_words = spacy_stop_words
      filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens]
+     similarity_matrix = build_similarity_matrix(filtered_tokens)

      scores = nx.pagerank(similarity_matrix)
      ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
      return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])

  @spaces.GPU()
+ def lex_rank_summary(text, file_upload, num_paragraphs=3, threshold=0.1):
+     if (text is not None) and (file_upload is None):
+         doc = nlp(text)
+
+     elif (text is None) and (file_upload is not None):
+         doc = nlp(file_upload)
+
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
+         return ' '.join(sentences)

      stop_words = spacy_stop_words
      vectorizer = TfidfVectorizer(stop_words=list(stop_words))

      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
      return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

  @spaces.GPU()
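(The middle of lex_rank_summary lies outside the hunks shown above. As an illustration of the approach its label describes, TF-IDF vectors, a thresholded cosine-similarity graph and a per-sentence centrality score, here is a self-contained sketch on toy sentences; nx.pagerank stands in for the centrality step and is an assumption of this sketch, not necessarily the app's exact implementation.)

import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sentences = [
    "Kommunen vedtok budsjettet for neste år.",
    "Budsjettet for neste år ble vedtatt av kommunen.",
    "Katten sover ute i hagen.",
]

tfidf = TfidfVectorizer().fit_transform(sentences)
sim = cosine_similarity(tfidf)                 # pairwise cosine similarities between sentence vectors
threshold = 0.1
graph = nx.Graph()
graph.add_nodes_from(range(len(sentences)))
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        if sim[i, j] > threshold:              # keep only sufficiently similar sentence pairs
            graph.add_edge(i, j, weight=sim[i, j])

scores = nx.pagerank(graph)                    # centrality of each sentence in the similarity graph
top = max(scores, key=scores.get)
print(sentences[top])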
+ def text_rank_summary(text, file_upload, num_paragraphs=3):
+     if (text is not None) and (file_upload is None):
+         doc = nlp(text)
+
+     elif (text is None) and (file_upload is not None):
+         doc = nlp(file_upload)
+
      sentences = [sent.text for sent in doc.sents]
      if len(sentences) < num_paragraphs:
+         return ' '.join(sentences)

      stop_words = spacy_stop_words
      vectorizer = TfidfVectorizer(stop_words=list(stop_words))

      ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
      return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

+ def save_to_pdf(text, summary):
+     pdf = FPDF()
+     pdf.add_page()
+     pdf.set_font("Arial", size=12)
+
+     if text:
+         pdf.multi_cell(0, 10, "Text:\n" + text)
+
+     pdf.ln(10)  # vertical spacing between the text and summary blocks
+
+     if summary:
+         pdf.multi_cell(0, 10, "Summary:\n" + summary)
+
+     pdf_output_path = "transcription_.pdf"
+     pdf.output(pdf_output_path)
+     return pdf_output_path

  iface = gr.Blocks()
+
  with iface:
+
      gr.HTML(SIDEBAR_INFO)
      gr.Markdown(HEADER_INFO)

+     with gr.Row():
+         gr.Markdown('''
+         ##### Here you will get transcription output
+         ##### ''')
+         microphone = gr.Audio(sources="microphone", type="filepath")
+         upload = gr.Audio(sources="upload", type="filepath")
+
+     transcribe_btn = gr.Button("Transcribe Interview")
+     text_output = gr.Textbox()
+     system_info = gr.Textbox(label="System Info")
+
+     transcribe_btn.click(fn=transcribe,
+                          inputs=[microphone, upload],
+                          outputs=[text_output, system_info]
+                          )
+
+     with gr.Tabs():
+
+         with gr.TabItem("Summary | PageRank"):
+             text_input_graph = gr.Textbox(label="Input Text", placeholder="txt2summarize")
+             summary_output_graph = gr.Textbox(label="PageRank | token-based similarity")
+
+             gr.Markdown("""
+             **token-based**: similarity-matrix edge weights represent token overlap;
+             sentences are ranked by their centrality in the graph (works well when inter-sentence relationships are dense)
+             """)
+             gr.Markdown("""
+             *Bjørn*: **gir sammendrag som fanger opp de mest relevante setninger i teksten**
+             """)
+
+             summarize_transcribed_button_graph = gr.Button("Summary of Transcribed Text, Click Here")
+             summarize_transcribed_button_graph.click(fn=lambda text: graph_based_summary(text, None), inputs=[text_output], outputs=[summary_output_graph])
+             summarize_uploaded_button_graph = gr.Button("Upload Text to Summarize, Click Here")
+             summarize_uploaded_button_graph.click(fn=lambda text: graph_based_summary(text, None), inputs=[text_input_graph], outputs=[summary_output_graph])
+
+ with gr.TabItem("Summary | LexRank"):with gr.Blocks():
298
+ text_output = gr.Textbox(label="Transcription Output")
299
+ text_input_lex = gr.Textbox(label="Input Text", placeholder="txt2summarize")
300
+ summary_output_lex = gr.Textbox(label="LexRank | cosine similarity")
301
+
302
+ gr.Markdown("""
303
+ **semantic**: TF-IDF vectorization@cosine similarity matrix, ranked by eigenvector centrality.
304
+ (good for sparse graph structures with thresholding)
305
+ """)
306
+ gr.Markdown("""
307
+ *Bjørn*: **gir sammendrag som best fanger opp betydningen av hele teksten**
308
+ """)
309
+
310
+ summarize_transcribed_button_lex = gr.Button("Summary of Transcribed Text, Click Here")
311
+ summarize_transcribed_button_lex.click(fn=lambda text: lex_rank_summary(text), inputs=[transcribed_text], outputs=[summary_output_lex])
312
+ summarize_uploaded_button_lex = gr.Button("Upload Text to Summarize, Click Here")
313
+ summarize_uploaded_button_lex.click(fn=lex_rank_summary(file_upload), inputs=[text_input_lex], outputs=[summary_output_lex])
314
+
315
+ with gr.TabItem("Summary | TextRank"):
316
+ text_input_text_rank = gr.Textbox(label="Input Text", placeholder="txt2summarize")
317
+ summary_output_text_rank = gr.Textbox(label="TextRank | lexical similarity")
318
+
319
+ gr.Markdown("""
320
+ **sentence**: graph with weighted edges based on lexical similarity. (i.e" "sentence similarity"word overlap)/sentence similarity
321
+ """)
322
+ gr.Markdown("""
323
+ *Bjørn*: **sammendrag basert på i de setningene som ligner mest på hverandre fra teksten**
324
+
325
+ """)
326
+
327
+ summarize_transcribed_button_text_rank = gr.Button("Summary of Transcribed Text, Click Here")
328
+ summarize_transcribed_button_text_rank.click(fn=lambda text: text_rank_summary(text), inputs=[transcribed_text], outputs=[summary_output_text_rank])
329
+ summarize_uploaded_button_text_rank = gr.Button("Upload Text to Summarize, Click Here")
330
+ summarize_uploaded_button_text_rank.click(fn=text_rank_summary(file_upload), inputs=[text_input_text_rank], outputs=[summary_output_text_rank])
331
+
332
+
333
+ with gr.TabItem("Download PDF"):
334
+ pdf_text_only = gr.Button("Download PDF with Transcribed Text Only")
335
+ pdf_summary_only = gr.Button("Download PDF with Summary-of-Transcribed-Text Only")
336
+ pdf_both = gr.Button("Download PDF with Both")
337
+
338
+ pdf_output = gr.File(label="Download PDF")
339
+
340
+ pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[transcribed_text], outputs=[pdf_output])
341
+ pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output_graph, summary_output_lex, summary_output_text_rank], outputs=[pdf_output]) # Includes all summary outputs
342
+ pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[transcribed_text, summary_output_graph], outputs=[pdf_output])
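(The hunks above do not show a launch() call for the new Blocks app; the old version ended with iface.launch(share=True, debug=True). A minimal sketch of how the app would typically be started; the arguments here are assumptions, not from the diff.)

# Start the Gradio Blocks app locally or on Spaces.
iface.launch(debug=True)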