nb / app.py
camparchimedes's picture
Update app.py
638acc9 verified
raw
history blame
6.1 kB
### -----------------------------------------------------------------------
### (FULL, Revised) version_1.07ALPHA_app.py
### -----------------------------------------------------------------------
# -------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -------------------------------------------------------------------------
import spaces
import gradio as gr
from PIL import Image
#from pydub import AudioSegment
#from scipy.io import wavfile
import os
import re
import time
import warnings
#import datetime
#import pandas as pd
#import csv
import subprocess
from pathlib import Path
import tempfile
from fpdf import FPDF
import psutil
from gpuinfo import GPUInfo
#import numpy as np
import torch
#import torchaudio
#import torchaudio.transforms as transforms
from transformers import pipeline #AutoModel
#import spacy
#import networkx as nx
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")
# ------------header section------------
# Markdown shown at the top of the page: app title plus a one-line tagline.
HEADER_INFO = "\n".join(
    (
        "# SWITCHVOX ✨|🇳🇴 *Switch Work Web app*",
        "**Transkribering av lydfiler til norsk skrift**",
    )
)

# Banner image address.
# NOTE(review): the query string carries `Expires=`, `Policy=` and `Signature=`
# parameters, so this looks like a time-limited signed link -- confirm it is
# still reachable or point it at a permanent location.
LOGO = "https://cdn-lfs-us-1.huggingface.co/repos/fe/3b/fe3bd7c8beece8b087fddcc2278295e7f56c794c8dcf728189f4af8bddc585e1/24ad06a03a5bc66f3eba361b94e45ad17e46f98b76632f2d17faf8a0b4f9ab6b?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27banner_trans.png%3B+filename%3D%22banner_trans.png%22%3B&response-content-type=image%2Fpng&Expires=1725145079&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNTE0NTA3OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2ZlLzNiL2ZlM2JkN2M4YmVlY2U4YjA4N2ZkZGNjMjI3ODI5NWU3ZjU2Yzc5NGM4ZGNmNzI4MTg5ZjRhZjhiZGRjNTg1ZTEvMjRhZDA2YTAzYTViYzY2ZjNlYmEzNjFiOTRlNDVhZDE3ZTQ2Zjk4Yjc2NjMyZjJkMTdmYWY4YTBiNGY5YWI2Yj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=PCB1BZeLzsZXnn4lRi1Fj50%7E0E4G39u6-GKLNLLgxhDyhXlqb3BJkR7IOgdyjuNyBh8Iz2d7QqhzNSsOlQmqR30cJLl6aDM5eJO01OlWXoD3Z0KcphoVBFyyrkoxe2oS8i2mdlbFRYn7oc%7EhyOcW46zR6HtqAB91iEydhEa5WTyz3C9nWasgMZevb0vRJtzwhplM9e-%7EbRrZTm2fMzkL14IGWpTpUOGBe93BDSAYbPhrZK1jvuY8p0Tmy1iEKVP3Zdzix5U5lrbxit5luitEhK8x6q2t63Gdv7F0CZvjQtTh7MYkB5GNiru8bTGKAgCdHGiZbG7VCGfhlX3UKvUTPg__&Key-Pair-Id=K24J24Z295AEI9"

# HTML fragment that centres the banner and lets it span the full width.
SIDEBAR_INFO = "\n".join(
    (
        "",
        '<div align="center">',
        f'<img src="{LOGO}" style="width: 100%; height: auto;"/>',
        "</div>",
        "",
    )
)
@spaces.GPU()
def transcribe(microphone, file_upload):
    """Transcribe one audio input with NB-Whisper Large and report resource usage.

    Args:
        microphone: Microphone recording to transcribe, or ``None`` when
            nothing was recorded. Takes precedence over ``file_upload``.
        file_upload: Uploaded audio to transcribe, or ``None`` when nothing
            was uploaded.

    Returns:
        A ``(text, system_info)`` tuple. ``text`` is the transcript, prefixed
        with a warning when both inputs were supplied, or an error message
        when neither was. ``system_info`` is a short multi-line summary of
        memory, timing, word count, GPU and CPU figures (empty on error).
    """
    # `warn_output` is prepended to the transcript in the return statement,
    # so it has to be bound on every path that reaches it.
    warn_output = ""
    if microphone is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the "
            "uploaded audio will be discarded.\n"
        )
    elif microphone is None and file_upload is None:
        # Nothing to transcribe: report it instead of handing None to the model.
        return "ERROR: You have to either use the microphone or upload an audio file", ""

    file = microphone if microphone is not None else file_upload

    start_time = time.time()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # NOTE(review): the pipeline is rebuilt on every call; consider loading it
    # once if that is compatible with how `spaces.GPU` runs this function.
    pipe = pipeline(
        "automatic-speech-recognition",
        model="NbAiLab/nb-whisper-large",
        device=device,
    )
    # chunk_length_s=30, generate_kwargs={'task': 'transcribe', 'language': 'no'}
    text = pipe(file)["text"]

    output_time = time.time() - start_time
    word_count = len(text.split())

    # --system (RAM) metrics
    memory = psutil.virtual_memory()

    # --GPU metrics: use the first reported entry, or 0 when none is reported
    gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
    gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
    gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0

    # --CPU metric (interval=1 makes this call block for one second)
    cpu_usage = psutil.cpu_percent(interval=1)

    # --system info string
    gib = 1024 * 1024 * 1024
    system_info = (
        "\n"
        f"*Memory: {memory.total / gib:.2f}GB, used: {memory.percent}%, "
        f"available: {memory.available / gib:.2f}GB.*\n"
        f"*Processing time: {output_time:.2f} seconds.*\n"
        f"*Number of words: {word_count}*\n"
        f"*GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}*\n"
        f"*CPU Usage: {cpu_usage}%*\n"
    )

    return warn_output + text, system_info
def _latin1_safe(value):
    """Return *value* with characters outside Latin-1 replaced by ``?``."""
    return value.encode("latin-1", "replace").decode("latin-1")


def save_to_pdf(text, summary):
    """Write the transcript and/or summary to a PDF file and return its path.

    Args:
        text: Transcribed text; the "Text:" section is skipped when empty.
        summary: Summary text; the "Summary:" section is skipped when empty.

    Returns:
        The path of the written PDF (``"transcription_.pdf"``, relative to the
        current working directory).
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # TODO (carried over): add the same if/elif input checks as the
    # transcription step here.

    # The built-in core fonts only cover Latin-1, so anything else (curly
    # quotes, emoji, ...) is replaced up front rather than letting FPDF raise
    # an encoding error while rendering.
    if text:
        pdf.multi_cell(0, 10, "Text:\n" + _latin1_safe(text))
        pdf.ln(10)  # line break of height 10 before the next section
    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + _latin1_safe(summary))

    # NOTE(review): a fixed file name means every call overwrites the previous
    # PDF -- confirm this is acceptable when several users export at once.
    pdf_output_path = "transcription_.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
# Build the Gradio page: banner and title, audio inputs with usage notes,
# transcript and system-info outputs, and a PDF export tab.
# NOTE(review): the row/column/tab nesting below follows the statement order
# of the original; confirm it matches the intended layout.
with gr.Blocks() as iface:
    gr.HTML(SIDEBAR_INFO)
    gr.Markdown(HEADER_INFO)

    # Usage notes next to the two audio inputs and the trigger button.
    with gr.Row():
        gr.Markdown(
            "\n"
            "##### 1. Last opp lydfila 🔊\n"
            '2. Trykk på "Transkriber" knappen og vent på svar ☕️\n'
            "3. Går rimelig bra kjapt med Norwegian NB-Whisper Large⚡️\n"
            "4. Planlegger tilleggs-funksjoner senere😅\n"
            "##### "
        )
        microphone = gr.Audio(sources="microphone", type="filepath")
        upload = gr.Audio(sources="upload", type="filepath")
        transcribe_btn = gr.Button("Transcribe Interview")

    # Transcript on the left, resource summary in its own column.
    with gr.Row():
        text_output = gr.Textbox(label="Transkribert Tekst")
        with gr.Column():
            system_info = gr.Textbox(label="System Info")

    # PDF export of the current transcript (no summary is passed).
    with gr.Tabs():
        with gr.TabItem("Download PDF"):
            pdf_text_only = gr.Button("Download PDF with Transcribed Text")
            pdf_output = gr.File(label="Download PDF")
            pdf_text_only.click(
                fn=lambda text: save_to_pdf(text, ""),
                inputs=[text_output],
                outputs=[pdf_output],
            )

    transcribe_btn.click(
        fn=transcribe,
        inputs=[microphone, upload],
        outputs=[text_output, system_info],
    )

iface.launch(debug=True)