KG0101 committed on
Commit 24124c5 · verified · 1 Parent(s): 3de905c

Update app.py

Files changed (1)
app.py +14 -19
app.py CHANGED
@@ -1,24 +1,20 @@
 import spaces
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModel, AutoTokenizer
+from transformers import pipeline
 from llama_cpp import Llama
-import os
 
 MODEL_NAME = "openai/whisper-large-v3-turbo"
-MODEL_PATH = "model.gguf"  # Path to the downloaded model
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
 
 device = 0 if torch.cuda.is_available() else "cpu"
 
-# Download and load model if not already present
-if not os.path.exists(MODEL_PATH):
-    from huggingface_hub import hf_hub_download
-    hf_hub_download(repo_id="MaziyarPanahi/Qwen2-7B-Instruct-GGUF", filename="model.gguf", local_dir="./")
-
-# Load the Llama model with specified context and threading
-llm = Llama(model_path=MODEL_PATH, n_ctx=8000, n_threads=2, chat_format="chatml")
+# Load the Llama model directly from Hugging Face
+llm = Llama.from_pretrained(
+    repo_id="MaziyarPanahi/Qwen2-7B-Instruct-GGUF",
+    filename="Qwen2-7B-Instruct.Q4_K_M.gguf"
+)
 
 # Initialize the transcription pipeline
 pipe = pipeline(
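Note on this hunk: Llama.from_pretrained fetches the GGUF straight from the Hub (it relies on huggingface_hub under the hood) and caches it locally, which is what the deleted hf_hub_download block did by hand. The new call also drops the old n_ctx=8000, n_threads=2, and chat_format="chatml" arguments, so the model falls back to llama-cpp-python defaults, including a much smaller default context window; omitting chat_format typically lets the library pick up the chat template embedded in the GGUF metadata. A minimal sketch, assuming those earlier settings are still wanted (from_pretrained forwards extra keyword arguments to the Llama constructor):

# Sketch only: keeping the previous context/threading settings with the new loader.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="MaziyarPanahi/Qwen2-7B-Instruct-GGUF",
    filename="Qwen2-7B-Instruct.Q4_K_M.gguf",
    n_ctx=8000,            # context window used before this commit
    n_threads=2,           # CPU threads used before this commit
    chat_format="chatml",  # explicit template, as before; omit to use GGUF metadata
)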
@@ -51,16 +47,15 @@ def transcribe(inputs, task):
 
 # Function to generate SOAP notes using Llama model
 def generate_soap(transcribed_text):
-    prompt = [{"role": "system", "content": sys_prompt}]
-    prompt.append({"role": "user", "content": f"{task_prompt}\n{transcribed_text}"})
+    # Format the conversation for the Llama model
+    prompt = [
+        {"role": "system", "content": sys_prompt},
+        {"role": "user", "content": f"{task_prompt}\n{transcribed_text}"}
+    ]
 
-    # Generate a response using the Llama model in streaming mode
-    stream_response = llm.create_chat_completion(messages=prompt, temperature=0.7, max_tokens=2048, stream=True)
-    response = ""
-    for chunk in stream_response:
-        if "content" in chunk['choices'][0]["delta"]:
-            response += chunk['choices'][0]["delta"]["content"]
-    return response
+    # Generate a response
+    response = llm.create_chat_completion(messages=prompt, temperature=0.7, max_tokens=2048)
+    return response["choices"][0]["message"]["content"]
 
 # Gradio Interfaces for different inputs
 demo = gr.Blocks(theme=gr.themes.Ocean())
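Note on this hunk: the rewritten generate_soap now returns the completion in one piece, and response["choices"][0]["message"]["content"] is the shape llama-cpp-python uses for non-streamed chat completions. If incremental output in the Gradio UI is wanted again, the streaming loop removed here could be kept as a generator instead — a sketch along those lines (generate_soap_stream is a hypothetical name, not part of this commit):

# Sketch only: a generator variant of the removed streaming code. Gradio
# components re-render on each yield, so partial notes appear as they stream.
def generate_soap_stream(transcribed_text):
    prompt = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": f"{task_prompt}\n{transcribed_text}"}
    ]
    response = ""
    for chunk in llm.create_chat_completion(
        messages=prompt, temperature=0.7, max_tokens=2048, stream=True
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response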
 