Gijs Wijngaard committed
Commit · b95f6f3
Parent(s): 84c21bd

Add semantics
Files changed:
- .gitattributes +14 -0
- app.py +36 -72
- model2/README.md +3 -0
- model2/adapter_config.json +3 -0
- model2/adapter_model.safetensors +3 -0
- model2/added_tokens.json +3 -0
- model2/merges.txt +3 -0
- model2/optimizer.pt +3 -0
- model2/rng_state.pth +3 -0
- model2/scheduler.pt +3 -0
- model2/special_tokens_map.json +3 -0
- model2/tokenizer.json +3 -0
- model2/tokenizer_config.json +3 -0
- model2/trainer_state.json +3 -0
- model2/training_args.bin +3 -0
- model2/vocab.json +3 -0
.gitattributes CHANGED

@@ -47,3 +47,17 @@ model/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
 model/trainer_state.json filter=lfs diff=lfs merge=lfs -text
 model/training_args.bin filter=lfs diff=lfs merge=lfs -text
 model/added_tokens.json filter=lfs diff=lfs merge=lfs -text
+model2/merges.txt filter=lfs diff=lfs merge=lfs -text
+model2/rng_state.pth filter=lfs diff=lfs merge=lfs -text
+model2/scheduler.pt filter=lfs diff=lfs merge=lfs -text
+model2/special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
+model2/training_args.bin filter=lfs diff=lfs merge=lfs -text
+model2/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
+model2/added_tokens.json filter=lfs diff=lfs merge=lfs -text
+model2/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+model2/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+model2/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+model2/vocab.json filter=lfs diff=lfs merge=lfs -text
+model2/README.md filter=lfs diff=lfs merge=lfs -text
+model2/adapter_config.json filter=lfs diff=lfs merge=lfs -text
+model2/adapter_model.safetensors filter=lfs diff=lfs merge=lfs -text
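These filter rules route every model2/* file through Git LFS, which is why the "ADDED" diffs for model2 below contain only three-line pointer stubs (version, oid, size) rather than the actual file contents. As a quick illustration of what those stubs encode, here is a minimal Python sketch for reading one; the helper name and example path are illustrative, not part of this commit:

# Minimal sketch: split a Git LFS pointer file into its key/value fields.
# parse_lfs_pointer and the example path are illustrative, not from this repo's code.
def parse_lfs_pointer(path):
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# parse_lfs_pointer("model2/adapter_config.json") would return, e.g.:
# {"version": "https://git-lfs.github.com/spec/v1",
#  "oid": "sha256:17d8a6c2...", "size": "738"}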
app.py CHANGED

@@ -1,4 +1,4 @@
-import spaces
+# import spaces
 import os
 import re
 import gradio as gr
@@ -9,19 +9,27 @@ from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration, TextIteratorStreamer
 import torchaudio
 from threading import Thread
 
-# Model
-
+# Model paths and configuration
+model_path_1 = "./model"
+model_path_2 = "./model2"
 base_model_id = "Qwen/Qwen2-Audio-7B-Instruct"
 
+# Dictionary to store loaded models and processors
+loaded_models = {}
+
 # Load the model and processor
-def load_model():
+def load_model(model_path):
+    # Check if model is already loaded
+    if model_path in loaded_models:
+        return loaded_models[model_path]
+
     # Load the processor from the base model
     processor = AutoProcessor.from_pretrained(
         base_model_id,
         trust_remote_code=True,
    )
 
-    # Load the
+    # Load the model
     model = Qwen2AudioForConditionalGeneration.from_pretrained(
         model_path,
         torch_dtype=torch.bfloat16,
@@ -31,68 +39,14 @@ def load_model():
 
     model.eval()
 
-
-
-# Initialize model and processor
-model, processor = load_model()
-
-# Function to extract components from model output
-def extract_components(text):
-    thinking = ""
-    semantic = ""
-    answer = ""
-
-    # Extract thinking
-    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
-    if think_match:
-        thinking = think_match.group(1).strip()
-
-    # Extract semantic elements
-    semantic_match = re.search(r"<semantic_elements>(.*?)</semantic_elements>", text, re.DOTALL)
-    if semantic_match:
-        semantic = semantic_match.group(1).strip()
+    # Store in cache
+    loaded_models[model_path] = (model, processor)
 
-
-    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
-    if answer_match:
-        answer = answer_match.group(1).strip()
-
-    return thinking, semantic, answer
+    return model, processor
 
-#
-
-chat = []
-for item in history:
-    chat.append({"role": "user", "content": item[0]})
-    if item[1] is not None:
-        chat.append({"role": "assistant", "content": item[1]})
-chat.append({"role": "user", "content": message})
-messages = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-# Tokenize the messages string
-model_inputs = processor([messages], return_tensors="pt").to(model.device)
-streamer = TextIteratorStreamer(
-    processor.tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
-generate_kwargs = dict(
-    model_inputs,
-    streamer=streamer,
-    max_new_tokens=1024,
-    do_sample=True,
-    top_p=0.95,
-    top_k=1000,
-    temperature=0.75,
-    num_beams=1,
-)
-t = Thread(target=model.generate, kwargs=generate_kwargs)
-t.start()
+# Initialize first model and processor
+model, processor = load_model(model_path_1)
 
-# Initialize an empty string to store the generated text
-partial_text = ""
-for new_text in streamer:
-    # print(new_text)
-    partial_text += new_text
-    # Yield an empty string to cleanup the message textbox and the updated conversation history
-    yield partial_text
-
 
 def process_output(output):
     if "<think>" in output:
@@ -106,18 +60,25 @@ def process_output(output):
         output = "<answer>\n" + rest
     elif "</think>" in output:
         rest = output.split("</think>")[0]
-        output = rest + "\n</think>\n"
+        output = rest + "\n</think>\n\n"
     elif "</semantic_elements>" in output:
         rest = output.split("</semantic_elements>")[0]
-        output = rest + "\n</semantic_elements>\n"
+        output = rest + "\n</semantic_elements>\n\n"
     elif "</answer>" in output:
         rest = output.split("</answer>")[0]
         output = rest + "\n</answer>\n"
+    output = output.replace("\\n", "\n")
+    output = output.replace("\\", "\n")
+    output = output.replace("\n-", "-")
     return output
 
 # Keep only the process_audio_streaming function that's actually used in the Gradio interface
-@spaces.GPU
-def process_audio_streaming(audio_file):
+# @spaces.GPU
+def process_audio_streaming(audio_file, model_choice):
+    # Load the selected model
+    model_path = model_path_1 if model_choice == "Think" else model_path_2
+    model, processor = load_model(model_path)
+
     # Load and process the audio with torchaudio
     waveform, sr = torchaudio.load(audio_file)
 
@@ -182,11 +143,14 @@
 # Create Gradio interface for audio processing
 audio_demo = gr.Interface(
     fn=process_audio_streaming,
-    inputs=
-
+    inputs=[
+        gr.Audio(type="filepath", label="Upload Audio"),
+        gr.Radio(["Think", "Think + Semantics"], label="Select Model", value="Think + Semantics")
+    ],
+    outputs=gr.Textbox(label="Generated Output", lines=30),
     title="SemThink",
-    description="Upload an audio file and the model will provide detailed analysis and description.",
-    examples=["examples/1.wav"], #
+    description="Upload an audio file and the model will provide detailed analysis and description. Choose between different model versions.",
+    examples=[["examples/1.wav", "Think + Semantics"]], # Updated default model in examples
     cache_examples=False,
     live=True # Enable live updates
 )
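The core change to app.py above is that load_model() now takes a path and memoizes the (model, processor) pair in loaded_models, so switching between the "Think" and "Think + Semantics" options in the UI does not reload weights that are already in memory. A minimal self-contained sketch of that cache pattern, with _expensive_load standing in for the real AutoProcessor/Qwen2Audio setup:

# Sketch of the load-once cache introduced in this commit.
# _expensive_load is a stand-in; it is not part of app.py.
loaded_models = {}

def _expensive_load(model_path):
    # Pretend these strings are a heavy model and its processor
    return "model-from-" + model_path, "processor-for-" + model_path

def load_model(model_path):
    # Return the cached pair if this path was loaded before
    if model_path in loaded_models:
        return loaded_models[model_path]
    model, processor = _expensive_load(model_path)
    loaded_models[model_path] = (model, processor)
    return model, processor

m1, p1 = load_model("./model")
m2, p2 = load_model("./model")
assert m1 is m2 and p1 is p2  # second call is a cache hit, no reload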
model2/README.md ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a91b1820ee38f2fe4be96b8431300dc9296ec83df43d36f32551cb1bd496b6ac
+size 5102

model2/adapter_config.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17d8a6c2d1bdf0ef57df3f012388ea935e6871857aa58176e74ceb4f3a9b098a
+size 738

model2/adapter_model.safetensors ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6fc0e27c7bc6237c5aae1ee934949e8815d3dd25db5094a25a46139bef0875e
+size 22056664

model2/added_tokens.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e3a16457638c3955f95f98446d42eab5096a074daba4dec5d569e2177568a2b
+size 77138

model2/merges.txt ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5
+size 1671853

model2/optimizer.pt ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf13cbc8b97f25e074014f88000bcaa13df6d80563f3292ef20c558639effca7
+size 44254970

model2/rng_state.pth ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bf528318c661dc0b61146db44dd65fdefa0d749d5c3cf7ad5b70a3eb0223f43
+size 14244

model2/scheduler.pt ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d40c58b45a1ae8a479cc013e372bb19d9ca5414b63e6c42a5552daa0c9020545
+size 1064

model2/special_tokens_map.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1ed3a229905e152acdb6943f501075b5957bd5774c5940edb81ec1b55e86389
+size 57715

model2/tokenizer.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fecdb47d281073055efd605d080013e3114ed0f3c5d8af201e245b199864c9c7
+size 12030779

model2/tokenizer_config.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ed216fb2f9e3f05ef5d667a2f644a2f91034b500e5224c003f1437247ad8e46
+size 638366

model2/trainer_state.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ba1de525be38f9b4f973fea466347ad7214cc12c4a07c9ba0573d6d0eb9b1c3
+size 125458

model2/training_args.bin ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2c797b9b82e633c023712edc136372641c83ad01beb94ca50f97b2639489c1e
+size 5880

model2/vocab.json ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
+size 2776833
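Because only these pointers live in the repository, one way to sanity-check a fetched LFS object is to hash it and compare against the pointer's oid and size fields. A minimal sketch using only the Python standard library; the example values in the comment are copied from the model2/adapter_config.json pointer above, and the function name is illustrative:

import hashlib
import os

def verify_lfs_object(path, expected_oid, expected_size):
    # Stream the file through SHA-256 and compare with the pointer fields
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected_oid and os.path.getsize(path) == expected_size

# e.g., after fetching the real file:
# verify_lfs_object(
#     "model2/adapter_config.json",
#     "17d8a6c2d1bdf0ef57df3f012388ea935e6871857aa58176e74ceb4f3a9b098a",
#     738,
# )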