Gijs Wijngaard committed
Commit b95f6f3 · 1 Parent(s): 84c21bd

Add semantics
.gitattributes CHANGED
@@ -47,3 +47,17 @@ model/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
 model/trainer_state.json filter=lfs diff=lfs merge=lfs -text
 model/training_args.bin filter=lfs diff=lfs merge=lfs -text
 model/added_tokens.json filter=lfs diff=lfs merge=lfs -text
+model2/merges.txt filter=lfs diff=lfs merge=lfs -text
+model2/rng_state.pth filter=lfs diff=lfs merge=lfs -text
+model2/scheduler.pt filter=lfs diff=lfs merge=lfs -text
+model2/special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
+model2/training_args.bin filter=lfs diff=lfs merge=lfs -text
+model2/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
+model2/added_tokens.json filter=lfs diff=lfs merge=lfs -text
+model2/optimizer.pt filter=lfs diff=lfs merge=lfs -text
+model2/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+model2/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+model2/vocab.json filter=lfs diff=lfs merge=lfs -text
+model2/README.md filter=lfs diff=lfs merge=lfs -text
+model2/adapter_config.json filter=lfs diff=lfs merge=lfs -text
+model2/adapter_model.safetensors filter=lfs diff=lfs merge=lfs -text
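Each added line routes matching files through the Git LFS filter (with -text disabling newline conversion); running `git lfs track` on a pattern appends exactly this kind of entry. As a minimal illustrative sketch (not part of the commit), the tracked patterns can be read back out of the file like so:

# Sketch: list the patterns that .gitattributes routes through Git LFS.
# Assumes it runs at the repository root.
def lfs_tracked_patterns(path=".gitattributes"):
    patterns = []
    with open(path) as f:
        for line in f:
            parts = line.split()
            # An LFS rule looks like: <pattern> filter=lfs diff=lfs merge=lfs -text
            if parts and "filter=lfs" in parts[1:]:
                patterns.append(parts[0])
    return patterns

print(lfs_tracked_patterns())  # e.g. ['model/trainer_state.json', ..., 'model2/vocab.json']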
app.py CHANGED
@@ -1,4 +1,4 @@
-import spaces
+# import spaces
 import os
 import re
 import gradio as gr
@@ -9,19 +9,27 @@ from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration, TextIteratorStreamer
 import torchaudio
 from threading import Thread
 
-# Model path and configuration
-model_path = "./model"
+# Model paths and configuration
+model_path_1 = "./model"
+model_path_2 = "./model2"
 base_model_id = "Qwen/Qwen2-Audio-7B-Instruct"
 
+# Dictionary to store loaded models and processors
+loaded_models = {}
+
 # Load the model and processor
-def load_model():
+def load_model(model_path):
+    # Check if model is already loaded
+    if model_path in loaded_models:
+        return loaded_models[model_path]
+
     # Load the processor from the base model
     processor = AutoProcessor.from_pretrained(
        base_model_id,
        trust_remote_code=True,
     )
 
-    # Load the base model
+    # Load the model
     model = Qwen2AudioForConditionalGeneration.from_pretrained(
         model_path,
         torch_dtype=torch.bfloat16,
@@ -31,68 +39,14 @@ def load_model():
 
     model.eval()
 
-    return model, processor
-
-# Initialize model and processor
-model, processor = load_model()
-
-# Function to extract components from model output
-def extract_components(text):
-    thinking = ""
-    semantic = ""
-    answer = ""
-
-    # Extract thinking
-    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
-    if think_match:
-        thinking = think_match.group(1).strip()
-
-    # Extract semantic elements
-    semantic_match = re.search(r"<semantic_elements>(.*?)</semantic_elements>", text, re.DOTALL)
-    if semantic_match:
-        semantic = semantic_match.group(1).strip()
-
-    # Extract answer
-    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
-    if answer_match:
-        answer = answer_match.group(1).strip()
-
-    return thinking, semantic, answer
-
-# Function to handle chat messages
-def chat(message, history):
-    chat = []
-    for item in history:
-        chat.append({"role": "user", "content": item[0]})
-        if item[1] is not None:
-            chat.append({"role": "assistant", "content": item[1]})
-    chat.append({"role": "user", "content": message})
-    messages = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
-    # Tokenize the messages string
-    model_inputs = processor([messages], return_tensors="pt").to(model.device)
-    streamer = TextIteratorStreamer(
-        processor.tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        model_inputs,
-        streamer=streamer,
-        max_new_tokens=1024,
-        do_sample=True,
-        top_p=0.95,
-        top_k=1000,
-        temperature=0.75,
-        num_beams=1,
-    )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-
-    # Accumulate the generated text as it streams
-    partial_text = ""
-    for new_text in streamer:
-        partial_text += new_text
-        # Yield the partial text so the conversation display updates live
-        yield partial_text
-
+    # Store in cache
+    loaded_models[model_path] = (model, processor)
+
+    return model, processor
+
+# Initialize first model and processor
+model, processor = load_model(model_path_1)
+
 
 def process_output(output):
     if "<think>" in output:
@@ -106,18 +60,25 @@ def process_output(output):
         output = "<answer>\n" + rest
     elif "</think>" in output:
         rest = output.split("</think>")[0]
-        output = rest + "\n</think>\n"
+        output = rest + "\n</think>\n\n"
     elif "</semantic_elements>" in output:
         rest = output.split("</semantic_elements>")[0]
-        output = rest + "\n</semantic_elements>\n"
+        output = rest + "\n</semantic_elements>\n\n"
     elif "</answer>" in output:
         rest = output.split("</answer>")[0]
         output = rest + "\n</answer>\n"
+    output = output.replace("\\n", "\n")
+    output = output.replace("\\", "\n")
+    output = output.replace("\n-", "-")
     return output
 
 # Keep only the process_audio_streaming function that's actually used in the Gradio interface
-@spaces.GPU
-def process_audio_streaming(audio_file):
+# @spaces.GPU
+def process_audio_streaming(audio_file, model_choice):
+    # Load the selected model
+    model_path = model_path_1 if model_choice == "Think" else model_path_2
+    model, processor = load_model(model_path)
+
     # Load and process the audio with torchaudio
     waveform, sr = torchaudio.load(audio_file)
 
@@ -182,11 +143,14 @@ def process_audio_streaming(audio_file):
 # Create Gradio interface for audio processing
 audio_demo = gr.Interface(
     fn=process_audio_streaming,
-    inputs=gr.Audio(type="filepath", label="Upload Audio"),
-    outputs=gr.Textbox(label="Generated Output", lines=24),
+    inputs=[
+        gr.Audio(type="filepath", label="Upload Audio"),
+        gr.Radio(["Think", "Think + Semantics"], label="Select Model", value="Think + Semantics")
+    ],
+    outputs=gr.Textbox(label="Generated Output", lines=30),
     title="SemThink",
-    description="Upload an audio file and the model will provide detailed analysis and description.",
-    examples=["examples/1.wav"],  # Add example files here if available
+    description="Upload an audio file and the model will provide detailed analysis and description. Choose between different model versions.",
+    examples=[["examples/1.wav", "Think + Semantics"]],  # Updated default model in examples
     cache_examples=False,
     live=True  # Enable live updates
 )
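The main structural change in app.py is the path-keyed cache, so switching models in the UI reloads weights at most once per path. A standalone sketch of the same pattern, with a placeholder loader (`load_heavy` and `get_model` are hypothetical names, not the Space's code):

_cache = {}

def load_heavy(path):
    # Stand-in for the expensive from_pretrained call in the real app
    print(f"loading {path} ...")
    return object()

def get_model(path):
    # Serve from cache when this path has been loaded before
    if path not in _cache:
        _cache[path] = load_heavy(path)
    return _cache[path]

m1 = get_model("./model")   # prints "loading ./model ..."
m2 = get_model("./model")   # cache hit: no second load
assert m1 is m2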
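The deleted chat() helper also documents the standard transformers streaming recipe, which process_audio_streaming (its body sits outside this diff) presumably still relies on: model.generate runs in a background thread while TextIteratorStreamer yields decoded text for live display. A condensed sketch of that recipe, using a small text-only placeholder model rather than Qwen2-Audio:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")            # placeholder model for illustration
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("Describe the sound:", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a thread while the main thread drains the streamer
Thread(target=model.generate,
       kwargs=dict(inputs, streamer=streamer, max_new_tokens=32)).start()

partial = ""
for piece in streamer:
    partial += piece  # in the app, `partial` is yielded to Gradio after each piece
print(partial)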
model2/README.md ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a91b1820ee38f2fe4be96b8431300dc9296ec83df43d36f32551cb1bd496b6ac
+size 5102
model2/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17d8a6c2d1bdf0ef57df3f012388ea935e6871857aa58176e74ceb4f3a9b098a
+size 738
model2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6fc0e27c7bc6237c5aae1ee934949e8815d3dd25db5094a25a46139bef0875e
+size 22056664
model2/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e3a16457638c3955f95f98446d42eab5096a074daba4dec5d569e2177568a2b
+size 77138
model2/merges.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5
+size 1671853
model2/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf13cbc8b97f25e074014f88000bcaa13df6d80563f3292ef20c558639effca7
+size 44254970
model2/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bf528318c661dc0b61146db44dd65fdefa0d749d5c3cf7ad5b70a3eb0223f43
+size 14244
model2/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d40c58b45a1ae8a479cc013e372bb19d9ca5414b63e6c42a5552daa0c9020545
+size 1064
model2/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1ed3a229905e152acdb6943f501075b5957bd5774c5940edb81ec1b55e86389
+size 57715
model2/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fecdb47d281073055efd605d080013e3114ed0f3c5d8af201e245b199864c9c7
+size 12030779
model2/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ed216fb2f9e3f05ef5d667a2f644a2f91034b500e5224c003f1437247ad8e46
+size 638366
model2/trainer_state.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ba1de525be38f9b4f973fea466347ad7214cc12c4a07c9ba0573d6d0eb9b1c3
+size 125458
model2/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2c797b9b82e633c023712edc136372641c83ad01beb94ca50f97b2639489c1e
+size 5880
model2/vocab.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
+size 2776833
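All of the model2/ additions are Git LFS pointer files rather than the real content: three lines giving the pointer spec version, the SHA-256 of the stored blob, and its size in bytes (adapter_model.safetensors, for example, resolves to a ~22 MB blob). A minimal sketch of reading those fields back, assuming a checkout where the files are still un-smudged pointers:

def read_lfs_pointer(path):
    # Parse the "key value" lines of a Git LFS pointer file
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = read_lfs_pointer("model2/adapter_model.safetensors")
print(ptr["oid"], int(ptr["size"]))  # sha256:b6fc0e27... 22056664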