Gijs Wijngaard committed on
Commit 39d1328 · 1 Parent(s): 044a34e
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md DELETED
@@ -1,12 +0,0 @@
----
-title: SemThink
-emoji: 🐨
-colorFrom: red
-colorTo: gray
-sdk: gradio
-sdk_version: 5.20.1
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,120 @@
+import os
+import re
+import gradio as gr
+import librosa
+import torch
+from transformers import AutoProcessor
+from qwen import Qwen2AudioForConditionalGeneration
+from peft import PeftModel, PeftConfig
+
+# Model path and configuration
+model_path = "./model"
+base_model_id = "Qwen/Qwen2-Audio-7B-Instruct"
+
+# Load the model and processor
+def load_model():
+    # Load the processor from the base model
+    processor = AutoProcessor.from_pretrained(
+        base_model_id,
+        trust_remote_code=True,
+    )
+
+    # Load the base model
+    base_model = Qwen2AudioForConditionalGeneration.from_pretrained(
+        base_model_id,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        device_map="auto",
+    )
+
+    # Load the LoRA adapter
+    model = PeftModel.from_pretrained(base_model, model_path)
+
+    return model, processor
+
+# Initialize model and processor
+model, processor = load_model()
+
+# Function to extract components from model output
+def extract_components(text):
+    thinking = ""
+    semantic = ""
+    answer = ""
+
+    # Extract thinking
+    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
+    if think_match:
+        thinking = think_match.group(1).strip()
+
+    # Extract semantic elements
+    semantic_match = re.search(r"<semantic_elements>(.*?)</semantic_elements>", text, re.DOTALL)
+    if semantic_match:
+        semantic = semantic_match.group(1).strip()
+
+    # Extract answer
+    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
+    if answer_match:
+        answer = answer_match.group(1).strip()
+
+    return thinking, semantic, answer
+
+# Function to process audio and return components
+def process_audio(audio_file):
+    # Decode the upload to a raw waveform at the feature extractor's sampling rate
+    sampling_rate = processor.feature_extractor.sampling_rate
+    audio, _ = librosa.load(audio_file, sr=sampling_rate)
+
+    # Create conversation format
+    conversation = [
+        {"role": "user", "content": [
+            {"type": "audio", "audio": audio_file},
+            {"type": "text", "text": "Describe the audio in detail."}
+        ]}
+    ]
+
+    # Format the chat
+    chat_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+
+    # Process the inputs (the processor expects waveforms, not file paths)
+    inputs = processor(
+        text=chat_text,
+        audios=[audio],
+        return_tensors="pt",
+        sampling_rate=sampling_rate,
+    ).to(model.device)
+
+    # Generate the output
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=768,
+            do_sample=False,
+        )
+
+    # Decode the output and keep only the assistant turn
+    generated_text = processor.tokenizer.decode(outputs[0], skip_special_tokens=False)
+    assistant_text = generated_text.split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0].strip()
+
+    # Extract components
+    thinking, semantic, answer = extract_components(assistant_text)
+
+    return thinking, semantic, answer
+
+# Create Gradio interface
+demo = gr.Interface(
+    fn=process_audio,
+    inputs=gr.Audio(type="filepath", label="Upload Audio"),
+    outputs=[
+        gr.Textbox(label="Thinking Process", lines=10),
+        gr.Textbox(label="Semantic Elements", lines=5),
+        gr.Textbox(label="Answer", lines=5)
+    ],
+    title="Qwen2Audio Audio Description Demo",
+    description="Upload an audio file and the model will provide detailed analysis and description.",
+    examples=[],  # Add example files here if available
+    cache_examples=False,
+)
+
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
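
For reference, the tagged format that extract_components assumes can be checked without loading the model. A minimal, self-contained sketch (not part of the commit; the sample string is invented):

    import re

    # Invented example of the model output format the app parses
    sample = (
        "<think>Low rumble, then a horn blast.</think>\n"
        "<semantic_elements>train, horn, rumble</semantic_elements>\n"
        "<answer>A train passes and sounds its horn.</answer>"
    )

    # re.DOTALL lets a section span multiple lines, matching the app's regexes
    print(re.search(r"<answer>(.*?)</answer>", sample, re.DOTALL).group(1).strip())
    # -> A train passes and sounds its horn.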
model/README.md ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a91b1820ee38f2fe4be96b8431300dc9296ec83df43d36f32551cb1bd496b6ac
+size 5102
model/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b69cf0e03533c61b50cedb46e542755522c3b648edcfa13797dccb3d09e597b
+size 738
model/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c55095f34cd9d4450569b93545bd40482a7a033173f2241d2a52a6b69a741d73
+size 22056664
model/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e3a16457638c3955f95f98446d42eab5096a074daba4dec5d569e2177568a2b
+size 77138
model/merges.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5
+size 1671853
model/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59927c3c5eaecef430745b66190edac4504648dac188a78b34a3e2aebfc37784
+size 44254970
model/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:000068d3df893f9adfe39892ef754472eebe0945e015804556ea73b1840be65c
+size 14244
model/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ae2a6ae5ac67c587ae44c5692203cf77c0ed6a71b7a4a293d9ec49164b5d659
+size 1064
model/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1ed3a229905e152acdb6943f501075b5957bd5774c5940edb81ec1b55e86389
+size 57715
model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fecdb47d281073055efd605d080013e3114ed0f3c5d8af201e245b199864c9c7
+size 12030779
model/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ed216fb2f9e3f05ef5d667a2f644a2f91034b500e5224c003f1437247ad8e46
+size 638366
model/trainer_state.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:917b9ca9c249f65eea8e7970856b3e03f590ee032a26a503cf78d58ac21124e6
+size 125231
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eaeb2c00be2b954438fd006b4c6ee5c73d08a402950561ead283cc8968c44a55
+size 5944
model/vocab.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca10d7e9fb3ed18575dd1e277a2579c16d108e32f27439684afa0e10b1440910
+size 2776833
requirements.txt ADDED
@@ -0,0 +1,6 @@
+gradio
+torch
+transformers
+librosa
+qwen
+peft
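
With the dependencies installed, the pipeline can be smoke-tested without the Gradio UI. A minimal sketch (assumptions: a local file sample.wav exists — it is not shipped with this commit — and a GPU is available, since importing app loads the base model and LoRA adapter at import time):

    import app  # hypothetical usage; triggers load_model() on import

    # "sample.wav" is a placeholder path for any local audio file
    thinking, semantic, answer = app.process_audio("sample.wav")
    print(answer)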