yujiepan committed · Commit 41f285a (verified) · Parent(s): 9bfebe5

Upload folder using huggingface_hub

README.md CHANGED
@@ -13,14 +13,24 @@ This tiny model is for debugging. It is randomly initialized with the config ada
 ### Example usage:
 
 ```python
+import unittest
+
+import torch
+
 import soundfile as sf
 from qwen_omni_utils import process_mm_info
-from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
+from transformers import (
+    Qwen2_5OmniForConditionalGeneration,
+    Qwen2_5OmniPreTrainedModel,
+    Qwen2_5OmniProcessor,
+)
 
 model_id = "yujiepan/qwen2.5-omni-tiny-random"
 # model = Qwen2_5OmniModel.from_pretrained(model_id, torch_dtype="auto", device_map="auto").eval()
 # We recommend enabling flash_attention_2 for better acceleration and memory saving.
-model = Qwen2_5OmniModel.from_pretrained(
+
+Qwen2_5OmniPreTrainedModel._init_weights = unittest.mock.Mock()
+model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype="auto",
     device_map="auto",
@@ -31,14 +41,16 @@ processor = Qwen2_5OmniProcessor.from_pretrained(model_id)
 conversation = [
     {
         "role": "system",
-        "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.",
+        "content": [
+            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
+        ],
     },
     {
         "role": "user",
         "content": [
             {"type": "text", "text": "Hi, can you tell me a joke?"},
-            {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"},
-            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+            # {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"},
+            # {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
             {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
         ],
     },
@@ -57,6 +69,7 @@ inputs = inputs.to(model.device).to(model.dtype)
 text_ids, audio = model.generate(
     **inputs, use_audio_in_video=True,
     thinker_max_new_tokens=16, talker_max_new_tokens=16,
+    temperature=0.1,
 )
 
 text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
@@ -71,17 +84,20 @@ sf.write(
 ### Codes to create this repo:
 
 ```python
+import unittest
 from pathlib import Path
 
 import torch
 
+import accelerate
 from huggingface_hub import hf_hub_download
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
     AutoTokenizer,
     GenerationConfig,
-    Qwen2_5OmniModel,
+    Qwen2_5OmniForConditionalGeneration,
+    Qwen2_5OmniPreTrainedModel,
     Qwen2_5OmniProcessor,
     pipeline,
     set_seed,
@@ -166,8 +182,11 @@ for _, info in spk_dict.items():
     info['cond'] = info['cond'][:, :config.token2wav_config.dit_config.enc_emb_dim].clone()
 torch.save(spk_dict, Path(save_folder, "spk_dict.pt"))
 
+# patch for non-affine layernorm
+Qwen2_5OmniPreTrainedModel._init_weights = unittest.mock.Mock()
+
 torch.set_default_dtype(torch.bfloat16)
-model = Qwen2_5OmniModel(
+model = Qwen2_5OmniForConditionalGeneration(
     config,
 )
 torch.set_default_dtype(torch.float32)
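
The `unittest.mock` patch above is commented in the diff as a "patch for non-affine layernorm". The sketch below (not from the repo; plain PyTorch, with a hypothetical `naive_init` standing in for the kind of per-module hook that `_init_weights` performs) shows the failure mode it works around: a `LayerNorm` built with `elementwise_affine=False` has no `weight`/`bias` tensors, so an init hook that unconditionally writes to `module.weight` raises on it, and replacing `_init_weights` with a Mock turns that hook into a no-op.

```python
# Sketch only: why an unconditional weight-init hook breaks on
# non-affine LayerNorm. `naive_init` is hypothetical.
import torch.nn as nn

ln = nn.LayerNorm(8, elementwise_affine=False)
print(ln.weight, ln.bias)  # None None -- no affine parameters exist

def naive_init(module):
    if isinstance(module, nn.LayerNorm):
        module.weight.data.fill_(1.0)  # AttributeError when weight is None

try:
    naive_init(ln)
except AttributeError as exc:
    print(f"init hook would fail: {exc}")
```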
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}
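
To see what this template produces, it can be rendered standalone. A minimal sketch, assuming `jinja2` is installed and the file sits in the working directory; `transformers` renders the same template through its own sandboxed Jinja environment, so this is only for inspection:

```python
# Render chat_template.jinja directly with jinja2 to inspect the prompt
# string it produces (sketch; not how transformers loads it internally).
from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("."))
template = env.get_template("chat_template.jinja")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Hi, can you tell me a joke?"},
            {"type": "image", "image": "demo.jpeg"},
        ],
    },
]
# With no system message, the template injects the default
# "You are a helpful assistant." system turn before the user turn.
print(template.render(messages=messages, add_generation_prompt=True))
```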
config.json CHANGED
@@ -1,13 +1,12 @@
 {
   "architectures": [
-    "Qwen2_5OmniModel"
+    "Qwen2_5OmniForConditionalGeneration"
   ],
   "enable_audio_output": true,
   "enable_talker": true,
   "model_type": "qwen2_5_omni",
   "talker_config": {
     "_attn_implementation_autoset": true,
-    "_name_or_path": "Qwen2.5-Omni-7B/talker",
     "architectures": [
       "Qwen2OmniTalkerForConditionalGeneration"
     ],
@@ -61,13 +60,11 @@
   },
   "thinker_config": {
     "_attn_implementation_autoset": true,
-    "_name_or_path": "Qwen2.5-Omni-7B/thinker",
     "architectures": [
       "Qwen2OmniNaViTThinkerForConditionalGeneration"
     ],
     "audio_config": {
       "_attn_implementation_autoset": true,
-      "_name_or_path": "",
       "activation_dropout": 0.0,
       "activation_function": "gelu",
       "add_cross_attention": false,
@@ -99,6 +96,7 @@
         "1": "LABEL_1"
       },
       "init_std": 0.02,
+      "initializer_range": 0.02,
       "is_decoder": false,
       "is_encoder_decoder": false,
       "label2id": {
@@ -153,13 +151,13 @@
     "ignore_index": -100,
     "image_token_index": 151655,
     "init_std": 0.02,
+    "initializer_range": 0.02,
     "model_type": "qwen2_5_omni_thinker",
     "pad_token_id": 151643,
     "position_id_per_seconds": 25,
     "seconds_per_chunk": 2,
     "text_config": {
       "_attn_implementation_autoset": false,
-      "_name_or_path": "",
       "add_cross_attention": false,
       "architectures": null,
       "attention_dropout": 0.0,
@@ -185,6 +183,7 @@
         "1": "LABEL_1"
       },
       "init_std": 0.02,
+      "initializer_range": 0.02,
       "intermediate_size": 32,
       "is_decoder": false,
       "is_encoder_decoder": false,
@@ -251,7 +250,6 @@
     "video_token_index": 151656,
     "vision_config": {
       "_attn_implementation_autoset": true,
-      "_name_or_path": "",
       "add_cross_attention": false,
       "architectures": null,
       "bad_words_ids": null,
@@ -283,6 +281,7 @@
       "in_channels": 3,
       "in_chans": 3,
       "init_std": 0.02,
+      "initializer_range": 0.02,
       "intermediate_size": 32,
       "is_decoder": false,
       "is_encoder_decoder": false,
@@ -340,7 +339,6 @@
     "_attn_implementation_autoset": true,
     "bigvgan_config": {
       "_attn_implementation_autoset": true,
-      "_name_or_path": "",
       "add_cross_attention": false,
       "architectures": null,
       "bad_words_ids": null,
@@ -425,7 +423,6 @@
     },
     "dit_config": {
       "_attn_implementation_autoset": true,
-      "_name_or_path": "",
       "add_cross_attention": false,
       "architectures": null,
       "bad_words_ids": null,
@@ -534,5 +531,5 @@
     "model_type": "qwen2_5_omni_token2wav"
   },
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.50.0.dev0"
+  "transformers_version": "4.52.0.dev0"
 }
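
The substantive config changes are the renamed top-level architecture (`Qwen2_5OmniForConditionalGeneration`) and the `initializer_range` added next to each `init_std`, presumably because the newer transformers init code reads `initializer_range`. A quick sanity check; the attribute paths here are assumptions mirroring the JSON nesting in the diff above:

```python
# Sketch: confirm the updated fields after loading the config
# (attribute paths assumed from the JSON structure in the diff).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("yujiepan/qwen2.5-omni-tiny-random")
print(config.architectures)  # expect ['Qwen2_5OmniForConditionalGeneration']
print(config.thinker_config.text_config.initializer_range)  # expect 0.02
```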
generation_config.json CHANGED
@@ -1,4 +1,4 @@
 {
   "_from_model_config": true,
-  "transformers_version": "4.50.0.dev0"
+  "transformers_version": "4.52.0.dev0"
 }
tokenizer_config.json CHANGED
@@ -197,7 +197,6 @@
   "audio_eos_token": "<|audio_eos|>",
   "audio_token": "<|AUDIO|>",
   "bos_token": null,
-  "chat_template": "{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",