---
library_name: transformers
pipeline_tag: any-to-any
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
---

This tiny model is for debugging. It is randomly initialized with the config adapted from [Qwen/Qwen2.5-Omni-7B](https://huggingface.co/Qwen/Qwen2.5-Omni-7B).

### Example usage:

```python
import unittest.mock

import torch
import soundfile as sf
from qwen_omni_utils import process_mm_info
from transformers import (
    Qwen2_5OmniForConditionalGeneration,
    Qwen2_5OmniPreTrainedModel,
    Qwen2_5OmniProcessor,
)

model_id = "tiny-random/qwen2.5-omni"

# model = Qwen2_5OmniModel.from_pretrained(model_id, torch_dtype="auto", device_map="auto").eval()
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# Skip the default weight re-initialization (patch for non-affine layernorm; see the creation script below).
Qwen2_5OmniPreTrainedModel._init_weights = unittest.mock.Mock()
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    attn_implementation="flash_attention_2",
).eval()
processor = Qwen2_5OmniProcessor.from_pretrained(model_id)

conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Hi, can you tell me a joke?"},
            # {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"},
            # {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
            {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
        ],
    },
]

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
print('Audios:', audios)
print('Images:', images)
print('Videos:', videos)
inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True)
inputs = inputs.to(model.device).to(model.dtype)

# Inference: generation of the output text and audio
text_ids, audio = model.generate(
    **inputs,
    use_audio_in_video=True,
    thinker_max_new_tokens=16,
    talker_max_new_tokens=16,
    temperature=0.1,
)

text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text, '\n' * 3)
sf.write(
    "/tmp/output.wav",
    audio.reshape(-1).detach().cpu().numpy(),
    samplerate=24000,
)
```

### Codes to create this repo:

```python
import unittest.mock
from pathlib import Path

import accelerate
import torch
from huggingface_hub import hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    Qwen2_5OmniForConditionalGeneration,
    Qwen2_5OmniPreTrainedModel,
    Qwen2_5OmniProcessor,
    pipeline,
    set_seed,
)

source_model_id = "Qwen/Qwen2.5-Omni-7B"
save_folder = "/tmp/tiny-random/qwen2.5-omni"

processor = Qwen2_5OmniProcessor.from_pretrained(
    source_model_id,
    trust_remote_code=True,
)
processor.save_pretrained(save_folder)

config = AutoConfig.from_pretrained(
    source_model_id,
    trust_remote_code=True,
)

OUTPUT_DIM = 16

# Shrink the talker.
config.talker_config.num_hidden_layers = 1
config.talker_config.hidden_size = 16
config.talker_config.embedding_size = OUTPUT_DIM
config.talker_config.head_dim = 16
config.talker_config.num_attention_heads = 1
config.talker_config.num_key_value_heads = 1
config.talker_config.intermediate_size = 32
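# Note: the asserts below simply sanity-check the shrunken multimodal-RoPE split:
# 2 * sum(mrope_section) must equal the per-head width hidden_size / num_attention_heads
# (here 2 * (2 + 2 + 4) = 16 and 16 / 1 = 16).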
config.talker_config.rope_scaling['mrope_section'] = [2, 2, 4]
assert 2 * sum(config.talker_config.rope_scaling['mrope_section']) \
    == config.talker_config.hidden_size / config.talker_config.num_attention_heads

# Shrink the thinker's audio encoder.
config.thinker_config.audio_config.num_hidden_layers = 1
config.thinker_config.audio_config.encoder_layers = 1
config.thinker_config.audio_config.d_model = 16
config.thinker_config.audio_config.encoder_attention_heads = 1
config.thinker_config.audio_config.encoder_ffn_dim = 32
config.thinker_config.audio_config.output_dim = OUTPUT_DIM

# Shrink the thinker's text model.
config.thinker_config.text_config.num_hidden_layers = 1
config.thinker_config.text_config.hidden_size = OUTPUT_DIM
config.thinker_config.text_config.intermediate_size = 32
config.thinker_config.text_config.num_attention_heads = 1
config.thinker_config.text_config.num_key_value_heads = 1
config.thinker_config.text_config.rope_scaling['mrope_section'] = [2, 2, 4]
assert 2 * sum(config.thinker_config.text_config.rope_scaling['mrope_section']) \
    == config.thinker_config.text_config.hidden_size / config.thinker_config.text_config.num_attention_heads

# Shrink the thinker's vision encoder.
config.thinker_config.vision_config.depth = 2
config.thinker_config.vision_config.embed_dim = 16
config.thinker_config.vision_config.hidden_size = 16
config.thinker_config.vision_config.intermediate_size = 32
config.thinker_config.vision_config.out_hidden_size = OUTPUT_DIM
config.thinker_config.vision_config.num_heads = 1
config.thinker_config.vision_config.fullatt_block_indexes = [1]

# Shrink the token2wav vocoder (BigVGAN + DiT).
config.token2wav_config.bigvgan_config.resblock_dilation_sizes = [[1, 3, 5]]
config.token2wav_config.bigvgan_config.resblock_kernel_sizes = [7]
config.token2wav_config.bigvgan_config.upsample_initial_channel = 32
config.token2wav_config.bigvgan_config.upsample_kernel_sizes = [11, 4]
config.token2wav_config.bigvgan_config.upsample_rates = [5, 2]
config.token2wav_config.dit_config.depth = 2
config.token2wav_config.dit_config.num_hidden_layers = 2
config.token2wav_config.dit_config.hidden_size = 16
config.token2wav_config.dit_config.dim = 16
config.token2wav_config.dit_config.emb_dim = 16
config.token2wav_config.dit_config.enc_attention_channels = 16
config.token2wav_config.dit_config.enc_channels = [32, 32, 32]
config.token2wav_config.dit_config.enc_dilations = [1, 3, 4]
config.token2wav_config.dit_config.enc_kernel_sizes = [5, 3, 1]
config.token2wav_config.dit_config.enc_dim = 16
config.token2wav_config.dit_config.enc_emb_dim = 16
config.token2wav_config.dit_config.enc_lin_neurons = 16
config.token2wav_config.dit_config.head_dim = 16
config.token2wav_config.dit_config.num_attention_heads = 1
config.token2wav_config.dit_config.heads = 1
config.token2wav_config.dit_config.look_ahead_layers = [1]
config.token2wav_config.dit_config.look_backward_layers = [0]
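# Note: every submodule above (talker, thinker audio/text/vision, token2wav BigVGAN/DiT) is now
# down to 1-2 layers, single attention heads, and 16/32-dim projections, so the randomly
# initialized checkpoint stays tiny compared with the 7B source model.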
# Avoid a mismatch in vocab size, because this is a random model!
config.token2wav_config.dit_config.num_embeds = config.talker_config.vocab_size
print(config)

spk_dict = torch.load(hf_hub_download(source_model_id, 'spk_dict.pt', repo_type='model'))
for _, info in spk_dict.items():
    # Slice each speaker's 'cond' tensor down to the shrunken enc_emb_dim so it matches the tiny DiT.
    info['cond'] = info['cond'][:, :config.token2wav_config.dit_config.enc_emb_dim].clone()
torch.save(spk_dict, Path(save_folder, "spk_dict.pt"))

# Patch for non-affine layernorm.
Qwen2_5OmniPreTrainedModel._init_weights = unittest.mock.Mock()
torch.set_default_dtype(torch.bfloat16)
model = Qwen2_5OmniForConditionalGeneration(config)
torch.set_default_dtype(torch.float32)
model.generation_config = GenerationConfig.from_pretrained(
    source_model_id,
    trust_remote_code=True,
)
set_seed(42)
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.5)
        print(name, p.shape, p.dtype)
model.save_pretrained(save_folder)
```
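As a quick sanity check, here is a minimal sketch (assuming the same `save_folder` path used above) that reloads the tiny checkpoint and prints its parameter count and dtype:

```python
import unittest.mock

from transformers import (
    Qwen2_5OmniForConditionalGeneration,
    Qwen2_5OmniPreTrainedModel,
    Qwen2_5OmniProcessor,
)

save_folder = "/tmp/tiny-random/qwen2.5-omni"  # same path the creation script wrote to

# Same workaround as above: skip the default weight re-initialization on load.
Qwen2_5OmniPreTrainedModel._init_weights = unittest.mock.Mock()
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    save_folder, torch_dtype="auto"
).eval()
processor = Qwen2_5OmniProcessor.from_pretrained(save_folder)

n_params = sum(p.numel() for p in model.parameters())
print(f"parameters: {n_params:,}, dtype: {model.dtype}")  # should be far below the 7B source model
```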