---
license: mit
language:
- en
- de
- fr
- it
- pt
- hi
- es
- th
base_model:
- Qwen/Qwen2.5-Omni-7B
pipeline_tag: any-to-any
tags:
- gptqmodel
- FunAGI
- Qwen
- int4
---
|
|
|
|
|
This is a 4-bit quantized version of Qwen2.5-Omni-7B, produced with [GPTQModel](https://github.com/ModelCloud/GPTQModel). The quantization configuration is listed below; a sketch of how such a configuration can be expressed with GPTQModel follows the list.
|
|
|
- **bits**: 4
- **dynamic**: null
- **group_size**: 128
- **desc_act**: true
- **static_groups**: false
- **sym**: false
- **lm_head**: false
- **true_sequential**: true
- **quant_method**: "gptq"
- **checkpoint_format**: "gptq"
- **meta**:
  - **quantizer**: gptqmodel:1.1.0
  - **uri**: https://github.com/modelcloud/gptqmodel
  - **damp_percent**: 0.1
  - **damp_auto_increment**: 0.0015
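
If you want to produce a similar quantization yourself, the settings above map onto GPTQModel's `QuantizeConfig`. The snippet below is a minimal sketch based on GPTQModel's README, not the exact script used for this checkpoint: the calibration data, sample count, and `your_output_dir` are placeholders, and for this multimodal model you would also need the custom model-class registration shown in the loading guide further down.

```python
import torch
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig

# Mirror the configuration listed above (sketch only; the original calibration setup is not published here).
quant_config = QuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=True,
    sym=False,
    damp_percent=0.1,
)

# Any small text corpus can serve as calibration data; C4 is a common choice in GPTQModel examples.
calibration = load_dataset(
    "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split="train"
).select(range(512))["text"]

# Requires the qwen2_5_omni model class to be registered with GPTQModel first (see the loading guide below).
model = GPTQModel.load("Qwen/Qwen2.5-Omni-7B", quant_config)
model.quantize(calibration, batch_size=1)
model.save("your_output_dir")  # placeholder output directory
```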
|
# Model Size

| Model | FP | 4-bit |
|-------|------|-------|
| Qwen2.5-Omni-7B | 22.39 GB | 12.71 GB |
|
# Qwen 2.5 Omni Model Loading Guide
|
|
|
## Installation
|
|
|
Following the official Qwen [documentation](https://github.com/QwenLM/Qwen2.5-Omni/tree/main), install the required dependencies:
|
|
|
```bash
pip uninstall transformers
pip install git+https://github.com/huggingface/transformers@3a1ead0aabed473eafe527915eea8c197d424356
pip install accelerate
pip install qwen-omni-utils[decord]
```
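
As a quick sanity check (not part of the upstream instructions), you can confirm that the pinned `transformers` build exposes the Qwen2.5-Omni classes used below:

```python
import transformers
# These imports fail on transformers builds that predate Qwen2.5-Omni support.
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor

print("transformers version:", transformers.__version__)
```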
|
|
|
Then install GPTQModel from its [GitHub repository](https://github.com/ModelCloud/GPTQModel).
|
|
|
## Model Loading Code
|
|
|
```python
import torch
from typing import Dict

from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from transformers.utils.hub import cached_file

from gptqmodel import GPTQModel
from gptqmodel.models.base import BaseGPTQModel
from gptqmodel.models.auto import MODEL_MAP, SUPPORTED_MODELS
from gptqmodel.models._const import CPU
# move_to is GPTQModel's device-placement helper used in the hooks below;
# the exact import path may differ across GPTQModel versions.
from gptqmodel.utils.model import move_to

from qwen_omni_utils import process_mm_info
|
|
|
# Describe the Qwen2.5-Omni "thinker" to GPTQModel: which modules stay in full
# precision (base_modules) and which decoder sub-modules get quantized.
class Qwen25OmniThinkerGPTQ(BaseGPTQModel):
    loader = Qwen2_5OmniModel
    base_modules = [
        "thinker.model.embed_tokens",
        "thinker.model.norm",
        "token2wav",
        "thinker.audio_tower",
        "thinker.model.rotary_emb",
        "thinker.visual",
        "talker",
    ]
    pre_lm_head_norm_module = "thinker.model.norm"
    require_monkeypatch = False
    layers_node = "thinker.model.layers"
    layer_type = "Qwen2_5OmniDecoderLayer"
    layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"],
    ]

    def pre_quantize_generate_hook_start(self):
        # Keep the vision and audio encoders on the quantization device while
        # calibration inputs are generated.
        self.thinker.visual = move_to(self.thinker.visual, device=self.quantize_config.device)
        self.thinker.audio_tower = move_to(self.thinker.audio_tower, device=self.quantize_config.device)

    def pre_quantize_generate_hook_end(self):
        # Offload them back to CPU once calibration inputs have been produced.
        self.thinker.visual = move_to(self.thinker.visual, device=CPU)
        self.thinker.audio_tower = move_to(self.thinker.audio_tower, device=CPU)

    def preprocess_dataset(self, sample: Dict) -> Dict:
        return sample

# Register the custom class so GPTQModel recognizes the "qwen2_5_omni" model type.
MODEL_MAP["qwen2_5_omni"] = Qwen25OmniThinkerGPTQ
SUPPORTED_MODELS.append("qwen2_5_omni")
|
|
|
model_path = "/home/chentianqi/model/Qwen/Qwen2.5-Omni-7B-GPTQ-4bit"  # change to your local path or the repo id of this model

# Qwen2_5OmniModel.from_config does not load the speaker dictionary that the
# talker needs, so patch it to fetch and load spk_dict.pt from the checkpoint.
@classmethod
def patched_from_config(cls, config, *args, **kwargs):
    kwargs.pop("trust_remote_code", None)

    model = cls._from_config(config, **kwargs)
    spk_path = cached_file(
        model_path,
        "spk_dict.pt",
        subfolder=kwargs.pop("subfolder", None),
        cache_dir=kwargs.pop("cache_dir", None),
        force_download=kwargs.pop("force_download", False),
        proxies=kwargs.pop("proxies", None),
        resume_download=kwargs.pop("resume_download", None),
        local_files_only=kwargs.pop("local_files_only", False),
        token=kwargs.pop("use_auth_token", None),
        revision=kwargs.pop("revision", None),
    )
    if spk_path is None:
        raise ValueError(f"Speaker dictionary not found at {spk_path}")

    model.load_speakers(spk_path)
    return model

Qwen2_5OmniModel.from_config = patched_from_config
|
|
|
# FP (unquantized) model, for comparison:
# model = Qwen2_5OmniModel.from_pretrained(
#     model_path,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
#     attn_implementation="flash_attention_2",
# )

# GPTQ 4-bit model:
model = GPTQModel.load(
    model_path,
    device_map="cuda",
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",
)
```
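
Note that only the thinker's decoder layers (`thinker.model.layers`) are GPTQ-quantized here; the vision and audio encoders, the talker, and `token2wav` are listed in `base_modules` and are kept in their original precision, which helps explain why the 4-bit checkpoint is still 12.71 GB.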
|
## Testing the Model
|
```python
from qwen_omni_utils import process_mm_info

processor = Qwen2_5OmniProcessor.from_pretrained(model_path)

def inference(video_path, prompt, sys_prompt):
    """Run video + text inference and return the decoded text response."""
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "video", "video": video_path},
        ]},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    audios, images, videos = process_mm_info(messages, use_audio_in_video=False)
    inputs = processor(text=text, audios=audios, images=images, videos=videos, return_tensors="pt", padding=True)
    inputs = inputs.to(model.device).to(model.dtype)

    output = model.generate(**inputs, use_audio_in_video=False, return_audio=False)

    text = processor.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return text

video_path = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/screen.mp4"
prompt = "Please translate the abstract of the paper into Chinese."

# display(Video(video_path, width=640, height=360))

# Run inference with the locally loaded model.
response = inference(video_path, prompt=prompt, sys_prompt="You are a helpful assistant.")
print(response[0])
```
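
The example above requests text only (`return_audio=False`). The upstream Qwen2.5-Omni examples also return speech by passing `return_audio=True` and saving the waveform at 24 kHz with `soundfile`. The sketch below follows that upstream pattern; it assumes `soundfile` is installed, has not been validated against this 4-bit checkpoint specifically, and, per the Qwen documentation, speech output requires the model's default system prompt rather than a generic one.

```python
import soundfile as sf

# Default system prompt Qwen2.5-Omni expects for speech output (per upstream docs).
SPEECH_SYS_PROMPT = (
    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
    "capable of perceiving auditory and visual inputs, as well as generating text and speech."
)

def inference_with_audio(video_path, prompt):
    # Same preprocessing as inference() above, but also request audio from the talker.
    messages = [
        {"role": "system", "content": SPEECH_SYS_PROMPT},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"type": "video", "video": video_path},
        ]},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    audios, images, videos = process_mm_info(messages, use_audio_in_video=False)
    inputs = processor(text=text, audios=audios, images=images, videos=videos,
                       return_tensors="pt", padding=True)
    inputs = inputs.to(model.device).to(model.dtype)

    # With return_audio=True, generate returns the text token ids and a waveform tensor.
    text_ids, audio = model.generate(**inputs, use_audio_in_video=False, return_audio=True)
    sf.write("output.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
    return processor.batch_decode(text_ids, skip_special_tokens=True)
```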
|
|
|
|
|
|
|
## Notes

- The loading code shows both paths: the active GPTQ 4-bit load via `GPTQModel.load` and a commented-out full-precision load via `Qwen2_5OmniModel.from_pretrained` for comparison.
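
- `attn_implementation="flash_attention_2"` requires the `flash-attn` package and a supported GPU; if it is not available, drop the argument or switch to `attn_implementation="sdpa"`.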
|
|
|
|