lxysl committed
Commit fbe8f3a
1 Parent(s): 8301b1c

achieve normal interaction

Files changed (3)
  1. app.py +215 -94
  2. requirements.txt +2 -2
  3. vita/model/vita_arch.py +8 -0
app.py CHANGED
@@ -8,33 +8,42 @@ import re
 import torchaudio
 import io
 import cv2
+import time
 import math
-import spaces
 from numba import jit
+import spaces
 from huggingface_hub import snapshot_download
-
-from vita.constants import DEFAULT_AUDIO_TOKEN, DEFAULT_IMAGE_TOKEN, MAX_IMAGE_LENGTH, MIN_IMAGE_LENGTH, IMAGE_TOKEN_INDEX, AUDIO_TOKEN_INDEX
+from vita.constants import (
+    DEFAULT_AUDIO_TOKEN,
+    DEFAULT_IMAGE_TOKEN,
+    DEFAULT_VIDEO_TOKEN,
+    IGNORE_INDEX,
+    IMAGE_TOKEN_INDEX,
+    MAX_IMAGE_LENGTH,
+    MIN_IMAGE_LENGTH,
+)
 from vita.conversation import conv_templates, SeparatorStyle
-from vita.util.mm_utils import tokenizer_image_token, tokenizer_image_audio_token
+from vita.model.builder import load_pretrained_model
+from vita.util.mm_utils import (
+    KeywordsStoppingCriteria,
+    get_model_name_from_path,
+    tokenizer_image_token,
+    tokenizer_image_audio_token,
+)
+from vita.util.utils import disable_torch_init
 from PIL import Image
 from decord import VideoReader, cpu
-from vita.model.builder import load_pretrained_model
 from vita.model.vita_tts.decoder.llm2tts import llm2TTS
 from vita.model.language_model.vita_qwen2 import VITAQwen2Config, VITAQwen2ForCausalLM
-
+from vita.util.data_utils_video_audio_neg_patch import dynamic_preprocess
+from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoFeatureExtractor
 decoder_topk = 2
 codec_chunk_size = 40
 codec_padding_size = 10
 
-PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛""„‟…‧﹏."
 
-MODEL_NAME = "VITA-MLLM/VITA-1.5"
-model_path = snapshot_download(MODEL_NAME, local_dir="VITA_ckpt")
-tokenizer, model, feature_extractor, context_len = load_pretrained_model(
-    model_path, model_base=None, model_name="VITA-1.5", model_type="qwen2p5_instruct"
-)
-llm_embedding = model.get_input_embeddings().cuda()
-tts = llm2TTS(os.path.join(model_path, 'vita_tts_ckpt/'))
+PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 @jit
 def float_to_int16(audio: np.ndarray) -> np.ndarray:
@@ -42,7 +51,6 @@ def float_to_int16(audio: np.ndarray) -> np.ndarray:
     am = 32767 * 32768 // am
     return np.multiply(audio, am).astype(np.int16)
 
-
 def remove_special_characters(input_str):
     # Remove special tokens
     special_tokens = ['☞', '☟', '☜', '<unk>', '<|im_end|>']
@@ -50,7 +58,6 @@ def remove_special_characters(input_str):
         input_str = input_str.replace(token, '')
     return input_str
 
-
 def replace_equation(sentence):
     special_notations = {
         "sin": " sine ",
@@ -139,7 +146,7 @@ def is_wav(file_path):
     return ext.lower() in wav_extensions
 
 def load_model_embemding(model_path):
-    config_path = os.path.join(model_path, 'origin_config.json')
+    config_path = os.path.join(model_path, 'config.json')
     config = VITAQwen2Config.from_pretrained(config_path)
     model = VITAQwen2ForCausalLM.from_pretrained(model_path, config=config, low_cpu_mem_usage=True)
     embedding = model.get_input_embeddings()
@@ -170,14 +177,26 @@ def convert_webm_to_mp4(input_file, output_file):
         raise
 
 
-def _get_rawvideo_dec(video_path, max_frames=MAX_IMAGE_LENGTH, min_frames=MIN_IMAGE_LENGTH, video_framerate=1, s=None, e=None):
-    if s is None or e is None:
+def _get_rawvideo_dec(
+    video_path,
+    image_processor=None,
+    max_frames=MAX_IMAGE_LENGTH,
+    min_frames=MIN_IMAGE_LENGTH,
+    image_resolution=384,
+    video_framerate=1,
+    s=None,
+    e=None,
+    image_aspect_ratio="pad",
+):
+    # speed up video decode via decord.
+
+    if s is None:
         start_time, end_time = None, None
     else:
         start_time = int(s)
         end_time = int(e)
-        start_time = max(start_time, 0)
-        end_time = max(end_time, 0)
+        start_time = start_time if start_time >= 0.0 else 0.0
+        end_time = end_time if end_time >= 0.0 else 0.0
         if start_time > end_time:
             start_time, end_time = end_time, start_time
         elif start_time == end_time:
@@ -192,21 +211,58 @@ def _get_rawvideo_dec(video_path, max_frames=MAX_IMAGE_LENGTH, min_frames=MIN_IM
     f_start = 0 if start_time is None else int(start_time * fps)
     f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
     num_frames = f_end - f_start + 1
-
     if num_frames > 0:
+        # T x 3 x H x W
         sample_fps = int(video_framerate)
         t_stride = int(round(float(fps) / sample_fps))
-        all_pos = list(range(f_start, f_end + 1, t_stride))
 
+        all_pos = list(range(f_start, f_end + 1, t_stride))
         if len(all_pos) > max_frames:
-            sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
+            sample_pos = [
+                all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)
+            ]
         elif len(all_pos) < min_frames:
-            sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=min_frames, dtype=int)]
+            sample_pos = [
+                all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=min_frames, dtype=int)
+            ]
         else:
             sample_pos = all_pos
 
-        patch_images = [Image.fromarray(f).convert("RGB") for f in vreader.get_batch(sample_pos).asnumpy()]
-        return patch_images, len(patch_images)
+        patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
+
+        if image_aspect_ratio == "pad":
+
+            def expand2square(pil_img, background_color):
+                width, height = pil_img.size
+                if width == height:
+                    return pil_img
+                elif width > height:
+                    result = Image.new(pil_img.mode, (width, width), background_color)
+                    result.paste(pil_img, (0, (width - height) // 2))
+                    return result
+                else:
+                    result = Image.new(pil_img.mode, (height, height), background_color)
+                    result.paste(pil_img, ((height - width) // 2, 0))
+                    return result
+
+            patch_images = [
+                expand2square(i, tuple(int(x * 255) for x in image_processor.image_mean))
+                for i in patch_images
+            ]
+            patch_images = [
+                image_processor.preprocess(i, return_tensors="pt")["pixel_values"][0]
+                for i in patch_images
+            ]
+        else:
+            patch_images = [
+                image_processor.preprocess(i, return_tensors="pt")["pixel_values"][0]
+                for i in patch_images
+            ]
+
+        patch_images = torch.stack(patch_images)
+        slice_len = patch_images.shape[0]
+
+        return patch_images, slice_len
     else:
         print(f"video path: {video_path} error.")
 
@@ -241,6 +297,27 @@ def _parse_text(text):
 
     return "".join(lines)
 
+MODEL_NAME = "VITA-MLLM/VITA-1.5"
+model_path = snapshot_download(MODEL_NAME, local_dir="VITA_ckpt")
+model_type = "qwen2p5_instruct"
+tokenizer, model, feature_extractor, context_len = load_pretrained_model(
+    model_path, model_base=None, model_name="VITA-1.5", model_type="qwen2p5_instruct"
+)
+model.resize_token_embeddings(len(tokenizer))
+
+vision_tower = model.get_vision_tower()
+if not vision_tower.is_loaded:
+    vision_tower.load_model()
+image_processor = vision_tower.image_processor
+
+audio_encoder = model.get_audio_encoder()
+audio_encoder.to(dtype=torch.float16)
+audio_processor = audio_encoder.audio_processor
+
+model.eval()
+
+tts = llm2TTS(os.path.join(model_path, 'vita_tts_ckpt/'))
+llm_embedding = load_model_embemding(model_path).to(device)
 
 @spaces.GPU
 def predict(_chatbot, task_history):
@@ -258,13 +335,25 @@ def predict(_chatbot, task_history):
     for i, (q, a) in enumerate(task_history):
         if isinstance(q, (tuple, list)):
             if is_image(q[0]):
-                images = [Image.open(q[0]).convert("RGB")]
-                all_visual_tensor.extend(images)
+                image = Image.open(q[0]).convert("RGB")
+                image, p_num = dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=True)
+                assert len(p_num) == 1
+                image_tensor = model.process_images(image, model.config).to(
+                    dtype=model.dtype, device="cuda"
+                )
+                all_visual_tensor.append(image_tensor)
                 input_mode = 'image'
-                qs += DEFAULT_IMAGE_TOKEN * len(images) + '\n'
+                qs += DEFAULT_IMAGE_TOKEN * p_num[0] + '\n'
             elif is_video(q[0]):
-                video_frames, slice_len = _get_rawvideo_dec(q[0])
-                all_visual_tensor.extend(video_frames)
+                video_frames, slice_len = _get_rawvideo_dec(
+                    q[0],
+                    image_processor,
+                    max_frames=MAX_IMAGE_LENGTH,
+                    video_framerate=1,
+                    image_aspect_ratio=getattr(model.config, "image_aspect_ratio", None),
+                )
+                image_tensor = video_frames.half().cuda()
+                all_visual_tensor.append(image_tensor)
                 input_mode = 'video'
                 qs += DEFAULT_IMAGE_TOKEN * slice_len + '\n'
             elif is_wav(q[0]):
@@ -282,66 +371,85 @@ def predict(_chatbot, task_history):
         conv.append_message(conv.roles[0], new_q)
         conv.append_message(conv.roles[1], a)
 
+    if qs:
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+
     prompt = conv.get_prompt(input_mode)
 
-    if all_audio_path != []:
-        input_ids = tokenizer_image_audio_token(
-            prompt, tokenizer,
-            image_token_index=IMAGE_TOKEN_INDEX,
-            audio_token_index=AUDIO_TOKEN_INDEX
+    if all_audio_path:
+        # Process multiple audio clips and merge them
+        all_audio_features = []
+        all_audio_lengths = []
+        all_audio_for_llm_lens = []
+
+        for audio_path in all_audio_path:
+            audio, audio_for_llm_lens = audio_processor.process(os.path.join(audio_path))
+            all_audio_features.append(audio)
+            all_audio_lengths.append(audio.shape[0])
+            all_audio_for_llm_lens.append(audio_for_llm_lens)
+
+        # Merge audio features
+        combined_audio = torch.cat(all_audio_features, dim=0)
+        combined_audio = torch.unsqueeze(combined_audio, dim=0)
+
+        # Merge length information
+        combined_length = torch.tensor(sum(all_audio_lengths))
+        combined_length = torch.unsqueeze(combined_length, dim=0)
+
+        # Merge lengths for the LLM
+        combined_for_llm_lens = torch.tensor(sum(all_audio_for_llm_lens))
+        combined_for_llm_lens = torch.unsqueeze(combined_for_llm_lens, dim=0)
+
+        audios = dict()
+        audios["audios"] = combined_audio.half().cuda()
+        audios["lengths"] = combined_length.half().cuda()
+        audios["lengths_for_llm"] = combined_for_llm_lens.cuda()
+
+        input_ids = (
+            tokenizer_image_audio_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
+            .unsqueeze(0)
+            .cuda()
         )
-        audio_list = []
-        for single_audio_path in all_audio_path:
-            try:
-                audio, original_sr = torchaudio.load(single_audio_path)
-                target_sr = 16000
-                if original_sr != target_sr:
-                    resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=target_sr)
-                    audio = resampler(audio)
-                audio_features = feature_extractor(audio, sampling_rate=target_sr, return_tensors="pt")["input_features"]
-                audio_list.append(audio_features.squeeze(0))
-            except Exception as e:
-                print(f"Error processing {single_audio_path}: {e}")
     else:
-        input_ids = tokenizer_image_token(
-            prompt, tokenizer,
-            image_token_index=IMAGE_TOKEN_INDEX
+        # Handle the empty-audio case
+        audio = torch.zeros(400, 80)
+        audio_length = audio.shape[0]
+        audio_for_llm_lens = 60
+        audio = torch.unsqueeze(audio, dim=0)
+        audio_length = torch.unsqueeze(torch.tensor(audio_length), dim=0)
+        audio_for_llm_lens = torch.unsqueeze(torch.tensor(audio_for_llm_lens), dim=0)
+        audios = dict()
+        audios["audios"] = audio.half().cuda()
+        audios["lengths"] = audio_length.half().cuda()
+        audios["lengths_for_llm"] = audio_for_llm_lens.cuda()
+
+        input_ids = (
+            tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
+            .unsqueeze(0)
+            .cuda()
         )
-
-    if all_visual_tensor == [] and all_audio_path == []:
-        datapromt = {
-            "prompt_token_ids": input_ids,
-        }
-    elif all_visual_tensor != [] and all_audio_path == []:
-        datapromt = {
-            "prompt_token_ids": input_ids,
-            "multi_modal_data": {
-                "image": all_visual_tensor
-            },
-        }
-    elif all_visual_tensor == [] and all_audio_path != []:
-        datapromt = {
-            "prompt_token_ids": input_ids,
-            "multi_modal_data": {
-                "audio": audio_list
-            },
-        }
+
+    if len(all_visual_tensor) > 0:
+        all_visual_tensor = torch.cat(all_visual_tensor, dim=0)
     else:
-        datapromt = {
-            "prompt_token_ids": input_ids,
-            "multi_modal_data": {
-                "image": all_visual_tensor,
-                "audio": audio_list
-            },
-        }
-
-    print(datapromt)
-
+        all_visual_tensor = torch.zeros((1, 3, 448, 448)).to(dtype=model.dtype, device="cuda")
+    if type(all_visual_tensor) is list:
+        print("all_visual_tensor is a list: ", len(all_visual_tensor))
+    if type(all_visual_tensor) is torch.Tensor:
+        print("all_visual_tensor is a tensor: ", all_visual_tensor.shape)
+    # Set up stopping criteria
+    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+    keywords = [stop_str]
+    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+
+    # Generate text
+    start_time = time.time()
     with torch.inference_mode():
         output_ids = model.generate(
             input_ids,
-            images=all_visual_tensor if all_visual_tensor else None,
-            audios=audio_list if audio_list else None,
+            images=all_visual_tensor,
+            audios=audios,
             do_sample=False,
             temperature=0.01,
             top_p=None,
@@ -350,18 +458,30 @@ def predict(_chatbot, task_history):
             return_dict_in_generate=True,
            max_new_tokens=1024,
            use_cache=True,
+            stopping_criteria=[stopping_criteria],
+            shared_v_pid_stride=None,
         )
+    infer_time = time.time() - start_time
 
+    output_ids = output_ids.sequences
+    input_token_len = input_ids.shape[1]
     outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=False)[0]
+
+    outputs = outputs.strip()
+    if outputs.endswith(stop_str):
+        outputs = outputs[: -len(stop_str)]
     outputs = outputs.strip()
+
+    print(f"Generated output: {outputs}")
+    print(f"Time consumed: {infer_time}")
 
     task_history[-1] = (chat_query, outputs)
     remove_special_characters_output = remove_special_characters(outputs)
     _chatbot[-1] = (chat_query, _parse_text(remove_special_characters_output))
-    print("query", chat_query)
-    print("task_history", task_history)
+    print("query",chat_query)
+    print("task_history",task_history)
     print(_chatbot)
-    print("answer: ", outputs)
+    print("answer: ",outputs)
     yield _chatbot
 
 
@@ -393,6 +513,7 @@ def add_video(history, task_history, file):
     new_file_name = file.replace(".webm",".mp4")
     if file.endswith(".webm"):
         convert_webm_to_mp4(file, new_file_name)
+    history = history + [((new_file_name,), None)]
     task_history = task_history + [((new_file_name,), None)]
     return history, task_history
 
@@ -406,10 +527,14 @@ def reset_state(task_history):
 
 @spaces.GPU
 def stream_audio_output(history, task_history):
-    text = task_history[-1][-1]
+    print("stream_audio_output", history, task_history)
+    text = history[-1][-1]
+    print("text", text)
     if not text:
         # import pdb;pdb.set_trace()
-        yield None,None
+        yield None, None
+        return
+
     llm_resounse = replace_equation(remove_special_characters(text))
     #print('tts_text', llm_resounse)
     for idx, text in enumerate(split_into_sentences(llm_resounse)):
@@ -459,24 +584,20 @@ with gr.Blocks(title="VideoMLLM") as demo:
         ),
     )
 
-
    add_text_button.click(add_text, [chatbot, task_history, query], [chatbot, task_history], show_progress=True).then(
        reset_user_input, [], [query]
    ).then(
-        predict, [chatbot, task_history], [chatbot], show_progress=True
+        predict, [chatbot, task_history], [chatbot], show_progress=True
    ).then(
        stream_audio_output,[chatbot, task_history], [audio_output],
    )
 
-
    video_input.stop_recording(add_video, [chatbot, task_history, video_input], [chatbot, task_history], show_progress=True)
    empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
    addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
 
-
-
    add_audio_button.click(add_audio, [chatbot, task_history,record_btn], [chatbot, task_history], show_progress=True).then(
-        predict, [chatbot, task_history], [chatbot], show_progress=True
+        predict, [chatbot, task_history], [chatbot], show_progress=True
    ).then(
        stream_audio_output,[chatbot, task_history], [audio_output],
    )
requirements.txt CHANGED
@@ -114,14 +114,14 @@ starlette==0.41.3
 sympy==1.13.1
 threadpoolctl==3.5.0
 timm==1.0.15
-tokenizers==0.21.0
+tokenizers==0.20.3
 tomlkit==0.13.2
 torch==2.4.0
 torchaudio==2.4.0
 torchvision==0.19.0
 tqdm==4.67.1
 traitlets==5.14.3
-transformers==4.49.0
+transformers==4.46.3
 triton==3.0.0
 typer==0.15.1
 typing_extensions==4.12.2
vita/model/vita_arch.py CHANGED
@@ -388,6 +388,14 @@ class VITAMetaForCausalLM(ABC):
         v_start_end = []
         cur_image_idx = 0
         cur_audio_idx = 0
+        print("sum1",sum([(cur == IMAGE_TOKEN_INDEX).sum() for cur in input_ids]))
+        print("sum2",sum([(IMAGE_TOKEN_INDEX not in cur) for cur in input_ids]))
+        print("len",len(image_features))
+        if type(image_features) is list:
+            print("image_features is a list: ", len(image_features))
+            print("image_features[0] is a tensor: ", image_features[0].shape)
+        if type(image_features) is torch.Tensor:
+            print("image_features is a tensor: ", image_features.shape)
         assert (
             sum([(cur == IMAGE_TOKEN_INDEX).sum() for cur in input_ids])
             + sum([(IMAGE_TOKEN_INDEX not in cur) for cur in input_ids])