Video Inference --> RuntimeError: torch.cat(): expected a non-empty list of Tensors
I'm trying to run video inference:
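(For context, I load the model and processor roughly like this; the exact checkpoint name below is a placeholder for the Qwen2.5-VL instruct checkpoint I'm using.)

import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

# Sketch of my setup -- the checkpoint name is a placeholder.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")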
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": "/home/dtesta/MAIA_def/dataset_def/videos/3.mp4",
                "max_pixels": 360 * 420,
                "fps": 1.0,
            },
            {"type": "text", "text": "Describe this video."},
        ],
    }
]
# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
    **video_kwargs,
).to(model.device)
model.eval()

with torch.no_grad():
    print("Running forward pass with output_hidden_states=True...")
    outputs = model(
        **inputs,
        output_hidden_states=True,
    )
    print("Forward pass complete.")
Instead of completing, the forward pass fails with this traceback:

RuntimeError Traceback (most recent call last)
Cell In[7], line 5
2 with torch.no_grad():
3 print("Running forward pass with output_hidden_states=True...")
----> 4 outputs = model(
5 **inputs,
6 output_hidden_states=True
7 )
8 print("Forward pass complete.")
File ~/miniconda3/envs/MAIA.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/MAIA.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
...
-> 1684 llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
1685 position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
1686 mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
RuntimeError: torch.cat(): expected a non-empty list of Tensors
Why does this happen? The failing line is the torch.cat over llm_pos_ids_list inside get_rope_index, so that list is apparently coming back empty for my sample.
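One thing I want to verify is that the video actually survives preprocessing, i.e. that the prompt contains video placeholder tokens and that the processor returned video features. Roughly this (a sketch; video_token_id and the pixel_values_videos / video_grid_thw key names are my assumptions about the Qwen2.5-VL config and processor outputs):

# Sketch of the checks I have in mind -- attribute/key names are assumptions, not verified.
video_token_id = model.config.video_token_id  # assumed attribute on the Qwen2.5-VL config
n_video_tokens = (inputs["input_ids"] == video_token_id).sum().item()
print("video placeholder tokens in input_ids:", n_video_tokens)
print("keys returned by the processor:", list(inputs.keys()))
if "video_grid_thw" in inputs:
    print("video_grid_thw:", inputs["video_grid_thw"])
if "pixel_values_videos" in inputs:
    print("pixel_values_videos shape:", inputs["pixel_values_videos"].shape)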
I also called get_rope_index manually on the same inputs:
model.get_rope_index(
    input_ids=inputs["input_ids"],
    video_grid_thw=inputs["video_grid_thw"],
    attention_mask=inputs["attention_mask"],
    second_per_grid_ts=inputs["second_per_grid_ts"],
)
and this is the output:
(tensor([[[ 0, 1, 2, ..., 79, 80, 81]],
[[ 0, 1, 2, ..., 79, 80, 81]],
[[ 0, 1, 2, ..., 79, 80, 81]]], device='cuda:0'),
tensor([[-2643]], device='cuda:0'))
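I'm also not sure how to interpret the large negative mrope delta (-2643). One more check I plan to run is whether the rendered chat template even contains the video placeholder before tokenization (a sketch; "<|vision_start|>" and "<|video_pad|>" are my assumption of the special tokens the Qwen2.5-VL tokenizer uses):

# Sketch: inspect the templated prompt produced by apply_chat_template.
print(text)
print("contains <|vision_start|>:", "<|vision_start|>" in text)
print("contains <|video_pad|>:", "<|video_pad|>" in text)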