chenjoya committed
Commit b9706c3 · 1 Parent(s): 64d3228
Files changed (1)
  1. demo/infer.py +54 -48
demo/infer.py CHANGED
@@ -156,57 +156,63 @@ class LiveCCDemoInfer:
             state['past_ids'] = outputs.sequences[:, :-1]
             yield (start_timestamp, stop_timestamp), self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True), state
 
+    @torch.inference_mode()
     def video_qa(
-        model,
-        processor,
-        video_path: str,
-        query: str,
-        answer_prefix: str = '',
-        video_start: float = None,
-        video_end: float = None,
-        strict_fps: bool = False,
-        strict_abcd_ids: list[int] = None,
-        do_sample: bool = False,
-        max_new_tokens: int = 128
-    ):
-        if strict_fps:
-            video_inputs, _ = _read_video_decord_plus({'video': video_path, 'video_start': video_start, 'video_end': video_end}, strict_fps=True, drop_last=False)
-            video_inputs = _spatial_resize_video(video_inputs)
-            conversation = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "video": video_inputs},
-                        {"type": "text", "text": query},
-                    ],
-                }
-            ]
-            image_inputs = None
+        self,
+        query: str,
+        state: dict,
+        default_query: str = 'Please describe the video.',
+        do_sample: bool = False,
+        repetition_penalty: float = 1.05,
+        **kwargs,
+    ):
+        """
+        state: dict, (maybe) with keys:
+            video_path: str, video path
+            video_timestamp: float, current video timestamp
+            last_timestamp: float, last processed video timestamp
+            last_video_pts_index: int, last processed video frame index
+            video_pts: np.ndarray, video pts
+            last_history: list, last processed history
+        """
+        video_path = state.get('video_path', None)
+        if video_path:
+            message = {
+                "role": "user",
+                "content": [
+                    {"type": "video", "video": video_path},
+                    {"type": "text", "text": query if query else default_query},
+                ],
+            }
+
         else:
-            conversation = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "video", "video": video_path, "video_start": video_start, "video_end": video_end},
-                        {"type": "text", "text": query},
-                    ],
-                }
-            ]
-            image_inputs, video_inputs = process_vision_info(conversation)
-        text = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True) + answer_prefix
-        inputs = processor(
-            text=[text],
+            message = {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": query if query else default_query},
+                ],
+            }
+        image_inputs, video_inputs = process_vision_info([message])
+        texts = self.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True, return_tensors='pt')
+        past_ids = state.get('past_ids', None)
+        if past_ids is not None:
+            texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
+        inputs = self.processor(
+            text=texts,
             images=image_inputs,
             videos=video_inputs,
             return_tensors="pt",
+            return_attention_mask=False
         )
-        print(text)
-        inputs = inputs.to("cuda")
-        if not strict_abcd_ids:
-            generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=do_sample)
-            output_text = processor.decode(generated_ids[0, inputs.input_ids.size(1):], clean_up_tokenization_spaces=False)
-        else:
-            outputs = model.generate(**inputs, do_sample=do_sample, top_p=None, temperature=None, top_k=None, max_new_tokens=1, return_dict_in_generate=True, output_scores=True, repetition_penalty=1)
-            print(outputs.scores[0][0, strict_abcd_ids])
-            output_text = ['A', 'B', 'C', 'D'][outputs.scores[0][0, strict_abcd_ids].argmax()]
-        return output_text
+        inputs.to('cuda')
+        if past_ids is not None:
+            inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
+        outputs = self.model.generate(
+            **inputs, past_key_values=state.get('past_key_values', None),
+            return_dict_in_generate=True, do_sample=do_sample,
+            repetition_penalty=repetition_penalty,
+            max_new_tokens=512,
+        )
+        state['past_key_values'] = outputs.past_key_values
+        state['past_ids'] = outputs.sequences[:, :-1]
+        return self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True), state
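After this commit, video_qa is no longer a standalone helper taking model and processor; it is a stateful method of LiveCCDemoInfer that threads past_ids and past_key_values through an explicit state dict, so a follow-up question resumes from the cached context rather than re-encoding the whole conversation. A minimal usage sketch follows; how LiveCCDemoInfer is constructed is not shown in this hunk, so the constructor call and the video path below are placeholders.

# Sketch: two-turn QA with the new stateful video_qa (constructor args assumed).
infer = LiveCCDemoInfer(...)  # construct as elsewhere in demo/infer.py

state = {'video_path': 'path/to/video.mp4'}  # placeholder video
response, state = infer.video_qa(query='What is happening in the video?', state=state)
print(response)

# The returned state now carries past_ids and past_key_values, so the next
# turn continues from the cached prefix; an empty query falls back to default_query.
response, state = infer.video_qa(query='What happens next?', state=state)
print(response)

Keeping the cache in a caller-owned state dict rather than on the instance also means a single LiveCCDemoInfer can serve several independent sessions, each with its own state.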