lucas-ventura committed
Commit 90559ad · verified · 1 Parent(s): 36a1678

Upload 4 files

Files changed (4)
  1. llama_inference.py +204 -0
  2. single_video.py +70 -0
  3. utils_asr.py +95 -0
  4. vidchapters.py +107 -0
llama_inference.py ADDED
@@ -0,0 +1,204 @@
from pathlib import Path

import torch
from llama_cookbook.inference.model_utils import load_model as load_model_llamarecipes
from llama_cookbook.inference.model_utils import load_peft_model
from transformers import AutoTokenizer

from src.utils import RankedLogger

log = RankedLogger(__name__, rank_zero_only=True)


def load_model(
    ckpt_path, quantization=None, use_fast_kernels=False, peft_model=False, **kwargs
):
    model = load_model_llamarecipes(
        model_name=ckpt_path,
        quantization=quantization,
        use_fast_kernels=use_fast_kernels,
        device_map="auto",
        **kwargs,
    )
    if peft_model:
        model = load_peft_model(model, peft_model)

    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    tokenizer.pad_token = tokenizer.eos_token
    # special_tokens = {"additional_special_tokens": ["<image>"]}
    # tokenizer.add_special_tokens(special_tokens)

    return model, tokenizer


@torch.no_grad()
def inference(
    model,
    tokenizer: AutoTokenizer,
    prompt: str,
    add_special_tokens: bool = True,
    temperature: float = 1.0,
    max_new_tokens=1024,
    top_p: float = 1.0,
    top_k: int = 50,
    use_cache: bool = True,
    max_padding_length: int = None,
    do_sample: bool = False,
    min_length: int = None,
    repetition_penalty: float = 1.0,
    length_penalty: int = 1,
    max_prompt_tokens: int = 35_000,
    **kwargs,
):
    """
    temperature: float, optional (default=1.0) The value used to modulate the next token probabilities.
    max_new_tokens: int, optional (default=1024) The maximum number of tokens to generate.
    top_p: float, optional (default=1.0) If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    top_k: int, optional (default=50) The number of highest probability vocabulary tokens to keep for top-k filtering.
    use_cache: bool, optional (default=True) Whether or not the model should use the past key/values attentions (if applicable to the model) to speed up decoding.
    max_padding_length: int, optional (default=None) The max padding length to be used when the tokenizer pads the prompts.
    do_sample: bool, optional (default=False) Whether or not to use sampling; use greedy decoding otherwise.
    min_length: int, optional (default=None) The minimum length of the sequence to be generated (input prompt + min_new_tokens).
    repetition_penalty: float, optional (default=1.0) The parameter for repetition penalty. 1.0 means no penalty.
    length_penalty: int, optional (default=1) Exponential penalty to the length that is used with beam-based generation.
    """
    if add_special_tokens:
        prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
        # prompt = f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

    batch = tokenizer(
        prompt,
        truncation=True,
        max_length=max_padding_length,
        return_tensors="pt",
    )

    # if the input is too long, return the length of the input
    n_tokens = len(batch["input_ids"][0])
    if max_prompt_tokens is not None and n_tokens > max_prompt_tokens:
        return n_tokens

    batch = {k: v.to("cuda") for k, v in batch.items()}

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    try:
        outputs = model.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            top_p=top_p,
            temperature=temperature,
            min_length=min_length,
            use_cache=use_cache,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            eos_token_id=terminators,
            pad_token_id=tokenizer.eos_token_id,
            **kwargs,
        )
        output_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

        output = output_text.split("<|start_header_id|>assistant<|end_header_id|>")[1]
        output = output.strip()
        output = output.removesuffix("<|eot_id|>")

    except torch.cuda.OutOfMemoryError as e:
        log.error(f"CUDA out of memory error: {e}")
        torch.cuda.empty_cache()
        return n_tokens

    return output


class LlamaInference:
    def __init__(
        self,
        ckpt_path,
        quantization=None,
        use_fast_kernels=False,
        peft_model=False,
        add_special_tokens: bool = True,
        temperature: float = 1.0,
        max_new_tokens: int = 1024,
        top_p: float = 1.0,
        top_k: int = 50,
        use_cache: bool = True,
        max_padding_length: int = None,
        do_sample: bool = False,
        min_length: int = None,
        repetition_penalty: float = 1.0,
        length_penalty: int = 1,
        max_prompt_tokens: int = 35_000,
        **kwargs,
    ):
        # Check if LLaMA model exists
        # if not Path(ckpt_path).exists():
        #     log.warning(f"Model checkpoint does not exist at {ckpt_path}")
        #     return None

        # If a PEFT model is specified, check that it exists
        if peft_model and not Path(peft_model).exists():
            log.warning(f"PEFT model does not exist at {peft_model}")
            return None
        if peft_model:
            log.info(f"PEFT model found at {peft_model}")

        model = load_model_llamarecipes(
            model_name=ckpt_path,
            quantization=quantization,
            use_fast_kernels=use_fast_kernels,
            device_map="auto",
            **kwargs,
        )
        if peft_model:
            model = load_peft_model(model, peft_model)

        model.eval()

        tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
        tokenizer.pad_token = tokenizer.eos_token

        self.model = model
        self.tokenizer = tokenizer
        self.add_special_tokens = add_special_tokens
        self.temperature = temperature
        self.max_new_tokens = max_new_tokens
        self.top_p = top_p
        self.top_k = top_k
        self.use_cache = use_cache
        self.max_padding_length = max_padding_length
        self.do_sample = do_sample
        self.min_length = min_length
        self.repetition_penalty = repetition_penalty
        self.length_penalty = length_penalty
        self.max_prompt_tokens = max_prompt_tokens

    def __call__(self, prompt: str, **kwargs):
        # Create a dict of default parameters from instance attributes
        params = {
            "model": self.model,
            "tokenizer": self.tokenizer,
            "prompt": prompt,
            "add_special_tokens": self.add_special_tokens,
            "temperature": self.temperature,
            "max_new_tokens": self.max_new_tokens,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "use_cache": self.use_cache,
            "max_padding_length": self.max_padding_length,
            "do_sample": self.do_sample,
            "min_length": self.min_length,
            "repetition_penalty": self.repetition_penalty,
            "length_penalty": self.length_penalty,
            "max_prompt_tokens": self.max_prompt_tokens,
        }

        # Update with any overrides passed in kwargs
        params.update(kwargs)

        return inference(**params)
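
For reference, a minimal usage sketch of LlamaInference (not part of the uploaded files); the checkpoint path below is a placeholder, not a value from this commit:

from llama_inference import LlamaInference

# Hypothetical base checkpoint; substitute your own path or hub ID, and optionally a PEFT adapter dir.
llm = LlamaInference(
    ckpt_path="meta-llama/Llama-3.1-8B-Instruct",
    peft_model=False,
    max_new_tokens=1024,
    do_sample=False,
)

answer = llm("Summarize the following transcript: ...")
# Returns the generated text, or an int (the prompt's token count) if the prompt exceeded max_prompt_tokens.
print(answer)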
single_video.py ADDED
@@ -0,0 +1,70 @@
from pathlib import Path

from lutils import openf, writef

from src.data.chapters import sec_to_hms
from tools.extract.asr import ASRProcessor


class SingleVideo:
    """
    A simplified implementation of the src.data.chapters.Chapters interface for single video inference.

    This class mimics the behavior of the ChaptersASR class but is designed to work with
    a single video file rather than a dataset. It provides the necessary methods
    required by the PromptASR class for generating chapter timestamps and titles.

    Note: This class is intended for inference only and should not be used for
    training or evaluation purposes.
    """

    def __init__(self, video_path: Path):
        self.video_path = video_path
        self.video_ids = [video_path.stem]
        assert video_path.exists(), f"Video file {video_path} not found"
        self.asr, self.duration = get_asr(video_path, overwrite=True)

    def __len__(self):
        return len(self.video_ids)

    def __iter__(self):
        return iter(self.video_ids)

    def __contains__(self, vid_id):
        return vid_id in self.video_ids

    def get_duration(self, vid_id, hms=False):
        assert vid_id == self.video_ids[0], f"Invalid video ID: {vid_id}"
        if hms:
            return sec_to_hms(self.duration)
        return self.duration

    def get_asr(self, vid_id):
        assert vid_id == self.video_ids[0], f"Invalid video ID: {vid_id}"
        return self.asr


def get_asr(video_path: Path, overwrite=False):
    output_dir = Path(f"outputs/inference/{video_path.stem}")
    asr_output = output_dir / "asr.txt"
    duration_output = output_dir / "duration.txt"
    if asr_output.exists() and duration_output.exists() and not overwrite:
        asr = openf(asr_output)
        asr = "\n".join(asr) + "\n"

        duration = openf(duration_output)
        assert isinstance(duration, list) and len(duration) == 1, (
            f"Duration is not a list of length 1: {duration}"
        )
        duration = float(duration[0])
        assert duration > 0, f"Duration is not positive: {duration}"
        return asr, duration

    print(f"\n=== 🎙️ Processing ASR for {video_path} ===")
    asr_processor = ASRProcessor()
    asr, duration = asr_processor.get_asr(video_path)
    print(f"=== ✅ ASR processing complete for {video_path} ===\n")
    output_dir.mkdir(parents=True, exist_ok=True)
    writef(asr_output, asr)
    writef(duration_output, str(duration))
    return asr, duration
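
A minimal sketch of driving SingleVideo on its own (not part of the uploaded files); the video path is a placeholder, and ASR extraction runs on construction:

from pathlib import Path

from single_video import SingleVideo

video = SingleVideo(Path("videos/my_video.mp4"))  # hypothetical path; ASR is extracted in __init__
vid_id = video.video_ids[0]
print(video.get_duration(vid_id, hms=True))  # duration formatted as HH:MM:SS
print(video.get_asr(vid_id)[:200])           # first characters of the timestamped transcript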
utils_asr.py ADDED
@@ -0,0 +1,95 @@
from lutils import openf, writef

from src.data.chapters import Chapters, sec_to_hms
from src.data.prompt import Prompt
from src.utils import RankedLogger

log = RankedLogger(__name__, rank_zero_only=True)


class ChaptersASR(Chapters):
    def __init__(self, vidc_dir: str = "dataset/", subset=""):
        super().__init__(vidc_dir=vidc_dir, subset=subset)

        self._asrs = None

    @property
    def asrs(self):
        if self._asrs is None:
            self.load_asr_data()
        return self._asrs

    def load_asr_data(self):
        if self._asrs is not None:
            return

        if self.subset:
            asr_pth = self.vidc_dir / f"docs/subset_data/asrs/asrs_{self.subset}.json"
            if asr_pth.exists():
                self._asrs = openf(asr_pth)
            else:
                log.info(f"ASR data not found for subset {self.subset}.")
                asr_val_pth = self.vidc_dir / "docs/subset_data/asrs/asrs_val.json"
                asr_train_pth = self.vidc_dir / "docs/subset_data/asrs/asrs_train.json"
                if "val" in self.subset and asr_val_pth.exists():
                    log.info("Loading from ASR validation file.")
                    asrs = openf(asr_val_pth)
                elif "train" in self.subset and asr_train_pth.exists():
                    log.info("Loading from ASR training file.")
                    asrs = openf(asr_train_pth)
                else:
                    log.info("Loading from ASR file.")
                    asrs = openf(self.vidc_dir / "docs/asrs.json")
                video_ids = set(self.video_ids) & set(asrs.keys())
                self._asrs = {vid_id: asrs[vid_id] for vid_id in video_ids}
                asr_pth.parent.mkdir(exist_ok=True)
                writef(asr_pth, self._asrs)
        else:
            self._asrs = openf(self.vidc_dir / "docs/asrs.json")

    def get_asr(self, video_id, add_end=False):
        if video_id not in self.asrs:
            return None

        asr = self.asrs[video_id]
        asr_clean = []
        for t, s, e in zip(asr["text"], asr["start"], asr["end"]):
            t = t.strip()
            s = sec_to_hms(s)
            e = sec_to_hms(e)
            if add_end:
                asr_clean.append(f"{s} - {e}: {t}")
            else:
                asr_clean.append(f"{s}: {t}")

        return "\n".join(asr_clean) + "\n"

    def __contains__(self, vid_id):
        return vid_id in self.asrs


class PromptASR(Prompt):
    def __init__(self, chapters: ChaptersASR, add_end=False):
        super().__init__(chapters=chapters)
        self.add_end = add_end

    def get_task_prompt(self):
        return "segment the text into distinct chapters based on thematic shifts or changes in topics.\n"

    def get_transcript(self, vid_id):
        vid_asr = self.chapters.get_asr(vid_id, add_end=self.add_end)
        assert vid_asr is not None, f"ASR not found for video ID: {vid_id}"
        return vid_asr

    def __contains__(self, vid_id):
        return vid_id in self.chapters


if __name__ == "__main__":
    chapters = ChaptersASR(subset="s10k_train")
    vid_id = chapters.sample()

    prompt = PromptASR(chapters=chapters)
    print(prompt.get_prompt_train(vid_id))
    print(prompt.get_transcript(vid_id))
    print(prompt.get_output(vid_id))
vidchapters.py ADDED
@@ -0,0 +1,107 @@
from pathlib import Path

from lutils import writef
from tqdm import tqdm

from src.test.utils_chapters import extract_chapters, filter_chapters
from src.utils import RankedLogger

log = RankedLogger(__name__, rank_zero_only=True)


def get_chapters(
    inference,
    prompt,
    max_new_tokens,
    do_sample=False,
    vid_duration=None,
    use_cache=True,
    vid_id="",
):
    output_text = inference(
        prompt=prompt,
        max_new_tokens=max_new_tokens,
        add_special_tokens=True,
        do_sample=do_sample,
        use_cache=use_cache,
    )

    if isinstance(output_text, int):
        # the input is too long, return the length of the input
        return output_text, None

    chapters = extract_chapters(output_text)
    chapters = filter_chapters(chapters, vid_duration=vid_duration)

    if not chapters and not do_sample:
        log.info(f"No chapters found for {vid_id}, trying again with sampling")
        return get_chapters(
            inference,
            prompt,
            max_new_tokens,
            do_sample=True,
            vid_duration=vid_duration,
            use_cache=use_cache,
            vid_id=vid_id,
        )

    return output_text, chapters


class VidChaptersTester:
    def __init__(self, save_dir: str, do_sample=False, **kwargs):
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(exist_ok=True)
        self.do_sample = do_sample

    def __call__(
        self,
        inference,
        test_dataloader,
        max_new_tokens=1024,
    ):
        pbar = tqdm(
            total=len(test_dataloader),
            desc="Evaluating chapters",
        )

        for batch in test_dataloader:
            vid_id = batch["vid_id"][0]
            prompt = batch["prompt"][0]
            transcript = batch["transcript"][0]
            vid_duration = batch["vid_duration"][0]
            prompt += transcript

            chapters_pth = self.save_dir / f"{vid_id[:2]}" / f"{vid_id}.json"
            chapters_pth.parent.mkdir(exist_ok=True)

            if chapters_pth.exists():
                pbar.update(1)
                continue

            pbar.set_description(f"vid_id: {vid_id}")

            output_text, chapters = get_chapters(
                inference,
                prompt,
                max_new_tokens,
                do_sample=self.do_sample,
                vid_duration=vid_duration,
                vid_id=vid_id,
            )

            if chapters is None:
                log.info(f"Input too long for {vid_id}, {output_text} tokens")
                error_pth = chapters_pth.with_suffix(".txt")
                writef(error_pth, [output_text])
                pbar.update(1)
                continue

            if chapters:
                vid_data = {
                    "chapters": chapters,
                    "output": output_text,
                }
                writef(chapters_pth, vid_data)

            pbar.update(1)

        pbar.close()
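
A minimal end-to-end sketch of VidChaptersTester driven by LlamaInference (not part of the uploaded files); the checkpoint path and the single-item stand-in for the test dataloader are hypothetical, and in the repo the batches come from the actual test dataloader:

from llama_inference import LlamaInference
from vidchapters import VidChaptersTester

llm = LlamaInference(ckpt_path="meta-llama/Llama-3.1-8B-Instruct")  # hypothetical checkpoint

# Stand-in for the test dataloader: one batch of size 1, with the keys read in __call__ above.
test_dataloader = [
    {
        "vid_id": ["abc123def45"],
        "prompt": ["segment the text into distinct chapters based on thematic shifts or changes in topics.\n"],
        "transcript": ["00:00:00: intro\n00:01:30: first topic\n"],
        "vid_duration": [300.0],
    }
]

tester = VidChaptersTester(save_dir="outputs/chapters", do_sample=False)
# If chapters are extracted, writes outputs/chapters/ab/abc123def45.json; otherwise an .txt error file.
tester(llm, test_dataloader, max_new_tokens=1024)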