from typing import Any, Dict, List, Optional

from langchain.chains.base import Chain
from langchain_core.callbacks import CallbackManagerForChainRun
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import Extra

from langchain_experimental.video_captioning.services.audio_service import (
    AudioProcessor,
)
from langchain_experimental.video_captioning.services.caption_service import (
    CaptionProcessor,
)
from langchain_experimental.video_captioning.services.combine_service import (
    CombineProcessor,
)
from langchain_experimental.video_captioning.services.image_service import (
    ImageProcessor,
)
from langchain_experimental.video_captioning.services.srt_service import SRTProcessor


class VideoCaptioningChain(Chain):
    """
    Video Captioning Chain.
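
    Generates an SRT subtitle track for a video by producing speech-to-text
    subtitles from the audio, LLM-refined closed captions from the video
    frames, and combining the two.

    Example (a minimal sketch; assumes a valid AssemblyAI API key and a chat
    model such as ``ChatOpenAI`` from ``langchain_openai``):

        .. code-block:: python

            from langchain_openai import ChatOpenAI

            chain = VideoCaptioningChain(
                llm=ChatOpenAI(),
                assemblyai_key="<ASSEMBLYAI-API-KEY>",
            )
            result = chain.invoke({"video_file_path": "path/to/video.mp4"})
            print(result["srt"])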
    """

    llm: BaseLanguageModel
    """LLM used to refine closed captions and combine them with subtitles."""
    assemblyai_key: str
    """API key for the AssemblyAI speech-to-text service."""
    prompt: Optional[PromptTemplate] = None
    verbose: bool = True
    """Passed through to the LLM-backed caption and combine processors."""
    use_logging: Optional[bool] = True
    """Whether to emit progress messages via the run manager."""
    frame_skip: int = -1
    """Frame-sampling interval passed to the image processor."""
    image_delta_threshold: int = 3000000
    """Frame-difference threshold passed to the image processor."""
    closed_caption_char_limit: int = 20
    """Character limit for closed captions when merging with subtitles."""
    closed_caption_similarity_threshold: int = 80
    """Similarity threshold used when refining closed captions."""
    use_unclustered_video_models: bool = False
    """Whether the caption processor uses unclustered video models."""

    class Config:
        extra = Extra.allow
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        return ["video_file_path"]

    @property
    def output_keys(self) -> List[str]:
        return ["srt"]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        if "video_file_path" not in inputs:
            raise ValueError(
                "Missing 'video_file_path' in inputs for video captioning."
            )
        video_file_path = inputs["video_file_path"]
        nl = "\n"

        if self.use_logging and run_manager:
            run_manager.on_text("Loading processors..." + nl)

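        # One processor per pipeline stage: speech-to-text, frame extraction,
        # caption refinement, subtitle/caption merging, and SRT rendering.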
        audio_processor = AudioProcessor(api_key=self.assemblyai_key)
        image_processor = ImageProcessor(
            frame_skip=self.frame_skip, threshold=self.image_delta_threshold
        )
        caption_processor = CaptionProcessor(
            llm=self.llm,
            verbose=self.verbose,
            similarity_threshold=self.closed_caption_similarity_threshold,
            use_unclustered_models=self.use_unclustered_video_models,
        )
        combine_processor = CombineProcessor(
            llm=self.llm,
            verbose=self.verbose,
            char_limit=self.closed_caption_char_limit,
        )
        srt_processor = SRTProcessor()

        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished loading processors."
                + nl
                + "Generating subtitles from audio..."
                + nl
            )

        # Get models for speech to text subtitles
        audio_models = audio_processor.process(video_file_path, run_manager)
        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished generating subtitles:"
                + nl
                + nl.join(str(obj) for obj in audio_models)
                + nl
                + "Generating closed captions from video..."
                + nl
            )

        # Get models for image frame description
        image_models = image_processor.process(video_file_path, run_manager)
        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished generating closed captions:"
                + nl
                + nl.join(str(obj) for obj in image_models)
                + nl
                + "Refining closed captions..."
                + nl
            )

        # Get models for video event closed-captions
        video_models = caption_processor.process(image_models, run_manager)
        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished refining closed captions:"
                + nl
                + nl.join(str(obj) for obj in video_models)
                + nl
                + "Combining subtitles with closed captions..."
                + nl
            )

        # Combine the subtitle models with the closed-caption models
        caption_models = combine_processor.process(
            video_models, audio_models, run_manager
        )
        if self.use_logging and run_manager:
            run_manager.on_text(
                "Finished combining subtitles with closed captions:"
                + nl
                + nl.join(str(obj) for obj in caption_models)
                + nl
                + "Generating SRT file..."
                + nl
            )

        # Convert the combined model to SRT format
        srt_content = srt_processor.process(caption_models)
        if self.use_logging and run_manager:
            run_manager.on_text("Finished generating SRT file." + nl)

        return {"srt": srt_content}

    @property
    def _chain_type(self) -> str:
        return "video_captioning_chain"