File size: 4,401 Bytes
ed4d993
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import json
import re
import warnings
from typing import List, Tuple

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

# Pre-compiled regular expressions used by BiliBiliLoader to pull a video ID
# out of a URL; compiled once at import time rather than on every lookup.
# BV-style ID: the literal "BV" followed by one or more word characters.
BV_PATTERN = re.compile(r"BV\w+")
# av-style ID: lowercase "av" followed by one or more ASCII digits; the caller
# strips the two-letter prefix and converts the remainder to an int.
AV_PATTERN = re.compile(r"av[0-9]+")


class BiliBiliLoader(BaseLoader):
    """Load transcripts and metadata from BiliBili videos.

    Each URL yields one ``Document`` whose metadata is the video info returned
    by ``bilibili_api`` (plus the source ``url``) and whose page content is the
    transcript built from the first listed subtitle track, or an empty string
    when no transcript can be fetched.
    """

    # Seconds to wait on the subtitle download, so that a stalled connection
    # cannot block ``load`` indefinitely.
    _SUBTITLE_REQUEST_TIMEOUT = 30

    def __init__(
        self,
        video_urls: List[str],
        sessdata: str = "",
        bili_jct: str = "",
        buvid3: str = "",
    ):
        """
        Initialize the loader with BiliBili video URLs and authentication cookies.

        If any of the authentication cookies is missing, no credential is built;
        the loader then cannot fetch transcripts and only returns video info.

        Args:
            video_urls (List[str]): List of BiliBili video URLs.
            sessdata (str): SESSDATA cookie value for authentication.
            bili_jct (str): BILI_JCT cookie value for authentication.
            buvid3 (str): BUVID3 cookie value for authentication.

        Raises:
            ImportError: If the ``bilibili-api-python`` package is not installed.
        """
        self.video_urls = video_urls
        self.credential = None
        # Import eagerly so a missing dependency is reported at construction
        # time rather than on the first call to ``load``.
        _, video = self._import_bilibili_api()
        # All three cookies are required; a partial set is treated as "no auth".
        if sessdata and bili_jct and buvid3:
            self.credential = video.Credential(
                sessdata=sessdata, bili_jct=bili_jct, buvid3=buvid3
            )

    @staticmethod
    def _import_bilibili_api() -> tuple:
        """Import ``bilibili_api`` lazily and return its ``(sync, video)`` members.

        Raises:
            ImportError: If the ``bilibili-api-python`` package is not installed.
        """
        try:
            from bilibili_api import sync, video
        except ImportError as e:
            raise ImportError(
                "bilibili-api-python package not found, please install it with "
                "`pip install bilibili-api-python`"
            ) from e
        return sync, video

    def load(self) -> List[Document]:
        """
        Load and return a list of documents containing video transcripts.

        Returns:
            List[Document]: One Document per URL in ``video_urls``, in order,
            holding the transcript as page content and the video info as
            metadata.
        """
        results = []
        for url in self.video_urls:
            transcript, video_info = self._get_bilibili_subs_and_info(url)
            results.append(Document(page_content=transcript, metadata=video_info))

        return results

    def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
        """
        Retrieve video information and transcript for a given BiliBili URL.

        Args:
            url (str): Video URL containing either a ``BV...`` or ``av<digits>``
                video ID. A BV ID takes precedence when both are present.

        Returns:
            Tuple[str, dict]: The transcript text (prefixed with the video title
            and description) and the video info dict with an added ``"url"``
            key. The transcript is ``""`` when no credential is configured,
            the video lists no usable subtitle, or the subtitle download
            returns a non-200 status.

        Raises:
            ImportError: If the ``bilibili-api-python`` package is not installed.
            ValueError: If no video ID can be found in ``url``.
        """
        sync, video = self._import_bilibili_api()

        bvid = BV_PATTERN.search(url)
        if bvid:
            v = video.Video(bvid=bvid.group(), credential=self.credential)
        else:
            aid = AV_PATTERN.search(url)
            if not aid:
                raise ValueError(f"Unable to find a valid video ID in URL: {url}")
            # Drop the leading "av" to obtain the numeric ID.
            v = video.Video(aid=int(aid.group()[2:]), credential=self.credential)

        video_info = sync(v.get_info())
        video_info["url"] = url

        # Return if no credential is provided
        if not self.credential:
            return "", video_info

        # Fetching and processing subtitles; only the first listed track is used.
        sub = sync(v.get_subtitle(video_info["cid"]))
        sub_list = sub.get("subtitles", [])
        sub_url = (sub_list[0].get("subtitle_url") or "") if sub_list else ""
        if not sub_url:
            # Covers both an empty subtitle list and an entry without a URL;
            # requesting a bare "https:" would only fail with an opaque error.
            warnings.warn(
                f"No subtitles found for video: {url}. Returning empty transcript."
            )
            return "", video_info

        # Prefix URLs that do not already start with "http" (for example
        # scheme-relative ones) with "https:".
        if not sub_url.startswith("http"):
            sub_url = "https:" + sub_url

        response = requests.get(sub_url, timeout=self._SUBTITLE_REQUEST_TIMEOUT)
        if response.status_code != 200:
            warnings.warn(
                f"Failed to fetch subtitles for {url}. "
                f"HTTP Status Code: {response.status_code}"
            )
            return "", video_info

        raw_sub_titles = json.loads(response.content).get("body", [])
        raw_transcript = " ".join(c["content"] for c in raw_sub_titles)

        raw_transcript_with_meta_info = (
            f"Video Title: {video_info['title']}, "
            f"description: {video_info['desc']}\n\n"
            f"Transcript: {raw_transcript}"
        )
        return raw_transcript_with_meta_info, video_info