Spaces:
Runtime error
Runtime error
langchain-qa-bot
/
docs
/langchain
/libs
/community
/langchain_community
/document_loaders
/bilibili.py
import json | |
import re | |
import warnings | |
from typing import List, Tuple | |
import requests | |
from langchain_core.documents import Document | |
from langchain_community.document_loaders.base import BaseLoader | |
# Pre-compile regular expressions for video ID extraction | |
BV_PATTERN = re.compile(r"BV\w+") | |
AV_PATTERN = re.compile(r"av[0-9]+") | |
class BiliBiliLoader(BaseLoader): | |
""" | |
Load fetching transcripts from BiliBili videos. | |
""" | |
def __init__( | |
self, | |
video_urls: List[str], | |
sessdata: str = "", | |
bili_jct: str = "", | |
buvid3: str = "", | |
): | |
""" | |
Initialize the loader with BiliBili video URLs and authentication cookies. | |
if no authentication cookies are provided, the loader can't get transcripts | |
and will only fetch videos info. | |
Args: | |
video_urls (List[str]): List of BiliBili video URLs. | |
sessdata (str): SESSDATA cookie value for authentication. | |
bili_jct (str): BILI_JCT cookie value for authentication. | |
buvid3 (str): BUVI3 cookie value for authentication. | |
""" | |
self.video_urls = video_urls | |
self.credential = None | |
try: | |
from bilibili_api import video | |
except ImportError: | |
raise ImportError( | |
"requests package not found, please install it with " | |
"`pip install bilibili-api-python`" | |
) | |
if sessdata and bili_jct and buvid3: | |
self.credential = video.Credential( | |
sessdata=sessdata, bili_jct=bili_jct, buvid3=buvid3 | |
) | |
def load(self) -> List[Document]: | |
""" | |
Load and return a list of documents containing video transcripts. | |
Returns: | |
List[Document]: List of Document objects transcripts and metadata. | |
""" | |
results = [] | |
for url in self.video_urls: | |
transcript, video_info = self._get_bilibili_subs_and_info(url) | |
doc = Document(page_content=transcript, metadata=video_info) | |
results.append(doc) | |
return results | |
def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]: | |
""" | |
Retrieve video information and transcript for a given BiliBili URL. | |
""" | |
bvid = BV_PATTERN.search(url) | |
try: | |
from bilibili_api import sync, video | |
except ImportError: | |
raise ImportError( | |
"requests package not found, please install it with " | |
"`pip install bilibili-api-python`" | |
) | |
if bvid: | |
v = video.Video(bvid=bvid.group(), credential=self.credential) | |
else: | |
aid = AV_PATTERN.search(url) | |
if aid: | |
v = video.Video(aid=int(aid.group()[2:]), credential=self.credential) | |
else: | |
raise ValueError(f"Unable to find a valid video ID in URL: {url}") | |
video_info = sync(v.get_info()) | |
video_info.update({"url": url}) | |
# Return if no credential is provided | |
if not self.credential: | |
return "", video_info | |
# Fetching and processing subtitles | |
sub = sync(v.get_subtitle(video_info["cid"])) | |
sub_list = sub.get("subtitles", []) | |
if sub_list: | |
sub_url = sub_list[0].get("subtitle_url", "") | |
if not sub_url.startswith("http"): | |
sub_url = "https:" + sub_url | |
response = requests.get(sub_url) | |
if response.status_code == 200: | |
raw_sub_titles = json.loads(response.content).get("body", []) | |
raw_transcript = " ".join([c["content"] for c in raw_sub_titles]) | |
raw_transcript_with_meta_info = ( | |
f"Video Title: {video_info['title']}, " | |
f"description: {video_info['desc']}\n\n" | |
f"Transcript: {raw_transcript}" | |
) | |
return raw_transcript_with_meta_info, video_info | |
else: | |
warnings.warn( | |
f"Failed to fetch subtitles for {url}. " | |
f"HTTP Status Code: {response.status_code}" | |
) | |
else: | |
warnings.warn( | |
f"No subtitles found for video: {url}. Returning empty transcript." | |
) | |
# Return empty transcript if no subtitles are found | |
return "", video_info | |