Spaces:

anpigon
/

langchain-qa-bot

Runtime error

App Files Files Community

langchain-qa-bot / docs /langchain /libs /community /langchain_community /document_loaders /bilibili.py

anpigon

add langchain docs

ed4d993 11 months ago

raw

history blame

4.4 kB

	import json
	import re
	import warnings
	from typing import List, Tuple

	import requests
	from langchain_core.documents import Document

	from langchain_community.document_loaders.base import BaseLoader

	# Pre-compile regular expressions for video ID extraction
	BV_PATTERN = re.compile(r"BV\w+")
	AV_PATTERN = re.compile(r"av[0-9]+")


	class BiliBiliLoader(BaseLoader):
	"""
	Load fetching transcripts from BiliBili videos.
	"""

	def __init__(
	self,
	video_urls: List[str],
	sessdata: str = "",
	bili_jct: str = "",
	buvid3: str = "",
	):
	"""
	Initialize the loader with BiliBili video URLs and authentication cookies.
	if no authentication cookies are provided, the loader can't get transcripts
	and will only fetch videos info.

	Args:
	video_urls (List[str]): List of BiliBili video URLs.
	sessdata (str): SESSDATA cookie value for authentication.
	bili_jct (str): BILI_JCT cookie value for authentication.
	buvid3 (str): BUVI3 cookie value for authentication.
	"""
	self.video_urls = video_urls
	self.credential = None
	try:
	from bilibili_api import video
	except ImportError:
	raise ImportError(
	"requests package not found, please install it with "
	"`pip install bilibili-api-python`"
	)
	if sessdata and bili_jct and buvid3:
	self.credential = video.Credential(
	sessdata=sessdata, bili_jct=bili_jct, buvid3=buvid3
	)

	def load(self) -> List[Document]:
	"""
	Load and return a list of documents containing video transcripts.

	Returns:
	List[Document]: List of Document objects transcripts and metadata.
	"""
	results = []
	for url in self.video_urls:
	transcript, video_info = self._get_bilibili_subs_and_info(url)
	doc = Document(page_content=transcript, metadata=video_info)
	results.append(doc)

	return results

	def _get_bilibili_subs_and_info(self, url: str) -> Tuple[str, dict]:
	"""
	Retrieve video information and transcript for a given BiliBili URL.
	"""
	bvid = BV_PATTERN.search(url)
	try:
	from bilibili_api import sync, video
	except ImportError:
	raise ImportError(
	"requests package not found, please install it with "
	"`pip install bilibili-api-python`"
	)
	if bvid:
	v = video.Video(bvid=bvid.group(), credential=self.credential)
	else:
	aid = AV_PATTERN.search(url)
	if aid:
	v = video.Video(aid=int(aid.group()[2:]), credential=self.credential)
	else:
	raise ValueError(f"Unable to find a valid video ID in URL: {url}")

	video_info = sync(v.get_info())
	video_info.update({"url": url})

	# Return if no credential is provided
	if not self.credential:
	return "", video_info

	# Fetching and processing subtitles
	sub = sync(v.get_subtitle(video_info["cid"]))
	sub_list = sub.get("subtitles", [])
	if sub_list:
	sub_url = sub_list[0].get("subtitle_url", "")
	if not sub_url.startswith("http"):
	sub_url = "https:" + sub_url

	response = requests.get(sub_url)
	if response.status_code == 200:
	raw_sub_titles = json.loads(response.content).get("body", [])
	raw_transcript = " ".join([c["content"] for c in raw_sub_titles])

	raw_transcript_with_meta_info = (
	f"Video Title: {video_info['title']}, "
	f"description: {video_info['desc']}\n\n"
	f"Transcript: {raw_transcript}"
	)
	return raw_transcript_with_meta_info, video_info
	else:
	warnings.warn(
	f"Failed to fetch subtitles for {url}. "
	f"HTTP Status Code: {response.status_code}"
	)
	else:
	warnings.warn(
	f"No subtitles found for video: {url}. Returning empty transcript."
	)

	# Return empty transcript if no subtitles are found
	return "", video_info