import validators
import streamlit as st
from urllib.parse import urlparse, parse_qs
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_core.documents import Document
from langchain_community.document_loaders import UnstructuredURLLoader
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
import yt_dlp
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain_huggingface import HuggingFaceEndpoint
from fpdf import FPDF
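
# Assumed dependency set (not pinned in the source): streamlit, validators,
# yt-dlp, fpdf (or fpdf2), youtube-transcript-api, unstructured, langchain,
# langchain-core, langchain-community, langchain-groq, langchain-openai,
# langchain-huggingface.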

# Streamlit Page Setup
st.set_page_config(page_title="LangChain: All-in-One Summarizer", page_icon="🧠", layout="centered")
st.title("LangChain: All-in-One Summarizer")
st.caption("Summarize YouTube videos or web pages using your favorite LLM provider.")

# Sidebar Inputs
with st.sidebar:
    st.header("API Keys")
    provider = st.selectbox("Choose LLM Provider", ["Groq", "OpenAI", "HuggingFace"])
    groq_api_key = st.text_input("Groq API Key", type="password") if provider == "Groq" else None
    openai_api_key = st.text_input("OpenAI API Key", type="password") if provider == "OpenAI" else None
    hf_api_key = st.text_input("HuggingFace API Token", type="password") if provider == "HuggingFace" else None

# URL Input
generic_url = st.text_input("Enter YouTube or Website URL", "")

# Prompt Template
prompt_template = """
You are a helpful assistant. Please summarize the following content in no more than 300 words:
Content:
{text}
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
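
# Note: with chain_type="stuff" below, load_summarize_chain concatenates the
# page_content of every Document and substitutes it for {text} in this prompt,
# so the whole input must fit in the model's context window.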

# LLM Selection
llm = None
if provider == "Groq" and groq_api_key:
    llm = ChatGroq(model="llama3-8b-8192", groq_api_key=groq_api_key)
elif provider == "OpenAI" and openai_api_key:
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key)
elif provider == "HuggingFace" and hf_api_key:
    llm = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.3",
        huggingfacehub_api_token=hf_api_key,  # keyword expected by HuggingFaceEndpoint
        task="text-generation",
        max_new_tokens=500,  # HuggingFaceEndpoint uses max_new_tokens, not max_length
        temperature=0.7,
    )
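
# Whichever provider is chosen, the resulting object exposes LangChain's standard
# language-model interface, so the same summarize chain works with any of the three.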

# YouTube Transcript Fallback
def get_youtube_transcript(url):
    parsed = urlparse(url)
    if "youtu.be" in parsed.netloc:
        video_id = parsed.path[1:]
    else:
        video_id = parse_qs(parsed.query).get("v", [None])[0]
    if not video_id:
        return None
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(entry["text"] for entry in transcript)
    except TranscriptsDisabled:
        # Fallback: use yt_dlp to check for subtitle tracks
        try:
            ydl_opts = {
                "quiet": True,
                "skip_download": True,
                "writesubtitles": True,
                "writeautomaticsub": True,
                "subtitlesformat": "vtt",
                "outtmpl": "%(id)s.%(ext)s",
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                subtitles = info.get("subtitles") or info.get("automatic_captions")
                if subtitles and "en" in subtitles:
                    st.warning("Subtitles are available, but direct download is not implemented.")
                    return None
                else:
                    return None
        except Exception:
            return None
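
# Note: the yt_dlp branch above only detects whether captions exist; it never
# downloads them. info.get("subtitles") / info.get("automatic_captions") map
# language codes to caption-track metadata, so a full implementation would have
# to fetch one of those tracks and strip the VTT markup.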

# Content Loader
def load_content(url):
    if "youtube.com" in url or "youtu.be" in url:
        transcript = get_youtube_transcript(url)
        if transcript:
            return [Document(page_content=transcript)]
        else:
            st.error("Could not extract a transcript from the YouTube video.")
            return []
    else:
        loader = UnstructuredURLLoader(
            urls=[url],
            ssl_verify=False,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        return loader.load()
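
# Note: ssl_verify=False skips certificate validation so pages behind
# misconfigured TLS still load; remove it if you prefer strict verification.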

# PDF Downloader
def generate_pdf(text):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)
    for line in text.split("\n"):
        pdf.multi_cell(0, 10, line)
    pdf_path = "/tmp/summary.pdf"
    pdf.output(pdf_path)
    return pdf_path
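
# Note: FPDF's built-in Arial/Helvetica font only covers Latin-1, so summaries
# containing other characters may fail to encode; registering a Unicode TTF via
# pdf.add_font() would be needed for full coverage.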

# Main Logic
if st.button("Summarize Now"):
    if not generic_url.strip():
        st.error("Please enter a URL.")
    elif not validators.url(generic_url):
        st.error("The entered text is not a valid URL.")
    elif not llm:
        st.error("Please enter the API key for the selected provider.")
    else:
        try:
            with st.spinner("Loading and summarizing..."):
                docs = load_content(generic_url)
                if not docs:
                    st.stop()
                trimmed_text = docs[0].page_content[:12000]
                st.info(f"Content length used for summarization: {len(trimmed_text)} characters")
                with st.expander("Preview Fetched Content"):
                    st.write(trimmed_text[:1000] + "..." if len(trimmed_text) > 1000 else trimmed_text)
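                # chain.run is the legacy call style; on newer LangChain releases the
                # equivalent is chain.invoke({"input_documents": [...]})["output_text"].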
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                summary = chain.run([Document(page_content=trimmed_text)])
                st.success("Summary Generated!")
                st.markdown(summary)

                # PDF Download
                pdf_path = generate_pdf(summary)
                with open(pdf_path, "rb") as f:
                    st.download_button("Download Summary as PDF", f, file_name="summary.pdf")
        except Exception as e:
            st.error(f"Summarization failed: {e}")