|
import validators
|
|
import streamlit as st
|
|
from urllib.parse import urlparse, parse_qs
|
|
from langchain.prompts import PromptTemplate
|
|
from langchain.chains.summarize import load_summarize_chain
|
|
from langchain_core.documents import Document
|
|
from langchain_community.document_loaders import UnstructuredURLLoader
|
|
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
|
|
import yt_dlp
|
|
from langchain_groq import ChatGroq
|
|
from langchain_openai import ChatOpenAI
|
|
from langchain_huggingface import HuggingFaceEndpoint
|
|
from fpdf import FPDF
|
|
|
|
|
|
|
|
# Page chrome: browser-tab metadata first, then the visible heading and tagline.
st.set_page_config(
    page_title="π¦ LangChain: All-in-One Summarizer",
    page_icon="π§ ",
    layout="centered",
)
st.title("π§ LangChain: All-in-One Summarizer")
st.caption(
    "Summarize πΉ YouTube videos or π Web pages using your favorite LLM provider."
)
|
|
|
|
|
|
# Sidebar: pick the LLM backend and ask only for that backend's credential.
# The two credentials that are not requested remain None.
groq_api_key = openai_api_key = hf_api_key = None
with st.sidebar:
    st.header("π API Keys")
    provider = st.selectbox(
        "Choose LLM Provider",
        ["Groq", "OpenAI", "HuggingFace"],
    )
    if provider == "Groq":
        groq_api_key = st.text_input("Groq API Key", type="password")
    elif provider == "OpenAI":
        openai_api_key = st.text_input("OpenAI API Key", type="password")
    elif provider == "HuggingFace":
        hf_api_key = st.text_input("HuggingFace API Token", type="password")
|
|
|
|
|
|
|
|
# Free-form URL field, empty by default; it accepts a YouTube link or a web page.
generic_url = st.text_input("Enter YouTube or Website URL", value="")
|
|
|
|
|
|
# Summarization prompt. `{text}` is the single template variable and receives
# the fetched content. The literal must contain prompt text only: every
# character between the triple quotes is sent to the model verbatim.
prompt_template = """
You are a helpful assistant. Please summarize the following content in no more than 300 words:

Content:
{text}
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
|
|
|
|
|
|
# Build the client for the selected provider. `llm` stays None when the
# matching credential was not entered, which the button handler treats as
# "missing API key".
llm = None
if provider == "Groq" and groq_api_key:
    llm = ChatGroq(model="llama3-8b-8192", groq_api_key=groq_api_key)
elif provider == "OpenAI" and openai_api_key:
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key)
elif provider == "HuggingFace" and hf_api_key:
    llm = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.3",
        # Use the declared field names: unknown keywords such as `token` or
        # `max_length` are moved into `model_kwargs` instead of configuring
        # authentication or the generation length.
        huggingfacehub_api_token=hf_api_key,
        task="text-generation",
        max_new_tokens=500,
        temperature=0.7,
    )
|
|
|
|
|
|
def _extract_youtube_video_id(url):
    """Return the video ID found in a YouTube URL, or None when there is none.

    Handles short links (``youtu.be/<id>``), watch links (``?v=<id>``) and
    path-based links (``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>``,
    ``/v/<id>``).
    """
    parsed = urlparse(url)
    segments = [segment for segment in parsed.path.split("/") if segment]

    if "youtu.be" in parsed.netloc:
        # Only the first path segment is the ID; ignore trailing slashes/paths.
        return segments[0] if segments else None

    from_query = parse_qs(parsed.query).get("v", [None])[0]
    if from_query:
        return from_query

    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]
    return None


@st.cache_data(show_spinner=False)
def get_youtube_transcript(url):
    """Fetch the transcript of a YouTube video as one space-joined string.

    Args:
        url: A YouTube video URL.

    Returns:
        The transcript text, or None when no video ID can be found in the URL
        or when transcripts are disabled for the video.

    Raises:
        Any error from ``YouTubeTranscriptApi.get_transcript`` other than
        ``TranscriptsDisabled`` propagates to the caller.
    """
    video_id = _extract_youtube_video_id(url)
    if not video_id:
        return None

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except TranscriptsDisabled:
        # Best-effort probe with yt-dlp. Downloading the subtitle track is not
        # implemented, so this path only informs the user and yields None.
        try:
            ydl_opts = {
                "quiet": True,
                "skip_download": True,
                "writesubtitles": True,
                "writeautomaticsub": True,
                "subtitlesformat": "vtt",
                "outtmpl": "%(id)s.%(ext)s",
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
                subtitles = info.get("subtitles") or info.get("automatic_captions")
                if subtitles and "en" in subtitles:
                    st.warning("π Subtitles available but direct download not implemented.")
                return None
        except Exception:
            # Deliberate best effort: a failed probe means "no transcript".
            return None

    return " ".join(entry["text"] for entry in transcript)
|
|
|
|
|
|
|
|
def load_content(url):
    """Load the text behind *url* as a list of LangChain ``Document`` objects.

    YouTube links are resolved through ``get_youtube_transcript``; every other
    URL is fetched with ``UnstructuredURLLoader``.

    Args:
        url: The YouTube or web page URL to load.

    Returns:
        A list of documents. The list is empty when nothing could be
        extracted, in which case an error has already been shown in the UI.
    """
    # Decide by hostname, not by substring: a page such as
    # https://example.com/?ref=youtube.com is not a YouTube video.
    parsed = urlparse(url if "://" in url else f"//{url}")
    host = parsed.hostname or ""
    is_youtube = any(
        host == domain or host.endswith("." + domain)
        for domain in ("youtube.com", "youtu.be")
    )

    if is_youtube:
        transcript = get_youtube_transcript(url)
        if transcript:
            return [Document(page_content=transcript)]
        st.error("β Could not extract transcript from the YouTube video.")
        return []

    loader = UnstructuredURLLoader(
        urls=[url],
        # NOTE(review): TLS certificate verification is disabled here, so the
        # fetched page is not authenticated. Confirm this is intentional.
        ssl_verify=False,
        headers={"User-Agent": "Mozilla/5.0"},
    )
    docs = loader.load()
    if not docs:
        # Tell the user why nothing follows instead of ending silently.
        st.error("Could not extract any readable content from the web page.")
    return docs
|
|
|
|
|
|
def generate_pdf(text):
    """Render *text* into a simple one-column PDF and return the file path.

    Args:
        text: The text to render; newlines start a new paragraph cell.
            ``None`` is treated as empty text.

    Returns:
        Path of the generated PDF. Each call writes a new file in the system
        temporary directory; the file is not deleted by this function.
    """
    import tempfile  # local import keeps this block self-contained

    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)

    for line in (text or "").split("\n"):
        # The built-in Arial font only covers Latin-1. Replace anything else
        # (curly quotes, dashes, emoji) with "?" so rendering cannot fail.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 10, printable)

    # A unique, portable path: a fixed /tmp name is shared by concurrent
    # sessions and does not exist on every platform.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
        pdf_path = handle.name
    pdf.output(pdf_path)
    return pdf_path
|
|
|
|
|
|
|
|
# Main action: validate the inputs, fetch the content, summarize it with the
# selected LLM, and offer the summary as a PDF download.
if st.button("β¨ Summarize Now"):
    # Validate the trimmed value so stray whitespace from copy/paste does not
    # make an otherwise valid URL fail validation.
    target_url = generic_url.strip()

    if not target_url:
        st.error("β Please enter a valid URL.")
    elif not validators.url(target_url):
        st.error("β οΈ The entered text is not a valid URL.")
    elif not llm:
        st.error("β οΈ Please enter the correct API key for the selected provider.")
    else:
        try:
            with st.spinner("π Loading and summarizing..."):
                docs = load_content(target_url)
                if not docs:
                    # Nothing to summarize; end this run.
                    st.stop()

                # Cap the input so the single-call "stuff" chain stays small.
                trimmed_text = docs[0].page_content[:12000]
                st.info(f"π Content length: {len(trimmed_text)} characters")

                with st.expander("π Preview Fetched Content"):
                    st.write(trimmed_text[:1000] + "..." if len(trimmed_text) > 1000 else trimmed_text)

                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                summary = chain.run([Document(page_content=trimmed_text)])

                st.success("β Summary Generated!")
                st.markdown(summary)

                pdf_path = generate_pdf(summary)
                with open(pdf_path, "rb") as f:
                    st.download_button("π Download Summary as PDF", f, file_name="summary.pdf")

        except Exception as e:
            # st.exception expects the exception object, not a formatted
            # string; show the readable message separately.
            st.error(f"β Summarization failed: {e}")
            st.exception(e)
|
|
|