Kunal Kurve
Update app.py
d71a127
import validators
import streamlit as st
from urllib.parse import urlparse, parse_qs
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_core.documents import Document
from langchain_community.document_loaders import UnstructuredURLLoader
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
import yt_dlp
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain_huggingface import HuggingFaceEndpoint
from fpdf import FPDF
# 🔧 Streamlit Page Setup
# Configure the page first, then render the heading and the one-line tagline.
st.set_page_config(
    page_title="🦜 LangChain: All-in-One Summarizer",
    page_icon="🧠",
    layout="centered",
)
st.title("🧠 LangChain: All-in-One Summarizer")
st.caption("Summarize πŸ“Ή YouTube videos or 🌐 Web pages using your favorite LLM provider.")

# 🔑 Sidebar inputs: a provider selector plus one password field for the
# selected provider. The key variables of the other two providers stay None,
# so later code can simply test each key for truthiness.
groq_api_key = None
openai_api_key = None
hf_api_key = None
with st.sidebar:
    st.header("πŸ”‘ API Keys")
    provider = st.selectbox("Choose LLM Provider", ["Groq", "OpenAI", "HuggingFace"])
    if provider == "Groq":
        groq_api_key = st.text_input("Groq API Key", type="password")
    elif provider == "OpenAI":
        openai_api_key = st.text_input("OpenAI API Key", type="password")
    elif provider == "HuggingFace":
        hf_api_key = st.text_input("HuggingFace API Token", type="password")

# 🌐 URL of the YouTube video or web page to summarize (empty by default).
generic_url = st.text_input("Enter YouTube or Website URL", "")
# 🧠 Prompt Template
# Instruction text handed to the summarization chain. `{text}` is the only
# placeholder; it is declared below as the template's single input variable.
prompt_template = """
You are a helpful assistant. Please summarize the following content in no more than 300 words:
Content:
{text}
"""
prompt = PromptTemplate(
    input_variables=["text"],
    template=prompt_template,
)
# 🧠 LLM Selection
# Build the model client for the provider chosen in the sidebar.
# `llm` stays None when the selected provider's key field is empty, which the
# button handler reports as a missing/incorrect API key.
llm = None
if provider == "Groq" and groq_api_key:
    llm = ChatGroq(model="llama3-8b-8192", groq_api_key=groq_api_key)
elif provider == "OpenAI" and openai_api_key:
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key)
elif provider == "HuggingFace" and hf_api_key:
    # HuggingFaceEndpoint declares `huggingfacehub_api_token` and
    # `max_new_tokens`. The previously used `token` / `max_length` are not
    # fields of the class, so the user's token was not used for authentication
    # and the length cap was not applied as intended.
    llm = HuggingFaceEndpoint(
        repo_id="mistralai/Mistral-7B-Instruct-v0.3",
        huggingfacehub_api_token=hf_api_key,
        task="text-generation",
        max_new_tokens=500,
        temperature=0.7,
    )
# 📼 YouTube Transcript Fallback
def _extract_video_id(url):
    """Return the YouTube video ID contained in *url*, or None if none is found.

    Recognised shapes:
      * ``youtu.be/<id>``            (short links)
      * ``.../watch?v=<id>``         (``v`` query parameter)
      * ``.../shorts/<id>``, ``.../embed/<id>``, ``.../live/<id>``, ``.../v/<id>``
    """
    parsed = urlparse(url)
    segments = [part for part in parsed.path.split("/") if part]

    if "youtu.be" in parsed.netloc:
        # Short links carry the ID as the first path segment.
        return segments[0] if segments else None

    video_id = parse_qs(parsed.query).get("v", [None])[0]
    if video_id:
        return video_id

    # Path-based URL formats keep the ID right after a fixed prefix segment.
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]
    return None


@st.cache_data(show_spinner=False)
def get_youtube_transcript(url):
    """Return the transcript text of the YouTube video at *url*, or None.

    The transcript entries returned by ``YouTubeTranscriptApi.get_transcript``
    are joined with single spaces into one string.

    Returns None when no video ID can be extracted from *url*, or when
    transcripts are disabled for the video. In the latter case yt_dlp is asked
    for the video's metadata and a warning is shown if English subtitles or
    automatic captions are listed; their download is not implemented.

    Only ``TranscriptsDisabled`` is handled here; any other error raised by the
    transcript API propagates to the caller.
    """
    video_id = _extract_video_id(url)
    if not video_id:
        return None

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(entry["text"] for entry in transcript)
    except TranscriptsDisabled:
        # Fallback probe via yt_dlp: metadata lookup only (download=False).
        try:
            ydl_opts = {
                "quiet": True,
                "skip_download": True,
                "writesubtitles": True,
                "writeautomaticsub": True,
                "subtitlesformat": "vtt",
                "outtmpl": "%(id)s.%(ext)s",
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
            subtitles = info.get("subtitles") or info.get("automatic_captions")
            if subtitles and "en" in subtitles:
                st.warning("πŸ“„ Subtitles available but direct download not implemented.")
        except Exception:
            # Deliberate best-effort: the probe exists only to show a hint, so
            # any failure in it is treated the same as "no transcript".
            return None
        return None
# 📥 Content Loader
def load_content(url):
    """Load the content behind *url* as a list of ``Document`` objects.

    YouTube links (host ``youtube.com`` / ``youtu.be`` or a subdomain of
    either) are resolved through ``get_youtube_transcript``; every other URL
    is fetched with ``UnstructuredURLLoader``.

    Returns an empty list, after showing an error in the UI, when no
    transcript could be obtained or the web loader produced no documents.
    """
    # Decide on the parsed hostname rather than a substring of the whole URL,
    # so a page that merely mentions "youtube.com" in its path or query is not
    # mistaken for a video. Scheme-less input is still accepted.
    parsed = urlparse(url if "://" in url else f"//{url}")
    host = (parsed.hostname or "").lower()
    is_youtube = host in ("youtube.com", "youtu.be") or host.endswith(
        (".youtube.com", ".youtu.be")
    )

    if is_youtube:
        transcript = get_youtube_transcript(url)
        if transcript:
            return [Document(page_content=transcript)]
        st.error("❌ Could not extract transcript from the YouTube video.")
        return []

    # NOTE(review): ssl_verify=False turns off TLS certificate verification for
    # the fetch. Kept as-is to preserve behavior for sites with broken
    # certificates, but it should be reconsidered for untrusted URLs.
    loader = UnstructuredURLLoader(
        urls=[url],
        ssl_verify=False,
        headers={"User-Agent": "Mozilla/5.0"},
    )
    docs = loader.load()
    if not docs:
        # Mirror the YouTube branch: tell the user why nothing happens next.
        st.error("❌ Could not extract any content from the web page.")
    return docs
# 📤 PDF Downloader
def generate_pdf(text):
    """Write *text* to a one-column PDF and return the path of the file.

    Each line of *text* (split on ``"\\n"``) becomes one ``multi_cell`` in
    12 pt Arial. The file is written as ``summary.pdf`` in the system
    temporary directory and is overwritten on every call.

    Characters that cannot be encoded as Latin-1 are replaced with ``?``,
    because the built-in Arial font cannot render them.
    """
    import os
    import tempfile

    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)

    for line in text.split("\n"):
        # The core fonts are Latin-1 only; curly quotes, dashes, bullets or
        # emoji in a summary would otherwise raise an encoding error.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        # Start every cell at the left margin: fpdf2 leaves the cursor at the
        # right edge after multi_cell, leaving no room for the next
        # full-width cell.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, printable)

    # Use the platform's temp directory instead of a hard-coded "/tmp".
    pdf_path = os.path.join(tempfile.gettempdir(), "summary.pdf")
    pdf.output(pdf_path)
    return pdf_path
# 🔘 Main Logic
# Button handler: validate the inputs, load the content, summarize it, and
# offer the summary as a PDF download.
if st.button("✨ Summarize Now"):
    if not generic_url.strip():
        st.error("❌ Please enter a valid URL.")
    elif not validators.url(generic_url):
        st.error("⚠️ The entered text is not a valid URL.")
    elif not llm:
        # `llm` is None when no key was entered for the selected provider.
        st.error("⚠️ Please enter the correct API key for the selected provider.")
    else:
        try:
            with st.spinner("πŸ”„ Loading and summarizing..."):
                docs = load_content(generic_url)
                if not docs:
                    # Nothing to summarize; end this script run here.
                    st.stop()

                # Only the first document is used, capped at 12,000 characters.
                trimmed_text = docs[0].page_content[:12000]
                st.info(f"πŸ“ Content length: {len(trimmed_text)} characters")
                with st.expander("πŸ“– Preview Fetched Content"):
                    st.write(trimmed_text[:1000] + "..." if len(trimmed_text) > 1000 else trimmed_text)

                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                summary = chain.run([Document(page_content=trimmed_text)])

            st.success("βœ… Summary Generated!")
            st.markdown(summary)

            # 🔽 PDF download of the generated summary.
            pdf_path = generate_pdf(summary)
            with open(pdf_path, "rb") as f:
                st.download_button("πŸ“„ Download Summary as PDF", f, file_name="summary.pdf")
        except Exception as e:
            # st.exception expects the exception object itself, not a
            # formatted string: show the readable message with st.error and
            # pass the exception through for its type and traceback.
            st.error(f"❌ Summarization failed: {e}")
            st.exception(e)