Spaces:

spark-ds549
/

BPL-RAG-Spring-2025

Running

BPL-RAG-Spring-2025

File size: 8,523 Bytes

import streamlit as st
import os
from typing import List, Tuple, Optional
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv
from RAG import RAG
import logging
from image_scraper import DigitalCommonwealthScraper
import shutil

# Logging: send INFO-and-above records through the root configuration and
# expose a module-level logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Streamlit page setup: browser-tab title and icon, full-width layout.
st.set_page_config(page_title="Boston Public Library Chatbot", page_icon="🤖", layout="wide")

def initialize_models() -> Tuple[Optional[ChatOpenAI], Optional[HuggingFaceEmbeddings]]:
    """Create the LLM, embeddings and Pinecone vector store once per session.

    Each object is built only when its key is missing from
    ``st.session_state`` and is stored there under ``llm``, ``embeddings``,
    ``pinecone`` and ``vectorstore`` respectively, so repeated calls within
    one session reuse the existing objects.

    Returns:
        ``(llm, embeddings)`` read back from the session state on success, or
        ``(None, None)`` if any step raised. On failure the error is logged
        (with traceback) and shown on the page via ``st.error``.
    """
    try:
        # Load variables from a .env file (if any) before reading API keys.
        load_dotenv()

        if "llm" not in st.session_state:
            st.session_state.llm = ChatOpenAI(
                model="gpt-3.5-turbo",
                temperature=0,
                timeout=60,
                max_retries=2,
            )

        if "embeddings" not in st.session_state:
            st.session_state.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-mpnet-base-v2"
            )

        if "pinecone" not in st.session_state:
            pinecone_api_key = os.getenv("PINECONE_API_KEY")
            if not pinecone_api_key:
                # Fail with an explicit message instead of an opaque client error.
                raise ValueError("PINECONE_API_KEY environment variable is not set")

            index_name = 'bpl-test'
            pc = Pinecone(api_key=pinecone_api_key)
            index = pc.Index(index_name)
            st.session_state.pinecone = PineconeVectorStore(
                index=index,
                embedding=st.session_state.embeddings,
            )

        if "vectorstore" not in st.session_state:
            # The Pinecone store is the vector store used for retrieval.
            st.session_state.vectorstore = st.session_state.pinecone

        # Previously the success path returned None despite the annotation.
        return st.session_state.llm, st.session_state.embeddings

    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception("Error initializing models: %s", e)
        st.error(f"Failed to initialize models: {str(e)}")
        return None, None

def process_message(
    query: str,
    llm: ChatOpenAI,
    vectorstore: PineconeVectorStore,
) -> Tuple[str, List]:
    """Answer *query* by delegating to ``RAG``.

    Returns:
        The ``(response, sources)`` pair produced by ``RAG``. If anything
        raises while calling it or unpacking its result, the error is logged
        and an error string paired with an empty source list is returned.
    """
    try:
        answer, documents = RAG(query=query, llm=llm, vectorstore=vectorstore)
    except Exception as exc:
        detail = str(exc)
        logger.error(f"Error in process_message: {detail}")
        return f"Error processing message: {detail}", []
    return answer, documents

def _metadata_text(metadata, key: str, default: str = "") -> str:
    """Return ``metadata[key]`` as text, or *default* when missing or ``None``."""
    value = metadata.get(key)
    return default if value is None else str(value)


def display_sources(sources: List) -> None:
    """Render one expander per source document.

    Each expander shows a content preview (first 300 characters), the source
    id, the format and a URL. Entries whose format contains "audio" get an
    informational note; other entries get the first image that the scraper
    extracts from the document URL, if any.

    Args:
        sources: Documents exposing a ``metadata`` mapping and, optionally,
            ``page_content``. An empty or ``None`` value shows an info notice.

    A failure while rendering one document is logged, reported with
    ``st.error``, and does not stop the remaining documents from rendering.
    """
    if not sources:
        st.info("No sources available for this response.")
        return

    st.subheader("Sources")
    for doc in sources:
        try:
            metadata = doc.metadata
            # Metadata values may be missing, None or non-string; normalise
            # them to text so .lower()/.strip() cannot raise.
            source_id = _metadata_text(metadata, "source").strip()
            source = source_id or "Unknown Source"
            title = _metadata_text(metadata, "title_info_primary_tsi", "Unknown Title")
            format_type = _metadata_text(metadata, "format").lower()

            is_audio = "audio" in format_type
            expander_title = f"🔊 {title}" if is_audio else title

            with st.expander(expander_title):
                # Content preview
                content = getattr(doc, "page_content", None)
                if content is not None:
                    st.markdown(f"**Content:** {str(content)[:300]} ...")

                # Prefer the explicit URL; otherwise build one from the real
                # source id (never from the "Unknown Source" placeholder).
                doc_url = _metadata_text(metadata, "URL").strip()
                if not doc_url and source_id:
                    doc_url = f"https://www.digitalcommonwealth.org/search/{source_id}"

                st.markdown(f"**Source ID:** {source}")
                st.markdown(f"**Format:** {format_type if format_type else 'Not specified'}")
                st.markdown(f"**URL:** {doc_url if doc_url else 'Not available'}")

                if is_audio:
                    # No playable media URL is read from the metadata here,
                    # so only flag the entry as audio.
                    st.info("This is an audio entry.")
                elif doc_url:
                    # Only scrape when there is a URL to scrape.
                    scraper = DigitalCommonwealthScraper()
                    images = (scraper.extract_images(doc_url) or [])[:1]

                    if images:
                        # Clear the previous download directory; presumably this
                        # is where the scraper writes by default — confirm
                        # against image_scraper. ignore_errors avoids the
                        # exists-then-delete race.
                        shutil.rmtree('downloaded_images', ignore_errors=True)
                        downloaded_files = scraper.download_images(images)
                        if downloaded_files:
                            st.image(
                                downloaded_files,
                                width=400,
                                caption=[img.get('alt', 'Image') for img in images],
                            )
        except Exception as e:
            logger.warning(f"[display_sources] Error displaying document: {e}")
            st.error("Error displaying one of the sources.")



def _close_settings() -> None:
    """Button callback: hide the settings panel before the next script run."""
    st.session_state.show_settings = False


def main():
    """Render the chatbot page: settings panel, chat history, input and footer.

    Session state keys used: ``messages`` (chat history), ``show_settings``
    (settings panel visibility) and ``num_sources`` (how many sources to
    display), plus ``llm`` and ``vectorstore`` set by ``initialize_models``.
    """
    st.title("Digital Commonwealth RAG 🤖")

    # Initialize session state
    if "messages" not in st.session_state:
        st.session_state.messages = []

    if "show_settings" not in st.session_state:
        st.session_state.show_settings = False

    if "num_sources" not in st.session_state:
        st.session_state.num_sources = 10

    initialize_models()
    # initialize_models reports its own failure; remember whether the objects
    # needed to answer questions actually exist so a failed start-up does not
    # surface later as an AttributeError on st.session_state.
    models_ready = "llm" in st.session_state and "vectorstore" in st.session_state

    # Settings button
    if st.button("⚙️ Settings"):
        st.session_state.show_settings = True

    if st.session_state.show_settings:
        with st.container():
            st.markdown("---")
            st.markdown("### ⚙️ Settings")

            num_sources = st.number_input(
                "Number of Sources to Display",
                min_value=1,
                max_value=100,
                value=st.session_state.num_sources,
                step=1,
            )
            st.session_state.num_sources = num_sources

            # Use a callback so the flag is cleared before the rerun renders;
            # setting it after the click left the panel visible for one more run.
            st.button("❌ Close Settings", on_click=_close_settings)
            st.markdown("---")

    # Show chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input box
    user_input = st.chat_input("Type your question here...")

    if user_input:
        with st.chat_message("user"):
            st.markdown(user_input)
        st.session_state.messages.append({"role": "user", "content": user_input})

        with st.chat_message("assistant"):
            if not models_ready:
                st.error("The chatbot is unavailable because its models failed to initialize.")
            else:
                with st.spinner("Thinking... Please be patient..."):
                    response, sources = process_message(
                        query=user_input,
                        llm=st.session_state.llm,
                        vectorstore=st.session_state.vectorstore
                    )

                    if isinstance(response, str):
                        st.markdown(response)
                        st.session_state.messages.append({
                            "role": "assistant",
                            "content": response
                        })

                        # Show at most the number of sources chosen in settings.
                        display_sources(sources[:int(st.session_state.num_sources)])
                    else:
                        st.error("Received an invalid response format")

    # Footer
    st.markdown("---")
    st.markdown(
        "Built with Langchain + Streamlit + Pinecone",
        help="Natural Language Querying for Digital Commonwealth"
    )
    st.markdown(
        "The Digital Commonwealth site provides access to photographs, manuscripts, books, "
        "audio recordings, and other materials of historical interest that have been digitized "
        "and made available by members of Digital Commonwealth."
    )

# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()