rithvik213 committed on
Commit
5e31ea3
·
1 Parent(s): c009c1e

updated RAG pipeline and streamlit file

Browse files
Files changed (2) hide show
  1. RAG.py +30 -202
  2. streamlit_app.py +37 -21
RAG.py CHANGED
@@ -62,44 +62,32 @@ def extract_text_from_json(json_data: Dict) -> str:
62
  return " ".join(text_parts) if text_parts else "No content available"
63
 
64
  def rephrase_and_expand_query(query: str, llm: Any) -> str:
65
-
66
- # Use LLM to rewrite and expand a query for better alignment with archive metadata.
67
  prompt_template = PromptTemplate.from_template(
68
  """
69
  You are a professional librarian skilled at historical research.
70
- Your task is to improve and expand the following search query to better match metadata in a historical archive.
71
-
72
- - First, rewrite the query to improve clarity and fit how librarians would search.
73
- - Second, expand the query by adding related terms (synonyms, related concepts, historical terminology, etc.).
74
-
75
- Return your output strictly in this format (no extra explanation):
76
  <IMPROVED_QUERY>your improved query here</IMPROVED_QUERY>
77
  <EXPANDED_QUERY>your expanded query here</EXPANDED_QUERY>
78
 
79
  Original Query: {query}
80
  """
81
  )
82
-
83
  prompt = prompt_template.invoke({"query": query})
84
  response = llm.invoke(prompt)
85
 
86
- # Extract just the improved and expanded queries
87
  improved_match = re.search(r"<IMPROVED_QUERY>(.*?)</IMPROVED_QUERY>", response.content, re.DOTALL)
88
  expanded_match = re.search(r"<EXPANDED_QUERY>(.*?)</EXPANDED_QUERY>", response.content, re.DOTALL)
89
 
90
  improved_query = improved_match.group(1).strip() if improved_match else query
91
  expanded_query = expanded_match.group(1).strip() if expanded_match else ""
92
 
93
- final_query = f"{improved_query} {expanded_query}".strip()
94
-
95
- logging.info(f"Original Query: {query}")
96
- logging.info(f"Improved Query: {improved_query}")
97
- logging.info(f"Expanded Query: {expanded_query}")
98
- logging.info(f"Final Query for Retrieval: {final_query}")
99
-
100
- return final_query
101
-
102
 
 
 
 
103
 
104
  weights = {
105
  "title_info_primary_tsi": 1.5, # Titles should be prioritized
@@ -164,132 +152,13 @@ def get_metadata_from_api(document_ids: List[str]) -> Dict[str, Dict]:
164
  metadata_dict[doc_id] = extract_text_from_json(json_data)
165
  return metadata_dict
166
 
167
-
168
-
169
- """
170
- def rerank(documents: List[Document], query: str) -> List[Document]:
171
- \"\"\"Ingest more metadata. Rerank documents using BM25\"\"\"
172
- start = time.time()
173
- if not documents:
174
- return []
175
-
176
- full_docs = []
177
- seen_sources = set()
178
- meta_start = time.time()
179
- for doc in documents:
180
- source = doc.metadata.get('source')
181
- if not source or source in seen_sources:
182
- continue # Skip duplicate sources
183
- seen_sources.add(source)
184
-
185
- url = f"https://www.digitalcommonwealth.org/search/{source}"
186
- json_data = safe_get_json(f"{url}.json")
187
-
188
- if json_data:
189
- text_content = extract_text_from_json(json_data)
190
- if text_content: # Only add documents with actual content
191
- full_docs.append(Document(page_content=text_content, metadata={"source": source, "field": doc.metadata.get("field", ""), "URL": url}))
192
-
193
- logging.info(f"Took {time.time()-meta_start} seconds to retrieve all metadata")
194
- if not full_docs:
195
- return []
196
-
197
- # Create BM25 retriever with the processed documents
198
- bm25 = BM25Retriever.from_documents(full_docs, k=min(10, len(full_docs)))
199
- bm25_ranked_docs = bm25.invoke(query)
200
-
201
- ranked_docs = []
202
- for doc in bm25_ranked_docs:
203
- bm25_score = 1.0
204
-
205
- # Compute metadata multiplier
206
- metadata_multiplier = 1.0
207
- for field, weight in weights.items():
208
- if field in doc.metadata and doc.metadata[field]:
209
- metadata_multiplier += weight
210
-
211
- # Compute final score: BM25 weight * Metadata multiplier
212
- final_score = bm25_score * metadata_multiplier
213
- ranked_docs.append((doc, final_score))
214
-
215
- # Sort by final score
216
- ranked_docs.sort(key=lambda x: x[1], reverse=True)
217
-
218
- logging.info(f"Finished reranking: {time.time()-start}")
219
- return [doc for doc, _ in ranked_docs]
220
- """
221
-
222
- '''
223
  def rerank(documents: List[Document], query: str) -> List[Document]:
224
- """Retrieve metadata from the database and rerank using BM25"""
225
- start = time.time()
226
  if not documents:
227
  return []
228
 
229
- document_ids = [doc.metadata.get('source') for doc in documents if doc.metadata.get('source')]
230
-
231
- # Fetch metadata from PostgreSQL
232
- metadata_dict = get_metadata_from_db(document_ids)
233
-
234
- full_docs = []
235
- for doc in documents:
236
- doc_id = doc.metadata.get('source')
237
- metadata = metadata_dict.get(doc_id, {})
238
-
239
- if metadata:
240
- text_content = " ".join([
241
- metadata.get("title", ""),
242
- metadata.get("abstract", ""),
243
- " ".join(metadata.get("subjects", [])),
244
- metadata.get("institution", "")
245
- ]).strip()
246
-
247
-
248
- if text_content:
249
- full_docs.append(Document(page_content=text_content, metadata={
250
- "source": doc_id,
251
- "URL": metadata.get("metadata_url", ""),
252
- "image_url": metadata.get("image_url", "")
253
- }))
254
-
255
- logging.info(f"Took {time.time()-start} seconds to retrieve all metadata from PostgreSQL")
256
-
257
- if not full_docs:
258
- return []
259
-
260
- # Rerank using BM25
261
- bm25 = BM25Retriever.from_documents(full_docs, k=min(10, len(full_docs)))
262
- bm25_ranked_docs = bm25.invoke(query)
263
-
264
- ranked_docs = []
265
- for doc in bm25_ranked_docs:
266
- bm25_score = 1.0
267
-
268
- # Compute metadata multiplier
269
- metadata_multiplier = 1.0
270
- for field, weight in weights.items():
271
- if field in doc.metadata and doc.metadata[field]:
272
- metadata_multiplier += weight
273
-
274
- # Compute final score: BM25 weight * Metadata multiplier
275
- final_score = bm25_score * metadata_multiplier
276
- ranked_docs.append((doc, final_score))
277
-
278
- # Sort by final score
279
- ranked_docs.sort(key=lambda x: x[1], reverse=True)
280
-
281
- logging.info(f"Finished reranking: {time.time()-start}")
282
- return [doc for doc, _ in ranked_docs]
283
- '''
284
-
285
- def rerank(documents: List[Document], query: str) -> List[Document]:
286
- """Rerank using BM25 and enhance scores using document metadata."""
287
- start = time.time()
288
-
289
- if not documents:
290
- return []
291
 
292
- # Group document chunks by source_id
293
  grouped = defaultdict(list)
294
  for doc in documents:
295
  source_id = doc.metadata.get("source")
@@ -298,49 +167,39 @@ def rerank(documents: List[Document], query: str) -> List[Document]:
298
 
299
  full_docs = []
300
  for source_id, chunks in grouped.items():
301
- combined_text = " ".join([chunk.page_content for chunk in chunks if chunk.page_content])
302
- representative_metadata = chunks[0].metadata or {}
303
-
304
- #logging.debug(f"Metadata for doc {source_id}: {representative_metadata}")
305
-
306
- if combined_text.strip():
307
- full_docs.append(Document(
308
- page_content=combined_text.strip(),
309
- metadata={
310
- "source": source_id,
311
- "URL": representative_metadata.get("metadata_url", ""),
312
- "image_url": representative_metadata.get("image_url", ""),
313
- **representative_metadata # preserve all original fields
314
- }
315
- ))
316
-
317
- logging.info(f"Built {len(full_docs)} documents for reranking in {time.time() - start:.2f} seconds.")
318
 
319
  if not full_docs:
320
  return []
321
 
322
- # BM25 reranking
323
- bm25 = BM25Retriever.from_documents(full_docs, k=min(10, len(full_docs)))
324
  bm25_ranked_docs = bm25.invoke(query)
325
 
326
- # Score enhancement using metadata weights
327
  ranked_docs = []
328
  for doc in bm25_ranked_docs:
329
- bm25_score = 1.0 # BM25 returns sorted, so base score is 1
330
  metadata_multiplier = 1.0
 
331
  for field, weight in weights.items():
332
  if field in doc.metadata and doc.metadata[field]:
333
  metadata_multiplier += weight
 
 
 
 
 
 
 
334
  final_score = bm25_score * metadata_multiplier
335
  ranked_docs.append((doc, final_score))
336
 
337
- # Sort by enhanced score
338
  ranked_docs.sort(key=lambda x: x[1], reverse=True)
339
- logging.info(f"Finished reranking in {time.time() - start:.2f} seconds")
340
-
341
- return [doc for doc, _ in ranked_docs]
342
-
343
-
344
 
345
  def parse_xml_and_query(query:str,xml_string:str) -> str:
346
  """parse xml and return rephrased query"""
@@ -376,43 +235,12 @@ def RAG(llm: Any, query: str,vectorstore:PineconeVectorStore, top: int = 10, k:
376
 
377
  # Query alignment is commented out, however I have decided to leave it in for potential future use.
378
 
379
- # Retrieve initial documents using rephrased query -- not working as intended currently, maybe would be better for data with more words.
380
- # query_template = PromptTemplate.from_template(
381
- # """
382
- # Your job is to think about a query and then generate a statement that only includes information from the query that would answer the query.
383
- # You will be provided with a query in <QUERY></QUERY> tags.
384
- # Then you will think about what kind of information the query is looking for between <REASONING></REASONING> tags.
385
- # Then, based on the reasoning, you will generate a sample response to the query that only includes information from the query between <STATEMENT></STATEMENT> tags.
386
- # Afterwards, you will determine and reason about whether or not the statement you generated only includes information from the original query and would answer the query between <DETERMINATION></DETERMINATION> tags.
387
- # Finally, you will return a YES, or NO response between <VALID></VALID> tags based on whether or not you determined the statement to be valid.
388
- # Let me provide you with an example:
389
-
390
- # <QUERY>I would really like to learn more about Bermudan geography<QUERY>
391
-
392
- # <REASONING>This query is interested in geography as it relates to Bermuda. Some things they might be interested in are Bermudan climate, towns, cities, and geography</REASONING>
393
-
394
- # <STATEMENT>Bermuda's Climate is [blank]. Some of Bermuda's cities and towns are [blank]. Other points of interest about Bermuda's geography are [blank].</STATEMENT>
395
-
396
- # <DETERMINATION>The query originally only mentions bermuda and geography. The answers do not provide any false information, instead replacing meaningful responses with a placeholder [blank]. If it had hallucinated, it would not be valid. Because the statements do not hallucinate anything, this is a valid statement.</DETERMINATION>
397
-
398
- # <VALID>YES</VALID>
399
-
400
- # Now it's your turn! Remember not to hallucinate:
401
-
402
- # <QUERY>{query}</QUERY>
403
- # """
404
- # )
405
- # query_prompt = query_template.invoke({"query":query})
406
- # query_response = llm.invoke(query_prompt)
407
- # new_query = parse_xml_and_query(query=query,xml_string=query_response.content)
408
-
409
- #logging.info(f"\n---\nQUERY: {query}")
410
-
411
- #new query rephrasing
412
- #query = rephrase_and_expand_query(query, llm)
413
- #logging.info(f"\n---\nRephrased QUERY: {query}")
414
 
415
  retrieved, _ = retrieve(query=query, vectorstore=vectorstore, k=k)
 
416
  if not retrieved:
417
  return "No documents found for your query.", []
418
 
 
62
  return " ".join(text_parts) if text_parts else "No content available"
63
 
64
  def rephrase_and_expand_query(query: str, llm: Any) -> str:
65
+ """Use LLM to rewrite and expand a query for better alignment with archive metadata."""
 
66
  prompt_template = PromptTemplate.from_template(
67
  """
68
  You are a professional librarian skilled at historical research.
69
+ Rewrite and expand the query to match metadata tags. Include related terms (synonyms, historical names, places, events).
70
+
 
 
 
 
71
  <IMPROVED_QUERY>your improved query here</IMPROVED_QUERY>
72
  <EXPANDED_QUERY>your expanded query here</EXPANDED_QUERY>
73
 
74
  Original Query: {query}
75
  """
76
  )
 
77
  prompt = prompt_template.invoke({"query": query})
78
  response = llm.invoke(prompt)
79
 
 
80
  improved_match = re.search(r"<IMPROVED_QUERY>(.*?)</IMPROVED_QUERY>", response.content, re.DOTALL)
81
  expanded_match = re.search(r"<EXPANDED_QUERY>(.*?)</EXPANDED_QUERY>", response.content, re.DOTALL)
82
 
83
  improved_query = improved_match.group(1).strip() if improved_match else query
84
  expanded_query = expanded_match.group(1).strip() if expanded_match else ""
85
 
86
+ return f"{improved_query} {expanded_query}".strip()
 
 
 
 
 
 
 
 
87
 
88
+ def extract_years_from_query(query: str) -> List[str]:
89
+ """Extract 4-digit years from query for boosting."""
90
+ return re.findall(r"\b(1[5-9]\d{2}|20\d{2}|21\d{2}|22\d{2}|23\d{2})\b", query)
91
 
92
  weights = {
93
  "title_info_primary_tsi": 1.5, # Titles should be prioritized
 
152
  metadata_dict[doc_id] = extract_text_from_json(json_data)
153
  return metadata_dict
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def rerank(documents: List[Document], query: str) -> List[Document]:
156
+ """Rerank documents using BM25 and metadata, boost if year matches."""
 
157
  if not documents:
158
  return []
159
 
160
+ query_years = extract_years_from_query(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
 
162
  grouped = defaultdict(list)
163
  for doc in documents:
164
  source_id = doc.metadata.get("source")
 
167
 
168
  full_docs = []
169
  for source_id, chunks in grouped.items():
170
+ combined_text = " ".join(chunk.page_content for chunk in chunks if chunk.page_content)
171
+ metadata = chunks[0].metadata if chunks else {}
172
+ full_docs.append(Document(
173
+ page_content=combined_text.strip(),
174
+ metadata={**metadata, "source": source_id}
175
+ ))
 
 
 
 
 
 
 
 
 
 
 
176
 
177
  if not full_docs:
178
  return []
179
 
180
+ bm25 = BM25Retriever.from_documents(full_docs, k=len(full_docs))
 
181
  bm25_ranked_docs = bm25.invoke(query)
182
 
 
183
  ranked_docs = []
184
  for doc in bm25_ranked_docs:
185
+ bm25_score = 1.0
186
  metadata_multiplier = 1.0
187
+
188
  for field, weight in weights.items():
189
  if field in doc.metadata and doc.metadata[field]:
190
  metadata_multiplier += weight
191
+
192
+ date_field = str(doc.metadata.get("date_tsim", ""))
193
+ for year in query_years:
194
+ if re.search(rf"\b{year}\b", date_field) or re.search(rf"{year[:-2]}\d{{2}}–{year[:-2]}\d{{2}}", date_field):
195
+ metadata_multiplier += 50
196
+ break
197
+
198
  final_score = bm25_score * metadata_multiplier
199
  ranked_docs.append((doc, final_score))
200
 
 
201
  ranked_docs.sort(key=lambda x: x[1], reverse=True)
202
+ return [doc for doc, _ in ranked_docs[:10]]
 
 
 
 
203
 
204
  def parse_xml_and_query(query:str,xml_string:str) -> str:
205
  """parse xml and return rephrased query"""
 
235
 
236
  # Query alignment is commented out, however I have decided to leave it in for potential future use.
237
 
238
+ # 🔄 Rephrase and expand the user query for better Pinecone matching
239
+ query = rephrase_and_expand_query(query, llm)
240
+ logging.info(f"Rephrased Query for Retrieval: {query}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  retrieved, _ = retrieve(query=query, vectorstore=vectorstore, k=k)
243
+
244
  if not retrieved:
245
  return "No documents found for your query.", []
246
 
streamlit_app.py CHANGED
@@ -31,7 +31,7 @@ def initialize_models() -> Tuple[Optional[ChatOpenAI], HuggingFaceEmbeddings]:
31
  if "llm" not in st.session_state:
32
  # Initialize OpenAI model
33
  st.session_state.llm = ChatOpenAI(
34
- model="gpt-4o-mini", # Changed from gpt-4o-mini which appears to be a typo
35
  temperature=0,
36
  timeout=60, # Added reasonable timeout
37
  max_retries=2
@@ -81,7 +81,7 @@ def process_message(
81
  return f"Error processing message: {str(e)}", []
82
 
83
  def display_sources(sources: List) -> None:
84
- """Display sources with minimal output: content preview, source, URL, and image if available."""
85
  if not sources:
86
  st.info("No sources available for this response.")
87
  return
@@ -89,40 +89,56 @@ def display_sources(sources: List) -> None:
89
  st.subheader("Sources")
90
  for doc in sources:
91
  try:
92
- source = doc.metadata.get("source", "Unknown Source")
93
- title = doc.metadata.get("title_info_primary_tsi", "Unknown Title")
 
 
94
 
95
- with st.expander(f"{title}"):
 
 
 
 
96
  # Content preview
97
  if hasattr(doc, 'page_content'):
98
- st.markdown(f"**Content:** {doc.page_content[:100]} ...")
99
 
100
- # Extract URL
101
- doc_url = doc.metadata.get("URL", "").strip()
102
  if not doc_url and source:
103
  doc_url = f"https://www.digitalcommonwealth.org/search/{source}"
104
 
105
  st.markdown(f"**Source ID:** {source}")
 
106
  st.markdown(f"**URL:** {doc_url}")
107
 
108
- # Try to show an image
109
- scraper = DigitalCommonwealthScraper()
110
- images = scraper.extract_images(doc_url)
111
- images = images[:1]
112
-
113
- if images:
114
- output_dir = 'downloaded_images'
115
- if os.path.exists(output_dir):
116
- shutil.rmtree(output_dir)
117
- downloaded_files = scraper.download_images(images)
118
- st.image(downloaded_files, width=400, caption=[
119
- img.get('alt', f'Image') for img in images
120
- ])
 
 
 
 
 
 
 
 
121
  except Exception as e:
122
  logger.warning(f"[display_sources] Error displaying document: {e}")
123
  st.error("Error displaying one of the sources.")
124
 
125
 
 
126
  def main():
127
  st.title("Digital Commonwealth RAG 🤖")
128
 
 
31
  if "llm" not in st.session_state:
32
  # Initialize OpenAI model
33
  st.session_state.llm = ChatOpenAI(
34
+ model="gpt-3.5-turbo",
35
  temperature=0,
36
  timeout=60, # Added reasonable timeout
37
  max_retries=2
 
81
  return f"Error processing message: {str(e)}", []
82
 
83
  def display_sources(sources: List) -> None:
84
+ """Display sources with minimal output: content preview, source, URL, and image/audio if available."""
85
  if not sources:
86
  st.info("No sources available for this response.")
87
  return
 
89
  st.subheader("Sources")
90
  for doc in sources:
91
  try:
92
+ metadata = doc.metadata
93
+ source = metadata.get("source", "Unknown Source")
94
+ title = metadata.get("title_info_primary_tsi", "Unknown Title")
95
+ format_type = metadata.get("format", "").lower()
96
 
97
+ is_audio = "audio" in format_type
98
+
99
+ expander_title = f"🔊 {title}" if is_audio else title
100
+
101
+ with st.expander(expander_title):
102
  # Content preview
103
  if hasattr(doc, 'page_content'):
104
+ st.markdown(f"**Content:** {doc.page_content[:300]} ...")
105
 
106
+ # URL building
107
+ doc_url = metadata.get("URL", "").strip()
108
  if not doc_url and source:
109
  doc_url = f"https://www.digitalcommonwealth.org/search/{source}"
110
 
111
  st.markdown(f"**Source ID:** {source}")
112
+ st.markdown(f"**Format:** {format_type if format_type else 'Not specified'}")
113
  st.markdown(f"**URL:** {doc_url}")
114
 
115
+ # 🔊 Try to show audio if it's an audio entry and there's a media file
116
+ if is_audio:
117
+ # Try to find a playable media file β€” if metadata has audio URLs
118
+ # For now, just embed a dummy player or placeholder
119
+ st.info("This is an audio entry.")
120
+ # Optionally:
121
+ # st.audio("https://example.com/audio-file.mp3") # replace with real audio URL
122
+ else:
123
+ # πŸ–ΌοΈ Show image if it's not audio
124
+ scraper = DigitalCommonwealthScraper()
125
+ images = scraper.extract_images(doc_url)
126
+ images = images[:1]
127
+
128
+ if images:
129
+ output_dir = 'downloaded_images'
130
+ if os.path.exists(output_dir):
131
+ shutil.rmtree(output_dir)
132
+ downloaded_files = scraper.download_images(images)
133
+ st.image(downloaded_files, width=400, caption=[
134
+ img.get('alt', f'Image') for img in images
135
+ ])
136
  except Exception as e:
137
  logger.warning(f"[display_sources] Error displaying document: {e}")
138
  st.error("Error displaying one of the sources.")
139
 
140
 
141
+
142
  def main():
143
  st.title("Digital Commonwealth RAG 🤖")
144