kap2403 committed on
Commit
5e433de
·
1 Parent(s): ab45c95

"added files"

.gitignore ADDED
@@ -0,0 +1,37 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *.pyo
+ *.pyd
+
+ # Logs
+ *.log
+
+ # Environment files
+ .env
+ .venv/
+ venv/
+
+ # Gradio, VSCode, PyCharm
+ .gradio/
+ # .vscode/
+ .idea/
+
+ # Jupyter Notebook checkpoints
+ .ipynb_checkpoints/
+
+ # System files
+ .DS_Store
+ Thumbs.db
+
+ # Ignore local folders
+ dataset/
+ database/
+ notebooks/
+ mbbs_bot/
+ faiss_index/
+
+ # Ignore test or temp files
+ *.tmp
+ *.bak
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,98 @@
+ import gradio as gr
+ from src.auth.auth import handle_login
+ from src.auth.db import initialize_db
+ from dotenv import load_dotenv
+ from src.interface import Interface
+
+ # Set up the user database and load environment variables
+ initialize_db()
+ load_dotenv()
+
+ bot = None  # No bot is created until the user logs in or registers
+
+
+ def start_bot(userid, password, api_key_input=None):
+     # api_key_input defaults to None so the Login button, which has no
+     # API-key field, can call this handler with only two inputs
+     global bot
+     login_status = handle_login(userid, password, api_key_input)
+
+     if "successful" in login_status:  # Check for successful login
+         bot = Interface()  # Initialize after login success
+         return (
+             login_status,
+             gr.update(visible=False),  # Hide login/registration section
+             gr.update(visible=True)    # Show chat section
+         )
+     else:
+         return (
+             login_status,
+             gr.update(visible=True),   # Keep login/registration section visible
+             gr.update(visible=False)   # Keep chat section hidden
+         )
+
+
+ def answer(message, history):
+     answer_md, tables_display, images_display, retrieved_display = bot.get_answer(message)
+
+     # Combine all parts into a single response string for chat
+     combined_response = f"{answer_md}\n\n{tables_display}"
+
+     # Append images as markdown
+     if images_display:
+         combined_response += "\n\n" + "\n\n".join(images_display)
+
+     return combined_response
+
+
+ with gr.Blocks(fill_height=True, fill_width=True) as app:
+
+     with gr.Column(visible=True) as login_register_section:
+         gr.Markdown("# 🔐 MediBot Login & Registration")
+         with gr.Tabs():
+             with gr.TabItem("Login"):
+                 userid_login = gr.Textbox(label="UserID")
+                 password_login = gr.Textbox(label="Password", type="password")
+                 login_btn = gr.Button("Login")
+                 login_output = gr.Textbox(label="Login Status", interactive=False)
+
+             with gr.TabItem("Register"):
+                 gr.Markdown("## 🔑 Enter Your Groq Cloud API Key")
+                 gr.Markdown("You can create an API key at "
+                             "[Groq Cloud Console](https://console.groq.com/keys)")
+                 userid_register = gr.Textbox(label="UserID")
+                 password_register = gr.Textbox(label="Password", type="password")
+                 api_key_register = gr.Textbox(
+                     label="Groq API Key",
+                     type="password",
+                     placeholder="sk-... (required)"
+                 )
+                 register_btn = gr.Button("Register")
+                 register_output = gr.Textbox(label="Registration Status",
+                                              interactive=False)
+
+     # Chat Section (initially hidden)
+     with gr.Column(visible=False) as chat_section:
+         gr.ChatInterface(
+             answer,
+             title="🩺 Medico-Bot",
+             examples=["briefly explain cancer to me", "types of skin diseases?"],
+             flagging_options=["Like", "Dislike"]
+         )
+
+
+     # Function connections
+     login_btn.click(
+         start_bot,
+         inputs=[userid_login, password_login],
+         outputs=[login_output, login_register_section, chat_section]
+     )
+
+     register_btn.click(
+         start_bot,
+         inputs=[userid_register, password_register, api_key_register],
+         outputs=[register_output, login_register_section, chat_section]
+     )
+
+
+ app.launch(share=True, show_error=True)
demo.py ADDED
@@ -0,0 +1,83 @@
+ import gradio as gr
+ import os
+ from dotenv import load_dotenv
+ from src.interface import (
+     Interface,
+     handle_login,
+ )
+ from src import config
+
+ # Load environment variables
+ load_dotenv()
+
+ bot = None  # No bot is created until the user logs in or registers
+
+
+ # Handle bot initialization after a successful login
+ def start_bot(userid, password, api_key_input):
+     # Run login first and get the result
+     global bot
+     login_status, login_section, chat_section = handle_login(userid, password, api_key_input)
+
+     # Initialize the bot once login is successful
+     if "successful" in login_status:  # Check for successful login
+         bot = Interface()  # Initialize after login success
+     # The bot lives in a module-level global, so only the three UI updates are
+     # returned, matching the three components in the click handler's outputs
+     return login_status, login_section, chat_section
+
+
+ def answer(message, history):
+     answer_md, tables_display, images_display, retrieved_display = bot.get_answer(message)
+
+     # Combine all parts into a single response string for chat
+     combined_response = f"{answer_md}\n\n{tables_display}"
+
+     # Append images as markdown
+     if images_display:
+         combined_response += "\n\n" + "\n\n".join(images_display)
+
+     return combined_response
+
+
+ # Build Gradio Interface
+ with gr.Blocks(fill_height=True, fill_width=True) as app:
+     # gr.Markdown("# 🧪 MediBot Login & Chat App")
+
+     # Login Section
+     with gr.Column(visible=True) as login_section:
+         gr.Markdown("## 🔑 Enter Your Groq Cloud API Key")
+         gr.Markdown("You can create an API key at [Groq Cloud Console](https://console.groq.com/keys)")
+         gr.Markdown("## 🔐 Login or Register")
+
+         userid_input = gr.Textbox(label="UserID")
+         password_input = gr.Textbox(label="Password", type="password")
+         api_key_input = gr.Textbox(
+             label="Groq API Key (only needed for registration)",
+             type="password",
+             placeholder="sk-... (optional)"
+         )
+
+         login_btn = gr.Button("Login / Register")
+         login_output = gr.Textbox(label="Login Status", interactive=False)
+
+     # Chat Section (initially hidden)
+     with gr.Column(visible=False) as chat_section:
+         gr.ChatInterface(
+             answer,
+             title="🩺 MediBot Chat Interface",
+             examples=["briefly explain cancer to me", "types of skin diseases?"],
+             flagging_options=["Like", "Dislike"]
+         )
+
+
+     login_btn.click(
+         fn=start_bot,
+         inputs=[userid_input, password_input, api_key_input],
+         outputs=[login_output, login_section, chat_section]
+     )
+
+
+ app.launch(share=True, show_error=True)
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ docling
+ langchain-huggingface
+ accelerate
+ tiktoken==0.9.0
+ langchain-openai
+ langchain-community
+ faiss-cpu
+ groq
+ toml
+ langchain-groq
+ gradio==5.26.0
+ markdown
+ unstructured[all-docs]
+ pillow
+ lxml
+ bcrypt
src/__init__.py ADDED
File without changes
src/auth/auth.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ import sqlite3
+ import bcrypt
+ from src.auth.db import get_db_connection
+ from src import config
+ from groq import Groq
+
+ def register_user(userid, password, api_key):
+     conn = get_db_connection()
+     cursor = conn.cursor()
+     password_hash = bcrypt.hashpw(password.encode(), bcrypt.gensalt())
+     try:
+         cursor.execute('INSERT INTO users (userid, password_hash, api_key) VALUES (?, ?, ?)',
+                        (userid, password_hash, api_key))
+         conn.commit()
+         return True, "✅ Registered successfully!"
+     except sqlite3.IntegrityError:
+         # userid is the primary key, so a duplicate insert raises IntegrityError
+         return False, "❌ User already exists."
+     finally:
+         conn.close()
+
+ def login_user(userid, password):
+     conn = get_db_connection()
+     cursor = conn.cursor()
+     cursor.execute('SELECT password_hash, api_key FROM users WHERE userid=?', (userid,))
+     result = cursor.fetchone()
+     conn.close()
+
+     if result:
+         stored_hash, api_key = result
+         if bcrypt.checkpw(password.encode(), stored_hash):
+             return True, api_key
+     return False, None
+
+
+ def verify_login(userid, password):
+     # Verify the user's login credentials
+     success, saved_api_key = login_user(userid, password)
+     if success:
+         config.api_key = saved_api_key
+         os.environ["GROQ_API_KEY"] = saved_api_key
+         return "✅ Login successful!"
+     else:
+         return "❌ Incorrect userid or password."
+
+ def register_user_with_api_key(userid, password, user_api_key):
+     # Validate the API key first with a minimal test request
+     try:
+         client = Groq(api_key=user_api_key)
+         client.chat.completions.create(
+             messages=[{"role": "user", "content": "Hello"}],
+             model="llama3-70b-8192"
+         )
+
+         # If the API key is valid, proceed to register the user
+         success, msg = register_user(userid, password, user_api_key)
+         if success:
+             config.api_key = user_api_key
+             os.environ["GROQ_API_KEY"] = user_api_key
+             return "✅ API Key validated & registered!"
+         else:
+             return msg
+
+     except Exception as e:
+         # API key invalid
+         return f"❌ Invalid API Key: {str(e)}"
+
+
+ def handle_login(userid, password, user_api_key):
+     if user_api_key:
+         # A supplied API key means registration with key validation
+         return register_user_with_api_key(userid, password, user_api_key)
+     else:
+         # Otherwise, a standard login
+         return verify_login(userid, password)
src/auth/db.py ADDED
@@ -0,0 +1,19 @@
+ import sqlite3
+ from src.config import DB_PATH
+
+ def get_db_connection():
+     conn = sqlite3.connect(DB_PATH)
+     return conn
+
+ def initialize_db():
+     conn = get_db_connection()
+     cursor = conn.cursor()
+     cursor.execute('''
+         CREATE TABLE IF NOT EXISTS users (
+             userid TEXT PRIMARY KEY,
+             password_hash TEXT NOT NULL,
+             api_key TEXT NOT NULL
+         )
+     ''')
+     conn.commit()
+     conn.close()
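A minimal round-trip over this schema, using the helpers from src/auth/auth.py; the credentials and key below are made up for illustration:

    from src.auth.db import initialize_db
    from src.auth.auth import register_user, verify_login

    initialize_db()  # creates users.db (or DB_PATH) with the users table

    # hypothetical credentials, for illustration only
    ok, msg = register_user("student1", "s3cret", "gsk_example_key")
    print(msg)                                  # "✅ Registered successfully!" on first run
    print(verify_login("student1", "s3cret"))   # "✅ Login successful!"

Note that register_user stores the key as given; validation against Groq only happens in register_user_with_api_key.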
src/auth_app.py ADDED
@@ -0,0 +1,45 @@
+ import gradio as gr
+ import os
+ from groq import Groq
+
+ api_key = None  # Module-level holder for the validated key
+
+
+ # Function to validate and save the API key
+ def validate_api_key(user_api_key):
+     global api_key
+     if not user_api_key:
+         return "❌ Please enter your Groq Cloud API key."
+
+     try:
+         # Initialize Groq client
+         client = Groq(api_key=user_api_key)
+
+         # Make a test request
+         client.chat.completions.create(
+             messages=[{"role": "user", "content": "Hello"}],
+             model="llama-3.1-8b-instant"
+         )
+
+         # Save the API key to an environment variable if successful
+         api_key = user_api_key
+         os.environ["GROQ_API_KEY"] = api_key
+
+         return "✅ API key is valid and saved!"
+     except Exception as e:
+         return f"❌ Invalid API key: {str(e)}"
+
+ # Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("## 🔑 Enter Your Groq Cloud API Key")
+     gr.Markdown("You can create an API key at [Groq Cloud Console](https://console.groq.com/keys)")
+
+     api_key_input = gr.Textbox(label="Groq Cloud API Key", type="password", placeholder="sk-...")
+     submit_button = gr.Button("Validate API Key")
+     output_text = gr.Textbox(label="Status", interactive=False)
+
+     # Run the validation function on button click
+     submit_button.click(fn=validate_api_key, inputs=api_key_input, outputs=output_text)
+
+ demo.launch()
src/bot/__init__.py ADDED
File without changes
src/bot/bot.py ADDED
@@ -0,0 +1,83 @@
+ import os
+ import toml
+ import logging
+ from typing import List, Tuple
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.documents import Document
+ from langchain_community.vectorstores import FAISS
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
+ from langchain_groq import ChatGroq
+ from src.bot.extract_metadata import Metadata
+
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ class Medibot:
+     def __init__(self, config_path: str = "src/bot/configs/prompt.toml",
+                  metadata_database: str = "database/metadata.csv",
+                  faiss_database: str = "database/faiss_index"
+                  ):
+         """Initialize Medibot with configuration and Groq client."""
+         # Check environment variables
+         api_key = os.environ.get("GROQ_API_KEY")
+         if not api_key:
+             logger.error("GROQ_API_KEY not found in environment variables")
+             raise ValueError("GROQ_API_KEY is required")
+
+         # Load prompt configuration
+         try:
+             config = toml.load(config_path)
+             system_prompt = config["rag_prompt"]["system_prompt"]
+             user_prompt_template = config["rag_prompt"]["user_prompt_template"]
+
+         except (FileNotFoundError, toml.TomlDecodeError) as e:
+             logger.error(f"Failed to load config from {config_path}: {e}")
+             raise
+
+         # Initialize prompt template
+         self.prompt_template = ChatPromptTemplate.from_messages([
+             ("system", system_prompt),
+             ("user", user_prompt_template)
+         ])
+
+         # Initialize the vector database
+         embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
+         vector_store = FAISS.load_local(
+             faiss_database, embeddings, allow_dangerous_deserialization=True
+         )
+         self.retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})
+
+         # Initialize the Groq chat model
+         self.model = ChatGroq(
+             model="llama-3.1-8b-instant",
+             temperature=0.2,
+             max_tokens=None,
+             timeout=None,
+             max_retries=2,
+         )
+
+         self.metadata_extractor = Metadata(metadata_database)
+
+
+     def query(self, question: str) -> Tuple[str, List[Document], dict, dict]:
+         retrieved_docs = self.retriever.invoke(question)
+
+         rag_chain = (
+             RunnableParallel({
+                 "context": RunnableLambda(lambda _: retrieved_docs),  # Reuse retrieved docs
+                 "question": RunnablePassthrough()
+             })
+             | self.prompt_template
+             | self.model
+             | StrOutputParser()
+         )
+
+         answer = rag_chain.invoke({"question": question})
+
+         referred_tables, referred_images = self.metadata_extractor.get_data_from_ref(retrieved_docs)
+         return answer, retrieved_docs, referred_tables, referred_images
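A quick smoke test for the class above; this sketch assumes GROQ_API_KEY and OPENAI_API_KEY are set and that the default prompt and index paths from __init__ exist:

    from src.bot.bot import Medibot

    bot = Medibot()  # defaults: src/bot/configs/prompt.toml and database/faiss_index
    answer, docs, tables, images = bot.query("What are the types of skin diseases?")
    print(answer)
    print(f"{len(docs)} chunks retrieved, {len(tables)} tables, {len(images)} images")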
src/bot/configs/config.json ADDED
@@ -0,0 +1 @@
+ {"model_id": "llama-3.3-70b-versatile"}
src/bot/configs/prompt.toml ADDED
@@ -0,0 +1,25 @@
+ [rag_prompt]
+ system_prompt = """
+ You are a helpful and knowledgeable medical education assistant. You assist medical students by answering their questions using only the information retrieved from a trusted knowledge base. Your goal is to explain medical concepts clearly, concisely, and in a well-structured manner suitable for students.
+
+ - You must base all answers strictly on the retrieved content.
+ - Do not make up any information (no hallucination).
+ - If the answer cannot be found in the retrieved content, inform the user that the information was not found.
+ - Structure your answers with headings, bullet points, or numbered lists when appropriate.
+ - Aim to make complex topics easy to understand for learners.
+ - When providing an answer, always include a reference to the source (book name) from the metadata of the retrieved content.
+ """
+
+ user_prompt_template = """
+ You are a medical assistant helping students understand complex topics. Use the following retrieved context to answer the question. If the answer is not in the context, say that you couldn't find relevant information.
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ Answer:
+
+ Reference Books:
+ """
src/bot/extract_metadata.py ADDED
@@ -0,0 +1,80 @@
+ from langchain_core.documents import Document
+ from typing import Tuple, List
+ import pandas as pd
+ import re
+
+ class Metadata:
+     def __init__(self, ref_database_path: str):
+         self.df = pd.read_csv(ref_database_path)
+
+     def extract_ref_from_metadata(self, meta_data: dict) -> dict:
+         """Extract table and picture references from the metadata of a chunk."""
+
+         meta_data_dict = {}
+         meta_data_dict["source"] = meta_data.get("source", "")
+         self_ref = meta_data.get("self_ref", "")
+         parent_ref = meta_data.get("parent_ref", "")
+         child_ref = meta_data.get("child_ref", "")
+
+         formatted_self_ref = re.split(r'[,\s]+', self_ref or "")
+         formatted_parent_ref = re.split(r'[,\s]+', parent_ref or "")
+         formatted_child_ref = re.split(r'[,\s]+', child_ref or "")
+
+         filtered_self_ref_ids = [item for item in formatted_self_ref
+                                  if item.startswith('#/tables/') or item.startswith('#/pictures/')]
+         filtered_parent_ref_ids = [item for item in formatted_parent_ref
+                                    if item.startswith('#/tables/') or item.startswith('#/pictures/')]
+         filtered_child_ref_ids = [item for item in formatted_child_ref
+                                   if item.startswith('#/tables/') or item.startswith('#/pictures/')]
+
+         # Combine all filtered references into a set (to avoid duplicates)
+         all_filtered_references = set(filtered_self_ref_ids +
+                                       filtered_parent_ref_ids +
+                                       filtered_child_ref_ids)
+         if len(all_filtered_references) > 0:
+             meta_data_dict["self_ref"] = list(all_filtered_references)
+         return meta_data_dict
+
+     def extract_all_ref_from_retrieved_chunks(self, chunks: List[Document]) -> dict:
+         all_metadata = {}
+         # Iterate over the retrieved documents and collect their references
+         for idx, doc in enumerate(chunks):
+             meta_data = doc.metadata  # Extract metadata from the document
+             extracted_ref_data = self.extract_ref_from_metadata(meta_data)  # Extract references
+
+             # Add the extracted metadata to the all_metadata dictionary
+             if extracted_ref_data:
+                 all_metadata[f"doc_{idx}"] = extracted_ref_data
+
+         return all_metadata
+
+     def get_data_from_ref(self, chunks: List[Document]) -> Tuple[dict, dict]:
+         """Extract tables and pictures from metadata using references."""
+
+         tables = {}
+         images = {}
+
+         all_metadata = self.extract_all_ref_from_retrieved_chunks(chunks)
+
+         for meta in all_metadata.values():
+             source = meta.get("source", "")
+             ref = meta.get("self_ref", [])
+
+             for r in ref:
+                 reference_rows = self.df[
+                     (self.df['source'] == source) &
+                     (self.df['self_ref'] == r)
+                 ]
+
+                 if not reference_rows.empty:
+                     chunk_type = reference_rows["chunk_type"].values[0]
+                     page_content = reference_rows["page_content"].values[0]
+
+                     if chunk_type == "table":
+                         tables[r] = page_content
+                     elif chunk_type == "picture":
+                         images[r] = page_content
+
+         return tables, images
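The lookups above assume database/metadata.csv (not part of this commit) carries at least source, self_ref, chunk_type, and page_content columns. Two hypothetical rows for illustration only; judging by indexing.py, a picture row's page_content is its own reference, while a table row holds the markdown export:

    source,self_ref,chunk_type,page_content
    dermatology-textbook,#/tables/3,table,"| Disease | Key feature | ... |"
    dermatology-textbook,#/pictures/7,picture,#/pictures/7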
src/bot/utils.py ADDED
@@ -0,0 +1,23 @@
+ import toml
+ import json
+
+ class config_file_reader:
+     @staticmethod
+     def read_json(config_file: str):
+         with open(config_file, 'r') as file:
+             config_data = json.load(file)
+         return config_data
+
+     @staticmethod
+     def read_toml(config_file: str):
+         with open(config_file, 'r') as file:
+             config_data = toml.load(file)
+         return config_data
+
+     @staticmethod
+     def read_configs(file_path: str) -> dict:
+         # Dispatch on the file extension; both readers return a dict
+         if file_path.endswith(".json"):
+             configs = config_file_reader.read_json(file_path)
+         else:
+             configs = config_file_reader.read_toml(file_path)
+         return configs
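Usage is a one-liner; for example, reading the prompt config added in this commit:

    from src.bot.utils import config_file_reader

    configs = config_file_reader.read_configs("src/bot/configs/prompt.toml")
    print(configs["rag_prompt"]["system_prompt"][:60])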
src/config.py ADDED
@@ -0,0 +1,5 @@
+ import os
+
+ DB_PATH = os.getenv("DB_PATH", "users.db")
+
+ api_key = None
src/data_preprocessing/__init__.py ADDED
File without changes
src/data_preprocessing/converting_text_to_embeddings.py ADDED
@@ -0,0 +1,89 @@
+ # This module is responsible for converting text data into embeddings using the
+ # OpenAI API and storing them in a FAISS database.
+
+ import faiss
+ import tiktoken
+ from langchain_community.docstore.in_memory import InMemoryDocstore
+ from langchain_community.vectorstores import FAISS
+ from langchain_openai import OpenAIEmbeddings
+ from uuid import uuid4
+ from dotenv import load_dotenv
+ import logging
+ # other imports
+ from dataloader import dataloader
+
+ logging.basicConfig(level=logging.INFO)
+
+ def main(folder_path: str) -> None:
+     """
+     Main function to convert text data into embeddings and store them in a FAISS database.
+     The function uses the OpenAI API to generate embeddings and the FAISS library
+     to manage the index.
+
+     Args:
+         folder_path (str): path to the folder containing the data files.
+     """
+     logging.info("Loading environment variables...")
+     load_dotenv()  # Load environment variables from .env file
+     logging.info("Environment variables loaded.")
+     logging.info("Loading OpenAI embeddings...")
+     embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
+     logging.info("OpenAI embeddings loaded.")
+     logging.info("Creating FAISS index...")
+
+     # Create a FAISS index sized to the embedding dimension
+     index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
+     # Load the encoder used to count tokens
+     enc = tiktoken.get_encoding("cl100k_base")
+
+     vector_store = FAISS(
+         embedding_function=embeddings,
+         index=index,
+         docstore=InMemoryDocstore(),
+         index_to_docstore_id={},
+     )
+     logging.info("FAISS index created.")
+     logging.info("Loading data from folder...")
+     # Load the data
+     chunks_list, _, _, _ = dataloader(folder_path)
+     logging.info(f"Loaded {len(chunks_list)} chunks from folder: {folder_path}")
+     # Calculate the number of tokens
+     total_tokens = sum(len(enc.encode(doc.page_content)) for doc in chunks_list)
+     cost = (total_tokens / 1000000) * 0.13
+     logging.info(f"Total tokens: {total_tokens}")
+     logging.info(f"Estimated cost of using text-embedding-3-large: ${cost:.2f}")
+
+     # Ask the user for confirmation
+     proceed = input("Do you want to proceed with embedding and storing the data in FAISS? (yes/no): ").strip().lower()
+     if proceed not in ['yes', 'y']:
+         logging.info("Operation cancelled by the user.")
+         return
+     logging.info("Proceeding with embedding and storing the data in FAISS...")
+
+     # Convert the text data to embeddings and store it under fresh UUIDs
+     uuids = [str(uuid4()) for _ in range(len(chunks_list))]
+     vector_store.add_documents(documents=chunks_list, ids=uuids)
+     logging.info("Text data converted to embeddings and stored in the FAISS index.")
+     vector_store.save_local("faiss_index")
+     logging.info("FAISS index saved to local storage.")
+
+
+ if __name__ == "__main__":
+     folder_path = "dataset/converted_json_docs"
+     main(folder_path)
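The saved index is read back elsewhere in the repo (see src/bot/bot.py); a minimal load sketch, assuming the same embedding model and the faiss_index folder written above:

    from langchain_community.vectorstores import FAISS
    from langchain_openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )
    retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})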
src/data_preprocessing/dataloader.py ADDED
@@ -0,0 +1,174 @@
+ # This module loads the converted JSON and Markdown documents from disk and
+ # turns them into LangChain Document objects.
+
+ import os
+ import json
+ from langchain_core.documents import Document
+ from typing import List, Tuple
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+
+ #============================
+ # data loader from json and md files
+ #============================
+
+ def load_json_file(file_path: str) -> dict:
+     """
+     Load a JSON file and return its content as a dictionary.
+
+     Args:
+         file_path (str): Path to the JSON file.
+
+     Returns:
+         dict: Dictionary containing the JSON data.
+     """
+     with open(file_path, 'r') as file:
+         data = json.load(file)
+     return data
+
+ def load_md_file(file_path: str) -> str:
+     """
+     Load a Markdown file and return its content as a string.
+     The function reads the file in UTF-8 encoding.
+
+     Args:
+         file_path (str): Path to the Markdown file.
+
+     Returns:
+         str: Content of the Markdown file as a string.
+     """
+     with open(file_path, 'r', encoding='utf-8') as file:
+         content = file.read()
+     return content
+
+
+ def data_preprocess(folder_path: str) -> dict:
+     """
+     Load data from a folder containing JSON files and a Markdown file.
+     The function reads the following files:
+     - tables.json
+     - images.json
+     - text.json
+     - chunks.json
+     - {base_folder_name}-with-images.md
+
+     Args:
+         folder_path (str): Path to the folder containing the JSON and Markdown files.
+
+     Returns:
+         dict: A dictionary containing the loaded data from the JSON files and the
+             Markdown file.
+     """
+     tables_path = os.path.join(folder_path, "tables.json")
+     images_path = os.path.join(folder_path, "images.json")
+     text_path = os.path.join(folder_path, "text.json")
+     chunks_path = os.path.join(folder_path, "chunks.json")
+
+     # Extract the base folder name for the md file and images folder
+     base_folder_name = os.path.basename(folder_path)
+     images_folder_path = os.path.join(folder_path, f"{base_folder_name}-with-images_artifacts")
+     md_file_path = os.path.join(folder_path, f"{base_folder_name}-with-images.md")
+
+     # Load JSON contents
+     tables = load_json_file(tables_path)
+     images = load_json_file(images_path)
+     text = load_json_file(text_path)
+     chunks = load_json_file(chunks_path)
+
+     # Load Markdown content
+     markdown = load_md_file(md_file_path)
+
+     return {
+         "tables": tables,
+         "images": images,
+         "text": text,
+         "chunks": chunks,
+         "images_folder": images_folder_path,
+         "markdown": markdown
+     }
+
+
+ def load_json_data_documents(converted_document: dict, data_type: str) -> List[Document]:
+     """
+     Load JSON data documents from the converted document.
+     This function takes a converted document and a data type (e.g., "tables",
+     "images", "text", "chunks") and returns a list of Document objects.
+
+     Args:
+         converted_document (dict): The converted document containing data.
+         data_type (str): The type of data to load (e.g., "tables", "images", "text", "chunks").
+     Returns:
+         List[Document]: A list of Document objects containing the loaded data.
+     """
+     documents = []
+     for chunk in converted_document[data_type]:
+         content = chunk["content"]
+         metadata = chunk["metadata"]
+         # Create Document object
+         document = Document(
+             page_content=content,
+             metadata=metadata
+         )
+         documents.append(document)
+
+     return documents
+
+
+ #============================
+ # dataloader for all the data
+ # from the folder
+ # containing json and md files
+ # and images
+ #============================
+
+
+ def dataloader(folder_path: str) -> Tuple[list, list, list, list]:
+     """
+     Load data from every sub-folder containing JSON files and a Markdown file.
+
+     Args:
+         folder_path (str): Folder path containing all folders with JSON files and
+             Markdown files.
+     Returns:
+         Tuple[list, list, list, list]: list of chunks, list of pictures, list of tables,
+             and list of text of the overall data.
+     """
+
+     chunks_list = []
+     pictures_list = []
+     tables_list = []
+     text_list = []
+
+     logging.info(f"Loading data from folder: {folder_path}")
+     for file_name in os.listdir(folder_path):
+         logging.info(f"Processing file: {file_name}")
+         file_path = os.path.join(folder_path, file_name)
+
+         # load the data
+         dict_data = data_preprocess(file_path)
+         chunks_data = load_json_data_documents(dict_data, "chunks")
+         pictures_data = load_json_data_documents(dict_data, "images")
+         tables_data = load_json_data_documents(dict_data, "tables")
+         text_data = load_json_data_documents(dict_data, "text")
+
+         # add the data to the lists
+         chunks_list.extend(chunks_data)
+         pictures_list.extend(pictures_data)
+         tables_list.extend(tables_data)
+         text_list.extend(text_data)
+         logging.info(f"Loaded {len(chunks_data)} chunks, {len(pictures_data)} pictures, "
+                      f"{len(tables_data)} tables, and {len(text_data)} text documents from {file_name}")
+
+     return chunks_list, pictures_list, tables_list, text_list
+
+
+ if __name__ == "__main__":
+     # Example usage
+     folder_path = "dataset/converted_json_docs"
+     chunks, pictures, tables, text = dataloader(folder_path)
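For orientation, dataloader expects each book folder produced by the Docling pipeline to look roughly like this (the book folder name is hypothetical):

    dataset/converted_json_docs/
        anatomy-textbook/
            chunks.json
            tables.json
            images.json
            text.json
            anatomy-textbook-with-images.md
            anatomy-textbook-with-images_artifacts/    (extracted figure PNGs)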
src/data_preprocessing/docling/docling_utils.py ADDED
@@ -0,0 +1,199 @@
+ """
+ Convert Docling documents to LangChain documents:
+ 1. Extract images and tables from the Docling document.
+ 2. Extract the text from the Docling document.
+ 3. Create LangChain documents from the extracted images, tables, and text.
+ 4. Save the data to JSON files.
+ """
+ import json
+ import os
+ import itertools
+ from uuid import uuid4
+
+ from docling.document_converter import DocumentConverter
+ from docling_core.types.doc.document import TableItem, PictureItem
+ from docling_core.types.doc.labels import DocItemLabel
+ from langchain_core.documents import Document
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+
+ import logging
+
+ def adding_metadata_chunks(chunks: HybridChunker, file_name: str, speciality: str) -> list[Document]:
+     """Add metadata to the chunks.
+     This function processes a list of chunks and adds metadata to each chunk.
+
+     Args:
+         chunks (HybridChunker): The chunks to be processed.
+         file_name (str): The name of the file from which the chunks were created.
+         speciality (str): Specialization of the book.
+
+     Returns:
+         list[Document]: A list of Document objects with added metadata.
+     """
+     documents = []
+     for idx, chunk in enumerate(chunks):
+         items = chunk.meta.doc_items
+         if len(items) == 1 and isinstance(items[0], TableItem):
+             # If the chunk is a lone table, skip it; tables are extracted separately
+             continue
+
+         main_ref = ",".join([item.get_ref().cref for item in items])
+         parent_ref = ",".join([item.parent.get_ref().cref for item in items])
+         child_ref = ",".join([str(child) for sublist in [item.children for item in items] for child in sublist])
+
+         text = chunk.text  # The text of the chunk
+         metadata = {
+             "source": file_name,
+             "specilization": speciality,
+             "chunk_index": idx,
+             "self_ref": main_ref,
+             "parent_ref": parent_ref,
+             "child_ref": child_ref,
+             "chunk_type": "text",
+         }
+         document = Document(page_content=text, metadata=metadata)
+         documents.append(document)
+     return documents
+
+ def extract_all_text(docling_document: DocumentConverter,
+                      file_name: str,
+                      medical_specialty: str) -> list[Document]:
+     """Extract all the text from the Docling document and convert it to LangChain
+     documents. This is useful for creating a vector store from the text.
+
+     Args:
+         docling_document (DocumentConverter): Docling document.
+         file_name (str): name of the file.
+         medical_specialty (str): book category.
+
+     Returns:
+         list[Document]: list of LangChain documents.
+     """
+
+     documents_list = list()
+     for text in docling_document.texts:
+         content = text.text
+         main_ref = " ".join([text.get_ref().cref])
+         parent_ref = " ".join([text.parent.get_ref().cref])
+         child_ref = ", ".join([ref.get_ref().cref for ref in text.children])
+         # References are stored as plain strings so the metadata stays JSON-serializable
+         document = Document(page_content=content, metadata={
+             "source": file_name,
+             "chunk_index": None,
+             "self_ref": main_ref,
+             "parent_ref": parent_ref,
+             "child_ref": child_ref,
+             "chunk_type": "text",
+             "medical_specialty": medical_specialty,
+             "reference": None
+         })
+
+         documents_list.append(document)
+     return documents_list
+
+
+ def extract_tables(docling_document: DocumentConverter,
+                    file_name: str,
+                    medical_specialty: str) -> list[Document]:
+     """Extract the tables from the converted document and add metadata.
+
+     Args:
+         docling_document (DocumentConverter): converted document.
+         file_name (str): file name.
+         medical_specialty (str): book category.
+     Returns:
+         list[Document]: A list of documents containing table data with
+             reference IDs in the metadata.
+     """
+     tables: list[Document] = []
+     for table in docling_document.tables:
+         if table.label in [DocItemLabel.TABLE]:
+             main_ref = " ".join([table.get_ref().cref])
+             parent_ref = " ".join([table.parent.get_ref().cref])
+             child_ref = ",".join([ref.get_ref().cref for ref in table.children])
+
+             text = table.export_to_markdown()
+             metadata = {
+                 "source": file_name,
+                 "chunk_index": None,
+                 "self_ref": main_ref,
+                 "parent_ref": parent_ref,
+                 "child_ref": child_ref,
+                 "chunk_type": "table",
+                 "medical_specialty": medical_specialty,
+             }
+             document = Document(page_content=text, metadata=metadata)
+             tables.append(document)
+     return tables
+
+
+ def extract_text_ids(data: dict) -> list:
+     """
+     Extract all references from a dictionary and return a list of numbers
+     from any '#/texts/{number}' references.
+
+     Args:
+         data (dict): The dictionary to extract from.
+
+     Returns:
+         list: List of integers extracted from '#/texts/{number}' refs.
+     """
+     refs = [v for k, v in data.items() if k.endswith('_ref') and isinstance(v, str)]
+     text_ids = [int(ref.split('/')[2]) for ref in refs if ref.startswith('#/texts/')]
+     return text_ids
+
+
+ def save_json(file_path: str, category: str, data: list[Document]) -> None:
+     """Save the data in JSON format.
+
+     Args:
+         file_path (str): path of the output directory.
+         category (str): name of the output file (e.g. "tables").
+         data (list[Document]): list of documents.
+     """
+     doc_dicts = [{"content": doc.page_content, "metadata": doc.metadata} for doc in data]
+     with open(f"{file_path}/{category}.json", "w") as f:
+         json.dump(doc_dicts, f)
+
+
+ # def main(file_path: str,
+ #          file_name: str,
+ #          save_path: str,
+ #          ) -> list[Document]:
+ #     """Main function to convert Docling documents to LangChain documents.
+
+ #     Args:
+ #         file_path (str): path of the file.
+ #         file_name (str): name of the file.
+ #     Returns:
+ #         list[Document]: list of LangChain documents.
+ #     """
+ #     # Extract all text from the docling document
+ #     docling_document = DocumentConverter(file_path)
+ #     texts = extract_all_text(docling_document, file_name)
+
+ #     # Extract tables from the docling document
+ #     tables = modifying_tables(docling_document, file_name)
+
+ #     # Extract images from the docling document
+ #     # Combine all documents into a single list
+ #     documents = list(itertools.chain(texts, tables))
+
+ #     save_json(save_path, documents)
+
+
+ # if __name__ == "__main__":
+ #     logging.basicConfig(
+ #         level=logging.DEBUG,
+ #         format='%(asctime)s - %(levelname)s - %(message)s',
+ #         handlers=[
+ #             logging.StreamHandler(),
+ #             logging.FileHandler("app.log", mode='a')
+ #         ]
+ #     )
+ #     logging.info("Creating the dataset")
+ #     main(r"dataset",
+ #          file_name="medical_textbook",
+ #          save_path=r"dataset"
+ #     )
+ #     logging.info("Dataset created successfully")
+ #     logging.info("Dataset saved successfully")
src/data_preprocessing/docling/document_conversion.py ADDED
@@ -0,0 +1,187 @@
+ """
+ Script to convert all the PDF documents stored in Azure to Markdown format.
+ """
+
+ import logging
+ import shutil
+ from pathlib import Path
+ from azureml.fsspec import AzureMachineLearningFileSystem
+
+ from docling_core.types.doc import ImageRefMode
+ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.settings import settings
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+
+ from docling.datamodel.pipeline_options import (
+     AcceleratorDevice,
+     AcceleratorOptions,
+     PdfPipelineOptions,
+     TesseractCliOcrOptions,
+     TableFormerMode,
+ )
+
+ from indexing import document_indexing
+ from docling_utils import save_json
+
+
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+ class DoclingConversion:
+     def __init__(self, image_scale=1.0):
+         logging.info("Initializing DoclingConversion with image_scale=%s", image_scale)
+         accelerator_options = AcceleratorOptions(
+             num_threads=8, device=AcceleratorDevice.CUDA
+         )
+
+         # Turn on inline debug visualizations:
+         settings.debug.visualize_layout = True
+         settings.debug.visualize_ocr = True
+         settings.debug.visualize_tables = True
+         settings.debug.visualize_cells = True
+
+         pipeline_options = PdfPipelineOptions(
+             do_ocr=True,
+             do_table_structure=True,
+             images_scale=image_scale,
+             generate_page_images=True,
+             generate_picture_images=True,
+             accelerator_options=accelerator_options,
+             ocr_options=TesseractCliOcrOptions(force_full_page_ocr=True)
+         )
+
+         pipeline_options.table_structure_options.do_cell_matching = True
+         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
+
+         self.converter = DocumentConverter(
+             format_options={
+                 InputFormat.PDF: PdfFormatOption(
+                     pipeline_options=pipeline_options,
+                     backend=DoclingParseV4DocumentBackend,
+                 )
+             }
+         )
+         logging.info("DoclingConversion initialized successfully.")
+
+     def document_conversion(self, file_path):
+         """Convert a file and return the document object."""
+         logging.info("Starting document conversion for file: %s", file_path)
+         return self.converter.convert(Path(file_path)).document
+
+     def save_document(self, file_path, output_dir, azure_fs):
+         """Convert a file, save the output as markdown with embedded images,
+         and upload to Azure."""
+         input_path = Path(file_path)
+         logging.info("Processing file: %s", file_path)
+
+         try:
+             result = self.converter.convert(input_path)
+             doc_name = input_path.stem
+             temp_md_file_path = Path(output_dir) / f"{doc_name}-with-images.md"
+
+             docling_document_class = document_indexing(result,
+                                                        "ibm-granite/granite-embedding-125m-english",
+                                                        speciality=input_path.parent.name,
+                                                        file_name=input_path.stem
+                                                        )
+             tables_doc = docling_document_class.extract_tables()
+             images_doc = docling_document_class.extract_images()
+             text_doc = docling_document_class.extract_all_text()
+             chunks_doc = docling_document_class.create_chunks()
+
+             # Save the extracted data as JSON
+             save_json(file_path=output_dir, category="tables", data=tables_doc)
+             save_json(file_path=output_dir, category="images", data=images_doc)
+             save_json(file_path=output_dir, category="text", data=text_doc)
+             save_json(file_path=output_dir, category="chunks", data=chunks_doc)
+             logging.info("Saved extracted data as JSON files.")
+
+             # Save locally first
+             result.document.save_as_markdown(temp_md_file_path, image_mode=ImageRefMode.REFERENCED)
+             logging.info("Saved locally: %s", temp_md_file_path)
+
+             # Upload to Azure
+             azure_output_path = f"converted_docs_json/{doc_name}"
+             azure_fs.upload(lpath=str(output_dir), rpath=azure_output_path, recursive=True)
+             logging.info("Uploaded to Azure: %s", azure_output_path)
+
+             # Optionally, delete the local directory after upload
+             if output_dir.exists() and output_dir.is_dir():
+                 shutil.rmtree(output_dir)
+                 logging.info("Deleted local directory: %s", output_dir)
+
+         except Exception as e:
+             logging.error("Error processing file %s: %s", file_path, e)
+
+
+ def main(source_dir: str):
+
+     logging.info("Starting main function with source_dir: %s", source_dir)
+
+     # Set the local directory to save PDFs
+     local_pdf_dir = Path("./local_pdfs")
+     local_pdf_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist
+     logging.info("Local PDF directory created: %s", local_pdf_dir)
+
+     fs = AzureMachineLearningFileSystem(source_dir)
+     all_pdf_files = fs.glob('**/*.pdf')
+     logging.info("Found %d PDF files in source directory.", len(all_pdf_files))
+
+     converter = DoclingConversion(image_scale=2)
+
+     for file_path in all_pdf_files:
+         output_dir = Path("./temp")
+         output_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist
+         logging.info("Temporary output directory created: %s", output_dir)
+
+         file_path_ = Path(file_path)
+         file_name = file_path_.name
+         local_pdf_path = local_pdf_dir / file_name
+         azure_output_path = f"converted_docs_json/{file_path_.stem}"
+
+         # Skip files that already exist in Azure
+         if fs.exists(azure_output_path):
+             logging.info("Skipping %s, already processed.", file_name)
+             continue
+
+         # Save the PDF locally
+         logging.info("Downloading file: %s", file_name)
+         with fs.open(file_path, "rb") as remote_file:
+             with open(local_pdf_path, "wb") as local_file:
+                 local_file.write(remote_file.read())
+         logging.info("File saved locally: %s", local_pdf_path)
+
+         # Process the local PDF file
+         logging.info("Processing: %s", file_name)
+         converter.save_document(local_pdf_path, output_dir, fs)
+
+         # Optionally, delete the local PDF after processing
+         local_pdf_path.unlink()
+         logging.info("Deleted local PDF: %s", local_pdf_path)
+
+     logging.info("Processing completed for all files.")
+
+
+ if __name__ == "__main__":
+     logging.info("Script started.")
+     main(source_dir=(
+         'azureml://subscriptions/485363cd-687d-4adb-a30b-35108c11d682/resourcegroups/medbot/workspaces/karthik/datastores/workspaceartifactstore/paths/UI/2025-04-11_075006_UTC/PdfFiles/'
+     ))
+     logging.info("Script finished.")
src/data_preprocessing/docling/indexing.py ADDED
@@ -0,0 +1,177 @@
+ """
+ Create chunks and clusters using the RAPTOR architecture.
+ """
+
+ import logging
+
+ from docling.document_converter import DocumentConverter
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+ from docling_core.types.doc.labels import DocItemLabel
+ from docling_core.types.doc.document import TableItem
+ from langchain_core.documents import Document
+
+ from transformers import AutoTokenizer
+
+ # duplicated from docling_utils so this script can run standalone
+ def adding_metadata_chunks(chunks: HybridChunker, file_name: str, speciality: str) -> list[Document]:
+     """Add metadata to the chunks.
+     This function processes a list of chunks and adds metadata to each chunk.
+
+     Args:
+         chunks (HybridChunker): The chunks to be processed.
+         file_name (str): The name of the file from which the chunks were created.
+         speciality (str): Specialization of the book.
+
+     Returns:
+         list[Document]: A list of Document objects with added metadata.
+     """
+     documents = []
+     for idx, chunk in enumerate(chunks):
+         items = chunk.meta.doc_items
+         if len(items) == 1 and isinstance(items[0], TableItem):
+             # If the chunk is a lone table, skip it; tables are extracted separately
+             continue
+
+         main_ref = " ".join([item.get_ref().cref for item in items])
+         parent_ref = " ".join([item.parent.get_ref().cref for item in items])
+         child_ref = " ".join([str(child) for sublist in [item.children for item in items] for child in sublist])
+
+         text = chunk.text  # The text of the chunk
+         metadata = {
+             "source": file_name,
+             "specilization": speciality,
+             "chunk_index": idx,
+             "self_ref": main_ref,
+             "parent_ref": parent_ref,
+             "child_ref": child_ref,
+             "chunk_type": "text",
+         }
+         document = Document(page_content=text, metadata=metadata)
+         documents.append(document)
+     return documents
+
+
+ class document_indexing:
+     def __init__(self,
+                  docling_converted_document: DocumentConverter,
+                  embeddings_model: str,
+                  speciality: str,
+                  file_name: str):
+         # the converted document
+         self.converted_document = docling_converted_document.document
+         # tokenizer used for hybrid chunking
+         self.embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model)
+         self.speciality = speciality
+         self.file_name = file_name
+
+     def create_chunks(self):
+         chunks = HybridChunker(tokenizer=self.embeddings_tokenizer).chunk(self.converted_document)
+         updated_chunks = adding_metadata_chunks(chunks=chunks,
+                                                 file_name=self.file_name,
+                                                 speciality=self.speciality)
+         return updated_chunks
+
+     def extract_all_text(self) -> list[Document]:
+         """Extract all the text from the Docling document and convert it to LangChain
+         documents. This is useful for creating a vector store from the text.
+
+         Returns:
+             list[Document]: list of LangChain documents.
+         """
+
+         documents_list = list()
+         for text in self.converted_document.texts:
+             content = text.text
+             main_ref = ",".join([text.get_ref().cref])
+             parent_ref = ",".join([text.parent.get_ref().cref])
+             child_ref = ",".join([ref.get_ref().cref for ref in text.children])
+             document = Document(page_content=content, metadata={
+                 "source": self.file_name,
+                 "chunk_index": None,
+                 "self_ref": main_ref,
+                 "parent_ref": parent_ref,
+                 "child_ref": child_ref,
+                 "chunk_type": "text",
+                 "medical_specialty": self.speciality,
+                 "reference": None
+             })
+
+             documents_list.append(document)
+         return documents_list
+
+     def extract_tables(self) -> list[Document]:
+         """Extract the tables from the converted document and add metadata.
+
+         Returns:
+             list[Document]: A list of documents containing table data with
+                 reference IDs in the metadata.
+         """
+         tables: list[Document] = []
+         for table in self.converted_document.tables:
+             if table.label in [DocItemLabel.TABLE]:
+                 main_ref = ",".join([table.get_ref().cref])
+                 parent_ref = ",".join([table.parent.get_ref().cref])
+                 child_ref = ",".join([ref.get_ref().cref for ref in table.children])
+
+                 text = table.export_to_markdown()
+                 metadata = {
+                     "source": self.file_name,
+                     "chunk_index": None,
+                     "self_ref": main_ref,
+                     "parent_ref": parent_ref,
+                     "child_ref": child_ref,
+                     "chunk_type": "table",
+                     "medical_specialty": self.speciality,
+                 }
+                 document = Document(page_content=text, metadata=metadata)
+                 tables.append(document)
+         return tables
+
+     def extract_images(self) -> list[Document]:
+         """Extract the pictures from the converted document and add metadata.
+
+         Returns:
+             list[Document]: A list of documents containing picture references with
+                 reference IDs in the metadata.
+         """
+         images: list[Document] = []
+         for picture in self.converted_document.pictures:
+             if picture.label in [DocItemLabel.PICTURE]:
+                 main_ref = ",".join([picture.get_ref().cref])
+                 parent_ref = ",".join([picture.parent.get_ref().cref])
+                 child_ref = ",".join([ref.get_ref().cref for ref in picture.children])
+                 metadata = {
+                     "source": self.file_name,
+                     "chunk_index": None,
+                     "self_ref": main_ref,
+                     "parent_ref": parent_ref,
+                     "child_ref": child_ref,
+                     # "picture", not "table": downstream lookups in
+                     # src/bot/extract_metadata.py route rows by this value
+                     "chunk_type": "picture",
+                     "medical_specialty": self.speciality,
+                 }
+                 document = Document(page_content=main_ref, metadata=metadata)
+                 images.append(document)
+         return images
src/data_preprocessing/docling/utils.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ contains all the functions to extract the tables, images and, text from the converted
3
+ documents.
4
+ """
5
+
6
+ import os
7
+ import re
8
+
9
+ from typing import List
10
+
11
+ from docling.chunking import HybridChunker
12
+ from docling_core.types.doc.document import TableItem
13
+ from langchain_core.documents import Document
14
+ from docling_core.types.doc.labels import DocItemLabel
15
+
16
+ from docling_core.types.doc.document import TableItem
17
+ from transformers import AutoTokenizer
18
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
19
+
20
+ __all__ = [
21
+ "sanitize_name",
22
+ "rename_items",
23
+ "find_matching_fig_ref",
24
+ "find_image_by_number",
25
+ "extract_images",
26
+ "extract_tables",
27
+ "extract_texts",
28
+ "find_relevant_folder"
29
+ ]
30
+
31
+
32
+ def sanitize_name(name:str)-> str:
33
+ """Replace '-', '_', and '–' with a single hyphen '-' and remove extra spaces.
34
+
35
+ Args:
36
+ name (str): file or folder name
37
+
38
+ Returns:
39
+ str: processed name
40
+ """
41
+ # Replace -, _, – with '-'
42
+ name = re.sub(r'[-_– ]+', '-', name)
43
+ # Replace multiple spaces with a single space
44
+ name = re.sub(r'\s+', ' ', name).strip()
45
+ return name
46
+
47
+ def rename_items(directory:str):
48
+ """Rename all files and folders inside the given directory.
49
+
50
+ Args:
51
+ directory (str): file or folder name
52
+ """
53
+ items = os.listdir(directory) # Get all files and folders inside the directory
54
+ for item in items:
55
+ old_path = os.path.join(directory, item)
56
+ new_name = sanitize_name(item) # Clean up the name
57
+ new_path = os.path.join(directory, new_name)
58
+
59
+ if old_path != new_path: # Rename only if the name changes
60
+ os.rename(old_path, new_path)
61
+ print(f"Renamed: {old_path} -> {new_path}")
62
+
63
+ def find_matching_fig_ref(doc1:dict, doc2:dict)-> str|None:
64
+ """Check the texts ids from text chunks metadata and pictures metadata if any id
65
+ matches then returns the image id.
66
+
67
+ Args:
68
+ doc1 (dict): text chunks metadata
69
+ doc2 (dict): picture metadata
70
+
71
+ Returns:
72
+ str|None: if similar text id matched in both the metadata then returns the
73
+ figure reference which is figure number. if no match None
74
+ """
75
+
76
+ # Extract and split self_ref and parent_ref into sets
77
+ doc1_self_refs = set(doc1['self_ref'].split()) # Split multiple self_refs
78
+ doc1_parent_refs = set(doc1['parent_ref'].split()) # Split multiple parent_refs
79
+
80
+ # Extract text_ref and fig_ref from doc2
81
+ doc2_text_ref = doc2['text_ref']
82
+ doc2_fig_ref = doc2['fig_ref']
83
+
84
+ # Check if text_ref exists in self_ref or parent_ref
85
+ if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
86
+ return doc2_fig_ref # Return fig_ref if there's a match
87
+ return None # No match found
88
+
89
+ def find_image_by_number(folder_path: str, img_number:int)-> str|None:
90
+ """Search for an image with the specified number in the folder.
91
+
92
+ Args:
93
+ folder_path (str): artifacts path where all the images were stored.
94
+ img_number (int): image id
95
+
96
+ Returns:
97
+ str|None: image path
98
+ """
99
+
100
+ pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
101
+
102
+ for filename in os.listdir(folder_path):
103
+ if pattern.match(filename): # Check if the filename matches the pattern
104
+ return os.path.join(folder_path, filename) # Return full path
105
+
106
+ return None # Return None if no match found
107
+
108
+ def extract_images(conv_document: Document) -> Document:
109
+ """Extract the images from the converted document and add the metadata.
110
+
111
+ Args:
112
+ conv_document (Document): converted document
113
+
114
+ Returns:
115
+ Document: pictures with the metadata.
116
+ """
117
+
118
+ pictures: list[Document] = []
119
+ for picture in conv_document.pictures:
120
+ figure_ref = picture.get_ref().cref
121
+ text_ref = picture.parent.get_ref().cref
122
+ document = Document(
123
+ page_content="",
124
+ metadata={
125
+ "fig_ref": figure_ref,
126
+ "text_ref": text_ref,
127
+ },)
128
+ pictures.append(document)
129
+ return pictures
130
+
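+ # Illustrative output for a document with one picture (refs hypothetical):
+ # [Document(page_content="", metadata={"fig_ref": "#/pictures/0", "text_ref": "#/texts/12"})]
+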
131
+ def extract_tables(conv_document,
132
+ file_name: str) -> list[Document]:
133
+ """Extract the tables from the converted document and add metadata.
134
+
135
+ Args:
136
+ conv_document: the converted docling document.
137
+ file_name (str): file name.
138
+
139
+ Returns:
140
+ list[Document]: A list of documents containing table data with
141
+ reference IDs in the metadata.
142
+ """
143
+ tables = []
144
+ for table in conv_document.tables:
145
+ if table.label in [DocItemLabel.TABLE]:
146
+
147
+ self_refs = table.get_ref().cref
148
+ parent_refs = table.parent.get_ref().cref if table.parent else ""
149
+
150
+ text = table.export_to_markdown()
151
+ document = Document(
152
+ page_content=text,
153
+ metadata={
154
+ "source": file_name,
155
+ "self_ref": self_refs,
156
+ "parent_ref": parent_refs,
157
+
158
+ },
159
+ )
160
+ tables.append(document)
161
+ return tables
162
+
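+ # Illustrative result (refs hypothetical): each table becomes a Document whose
+ # page_content is the markdown export and whose metadata looks like
+ # {"source": "robbins", "self_ref": "#/tables/0", "parent_ref": "#/body"}
+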
163
+ def extract_texts(conv_document,
164
+ pictures:List[Document],
165
+ images_artifacts: str,
166
+ embeddings_tokenizer: AutoTokenizer,
167
+ file_name: str
168
+ )-> List[Document]:
169
+ """Extract the text data from converted document and add the image path in the
170
+ metadata.
171
+
172
+ Args:
173
+ conv_document: the converted docling document.
174
+ pictures (List[Document]): extracted pictures list.
175
+ images_artifacts (str): artifacts folder used to resolve image paths.
176
+ embeddings_tokenizer (AutoTokenizer): tokenizer to chunk the texts.
177
+ file_name (str): file name.
178
+
179
+ Returns:
180
+ List[Document]: chunks with updated metadata.
181
+ """
182
+ texts = []
183
+ doc_id = 0
184
+ for chunk in HybridChunker(tokenizer=embeddings_tokenizer).chunk(conv_document):
185
+ items = chunk.meta.doc_items
186
+ self_refs = " ".join(map(lambda item: item.get_ref().cref, items))
187
+ parent_refs = items[0].parent.get_ref().cref if len(items) > 0 else ""
188
+ meta_data_dict = {
189
+ "source": file_name,
190
+ "self_ref": self_refs,
191
+ "parent_ref": parent_refs,
192
+ }
193
+
194
+ for picture in pictures:
195
+ fig_metadata = picture.metadata
196
+ fig_ref = find_matching_fig_ref(meta_data_dict, fig_metadata)
197
+ if fig_ref:
198
+ fig_number = int(fig_ref.split("/")[-1])
199
+ image_path = find_image_by_number(images_artifacts, fig_number)
200
+ meta_data_dict["fig_ref"] = image_path
201
+ meta_data_dict["fig_number"] = fig_number
202
+
203
+ text = chunk.text
204
+ document = Document(
205
+ page_content=text,
206
+ metadata= meta_data_dict,
207
+ )
208
+ texts.append(document)
209
+ return texts
210
+
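+ # Illustrative chunk metadata after a figure match (values hypothetical):
+ # {"source": "robbins", "self_ref": "#/texts/42", "parent_ref": "#/body",
+ # "fig_ref": "artifacts/image-007-ab12.png", "fig_number": 7}
+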
211
+
212
+
213
+
214
+ def find_relevant_folder(folder_path:str)->dict:
215
+ """create a dict with markdown file(key) and
216
+ artfacts (value).
217
+
218
+ Args:
219
+ folder_path (str): folder path where all the converted documents are stored.
220
+
221
+ Returns:
222
+ dict: markdown file name -> matching artifacts folder name.
223
+ """
224
+ # Renaming the files and folders by removing the spaces
225
+ rename_items(folder_path)
226
+
227
+ # Initialize the dataset dictionary
228
+ dataset_dict = {}
229
+
230
+ # Get all files and folders in the directory (do this only once)
231
+ all_items = os.listdir(folder_path)
232
+
233
+ # Split files and folders in one pass
234
+ md_files = {file for file in all_items if file.endswith(".md")}
235
+ folders = {folder for folder in all_items if not folder.endswith(".md")}
236
+
237
+ # Create a dictionary of folder name splits for efficient matching
238
+ folder_splits = {tuple(folder.split("-")[:-2]): folder for folder in folders}
239
+
240
+ for file in md_files:
241
+ file_split = tuple(file.split("-")[:-1])
242
+
243
+ # Check if file_split matches any folder's split
244
+ if file_split in folder_splits:
245
+ dataset_dict[file] = folder_splits[file_split]
246
+
247
+ return dataset_dict
248
+
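+ # Illustrative pairing (names hypothetical): "Book-Name-refs.md" maps to
+ # "Book-Name-refs-artifacts", since both reduce to ("Book", "Name") after
+ # dropping the last one and two hyphen-separated parts respectively.
+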
249
+
250
+ def extract_ref_text_ids(meta_data: dict) -> list[int]:
+ """Collect the unique numeric /texts/ ids referenced by a chunk's metadata."""
251
+ all_refs = []
252
+
253
+ # Go through all 3 ref fields
254
+ for key in ["self_ref", "parent_ref", "child_ref"]:
255
+ ref_str = meta_data.get(key)
256
+ if ref_str:
257
+ refs = ref_str.split(",") # split in case of multiple refs
258
+ all_refs.extend(refs)
259
+
260
+ # Remove duplicates
261
+ unique_refs = set(all_refs)
262
+
263
+ # Extract /texts/ IDs as integers
264
+ text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]
265
+
266
+ return text_refs
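+
+ # Illustrative run (refs hypothetical):
+ # extract_ref_text_ids({"self_ref": "#/texts/3 #/texts/4", "parent_ref": "#/texts/3"})
+ # -> [3, 4] (order may vary; duplicates removed by the set)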
src/data_preprocessing/docling/vector_database_pipeline.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Preprocess the data and build a FAISS vector database using docling, LangChain,
3
+ and OpenAI embeddings.
4
+ """
6
+ import os
7
+ from dotenv import load_dotenv
8
+ import itertools
9
+ from uuid import uuid4
10
+
11
+ import faiss
12
+ from langchain_community.docstore.in_memory import InMemoryDocstore
13
+ from langchain_community.vectorstores import FAISS
14
+
15
+ from langchain_openai import OpenAIEmbeddings
16
+
17
+ from docling.document_converter import DocumentConverter
19
+ from transformers import AutoTokenizer
20
+
21
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
22
+ from docling_core.types.doc.document import TableItem
23
+ from docling_core.types.doc.labels import DocItemLabel
24
+ from langchain_core.documents import Document
25
+
26
+ import logging
27
+
28
+ load_dotenv()
29
+
30
+ def adding_metadata_chunks(chunks, file_name: str, speciality: str) -> list[Document]:
31
+ """Adding metadata to the chunks
32
+ This function processes a list of chunks and adds metadata to each chunk.
33
+
34
+ Args:
35
+ chunks: chunk iterator returned by HybridChunker.chunk().
36
+ file_name (str): The name of the file from which the chunks were created.
37
+ speciality (str): specialization of the book.
38
+
39
+ Returns:
40
+ List[Document]: A list of Document objects with added metadata.
41
+ """
42
+ documents = []
43
+ for idx, chunk in enumerate(chunks):
44
+ items = chunk.meta.doc_items
45
+ if len(items) == 1 and isinstance(items[0], TableItem):
46
+ # If the chunk is a table, we can skip it
47
+ continue
48
+
49
+ main_ref = " ".join([item.get_ref().cref for item in items])
50
+ parent_ref = " ".join([item.parent.get_ref().cref for item in items])
51
+ child_ref = " ".join([str(child) for sublist in [item.children for item in items] for child in sublist])
52
+
53
+ text = chunk.text # The text of the chunk
54
+ metadata = {
55
+ "source": file_name,
56
+ "specilization": speciality,
57
+ "chunk_index": idx,
58
+ "self_ref": main_ref,
59
+ "parent_ref": parent_ref,
60
+ "child_ref": child_ref,
61
+ "chunk_type": "text",
62
+
63
+ }
64
+ document = Document(page_content=text, metadata=metadata)
65
+ documents.append(document)
66
+ return documents
67
+
68
+
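+ # Illustrative output (values hypothetical): each text chunk becomes
+ # Document(page_content="...", metadata={"source": "robbins",
+ # "specialization": "pathology", "chunk_index": 0, "self_ref": "#/texts/5",
+ # "parent_ref": "#/body", "child_ref": "", "chunk_type": "text"})
+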
69
+ def modifying_tables(docling_document, file_name: str, speciality: str) -> list[Document]:
70
+ """Extract the tables from the converted document and add metadata.
71
+
72
+ Args:
73
+ docling_document: the converted docling document.
74
+ file_name (str): file name.
75
+ speciality (str): specialization of the book.
76
+
77
+ Returns:
78
+ list[Document]: A list of documents containing table data with
79
+ reference IDs in the metadata.
80
+ """
81
+ tables: list[Document] = []
82
+ for table in docling_document.tables:
83
+ if table.label in [DocItemLabel.TABLE]:
84
+ main_ref = table.get_ref().cref
85
+ parent_ref = table.parent.get_ref().cref
86
+ child_ref = table.children
87
+
88
+ text = table.export_to_markdown()
89
+ metadata = {
90
+ "source": file_name,
91
+ "chunk_index": None,
92
+ "self_ref": main_ref,
93
+ "parent_ref": parent_ref,
94
+ "child_ref": child_ref,
95
+ "chunk_type": "table",
96
+ }
97
+ document = Document(page_content=text, metadata=metadata)
98
+ tables.append(document)
99
+ return tables
100
+
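+ # Design note: tables are exported as markdown, so page_content holds the
+ # rendered table while chunk_type="table" lets retrieval treat them separately
+ # from plain text chunks.
+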
101
+
102
+ def dataloader(file_path: str, embeddings_model: str, speciality: str = "") -> list[Document]:
103
+
104
+ logging.info("Converting the document to docling format...")
105
+ docling_document = DocumentConverter().convert(source=file_path).document
106
+ file_name = os.path.splitext(os.path.basename(file_path))[0]  # portable across OSes
107
+ # Create a hybrid chunker to chunk the document
108
+ embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model)
109
+ logging.info("Chunking the document...")
110
+ chunks = HybridChunker(tokenizer=embeddings_tokenizer).chunk(docling_document)
111
+
112
+ # Add metadata to the chunks
113
+ logging.info("Adding metadata to the chunks...")
114
+ texts = adding_metadata_chunks(chunks, file_name, speciality)
115
+ logging.info("Modifying tables...")
116
+ tables = modifying_tables(docling_document, file_name, speciality)
117
+ # Combine the text and table documents into a single list
118
+ documents = list(itertools.chain(texts, tables))
119
+ logging.info(f"Loaded {len(documents)} documents from {file_name}.")
120
+ return documents
121
+
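+ # Illustrative call (file path hypothetical; model name as in __main__ below):
+ # docs = dataloader("converted/robbins.md", "ibm-granite/granite-embedding-125m-english")
+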
122
+
123
+ def create_vector_database(documents: list[Document]) -> FAISS:
124
+ """Create a vector database from the documents.
125
+
126
+ Args:
127
+ documents (list[Document]): documents to embed and index.
129
+
130
+ Returns:
131
+ FAISS: the populated vector store.
132
+ """
133
+
134
+ logging.info("Creating the vector database...")
135
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
136
+ index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
137
+ vector_store = FAISS(
138
+ embedding_function=embeddings,
139
+ index=index,
140
+ docstore=InMemoryDocstore(),
141
+ index_to_docstore_id={},
142
+ )
143
+ uuids = [str(uuid4()) for _ in range(len(documents))]
144
+ vector_store.add_documents(documents=documents, ids=uuids)
145
+ logging.info("Vector database created successfully.")
146
+
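+ # The returned store can be persisted for the app, matching the faiss_database
+ # default used by src/interface.py: vector_store.save_local("database/faiss_index")
+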
147
+
148
+ def main(file_path:str, embeddings_model:str) -> FAISS:
149
+ logging.basicConfig(level=logging.INFO)
151
+ documents = dataloader(file_path, embeddings_model)
152
+ return create_vector_database(documents)
153
+
154
+
155
+ if __name__ == "__main__":
156
+ file_path = r"converted\ROBBINS-&-COTRAN-PATHOLOGIC-BASIS-OF-DISEASE-10TH-ED-with-image-refs.md"
157
+ embeddings_model = "ibm-granite/granite-embedding-125m-english"
158
+ main(file_path, embeddings_model)
src/data_preprocessing/download_azure_data.py ADDED
@@ -0,0 +1,34 @@
1
+ # from azureml.core import Workspace, Dataset, Datastore
2
+
3
+ # # Azure ML workspace details
4
+ # subscription_id = '485363cd-687d-4adb-a30b-35108c11d682'
5
+ # resource_group = 'medbot'
6
+ # workspace_name = 'karthik'
7
+
8
+ # # Connect to the Azure ML workspace
9
+ # workspace = Workspace(subscription_id, resource_group, workspace_name)
10
+
11
+ # # Access the datastore
12
+ # datastore = Datastore.get(workspace, "workspaceartifactstore")
13
+
14
+ # # Access the dataset
15
+ # dataset = Dataset.File.from_files(path=(datastore, 'converted_document_reference'))
16
+
17
+ # # Download the dataset to the current directory
18
+ # dataset.download(target_path='.', overwrite=True)
19
+
20
+ # print("Download completed successfully.")
21
+
22
+
23
+
24
+ from azureml.core import Workspace, Dataset, Datastore
25
+
26
+ subscription_id = '485363cd-687d-4adb-a30b-35108c11d682'
27
+ resource_group = 'medbot'
28
+ workspace_name = 'karthik'
29
+
30
+ workspace = Workspace(subscription_id, resource_group, workspace_name)
31
+
32
+ datastore = Datastore.get(workspace, "workspaceartifactstore")
33
+ dataset = Dataset.File.from_files(path=(datastore, 'converted_docs_json'))
34
+ dataset.download(target_path='/home/kap2403/Desktop/Medico-AI-Bot/dataset/converted_json_docs', overwrite=True)
src/data_preprocessing/utils.py ADDED
@@ -0,0 +1,95 @@
1
+ """
2
+ In the current version images are not stored in the vector database (to limit
+ compute), so these utils extract references from the retrieved documents and
+ look up matching pictures: when a retrieved ref matches a picture's ref, the
+ corresponding image paths are returned.
3
+ """
4
+ import os
5
+ import re
6
+ import json
7
+ from langchain_core.documents import Document
8
+ from docling.document_converter import DocumentConverter
9
+
10
+
11
+ def extract_metadata(documents: list[Document]) -> list[str]:
+ """Collect the unique self/parent refs from the retrieved documents."""
12
+
13
+ references = []
14
+ for doc in documents:
15
+ meta_data = doc.metadata
16
+ self_ref = meta_data["self_ref"]
17
+ parent_ref = meta_data["parent_ref"]
18
+ if self_ref:
19
+ references.append(self_ref)
20
+ if parent_ref:
21
+ references.append(parent_ref)
22
+ unique_ref = list(set(references))
23
+ return unique_ref
24
+
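+ # Illustrative result for two retrieved chunks (refs hypothetical):
+ # ["#/texts/12", "#/body", "#/texts/40"] (deduplicated, order not guaranteed)
+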
25
+
26
+ def images_data(docling_document) -> dict:
+ """Map each picture's self_ref to the ref of the text item it belongs to."""
27
+ images_data = {}
28
+ for image in docling_document.pictures:
29
+ self_ref = image.self_ref
30
+ parent_ref = image.parent.cref
31
+ images_data[self_ref] = parent_ref
32
+ return images_data
33
+
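+ # Illustrative mapping (refs hypothetical):
+ # {"#/pictures/0": "#/texts/12", "#/pictures/1": "#/texts/40"}
+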
34
+
35
+ def find_image_by_number(folder_path: str, img_number:int)-> str|None:
36
+ """Search for an image with the specified number in the folder.
37
+
38
+ Args:
39
+ folder_path (str): artifacts path where all the images were stored.
40
+ img_number (int): image id
41
+
42
+ Returns:
43
+ str|None: full path to the matching image, or None if no match.
44
+ """
45
+
46
+ pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
47
+
48
+ for filename in os.listdir(folder_path):
49
+ if pattern.match(filename): # Check if the filename matches the pattern
50
+ return os.path.join(folder_path, filename) # Return full path
51
+
52
+ return None # Return None if no match found
53
+
54
+
55
+ def extract_matching_pictures(ref_list: list[str], images_dict: dict) -> list[int]:
+ """Return the numbers of the pictures whose text ref appears in ref_list."""
56
+
57
+ def extract_image_numbers(picture_refs):
58
+ image_numbers = [int(ref.split('/')[-1]) for ref in picture_refs]
59
+ return image_numbers
60
+
61
+ all_refs = set()
62
+ for ref_string in ref_list:
63
+ refs = ref_string.split()  # refs are stored space-separated in the metadata
64
+ all_refs.update(refs)
65
+
66
+ # Find matching picture keys where the image's value (text ref) is in all_refs
67
+ matching_pictures = [pic for pic, text_ref in images_dict.items() if text_ref in all_refs]
68
+
69
+ image_numbers = extract_image_numbers(matching_pictures)
70
+
71
+ return image_numbers
72
+
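+ # Illustrative run (refs hypothetical), with refs split on whitespace:
+ # extract_matching_pictures(["#/texts/12 #/texts/13"],
+ # {"#/pictures/7": "#/texts/12"}) -> [7]
+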
73
+ def extract_ref_paths(images_num_list: list[int])-> list[str]:
74
+ folder_path = "/home/kap2403/Desktop/Medico-AI-Bot/converted/ROBBINS-&-COTRAN-PATHOLOGIC-BASIS-OF-DISEASE-10TH-ED-with-image-refs-artifacts"
75
+ paths = []
76
+ for img_num in images_num_list:
77
+ path = find_image_by_number(folder_path = folder_path,
78
+ img_number= img_num)
79
+ paths.append(path)
80
+
81
+ return paths
82
+
83
+
84
+ def images_ref_pipeline(retrieved_docs):
85
+ with open(r"/home/kap2403/Desktop/Medico-AI-Bot/dataset/pictures.json", "r") as file:
86
+ images_data = json.load(file)
87
+
88
+ meta_data = extract_metadata(retrieved_docs)
89
+ image_numbers = extract_matching_pictures(meta_data, images_data)
90
+ paths_list = extract_ref_paths(image_numbers)
91
+
92
+ return paths_list
93
+
94
+
95
+
src/interface.py ADDED
@@ -0,0 +1,142 @@
1
+ import os
2
+ import base64
3
+ import io
4
+ from PIL import Image
5
+ import gradio as gr
6
+ from src.bot.bot import Medibot
7
+ from bs4 import BeautifulSoup
8
+ import markdown
9
+ from src.auth.auth import register_user, login_user
11
+ from groq import Groq
12
+
13
+ from src import config
14
+
15
+
16
+ #======================================
17
+ #=============utils====================
18
+ #======================================
19
+
20
+ # Helper functions
21
+ def markdown_to_plain_text(md_text: str) -> str:
22
+ html = markdown.markdown(md_text)
23
+ soup = BeautifulSoup(html, "html.parser")
24
+ return soup.get_text()
25
+
26
+ # Ensure base64 strings are properly formatted (no newlines/whitespace)
27
+ def decode_base64_to_image(base64_string):
28
+ # Clean the string before decoding
29
+ base64_string = base64_string.replace("\n", "").replace(" ", "")
30
+ image_data = base64.b64decode(base64_string)
31
+ return Image.open(io.BytesIO(image_data))
32
+
33
+
34
+ # Step 1: API Key Validation Logic
35
+ def validate_api_key(user_api_key):
36
+ global api_key
37
+ if not user_api_key:
38
+ return "❌ Please enter your Groq Cloud API key.", gr.update(visible=True), gr.update(visible=False)
39
+
40
+ try:
41
+ client = Groq(api_key=user_api_key)
42
+ response = client.chat.completions.create(
43
+ messages=[{"role": "user", "content": "Hello"}],
44
+ model="llama3-70b-8192"
45
+ )
46
+
47
+ api_key = user_api_key
48
+ os.environ["GROQ_API_KEY"] = api_key
49
+
50
+ return "βœ… API key is valid and saved!", gr.update(visible=False), gr.update(visible=True)
51
+
52
+ except Exception as e:
53
+ return f"❌ Invalid API key: {str(e)}", gr.update(visible=True), gr.update(visible=False)
54
+
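+ # The throwaway completion above is only a probe: any successful call proves the
+ # key works, so the response itself is discarded.
+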
55
+ def handle_login(userid, password, user_api_key):
56
+ if user_api_key:
57
+ # Step 1: Validate API Key first
58
+ try:
59
+ client = Groq(api_key=user_api_key)
60
+ response = client.chat.completions.create(
61
+ messages=[{"role": "user", "content": "Hello"}],
62
+ model="llama3-70b-8192"
63
+ )
64
+
65
+ # If API key is valid, proceed to register
66
+ success, msg = register_user(userid, password, user_api_key)
67
+ if success:
68
+ config.api_key = user_api_key
69
+ os.environ["GROQ_API_KEY"] = user_api_key
70
+ return "βœ… API Key validated & registered!", gr.update(visible=False), gr.update(visible=True)
71
+ else:
72
+ return msg, gr.update(visible=True), gr.update(visible=False)
73
+
74
+ except Exception as e:
75
+ # API key invalid
76
+ return f"❌ Invalid API Key: {str(e)}", gr.update(visible=True), gr.update(visible=False)
77
+
78
+ else:
79
+ # User is trying to login
80
+ success, saved_api_key = login_user(userid, password)
81
+ if success:
82
+ config.api_key = saved_api_key
83
+ os.environ["GROQ_API_KEY"] = saved_api_key
84
+ return "βœ… Login successful!", gr.update(visible=False), gr.update(visible=True)
85
+ else:
86
+ return "❌ Incorrect userid or password.", gr.update(visible=True), gr.update(visible=False)
87
+
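+ # Branching rule: a non-empty API key means "register" (validate the key, then
+ # create the account); an empty key means "login" with stored credentials.
+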
88
+
89
+ #======================================
90
+ #=============Interface================
91
+ #======================================
92
+
93
+ class Interface:
94
+ def __init__(self, config_path: str = "src/bot/configs/prompt.toml",
95
+ metadata_database: str = "database/metadata.csv",
96
+ faiss_database: str = "database/faiss_index"):
97
+
98
+ self.bot = Medibot(config_path = config_path,
99
+ metadata_database = metadata_database,
100
+ faiss_database = faiss_database,
101
+ )
102
+
103
+ def get_answer(self, question: str):
104
+ try:
105
+ answer_md, retrieved_docs, refered_tables, refered_images = self.bot.query(question)
106
+
107
+ # Convert answer to markdown display
108
+ answer_display = answer_md
109
+
110
+ # Format referenced tables as markdown
111
+ tables_display = "### Referenced Tables:\n\n"
112
+ if refered_tables:
113
+ for table_name, table_content in refered_tables.items():
114
+ tables_display += f"{table_content}\n\n"
115
+ else:
116
+ tables_display += "_No tables referenced._"
117
+
118
+ # Decode images
119
+ # Format images as markdown (base64)
120
+ images_display = []
121
+ if refered_images:
122
+ for image_name, base64_string in refered_images.items():
123
+ data_uri = f"data:image/png;base64,{base64_string}"
124
+ images_display.append(f'![]({data_uri})') # Markdown embedding for images
125
+ else:
126
+ images_display = None
127
+
128
+ # Combine retrieved document texts
129
+ retrieved_display = "### Retrieved Documents:\n\n"
130
+ if retrieved_docs:
131
+ for i, doc in enumerate(retrieved_docs):
132
+ retrieved_display += f"**Doc {i+1}:**\n{doc.page_content}\n\n"
133
+ else:
134
+ retrieved_display += "_No documents retrieved._"
135
+
136
+ return answer_display, tables_display, images_display, retrieved_display
137
+
138
+ except Exception as e:
139
+ return f"Error: {str(e)}", "", [], ""
140
+
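+ # Illustrative use: Interface() loads the bot once; get_answer(question) returns
+ # (answer_md, tables_display, images_display, retrieved_display) for the chat UI.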
141
+
142
+