kap2403 committed on
Commit
5e433de
·
1 Parent(s): ab45c95

"added files"

.gitignore ADDED
@@ -0,0 +1,37 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *.pyo
+ *.pyd
+
+ # Logs
+ *.log
+
+ # Environment files
+ .env
+ .venv/
+ venv/
+
+ # Gradio, VSCode, PyCharm
+ .gradio/
+ # .vscode/
+ .idea/
+
+ # Jupyter Notebook checkpoints
+ .ipynb_checkpoints/
+
+ # System files
+ .DS_Store
+ Thumbs.db
+
+ # Ignore local folders
+ dataset/
+ database/
+ notebooks/
+ mbbs_bot/
+ faiss_index/
+
+ # Ignore test or temp files
+ *.tmp
+ *.bak
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,98 @@
+ import gradio as gr
+ from src.auth.auth import handle_login
+ from src.auth.db import initialize_db
+ from dotenv import load_dotenv
+ from src.interface import Interface
+
+ # Set up the user database and load environment variables
+ initialize_db()
+ load_dotenv()
+
+ bot = None  # No bot is created until the user logs in or registers
+
+
+ def start_bot(userid, password, api_key_input=None):
+     # api_key_input defaults to None so the Login button, which has no
+     # API-key field, can call this handler with only two inputs
+     global bot
+     login_status = handle_login(userid, password, api_key_input)
+
+     if "successful" in login_status:  # Check for successful login
+         bot = Interface()  # Initialize after login success
+         return (
+             login_status,
+             gr.update(visible=False),  # Hide login/registration section
+             gr.update(visible=True)    # Show chat section
+         )
+     else:
+         return (
+             login_status,
+             gr.update(visible=True),   # Keep login/registration section visible
+             gr.update(visible=False)   # Keep chat section hidden
+         )
+
+
+ def answer(message, history):
+     answer_md, tables_display, images_display, retrieved_display = bot.get_answer(message)
+
+     # Combine all parts into a single response string for chat
+     combined_response = f"{answer_md}\n\n{tables_display}"
+
+     # Append images as markdown
+     if images_display:
+         combined_response += "\n\n" + "\n\n".join(images_display)
+
+     return combined_response
+
+
+ with gr.Blocks(fill_height=True, fill_width=True) as app:
+
+     with gr.Column(visible=True) as login_register_section:
+         gr.Markdown("# 🔐 MediBot Login & Registration")
+         with gr.Tabs():
+             with gr.TabItem("Login"):
+                 userid_login = gr.Textbox(label="UserID")
+                 password_login = gr.Textbox(label="Password", type="password")
+                 login_btn = gr.Button("Login")
+                 login_output = gr.Textbox(label="Login Status", interactive=False)
+
+             with gr.TabItem("Register"):
+                 gr.Markdown("## 🔑 Enter Your Groq Cloud API Key")
+                 gr.Markdown("You can create an API key at "
+                             "[Groq Cloud Console](https://console.groq.com/keys)")
+                 userid_register = gr.Textbox(label="UserID")
+                 password_register = gr.Textbox(label="Password", type="password")
+                 api_key_register = gr.Textbox(
+                     label="Groq API Key",
+                     type="password",
+                     placeholder="sk-... (required)"
+                 )
+                 register_btn = gr.Button("Register")
+                 register_output = gr.Textbox(label="Registration Status",
+                                              interactive=False)
+
+     # Chat Section (initially hidden)
+     with gr.Column(visible=False) as chat_section:
+         gr.ChatInterface(
+             answer,
+             title="🩺 Medico-Bot",
+             examples=["briefly explain cancer to me", "types of skin diseases?"],
+             flagging_options=["Like", "Dislike"]
+         )
+
+
+     # Function connections
+     login_btn.click(
+         start_bot,
+         inputs=[userid_login, password_login],
+         outputs=[login_output, login_register_section, chat_section]
+     )
+
+     register_btn.click(
+         start_bot,
+         inputs=[userid_register, password_register, api_key_register],
+         outputs=[register_output, login_register_section, chat_section]
+     )
+
+
+ app.launch(share=True, show_error=True)
demo.py ADDED
@@ -0,0 +1,83 @@
+ import gradio as gr
+ import os
+ from dotenv import load_dotenv
+ from src.interface import (
+     Interface,
+     handle_login,
+ )
+ from src import config
+
+ # Load environment variables
+ load_dotenv()
+
+ bot = None  # No bot is created until the user logs in or registers
+
+
+ # Handle bot initialization after a successful login
+ def start_bot(userid, password, api_key_input):
+     # Run login first and get the result
+     global bot
+     login_status, login_section, chat_section = handle_login(userid, password, api_key_input)
+
+     # Initialize the bot once login is successful
+     if "successful" in login_status:  # Check for successful login
+         bot = Interface()  # Initialize after login success
+     # The bot lives in a module-level global, so only the three UI updates are
+     # returned, matching the three components in the click handler's outputs
+     return login_status, login_section, chat_section
+
+
+ def answer(message, history):
+     answer_md, tables_display, images_display, retrieved_display = bot.get_answer(message)
+
+     # Combine all parts into a single response string for chat
+     combined_response = f"{answer_md}\n\n{tables_display}"
+
+     # Append images as markdown
+     if images_display:
+         combined_response += "\n\n" + "\n\n".join(images_display)
+
+     return combined_response
+
+
+ # Build Gradio Interface
+ with gr.Blocks(fill_height=True, fill_width=True) as app:
+     # gr.Markdown("# 🧪 MediBot Login & Chat App")
+
+     # Login Section
+     with gr.Column(visible=True) as login_section:
+         gr.Markdown("## 🔑 Enter Your Groq Cloud API Key")
+         gr.Markdown("You can create an API key at [Groq Cloud Console](https://console.groq.com/keys)")
+         gr.Markdown("## 🔐 Login or Register")
+
+         userid_input = gr.Textbox(label="UserID")
+         password_input = gr.Textbox(label="Password", type="password")
+         api_key_input = gr.Textbox(
+             label="Groq API Key (only needed for registration)",
+             type="password",
+             placeholder="sk-... (optional)"
+         )
+
+         login_btn = gr.Button("Login / Register")
+         login_output = gr.Textbox(label="Login Status", interactive=False)
+
+     # Chat Section (initially hidden)
+     with gr.Column(visible=False) as chat_section:
+         gr.ChatInterface(
+             answer,
+             title="🩺 MediBot Chat Interface",
+             examples=["briefly explain cancer to me", "types of skin diseases?"],
+             flagging_options=["Like", "Dislike"]
+         )
+
+
+     login_btn.click(
+         fn=start_bot,
+         inputs=[userid_input, password_input, api_key_input],
+         outputs=[login_output, login_section, chat_section]
+     )
+
+
+ app.launch(share=True, show_error=True)
requirements.txt ADDED
@@ -0,0 +1,21 @@
+ docling
+ langchain-huggingface
+ accelerate
+ tiktoken==0.9.0
+ langchain-openai
+ langchain-community
+ faiss-cpu
+ groq
+ toml
+ langchain-groq
+ gradio==5.26.0
+ markdown
+ unstructured[all-docs]
+ pillow
+ lxml
+ bcrypt
src/__init__.py ADDED
File without changes
src/auth/auth.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ import sqlite3
+ import bcrypt
+ from src.auth.db import get_db_connection
+ from src import config
+ from groq import Groq
+
+ def register_user(userid, password, api_key):
+     conn = get_db_connection()
+     cursor = conn.cursor()
+     password_hash = bcrypt.hashpw(password.encode(), bcrypt.gensalt())
+     try:
+         cursor.execute('INSERT INTO users (userid, password_hash, api_key) VALUES (?, ?, ?)',
+                        (userid, password_hash, api_key))
+         conn.commit()
+         return True, "✅ Registered successfully!"
+     except sqlite3.IntegrityError:
+         # userid is the primary key, so a duplicate insert raises IntegrityError
+         return False, "❌ User already exists."
+     finally:
+         conn.close()
+
+ def login_user(userid, password):
+     conn = get_db_connection()
+     cursor = conn.cursor()
+     cursor.execute('SELECT password_hash, api_key FROM users WHERE userid=?', (userid,))
+     result = cursor.fetchone()
+     conn.close()
+
+     if result:
+         stored_hash, api_key = result
+         if bcrypt.checkpw(password.encode(), stored_hash):
+             return True, api_key
+     return False, None
+
+
+ def verify_login(userid, password):
+     # Verify the user's login credentials
+     success, saved_api_key = login_user(userid, password)
+     if success:
+         config.api_key = saved_api_key
+         os.environ["GROQ_API_KEY"] = saved_api_key
+         return "✅ Login successful!"
+     else:
+         return "❌ Incorrect userid or password."
+
+ def register_user_with_api_key(userid, password, user_api_key):
+     # Validate the API key first with a minimal test request
+     try:
+         client = Groq(api_key=user_api_key)
+         client.chat.completions.create(
+             messages=[{"role": "user", "content": "Hello"}],
+             model="llama3-70b-8192"
+         )
+
+         # If the API key is valid, proceed to register the user
+         success, msg = register_user(userid, password, user_api_key)
+         if success:
+             config.api_key = user_api_key
+             os.environ["GROQ_API_KEY"] = user_api_key
+             return "✅ API Key validated & registered!"
+         else:
+             return msg
+
+     except Exception as e:
+         # API key invalid
+         return f"❌ Invalid API Key: {str(e)}"
+
+
+ def handle_login(userid, password, user_api_key):
+     if user_api_key:
+         # A supplied API key means registration with key validation
+         return register_user_with_api_key(userid, password, user_api_key)
+     else:
+         # Otherwise, a standard login
+         return verify_login(userid, password)
src/auth/db.py ADDED
@@ -0,0 +1,19 @@
+ import sqlite3
+ from src.config import DB_PATH
+
+ def get_db_connection():
+     conn = sqlite3.connect(DB_PATH)
+     return conn
+
+ def initialize_db():
+     conn = get_db_connection()
+     cursor = conn.cursor()
+     cursor.execute('''
+         CREATE TABLE IF NOT EXISTS users (
+             userid TEXT PRIMARY KEY,
+             password_hash TEXT NOT NULL,
+             api_key TEXT NOT NULL
+         )
+     ''')
+     conn.commit()
+     conn.close()
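A minimal round-trip over this schema, using the helpers from src/auth/auth.py; the credentials and key below are made up for illustration:

    from src.auth.db import initialize_db
    from src.auth.auth import register_user, verify_login

    initialize_db()  # creates users.db (or DB_PATH) with the users table

    # hypothetical credentials, for illustration only
    ok, msg = register_user("student1", "s3cret", "gsk_example_key")
    print(msg)                                  # "✅ Registered successfully!" on first run
    print(verify_login("student1", "s3cret"))   # "✅ Login successful!"

Note that register_user stores the key as given; validation against Groq only happens in register_user_with_api_key.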
src/auth_app.py ADDED
@@ -0,0 +1,45 @@
+ import gradio as gr
+ import os
+ from groq import Groq
+
+ api_key = None  # Module-level holder for the validated key
+
+
+ # Function to validate and save the API key
+ def validate_api_key(user_api_key):
+     global api_key
+     if not user_api_key:
+         return "❌ Please enter your Groq Cloud API key."
+
+     try:
+         # Initialize Groq client
+         client = Groq(api_key=user_api_key)
+
+         # Make a test request
+         client.chat.completions.create(
+             messages=[{"role": "user", "content": "Hello"}],
+             model="llama-3.1-8b-instant"
+         )
+
+         # Save the API key to an environment variable if successful
+         api_key = user_api_key
+         os.environ["GROQ_API_KEY"] = api_key
+
+         return "✅ API key is valid and saved!"
+     except Exception as e:
+         return f"❌ Invalid API key: {str(e)}"
+
+ # Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("## 🔑 Enter Your Groq Cloud API Key")
+     gr.Markdown("You can create an API key at [Groq Cloud Console](https://console.groq.com/keys)")
+
+     api_key_input = gr.Textbox(label="Groq Cloud API Key", type="password", placeholder="sk-...")
+     submit_button = gr.Button("Validate API Key")
+     output_text = gr.Textbox(label="Status", interactive=False)
+
+     # Run the validation function on button click
+     submit_button.click(fn=validate_api_key, inputs=api_key_input, outputs=output_text)
+
+ demo.launch()
src/bot/__init__.py ADDED
File without changes
src/bot/bot.py ADDED
@@ -0,0 +1,83 @@
+ import os
+ import toml
+ import logging
+ from typing import List, Tuple
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.documents import Document
+ from langchain_community.vectorstores import FAISS
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
+ from langchain_groq import ChatGroq
+ from src.bot.extract_metadata import Metadata
+
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ class Medibot:
+     def __init__(self, config_path: str = "src/bot/configs/prompt.toml",
+                  metadata_database: str = "database/metadata.csv",
+                  faiss_database: str = "database/faiss_index"
+                  ):
+         """Initialize Medibot with configuration and Groq client."""
+         # Check environment variables
+         api_key = os.environ.get("GROQ_API_KEY")
+         if not api_key:
+             logger.error("GROQ_API_KEY not found in environment variables")
+             raise ValueError("GROQ_API_KEY is required")
+
+         # Load prompt configuration
+         try:
+             config = toml.load(config_path)
+             system_prompt = config["rag_prompt"]["system_prompt"]
+             user_prompt_template = config["rag_prompt"]["user_prompt_template"]
+
+         except (FileNotFoundError, toml.TomlDecodeError) as e:
+             logger.error(f"Failed to load config from {config_path}: {e}")
+             raise
+
+         # Initialize prompt template
+         self.prompt_template = ChatPromptTemplate.from_messages([
+             ("system", system_prompt),
+             ("user", user_prompt_template)
+         ])
+
+         # Initialize the vector database
+         embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
+         vector_store = FAISS.load_local(
+             faiss_database, embeddings, allow_dangerous_deserialization=True
+         )
+         self.retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})
+
+         # Initialize the Groq chat model
+         self.model = ChatGroq(
+             model="llama-3.1-8b-instant",
+             temperature=0.2,
+             max_tokens=None,
+             timeout=None,
+             max_retries=2,
+         )
+
+         self.metadata_extractor = Metadata(metadata_database)
+
+
+     def query(self, question: str) -> Tuple[str, List[Document], dict, dict]:
+         retrieved_docs = self.retriever.invoke(question)
+
+         rag_chain = (
+             RunnableParallel({
+                 "context": RunnableLambda(lambda _: retrieved_docs),  # Reuse retrieved docs
+                 "question": RunnablePassthrough()
+             })
+             | self.prompt_template
+             | self.model
+             | StrOutputParser()
+         )
+
+         answer = rag_chain.invoke({"question": question})
+
+         referred_tables, referred_images = self.metadata_extractor.get_data_from_ref(retrieved_docs)
+         return answer, retrieved_docs, referred_tables, referred_images
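A quick smoke test for the class above; this sketch assumes GROQ_API_KEY and OPENAI_API_KEY are set and that the default prompt and index paths from __init__ exist:

    from src.bot.bot import Medibot

    bot = Medibot()  # defaults: src/bot/configs/prompt.toml and database/faiss_index
    answer, docs, tables, images = bot.query("What are the types of skin diseases?")
    print(answer)
    print(f"{len(docs)} chunks retrieved, {len(tables)} tables, {len(images)} images")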
src/bot/configs/config.json ADDED
@@ -0,0 +1 @@
+ {"model_id": "llama-3.3-70b-versatile"}
src/bot/configs/prompt.toml ADDED
@@ -0,0 +1,25 @@
+ [rag_prompt]
+ system_prompt = """
+ You are a helpful and knowledgeable medical education assistant. You assist medical students by answering their questions using only the information retrieved from a trusted knowledge base. Your goal is to explain medical concepts clearly, concisely, and in a well-structured manner suitable for students.
+
+ - You must base all answers strictly on the retrieved content.
+ - Do not make up any information (no hallucination).
+ - If the answer cannot be found in the retrieved content, inform the user that the information was not found.
+ - Structure your answers with headings, bullet points, or numbered lists when appropriate.
+ - Aim to make complex topics easy to understand for learners.
+ - When providing an answer, always include a reference to the source (book name) from the metadata of the retrieved content.
+ """
+
+ user_prompt_template = """
+ You are a medical assistant helping students understand complex topics. Use the following retrieved context to answer the question. If the answer is not in the context, say that you couldn't find relevant information.
+
+ Context:
+ {context}
+
+ Question:
+ {question}
+
+ Answer:
+
+ Reference Books:
+ """
src/bot/extract_metadata.py ADDED
@@ -0,0 +1,80 @@
+ from langchain_core.documents import Document
+ from typing import Tuple, List
+ import pandas as pd
+ import re
+
+ class Metadata:
+     def __init__(self, ref_database_path: str):
+         self.df = pd.read_csv(ref_database_path)
+
+     def extract_ref_from_metadata(self, meta_data: dict) -> dict:
+         """Extract table and picture references from the metadata of a chunk."""
+
+         meta_data_dict = {}
+         meta_data_dict["source"] = meta_data.get("source", "")
+         self_ref = meta_data.get("self_ref", "")
+         parent_ref = meta_data.get("parent_ref", "")
+         child_ref = meta_data.get("child_ref", "")
+
+         formatted_self_ref = re.split(r'[,\s]+', self_ref or "")
+         formatted_parent_ref = re.split(r'[,\s]+', parent_ref or "")
+         formatted_child_ref = re.split(r'[,\s]+', child_ref or "")
+
+         filtered_self_ref_ids = [item for item in formatted_self_ref
+                                  if item.startswith('#/tables/') or item.startswith('#/pictures/')]
+         filtered_parent_ref_ids = [item for item in formatted_parent_ref
+                                    if item.startswith('#/tables/') or item.startswith('#/pictures/')]
+         filtered_child_ref_ids = [item for item in formatted_child_ref
+                                   if item.startswith('#/tables/') or item.startswith('#/pictures/')]
+
+         # Combine all filtered references into a set (to avoid duplicates)
+         all_filtered_references = set(filtered_self_ref_ids +
+                                       filtered_parent_ref_ids +
+                                       filtered_child_ref_ids)
+         if len(all_filtered_references) > 0:
+             meta_data_dict["self_ref"] = list(all_filtered_references)
+         return meta_data_dict
+
+     def extract_all_ref_from_retrieved_chunks(self, chunks: List[Document]) -> dict:
+         all_metadata = {}
+         # Iterate over the retrieved documents and collect their references
+         for idx, doc in enumerate(chunks):
+             meta_data = doc.metadata  # Extract metadata from the document
+             extracted_ref_data = self.extract_ref_from_metadata(meta_data)  # Extract references
+
+             # Add the extracted metadata to the all_metadata dictionary
+             if extracted_ref_data:
+                 all_metadata[f"doc_{idx}"] = extracted_ref_data
+
+         return all_metadata
+
+     def get_data_from_ref(self, chunks: List[Document]) -> Tuple[dict, dict]:
+         """Extract tables and pictures from metadata using references."""
+
+         tables = {}
+         images = {}
+
+         all_metadata = self.extract_all_ref_from_retrieved_chunks(chunks)
+
+         for meta in all_metadata.values():
+             source = meta.get("source", "")
+             ref = meta.get("self_ref", [])
+
+             for r in ref:
+                 reference_rows = self.df[
+                     (self.df['source'] == source) &
+                     (self.df['self_ref'] == r)
+                 ]
+
+                 if not reference_rows.empty:
+                     chunk_type = reference_rows["chunk_type"].values[0]
+                     page_content = reference_rows["page_content"].values[0]
+
+                     if chunk_type == "table":
+                         tables[r] = page_content
+                     elif chunk_type == "picture":
+                         images[r] = page_content
+
+         return tables, images
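The lookups above assume database/metadata.csv (not part of this commit) carries at least source, self_ref, chunk_type, and page_content columns. Two hypothetical rows for illustration only; judging by indexing.py, a picture row's page_content is its own reference, while a table row holds the markdown export:

    source,self_ref,chunk_type,page_content
    dermatology-textbook,#/tables/3,table,"| Disease | Key feature | ... |"
    dermatology-textbook,#/pictures/7,picture,#/pictures/7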
src/bot/utils.py ADDED
@@ -0,0 +1,23 @@
+ import toml
+ import json
+
+ class config_file_reader:
+     @staticmethod
+     def read_json(config_file: str):
+         with open(config_file, 'r') as file:
+             config_data = json.load(file)
+         return config_data
+
+     @staticmethod
+     def read_toml(config_file: str):
+         with open(config_file, 'r') as file:
+             config_data = toml.load(file)
+         return config_data
+
+     @staticmethod
+     def read_configs(file_path: str) -> dict:
+         # Dispatch on the file extension; both readers return a dict
+         if file_path.endswith(".json"):
+             configs = config_file_reader.read_json(file_path)
+         else:
+             configs = config_file_reader.read_toml(file_path)
+         return configs
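Usage is a one-liner; for example, reading the prompt config added in this commit:

    from src.bot.utils import config_file_reader

    configs = config_file_reader.read_configs("src/bot/configs/prompt.toml")
    print(configs["rag_prompt"]["system_prompt"][:60])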
src/config.py ADDED
@@ -0,0 +1,5 @@
+ import os
+
+ DB_PATH = os.getenv("DB_PATH", "users.db")
+
+ api_key = None
src/data_preprocessing/__init__.py ADDED
File without changes
src/data_preprocessing/converting_text_to_embeddings.py ADDED
@@ -0,0 +1,89 @@
+ # This module is responsible for converting text data into embeddings using the
+ # OpenAI API and storing them in a FAISS database.
+
+ import faiss
+ import tiktoken
+ from langchain_community.docstore.in_memory import InMemoryDocstore
+ from langchain_community.vectorstores import FAISS
+ from langchain_openai import OpenAIEmbeddings
+ from uuid import uuid4
+ from dotenv import load_dotenv
+ import logging
+ # other imports
+ from dataloader import dataloader
+
+ logging.basicConfig(level=logging.INFO)
+
+ def main(folder_path: str) -> None:
+     """
+     Main function to convert text data into embeddings and store them in a FAISS database.
+     The function uses the OpenAI API to generate embeddings and the FAISS library
+     to manage the index.
+
+     Args:
+         folder_path (str): path to the folder containing the data files.
+     """
+     logging.info("Loading environment variables...")
+     load_dotenv()  # Load environment variables from .env file
+     logging.info("Environment variables loaded.")
+     logging.info("Loading OpenAI embeddings...")
+     embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
+     logging.info("OpenAI embeddings loaded.")
+     logging.info("Creating FAISS index...")
+
+     # Create a FAISS index sized to the embedding dimension
+     index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
+     # Load the encoder used to count tokens
+     enc = tiktoken.get_encoding("cl100k_base")
+
+     vector_store = FAISS(
+         embedding_function=embeddings,
+         index=index,
+         docstore=InMemoryDocstore(),
+         index_to_docstore_id={},
+     )
+     logging.info("FAISS index created.")
+     logging.info("Loading data from folder...")
+     # Load the data
+     chunks_list, _, _, _ = dataloader(folder_path)
+     logging.info(f"Loaded {len(chunks_list)} chunks from folder: {folder_path}")
+     # Calculate the number of tokens
+     total_tokens = sum(len(enc.encode(doc.page_content)) for doc in chunks_list)
+     cost = (total_tokens / 1000000) * 0.13
+     logging.info(f"Total tokens: {total_tokens}")
+     logging.info(f"Estimated cost of using text-embedding-3-large: ${cost:.2f}")
+
+     # Ask the user for confirmation
+     proceed = input("Do you want to proceed with embedding and storing the data in FAISS? (yes/no): ").strip().lower()
+     if proceed not in ['yes', 'y']:
+         logging.info("Operation cancelled by the user.")
+         return
+     logging.info("Proceeding with embedding and storing the data in FAISS...")
+
+     # Convert the text data to embeddings and store it under fresh UUIDs
+     uuids = [str(uuid4()) for _ in range(len(chunks_list))]
+     vector_store.add_documents(documents=chunks_list, ids=uuids)
+     logging.info("Text data converted to embeddings and stored in the FAISS index.")
+     vector_store.save_local("faiss_index")
+     logging.info("FAISS index saved to local storage.")
+
+
+ if __name__ == "__main__":
+     folder_path = "dataset/converted_json_docs"
+     main(folder_path)
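The saved index is read back elsewhere in the repo (see src/bot/bot.py); a minimal load sketch, assuming the same embedding model and the faiss_index folder written above:

    from langchain_community.vectorstores import FAISS
    from langchain_openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )
    retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})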
src/data_preprocessing/dataloader.py ADDED
@@ -0,0 +1,174 @@
+ # This module loads the converted JSON and Markdown documents from disk and
+ # turns them into LangChain Document objects.
+
+ import os
+ import json
+ from langchain_core.documents import Document
+ from typing import List, Tuple
+ import logging
+
+ logging.basicConfig(level=logging.INFO)
+
+ #============================
+ # data loader from json and md files
+ #============================
+
+ def load_json_file(file_path: str) -> dict:
+     """
+     Load a JSON file and return its content as a dictionary.
+
+     Args:
+         file_path (str): Path to the JSON file.
+
+     Returns:
+         dict: Dictionary containing the JSON data.
+     """
+     with open(file_path, 'r') as file:
+         data = json.load(file)
+     return data
+
+ def load_md_file(file_path: str) -> str:
+     """
+     Load a Markdown file and return its content as a string.
+     The function reads the file in UTF-8 encoding.
+
+     Args:
+         file_path (str): Path to the Markdown file.
+
+     Returns:
+         str: Content of the Markdown file as a string.
+     """
+     with open(file_path, 'r', encoding='utf-8') as file:
+         content = file.read()
+     return content
+
+
+ def data_preprocess(folder_path: str) -> dict:
+     """
+     Load data from a folder containing JSON files and a Markdown file.
+     The function reads the following files:
+     - tables.json
+     - images.json
+     - text.json
+     - chunks.json
+     - {base_folder_name}-with-images.md
+
+     Args:
+         folder_path (str): Path to the folder containing the JSON and Markdown files.
+
+     Returns:
+         dict: A dictionary containing the loaded data from the JSON files and the
+             Markdown file.
+     """
+     tables_path = os.path.join(folder_path, "tables.json")
+     images_path = os.path.join(folder_path, "images.json")
+     text_path = os.path.join(folder_path, "text.json")
+     chunks_path = os.path.join(folder_path, "chunks.json")
+
+     # Extract the base folder name for the md file and images folder
+     base_folder_name = os.path.basename(folder_path)
+     images_folder_path = os.path.join(folder_path, f"{base_folder_name}-with-images_artifacts")
+     md_file_path = os.path.join(folder_path, f"{base_folder_name}-with-images.md")
+
+     # Load JSON contents
+     tables = load_json_file(tables_path)
+     images = load_json_file(images_path)
+     text = load_json_file(text_path)
+     chunks = load_json_file(chunks_path)
+
+     # Load Markdown content
+     markdown = load_md_file(md_file_path)
+
+     return {
+         "tables": tables,
+         "images": images,
+         "text": text,
+         "chunks": chunks,
+         "images_folder": images_folder_path,
+         "markdown": markdown
+     }
+
+
+ def load_json_data_documents(converted_document: dict, data_type: str) -> List[Document]:
+     """
+     Load JSON data documents from the converted document.
+     This function takes a converted document and a data type (e.g., "tables",
+     "images", "text", "chunks") and returns a list of Document objects.
+
+     Args:
+         converted_document (dict): The converted document containing data.
+         data_type (str): The type of data to load (e.g., "tables", "images", "text", "chunks").
+     Returns:
+         List[Document]: A list of Document objects containing the loaded data.
+     """
+     documents = []
+     for chunk in converted_document[data_type]:
+         content = chunk["content"]
+         metadata = chunk["metadata"]
+         # Create Document object
+         document = Document(
+             page_content=content,
+             metadata=metadata
+         )
+         documents.append(document)
+
+     return documents
+
+
+ #============================
+ # dataloader for all the data
+ # from the folder
+ # containing json and md files
+ # and images
+ #============================
+
+
+ def dataloader(folder_path: str) -> Tuple[list, list, list, list]:
+     """
+     Load data from every sub-folder containing JSON files and a Markdown file.
+
+     Args:
+         folder_path (str): Folder path containing all folders with JSON files and
+             Markdown files.
+     Returns:
+         Tuple[list, list, list, list]: list of chunks, list of pictures, list of tables,
+             and list of text of the overall data.
+     """
+
+     chunks_list = []
+     pictures_list = []
+     tables_list = []
+     text_list = []
+
+     logging.info(f"Loading data from folder: {folder_path}")
+     for file_name in os.listdir(folder_path):
+         logging.info(f"Processing file: {file_name}")
+         file_path = os.path.join(folder_path, file_name)
+
+         # load the data
+         dict_data = data_preprocess(file_path)
+         chunks_data = load_json_data_documents(dict_data, "chunks")
+         pictures_data = load_json_data_documents(dict_data, "images")
+         tables_data = load_json_data_documents(dict_data, "tables")
+         text_data = load_json_data_documents(dict_data, "text")
+
+         # add the data to the lists
+         chunks_list.extend(chunks_data)
+         pictures_list.extend(pictures_data)
+         tables_list.extend(tables_data)
+         text_list.extend(text_data)
+         logging.info(f"Loaded {len(chunks_data)} chunks, {len(pictures_data)} pictures, "
+                      f"{len(tables_data)} tables, and {len(text_data)} text documents from {file_name}")
+
+     return chunks_list, pictures_list, tables_list, text_list
+
+
+ if __name__ == "__main__":
+     # Example usage
+     folder_path = "dataset/converted_json_docs"
+     chunks, pictures, tables, text = dataloader(folder_path)
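For orientation, dataloader expects each book folder produced by the Docling pipeline to look roughly like this (the book folder name is hypothetical):

    dataset/converted_json_docs/
        anatomy-textbook/
            chunks.json
            tables.json
            images.json
            text.json
            anatomy-textbook-with-images.md
            anatomy-textbook-with-images_artifacts/    (extracted figure PNGs)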
src/data_preprocessing/docling/docling_utils.py ADDED
@@ -0,0 +1,199 @@
+ """
+ Convert Docling documents to LangChain documents:
+ 1. Extract images and tables from the Docling document.
+ 2. Extract the text from the Docling document.
+ 3. Create LangChain documents from the extracted images, tables, and text.
+ 4. Save the data to JSON files.
+ """
+ import json
+ import os
+ import itertools
+ from uuid import uuid4
+
+ from docling.document_converter import DocumentConverter
+ from docling_core.types.doc.document import TableItem, PictureItem
+ from docling_core.types.doc.labels import DocItemLabel
+ from langchain_core.documents import Document
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+
+ import logging
+
+ def adding_metadata_chunks(chunks: HybridChunker, file_name: str, speciality: str) -> list[Document]:
+     """Add metadata to the chunks.
+     This function processes a list of chunks and adds metadata to each chunk.
+
+     Args:
+         chunks (HybridChunker): The chunks to be processed.
+         file_name (str): The name of the file from which the chunks were created.
+         speciality (str): Specialization of the book.
+
+     Returns:
+         list[Document]: A list of Document objects with added metadata.
+     """
+     documents = []
+     for idx, chunk in enumerate(chunks):
+         items = chunk.meta.doc_items
+         if len(items) == 1 and isinstance(items[0], TableItem):
+             # If the chunk is a lone table, skip it; tables are extracted separately
+             continue
+
+         main_ref = ",".join([item.get_ref().cref for item in items])
+         parent_ref = ",".join([item.parent.get_ref().cref for item in items])
+         child_ref = ",".join([str(child) for sublist in [item.children for item in items] for child in sublist])
+
+         text = chunk.text  # The text of the chunk
+         metadata = {
+             "source": file_name,
+             "specilization": speciality,
+             "chunk_index": idx,
+             "self_ref": main_ref,
+             "parent_ref": parent_ref,
+             "child_ref": child_ref,
+             "chunk_type": "text",
+         }
+         document = Document(page_content=text, metadata=metadata)
+         documents.append(document)
+     return documents
+
+ def extract_all_text(docling_document: DocumentConverter,
+                      file_name: str,
+                      medical_specialty: str) -> list[Document]:
+     """Extract all the text from the Docling document and convert it to LangChain
+     documents. This is useful for creating a vector store from the text.
+
+     Args:
+         docling_document (DocumentConverter): Docling document.
+         file_name (str): name of the file.
+         medical_specialty (str): book category.
+
+     Returns:
+         list[Document]: list of LangChain documents.
+     """
+
+     documents_list = list()
+     for text in docling_document.texts:
+         content = text.text
+         main_ref = " ".join([text.get_ref().cref])
+         parent_ref = " ".join([text.parent.get_ref().cref])
+         child_ref = ", ".join([ref.get_ref().cref for ref in text.children])
+         # References are stored as plain strings so the metadata stays JSON-serializable
+         document = Document(page_content=content, metadata={
+             "source": file_name,
+             "chunk_index": None,
+             "self_ref": main_ref,
+             "parent_ref": parent_ref,
+             "child_ref": child_ref,
+             "chunk_type": "text",
+             "medical_specialty": medical_specialty,
+             "reference": None
+         })
+
+         documents_list.append(document)
+     return documents_list
+
+
+ def extract_tables(docling_document: DocumentConverter,
+                    file_name: str,
+                    medical_specialty: str) -> list[Document]:
+     """Extract the tables from the converted document and add metadata.
+
+     Args:
+         docling_document (DocumentConverter): converted document.
+         file_name (str): file name.
+         medical_specialty (str): book category.
+     Returns:
+         list[Document]: A list of documents containing table data with
+             reference IDs in the metadata.
+     """
+     tables: list[Document] = []
+     for table in docling_document.tables:
+         if table.label in [DocItemLabel.TABLE]:
+             main_ref = " ".join([table.get_ref().cref])
+             parent_ref = " ".join([table.parent.get_ref().cref])
+             child_ref = ",".join([ref.get_ref().cref for ref in table.children])
+
+             text = table.export_to_markdown()
+             metadata = {
+                 "source": file_name,
+                 "chunk_index": None,
+                 "self_ref": main_ref,
+                 "parent_ref": parent_ref,
+                 "child_ref": child_ref,
+                 "chunk_type": "table",
+                 "medical_specialty": medical_specialty,
+             }
+             document = Document(page_content=text, metadata=metadata)
+             tables.append(document)
+     return tables
+
+
+ def extract_text_ids(data: dict) -> list:
+     """
+     Extract all references from a dictionary and return a list of numbers
+     from any '#/texts/{number}' references.
+
+     Args:
+         data (dict): The dictionary to extract from.
+
+     Returns:
+         list: List of integers extracted from '#/texts/{number}' refs.
+     """
+     refs = [v for k, v in data.items() if k.endswith('_ref') and isinstance(v, str)]
+     text_ids = [int(ref.split('/')[2]) for ref in refs if ref.startswith('#/texts/')]
+     return text_ids
+
+
+ def save_json(file_path: str, category: str, data: list[Document]) -> None:
+     """Save the data in JSON format.
+
+     Args:
+         file_path (str): path of the output directory.
+         category (str): name of the output file (e.g. "tables").
+         data (list[Document]): list of documents.
+     """
+     doc_dicts = [{"content": doc.page_content, "metadata": doc.metadata} for doc in data]
+     with open(f"{file_path}/{category}.json", "w") as f:
+         json.dump(doc_dicts, f)
+
+
+ # def main(file_path: str,
+ #          file_name: str,
+ #          save_path: str,
+ #          ) -> list[Document]:
+ #     """Main function to convert Docling documents to LangChain documents.
+
+ #     Args:
+ #         file_path (str): path of the file.
+ #         file_name (str): name of the file.
+ #     Returns:
+ #         list[Document]: list of LangChain documents.
+ #     """
+ #     # Extract all text from the docling document
+ #     docling_document = DocumentConverter(file_path)
+ #     texts = extract_all_text(docling_document, file_name)
+
+ #     # Extract tables from the docling document
+ #     tables = modifying_tables(docling_document, file_name)
+
+ #     # Extract images from the docling document
+ #     # Combine all documents into a single list
+ #     documents = list(itertools.chain(texts, tables))
+
+ #     save_json(save_path, documents)
+
+
+ # if __name__ == "__main__":
+ #     logging.basicConfig(
+ #         level=logging.DEBUG,
+ #         format='%(asctime)s - %(levelname)s - %(message)s',
+ #         handlers=[
+ #             logging.StreamHandler(),
+ #             logging.FileHandler("app.log", mode='a')
+ #         ]
+ #     )
+ #     logging.info("Creating the dataset")
+ #     main(r"dataset",
+ #          file_name="medical_textbook",
+ #          save_path=r"dataset"
+ #     )
+ #     logging.info("Dataset created successfully")
+ #     logging.info("Dataset saved successfully")
src/data_preprocessing/docling/document_conversion.py ADDED
@@ -0,0 +1,187 @@
+ """
+ Script to convert all the PDF documents stored in Azure to Markdown format.
+ """
+
+ import logging
+ import shutil
+ from pathlib import Path
+ from azureml.fsspec import AzureMachineLearningFileSystem
+
+ from docling_core.types.doc import ImageRefMode
+ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+ from docling.datamodel.base_models import InputFormat
+ from docling.datamodel.settings import settings
+ from docling.document_converter import DocumentConverter, PdfFormatOption
+
+ from docling.datamodel.pipeline_options import (
+     AcceleratorDevice,
+     AcceleratorOptions,
+     PdfPipelineOptions,
+     TesseractCliOcrOptions,
+     TableFormerMode,
+ )
+
+ from indexing import document_indexing
+ from docling_utils import save_json
+
+
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+ class DoclingConversion:
+     def __init__(self, image_scale=1.0):
+         logging.info("Initializing DoclingConversion with image_scale=%s", image_scale)
+         accelerator_options = AcceleratorOptions(
+             num_threads=8, device=AcceleratorDevice.CUDA
+         )
+
+         # Turn on inline debug visualizations:
+         settings.debug.visualize_layout = True
+         settings.debug.visualize_ocr = True
+         settings.debug.visualize_tables = True
+         settings.debug.visualize_cells = True
+
+         pipeline_options = PdfPipelineOptions(
+             do_ocr=True,
+             do_table_structure=True,
+             images_scale=image_scale,
+             generate_page_images=True,
+             generate_picture_images=True,
+             accelerator_options=accelerator_options,
+             ocr_options=TesseractCliOcrOptions(force_full_page_ocr=True)
+         )
+
+         pipeline_options.table_structure_options.do_cell_matching = True
+         pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
+
+         self.converter = DocumentConverter(
+             format_options={
+                 InputFormat.PDF: PdfFormatOption(
+                     pipeline_options=pipeline_options,
+                     backend=DoclingParseV4DocumentBackend,
+                 )
+             }
+         )
+         logging.info("DoclingConversion initialized successfully.")
+
+     def document_conversion(self, file_path):
+         """Convert a file and return the document object."""
+         logging.info("Starting document conversion for file: %s", file_path)
+         return self.converter.convert(Path(file_path)).document
+
+     def save_document(self, file_path, output_dir, azure_fs):
+         """Convert a file, save the output as markdown with embedded images,
+         and upload to Azure."""
+         input_path = Path(file_path)
+         logging.info("Processing file: %s", file_path)
+
+         try:
+             result = self.converter.convert(input_path)
+             doc_name = input_path.stem
+             temp_md_file_path = Path(output_dir) / f"{doc_name}-with-images.md"
+
+             docling_document_class = document_indexing(result,
+                                                        "ibm-granite/granite-embedding-125m-english",
+                                                        speciality=input_path.parent.name,
+                                                        file_name=input_path.stem
+                                                        )
+             tables_doc = docling_document_class.extract_tables()
+             images_doc = docling_document_class.extract_images()
+             text_doc = docling_document_class.extract_all_text()
+             chunks_doc = docling_document_class.create_chunks()
+
+             # Save the extracted data as JSON
+             save_json(file_path=output_dir, category="tables", data=tables_doc)
+             save_json(file_path=output_dir, category="images", data=images_doc)
+             save_json(file_path=output_dir, category="text", data=text_doc)
+             save_json(file_path=output_dir, category="chunks", data=chunks_doc)
+             logging.info("Saved extracted data as JSON files.")
+
+             # Save locally first
+             result.document.save_as_markdown(temp_md_file_path, image_mode=ImageRefMode.REFERENCED)
+             logging.info("Saved locally: %s", temp_md_file_path)
+
+             # Upload to Azure
+             azure_output_path = f"converted_docs_json/{doc_name}"
+             azure_fs.upload(lpath=str(output_dir), rpath=azure_output_path, recursive=True)
+             logging.info("Uploaded to Azure: %s", azure_output_path)
+
+             # Optionally, delete the local directory after upload
+             if output_dir.exists() and output_dir.is_dir():
+                 shutil.rmtree(output_dir)
+                 logging.info("Deleted local directory: %s", output_dir)
+
+         except Exception as e:
+             logging.error("Error processing file %s: %s", file_path, e)
+
+
+ def main(source_dir: str):
+
+     logging.info("Starting main function with source_dir: %s", source_dir)
+
+     # Set the local directory to save PDFs
+     local_pdf_dir = Path("./local_pdfs")
+     local_pdf_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist
+     logging.info("Local PDF directory created: %s", local_pdf_dir)
+
+     fs = AzureMachineLearningFileSystem(source_dir)
+     all_pdf_files = fs.glob('**/*.pdf')
+     logging.info("Found %d PDF files in source directory.", len(all_pdf_files))
+
+     converter = DoclingConversion(image_scale=2)
+
+     for file_path in all_pdf_files:
+         output_dir = Path("./temp")
+         output_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist
+         logging.info("Temporary output directory created: %s", output_dir)
+
+         file_path_ = Path(file_path)
+         file_name = file_path_.name
+         local_pdf_path = local_pdf_dir / file_name
+         azure_output_path = f"converted_docs_json/{file_path_.stem}"
+
+         # Skip files that already exist in Azure
+         if fs.exists(azure_output_path):
+             logging.info("Skipping %s, already processed.", file_name)
+             continue
+
+         # Save the PDF locally
+         logging.info("Downloading file: %s", file_name)
+         with fs.open(file_path, "rb") as remote_file:
+             with open(local_pdf_path, "wb") as local_file:
+                 local_file.write(remote_file.read())
+         logging.info("File saved locally: %s", local_pdf_path)
+
+         # Process the local PDF file
+         logging.info("Processing: %s", file_name)
+         converter.save_document(local_pdf_path, output_dir, fs)
+
+         # Optionally, delete the local PDF after processing
+         local_pdf_path.unlink()
+         logging.info("Deleted local PDF: %s", local_pdf_path)
+
+     logging.info("Processing completed for all files.")
+
+
+ if __name__ == "__main__":
+     logging.info("Script started.")
+     main(source_dir=(
+         'azureml://subscriptions/485363cd-687d-4adb-a30b-35108c11d682/resourcegroups/medbot/workspaces/karthik/datastores/workspaceartifactstore/paths/UI/2025-04-11_075006_UTC/PdfFiles/'
+     ))
+     logging.info("Script finished.")
src/data_preprocessing/docling/indexing.py ADDED
@@ -0,0 +1,177 @@
+ """
+ Create chunks and clusters using the RAPTOR architecture.
+ """
+
+ import logging
+
+ from docling.document_converter import DocumentConverter
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+ from docling_core.types.doc.labels import DocItemLabel
+ from docling_core.types.doc.document import TableItem
+ from langchain_core.documents import Document
+
+ from transformers import AutoTokenizer
+
+ # duplicated from docling_utils so this script can run standalone
+ def adding_metadata_chunks(chunks: HybridChunker, file_name: str, speciality: str) -> list[Document]:
+     """Add metadata to the chunks.
+     This function processes a list of chunks and adds metadata to each chunk.
+
+     Args:
+         chunks (HybridChunker): The chunks to be processed.
+         file_name (str): The name of the file from which the chunks were created.
+         speciality (str): Specialization of the book.
+
+     Returns:
+         list[Document]: A list of Document objects with added metadata.
+     """
+     documents = []
+     for idx, chunk in enumerate(chunks):
+         items = chunk.meta.doc_items
+         if len(items) == 1 and isinstance(items[0], TableItem):
+             # If the chunk is a lone table, skip it; tables are extracted separately
+             continue
+
+         main_ref = " ".join([item.get_ref().cref for item in items])
+         parent_ref = " ".join([item.parent.get_ref().cref for item in items])
+         child_ref = " ".join([str(child) for sublist in [item.children for item in items] for child in sublist])
+
+         text = chunk.text  # The text of the chunk
+         metadata = {
+             "source": file_name,
+             "specilization": speciality,
+             "chunk_index": idx,
+             "self_ref": main_ref,
+             "parent_ref": parent_ref,
+             "child_ref": child_ref,
+             "chunk_type": "text",
+         }
+         document = Document(page_content=text, metadata=metadata)
+         documents.append(document)
+     return documents
+
+
+ class document_indexing:
+     def __init__(self,
+                  docling_converted_document: DocumentConverter,
+                  embeddings_model: str,
+                  speciality: str,
+                  file_name: str):
+         # the converted document
+         self.converted_document = docling_converted_document.document
+         # tokenizer used for hybrid chunking
+         self.embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model)
+         self.speciality = speciality
+         self.file_name = file_name
+
+     def create_chunks(self):
+         chunks = HybridChunker(tokenizer=self.embeddings_tokenizer).chunk(self.converted_document)
+         updated_chunks = adding_metadata_chunks(chunks=chunks,
+                                                 file_name=self.file_name,
+                                                 speciality=self.speciality)
+         return updated_chunks
+
+     def extract_all_text(self) -> list[Document]:
+         """Extract all the text from the Docling document and convert it to LangChain
+         documents. This is useful for creating a vector store from the text.
+
+         Returns:
+             list[Document]: list of LangChain documents.
+         """
+
+         documents_list = list()
+         for text in self.converted_document.texts:
+             content = text.text
+             main_ref = ",".join([text.get_ref().cref])
+             parent_ref = ",".join([text.parent.get_ref().cref])
+             child_ref = ",".join([ref.get_ref().cref for ref in text.children])
+             document = Document(page_content=content, metadata={
+                 "source": self.file_name,
+                 "chunk_index": None,
+                 "self_ref": main_ref,
+                 "parent_ref": parent_ref,
+                 "child_ref": child_ref,
+                 "chunk_type": "text",
+                 "medical_specialty": self.speciality,
+                 "reference": None
+             })
+
+             documents_list.append(document)
+         return documents_list
+
+     def extract_tables(self) -> list[Document]:
+         """Extract the tables from the converted document and add metadata.
+
+         Returns:
+             list[Document]: A list of documents containing table data with
+                 reference IDs in the metadata.
+         """
+         tables: list[Document] = []
+         for table in self.converted_document.tables:
+             if table.label in [DocItemLabel.TABLE]:
+                 main_ref = ",".join([table.get_ref().cref])
+                 parent_ref = ",".join([table.parent.get_ref().cref])
+                 child_ref = ",".join([ref.get_ref().cref for ref in table.children])
+
+                 text = table.export_to_markdown()
+                 metadata = {
+                     "source": self.file_name,
+                     "chunk_index": None,
+                     "self_ref": main_ref,
+                     "parent_ref": parent_ref,
+                     "child_ref": child_ref,
+                     "chunk_type": "table",
+                     "medical_specialty": self.speciality,
+                 }
+                 document = Document(page_content=text, metadata=metadata)
+                 tables.append(document)
+         return tables
+
+     def extract_images(self) -> list[Document]:
+         """Extract the pictures from the converted document and add metadata.
+
+         Returns:
+             list[Document]: A list of documents containing picture references with
+                 reference IDs in the metadata.
+         """
+         images: list[Document] = []
+         for picture in self.converted_document.pictures:
+             if picture.label in [DocItemLabel.PICTURE]:
+                 main_ref = ",".join([picture.get_ref().cref])
+                 parent_ref = ",".join([picture.parent.get_ref().cref])
+                 child_ref = ",".join([ref.get_ref().cref for ref in picture.children])
+                 metadata = {
+                     "source": self.file_name,
+                     "chunk_index": None,
+                     "self_ref": main_ref,
+                     "parent_ref": parent_ref,
+                     "child_ref": child_ref,
+                     # "picture", not "table": downstream lookups in
+                     # src/bot/extract_metadata.py route rows by this value
+                     "chunk_type": "picture",
+                     "medical_specialty": self.speciality,
+                 }
+                 document = Document(page_content=main_ref, metadata=metadata)
+                 images.append(document)
+         return images
src/data_preprocessing/docling/utils.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ contains all the functions to extract the tables, images and, text from the converted
3
+ documents.
4
+ """
5
+
6
+ import os
7
+ import re
8
+
9
+ from typing import List
10
+
11
+ from docling.chunking import HybridChunker
12
+ from docling_core.types.doc.document import TableItem
13
+ from langchain_core.documents import Document
14
+ from docling_core.types.doc.labels import DocItemLabel
15
+
16
+ from docling_core.types.doc.document import TableItem
17
+ from transformers import AutoTokenizer
18
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
19
+
20
+ __all__ = [
21
+ "sanitize_name",
22
+ "rename_items",
23
+ "find_matching_fig_ref",
24
+ "find_image_by_number",
25
+ "extract_images",
26
+ "extract_tables",
27
+ "extract_texts",
28
+ "find_relevant_folder"
29
+ ]
30
+
31
+
32
+ def sanitize_name(name:str)-> str:
33
+ """Replace '-', '_', and '–' with a single hyphen '-' and remove extra spaces.
34
+
35
+ Args:
36
+ name (str): file or folder name
37
+
38
+ Returns:
39
+ str: processed name
40
+ """
41
+ # Replace -, _, – with '-'
42
+ name = re.sub(r'[-_– ]+', '-', name)
43
+ # Replace multiple spaces with a single space
44
+ name = re.sub(r'\s+', ' ', name).strip()
45
+ return name
46
+
47
+ def rename_items(directory:str):
48
+ """Rename all files and folders inside the given directory.
49
+
50
+ Args:
51
+ directory (str): file or folder name
52
+ """
53
+ items = os.listdir(directory) # Get all files and folders inside the directory
54
+ for item in items:
55
+ old_path = os.path.join(directory, item)
56
+ new_name = sanitize_name(item) # Clean up the name
57
+ new_path = os.path.join(directory, new_name)
58
+
59
+ if old_path != new_path: # Rename only if the name changes
60
+ os.rename(old_path, new_path)
61
+ print(f"Renamed: {old_path} -> {new_path}")
62
+
63
+ def find_matching_fig_ref(doc1:dict, doc2:dict)-> str|None:
64
+ """Check the texts ids from text chunks metadata and pictures metadata if any id
65
+ matches then returns the image id.
66
+
67
+ Args:
68
+ doc1 (dict): text chunks metadata
69
+ doc2 (dict): picture metadata
70
+
71
+ Returns:
72
+ str|None: if similar text id matched in both the metadata then returns the
73
+ figure reference which is figure number. if no match None
74
+ """
75
+
76
+ # Extract and split self_ref and parent_ref into sets
77
+ doc1_self_refs = set(doc1['self_ref'].split()) # Split multiple self_refs
78
+ doc1_parent_refs = set(doc1['parent_ref'].split()) # Split multiple parent_refs
79
+
80
+ # Extract text_ref and fig_ref from doc2
81
+ doc2_text_ref = doc2['text_ref']
82
+ doc2_fig_ref = doc2['fig_ref']
83
+
84
+ # Check if text_ref exists in self_ref or parent_ref
85
+ if doc2_text_ref in doc1_self_refs or doc2_text_ref in doc1_parent_refs:
86
+ return doc2_fig_ref # Return fig_ref if there's a match
87
+ return None # No match found
88
+
89
+ def find_image_by_number(folder_path: str, img_number:int)-> str|None:
90
+ """Search for an image with the specified number in the folder.
91
+
92
+ Args:
93
+ folder_path (str): artifacts path where all the images were stored.
94
+ img_number (int): image id
95
+
96
+ Returns:
97
+ str|None: image path
98
+ """
99
+
100
+ pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
101
+
102
+ for filename in os.listdir(folder_path):
103
+ if pattern.match(filename): # Check if the filename matches the pattern
104
+ return os.path.join(folder_path, filename) # Return full path
105
+
106
+ return None # Return None if no match found
107
+
108
+ def extract_images(conv_document: Document) -> Document:
109
+ """Extract the images from the converted document and add the metadata.
110
+
111
+ Args:
112
+ conv_document (Document): converted document
113
+
114
+ Returns:
115
+ Document: pictures with the metadata.
116
+ """
117
+
118
+ pictures: list[Document] = []
119
+ for picture in conv_document.pictures:
120
+ figure_ref = picture.get_ref().cref
121
+ text_ref = picture.parent.get_ref().cref
122
+ document = Document(
123
+ page_content="",
124
+ metadata={
125
+ "fig_ref": figure_ref,
126
+ "text_ref": text_ref,
127
+ },)
128
+ pictures.append(document)
129
+ return pictures
130
+
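+ # Illustrative output for a document with one picture (refs hypothetical):
+ # [Document(page_content="", metadata={"fig_ref": "#/pictures/0", "text_ref": "#/texts/12"})]
+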
131
+ def extract_tables(conv_document,
132
+ file_name: str) -> list[Document]:
133
+ """Extract the tables from the converted document and add metadata.
134
+
135
+ Args:
136
+ conv_document: the converted docling document.
137
+ file_name (str): file name.
138
+
139
+ Returns:
140
+ list[Document]: A list of documents containing table data with
141
+ reference IDs in the metadata.
142
+ """
143
+ tables = []
144
+ for table in conv_document.tables:
145
+ if table.label in [DocItemLabel.TABLE]:
146
+
147
+ self_refs = table.get_ref().cref
148
+ parent_refs = table.parent.get_ref().cref if table.parent else ""
149
+
150
+ text = table.export_to_markdown()
151
+ document = Document(
152
+ page_content=text,
153
+ metadata={
154
+ "source": file_name,
155
+ "self_ref": self_refs,
156
+ "parent_ref": parent_refs,
157
+
158
+ },
159
+ )
160
+ tables.append(document)
161
+ return tables
162
+
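+ # Illustrative result (refs hypothetical): each table becomes a Document whose
+ # page_content is the markdown export and whose metadata looks like
+ # {"source": "robbins", "self_ref": "#/tables/0", "parent_ref": "#/body"}
+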
163
+ def extract_texts(conv_document,
164
+ pictures:List[Document],
165
+ images_artifacts: str,
166
+ embeddings_tokenizer: AutoTokenizer,
167
+ file_name: str
168
+ )-> List[Document]:
169
+ """Extract the text data from converted document and add the image path in the
170
+ metadata.
171
+
172
+ Args:
173
+ conv_document: the converted docling document.
174
+ pictures (List[Document]): extracted pictures list.
175
+ images_artifacts (str): artifacts folder used to resolve image paths.
176
+ embeddings_tokenizer (AutoTokenizer): tokenizer to chunk the texts.
177
+ file_name (str): file name.
178
+
179
+ Returns:
180
+ List[Document]: chunks with updated metadata.
181
+ """
182
+ texts = []
183
+ doc_id = 0
184
+ for chunk in HybridChunker(tokenizer=embeddings_tokenizer).chunk(conv_document):
185
+ items = chunk.meta.doc_items
186
+ self_refs = " ".join(map(lambda item: item.get_ref().cref, items))
187
+ parent_refs = items[0].parent.get_ref().cref if len(items) > 0 else ""
188
+ meta_data_dict = {
189
+ "source": file_name,
190
+ "self_ref": self_refs,
191
+ "parent_ref": parent_refs,
192
+ }
193
+
194
+ for picture in pictures:
195
+ fig_metadata = picture.metadata
196
+ fig_ref = find_matching_fig_ref(meta_data_dict, fig_metadata)
197
+ if fig_ref:
198
+ fig_number = int(fig_ref.split("/")[-1])
199
+ image_path = find_image_by_number(images_artifacts, fig_number)
200
+ meta_data_dict["fig_ref"] = image_path
201
+ meta_data_dict["fig_number"] = fig_number
202
+
203
+ text = chunk.text
204
+ document = Document(
205
+ page_content=text,
206
+ metadata= meta_data_dict,
207
+ )
208
+ texts.append(document)
209
+ return texts
210
+
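+ # Illustrative chunk metadata after a figure match (values hypothetical):
+ # {"source": "robbins", "self_ref": "#/texts/42", "parent_ref": "#/body",
+ # "fig_ref": "artifacts/image-007-ab12.png", "fig_number": 7}
+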
211
+
212
+
213
+
214
+ def find_relevant_folder(folder_path:str)->dict:
215
+ """create a dict with markdown file(key) and
216
+ artfacts (value).
217
+
218
+ Args:
219
+ folder_path (str): folder path where all the converted documents are stored.
220
+
221
+ Returns:
222
+ dict: markdown file name -> matching artifacts folder name.
223
+ """
224
+ # Renaming the files and folders by removing the spaces
225
+ rename_items(folder_path)
226
+
227
+ # Initialize the dataset dictionary
228
+ dataset_dict = {}
229
+
230
+ # Get all files and folders in the directory (do this only once)
231
+ all_items = os.listdir(folder_path)
232
+
233
+ # Split files and folders in one pass
234
+ md_files = {file for file in all_items if file.endswith(".md")}
235
+ folders = {folder for folder in all_items if not folder.endswith(".md")}
236
+
237
+ # Create a dictionary of folder name splits for efficient matching
238
+ folder_splits = {tuple(folder.split("-")[:-2]): folder for folder in folders}
239
+
240
+ for file in md_files:
241
+ file_split = tuple(file.split("-")[:-1])
242
+
243
+ # Check if file_split matches any folder's split
244
+ if file_split in folder_splits:
245
+ dataset_dict[file] = folder_splits[file_split]
246
+
247
+ return dataset_dict
248
+
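+ # Illustrative pairing (names hypothetical): "Book-Name-refs.md" maps to
+ # "Book-Name-refs-artifacts", since both reduce to ("Book", "Name") after
+ # dropping the last one and two hyphen-separated parts respectively.
+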
249
+
250
+ def extract_ref_text_ids(meta_data: dict) -> list[int]:
+ """Collect the unique numeric /texts/ ids referenced by a chunk's metadata."""
251
+ all_refs = []
252
+
253
+ # Go through all 3 ref fields
254
+ for key in ["self_ref", "parent_ref", "child_ref"]:
255
+ ref_str = meta_data.get(key)
256
+ if ref_str:
257
+ refs = ref_str.split(",") # split in case of multiple refs
258
+ all_refs.extend(refs)
259
+
260
+ # Remove duplicates
261
+ unique_refs = set(all_refs)
262
+
263
+ # Extract /texts/ IDs as integers
264
+ text_refs = [int(ref.split("/")[2]) for ref in unique_refs if "/texts/" in ref]
265
+
266
+ return text_refs
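+
+ # Illustrative run (refs hypothetical):
+ # extract_ref_text_ids({"self_ref": "#/texts/3 #/texts/4", "parent_ref": "#/texts/3"})
+ # -> [3, 4] (order may vary; duplicates removed by the set)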
src/data_preprocessing/docling/vector_database_pipeline.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Preprocess the data and build a FAISS vector database using docling, LangChain,
3
+ and OpenAI embeddings.
4
+ """
6
+ import os
7
+ from dotenv import load_dotenv
8
+ import itertools
9
+ from uuid import uuid4
10
+
11
+ import faiss
12
+ from langchain_community.docstore.in_memory import InMemoryDocstore
13
+ from langchain_community.vectorstores import FAISS
14
+
15
+ from langchain_openai import OpenAIEmbeddings
16
+
17
+ from docling.document_converter import DocumentConverter
19
+ from transformers import AutoTokenizer
20
+
21
+ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
22
+ from docling_core.types.doc.document import TableItem
23
+ from docling_core.types.doc.labels import DocItemLabel
24
+ from langchain_core.documents import Document
25
+
26
+ import logging
27
+
28
+ load_dotenv()
29
+
30
+ def adding_metadata_chunks(chunks, file_name: str, speciality: str) -> list[Document]:
31
+ """Adding metadata to the chunks
32
+ This function processes a list of chunks and adds metadata to each chunk.
33
+
34
+ Args:
35
+ chunks: chunk iterator returned by HybridChunker.chunk().
36
+ file_name (str): The name of the file from which the chunks were created.
37
+ speciality (str): specialization of the book.
38
+
39
+ Returns:
40
+ List[Document]: A list of Document objects with added metadata.
41
+ """
42
+ documents = []
43
+ for idx, chunk in enumerate(chunks):
44
+ items = chunk.meta.doc_items
45
+ if len(items) == 1 and isinstance(items[0], TableItem):
46
+ # If the chunk is a table, we can skip it
47
+ continue
48
+
49
+ main_ref = " ".join([item.get_ref().cref for item in items])
50
+ parent_ref = " ".join([item.parent.get_ref().cref for item in items])
51
+ child_ref = " ".join([str(child) for sublist in [item.children for item in items] for child in sublist])
52
+
53
+ text = chunk.text # The text of the chunk
54
+ metadata = {
55
+ "source": file_name,
56
+ "specilization": speciality,
57
+ "chunk_index": idx,
58
+ "self_ref": main_ref,
59
+ "parent_ref": parent_ref,
60
+ "child_ref": child_ref,
61
+ "chunk_type": "text",
62
+
63
+ }
64
+ document = Document(page_content=text, metadata=metadata)
65
+ documents.append(document)
66
+ return documents
67
+
68
+
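+ # Illustrative output (values hypothetical): each text chunk becomes
+ # Document(page_content="...", metadata={"source": "robbins",
+ # "specialization": "pathology", "chunk_index": 0, "self_ref": "#/texts/5",
+ # "parent_ref": "#/body", "child_ref": "", "chunk_type": "text"})
+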
69
+ def modifying_tables(docling_document, file_name: str, speciality: str) -> list[Document]:
70
+ """Extract the tables from the converted document and add metadata.
71
+
72
+ Args:
73
+ docling_document: the converted docling document.
74
+ file_name (str): file name.
75
+ speciality (str): specialization of the book.
76
+
77
+ Returns:
78
+ list[Document]: A list of documents containing table data with
79
+ reference IDs in the metadata.
80
+ """
81
+ tables: list[Document] = []
82
+ for table in docling_document.tables:
83
+ if table.label in [DocItemLabel.TABLE]:
84
+ main_ref = table.get_ref().cref
85
+ parent_ref = table.parent.get_ref().cref
86
+ child_ref = table.children
87
+
88
+ text = table.export_to_markdown()
89
+ metadata = {
90
+ "source": file_name,
91
+ "chunk_index": None,
92
+ "self_ref": main_ref,
93
+ "parent_ref": parent_ref,
94
+ "child_ref": child_ref,
95
+ "chunk_type": "table",
96
+ }
97
+ document = Document(page_content=text, metadata=metadata)
98
+ tables.append(document)
99
+ return tables
100
+
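+ # Design note: tables are exported as markdown, so page_content holds the
+ # rendered table while chunk_type="table" lets retrieval treat them separately
+ # from plain text chunks.
+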
101
+
102
+ def dataloader(file_path: str, embeddings_model: str, speciality: str = "") -> list[Document]:
103
+
104
+ logging.info("Converting the document to docling format...")
105
+ docling_document = DocumentConverter().convert(source=file_path).document
106
+ file_name = os.path.splitext(os.path.basename(file_path))[0]  # portable across OSes
107
+ # Create a hybrid chunker to chunk the document
108
+ embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model)
109
+ logging.info("Chunking the document...")
110
+ chunks = HybridChunker(tokenizer=embeddings_tokenizer).chunk(docling_document)
111
+
112
+ # Add metadata to the chunks
113
+ logging.info("Adding metadata to the chunks...")
114
+ texts = adding_metadata_chunks(chunks, file_name, speciality)
115
+ logging.info("Modifying tables...")
116
+ tables = modifying_tables(docling_document, file_name, speciality)
117
+ # Combine the text and table documents into a single list
118
+ documents = list(itertools.chain(texts, tables))
119
+ logging.info(f"Loaded {len(documents)} documents from {file_name}.")
120
+ return documents
121
+
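+ # Illustrative call (file path hypothetical; model name as in __main__ below):
+ # docs = dataloader("converted/robbins.md", "ibm-granite/granite-embedding-125m-english")
+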
122
+
123
+ def create_vector_database(documents: list[Document]) -> FAISS:
124
+ """Create a vector database from the documents.
125
+
126
+ Args:
127
+ documents (list[Document]): documents to embed and index.
129
+
130
+ Returns:
131
+ FAISS: the populated vector store.
132
+ """
133
+
134
+ logging.info("Creating the vector database...")
135
+ embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
136
+ index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
137
+ vector_store = FAISS(
138
+ embedding_function=embeddings,
139
+ index=index,
140
+ docstore=InMemoryDocstore(),
141
+ index_to_docstore_id={},
142
+ )
143
+ uuids = [str(uuid4()) for _ in range(len(documents))]
144
+ vector_store.add_documents(documents=documents, ids=uuids)
145
+ logging.info("Vector database created successfully.")
146
+
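+ # The returned store can be persisted for the app, matching the faiss_database
+ # default used by src/interface.py: vector_store.save_local("database/faiss_index")
+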
147
+
148
+ def main(file_path:str, embeddings_model:str) -> FAISS:
149
+ logging.basicConfig(level=logging.INFO)
151
+ documents = dataloader(file_path, embeddings_model)
152
+ return create_vector_database(documents)
153
+
154
+
155
+ if __name__ == "__main__":
156
+ file_path = r"converted\ROBBINS-&-COTRAN-PATHOLOGIC-BASIS-OF-DISEASE-10TH-ED-with-image-refs.md"
157
+ embeddings_model = "ibm-granite/granite-embedding-125m-english"
158
+ main(file_path, embeddings_model)
src/data_preprocessing/download_azure_data.py ADDED
@@ -0,0 +1,34 @@
1
+ # from azureml.core import Workspace, Dataset, Datastore
2
+
3
+ # # Azure ML workspace details
4
+ # subscription_id = '485363cd-687d-4adb-a30b-35108c11d682'
5
+ # resource_group = 'medbot'
6
+ # workspace_name = 'karthik'
7
+
8
+ # # Connect to the Azure ML workspace
9
+ # workspace = Workspace(subscription_id, resource_group, workspace_name)
10
+
11
+ # # Access the datastore
12
+ # datastore = Datastore.get(workspace, "workspaceartifactstore")
13
+
14
+ # # Access the dataset
15
+ # dataset = Dataset.File.from_files(path=(datastore, 'converted_document_reference'))
16
+
17
+ # # Download the dataset to the current directory
18
+ # dataset.download(target_path='.', overwrite=True)
19
+
20
+ # print("Download completed successfully.")
21
+
22
+
23
+
24
+ from azureml.core import Workspace, Dataset, Datastore
25
+
26
+ subscription_id = '485363cd-687d-4adb-a30b-35108c11d682'
27
+ resource_group = 'medbot'
28
+ workspace_name = 'karthik'
29
+
30
+ workspace = Workspace(subscription_id, resource_group, workspace_name)
31
+
32
+ datastore = Datastore.get(workspace, "workspaceartifactstore")
33
+ dataset = Dataset.File.from_files(path=(datastore, 'converted_docs_json'))
34
+ dataset.download(target_path='/home/kap2403/Desktop/Medico-AI-Bot/dataset/converted_json_docs', overwrite=True)
src/data_preprocessing/utils.py ADDED
@@ -0,0 +1,95 @@
1
+ """
2
+ In the current version images are not stored in the vector database (to limit
+ compute), so these utils extract references from the retrieved documents and
+ look up matching pictures: when a retrieved ref matches a picture's ref, the
+ corresponding image paths are returned.
3
+ """
4
+ import os
5
+ import re
6
+ import json
7
+ from langchain_core.documents import Document
8
+ from docling.document_converter import DocumentConverter
9
+
10
+
11
+ def extract_metadata(documents: list[Document]) -> list[str]:
+ """Collect the unique self/parent refs from the retrieved documents."""
12
+
13
+ references = []
14
+ for doc in documents:
15
+ meta_data = doc.metadata
16
+ self_ref = meta_data["self_ref"]
17
+ parent_ref = meta_data["parent_ref"]
18
+ if self_ref:
19
+ references.append(self_ref)
20
+ if parent_ref:
21
+ references.append(parent_ref)
22
+ unique_ref = list(set(references))
23
+ return unique_ref
24
+
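+ # Illustrative result for two retrieved chunks (refs hypothetical):
+ # ["#/texts/12", "#/body", "#/texts/40"] (deduplicated, order not guaranteed)
+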
25
+
26
+ def images_data(docling_document) -> dict:
+ """Map each picture's self_ref to the ref of the text item it belongs to."""
27
+ images_data = {}
28
+ for image in docling_document.pictures:
29
+ self_ref = image.self_ref
30
+ parent_ref = image.parent.cref
31
+ images_data[self_ref] = parent_ref
32
+ return images_data
33
+
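+ # Illustrative mapping (refs hypothetical):
+ # {"#/pictures/0": "#/texts/12", "#/pictures/1": "#/texts/40"}
+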
34
+
35
+ def find_image_by_number(folder_path: str, img_number:int)-> str|None:
36
+ """Search for an image with the specified number in the folder.
37
+
38
+ Args:
39
+ folder_path (str): artifacts path where all the images were stored.
40
+ img_number (int): image id
41
+
42
+ Returns:
43
+ str|None: full path to the matching image, or None if no match.
44
+ """
45
+
46
+ pattern = re.compile(rf"image-0*{img_number}-[a-fA-F0-9]+\.png") # Regex pattern
47
+
48
+ for filename in os.listdir(folder_path):
49
+ if pattern.match(filename): # Check if the filename matches the pattern
50
+ return os.path.join(folder_path, filename) # Return full path
51
+
52
+ return None # Return None if no match found
53
+
54
+
55
+ def extract_matching_pictures(ref_list: list[str], images_dict: dict) -> list[int]:
+ """Return the numbers of the pictures whose text ref appears in ref_list."""
56
+
57
+ def extract_image_numbers(picture_refs):
58
+ image_numbers = [int(ref.split('/')[-1]) for ref in picture_refs]
59
+ return image_numbers
60
+
61
+ all_refs = set()
62
+ for ref_string in ref_list:
63
+ refs = ref_string.split()  # refs are stored space-separated in the metadata
64
+ all_refs.update(refs)
65
+
66
+ # Find matching picture keys where the image's value (text ref) is in all_refs
67
+ matching_pictures = [pic for pic, text_ref in images_dict.items() if text_ref in all_refs]
68
+
69
+ image_numbers = extract_image_numbers(matching_pictures)
70
+
71
+ return image_numbers
72
+
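+ # Illustrative run (refs hypothetical), with refs split on whitespace:
+ # extract_matching_pictures(["#/texts/12 #/texts/13"],
+ # {"#/pictures/7": "#/texts/12"}) -> [7]
+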
73
+ def extract_ref_paths(images_num_list: list[int])-> list[str]:
74
+ folder_path = "/home/kap2403/Desktop/Medico-AI-Bot/converted/ROBBINS-&-COTRAN-PATHOLOGIC-BASIS-OF-DISEASE-10TH-ED-with-image-refs-artifacts"
75
+ paths = []
76
+ for img_num in images_num_list:
77
+ path = find_image_by_number(folder_path = folder_path,
78
+ img_number= img_num)
79
+ paths.append(path)
80
+
81
+ return paths
82
+
83
+
84
+ def images_ref_pipeline(retrieved_docs):
85
+ with open(r"/home/kap2403/Desktop/Medico-AI-Bot/dataset/pictures.json", "r") as file:
86
+ images_data = json.load(file)
87
+
88
+ meta_data = extract_metadata(retrieved_docs)
89
+ image_numbers = extract_matching_pictures(meta_data, images_data)
90
+ paths_list = extract_ref_paths(image_numbers)
91
+
92
+ return paths_list
93
+
94
+
95
+
src/interface.py ADDED
@@ -0,0 +1,142 @@
1
+ import os
2
+ import base64
3
+ import io
4
+ from PIL import Image
5
+ import gradio as gr
6
+ from src.bot.bot import Medibot
7
+ from bs4 import BeautifulSoup
8
+ import markdown
9
+ from src.auth.auth import register_user, login_user
11
+ from groq import Groq
12
+
13
+ from src import config
14
+
15
+
16
+ #======================================
17
+ #=============utils====================
18
+ #======================================
19
+
20
+ # Helper functions
21
+ def markdown_to_plain_text(md_text: str) -> str:
22
+ html = markdown.markdown(md_text)
23
+ soup = BeautifulSoup(html, "html.parser")
24
+ return soup.get_text()
25
+
26
+ # Ensure base64 strings are properly formatted (no newlines/whitespace)
27
+ def decode_base64_to_image(base64_string):
28
+ # Clean the string before decoding
29
+ base64_string = base64_string.replace("\n", "").replace(" ", "")
30
+ image_data = base64.b64decode(base64_string)
31
+ return Image.open(io.BytesIO(image_data))
32
+
33
+
34
+ # Step 1: API Key Validation Logic
35
+ def validate_api_key(user_api_key):
36
+ global api_key
37
+ if not user_api_key:
38
+ return "❌ Please enter your Groq Cloud API key.", gr.update(visible=True), gr.update(visible=False)
39
+
40
+ try:
41
+ client = Groq(api_key=user_api_key)
42
+ response = client.chat.completions.create(
43
+ messages=[{"role": "user", "content": "Hello"}],
44
+ model="llama3-70b-8192"
45
+ )
46
+
47
+ api_key = user_api_key
48
+ os.environ["GROQ_API_KEY"] = api_key
49
+
50
+ return "βœ… API key is valid and saved!", gr.update(visible=False), gr.update(visible=True)
51
+
52
+ except Exception as e:
53
+ return f"❌ Invalid API key: {str(e)}", gr.update(visible=True), gr.update(visible=False)
54
+
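+ # The throwaway completion above is only a probe: any successful call proves the
+ # key works, so the response itself is discarded.
+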
55
+ def handle_login(userid, password, user_api_key):
56
+ if user_api_key:
57
+ # Step 1: Validate API Key first
58
+ try:
59
+ client = Groq(api_key=user_api_key)
60
+ response = client.chat.completions.create(
61
+ messages=[{"role": "user", "content": "Hello"}],
62
+ model="llama3-70b-8192"
63
+ )
64
+
65
+ # If API key is valid, proceed to register
66
+ success, msg = register_user(userid, password, user_api_key)
67
+ if success:
68
+ config.api_key = user_api_key
69
+ os.environ["GROQ_API_KEY"] = user_api_key
70
+ return "βœ… API Key validated & registered!", gr.update(visible=False), gr.update(visible=True)
71
+ else:
72
+ return msg, gr.update(visible=True), gr.update(visible=False)
73
+
74
+ except Exception as e:
75
+ # API key invalid
76
+ return f"❌ Invalid API Key: {str(e)}", gr.update(visible=True), gr.update(visible=False)
77
+
78
+ else:
79
+ # User is trying to login
80
+ success, saved_api_key = login_user(userid, password)
81
+ if success:
82
+ config.api_key = saved_api_key
83
+ os.environ["GROQ_API_KEY"] = saved_api_key
84
+ return "βœ… Login successful!", gr.update(visible=False), gr.update(visible=True)
85
+ else:
86
+ return "❌ Incorrect userid or password.", gr.update(visible=True), gr.update(visible=False)
87
+
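+ # Branching rule: a non-empty API key means "register" (validate the key, then
+ # create the account); an empty key means "login" with stored credentials.
+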
88
+
89
+ #======================================
90
+ #=============Interface================
91
+ #======================================
92
+
93
+ class Interface:
94
+ def __init__(self, config_path: str = "src/bot/configs/prompt.toml",
95
+ metadata_database: str = "database/metadata.csv",
96
+ faiss_database: str = "database/faiss_index"):
97
+
98
+ self.bot = Medibot(config_path = config_path,
99
+ metadata_database = metadata_database,
100
+ faiss_database = faiss_database,
101
+ )
102
+
103
+ def get_answer(self, question: str):
104
+ try:
105
+ answer_md, retrieved_docs, refered_tables, refered_images = self.bot.query(question)
106
+
107
+ # Convert answer to markdown display
108
+ answer_display = answer_md
109
+
110
+ # Format referenced tables as markdown
111
+ tables_display = "### Referenced Tables:\n\n"
112
+ if refered_tables:
113
+ for table_name, table_content in refered_tables.items():
114
+ tables_display += f"{table_content}\n\n"
115
+ else:
116
+ tables_display += "_No tables referenced._"
117
+
118
+ # Decode images
119
+ # Format images as markdown (base64)
120
+ images_display = []
121
+ if refered_images:
122
+ for image_name, base64_string in refered_images.items():
123
+ data_uri = f"data:image/png;base64,{base64_string}"
124
+ images_display.append(f'![]({data_uri})') # Markdown embedding for images
125
+ else:
126
+ images_display = None
127
+
128
+ # Combine retrieved document texts
129
+ retrieved_display = "### Retrieved Documents:\n\n"
130
+ if retrieved_docs:
131
+ for i, doc in enumerate(retrieved_docs):
132
+ retrieved_display += f"**Doc {i+1}:**\n{doc.page_content}\n\n"
133
+ else:
134
+ retrieved_display += "_No documents retrieved._"
135
+
136
+ return answer_display, tables_display, images_display, retrieved_display
137
+
138
+ except Exception as e:
139
+ return f"Error: {str(e)}", "", [], ""
140
+
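+ # Illustrative use: Interface() loads the bot once; get_answer(question) returns
+ # (answer_md, tables_display, images_display, retrieved_display) for the chat UI.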
141
+
142
+