Spaces:

Pavan2k4
/

rag

Sleeping

App Files Files Community

Pavan2k4 committed 10 days ago

Commit

6bccf2b

verified ·

1 Parent(s): 05b818f

Upload 7 files

Browse files

Files changed (7) hide show

app.py +359 -0
chains.py +262 -0
entities.py +12 -0
requirements.txt +213 -0
sample_user_data.json +37 -0
simple_rag.py +131 -0
tools.py +125 -0

app.py ADDED Viewed

	@@ -0,0 +1,359 @@

+import sys
+import os
+import pandas as pd
+import langchain
+os.environ['STREAMLIT_SERVER_ENABLE_STATIC_SERVING'] = 'false'
+from simple_rag import app
+import streamlit as st
+import json
+from io import StringIO
+import tiktoken
+import time
+from langchain_community.document_loaders import PyMuPDFLoader
+import traceback
+import sqlite3  # Import SQLite
+from dotenv import load_dotenv
+load_dotenv()
+import uuid  # Import the UUID library
+# Token limits
+config={"configurable": {"thread_id": "sample"}}
+GPT_LIMIT = 128000
+GEMINI_LIMIT = 1000000
+config={"configurable": {"thread_id": "sample"}}
+# Token counters
+def count_tokens_gpt(text):
+    enc = tiktoken.encoding_for_model("gpt-4")
+    return len(enc.encode(text))
+def count_tokens_gemini(text):
+    return len(text.split())  # Approximation
+# Calculate tokens for the entire context window
+def calculate_context_window_usage(json_data=None):
+    # Reconstruct the full conversation context
+    full_conversation = ""
+    for sender, message in st.session_state.chat_history:
+        full_conversation += f"{sender}: {message}\n\n"
+    # Add JSON context if provided
+    if json_data:
+        full_conversation += json.dumps(json_data)
+    gpt_tokens = count_tokens_gpt(full_conversation)
+    gemini_tokens = count_tokens_gemini(full_conversation)
+    return gpt_tokens, gemini_tokens
+# Page configuration
+st.set_page_config(page_title="📊 RAG Chat Assistant", layout="wide")
+# --- Database setup ---
+# DATABASE_PATH = "Data/chat_history.db"  # Original database path
+SESSION_DB_DIR = "Data/sessions"  # Directory to store individual session DBs
+def initialize_session_database(session_id):
+    """Initializes a new database for a chat session."""
+    db_path = os.path.join(SESSION_DB_DIR, f"{session_id}.db")
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute("""
+        CREATE TABLE IF NOT EXISTS chat_history (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            sender TEXT,
+            message TEXT,
+            timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
+        )
+    """)
+    conn.commit()
+    conn.close()
+    return db_path
+def save_message(db_path, sender, message):
+    """Saves a message to the specified session database."""
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute("INSERT INTO chat_history (sender, message) VALUES (?, ?)", (sender, message))
+    conn.commit()
+    conn.close()
+def clear_chat_history(db_path):
+    """Clears the chat history in the specified session database."""
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute("DELETE FROM chat_history")
+    conn.commit()
+    conn.close()
+# Initialize session DB directory
+if not os.path.exists(SESSION_DB_DIR):
+    os.makedirs(SESSION_DB_DIR)
+# --- Session state setup ---
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = [
+        ("assistant", "👋 Hello! I'm your RAG assistant. Please upload your JSON files and ask me a question about your portfolio.")
+    ]
+if "processing" not in st.session_state:
+    st.session_state.processing = False
+if "total_gpt_tokens" not in st.session_state:
+    st.session_state.total_gpt_tokens = 0  # Total accumulated
+if "total_gemini_tokens" not in st.session_state:
+    st.session_state.total_gemini_tokens = 0  # Total accumulated
+if "window_gpt_tokens" not in st.session_state:
+    st.session_state.window_gpt_tokens = 0  # Current context window
+if "window_gemini_tokens" not in st.session_state:
+    st.session_state.window_gemini_tokens = 0  # Current context window
+# Generate a unique session ID if one doesn't exist
+if "session_id" not in st.session_state:
+    st.session_state.session_id = str(uuid.uuid4())
+    st.session_state.session_db_path = initialize_session_database(st.session_state.session_id)  # Initialize session DB
+# --- Load chat history from the session database ---
+def load_chat_history(db_path):
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute("SELECT sender, message FROM chat_history ORDER BY timestamp")
+    history = cursor.fetchall()
+    conn.close()
+    return history
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Go one level up to reach RAG_rubik/
+PROJECT_ROOT = os.path.dirname(BASE_DIR)
+print(PROJECT_ROOT, BASE_DIR)
+# --- Layout: Chat UI Left | Progress Bars Right ---
+col_chat, col_progress = st.columns([3, 1])
+# --- LEFT COLUMN: Chat UI ---
+with col_chat:
+    st.title("💬 RAG Assistant")
+    with st.expander("📂 Upload Required JSON Files", expanded=True):
+        # user_data_file = st.file_uploader("Upload user_data.json", type="json", key="user_data")
+        # allocations_file = st.file_uploader("Upload allocations.json", type="json", key="allocations")
+        user_data_path = os.getenv('USER_DATA_PATH')
+        allocations_path = os.getenv('ALLOCATIONS_PATH')
+        try:
+            with open(user_data_path, 'r') as f:
+                user_data = json.load(f)
+        except FileNotFoundError:
+            st.error(f"Error: user_data.json not found at {user_data_path}")
+            user_data = None
+        except json.JSONDecodeError:
+            st.error(f"Error: Could not decode user_data.json. Please ensure it is valid JSON.")
+            user_data = None
+        try:
+            with open(allocations_path, 'r') as f:
+                allocations = json.load(f)
+        except FileNotFoundError:
+            st.error(f"Error: allocations.json not found at {allocations_path}")
+            allocations = None
+        except json.JSONDecodeError:
+            st.error(f"Error: Could not decode allocations.json. Please ensure it is valid JSON.")
+            allocations = None
+        if user_data:
+            sematic = user_data.get("sematic", {})
+            demographic = sematic.get("demographic", {})
+            financial = sematic.get("financial", {})
+            episodic = user_data.get("episodic", {}).get("prefrences", [])
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                            st.markdown("### 🧾 **Demographic Info**")
+                            for key, value in demographic.items():
+                                st.markdown(f"- **{key.replace('_', ' ').title()}**: {value}")
+            with col2:
+                            st.markdown("### 📊 **Financial Status**")
+                            for key, value in financial.items():
+                                st.markdown(f"- **{key.replace('_', ' ').title()}**: {value}")
+            with col3:
+                            st.markdown("### ⚙️ **Preferences & Goals**")
+                            st.markdown("**User Preferences:**")
+                            for pref in user_data.get("episodic", {}).get("prefrences", []):
+                                st.markdown(f"- {pref.capitalize()}")
+                            st.markdown("**Goals:**")
+                            for goal in user_data.get("episodic", {}).get("goals", []):
+                                for k, v in goal.items():
+                                    st.markdown(f"- **{k.replace('_', ' ').title()}**: {v}")
+        if "allocations" not in st.session_state:
+            st.session_state.allocations = allocations
+        if st.session_state.allocations:
+            try:
+                # allocations = json.load(StringIO(allocations_file.getvalue().decode("utf-8")))
+                st.markdown("### 💼 Investment Allocations")
+                # Flatten data for display
+                records = []
+                for asset_class, entries in st.session_state.allocations.items():
+                    for item in entries:
+                        records.append({
+                            "Asset Class": asset_class.replace("_", " ").title(),
+                            "Type": item.get("type", ""),
+                            "Label": item.get("label", ""),
+                            "Amount (₹)": item.get("amount", 0)
+                        })
+                df = pd.DataFrame(records)
+                st.dataframe(df)
+            except Exception as e:
+                st.error(f"Failed to parse allocations.json: {e}")
+        # Clear chat button
+        if st.button("Clear Chat"):
+            st.session_state.chat_history = [
+                ("assistant", "👋 Hello! I'm your RAG assistant. Please upload your JSON files and ask me a question about your portfolio.")
+            ]
+            st.session_state.total_gpt_tokens = 0
+            st.session_state.total_gemini_tokens = 0
+            st.session_state.window_gpt_tokens = 0
+            st.session_state.window_gemini_tokens = 0
+            # Clear the chat history in the session database
+            clear_chat_history(st.session_state.session_db_path)
+            st.rerun()
+    st.markdown("---")
+    # Display chat history
+    chat_container = st.container()
+    with chat_container:
+        for sender, message in st.session_state.chat_history:
+            if sender == "user":
+                st.chat_message("user").write(message)
+            else:
+                st.chat_message("assistant").write(message)
+        # Show thinking animation if processing
+        if st.session_state.processing:
+            thinking_placeholder = st.empty()
+            with st.chat_message("assistant"):
+                for i in range(3):
+                    for dots in [".", "..", "..."]:
+                        thinking_placeholder.markdown(f"Thinking{dots}")
+                        time.sleep(0.3)
+    # Input box at the bottom
+    user_input = st.chat_input("Type your question...")
+    if user_input and not st.session_state.processing:
+        # Set processing flag
+        st.session_state.processing = True
+        # Add user message to history immediately
+        st.session_state.chat_history.append(("user", user_input))
+        save_message(st.session_state.session_db_path, "user", user_input)  # Save user message to session DB
+        # Force a rerun to show the message and thinking indicator
+        st.rerun()
+# This part runs after the rerun if we're processing
+if st.session_state.processing:
+    if not user_data or not allocations:
+        st.session_state.chat_history.append(("assistant", "⚠️ Please upload both JSON files before asking questions."))
+        st.session_state.processing = False
+        st.rerun()
+    else:
+        try:
+            # Load JSONs
+            # user_data = json.load(StringIO(user_data_file.getvalue().decode("utf-8")))
+            # allocations = json.load(StringIO(allocations_file.getvalue().decode("utf-8")))
+            # Combined JSON data (for token calculation)
+            combined_json_data = {"user_data": user_data, "allocations": allocations}
+            # Get the last user message
+            last_user_message = next((msg for sender, msg in reversed(st.session_state.chat_history) if sender == "user"), "")
+            # Count tokens for this user message
+            user_msg_gpt_tokens = count_tokens_gpt(last_user_message)
+            user_msg_gemini_tokens = count_tokens_gemini(last_user_message)
+            # Add to accumulated totals
+            st.session_state.total_gpt_tokens += user_msg_gpt_tokens
+            st.session_state.total_gemini_tokens += user_msg_gemini_tokens
+            # Calculate context window usage (conversation + JSON data)
+            window_gpt, window_gemini = calculate_context_window_usage(combined_json_data)
+            st.session_state.window_gpt_tokens = window_gpt
+            st.session_state.window_gemini_tokens = window_gemini
+            # Check token limits for context window
+            if window_gpt > GPT_LIMIT or window_gemini > GEMINI_LIMIT:
+                st.session_state.chat_history.append(("assistant", "⚠️ Your conversation has exceeded token limits. Please clear the chat to continue."))
+                st.session_state.processing = False
+                st.rerun()
+            else:
+                # --- Call LangGraph ---
+                inputs = {
+                    "query": last_user_message,
+                    "user_data": user_data,
+                    "allocations": allocations,
+                    #"data":"",
+                    "chat_history": st.session_state.chat_history
+                }
+                print(st.session_state.chat_history)
+                output = app.invoke(inputs, config = config)
+                response = output.get('output')
+                print(response)
+                # Check if the response contains allocation updates
+                if "allocations" in output:
+                    st.session_state.allocations = output["allocations"]
+                # Count tokens for the response
+                response_gpt_tokens = count_tokens_gpt(response)
+                response_gemini_tokens = count_tokens_gemini(response)
+                # Add to accumulated totals
+                st.session_state.total_gpt_tokens += response_gpt_tokens
+                st.session_state.total_gemini_tokens += response_gemini_tokens
+                # Add to chat history
+                st.session_state.chat_history.append(("assistant", response))
+                # Update context window calculations after adding response
+                window_gpt, window_gemini = calculate_context_window_usage(combined_json_data)
+                st.session_state.window_gpt_tokens = window_gpt
+                st.session_state.window_gemini_tokens = window_gemini
+        except Exception as e:
+            tb = traceback.extract_stack()
+            filename, line_number, function_name, text = tb[-2]
+            error_message = f"❌ Error: {str(e)} in {filename} at line {line_number}, function: {function_name}"
+            st.session_state.chat_history.append(("assistant", error_message))
+        # Reset processing flag
+        st.session_state.processing = False
+        st.rerun()

chains.py ADDED Viewed

	@@ -0,0 +1,262 @@

+"""All prompts utilized by the RAG pipeline"""
+from langchain_core.prompts import ChatPromptTemplate, PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
+from langchain_core.output_parsers import StrOutputParser
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel, Field
+from langchain_google_genai import ChatGoogleGenerativeAI
+import os
+from tools import json_to_table, goal_feasibility, save_data, rag_tool
+from langchain.agents import initialize_agent, Tool
+from langchain.agents import AgentType
+from langgraph.prebuilt import create_react_agent
+from langchain.tools import Tool
+from dotenv import load_dotenv
+load_dotenv()
+# Gemini chat model; used further down as the generator in `rag_chain`.
+gemini = ChatGoogleGenerativeAI(model = 'gemini-2.0-flash')
+# OpenAI chat model for the chains built from `llm` below; the API key is read
+# from the OPEN_AI_KEY environment variable.
+# NOTE: `llm` is rebound later in this module, so statement order matters: every
+# chain captures whichever `llm` object exists at the point it is built.
+llm = ChatOpenAI(
+    model='gpt-4.1-nano',
+    api_key=os.environ.get('OPEN_AI_KEY'),
+    temperature=0.2
+)
+# Schema for grading documents
+# (kept docstring-free on purpose: no extra text is attached to the schema)
+class GradeDocuments(BaseModel):
+    binary_score: str = Field(description="Documents are relevant to the question, 'yes' or 'no'")
+# LLM wrapper whose output is parsed into a GradeDocuments instance.
+structured_llm_grader = llm.with_structured_output(GradeDocuments)
+system = """You are a grader assessing relevance of a retrieved document to a user question.
+    If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant.
+    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
+# Prompt inputs: {data} (the retrieved document) and {query} (the user question).
+grade_prompt = ChatPromptTemplate.from_messages([
+    ("system", system),
+    ("human", "Retrieved document: \n\n {data} \n\n User question: {query}")
+])
+# Relevance-grading chain: prompt -> structured-output LLM.
+retrieval_grader = grade_prompt | structured_llm_grader
+# Answer-generation prompt. Inputs: {query}, {data} (supporting documents),
+# {user_data} (user profile) and {allocations} (savings allocations).
+prompt = PromptTemplate(
+    template='''
+    You are a SEBI-Registered Investment Advisor (RIA) specializing in Indian financial markets and client relationship management.
+    Your task is to understand and respond to the user's financial query using the following inputs:
+    - Query: {query}
+    - Documents: {data}
+    - User Profile: {user_data}
+    - Savings Allocations: {allocations}
+    Instructions:
+    1. Understand the User's Intent: Carefully interpret what the user is asking about their investments.
+    2. Analyze Allocations: Evaluate the savings allocation data to understand the user's current financial posture.
+    3. Personalized Response:
+    - If detailed user profile and allocation data are available, prioritize your response based on this data.
+    - If profile or allocation data is sparse, rely more heavily on the query context.
+    4. Use Supporting Documents: Extract relevant insights from the provided documents ({data}) to support your answer.
+    5. When Unsure: If the documents or data do not contain the necessary information, say "I don't know" rather than guessing.
+    Always aim to give a response that is:
+    - Data-informed
+    - Client-centric
+    - Aligned with Indian financial regulations and norms
+    ''',
+    input_variables=['query', 'data', 'user_data', 'allocations']
+)
+# RAG answer chain: prompt -> Gemini -> plain string.
+rag_chain = prompt | gemini | StrOutputParser()
+# Prompt for rewriting a user question into a form better suited to web search.
+system_rewrite = """You a question re-writer that converts an input question to a better version that is optimized \n
+     for web search. Look at the input and try to reason about the underlying semantic intent / meaning."""
+# Single input variable: {query} (the original question).
+re_write_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", system_rewrite),
+        (
+            "human",
+            "Here is the initial question: \n\n {query} \n Formulate an improved question.",
+        ),
+    ]
+)
+# Rewriting chain: prompt -> OpenAI model -> plain string.
+question_rewriter = re_write_prompt | llm | StrOutputParser()
+from pydantic import BaseModel, Field, RootModel
+from typing import Dict
+from langchain_core.output_parsers import JsonOutputParser
+# Define the Pydantic model using RootModel
+class CategoryProbabilities(RootModel):
+    """Probabilities for different knowledge base categories."""
+    root: Dict[str, float] = Field(description="Dictionary mapping category names to probability scores")
+system_classifier = """You are a query classifier that determines the most relevant knowledge bases (KBs) for a given user query.
+Analyze the semantic meaning and intent of the query and assign probability scores (between 0 and 1) to each KB.
+Ensure the probabilities sum to 1 and output a JSON dictionary with category names as keys and probabilities as values.
+"""
+# Prompt inputs: {query} (the user question) and {categories} (the KB names to score).
+classification_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", system_classifier),
+        (
+            "human",
+            "Here is the user query: \n\n {query} \n\n Assign probability scores to each of the following KBs:\n"
+            "{categories}\n\nReturn a JSON object with category names as keys and probability scores as values."
+        ),
+    ]
+)
+# Create a JSON output parser
+json_parser = JsonOutputParser(pydantic_object=CategoryProbabilities)
+# Create the chain with the structured output parser
+query_classifier = classification_prompt | llm | json_parser
+#query_classifier = classification_prompt | llm | StrOutputParser()
+# NOTE(review): the string literal below is a bare expression statement with no
+# effect at runtime; it looks like leftover field notes for an allocation schema.
+"""
+name: str
+    position: Dict[str, int]
+    riskiness: int
+    illiquidity: int
+    amount: float
+    currency: str = "inr"
+    percentage: float
+    explanation: Dict[str, str]
+    assets: List[AssetAllocation]
+"""
+#--------------------------------------------------------------------------------------
+# Function-calling schemas handed to `llm.bind_tools` further down. Each entry
+# describes one callable by name, description and JSON-schema parameters.
+# NOTE(review): both schemas nest the real parameters under an "arguments"
+# object - confirm this matches the signatures of the tools in tools.py.
+tools = [
+  {
+    "type": "function",
+    "function": {
+      "name": "json_to_table",
+      "description": "Convert JSON data to a markdown table. Use when user asks to visualise or tabulate structured data.",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "arguments": {
+            "type": "object",
+            "properties": {
+              "json_data": {
+                "type": "object",
+                "description": "The JSON data to convert to a table"
+              }
+            },
+            "required": ["json_data"]
+          }
+        },
+        "required": ["arguments"]
+      }
+    }
+  },
+  {
+    "type": "function",
+    "function": {
+      "name": "rag_tool",
+      "description": "Lets the agent use RAG system as a tool",
+      "parameters": {
+        "type": "object",
+        "properties": {
+          "arguments": {
+            "type": "object",
+            "properties": {
+              "query": {
+                "type": "string",
+                "description": "The query to search for in the RAG system"
+              }
+            },
+            "required": ["query"]
+          }
+        },
+        "required": ["arguments"]
+      }
+    }
+  }
+]
+template = '''You are a SEBI-Registered Investment Advisor (RIA) specializing in Indian financial markets and client relationship management.
+Your task is to understand and respond to the user's financial query using the following inputs:
+- Query: {query}
+- User Profile: {user_data}
+- Savings Allocations: {allocations}
+- Chat History: {chat_history}
+- 🔎 Retrieved Context (optional): {retrieved_context}
+Instructions:
+1. **Understand the User's Intent**: Carefully interpret what the user is asking about their investments. If a user input contradicts previously stated preferences or profile attributes (e.g., low risk appetite or crypto aversion), ask a clarifying question before proceeding. Do not update allocations or goals unless the user confirms the change explicitly.
+2. **Analyze Allocations**: Evaluate the savings allocation data to understand the user's current financial posture.
+3. **Use Retrieved Context**: If any contextual information is provided in `retrieved_context`, leverage it to improve your response quality and relevance.
+4. **Always Update Information**: If the user shares any new demographic, financial, or preference-related data, update the user profile accordingly. If they request changes in their allocations, ensure the changes are applied **proportionally** and that the total allocation always sums to 100%.
+5. **IMPORTANT: When displaying or updating allocations, you MUST format the data as a Markdown table and always display allocations as a table only** using the following columns:
+   - Asset Class
+   - Type
+   - Label
+   - Old Amount (₹)
+   - Change (₹)
+   - New Amount (₹)
+   - Justification
+7. **Maintain Conversational Memory**: Ensure updates are passed to memory using the specified `updates` structure.
+8. **Tool Use Policy**:
+   - ✅ Use `rag_tool` for retrieving **external financial knowledge or regulation** context when necessary.
+---
+### 🎯 Response Style Guide:
+- 📝 Keep it under 300 words.
+- 😊 Friendly tone: be warm and helpful.
+- 📚 Structured: use bullet points, short paragraphs, and headers.
+- 👀 Visually clear: break sections clearly.
+- 🌟 Use emojis to guide attention and convey tone.
+- 🎯 Be direct and focused on the user's request.
+---
+### 🔁 If There Are Allocation Changes:
+You **must** display a Markdown table as per the format above. Then, return memory update instructions using this JSON structure:
+```json
+{{
+"updates": {{
+    "user_data": {{ ... }},      // Include only changed fields
+    "allocations": {{...}}       // Include only changed rows
+}}
+}}
+'''
+# Create the prompt template
+simple_prompt = ChatPromptTemplate.from_messages([
+    SystemMessagePromptTemplate.from_template(template=template),
+    MessagesPlaceholder(variable_name="chat_history", optional=True),
+    HumanMessagePromptTemplate.from_template("User Query: {query}"),
+    HumanMessagePromptTemplate.from_template("Current User Profile:\n{user_data}"),
+    HumanMessagePromptTemplate.from_template("Current Allocations:\n{allocations}"),
+    HumanMessagePromptTemplate.from_template("🔎 Retrieved Context (if any):\n{retrieved_context}"),
+])
+# Create the chain with direct tool binding
+llm =  ChatOpenAI(
+    temperature=0.1,
+    model="gpt-4.1-nano",
+)
+llm_with_tools = llm.bind_tools(tools)
+simple_chain = simple_prompt | llm_with_tools

entities.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from enum import IntEnum, StrEnum, Enum
+class KBCategory(str, Enum):
+    """Knowledge-base categories; each member's value is its snake_case string name."""
+    ProductCategory = "product_category"
+    InvestmentRegulations = "investment_regulations"
+    TaxationDetails = "taxation_details"
+    MarketSegments = "market_segments"
+    CulturalAspects = "cultural_aspects"
+    General = "general"
+class THRESHOLD(Enum):
+    """Single-member enum holding a numeric cut-off; read it via ``THRESHOLD.threshold.value``."""
+    # NOTE(review): presumably the minimum classifier probability for a KB to be
+    # considered - confirm against the code that consumes it.
+    threshold = 0.2

requirements.txt ADDED Viewed

	@@ -0,0 +1,213 @@

+absl-py==2.2.2
+agents==1.4.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.10.11
+aiosignal==1.3.2
+altair==5.5.0
+annotated-types==0.7.0
+anyio==4.9.0
+asttokens==3.0.0
+astunparse==1.6.3
+attrs==25.3.0
+beautifulsoup4==4.12.2
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.1.31
+charset-normalizer==3.4.1
+click==8.1.3
+cloudpickle==3.1.1
+colorama==0.4.6
+contourpy==1.3.1
+cycler==0.12.1
+dataclasses-json==0.6.7
+decorator==5.2.1
+distro==1.9.0
+executing==2.2.0
+fastapi==0.115.12
+filelock==3.18.0
+filetype==1.2.0
+flask==2.2.3
+flask-httpauth==4.8.0
+flatbuffers==25.2.10
+fonttools==4.57.0
+frozenlist==1.5.0
+fsspec==2025.3.2
+gast==0.6.0
+gitdb==4.0.12
+gitpython==3.1.44
+google-ai-generativelanguage==0.6.17
+google-api-core==2.24.2
+google-auth==2.38.0
+google-pasta==0.2.0
+googleapis-common-protos==1.69.2
+grandalf==0.8
+graphviz==0.20.3
+greenlet==3.1.1
+griffe==1.7.2
+grpcio==1.71.0
+grpcio-status==1.71.0
+gunicorn==21.2.0
+gym==0.26.2
+gym-notices==0.0.8
+h11==0.14.0
+h5py==3.13.0
+httpcore==1.0.7
+httpx==0.28.1
+httpx-sse==0.4.0
+huggingface-hub==0.30.1
+idna==3.10
+iniconfig==2.1.0
+ipython==9.0.2
+ipython-pygments-lexers==1.1.1
+itsdangerous==2.1.2
+jedi==0.19.2
+jinja2==3.1.2
+jiter==0.9.0
+joblib==1.4.2
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+keras==3.9.2
+kiwisolver==1.4.8
+langchain==0.3.21
+langchain-community==0.3.20
+langchain-core==0.3.49
+langchain-google-genai==2.1.2
+langchain-huggingface==0.1.2
+langchain-openai==0.3.11
+langchain-pinecone==0.2.3
+langchain-tests==0.3.17
+langchain-text-splitters==0.3.7
+langgraph==0.3.21
+langgraph-checkpoint==2.0.23
+langgraph-prebuilt==0.1.7
+langgraph-sdk==0.1.60
+langsmith==0.3.19
+libclang==18.1.1
+markdown==3.7
+markdown-it-py==3.0.0
+markupsafe==2.1.2
+marshmallow==3.26.1
+matplotlib==3.10.1
+matplotlib-inline==0.1.7
+mcp==1.6.0
+mdurl==0.1.2
+ml-dtypes==0.5.1
+mpmath==1.3.0
+multidict==6.3.1
+mypy-extensions==1.0.0
+namex==0.0.8
+narwhals==1.34.1
+networkx==3.4.2
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.69.0
+openai-agents==0.0.10
+opt-einsum==3.4.0
+optree==0.15.0
+orjson==3.10.16
+ormsgpack==1.9.1
+packaging==24.2
+pandas==2.2.3
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.1.0
+pinecone==5.4.2
+pinecone-plugin-inference==3.1.0
+pinecone-plugin-interface==0.0.7
+pluggy==1.5.0
+prompt-toolkit==3.0.50
+propcache==0.3.1
+proto-plus==1.26.1
+protobuf==5.29.4
+ptyprocess==0.7.0
+pure-eval==0.2.3
+pyarrow==19.0.1
+pyasn1==0.6.1
+pyasn1-modules==0.4.2
+pydantic==2.11.1
+pydantic-core==2.33.0
+pydantic-settings==2.8.1
+pydeck==0.9.1
+pygments==2.19.1
+pymupdf==1.25.5
+pyparsing==3.2.3
+pypd==1.1.0
+pypdf==5.4.0
+pytest==8.3.5
+pytest-asyncio==0.26.0
+pytest-mock==3.14.0
+pytest-socket==0.7.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.0
+pytz==2025.2
+pyyaml==6.0.2
+referencing==0.36.2
+regex==2024.11.6
+requests==2.31.0
+requests-toolbelt==1.0.0
+rich==14.0.0
+rpds-py==0.24.0
+rsa==4.9
+ruamel-yaml==0.18.10
+ruamel-yaml-clib==0.2.12
+safetensors==0.5.3
+scikit-learn==1.6.1
+scipy==1.15.2
+sentence-transformers==4.0.1
+setuptools==78.1.0
+six==1.17.0
+smmap==5.0.2
+sniffio==1.3.1
+soupsieve==2.6
+sqlalchemy==2.0.40
+sse-starlette==2.2.1
+stack-data==0.6.3
+starlette==0.46.1
+streamlit==1.44.1
+sympy==1.13.1
+syrupy==4.9.1
+tabulate==0.9.0
+tenacity==9.0.0
+tensorboard==2.19.0
+tensorboard-data-server==0.7.2
+termcolor==3.0.1
+threadpoolctl==3.6.0
+tiktoken==0.9.0
+tokenizers==0.21.1
+toml==0.10.2
+torch==2.6.0
+tornado==6.4.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.50.3
+triton==3.2.0
+types-requests==2.32.0.20250328
+typing-extensions==4.13.0
+typing-inspect==0.9.0
+typing-inspection==0.4.0
+tzdata==2025.2
+urllib3==2.3.0
+uvicorn==0.34.0
+watchdog==6.0.0
+wcwidth==0.2.13
+werkzeug==2.2.3
+wheel==0.45.1
+wrapt==1.17.2
+xxhash==3.5.0
+yarl==1.18.3
+zstandard==0.23.0

sample_user_data.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+    "sematic": {
+      "demographic": {
+        "age": 30,
+        "employment_type": "salaried",
+        "dependents": 0,
+        "health_status": "good",
+        "risk_appetite": 0,
+        "financial_maturity": 0,
+        "location": "tier_1"
+      },
+      "financial": {
+        "salary": 100000,
+        "business_value": 0,
+        "current_savings_and_investments": 1000000,
+        "debts": 0,
+        "market_outlook": "neutral",
+       "include_insights": true,
+        "is_housing_loan": false,
+        "monthly_expenses": 50000,
+        "property_value": 0,
+        "real_estate_type": "tier_1_residential",
+        "real_estate_value": 0,
+        "region": "ind",
+        "savings_percentage": 20
+      }
+    },
+    "episodic": {
+      "prefrences": [
+        "doesn't like crypto",
+        "doesn't want to be exposed to energy sector too much"
+      ]
+    }
+  }

simple_rag.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import sys
+import os
+from langgraph.graph import START, END, StateGraph
+from langchain_openai import OpenAIEmbeddings
+from chains import simple_chain, llm_with_tools
+from langchain_core.messages import BaseMessage, HumanMessage, ToolMessage, AIMessage
+from typing import TypedDict, Optional, Dict, List, Union, Annotated
+from langchain_core.messages import AnyMessage #human or AI message
+from langgraph.graph.message import add_messages # reducer in langgraph
+from langgraph.prebuilt import ToolNode, tools_condition
+from langchain.agents import initialize_agent, Tool
+from langchain.agents.agent_types import AgentType
+from langgraph.checkpoint.memory import MemorySaver
+import json
+import langchain
+from tools import json_to_table, goal_feasibility, rag_tool, save_data
+import re
+from dotenv import load_dotenv
+load_dotenv()
+# In-memory checkpointer handed to graph.compile() at the bottom of this module.
+memory = MemorySaver()
+# NOTE(review): this `config` is not referenced anywhere in the visible part of the file.
+config = {"thread_id":"sample"}
+# Tool callables imported from tools.py.
+tools = [json_to_table, rag_tool]
+#tool_executor = ToolExecutor([json_to_table, goal_feasibility])
+# NOTE: this name is rebound by the `def json_to_table_node` further down, so the
+# ToolNode created here is never the object registered on the graph.
+json_to_table_node = ToolNode([json_to_table])
+# Prebuilt node that executes `rag_tool`; registered on the graph as "query_rag".
+rag_tool_node = ToolNode([rag_tool])
+# State schema shared by every node of the graph.
+class Graph(TypedDict):
+    # Message list; the add_messages reducer merges updates into the existing list.
+    query: Annotated[list[AnyMessage], add_messages]
+    #chat_history : List[BaseMessage]
+    # User profile passed through to the prompt.
+    user_data : Dict
+    # Savings allocations passed through to the prompt.
+    allocations : Dict
+    #data : str
+    # NOTE(review): `chat` stores `result.content` here, which may not be a Dict - confirm.
+    output : Dict
+    # Extra context for the prompt; `chat` resets it to "" after each call.
+    retrieved_context: str
+def chat(state):
+    inputs = {
+        "query": state["query"],
+        "user_data": state["user_data"],
+        "allocations": state["allocations"],
+        #"data": state["data"],
+        "chat_history": state["query"],  # If you treat `query` as history
+        "retrieved_context": state.get("retrieved_context", "")
+    }
+    result = simple_chain.invoke(inputs)
+    #print(result)
+    return {
+        "query": state["query"],
+        "user_data": state["user_data"],
+        "allocations": state["allocations"],
+        #"data": state["data"],
+        "retrieved_context": "",  # clear after use
+        "output": result.content
+    }
+def json_to_table_node(state):
+    tool_output = json_to_table(state["allocations"])  # Or whatever your input is
+    return AIMessage(content=tool_output)
+def tools_condition(state):
+    last_message = state["query"][-1]  # Last user or AI message
+    if isinstance(last_message, AIMessage):
+        tool_calls = getattr(last_message, "tool_calls", None)
+        # Check if tool calls exist and handle them
+        if tool_calls:
+            tool_name = tool_calls[0].get('name', '')  # Safely access the tool name
+            if tool_name == "json_to_table":
+                return "show_allocation_table"
+            elif tool_name == "rag_tool":
+                return "query_rag"
+            else:
+                return "tools"  # Fallback in case of unknown tool names
+    return "END"  # End the flow if no tool calls are found
+# ---- GRAPH SETUP ----
+# Builder for a graph whose state follows the Graph TypedDict defined above.
+graph = StateGraph(Graph)
+# Nodes
+graph.add_node("chat", chat)
+# `json_to_table_node` resolves to the function defined above, which rebinds the
+# ToolNode assigned to the same name near the top of the module.
+graph.add_node("show_allocation_table", json_to_table_node)
+#graph.add_node("save_data_info", save_data_node)
+graph.add_node("query_rag", rag_tool_node)
+# NOTE(review): no edge in this module leads to this node, and it reads
+# state["tool_output"], a key that the Graph schema does not declare - confirm
+# whether it is still needed.
+graph.add_node("tool_output_to_message", lambda state: AIMessage(content=state["tool_output"]))
+#graph.add_node("tools", ToolNode(tools))  # fallback for other tools
+# Main flow
+graph.add_edge(START, "chat")
+# `tools_condition` is the function defined in this module; it shadows the name
+# imported from langgraph.prebuilt.
+graph.add_conditional_edges("chat", tools_condition)
+# Each tool goes back to chat
+graph.add_edge("show_allocation_table", "chat")
+#graph.add_edge("save_data_info", "chat")
+graph.add_edge("query_rag", "chat")
+# End after a loop
+# Unconditional edge from "chat" to END, registered in addition to the
+# conditional routing above.
+graph.add_edge("chat", END)
+# Compile
+# `app` is the compiled graph, using the MemorySaver instance as checkpointer.
+app = graph.compile(checkpointer=memory)
+'''
+with open('/home/pavan/Desktop/FOLDERS/RUBIC/RAG_without_profiler/RAG_rubik/sample_data/sample_alloc.json', 'r') as f:
+    data = json.load(f)
+with open('/home/pavan/Desktop/FOLDERS/RUBIC/RAG_without_profiler/RAG_rubik/sample_data/sample_alloc.json', 'r') as f:
+    allocs = json.load(f)
+inputs = {
+    "query":"display my investments.",
+    "user_data":data,
+    "allocations":allocs,
+    "data":"",
+    "chat_history": [],
+}
+langchain.debug = True
+print(app.invoke(inputs, config={"configurable": {"thread_id": "sample"}}).get('output'))
+#print(json_to_table.args_schema.model_json_schema())
+'''

tools.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import sys
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from langchain.tools import tool
+import pandas as pd
+import json
+import re
+from copy import deepcopy
+from langchain_pinecone import PineconeVectorStore
+from dotenv import load_dotenv
+load_dotenv()
+from langchain_openai import OpenAIEmbeddings
+from pydantic import BaseModel
+from typing import Any, Optional
+# Pinecone API key read from the environment at import time.
+# NOTE(review): the variable name is spelled 'PINCEONE' here and again in
+# rag_tool - confirm it matches the deployed environment before renaming.
+# This module-level value is not read by the code visible in this file.
+api_key = os.getenv('PINCEONE_API_KEY')
+# Argument wrapper for json_to_table: a single untyped payload field.
+class JsonToTableInput(BaseModel):
+    # json_to_table accepts a JSON string or an already-parsed object here.
+    json_data: Any
+# Argument wrapper for rag_tool.
+class RagToolInput(BaseModel):
+    # Text handed to the retriever by rag_tool.
+    query: str
+# Define the tools with proper validation
+def json_to_table(input_data: JsonToTableInput):
+    """Convert JSON data to a markdown table. Use when user asks to visualise or tabulate structured data."""
+    json_data = input_data.json_data
+    if isinstance(json_data, str):
+        try:
+            json_data = json.loads(json_data)
+        except:
+            # If json_data has parsing issues, try to work with it directly
+            pass
+    # Handle a common case in the prompt where 'allocations' might be a nested key
+    if isinstance(json_data, dict) and 'allocations' in json_data:
+        json_data = json_data['allocations']
+    # Ensure we have a valid list or dict to convert to DataFrame
+    if not json_data:
+        json_data = [{"Note": "No allocation data available"}]
+    df = pd.json_normalize(json_data)
+    markdown_table = df.to_markdown(index=False)
+    print(f"[DEBUG] json_to_table output:\n{markdown_table}")
+    return markdown_table
+def rag_tool(input_data: RagToolInput):
+    """Lets the agent use RAG system as a tool"""
+    query = input_data.query
+    embedding_model = OpenAIEmbeddings(
+        model="text-embedding-3-small",
+        dimensions=384
+    )
+    kb = PineconeVectorStore(
+        pinecone_api_key=os.environ.get('PINCEONE_API_KEY'),
+        index_name='rag-rubic',
+        namespace='vectors_lightmodel'
+    )
+    retriever = kb.as_retriever(search_kwargs={"k": 10})
+    context = retriever.invoke(query)
+    return "\n".join([doc.page_content for doc in context])
+@tool
+def goal_feasibility(goal_amount: float, timeline: float, current_savings: float, income : float) -> dict:
+    """Evaluate if a financial goal is feasible based on user income, timeline, and savings. Use when user asks about goal feasibility."""
+    # Input checks
+    if timeline <= 0:
+        return {
+            "feasible": False,
+            "status": "Invalid",
+            "monthly_required": 0,
+            "reason": "Timeline must be greater than 0 months."
+        }
+    # Calculate the remaining amount
+    remaining_amount = goal_amount - current_savings
+    if remaining_amount <= 0:
+        return {
+            "feasible": True,
+            "status": "Already Achieved",
+            "monthly_required": 0,
+            "reason": "You have already met or exceeded your savings goal."
+        }
+    monthly_required = remaining_amount / timeline
+    income_ratio = monthly_required / income
+    # Feasibility classification
+    if income_ratio <= 0.3:
+        status = "Feasible"
+        feasible = True
+        reason = "The required savings per month is manageable for an average income."
+    elif income_ratio <= 0.7:
+        status = "Difficult"
+        feasible = False
+        reason = "The required monthly saving is high but may be possible with strict budgeting."
+    else:
+        status = "Infeasible"
+        feasible = False
+        reason = "The required monthly saving is unrealistic for an average income."
+    return {
+        "feasible": feasible,
+        "status": status,
+        "monthly_required": round(monthly_required, 2),
+        "reason": reason
+    }
+@tool
+def save_data(new_user_data:dict, new_alloc_data:dict):
+    "Saves the updated user_data and allocations data in a json file."
+    path = os.getenv("DATA_PATH", ".")
+    save_path = os.path.join(path, "updated_json")
+    os.makedirs(save_path, exist_ok=True)
+    with open(os.path.join(save_path, "updated_user_data.json"), "w") as f:
+        json.dump(new_user_data, f, indent=2)
+    with open(os.path.join(save_path, "updated_allocations.json"), "w") as f:
+        json.dump(new_alloc_data, f, indent=2)