Spaces:

nolanzandi
/

virtual-data-analyst

Running

App Files Files Community

doc_db_integration

#32

by nolanzandi - opened about 18 hours ago

base: refs/heads/main

←

from: refs/pr/32

Discussion Files changed

+330

-8

Files changed (10) hide show

app.py +3 -1
data_sources/__init__.py +2 -1
data_sources/connect_doc_db.py +36 -0
data_sources/connect_sql_db.py +1 -1
functions/__init__.py +4 -4
functions/chat_functions.py +79 -0
functions/query_functions.py +72 -0
requirements.txt +3 -0
templates/doc_db.py +94 -0
tools/tools.py +36 -1

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from utils import TEMP_DIR, message_dict
 import gradio as gr
-import templates.data_file as data_file, templates.sql_db as sql_db
 import os
 from getpass import getpass
@@ -74,6 +74,8 @@ with gr.Blocks(theme=theme, css=css, head=head, delete_cache=(3600,3600)) as dem
         data_file.demo.render()
     with gr.Tab("SQL Database"):
         sql_db.demo.render()
     footer = gr.HTML("""<!-- Footer -->
         <footer class="max-w-4xl mx-auto mt-12 text-center text-gray-500 text-sm">

 from utils import TEMP_DIR, message_dict
 import gradio as gr
+import templates.data_file as data_file, templates.sql_db as sql_db, templates.doc_db as doc_db
 import os
 from getpass import getpass
         data_file.demo.render()
     with gr.Tab("SQL Database"):
         sql_db.demo.render()
+    with gr.Tab("Document (MongoDB) Database"):
+        doc_db.demo.render()
     footer = gr.HTML("""<!-- Footer -->
         <footer class="max-w-4xl mx-auto mt-12 text-center text-gray-500 text-sm">

data_sources/__init__.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from .upload_file import process_data_upload
 from .connect_sql_db import connect_sql_db
-__all__ = ["process_data_upload","connect_sql_db"]

 from .upload_file import process_data_upload
 from .connect_sql_db import connect_sql_db
+from .connect_doc_db import connect_doc_db
+__all__ = ["process_data_upload","connect_sql_db","connect_doc_db"]

data_sources/connect_doc_db.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from pymongo import MongoClient
+import os
+from utils import TEMP_DIR
+from pymongo_schema.extract import extract_pymongo_client_schema
+def connect_doc_db(connection_string, nosql_db_name, session_hash):
+    """Connect to a MongoDB database and collect its collection names and schema.
+
+    Args:
+        connection_string: MongoDB connection URI passed straight to ``MongoClient``.
+        nosql_db_name: Name of the database to inspect.
+        session_hash: Session identifier; a ``doc_db`` working folder is
+            created for it under ``TEMP_DIR``.
+
+    Returns:
+        ``["success", html_message, collection_names, schema]`` when everything
+        works, otherwise ``["error", html_message]``.
+    """
+    client = None
+    try:
+        # Create a MongoClient object
+        client = MongoClient(connection_string)
+        print("Connected to NoSQL Mongo DB")
+        # Access a database
+        db = client[nosql_db_name]
+        collection_names = db.list_collection_names()
+        print(collection_names)
+        # Restrict the schema scan to the requested database; without the
+        # filter every database reachable through the client is sampled.
+        schema = extract_pymongo_client_schema(client, database_names=[nosql_db_name])
+        dir_path = TEMP_DIR / str(session_hash) / 'doc_db'
+        os.makedirs(dir_path, exist_ok=True)
+        return ["success","<p style='color:green;text-align:center;font-size:18px;'>Document database connected successful</p>", collection_names, schema]
+    except Exception as e:
+        print("DocDB CONNECTION ERROR")
+        print(e)
+        return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]
+    finally:
+        # Close the connection on every path, including failures after the
+        # client was created.
+        if client is not None:
+            client.close()
+            print("MongoDB Connection closed.")

data_sources/connect_sql_db.py CHANGED Viewed

@@ -36,7 +36,7 @@ def connect_sql_db(url, sql_user, sql_port, sql_pass, sql_db_name, session_hash)
         return ["success","<p style='color:green;text-align:center;font-size:18px;'>SQL database connected successful</p>", table_names]
     except Exception as e:
-        print("UPLOAD ERROR")
         print(e)
         return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]

         return ["success","<p style='color:green;text-align:center;font-size:18px;'>SQL database connected successful</p>", table_names]
     except Exception as e:
+        print("SQL DB CONNECTION ERROR")
         print(e)
         return ["error",f"<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: {e}</p>"]

functions/__init__.py CHANGED Viewed

@@ -1,9 +1,9 @@
-from .query_functions import SQLiteQuery, sqlite_query_func, PostgreSQLQuery, sql_query_func
 from .chart_functions import table_generation_func, scatter_chart_generation_func, \
 line_chart_generation_func, bar_chart_generation_func, pie_chart_generation_func, histogram_generation_func, scatter_chart_fig
-from .chat_functions import sql_example_question_generator, example_question_generator, chatbot_with_fc, sql_chatbot_with_fc
 from .stat_functions import regression_func
-__all__ = ["SQLiteQuery","sqlite_query_func","sql_query_func","table_generation_func","scatter_chart_generation_func",
            "line_chart_generation_func","bar_chart_generation_func","regression_func", "pie_chart_generation_func", "histogram_generation_func",
-           "scatter_chart_fig","sql_example_question_generator","example_question_generator","chatbot_with_fc","sql_chatbot_with_fc"]

+from .query_functions import SQLiteQuery, sqlite_query_func, PostgreSQLQuery, sql_query_func, doc_db_query_func
 from .chart_functions import table_generation_func, scatter_chart_generation_func, \
 line_chart_generation_func, bar_chart_generation_func, pie_chart_generation_func, histogram_generation_func, scatter_chart_fig
+from .chat_functions import sql_example_question_generator, example_question_generator, doc_db_example_question_generator, chatbot_with_fc, sql_chatbot_with_fc, doc_db_chatbot_with_fc
 from .stat_functions import regression_func
+__all__ = ["SQLiteQuery","sqlite_query_func","sql_query_func","doc_db_query_func","table_generation_func","scatter_chart_generation_func",
            "line_chart_generation_func","bar_chart_generation_func","regression_func", "pie_chart_generation_func", "histogram_generation_func",
+           "scatter_chart_fig","doc_db_example_question_generator","sql_example_question_generator","example_question_generator","chatbot_with_fc","sql_chatbot_with_fc","doc_db_chatbot_with_fc"]

functions/chat_functions.py CHANGED Viewed

@@ -57,6 +57,26 @@ def sql_example_question_generator(session_hash, db_tables, db_name):
     return example_response["replies"][0].text
 def chatbot_with_fc(message, history, session_hash):
     from functions import sqlite_query_func, table_generation_func, regression_func, scatter_chart_generation_func, \
         line_chart_generation_func,bar_chart_generation_func,pie_chart_generation_func,histogram_generation_func
@@ -170,4 +190,63 @@ def sql_chatbot_with_fc(message, history, session_hash, db_url, db_port, db_user
             message_dict[session_hash]['sql'].append(response["replies"][0])
             break
     return response["replies"][0].text

     return example_response["replies"][0].text
+def doc_db_example_question_generator(session_hash, db_collections, db_name, db_schema):
+    """Ask the chat model for suggested analysis questions about a MongoDB database.
+
+    The collection names and schema are embedded in the prompt, and the text of
+    the first reply is returned unparsed. ``session_hash`` is accepted but not
+    used here.
+    """
+    system_prompt = f"You are a helpful and knowledgeable agent who has access to an MongoDB NoSQL document database called {db_name}."
+    user_prompt = f"""We have a MongoDB NoSQL document database with the following collections: {db_collections}.
+                                                  The schema of these collections is: {db_schema}.
+                                                  We also have an AI agent with access to the same database that will be performing data analysis.
+                                                  Please return an array of seven strings, each one being a question for our data analysis agent
+                                                  that we can suggest that you believe will be insightful or helpful to a data analysis looking for
+                                                  data insights. Return nothing more than the array of questions because I need that specific data structure
+                                                  to process your response. No other response type or data structure will work."""
+    conversation = [
+        ChatMessage.from_system(system_prompt),
+        ChatMessage.from_user(text=user_prompt),
+    ]
+    generated = chat_generator.run(messages=conversation)
+    return generated["replies"][0].text
 def chatbot_with_fc(message, history, session_hash):
     from functions import sqlite_query_func, table_generation_func, regression_func, scatter_chart_generation_func, \
         line_chart_generation_func,bar_chart_generation_func,pie_chart_generation_func,histogram_generation_func
             message_dict[session_hash]['sql'].append(response["replies"][0])
             break
+    return response["replies"][0].text
+def doc_db_chatbot_with_fc(message, history, session_hash, db_connection_string, db_name, db_collections, db_schema):
+    """Answer a chat message about a MongoDB database, running tool calls as requested.
+
+    The per-session conversation lives in ``message_dict[session_hash]['doc_db']``;
+    it is created with a system prompt on the first message and extended on
+    later ones. While the model keeps answering with tool calls, each call is
+    dispatched to the matching function and its reply is appended to the
+    conversation. ``history`` is accepted but not used here.
+
+    Returns:
+        The text of the model's final, non-tool-call reply.
+    """
+    from functions import doc_db_query_func, table_generation_func, regression_func, scatter_chart_generation_func, \
+        line_chart_generation_func,bar_chart_generation_func,pie_chart_generation_func,histogram_generation_func
+    import tools.tools as tools
+    available_functions = {"doc_db_query_func": doc_db_query_func,"table_generation_func":table_generation_func,
+                           "line_chart_generation_func":line_chart_generation_func,"bar_chart_generation_func":bar_chart_generation_func,
+                           "scatter_chart_generation_func":scatter_chart_generation_func, "pie_chart_generation_func":pie_chart_generation_func,
+                           "histogram_generation_func":histogram_generation_func,
+                           "regression_func":regression_func }
+    if message_dict[session_hash]['doc_db'] is not None:
+        message_dict[session_hash]['doc_db'].append(ChatMessage.from_user(message))
+    else:
+        messages = [
+            ChatMessage.from_system(
+                f"""You are a helpful and knowledgeable agent who has access to an NoSQL MongoDB Document database which has a series of collections called {db_collections}.
+                The schema of these collections is: {db_schema}.
+                You also have access to a function, called table_generation_func, that can take a query.csv file generated from our sql query and returns an iframe that we should display in our chat window.
+                You also have access to a scatter plot function, called scatter_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a scatter plot and returns an iframe that we should display in our chat window.
+                You also have access to a line chart function, called line_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a line chart and returns an iframe that we should display in our chat window.
+                You also have access to a bar graph function, called bar_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a bar graph and returns an iframe that we should display in our chat window.
+                You also have access to a pie chart function, called pie_chart_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a pie chart and returns an iframe that we should display in our chat window.
+                You also have access to a histogram function, called histogram_generation_func, that can take a query.csv file generated from our sql query and uses plotly dictionaries to generate a histogram and returns an iframe that we should display in our chat window.
+                You also have access to a linear regression function, called regression_func, that can take a query.csv file generated from our sql query and a list of column names for our independent and dependent variables and return a regression data string and a regression chart which is returned as an iframe.
+                Could you please always display the generated charts, tables, and visualizations as part of your output?"""
+            )
+        ]
+        messages.append(ChatMessage.from_user(message))
+        message_dict[session_hash]['doc_db'] = messages
+    # Same list object as the one stored in message_dict, so appends below
+    # update the session history in place.
+    conversation = message_dict[session_hash]['doc_db']
+    tool_definitions = tools.doc_db_tools_call(db_collections)
+    response = chat_generator.run(messages=conversation, generation_kwargs={"tools": tool_definitions})
+    while True:
+        reply = response["replies"][0]
+        # if OpenAI response is a tool call
+        if reply.tool_calls:
+            for function_call in reply.tool_calls:
+                conversation.append(ChatMessage.from_assistant(tool_calls=[function_call]))
+                ## Parse function calling information
+                function_name = function_call.tool_name
+                function_args = function_call.arguments
+                ## Find the corresponding function and call it with the given arguments
+                function_to_call = available_functions[function_name]
+                function_response = function_to_call(**function_args, session_hash=session_hash, connection_string=db_connection_string,
+                                                    doc_db_name=db_name, session_folder='doc_db')
+                print(function_name)
+                ## Append function response to the messages list using `ChatMessage.from_tool`
+                conversation.append(ChatMessage.from_tool(tool_result=function_response['reply'], origin=function_call))
+            # Query the model once, after every requested tool call has been
+            # answered; running it inside the loop threw away the replies
+            # produced between tool calls.
+            response = chat_generator.run(messages=conversation, generation_kwargs={"tools": tool_definitions})
+        # Regular Conversation
+        else:
+            conversation.append(reply)
+            break
     return response["replies"][0].text

functions/query_functions.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from typing import List
 from haystack import component
 import pandas as pd
 pd.set_option('display.max_rows', None)
@@ -7,7 +8,10 @@ pd.set_option('display.width', None)
 pd.set_option('display.max_colwidth', None)
 import sqlite3
 import psycopg2
 from utils import TEMP_DIR
 @component
 class SQLiteQuery:
@@ -93,3 +97,71 @@ def sql_query_func(queries: List[str], session_hash, db_url, db_port, db_user, d
               """
       print(reply)
       return {"reply": reply}

 from typing import List
+from typing import AnyStr
 from haystack import component
 import pandas as pd
 pd.set_option('display.max_rows', None)
 pd.set_option('display.max_colwidth', None)
 import sqlite3
 import psycopg2
+from pymongo import MongoClient
+import pymongoarrow.monkey
 from utils import TEMP_DIR
+import ast
 @component
 class SQLiteQuery:
               """
       print(reply)
       return {"reply": reply}
+@component
+class DocDBQuery:
+    """Haystack component that runs a MongoDB aggregation pipeline.
+
+    The result is written to ``<TEMP_DIR>/<session_hash>/doc_db/query.csv`` and
+    also returned as text. The client opened in ``__init__`` is closed at the
+    end of ``run``, so an instance serves a single query.
+    """
+    def __init__(self, connection_string: str, doc_db_name: str):
+      client = MongoClient(connection_string)
+      self.client = client
+      self.connection = client[doc_db_name]
+    @staticmethod
+    def _parse_pipeline(aggregation_pipeline):
+        """Turn the pipeline received from the model into a list of stage dicts.
+
+        JSON is tried first, then Python literal syntax. Only if both fail are
+        bare ``true``/``false``/``null`` words rewritten to Python names before
+        a final literal parse. Whitespace is never stripped, so string values
+        inside the pipeline keep their spaces.
+        """
+        import json
+        import re
+        if not isinstance(aggregation_pipeline, str):
+            # Already a sequence of stage documents.
+            return list(aggregation_pipeline)
+        try:
+            parsed = json.loads(aggregation_pipeline)
+        except ValueError:
+            try:
+                parsed = ast.literal_eval(aggregation_pipeline)
+            except (ValueError, SyntaxError):
+                python_names = {"true": "True", "false": "False", "null": "None"}
+                normalised = re.sub(r"\b(true|false|null)\b",
+                                    lambda found: python_names[found.group(1)],
+                                    aggregation_pipeline)
+                parsed = ast.literal_eval(normalised)
+        # A single stage document is accepted as a one-stage pipeline.
+        return [parsed] if isinstance(parsed, dict) else parsed
+    @component.output_types(results=List[str], queries=List[str])
+    def run(self, aggregation_pipeline: List[str], db_collection,  session_hash):
+        """Run the pipeline on ``db_collection`` and save the result as CSV.
+
+        Args:
+            aggregation_pipeline: The pipeline, normally a JSON or Python-literal
+                string; an already-parsed list is accepted too.
+            db_collection: Name of the collection to aggregate over.
+            session_hash: Session identifier used to locate the output folder.
+
+        Returns:
+            ``{"results": [text of the result frame], "queries": pipeline as received}``.
+
+        Raises:
+            Whatever parsing or the driver raises; the client is closed first.
+        """
+        pymongoarrow.monkey.patch_all()
+        print("ATTEMPTING TO RUN MONGODB QUERY")
+        dir_path = TEMP_DIR / str(session_hash)
+        results = []
+        print(aggregation_pipeline)
+        try:
+            query_list = self._parse_pipeline(aggregation_pipeline)
+            print("QUERY List")
+            print(query_list)
+            print(db_collection)
+            collection = self.connection[db_collection]
+            print(collection)
+            docs = collection.aggregate_pandas_all(query_list)
+            print("DATA FRAME COMPLETE")
+            docs.to_csv(f'{dir_path}/doc_db/query.csv', index=False)
+            print("CSV COMPLETE")
+            results.append(f"{docs}")
+        finally:
+            # Release the connection even when parsing or the query fails.
+            self.client.close()
+        return {"results": results, "queries": aggregation_pipeline}
+def doc_db_query_func(aggregation_pipeline: List[str], db_collection: AnyStr, session_hash, connection_string, doc_db_name, **kwargs):
+    """Run an aggregation pipeline through ``DocDBQuery`` and wrap the outcome for the LLM.
+
+    Returns:
+        ``{"reply": text}`` where the text is the query result, a notice that
+        the result is too large (more than 1000 characters) and only available
+        in query.csv, or an error description. Exceptions are reported in the
+        reply rather than raised. Extra keyword arguments are ignored.
+    """
+    doc_db_query = None
+    try:
+        # Built inside the try block so that a failure while creating the
+        # client is reported to the model like any other query error.
+        doc_db_query = DocDBQuery(connection_string, doc_db_name)
+        result = doc_db_query.run(aggregation_pipeline, db_collection, session_hash)
+        print("RESULT")
+        if len(result["results"][0]) > 1000:
+            print("QUERY TOO LARGE")
+            return {"reply": "query result too large to be processed by llm, the query results are in our query.csv file. If you need to display the results directly, perhaps use the table_generation_func function."}
+        return {"reply": result["results"][0]}
+    except Exception as e:
+        reply = f"""There was an error running the NoSQL (Mongo) Query = {aggregation_pipeline}
+              The error is {e},
+              You should probably try again.
+              """
+        print(reply)
+        return {"reply": reply}
+    finally:
+        # Make sure the connection is released when run() failed before its
+        # own close; a repeated close is expected to be a no-op.
+        if doc_db_query is not None:
+            doc_db_query.client.close()

requirements.txt CHANGED Viewed

@@ -7,3 +7,6 @@ openpyxl
 statsmodels
 xlrd
 psycopg2-binary

 statsmodels
 xlrd
 psycopg2-binary
+pymongo
+pymongoarrow
+pymongo_schema

templates/doc_db.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import ast
+import gradio as gr
+from functions import doc_db_example_question_generator, doc_db_chatbot_with_fc
+from data_sources import connect_doc_db
+from utils import message_dict
+def hide_info():
+    """Return a Gradio update that hides whichever component receives it as output."""
+    hidden = gr.update(visible=False)
+    return hidden
+# Layout of the "Document (MongoDB) Database" tab: an introductory notice, the
+# connection inputs and a submit button.
+with gr.Blocks() as demo:
+    description = gr.HTML("""
+                    <!-- Header -->
+                    <div class="max-w-4xl mx-auto mb-12 text-center">
+                        <div class="bg-blue-50 border border-blue-200 rounded-lg max-w-2xl mx-auto">
+                         <p>This tool allows users to communicate with and query real time data from a Document DB (MongoDB for now, others can be added if requested) using natural
+                          language and the above features.</p>
+                         <p style="font-weight:bold;">Notice: the way this system is designed, no login information is retained and credentials are passed as session variables until the user leaves or
+                          refreshes the page in which they disappear. They are never saved to any files. I also make use of the PyMongoArrow aggregate_pandas_all function to apply pipelines,
+                          which can't delete, drop, or add database lines to avoid unhappy accidents or glitches.
+                          That being said, it's probably not a good idea to connect a production database to a strange AI tool with an unfamiliar author.
+                          This should be for demonstration purposes.</p>
+                          <p>Contact me if this is something you would like built in your organization, on your infrastructure, and with the requisite privacy and control a production
+                          database analytics tool requires.</p>
+                        </div>
+                    </div>
+                        """, elem_classes="description_component")
+    # NOTE(review): the defaults below embed a host, user name and password in
+    # source control. Presumably these are read-only credentials for a sample
+    # cluster; confirm, and consider loading them from the environment.
+    connection_string = gr.Textbox(label="Connection String", value="dataanalyst0.l1klmww.mongodb.net/")
+    with gr.Row():
+        connection_user = gr.Textbox(label="Connection User", value="virtual-data-analyst")
+        connection_password = gr.Textbox(label="Connection Password", value="zcpbmoGJ3mC8o", type="password")
+        doc_db_name = gr.Textbox(label="Database Name", value="sample_mflix")
+    submit = gr.Button(value="Submit")
+    # Hide the introductory notice as soon as the user submits.
+    submit.click(fn=hide_info, outputs=description)
+    @gr.render(inputs=[connection_string,connection_user,connection_password,doc_db_name], triggers=[submit.click])
+    def sql_chat(request: gr.Request, connection_string=connection_string.value, connection_user=connection_user.value, connection_password=connection_password.value, doc_db_name=doc_db_name.value):
+        """Render the connection status and, on success, the chat UI for the database.
+
+        Runs on every click of the submit button: resets the session's
+        ``doc_db`` conversation, builds the ``mongodb+srv://`` URI from the
+        form values, connects, and renders a chat interface with example
+        questions when the connection succeeds.
+        """
+        from urllib.parse import quote_plus
+        if request.session_hash not in message_dict:
+            message_dict[request.session_hash] = {}
+        message_dict[request.session_hash]['doc_db'] = None
+        # User name and password must be percent-escaped inside a MongoDB URI,
+        # otherwise characters such as "@", ":" or "/" break URI parsing.
+        connection_login_value = "mongodb+srv://" + quote_plus(connection_user) + ":" + quote_plus(connection_password) + "@" + connection_string
+        print("MONGO APP")
+        process_message = process_doc_db(connection_login_value, doc_db_name, request.session_hash)
+        gr.HTML(value=process_message[1], padding=False)
+        if process_message[0] == "success":
+            if "dataanalyst0.l1klmww.mongodb.net" in connection_login_value:
+                # Curated questions for the bundled sample cluster.
+                example_questions = [
+                                        ["Describe the dataset"],
+                                        ["What are the top 5 most common movie genres?"],
+                                        ["How do user comment counts on a movie correlate with the movie award wins?"],
+                                        ["Can you generate a pie chart showing the top 10 states with the most movie theaters?"],
+                                        ["What are the top 10 most represented directors in the database?"],
+                                        ["What are the different movie categories and how many movies are in each category?"]
+                                    ]
+            else:
+                try:
+                    # The generator is asked to reply with a list literal of questions.
+                    generated_examples = ast.literal_eval(doc_db_example_question_generator(request.session_hash, process_message[2], doc_db_name, process_message[3]))
+                    example_questions = [
+                                            ["Describe the dataset"]
+                                        ]
+                    for example in generated_examples:
+                        example_questions.append([example])
+                except Exception as e:
+                    print("DOC DB QUESTION GENERATION ERROR")
+                    print(e)
+                    example_questions = [
+                                        ["Describe the dataset"],
+                                        ["List the columns in the dataset"],
+                                        ["What could this data be used for?"],
+                                    ]
+            # Hidden fields carry the session values into the chat function.
+            session_hash = gr.Textbox(visible=False, value=request.session_hash)
+            db_connection_string = gr.Textbox(visible=False, value=connection_login_value)
+            db_name = gr.Textbox(visible=False, value=doc_db_name)
+            db_collections = gr.Textbox(value=process_message[2], interactive=False, label="DB Collections")
+            db_schema = gr.Textbox(visible=False, value=process_message[3])
+            bot = gr.Chatbot(type='messages', label="CSV Chat Window", render_markdown=True, sanitize_html=False, show_label=True, render=False, visible=True, elem_classes="chatbot")
+            chat = gr.ChatInterface(
+                                fn=doc_db_chatbot_with_fc,
+                                type='messages',
+                                chatbot=bot,
+                                title="Chat with your Database",
+                                examples=example_questions,
+                                concurrency_limit=None,
+                                additional_inputs=[session_hash, db_connection_string, db_name, db_collections,db_schema]
+                                )
+    def process_doc_db(connection_string, nosql_db_name, session_hash):
+        """Connect to the document database and return the status list.
+
+        Returns the list produced by ``connect_doc_db``, or an ``["error", html]``
+        list when no connection string was supplied (previously this path
+        raised ``UnboundLocalError``).
+        """
+        if not connection_string:
+            return ["error","<p style='color:red;text-align:center;font-size:18px;font-weight:bold;'>ERROR: no connection string provided</p>"]
+        return connect_doc_db(connection_string, nosql_db_name, session_hash)
+# Launch this tab as a standalone Gradio app when the module is run directly.
+if __name__ == "__main__":
+    demo.launch()

tools/tools.py CHANGED Viewed

@@ -57,7 +57,7 @@ def sql_tools_call(db_tables):
             "function": {
                 "name": "sql_query_func",
                 "description": f"""This is a tool useful to query a PostgreSQL database with the following tables, {table_string}.
-                There may also be more tables in the database if the number of columns is too large to process.
                 This function also saves the results of the query to csv file called query.csv.""",
                 "parameters": {
                     "type": "object",
@@ -79,4 +79,39 @@ def sql_tools_call(db_tables):
     tools_calls.extend(chart_tools)
     tools_calls.extend(stats_tools)
     return tools_calls

             "function": {
                 "name": "sql_query_func",
                 "description": f"""This is a tool useful to query a PostgreSQL database with the following tables, {table_string}.
+                There may also be more tables in the database if the number of tables is too large to process.
                 This function also saves the results of the query to csv file called query.csv.""",
                 "parameters": {
                     "type": "object",
     tools_calls.extend(chart_tools)
     tools_calls.extend(stats_tools)
+    return tools_calls
+def doc_db_tools_call(db_collections):
+    """Build the tool definitions offered to the model for a MongoDB session.
+
+    Args:
+        db_collections: Collection names; anything beyond 625 characters is
+            cut off and replaced with ``..`` in the tool description.
+
+    Returns:
+        A list holding the ``doc_db_query_func`` definition followed by the
+        shared chart and statistics tool definitions.
+    """
+    collection_string = (db_collections[:625] + '..') if len(db_collections) > 625 else db_collections
+    tools_calls = [
+        {
+            "type": "function",
+            "function": {
+                "name": "doc_db_query_func",
+                "description": f"""This is a tool useful to build an aggregation pipeline to query a MongoDB NoSQL document database with the following collections, {collection_string}.
+                There may also be more collections in the database if the number of collections is too large to process.
+                This function also saves the results of the query to a csv file called query.csv.""",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "aggregation_pipeline": {
+                            "type": "string",
+                            "description": "The MongoDB aggregation pipeline to run, written as a JSON array of stage documents, for example [{\"$match\": {\"year\": 2000}}, {\"$limit\": 5}]. Infer this from the user's message."
+                        },
+                        "db_collection": {
+                            "type": "string",
+                            "description": "The name of the MongoDB collection to run the aggregation pipeline on. Infer this from the user's message.",
+                        }
+                    },
+                    # Must name the properties declared above; "queries" does
+                    # not exist for this tool.
+                    "required": ["aggregation_pipeline","db_collection"],
+                },
+            },
+        },
+    ]
+    tools_calls.extend(chart_tools)
+    tools_calls.extend(stats_tools)
     return tools_calls