Spaces:

Sharath7693
/

text2sql_rag

Running

App Files Files Community

Sharath7693 commited on Mar 15

Commit

c2b19f9

verified ·

1 Parent(s): d935373

Create app.py

Browse files

Files changed (1) hide show

app.py +143 -0

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import gradio as gr
+import pdfplumber
+import docx
+import json
+import re
+import sqlalchemy
+import requests
+from tenacity import retry, stop_after_attempt, wait_exponential
+from langchain_postgres.vectorstores import PGVector
+from langchain_core.documents import Document
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_postgres import PGVector
+# API Keys and Database Connection
+GROQ_API_KEY = "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"
+NEON_CONNECTION_STRING = "postgresql://neondb_owner:npg_TPtUn1ArS6bo@ep-crimson-king-a12tfmdw-pooler.ap-southeast-1.aws.neon.tech/neondb?sslmode=require"
+embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")
+# Extract text from various document types
+def extract_text_from_doc(file_path):
+    if file_path.endswith(".pdf"):
+        with pdfplumber.open(file_path) as pdf:
+            return "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
+    elif file_path.endswith(".docx"):
+        doc = docx.Document(file_path)
+        return "\n".join([p.text for p in doc.paragraphs])
+    elif file_path.endswith(".txt"):
+        with open(file_path, "r", encoding="utf-8") as f:
+            return f.read()
+    return ""
+# Database Connection
+engine = sqlalchemy.create_engine(url=NEON_CONNECTION_STRING, pool_pre_ping=True, pool_recycle=300)
+vector_store = PGVector(embeddings=embeddings, connection=engine, use_jsonb=True, collection_name="text-to-sql-context")
+# Retry for API Calls
+@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=2, max=10))
+def call_groq_api(prompt):
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {GROQ_API_KEY}",
+    }
+    data = {
+        "model": "llama-3.3-70b-versatile",
+        "messages": [{"role": "user", "content": prompt}]
+    }
+    response = requests.post("https://api.groq.com/openai/v1/chat/completions", headers=headers, json=data)
+    if response.status_code != 200:
+        raise Exception(f"Groq API error: {response.text}")
+    result = response.json()
+    return result.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
+# Remove extra text and extract only JSON
+def extract_json(text):
+    match = re.search(r"\[.*\]", text, re.DOTALL)
+    if match:
+        return match.group(0)  # Extract only the JSON array
+    return None  # Invalid format
+# SQL Query Generation Prompt
+generation_prompt = """
+Generate 50 SQL queries based on this schema:
+<schema>
+{SCHEMA}
+</schema>
+Provide JSON output with 'question' and 'query'.
+"""
+# Process Schema, Generate Queries, and Answer User's Question
+def process_and_query(file, question):
+    schema_text = extract_text_from_doc(file.name)
+    # Generate Queries
+    response = call_groq_api(generation_prompt.format(SCHEMA=schema_text))
+    # Extract only valid JSON part
+    json_response = extract_json(response)
+    if not json_response:
+        return f"Error: Unexpected response format from Groq API: {response}"
+    try:
+        qa_pairs = json.loads(json_response)
+    except json.JSONDecodeError:
+        return f"Error: Could not parse JSON: {json_response}"
+    # Store Schema and Queries in Vector DB
+    schema_doc = Document(page_content=schema_text, metadata={"id": "schema", "topic": "ddl"})
+    query_docs = [Document(page_content=json.dumps(pair), metadata={"id": f"query-{i}", "topic": "query"}) for i, pair in enumerate(qa_pairs)]
+    vector_store.add_documents([schema_doc] + query_docs, ids=[doc.metadata["id"] for doc in [schema_doc] + query_docs])
+    # Retrieve Relevant Schema and Queries
+    relevant_ddl = vector_store.similarity_search(query=question, k=5, filter={"topic": {"$eq": "ddl"}})
+    similar_queries = vector_store.similarity_search(query=question, k=3, filter={"topic": {"$eq": "query"}})
+    schema = "\n".join([doc.page_content for doc in relevant_ddl])
+    examples = "\n".join([json.loads(doc.page_content)["question"] + "\nSQL: " + json.loads(doc.page_content)["query"] for doc in similar_queries])
+    query_prompt = f"""
+    You are an SQL expert. Generate a valid SQL query based on the schema and example queries.
+    1. Some DDL statements describing tables, columns and indexes in the database:
+    <schema>
+    {schema}
+    </schema>
+    2. Some example pairs demonstrating how to convert natural language text into a corresponding SQL query for this schema:
+    <examples>
+    {examples}
+    </examples>
+    3. The actual natural language question to convert into an SQL query:
+    <question>
+    {question}
+    </question>
+    Follow the instructions below:
+    1. Your task is to generate an SQL query that will retrieve the data needed to answer the question, based on the database schema.
+    2. First, carefully study the provided schema and examples to understand the structure of the database and how the examples map natural language to SQL for this schema.
+    3. Your answer should have two parts:
+    - Inside <scratchpad> XML tag, write out step-by-step reasoning to explain how you are generating the query based on the schema, example, and question.
+    - Then, inside <sql> XML tag, output your generated SQL.
+    SQL Query:
+    """
+    query_response = call_groq_api(query_prompt)
+    return query_response
+# Gradio UI
+with gr.Blocks() as app:
+    gr.Markdown("# Text-to-SQL Converter")
+    file_input = gr.File(label="Upload Schema File")
+    question_input = gr.Textbox(label="Ask a SQL-related Question")
+    submit_button = gr.Button("Process & Generate SQL")
+    query_output = gr.Textbox(label="Generated SQL Query")
+    submit_button.click(process_and_query, inputs=[file_input, question_input], outputs=query_output)
+app.launch(share=True)