Spaces:

Shreneek
/

chat-with-csv

Build error

App Files Files Community

chat-with-csv / app.py

Shreneek

Create app.py

28545e3 verified 2 months ago

raw

history blame

9.07 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from ydata_profiling import ProfileReport
	import json
	import os
	from langchain.llms import HuggingFaceHub
	from langchain.chains import LLMChain
	from langchain.prompts import PromptTemplate
	from langchain_core.output_parsers import StrOutputParser
	from langchain.tools.python.tool import PythonAstREPLTool
	from langchain.agents import AgentExecutor, create_react_agent
	from langchain_experimental.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
	from langchain.agents.agent_types import AgentType

	# Page-level settings: browser title, wide layout and tab icon
	st.set_page_config(
	    page_title="Interactive Data Profiler & Chat",
	    layout="wide",
	    page_icon="📊",
	)

	# Seed the session-state slots used by the rest of the script
	# (loaded DataFrame, chat transcript, suggested questions) on first use only.
	# Factories are used so each list default is a fresh object.
	for _state_key, _make_default in (
	    ("df", lambda: None),
	    ("chat_history", list),
	    ("suggestions", list),
	):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _make_default()

	# Build the Hugging Face Hub LLM client
	def get_llm():
	    """Return a ``HuggingFaceHub`` LLM configured for ``google/flan-t5-large``.

	    The API token is taken from the ``HUGGINGFACE_API_TOKEN`` environment
	    variable; an empty string is passed when the variable is not set.
	    """
	    api_token = os.environ.get("HUGGINGFACE_API_TOKEN", "")
	    generation_settings = {"temperature": 0.1, "max_length": 512}
	    # Using a small but capable open-source model
	    return HuggingFaceHub(
	        repo_id="google/flan-t5-large",
	        model_kwargs=generation_settings,
	        huggingfacehub_api_token=api_token,
	    )

	# Build the profiling report for a DataFrame
	def generate_profile_report(df):
	    """Create a ``ProfileReport`` for ``df`` while showing a spinner.

	    Args:
	        df: DataFrame to profile.

	    Returns:
	        The ``ProfileReport`` object, built with the title
	        "Profiling Report", ``explorative=True`` and ``minimal=True``.
	    """
	    report_options = {
	        "title": "Profiling Report",
	        "explorative": True,
	        "minimal": True,  # Minimal for faster processing
	    }
	    with st.spinner("Generating profile report..."):
	        return ProfileReport(df, **report_options)

	# Function to generate query suggestions
	def generate_suggestions(df):
	    """Build a list of starter questions tailored to ``df``.

	    Args:
	        df: DataFrame whose column names and dtypes drive the suggestions.

	    Returns:
	        list[str]: Three generic questions, then an "average" question for
	        the first numeric column (omitted when there is no numeric column),
	        then two follow-up questions for each of the first three columns
	        whose dtype is numeric or text.
	    """
	    column_names = df.columns.tolist()
	    data_types = df.dtypes.astype(str).tolist()

	    def is_numeric(dtype):
	        # dtype is the string form of the pandas dtype, e.g. "int64"
	        return "int" in dtype or "float" in dtype

	    def is_text(dtype):
	        return "object" in dtype or "str" in dtype

	    # Questions that make sense for any dataset
	    suggestions = [
	        "How many rows are in this dataset?",
	        "What are all the column names?",
	        "Show me the first 5 rows",
	    ]

	    # Only offer an average for a column that can actually be averaged;
	    # the first column is frequently an identifier or a text field.
	    numeric_columns = [
	        col for col, dtype in zip(column_names, data_types) if is_numeric(dtype)
	    ]
	    if numeric_columns:
	        suggestions.append(f"What is the average of {numeric_columns[0]}")

	    # Add column-specific suggestions for the first three columns
	    for col, dtype in zip(column_names[:3], data_types[:3]):
	        if is_numeric(dtype):
	            suggestions.append(f"What is the mean value of {col}?")
	            suggestions.append(f"What is the maximum value of {col}?")
	        elif is_text(dtype):
	            suggestions.append(f"What are the unique values in {col}?")
	            suggestions.append(f"How many missing values in {col}?")

	    return suggestions

	# Keyword-matched answers used when the LLM agent cannot answer
	def _fallback_answer(df, query, error):
	    """Answer a few common questions directly from ``df``.

	    Args:
	        df: DataFrame the question is about.
	        query: The user's question as typed.
	        error: The exception raised by the agent; quoted in the final
	            "couldn't process" message when no keyword rule matches.

	    Returns:
	        str: The answer text.
	    """
	    text = query.lower()
	    if "rows" in text and "how many" in text:
	        return f"The dataset has {df.shape[0]} rows."
	    if "columns" in text and "how many" in text:
	        return f"The dataset has {df.shape[1]} columns."
	    if "column names" in text:
	        # Column labels are not guaranteed to be strings (e.g. integer headers)
	        return f"The column names are: {', '.join(map(str, df.columns))}"
	    if "first" in text and "rows" in text:
	        # isdecimal() only accepts characters int() can parse; isdigit() also
	        # accepts e.g. superscripts, which would make int() raise.
	        num = next((int(word) for word in query.split() if word.isdecimal()), 5)
	        return df.head(num).to_string()
	    if "describe" in text:
	        return df.describe().to_string()
	    return f"I couldn't process that query. Error: {str(error)}"


	# Function to execute pandas operations safely
	def execute_pandas_query(df, query):
	    """Answer ``query`` about ``df`` with a pandas agent, with a fallback.

	    A pandas DataFrame agent backed by ``get_llm()`` is created for every call
	    and asked the question. If anything in that path raises, a small set of
	    keyword-matched answers is tried instead.

	    Args:
	        df: DataFrame to query.
	        query: Natural-language question.

	    Returns:
	        The agent's result, or the fallback answer string.
	    """
	    try:
	        # NOTE(review): the pandas agent runs model-generated Python against df;
	        # recent langchain_experimental releases appear to require an explicit
	        # allow_dangerous_code=True opt-in - confirm against the pinned version.
	        agent = create_pandas_dataframe_agent(
	            llm=get_llm(),
	            df=df,
	            agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
	            verbose=True,
	        )
	        return agent.run(query)
	    except Exception as e:
	        # Deliberate best-effort: any agent failure degrades to basic answers
	        return _fallback_answer(df, query, e)

	# Main app header
	st.title("🔍 Interactive Data Profiler & Chat")
	st.markdown("""
	Upload your CSV file to get detailed profiling and ask questions about your data!
	This app combines interactive data profiling with a chat interface for data exploration.
	""")


	def _queue_question(question=None):
	    """Widget callback: mark a question as waiting to be answered.

	    Used without arguments by the text box (the typed text is read from
	    session state) and with the suggestion text by the suggestion buttons.
	    The text box value is assigned here, inside a callback, because Streamlit
	    rejects writes to a widget's key once that widget exists in the current run.
	    """
	    if question is None:
	        question = st.session_state.question_input
	    else:
	        st.session_state.question_input = question
	    st.session_state.pending_question = question


	def _clear_chat_history():
	    """Widget callback: empty the conversation and drop any queued question."""
	    st.session_state.chat_history = []
	    st.session_state.pending_question = None


	# File uploader
	uploaded_file = st.file_uploader("Upload a CSV file", type="csv")

	# Process uploaded file
	if uploaded_file is not None:
	    try:
	        # Read CSV into DataFrame
	        df = pd.read_csv(uploaded_file)
	        st.session_state.df = df
	        st.success(f"✅ File uploaded successfully! Found {df.shape[0]} rows and {df.shape[1]} columns.")

	        # Detect a newly uploaded file so per-file data is not reused for
	        # a different dataset. file_id is only present on newer Streamlit.
	        file_signature = (
	            uploaded_file.name,
	            uploaded_file.size,
	            getattr(uploaded_file, "file_id", None),
	        )
	        is_new_file = st.session_state.get("file_signature") != file_signature

	        # Generate suggestions when a new file is uploaded
	        if is_new_file or not st.session_state.suggestions:
	            st.session_state.suggestions = generate_suggestions(df)
	        if is_new_file:
	            st.session_state.report_html = None
	            st.session_state.file_signature = file_signature

	        # Create tabs for different functionalities
	        tab1, tab2 = st.tabs(["📊 Data Profiling", "💬 Data Chat"])

	        # Tab 1: Data Profiling
	        with tab1:
	            st.header("Data Profiling")

	            # Basic info
	            col1, col2, col3 = st.columns(3)
	            with col1:
	                st.metric("Rows", df.shape[0])
	            with col2:
	                st.metric("Columns", df.shape[1])
	            with col3:
	                st.metric("Missing Values", df.isna().sum().sum())

	            # Show raw data sample
	            with st.expander("Preview Data"):
	                st.dataframe(df.head(10))

	            # Build the report once per file; every widget interaction reruns
	            # this script and profiling again on each chat message is wasteful.
	            report_html = st.session_state.get("report_html")
	            if report_html is None:
	                report_html = generate_profile_report(df).to_html()
	                st.session_state.report_html = report_html

	            # Display the HTML report
	            st.components.v1.html(report_html, height=1000, scrolling=True)

	            # Provide download button
	            st.write("### Download the Profiling Report")
	            report_bytes = report_html.encode('utf-8')
	            st.download_button(
	                label="Download Report (HTML)",
	                data=report_bytes,
	                file_name="profiling_report.html",
	                mime="text/html"
	            )

	        # Tab 2: Interactive Chat
	        with tab2:
	            st.header("Chat with Your Data")
	            st.info("Ask questions about your data and get instant answers!")

	            # Chat input; a question is queued only when the text changes, so
	            # unrelated reruns do not ask the same question again.
	            st.text_input("Your question:", key="question_input", on_change=_queue_question)

	            # Show suggestion chips
	            st.write("Suggested questions:")
	            cols = st.columns(2)
	            for i, suggestion in enumerate(st.session_state.suggestions):
	                with cols[i % 2]:
	                    st.button(
	                        suggestion,
	                        key=f"suggestion_{i}",
	                        on_click=_queue_question,
	                        args=(suggestion,),
	                    )

	            # Process the queued question exactly once
	            user_question = st.session_state.get("pending_question")
	            if user_question:
	                st.session_state.pending_question = None
	                st.session_state.chat_history.append({"role": "user", "content": user_question})

	                # Get answer
	                with st.spinner("Thinking..."):
	                    answer = execute_pandas_query(df, user_question)

	                # Add answer to chat history
	                st.session_state.chat_history.append({"role": "assistant", "content": answer})

	            # Display chat history
	            st.write("### Conversation History")
	            for message in st.session_state.chat_history:
	                if message["role"] == "user":
	                    st.markdown(f"You: {message['content']}")
	                else:
	                    st.markdown(f"Assistant: {message['content']}")
	                    # NOTE(review): separator placed after each answer; the
	                    # original nesting of this line was lost - confirm.
	                    st.markdown("---")

	            # Clear chat button; the click itself triggers the rerun
	            st.button("Clear Chat History", on_click=_clear_chat_history)

	    except Exception as e:
	        st.error(f"An error occurred: {str(e)}")
	else:
	    st.info("👆 Please upload a CSV file to begin.")

	    # Placeholder visuals
	    st.markdown("### What you can do with this app:")
	    col1, col2 = st.columns(2)
	    with col1:
	        st.markdown("📊 Data Profiling")
	        st.markdown("- Automatic data quality assessment")
	        st.markdown("- Column statistics and distributions")
	        st.markdown("- Correlation analysis")
	        st.markdown("- Missing values analysis")
	    with col2:
	        st.markdown("💬 Interactive Data Chat")
	        st.markdown("- Ask natural language questions")
	        st.markdown("- Get instant insights")
	        st.markdown("- Suggested questions for quick exploration")
	        st.markdown("- No coding required!")