import gradio as gr
import os
import pandas as pd
import requests
import json
from typing import List, Tuple
import chardet


# -- LLM Client Class --
class OllamaClient:
    def __init__(self, model_name: str = "phi3:latest", base_url: str = "http://localhost:11434"):
        self.model_name = model_name
        self.base_url = base_url

    def list_models(self):
        """List all available models from Ollama server"""
        try:
            response = requests.get(f"{self.base_url}/api/tags")
            if response.status_code == 200:
                data = response.json()
                return [model['name'] for model in data.get('models', [])]
            return []
        except Exception as e:
            print(f"Error listing models: {e}")
            return []

    def chat_completion(self, messages, max_tokens=4000, stream=True, temperature=0.3, top_p=0.7):
        # Convert messages to Ollama format
        ollama_messages = []
        for msg in messages:
            if msg["role"] == "system":
                ollama_messages.append({"role": "system", "content": msg["content"]})
            elif msg["role"] in ["user", "assistant"]:
                ollama_messages.append({"role": msg["role"], "content": msg["content"]})

        # Prepare the request data
        data = {
            "model": self.model_name,
            "messages": ollama_messages,
            "options": {
                "temperature": temperature,
                "top_p": top_p,
                "num_predict": max_tokens
            },
            "stream": stream
        }

        # Make the request to Ollama API
        response = requests.post(
            f"{self.base_url}/api/chat",
            json=data,
            stream=stream
        )

        if response.status_code != 200:
            raise Exception(f"Ollama API error: {response.text}")

        if stream:
            for line in response.iter_lines():
                if line:
                    decoded_line = line.decode('utf-8')
                    try:
                        chunk = json.loads(decoded_line)
                        if "message" in chunk and "content" in chunk["message"]:
                            yield {"content": chunk["message"]["content"]}
                    except json.JSONDecodeError:
                        continue
        else:
            result = response.json()
            yield {"content": result["message"]["content"]}


# -- check content --
def analyze_file_content(content, file_type):
    """Analyze file content and return structural summary"""
    if file_type in ['parquet', 'csv']:
        try:
            lines = content.split('\n')
            header = lines[0]
            columns = header.count('|') - 1 if '|' in header else len(header.split(','))
            rows = len(lines) - 3
            return f"📊 Dataset Structure: {columns} columns, {rows} data samples"
        except:
            return "❌ Dataset structure analysis failed"

    lines = content.split('\n')
    total_lines = len(lines)
    non_empty_lines = len([line for line in lines if line.strip()])

    if any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']):
        functions = len([line for line in lines if 'def ' in line])
        classes = len([line for line in lines if 'class ' in line])
        imports = len([line for line in lines if 'import ' in line or 'from ' in line])
        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"

    paragraphs = content.count('\n\n') + 1
    words = len(content.split())
    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, ~{words} words"


# -- Basic stats on content --
def get_column_stats(df, col):
    stats = {
        'type': str(df[col].dtype),
        'missing': df[col].isna().sum(),
        'unique': df[col].nunique()
    }
    if pd.api.types.is_numeric_dtype(df[col]):
        stats.update({
            'min': df[col].min(),
            'max': df[col].max(),
            'mean': df[col].mean()
        })
    else:
        stats['examples'] = df[col].dropna().head(3).tolist()
    return stats
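
# --- Illustrative usage sketch (hypothetical, not part of the original app) ---
# A minimal example of how OllamaClient's streaming API could be exercised
# outside Gradio. It assumes a local Ollama server is reachable at
# http://localhost:11434 and that the "phi3:latest" model has been pulled.
# The function is never called by the app; it is included only as a sketch.
def _example_stream_chat():
    client = OllamaClient(model_name="phi3:latest")
    messages = [
        {"role": "system", "content": "You are a helpful data analyst."},
        {"role": "user", "content": "Explain in one sentence what a Parquet file is."},
    ]
    # chat_completion is a generator; each item carries a partial "content" token.
    for chunk in client.chat_completion(messages, max_tokens=200, stream=True):
        print(chunk.get("content", ""), end="", flush=True)
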
# -- Identify Encoding --
def detect_file_encoding(file_path):
    """Improved encoding detection with fallback options"""
    try:
        with open(file_path, 'rb') as f:
            rawdata = f.read(100000)  # Read more data for better detection

        # Try chardet first
        result = chardet.detect(rawdata)
        encoding = result['encoding']
        confidence = result['confidence']

        # If confidence is low, try some common encodings
        if confidence < 0.9:
            for test_encoding in ['utf-8', 'utf-16', 'latin1', 'cp1252']:
                try:
                    rawdata.decode(test_encoding)
                    return test_encoding
                except UnicodeDecodeError:
                    continue

        return encoding if encoding else 'utf-8'
    except Exception as e:
        print(f"Encoding detection error: {e}")
        return 'utf-8'  # Default fallback


# -- Read file --
def read_uploaded_file(file):
    if file is None:
        return "", ""

    try:
        file_ext = os.path.splitext(file.name)[1].lower()

        if file_ext == '.parquet':
            df = pd.read_parquet(file.name, engine='pyarrow')
            content = df.head(10).to_markdown(index=False)
            return content, "parquet"

        if file_ext == '.csv':
            # First try to detect encoding
            try:
                encoding = detect_file_encoding(file.name)

                # Try reading with different delimiters
                delimiters = [',', ';', '\t', '|']
                df = None
                best_delimiter = ','
                max_columns = 1

                # First pass to find the best delimiter
                for delimiter in delimiters:
                    try:
                        with open(file.name, 'r', encoding=encoding) as f:
                            first_line = f.readline()
                            current_columns = len(first_line.split(delimiter))
                            if current_columns > max_columns:
                                max_columns = current_columns
                                best_delimiter = delimiter
                    except:
                        continue

                # Now read with the best found delimiter
                try:
                    df = pd.read_csv(
                        file.name,
                        encoding=encoding,
                        delimiter=best_delimiter,
                        on_bad_lines='warn',
                        engine='python',
                        quotechar='"'
                    )
                except:
                    # Fallback to pandas auto-detection
                    df = pd.read_csv(file.name, encoding=encoding, on_bad_lines='warn')

                if df is None or len(df.columns) < 1:
                    return "❌ Could not parse CSV file - no valid columns detected", "error"

                # Generate comprehensive data summary
                content = "📊 CSV Metadata:\n"
                content += f"- Rows: {len(df):,}\n"
                content += f"- Columns: {len(df.columns):,}\n"
                content += f"- Missing Values: {df.isna().sum().sum():,}\n\n"

                content += "🔍 Column Details:\n"
                for col in df.columns:
                    stats = get_column_stats(df, col)
                    content += f"### {col}\n"
                    content += f"- Type: {stats['type']}\n"
                    content += f"- Unique: {stats['unique']}\n"
                    content += f"- Missing: {stats['missing']}\n"
                    if 'examples' in stats:
                        content += f"- Examples: {stats['examples']}\n"
                    else:
                        content += (
                            f"- Range: {stats['min']} to {stats['max']}\n"
                            f"- Mean: {stats['mean']:.2f}\n"
                        )
                    content += "\n"

                content += "📋 Sample Data (First 3 Rows):\n"
                content += df.head(3).to_markdown(index=False)

                return content, "csv"
            except Exception as e:
                return f"❌ Error reading CSV file: {str(e)}", "error"

        else:
            encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
            for encoding in encodings:
                try:
                    with open(file.name, 'r', encoding=encoding) as f:
                        content = f.read()
                        return content, "text"
                except UnicodeDecodeError:
                    continue
            # UnicodeDecodeError requires five positional arguments, so raise a plain
            # ValueError instead; the outer except turns it into an error message.
            raise ValueError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")

    except Exception as e:
        return f"❌ Error reading file: {str(e)}", "error"


def format_history(history):
    formatted_history = []
    for user_msg, assistant_msg in history:
        formatted_history.append({"role": "user", "content": user_msg})
        if assistant_msg:
            formatted_history.append({"role": "assistant", "content": assistant_msg})
    return formatted_history
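
# --- Illustrative usage sketch (hypothetical, not called by the app) ---
# Shows the two history shapes that chat() below accepts: legacy
# (user, assistant) tuples, which format_history() converts, and ready-made
# "messages" dicts, which are passed through unchanged.
def _example_history_formats():
    tuple_history = [("Hi", "Hello! How can I help?")]
    as_messages = format_history(tuple_history)
    # -> [{"role": "user", "content": "Hi"},
    #     {"role": "assistant", "content": "Hello! How can I help?"}]
    return as_messages
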
def chat(message, history, uploaded_file, system_message="", max_tokens=4000, temperature=0.3, top_p=0.9, selected_model="phi3:latest"):
    system_prefix = """
You are an AI Data Scientist designed to provide expert guidance in data analysis, machine learning, and big data technologies, suitable for a wide range of users seeking data-driven insights and solutions.

Analyze the uploaded file in depth from the following perspectives:
1. 📋 Overall file structure and format
2. ⭐ Data quality and completeness evaluation
3. 💡 Suggested data fixes and improvements
4. 📈 Data characteristics, meaning and patterns
5. 📊 Key component analysis and potential segmentations
6. 🎯 Insights and suggested persuasive storytelling

Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way.
Format the analysis results in Markdown and include specific examples where possible.
"""

    if uploaded_file:
        content, file_type = read_uploaded_file(uploaded_file)
        if file_type == "error":
            # chat() is a generator, so surface the error via yield rather than return
            yield "", [{"role": "user", "content": message},
                       {"role": "assistant", "content": content}]
            return

        file_summary = analyze_file_content(content, file_type)

        if file_type in ['parquet', 'csv']:
            system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
        else:
            system_message += f"\n\nFile Content:\n```\n{content}\n```"

        if message == "Starting file analysis...":
            message = f"""[Structure Analysis] {file_summary}

Please provide detailed analysis from these perspectives:
1. 📋 Overall file structure and format
2. ⭐ Data quality and completeness evaluation
3. 💡 Suggested data fixes and improvements
4. 📈 Data characteristics, meaning and patterns
5. 📊 Key component analysis and potential segmentations
6. 🎯 Insights and suggested persuasive storytelling"""

    messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]

    # Convert history to message format
    if history is not None:
        for item in history:
            if isinstance(item, dict):
                messages.append(item)
            elif isinstance(item, (list, tuple)) and len(item) == 2:
                messages.append({"role": "user", "content": item[0]})
                if item[1]:
                    messages.append({"role": "assistant", "content": item[1]})

    messages.append({"role": "user", "content": message})

    try:
        client = OllamaClient(model_name=selected_model)

        partial_message = ""
        current_history = []

        for response in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = response.get('content', '')
            if token:
                partial_message += token
                current_history = [
                    {"role": "user", "content": message},
                    {"role": "assistant", "content": partial_message}
                ]
                yield "", current_history

    except Exception as e:
        error_msg = f"❌ Inference error: {str(e)}"
        error_history = [
            {"role": "user", "content": message},
            {"role": "assistant", "content": error_msg}
        ]
        yield "", error_history


css = """
footer {visibility: hidden}
"""

with gr.Blocks(theme="gstaff/xkcd", css=css, title="Offline Sensitive Survey Data Analysis") as demo:
    gr.HTML(
        """