offline_data_analysis

Sleeping

File size: 18,316 Bytes

f9b9d56
574f73e
705c5b5
4875400
 
83ee74c
047d156
f9b9d56
047d156
4875400
047d156
4875400
 
 
047d156
 
 
 
 
 
 
 
 
 
 
 
760ba3d
4875400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
047d156
 
4875400
 
 
 
047d156
f9b9d56
047d156
2af89cf
1fd4ab2
99d94e0
da20c1b
99d94e0
 
047d156
1fd4ab2
 
da20c1b
1fd4ab2
2af89cf
 
 
 
 
da20c1b
2af89cf
 
da20c1b
1fd4ab2
da20c1b
 
 
1fd4ab2
2af89cf
047d156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705c5b5
0997082
2af89cf
705c5b5
99d94e0
 
 
705c5b5
2af89cf
 
047d156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521288b
047d156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705c5b5
521288b
 
 
 
 
 
 
 
1fd4ab2
705c5b5
1fd4ab2
0997082
d57197f
 
 
 
 
 
 
 
047d156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705c5b5
7ffca43
2af89cf
 
2b6f16a
2af89cf
 
7ffca43
99d94e0
1fd4ab2
7ffca43
1fd4ab2
7ffca43
1fd4ab2
 
 
 
047d156
 
 
 
 
705c5b5
 
2b6f16a
 
566e2ba
 
 
 
 
 
 
 
 
0997082
705c5b5
0997082
047d156
da20c1b
566e2ba
da20c1b
047d156
f2c0975
 
83ee74c
0997082
 
83ee74c
047d156
705c5b5
da20c1b
566e2ba
 
 
 
 
da20c1b
0997082
1fd4ab2
566e2ba
 
 
 
 
705c5b5
7b3fa19
7ffca43
7b3fa19
 
047d156
 
 
1fd4ab2
 
760ba3d
047d156
760ba3d
1fd4ab2
 
 
 
047d156
 
 
63c5e29
 
b45e256
047d156
 
a3b8f2f
b45e256
7ffca43
1fd4ab2
7ffca43
047d156
7ffca43
 
760ba3d
047d156
760ba3d
63c5e29
 
047d156
7ffca43
a3b8f2f
047d156
d57197f
7ffca43
63c5e29
047d156
 
 
 
 
 
760ba3d
047d156
1fd4ab2
047d156
760ba3d
 
 
63c5e29
047d156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1fd4ab2
7ffca43
 
047d156
da20c1b
 
 
 
 
 
7ffca43
 
a3b8f2f
b45e256
047d156
b45e256
 
 
 
 
 
 
 
047d156
 
7ffca43
047d156
 
 
 
7ffca43
047d156
da20c1b
 
7ffca43
63c5e29
047d156
1fd4ab2
047d156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9b9d56
 
b45e256

import gradio as gr
import os
import pandas as pd
import requests
import json
from typing import List, Tuple
import chardet

# -- LLM Client Class --
class OllamaClient:
    """Minimal HTTP client for the chat and model-list endpoints of an Ollama server.

    Attributes:
        model_name: Sent as the ``model`` field of every chat request.
        base_url: Server root, stored without trailing slashes.
        timeout: Seconds to wait while connecting (and, for ``list_models``,
            while waiting for the reply).
    """

    def __init__(self, model_name: str = "phi3:latest", base_url: str = "http://localhost:11434",
                 timeout: float = 10.0):
        self.model_name = model_name
        # Strip trailing slashes so f"{base_url}/api/..." never contains "//".
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def list_models(self):
        """Return the model names reported by ``GET /api/tags``.

        Best-effort: any failure (unreachable server, timeout, unexpected
        payload) is printed and reported as an empty list; a non-200 status
        also yields an empty list.
        """
        try:
            # Bounded wait: without a timeout an unresponsive server would
            # block the caller indefinitely.
            response = requests.get(f"{self.base_url}/api/tags", timeout=self.timeout)
            if response.status_code == 200:
                data = response.json()
                return [model['name'] for model in data.get('models', [])]
            return []
        except Exception as e:
            print(f"Error listing models: {e}")
            return []

    def chat_completion(self, messages, max_tokens=4000, stream=True, temperature=0.3, top_p=0.7):
        """Send ``messages`` to ``POST /api/chat`` and yield ``{"content": str}`` dicts.

        This is a generator, so the request is only made once iteration starts.

        Args:
            messages: Iterable of dicts with ``"role"`` and ``"content"`` keys.
                Only ``system``, ``user`` and ``assistant`` roles are forwarded,
                and only those two keys are sent.
            max_tokens: Sent as the ``num_predict`` option.
            stream: When true, yield one dict per streamed chunk; otherwise
                yield a single dict holding the whole reply.
            temperature: Sent as the ``temperature`` option.
            top_p: Sent as the ``top_p`` option.

        Raises:
            Exception: If the server answers with a non-200 status.
        """
        # Keep only the roles and keys that are forwarded to the server.
        ollama_messages = [
            {"role": msg["role"], "content": msg["content"]}
            for msg in messages
            if msg["role"] in ("system", "user", "assistant")
        ]

        payload = {
            "model": self.model_name,
            "messages": ollama_messages,
            "options": {
                "temperature": temperature,
                "top_p": top_p,
                "num_predict": max_tokens
            },
            "stream": stream
        }

        # Only the connection phase is bounded; the read side stays unbounded
        # because generation can take arbitrarily long between chunks.
        # The context manager releases the connection even when the consumer
        # stops iterating early.
        with requests.post(
            f"{self.base_url}/api/chat",
            json=payload,
            stream=stream,
            timeout=(self.timeout, None)
        ) as response:
            if response.status_code != 200:
                raise Exception(f"Ollama API error: {response.text}")

            if not stream:
                result = response.json()
                yield {"content": result["message"]["content"]}
                return

            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    chunk = json.loads(line.decode('utf-8'))
                except json.JSONDecodeError:
                    # Skip lines that are not complete JSON documents.
                    continue
                if "message" in chunk and "content" in chunk["message"]:
                    yield {"content": chunk["message"]["content"]}

# -- check content --
def _tabular_shape(content):
    """Return ``(columns, rows)`` for tabular preview text, or ``None`` when it is empty.

    Three layouts are recognised:
    * a pipe-delimited Markdown table (header line, separator line, data rows);
    * a text summary carrying ``- Rows: N`` and ``- Columns: N`` lines, which
      is what ``read_uploaded_file`` produces for CSV files;
    * plain comma-separated text (header line followed by data rows).
    """
    lines = [line for line in content.split('\n') if line.strip()]
    if not lines:
        return None

    header = lines[0]
    if '|' in header:
        # A header "| a | b |" has one more pipe than it has columns; the
        # header and the separator line are not data rows.
        return header.count('|') - 1, max(len(lines) - 2, 0)

    declared = {}
    for line in lines:
        for label in ("Rows", "Columns"):
            prefix = f"- {label}: "
            if label not in declared and line.startswith(prefix):
                # Counts are written with thousands separators ("1,234").
                declared[label] = int(line[len(prefix):].replace(',', ''))
    if len(declared) == 2:
        return declared["Columns"], declared["Rows"]

    return len(header.split(',')), len(lines) - 1


def analyze_file_content(content, file_type):
    """Return a one-line structural summary of ``content``.

    Args:
        content: Text of the file, or of its tabular preview/summary.
        file_type: ``'parquet'`` or ``'csv'`` selects the dataset summary;
            anything else is treated as source code or a plain document.

    Returns:
        A string describing the dataset shape, the code structure, or the
        document structure. For tabular input the row count describes the
        rows present in ``content`` unless it declares its own totals.
    """
    if file_type in ('parquet', 'csv'):
        try:
            shape = _tabular_shape(content)
        except (AttributeError, TypeError, ValueError):
            shape = None
        if shape is None:
            return "❌ Dataset structure analysis failed"
        columns, rows = shape
        return f"📊 Dataset Structure: {columns} columns, {rows} data samples"

    lines = content.split('\n')
    total_lines = len(lines)

    # Crude keyword heuristic to tell source code from prose.
    if any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']):
        functions = len([line for line in lines if 'def ' in line])
        classes = len([line for line in lines if 'class ' in line])
        imports = len([line for line in lines if 'import ' in line or 'from ' in line])
        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"

    paragraphs = content.count('\n\n') + 1
    words = len(content.split())
    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, ~{words} words"

# -- Basic stats on content --
def get_column_stats(df, col):
    """Summarise column ``col`` of ``df`` as a dict.

    Always present: ``type`` (dtype as a string), ``missing`` (count of NA
    values) and ``unique`` (count of distinct values). Numeric columns add
    ``min``, ``max`` and ``mean``; other columns add ``examples``, the first
    three non-NA values.
    """
    series = df[col]
    summary = dict(
        type=str(series.dtype),
        missing=series.isna().sum(),
        unique=series.nunique(),
    )

    # Non-numeric columns get sample values instead of numeric aggregates.
    if not pd.api.types.is_numeric_dtype(series):
        summary['examples'] = series.dropna().head(3).tolist()
        return summary

    summary['min'] = series.min()
    summary['max'] = series.max()
    summary['mean'] = series.mean()
    return summary

# -- Identify Encoding --
def detect_file_encoding(file_path):
    """Guess the text encoding of ``file_path`` from its first 100000 bytes.

    Detection order:
    1. A byte-order mark decides immediately.
    2. A sample that is valid UTF-8 is reported as ``'utf-8'``. Plain ASCII is
       included here, so non-ASCII bytes later in the file do not break a
       reader that was told ``'ascii'``.
    3. Otherwise chardet's guess is used when its confidence is at least 0.9.
    4. Otherwise ``'latin1'``, which can decode any byte sequence.

    Returns:
        An encoding name. Any unexpected failure (for example an unreadable
        file) is printed and reported as ``'utf-8'``.
    """
    import codecs  # local import keeps this function self-contained

    sample_size = 100000
    try:
        with open(file_path, 'rb') as f:
            rawdata = f.read(sample_size)

        # The UTF-32 LE mark begins with the UTF-16 LE mark, so UTF-32 is
        # checked first.
        for bom, bom_encoding in (
            (codecs.BOM_UTF8, 'utf-8-sig'),
            (codecs.BOM_UTF32_LE, 'utf-32'),
            (codecs.BOM_UTF32_BE, 'utf-32'),
            (codecs.BOM_UTF16_LE, 'utf-16'),
            (codecs.BOM_UTF16_BE, 'utf-16'),
        ):
            if rawdata.startswith(bom):
                return bom_encoding

        try:
            # A full-size sample may end in the middle of a multi-byte
            # character; an incremental decoder with final=False tolerates
            # that while still rejecting invalid bytes elsewhere.
            truncated = len(rawdata) == sample_size
            codecs.getincrementaldecoder('utf-8')().decode(rawdata, final=not truncated)
            return 'utf-8'
        except UnicodeDecodeError:
            pass

        result = chardet.detect(rawdata)
        encoding = result.get('encoding')
        confidence = result.get('confidence') or 0.0
        if encoding and confidence >= 0.9:
            return encoding

        # UTF-16 is deliberately not tried blindly: a decode attempt accepts
        # most even-length byte strings, so legacy single-byte text would be
        # mislabelled.
        return 'latin1'
    except Exception as e:
        print(f"Encoding detection error: {e}")
        return 'utf-8'  # Default fallback

# -- Read file --
def _guess_csv_delimiter(path, encoding):
    """Pick the candidate delimiter that splits the first line into the most fields."""
    try:
        with open(path, 'r', encoding=encoding) as f:
            first_line = f.readline()
    except (OSError, ValueError, LookupError):
        # Unreadable or undecodable header: fall back to a comma.
        return ','

    best_delimiter = ','
    max_columns = 1
    for delimiter in (',', ';', '\t', '|'):
        current_columns = len(first_line.split(delimiter))
        if current_columns > max_columns:
            max_columns = current_columns
            best_delimiter = delimiter
    return best_delimiter


def _format_csv_summary(df):
    """Render dataset metadata, per-column statistics and the first 3 rows as text."""
    parts = [
        "📊 CSV Metadata:\n",
        f"- Rows: {len(df):,}\n",
        f"- Columns: {len(df.columns):,}\n",
        f"- Missing Values: {df.isna().sum().sum():,}\n\n",
        "🔍 Column Details:\n",
    ]

    for col in df.columns:
        stats = get_column_stats(df, col)
        parts.append(f"### {col}\n")
        parts.append(f"- Type: {stats['type']}\n")
        parts.append(f"- Unique: {stats['unique']}\n")
        parts.append(f"- Missing: {stats['missing']}\n")

        if 'examples' in stats:
            parts.append(f"- Examples: {stats['examples']}\n")
        else:
            parts.append(f"- Range: {stats['min']} to {stats['max']}\n")
            parts.append(f"- Mean: {stats['mean']:.2f}\n")
        parts.append("\n")

    parts.append("📋 Sample Data (First 3 Rows):\n")
    parts.append(df.head(3).to_markdown(index=False))
    return "".join(parts)


def _read_csv_summary(path):
    """Load the CSV at ``path`` and return ``(summary_text, "csv")`` or an error pair."""
    encoding = detect_file_encoding(path)
    delimiter = _guess_csv_delimiter(path, encoding)

    try:
        df = pd.read_csv(
            path,
            encoding=encoding,
            delimiter=delimiter,
            on_bad_lines='warn',
            engine='python',
            quotechar='"'
        )
    except Exception:
        # Deliberate second attempt with pandas' default parser settings; a
        # failure here propagates to the caller's CSV error handler.
        df = pd.read_csv(path, encoding=encoding, on_bad_lines='warn')

    if df is None or len(df.columns) < 1:
        return "❌ Could not parse CSV file - no valid columns detected", "error"

    return _format_csv_summary(df), "csv"


def _read_text_file(path):
    """Read ``path`` as text, trying several encodings in order."""
    encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
    for encoding in encodings:
        try:
            with open(path, 'r', encoding=encoding) as f:
                content = f.read()
            return content, "text"
        except UnicodeDecodeError:
            continue
    # Report the failure directly: UnicodeDecodeError cannot be constructed
    # from a single message argument.
    return f"❌ Unable to read file with supported encodings ({', '.join(encodings)})", "error"


def read_uploaded_file(file):
    """Read an uploaded file and return ``(content, file_type)``.

    Args:
        file: ``None``, a path (``str`` or ``os.PathLike``), or an object
            exposing the path as ``.name``.

    Returns:
        * ``("", "")`` when ``file`` is ``None``;
        * ``(markdown_table, "parquet")`` with the first 10 rows of a
          ``.parquet`` file;
        * ``(summary, "csv")`` with metadata, column statistics and the first
          3 rows of a ``.csv`` file;
        * ``(text, "text")`` for any other extension;
        * ``(message, "error")`` when reading fails. This function does not
          raise.
    """
    if file is None:
        return "", ""
    try:
        # Path-like inputs are used as-is; `.name` is only consulted for other
        # objects (a pathlib.Path's `.name` is just the base name).
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name
        file_ext = os.path.splitext(path)[1].lower()

        if file_ext == '.parquet':
            df = pd.read_parquet(path, engine='pyarrow')
            content = df.head(10).to_markdown(index=False)
            return content, "parquet"

        if file_ext == '.csv':
            try:
                return _read_csv_summary(path)
            except Exception as e:
                return f"❌ Error reading CSV file: {str(e)}", "error"

        return _read_text_file(path)
    except Exception as e:
        return f"❌ Error reading file: {str(e)}", "error"

def format_history(history):
    """Flatten ``(user, assistant)`` pairs into a list of role/content message dicts.

    Every pair contributes a ``user`` message; the ``assistant`` message is
    added only when the reply is truthy.
    """
    flattened = []
    for question, answer in history:
        turn = [{"role": "user", "content": question}]
        if answer:
            turn.append({"role": "assistant", "content": answer})
        flattened.extend(turn)
    return flattened

def _history_as_messages(history):
    """Normalise chat history to a list of role/content message dicts.

    Accepts message dicts (kept as they are) and two-item ``[user, assistant]``
    pairs (expanded; the assistant part is skipped when falsy). Other items
    are ignored and ``None`` yields an empty list.
    """
    normalized = []
    for item in history or []:
        if isinstance(item, dict):
            normalized.append(item)
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            normalized.append({"role": "user", "content": item[0]})
            if item[1]:
                normalized.append({"role": "assistant", "content": item[1]})
    return normalized


def chat(message,
         history,
         uploaded_file,
         system_message="",
         max_tokens=4000,
         temperature=0.3,
         top_p=0.9,
         selected_model="phi3:latest"):
    """Stream a model reply for ``message``; a generator of ``("", chat_messages)`` pairs.

    Args:
        message: The user's text. The auto-analysis trigger texts are replaced
            by a structured analysis request when a file is attached.
        history: Earlier conversation as message dicts or ``[user, assistant]``
            pairs; may be ``None``.
        uploaded_file: Value accepted by ``read_uploaded_file``, or a falsy
            value when no file is attached.
        system_message: Extra system instructions appended to the built-in ones.
        max_tokens, temperature, top_p: Passed to the model client.
        selected_model: Model name passed to ``OllamaClient``.

    Yields:
        ``("", messages)`` where ``messages`` is the earlier conversation
        followed by the current user message and the assistant text received
        so far (or an error message). The empty string clears the input box.
    """
    system_prefix = """
You are a AI Data Scientist designed to provide expert guidance in data analysis, machine learning, and big data technologies, suitable for a wide range of users seeking data-driven insights and solutions. 

Analyze the uploaded file in depth from the following perspectives:

1. 📋 Overall file structure and format
2. ⭐ Data Quality and completeness evaluation
3. 💡 Suggested data fixes and improvements
4. 📈 Data characteristics, meaning and patterns
5. 📊 Key component analysis and potential segmentations
6. 🎯 Insights and suggested persuasive story telling

Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way. 

Format the analysis results in Markdown and include specific examples where possible.
"""

    prior_messages = _history_as_messages(history)

    if uploaded_file:
        content, file_type = read_uploaded_file(uploaded_file)
        if file_type == "error":
            # This function is a generator, so the error has to be yielded:
            # a returned value would never reach the UI.
            yield "", prior_messages + [
                {"role": "user", "content": message},
                {"role": "assistant", "content": content}
            ]
            return

        file_summary = analyze_file_content(content, file_type)

        if file_type in ['parquet', 'csv']:
            system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
        else:
            system_message += f"\n\nFile Content:\n```\n{content}\n```"

        # Both trigger texts are accepted: the upload handler in this file
        # sends "Analyze this file".
        if message in ("Starting file analysis...", "Analyze this file"):
            message = f"""[Structure Analysis] {file_summary}
Please provide detailed analysis from these perspectives:
1. 📋 Overall file structure and format
2. ⭐ Data Quality and completeness evaluation
3. 💡 Suggested data fixes and improvements
4. 📈 Data characteristics, meaning and patterns
5. 📊 Key component analysis and potential segmentations
6. 🎯 Insights and suggested persuasive story telling"""

    user_turn = {"role": "user", "content": message}
    messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
    messages.extend(prior_messages)
    messages.append(user_turn)

    try:
        client = OllamaClient(model_name=selected_model)
        partial_message = ""

        for response in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = response.get('content', '')
            if token:
                partial_message += token
                # Keep earlier turns so the conversation stays visible and is
                # passed back as history on the next call.
                yield "", prior_messages + [
                    user_turn,
                    {"role": "assistant", "content": partial_message}
                ]

    except Exception as e:
        error_msg = f"❌ Inference error: {str(e)}"
        yield "", prior_messages + [
            user_turn,
            {"role": "assistant", "content": error_msg}
        ]

# Custom CSS for the page: hides any <footer> element.
css = """
footer {visibility: hidden}
"""

# Build the UI: a chat panel on the left, file upload and settings on the
# right, followed by a row of canned follow-up prompts.
with gr.Blocks(theme="gstaff/xkcd", 
               css=css, 
               title="Offline Sensitive Survey Data Analysis") as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1000px; margin: 0 auto;">
            <h1 style="font-size: 3em; font-weight: 600; margin: 0.5em;">Offline Sensitive Survey Data Analysis</h1>
            <h3 style="font-size: 1.2em; margin: 1em;">Leveraging your Local Ollama Inference Server</h3>
        </div>
        """
    )
    
    # Store the current model in a state variable
    # (kept in sync with the dropdown by the change handler below and passed
    # to chat() as its selected_model argument)
    current_model = gr.State("phi3:latest")
    
    with gr.Row():
        with gr.Column(scale=2):
            # Conversation display; expects a list of role/content dicts.
            chatbot = gr.Chatbot(
                height=500, 
                label="Chat Interface",
                type="messages"
            )
            msg = gr.Textbox(
                label="Type your message",
                show_label=False,
                placeholder="Ask me anything about the uploaded data file... ",
                container=False
            )
            with gr.Row():                
                send = gr.Button("Send")
                clear = gr.ClearButton([msg, chatbot])
        
        with gr.Column(scale=1):
            gr.Markdown("### Upload File \nSupport: CSV, Parquet files, Text")
            file_upload = gr.File(
                label="Upload File",
                file_types=[".csv", ".parquet",".txt"],
                type="filepath"
            )
            
            with gr.Accordion("Model Settings", open=False):
                # Choices start empty and are filled in by load_models().
                model_dropdown = gr.Dropdown(
                    label="Available Models",
                    choices=[],
                    interactive=True
                )
                # Clicking this button re-runs load_models() (see handler below).
                refresh_models = gr.Button("Select Model")
                
            with gr.Accordion("Advanced Settings ⚙️", open=False):
                # These four controls are passed to chat() on every request.
                system_message = gr.Textbox(label="Override System Message 📝", value="")
                max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens (maximum number of words for generated response)")
                temperature = gr.Slider(minimum=0, maximum=1, value=0.3, label="Temperature (higher = more creative)")
                top_p = gr.Slider(minimum=0, maximum=1, value=0.7, label="Top P (word choices by probability threshold)")

    # Function to load available models
    def load_models():
        """Query the Ollama server and return a Dropdown listing its models.

        The first model is preselected; when none are returned the value
        falls back to "phi3:latest" with an empty choice list.
        """
        client = OllamaClient()
        models = client.list_models()
        return gr.Dropdown(choices=models, value=models[0] if models else "phi3:latest")
    
    # Refresh models button click handler
    refresh_models.click(
        load_models,
        outputs=model_dropdown
    )
    
    # Model dropdown change handler
    # (copies the dropdown value into the current_model state)
    model_dropdown.change(
        lambda x: x,
        inputs=model_dropdown,
        outputs=current_model
    )
    
    # Load models when app starts
    demo.load(
        load_models,
        outputs=model_dropdown
    )
    
    # Event bindings
    # Pressing Enter in the textbox and clicking "Send" both run chat() with
    # the same inputs; chat() outputs go to the textbox and the chatbot.
    # The follow-up step sets the textbox to interactive.
    msg.submit(
        chat,
        inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    ).then(
        lambda: gr.update(interactive=True),
        None,
        [msg]
    )

    send.click(
        chat,
        inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    ).then(
        lambda: gr.update(interactive=True),
        None,
        [msg]
    )

    # Auto-analysis on file upload with this hidden component
    # The hidden textbox supplies the message text for the automatic request.
    # NOTE(review): presumably this handler also runs when the file is
    # removed, which would start a request without a file — confirm.
    auto_analyze_trigger = gr.Textbox(value="Analyze this file", visible=False)
    file_upload.change(
        lambda: gr.Chatbot(value=[]),  # Clear chat history
        outputs=[chatbot],
        queue=True
    ).then(
        chat,
        inputs=[auto_analyze_trigger, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    )


    # Example queries
    with gr.Column():
        gr.Markdown("### Potential Follow-up Queries")
        with gr.Row():
            example_btns = [             
                gr.Button("Analyze open-ended responses for sentiment and recurring themes", size="lg", variant="secondary"),
                gr.Button("Compare responses between different groups and identify potential segmentation or cluster analysis", size="lg", variant="secondary"),
                gr.Button("Identify potential outcome variables and suggest a predicting model for it", size="lg", variant="secondary"),
                gr.Button("Generate a Quarto notebook in Python to process this dataset", size="lg", variant="secondary"),
                gr.Button("Generate a Rmd notebook in R to process this dataset", size="lg", variant="secondary"),

            ]
        
        # Add click handlers
        # Each button copies its own label into the message textbox via a
        # hidden textbox created with that label; it does not submit it.
        for btn in example_btns:
            btn.click(
                lambda x: x,
                inputs=[gr.Textbox(value=btn.value, visible=False)],
                outputs=msg
            )


# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()