"""Gradio chat UI for analysing uploaded data files with a local Ollama server.

CSV, Parquet and plain-text uploads are read and summarised, the summary is
injected into the system prompt, and the model's reply is streamed into a
chat interface.
"""
import gradio as gr
import os
import pandas as pd
import requests
import json
from typing import List, Tuple
import chardet

# -- LLM Client Class --
class OllamaClient:
    """Thin HTTP client for the model-listing and chat endpoints of an Ollama server.

    ``model_name`` is sent with every chat request and ``base_url`` is the
    server root that the ``/api/...`` paths are appended to.
    """

    # Seconds allowed for establishing a connection (and, for the small
    # model-listing call, for each read).  Chat requests deliberately get no
    # read timeout so that slow generation is not cut off.
    CONNECT_TIMEOUT = 10

    # Only these roles are forwarded; messages with any other role are dropped.
    _FORWARDED_ROLES = ("system", "user", "assistant")

    def __init__(self, model_name: str = "phi3:latest", base_url: str = "http://localhost:11434"):
        self.model_name = model_name
        self.base_url = base_url

    def list_models(self):
        """Return the model names reported by ``GET /api/tags``.

        Best-effort: any failure (server unreachable, timeout, unexpected
        payload) is printed and an empty list is returned instead of raising.
        """
        try:
            response = requests.get(
                f"{self.base_url}/api/tags",
                timeout=self.CONNECT_TIMEOUT,
            )
            if response.status_code != 200:
                return []
            return [model['name'] for model in response.json().get('models', [])]
        except Exception as e:
            print(f"Error listing models: {e}")
            return []

    @staticmethod
    def _to_ollama_messages(messages):
        """Reduce chat messages to the ``{"role", "content"}`` pairs sent to the server.

        Extra keys are stripped, entries without a forwarded role are skipped,
        and a missing or ``None`` content becomes an empty string.
        """
        converted = []
        for msg in messages:
            role = msg.get("role")
            if role not in OllamaClient._FORWARDED_ROLES:
                continue
            content = msg.get("content")
            converted.append({"role": role, "content": "" if content is None else content})
        return converted

    def chat_completion(self, messages, max_tokens=4000, stream=True, temperature=0.3, top_p=0.7):
        """Send *messages* to ``POST /api/chat`` and yield the reply.

        This is a generator in both modes: with ``stream=True`` it yields one
        ``{"content": <text>}`` dict per streamed chunk that carries message
        content, with ``stream=False`` it yields a single dict holding the
        whole reply.  Because it is a generator, nothing is sent until the
        first item is requested.

        Args:
            messages: Iterable of dicts with ``role`` and ``content`` keys.
            max_tokens: Sent as the ``num_predict`` option.
            stream: Whether to request and consume a streamed response.
            temperature: Sent as the ``temperature`` option.
            top_p: Sent as the ``top_p`` option.

        Raises:
            RuntimeError: If the server answers with a non-200 status.
        """
        payload = {
            "model": self.model_name,
            "messages": self._to_ollama_messages(messages),
            "options": {
                "temperature": temperature,
                "top_p": top_p,
                "num_predict": max_tokens,
            },
            "stream": stream,
        }

        # The context manager releases the connection even when the consumer
        # stops iterating early or an exception interrupts the stream.
        with requests.post(
            f"{self.base_url}/api/chat",
            json=payload,
            stream=stream,
            timeout=(self.CONNECT_TIMEOUT, None),
        ) as response:
            if response.status_code != 200:
                raise RuntimeError(f"Ollama API error: {response.text}")

            if not stream:
                yield {"content": response.json()["message"]["content"]}
                return

            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    chunk = json.loads(line.decode('utf-8'))
                except ValueError:
                    # Covers both undecodable bytes and malformed JSON lines.
                    continue
                message = chunk.get("message") if isinstance(chunk, dict) else None
                if isinstance(message, dict) and "content" in message:
                    yield {"content": message["content"]}

# -- check content --
def _parse_count(lines, prefix):
    """Return the integer after *prefix* on the first line that carries one, else None."""
    for line in lines:
        if line.startswith(prefix):
            digits = line[len(prefix):].replace(',', '').strip()
            if digits.isdigit():
                return int(digits)
    return None


def _markdown_table_shape(lines):
    """Return ``(columns, data_rows)`` of the first pipe table in *lines*, or None."""
    for idx in range(len(lines) - 1):
        header = lines[idx].strip()
        divider = lines[idx + 1].strip()
        # A divider row consists only of pipes, dashes, colons and spaces.
        is_divider = divider.startswith('|') and '-' in divider and not divider.strip('|-: ')
        if not (header.startswith('|') and is_divider):
            continue
        data_rows = 0
        for row in lines[idx + 2:]:
            if not row.strip().startswith('|'):
                break
            data_rows += 1
        return header.count('|') - 1, data_rows
    return None


def analyze_file_content(content, file_type):
    """Return a one-line structural summary of *content*.

    For ``'parquet'`` and ``'csv'`` the column and row counts are taken, in
    order of preference, from:

    1. the ``- Rows:`` / ``- Columns:`` metadata lines that
       ``read_uploaded_file`` writes for CSV files,
    2. the first Markdown pipe table found in the text (header and divider
       rows are not counted as data),
    3. the text treated as raw comma-separated lines.

    Any other *file_type* is summarised as code when it contains typical code
    keywords, otherwise as a prose document.

    Args:
        content: Text produced for the uploaded file.
        file_type: ``'parquet'``, ``'csv'`` or anything else for plain text.

    Returns:
        A human-readable summary string; dataset analysis failures are
        reported in the returned string rather than raised.
    """
    if file_type in ['parquet', 'csv']:
        try:
            lines = content.split('\n')
            rows = _parse_count(lines, "- Rows: ")
            columns = _parse_count(lines, "- Columns: ")
            if rows is None or columns is None:
                shape = _markdown_table_shape(lines)
                if shape is not None:
                    columns, rows = shape
                else:
                    populated = [line for line in lines if line.strip()]
                    columns = len(populated[0].split(',')) if populated else 0
                    rows = max(len(populated) - 1, 0)
            return f"📊 Dataset Structure: {columns} columns, {rows} data samples"
        except Exception:
            return "❌ Dataset structure analysis failed"

    lines = content.split('\n')
    total_lines = len(lines)

    if any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']):
        functions = len([line for line in lines if 'def ' in line])
        classes = len([line for line in lines if 'class ' in line])
        imports = len([line for line in lines if 'import ' in line or 'from ' in line])
        return f"💻 Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"

    paragraphs = content.count('\n\n') + 1
    words = len(content.split())
    return f"📝 Document Structure: {total_lines} lines, {paragraphs} paragraphs, ~{words} words"

# -- Basic stats on content --
def get_column_stats(df, col):
    """Summarise one DataFrame column as a plain dict.

    The result always has ``type`` (dtype as a string), ``missing`` (count of
    NA values) and ``unique`` (number of distinct non-NA values).  Numeric
    columns additionally get ``min``, ``max`` and ``mean``; every other dtype
    gets ``examples``, the first three non-NA values.
    """
    series = df[col]
    summary = dict(
        type=str(series.dtype),
        missing=series.isna().sum(),
        unique=series.nunique(),
    )

    # Non-numeric columns: show a few sample values instead of numeric ranges.
    if not pd.api.types.is_numeric_dtype(series):
        summary['examples'] = series.dropna().head(3).tolist()
        return summary

    summary['min'] = series.min()
    summary['max'] = series.max()
    summary['mean'] = series.mean()
    return summary

# -- Identify Encoding --
def detect_file_encoding(file_path):
    """Guess the text encoding of *file_path* from its first 100000 bytes.

    Checks, in order:

    1. A byte-order mark (UTF-8, UTF-32, UTF-16).  UTF-16/32 are only ever
       reported when a BOM is present, because trial-decoding arbitrary bytes
       as UTF-16 succeeds far too often to be evidence of anything.
    2. Strict UTF-8 decoding of the sample.
    3. chardet's guess, accepted only if the sample actually decodes with it.
    4. ``cp1252`` if the sample decodes with it, else ``latin1`` (which
       accepts every byte sequence).

    A multibyte character cut in half by the sample limit is tolerated and
    does not count as a decoding failure.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        An encoding name usable with ``open(..., encoding=...)``.  Any error
        while reading or detecting is printed and ``'utf-8'`` is returned.
    """
    import codecs  # function-scope import: only needed by this detector

    sample_size = 100000

    def decodes_cleanly(data, encoding, complete):
        """Return True if *data* decodes as *encoding*; *complete* is False for a truncated sample."""
        try:
            codecs.getincrementaldecoder(encoding)().decode(data, final=complete)
        except (LookupError, UnicodeError):
            return False
        return True

    try:
        with open(file_path, 'rb') as f:
            rawdata = f.read(sample_size)
        # A full-size sample may end in the middle of a multibyte character.
        complete = len(rawdata) < sample_size

        # UTF-32 first: its little-endian BOM begins with the UTF-16 one.
        boms = (
            (codecs.BOM_UTF8, 'utf-8-sig'),
            (codecs.BOM_UTF32_LE, 'utf-32'),
            (codecs.BOM_UTF32_BE, 'utf-32'),
            (codecs.BOM_UTF16_LE, 'utf-16'),
            (codecs.BOM_UTF16_BE, 'utf-16'),
        )
        for bom, name in boms:
            if rawdata.startswith(bom):
                return name

        if decodes_cleanly(rawdata, 'utf-8', complete):
            return 'utf-8'

        guess = chardet.detect(rawdata).get('encoding')
        if guess and decodes_cleanly(rawdata, guess, complete):
            return guess

        return 'cp1252' if decodes_cleanly(rawdata, 'cp1252', complete) else 'latin1'
    except Exception as e:
        print(f"Encoding detection error: {e}")
        return 'utf-8'  # Default fallback

# -- Read file --
def _sniff_delimiter(path, encoding, candidates=(',', ';', '\t', '|')):
    """Return the candidate that splits the first line of *path* into the most fields (',' by default)."""
    try:
        with open(path, 'r', encoding=encoding) as f:
            first_line = f.readline()
    except (OSError, UnicodeError, LookupError):
        return ','

    best_delimiter, max_columns = ',', 1
    for delimiter in candidates:
        current_columns = len(first_line.split(delimiter))
        if current_columns > max_columns:
            max_columns = current_columns
            best_delimiter = delimiter
    return best_delimiter


def _describe_dataframe(df):
    """Render the Markdown summary (metadata, per-column details, 3-row sample) for *df*."""
    parts = [
        "📊 CSV Metadata:\n",
        f"- Rows: {len(df):,}\n",
        f"- Columns: {len(df.columns):,}\n",
        f"- Missing Values: {df.isna().sum().sum():,}\n\n",
        "🔍 Column Details:\n",
    ]
    for col in df.columns:
        stats = get_column_stats(df, col)
        parts.append(f"### {col}\n")
        parts.append(f"- Type: {stats['type']}\n")
        parts.append(f"- Unique: {stats['unique']}\n")
        parts.append(f"- Missing: {stats['missing']}\n")
        if 'examples' in stats:
            parts.append(f"- Examples: {stats['examples']}\n")
        else:
            parts.append(f"- Range: {stats['min']} to {stats['max']}\n")
            parts.append(f"- Mean: {stats['mean']:.2f}\n")
        parts.append("\n")

    parts.append("📋 Sample Data (First 3 Rows):\n")
    parts.append(df.head(3).to_markdown(index=False))
    return "".join(parts)


def read_uploaded_file(file):
    """Read an uploaded file and return ``(content, file_type)``.

    Args:
        file: ``None``, a filesystem path, or any object whose ``name``
            attribute is the path (such as an open file handle).

    Returns:
        * ``("", "")`` when *file* is ``None``.
        * ``(markdown_table, "parquet")`` with the first 10 rows for
          ``.parquet`` files.
        * ``(markdown_summary, "csv")`` for ``.csv`` files.
        * ``(text, "text")`` for every other extension, decoded with the
          first of utf-8, cp949, euc-kr, latin1 that succeeds.
        * ``(error_message, "error")`` on any failure; this function does not
          raise.
    """
    if file is None:
        return "", ""
    try:
        # Accept both plain paths and objects that expose the path as `.name`.
        path = os.fspath(getattr(file, "name", file))
        file_ext = os.path.splitext(path)[1].lower()

        if file_ext == '.parquet':
            df = pd.read_parquet(path, engine='pyarrow')
            content = df.head(10).to_markdown(index=False)
            return content, "parquet"

        if file_ext == '.csv':
            try:
                encoding = detect_file_encoding(path)
                best_delimiter = _sniff_delimiter(path, encoding)

                try:
                    df = pd.read_csv(
                        path,
                        encoding=encoding,
                        delimiter=best_delimiter,
                        on_bad_lines='warn',
                        engine='python',
                        quotechar='"'
                    )
                except Exception:
                    # Fallback to pandas' default parsing
                    df = pd.read_csv(path, encoding=encoding, on_bad_lines='warn')

                if df is None or len(df.columns) < 1:
                    return "❌ Could not parse CSV file - no valid columns detected", "error"

                return _describe_dataframe(df), "csv"
            except Exception as e:
                return f"❌ Error reading CSV file: {str(e)}", "error"

        encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
        for encoding in encodings:
            try:
                with open(path, 'r', encoding=encoding) as f:
                    content = f.read()
                return content, "text"
            except UnicodeDecodeError:
                continue
        # Reported directly: UnicodeDecodeError cannot be constructed from a
        # single message string, so raising it here would fail with TypeError.
        return f"❌ Unable to read file with supported encodings ({', '.join(encodings)})", "error"
    except Exception as e:
        return f"❌ Error reading file: {str(e)}", "error"

def format_history(history):
    """Convert chat history into a flat list of ``{"role", "content"}`` messages.

    Accepts the same shapes that ``chat`` handles:

    * ``None`` or an empty sequence gives ``[]``.
    * A dict item that has a ``role`` key is copied as one message; a missing
      ``content`` becomes an empty string and any extra keys are dropped.
      Dicts without a ``role`` are skipped.
    * Any other item is unpacked as a ``(user_msg, assistant_msg)`` pair; the
      assistant message is omitted when it is empty or ``None``.

    Raises:
        TypeError, ValueError: If a non-dict item cannot be unpacked into
            exactly two values.
    """
    formatted_history = []
    for item in history or []:
        if isinstance(item, dict):
            if "role" in item:
                formatted_history.append(
                    {"role": item["role"], "content": item.get("content", "")}
                )
            continue
        user_msg, assistant_msg = item
        formatted_history.append({"role": "user", "content": user_msg})
        if assistant_msg:
            formatted_history.append({"role": "assistant", "content": assistant_msg})
    return formatted_history

def chat(message,
         history,
         uploaded_file,
         system_message="",
         max_tokens=4000,
         temperature=0.3,
         top_p=0.9,
         selected_model="phi3:latest"):
    """Stream a model reply for *message*, yielding ``("", chat_history)`` updates.

    This is a generator: every yielded tuple is ``(textbox_value,
    chat_history)`` where the textbox value is always the empty string and
    ``chat_history`` is the full conversation (earlier turns followed by the
    current user message and the reply received so far) as role/content dicts.

    Args:
        message: The user's message.  When a file is uploaded and the message
            is one of the auto-analysis trigger texts, it is replaced by a
            structured analysis prompt that includes the file summary.
        history: Earlier turns, as role/content dicts or as
            ``(user, assistant)`` pairs; ``None`` means no history.
        uploaded_file: Value handed to ``read_uploaded_file``; falsy means no
            file.
        system_message: Extra text appended to the built-in system prompt.
        max_tokens: Passed through to ``OllamaClient.chat_completion``.
        temperature: Passed through to ``OllamaClient.chat_completion``.
        top_p: Passed through to ``OllamaClient.chat_completion``.
        selected_model: Model name used to build the ``OllamaClient``.

    Yields:
        ``("", chat_history)`` once per received token, or exactly once with
        an error message as the assistant reply when reading the file or
        querying the model fails.
    """
    perspectives = (
        "1. 📋 Overall file structure and format\n"
        "2. ⭐ Data Quality and completeness evaluation\n"
        "3. 💡 Suggested data fixes and improvements\n"
        "4. 📈 Data characteristics, meaning and patterns\n"
        "5. 📊 Key component analysis and potential segmentations\n"
        "6. 🎯 Insights and suggested persuasive story telling"
    )

    system_prefix = f"""
You are a AI Data Scientist designed to provide expert guidance in data analysis, machine learning, and big data technologies, suitable for a wide range of users seeking data-driven insights and solutions. 

Analyze the uploaded file in depth from the following perspectives:

{perspectives}

Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way. 

Format the analysis results in Markdown and include specific examples where possible.
"""

    # Messages that trigger the structured auto-analysis prompt.  The second
    # entry is the text the hidden trigger textbox in this file's UI sends.
    auto_analysis_triggers = ("Starting file analysis...", "Analyze this file")

    system_message = system_message or ""

    # Earlier turns as role/content dicts; used both as model context and as
    # the start of every chat history handed back to the UI.
    prior_messages = []
    for item in history or []:
        if isinstance(item, dict):
            prior_messages.append(item)
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            prior_messages.append({"role": "user", "content": item[0]})
            if item[1]:
                prior_messages.append({"role": "assistant", "content": item[1]})

    if uploaded_file:
        content, file_type = read_uploaded_file(uploaded_file)
        if file_type == "error":
            # Must be yielded: a value `return`ed from a generator never
            # reaches the caller that iterates it.
            yield "", prior_messages + [
                {"role": "user", "content": message},
                {"role": "assistant", "content": content},
            ]
            return

        file_summary = analyze_file_content(content, file_type)

        if file_type in ['parquet', 'csv']:
            system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
        else:
            system_message += f"\n\nFile Content:\n```\n{content}\n```"

        if message in auto_analysis_triggers:
            message = (
                f"[Structure Analysis] {file_summary}\n"
                "Please provide detailed analysis from these perspectives:\n"
                f"{perspectives}"
            )

    user_turn = {"role": "user", "content": message}
    messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
    messages.extend(prior_messages)
    messages.append(user_turn)

    try:
        client = OllamaClient(model_name=selected_model)
        partial_message = ""

        for response in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = response.get('content', '')
            if token:
                partial_message += token
                yield "", prior_messages + [
                    user_turn,
                    {"role": "assistant", "content": partial_message},
                ]

    except Exception as e:
        error_msg = f"❌ Inference error: {str(e)}"
        yield "", prior_messages + [
            user_turn,
            {"role": "assistant", "content": error_msg},
        ]

# Hides the footer element of the rendered page.
css = """
footer {visibility: hidden}
"""

# Model used until the user picks another one, and offered as the only
# dropdown choice when the Ollama server reports no models.
DEFAULT_MODEL = "phi3:latest"

with gr.Blocks(theme="gstaff/xkcd", 
               css=css, 
               title="Offline Sensitive Survey Data Analysis") as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1000px; margin: 0 auto;">
            <h1 style="font-size: 3em; font-weight: 600; margin: 0.5em;">Offline Sensitive Survey Data Analysis</h1>
            <h3 style="font-size: 1.2em; margin: 1em;">Leveraging your Local Ollama Inference Server</h3>
        </div>
        """
    )
    
    # Holds the currently selected model; kept in sync with the dropdown below.
    current_model = gr.State(DEFAULT_MODEL)
    
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                height=500, 
                label="Chat Interface",
                type="messages"
            )
            msg = gr.Textbox(
                label="Type your message",
                show_label=False,
                placeholder="Ask me anything about the uploaded data file... ",
                container=False
            )
            with gr.Row():                
                send = gr.Button("Send")
                clear = gr.ClearButton([msg, chatbot])
        
        with gr.Column(scale=1):
            gr.Markdown("### Upload File \nSupport: CSV, Parquet files, Text")
            file_upload = gr.File(
                label="Upload File",
                file_types=[".csv", ".parquet",".txt"],
                type="filepath"
            )
            
            with gr.Accordion("Model Settings", open=False):
                model_dropdown = gr.Dropdown(
                    label="Available Models",
                    choices=[],
                    interactive=True
                )
                refresh_models = gr.Button("Select Model")
                
            with gr.Accordion("Advanced Settings ⚙️", open=False):
                system_message = gr.Textbox(label="Override System Message 📝", value="")
                max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens (maximum number of words for generated response)")
                temperature = gr.Slider(minimum=0, maximum=1, value=0.3, label="Temperature (higher = more creative)")
                top_p = gr.Slider(minimum=0, maximum=1, value=0.7, label="Top P (word choices by probability threshold)")

    def load_models(selected=None):
        """Refresh the dropdown from the Ollama server.

        Keeps *selected* when the server still lists it, otherwise selects the
        first model.  When no models are reported the default model is offered
        as the single choice so the selected value is always one of the
        choices.
        """
        models = OllamaClient().list_models() or [DEFAULT_MODEL]
        value = selected if selected in models else models[0]
        return gr.Dropdown(choices=models, value=value)

    def analyze_upload(trigger, history, uploaded, extra_system, token_limit, temp, nucleus, model):
        """Run the automatic analysis for a new upload; do nothing when the file was cleared."""
        if not uploaded:
            # File removed: leave the textbox untouched and keep the chat empty
            # instead of asking the model to analyse nothing.
            yield gr.update(), []
            return
        yield from chat(trigger, history, uploaded, extra_system, token_limit, temp, nucleus, model)

    # Refresh button: reload the list but keep the user's current selection.
    refresh_models.click(
        load_models,
        inputs=model_dropdown,
        outputs=model_dropdown
    )
    
    # Mirror the dropdown selection into the state passed to chat().
    model_dropdown.change(
        lambda x: x,
        inputs=model_dropdown,
        outputs=current_model
    )
    
    # Load models when app starts
    demo.load(
        load_models,
        outputs=model_dropdown
    )
    
    # Pressing Enter in the textbox and clicking Send run the same pipeline.
    chat_inputs = [msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model]
    for trigger_event in (msg.submit, send.click):
        trigger_event(
            chat,
            inputs=chat_inputs,
            outputs=[msg, chatbot],
            queue=True
        ).then(
            lambda: gr.update(interactive=True),
            None,
            [msg]
        )

    # Auto-analysis on file upload.  The hidden textbox carries the trigger
    # text that chat() replaces with its structured analysis prompt.
    auto_analyze_trigger = gr.Textbox(value="Starting file analysis...", visible=False)
    file_upload.change(
        lambda: gr.Chatbot(value=[]),  # Clear chat history
        outputs=[chatbot],
        queue=True
    ).then(
        analyze_upload,
        inputs=[auto_analyze_trigger, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
        outputs=[msg, chatbot],
        queue=True
    )


    # Example queries
    with gr.Column():
        gr.Markdown("### Potential Follow-up Queries")
        with gr.Row():
            example_btns = [             
                gr.Button("Analyze open-ended responses for sentiment and recurring themes", size="lg", variant="secondary"),
                gr.Button("Compare responses between different groups and identify potential segmentation or cluster analysis", size="lg", variant="secondary"),
                gr.Button("Identify potential outcome variables and suggest a predicting model for it", size="lg", variant="secondary"),
                gr.Button("Generate a Quarto notebook in Python to process this dataset", size="lg", variant="secondary"),
                gr.Button("Generate a Rmd notebook in R to process this dataset", size="lg", variant="secondary"),

            ]
        
        # Clicking an example copies its text into the message box; each button
        # gets its own hidden textbox holding that text.
        for btn in example_btns:
            btn.click(
                lambda x: x,
                inputs=[gr.Textbox(value=btn.value, visible=False)],
                outputs=msg
            )


if __name__ == "__main__":
    demo.launch()