edouardlgp committed on
Commit
047d156
Β·
verified Β·
1 Parent(s): 0b0fea8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +248 -83
app.py CHANGED
@@ -4,13 +4,27 @@ import pandas as pd
4
  import requests
5
  import json
6
  from typing import List, Tuple
 
7
 
 
8
  class OllamaClient:
9
- def __init__(self, model_name: str = "mistral-nemo", base_url: str = "http://localhost:11434"):
10
  self.model_name = model_name
11
  self.base_url = base_url
12
 
13
- def chat_completion(self, messages, max_tokens=4000, stream=True, temperature=0.7, top_p=0.9):
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Convert messages to Ollama format
15
  ollama_messages = []
16
  for msg in messages:
@@ -47,33 +61,22 @@ class OllamaClient:
47
  decoded_line = line.decode('utf-8')
48
  try:
49
  chunk = json.loads(decoded_line)
50
- if "message" in chunk:
51
- yield {
52
- "choices": [{
53
- "delta": {
54
- "content": chunk["message"]["content"]
55
- }
56
- }]
57
- }
58
  except json.JSONDecodeError:
59
  continue
60
  else:
61
  result = response.json()
62
- yield {
63
- "choices": [{
64
- "delta": {
65
- "content": result["message"]["content"]
66
- }
67
- }]
68
- }
69
 
 
70
  def analyze_file_content(content, file_type):
71
  """Analyze file content and return structural summary"""
72
  if file_type in ['parquet', 'csv']:
73
  try:
74
  lines = content.split('\n')
75
  header = lines[0]
76
- columns = header.count('|') - 1
77
  rows = len(lines) - 3
78
  return f"πŸ“Š Dataset Structure: {columns} columns, {rows} data samples"
79
  except:
@@ -93,6 +96,52 @@ def analyze_file_content(content, file_type):
93
  words = len(content.split())
94
  return f"πŸ“ Document Structure: {total_lines} lines, {paragraphs} paragraphs, ~{words} words"
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  def read_uploaded_file(file):
97
  if file is None:
98
  return "", ""
@@ -103,28 +152,76 @@ def read_uploaded_file(file):
103
  df = pd.read_parquet(file.name, engine='pyarrow')
104
  content = df.head(10).to_markdown(index=False)
105
  return content, "parquet"
106
- elif file_ext == '.csv':
107
- encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
108
- for encoding in encodings:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  try:
110
- df = pd.read_csv(file.name, encoding=encoding)
111
- content = f"πŸ“Š Data Preview:\n{df.head(10).to_markdown(index=False)}\n\n"
112
- content += f"\nπŸ“ˆ Data Information:\n"
113
- content += f"- Total Rows: {len(df)}\n"
114
- content += f"- Total Columns: {len(df.columns)}\n"
115
- content += f"- Column List: {', '.join(df.columns)}\n"
116
- content += f"\nπŸ“‹ Column Data Types:\n"
117
- for col, dtype in df.dtypes.items():
118
- content += f"- {col}: {dtype}\n"
119
- null_counts = df.isnull().sum()
120
- if null_counts.any():
121
- content += f"\n⚠️ Missing Values:\n"
122
- for col, null_count in null_counts[null_counts > 0].items():
123
- content += f"- {col}: {null_count} missing\n"
124
- return content, "csv"
125
- except UnicodeDecodeError:
126
- continue
127
- raise UnicodeDecodeError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  else:
129
  encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
130
  for encoding in encodings:
@@ -146,16 +243,31 @@ def format_history(history):
146
  formatted_history.append({"role": "assistant", "content": assistant_msg})
147
  return formatted_history
148
 
149
- def chat(message, history, uploaded_file, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
150
- system_prefix = """You are a file analysis expert. Analyze the uploaded file in depth from the following perspectives:
151
- 1. πŸ“‹ Overall structure and composition
152
- 2. πŸ“Š Key content and pattern analysis
153
- 3. πŸ“ˆ Data characteristics and meaning
154
- - For datasets: Column meanings, data types, value distributions
155
- - For text/code: Structural features, main patterns
156
- 4. πŸ’‘ Potential applications
157
- 5. ✨ Data quality and areas for improvement
158
- Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way. Format the analysis results in Markdown and include specific examples where possible."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  if uploaded_file:
161
  content, file_type = read_uploaded_file(uploaded_file)
@@ -173,11 +285,11 @@ Provide detailed and structured analysis from an expert perspective, but explain
173
  message = f"""[Structure Analysis] {file_summary}
174
  Please provide detailed analysis from these perspectives:
175
  1. πŸ“‹ Overall file structure and format
176
- 2. πŸ“Š Key content and component analysis
177
- 3. πŸ“ˆ Data/content characteristics and patterns
178
- 4. ⭐ Quality and completeness evaluation
179
- 5. πŸ’‘ Suggested improvements
180
- 6. 🎯 Practical applications and recommendations"""
181
 
182
  messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
183
 
@@ -194,18 +306,18 @@ Please provide detailed analysis from these perspectives:
194
  messages.append({"role": "user", "content": message})
195
 
196
  try:
197
- client = OllamaClient()
198
  partial_message = ""
199
  current_history = []
200
 
201
- for msg in client.chat_completion(
202
  messages,
203
  max_tokens=max_tokens,
204
  stream=True,
205
  temperature=temperature,
206
  top_p=top_p,
207
  ):
208
- token = msg.choices[0].delta.get('content', None)
209
  if token:
210
  partial_message += token
211
  current_history = [
@@ -226,51 +338,89 @@ css = """
226
  footer {visibility: hidden}
227
  """
228
 
229
- with gr.Blocks(theme="gstaff/xkcd", css=css, title="Offline Survey Data Analysis πŸ“Š") as demo:
 
 
230
  gr.HTML(
231
  """
232
  <div style="text-align: center; max-width: 800px; margin: 0 auto;">
233
- <h1 style="font-size: 3em; font-weight: 600; margin: 0.5em;">Offline Survey Data Analysis</h1>
234
- <h3 style="font-size: 1.2em; margin: 1em;">Leveraging Mistral-Nemo via Ollama</h3>
235
  </div>
236
  """
237
  )
238
 
 
 
 
239
  with gr.Row():
240
  with gr.Column(scale=2):
241
  chatbot = gr.Chatbot(
242
- height=600,
243
- label="Chat Interface πŸ’¬",
244
  type="messages"
245
  )
246
  msg = gr.Textbox(
247
  label="Type your message",
248
  show_label=False,
249
- placeholder="Ask me anything about the uploaded data file... πŸ’­",
250
  container=False
251
  )
252
  with gr.Row():
253
  clear = gr.ClearButton([msg, chatbot])
254
- send = gr.Button("Send πŸ“€")
255
 
256
  with gr.Column(scale=1):
257
- gr.Markdown("### Upload File πŸ“\nSupport: Text, Code, CSV, Parquet files")
258
  file_upload = gr.File(
259
  label="Upload File",
260
- file_types=["text", ".csv", ".parquet"],
261
  type="filepath"
262
  )
263
 
 
 
 
 
 
 
 
 
264
  with gr.Accordion("Advanced Settings βš™οΈ", open=False):
265
- system_message = gr.Textbox(label="System Message πŸ“", value="")
266
  max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens πŸ“Š")
267
- temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature 🌑️")
268
  top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P πŸ“ˆ")
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  # Event bindings
271
  msg.submit(
272
  chat,
273
- inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p],
274
  outputs=[msg, chatbot],
275
  queue=True
276
  ).then(
@@ -281,7 +431,7 @@ with gr.Blocks(theme="gstaff/xkcd", css=css, title="Offline Survey Data Analysis
281
 
282
  send.click(
283
  chat,
284
- inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p],
285
  outputs=[msg, chatbot],
286
  queue=True
287
  ).then(
@@ -290,26 +440,41 @@ with gr.Blocks(theme="gstaff/xkcd", css=css, title="Offline Survey Data Analysis
290
  [msg]
291
  )
292
 
293
- # Auto-analysis on file upload
 
294
  file_upload.change(
 
 
 
 
295
  chat,
296
- inputs=[gr.Textbox(value="Starting file analysis..."), chatbot, file_upload, system_message, max_tokens, temperature, top_p],
297
  outputs=[msg, chatbot],
298
  queue=True
299
  )
300
 
 
301
  # Example queries
302
- gr.Examples(
303
- examples=[
304
- ["Please explain the overall structure and features of the file in detail πŸ“‹"],
305
- ["Analyze the main patterns and characteristics of this file πŸ“Š"],
306
- ["Evaluate the file's quality and potential improvements πŸ’‘"],
307
- ["How can we practically utilize this file? 🎯"],
308
- ["Summarize the main content and derive key insights ✨"],
309
- ["Please continue with more detailed analysis πŸ“ˆ"],
310
- ],
311
- inputs=msg,
312
- )
 
 
 
 
 
 
 
 
 
313
 
314
  if __name__ == "__main__":
315
  demo.launch()
 
4
  import requests
5
  import json
6
  from typing import List, Tuple
7
+ import chardet
8
 
9
+ # -- LLM Client Class --
10
  class OllamaClient:
11
+ def __init__(self, model_name: str = "phi3:latest", base_url: str = "http://localhost:11434"):
12
  self.model_name = model_name
13
  self.base_url = base_url
14
 
15
+ def list_models(self):
16
+ """List all available models from Ollama server"""
17
+ try:
18
+ response = requests.get(f"{self.base_url}/api/tags")
19
+ if response.status_code == 200:
20
+ data = response.json()
21
+ return [model['name'] for model in data.get('models', [])]
22
+ return []
23
+ except Exception as e:
24
+ print(f"Error listing models: {e}")
25
+ return []
26
+
27
+ def chat_completion(self, messages, max_tokens=4000, stream=True, temperature=0.3, top_p=0.9):
28
  # Convert messages to Ollama format
29
  ollama_messages = []
30
  for msg in messages:
 
61
  decoded_line = line.decode('utf-8')
62
  try:
63
  chunk = json.loads(decoded_line)
64
+ if "message" in chunk and "content" in chunk["message"]:
65
+ yield {"content": chunk["message"]["content"]}
 
 
 
 
 
 
66
  except json.JSONDecodeError:
67
  continue
68
  else:
69
  result = response.json()
70
+ yield {"content": result["message"]["content"]}
 
 
 
 
 
 
71
 
72
+ # -- check content --
73
  def analyze_file_content(content, file_type):
74
  """Analyze file content and return structural summary"""
75
  if file_type in ['parquet', 'csv']:
76
  try:
77
  lines = content.split('\n')
78
  header = lines[0]
79
+ columns = header.count('|') - 1 if '|' in header else len(header.split(','))
80
  rows = len(lines) - 3
81
  return f"πŸ“Š Dataset Structure: {columns} columns, {rows} data samples"
82
  except:
 
96
  words = len(content.split())
97
  return f"πŸ“ Document Structure: {total_lines} lines, {paragraphs} paragraphs, ~{words} words"
98
 
99
+ # -- Basic stats on content --
100
+ def get_column_stats(df, col):
101
+ stats = {
102
+ 'type': str(df[col].dtype),
103
+ 'missing': df[col].isna().sum(),
104
+ 'unique': df[col].nunique()
105
+ }
106
+
107
+ if pd.api.types.is_numeric_dtype(df[col]):
108
+ stats.update({
109
+ 'min': df[col].min(),
110
+ 'max': df[col].max(),
111
+ 'mean': df[col].mean()
112
+ })
113
+ else:
114
+ stats['examples'] = df[col].dropna().head(3).tolist()
115
+
116
+ return stats
117
+
118
+ # -- Identify Encoding --
119
+ def detect_file_encoding(file_path):
120
+ """Improved encoding detection with fallback options"""
121
+ try:
122
+ with open(file_path, 'rb') as f:
123
+ rawdata = f.read(100000) # Read more data for better detection
124
+
125
+ # Try chardet first
126
+ result = chardet.detect(rawdata)
127
+ encoding = result['encoding']
128
+ confidence = result['confidence']
129
+
130
+ # If confidence is low, try some common encodings
131
+ if confidence < 0.9:
132
+ for test_encoding in ['utf-8', 'utf-16', 'latin1', 'cp1252']:
133
+ try:
134
+ rawdata.decode(test_encoding)
135
+ return test_encoding
136
+ except UnicodeDecodeError:
137
+ continue
138
+
139
+ return encoding if encoding else 'utf-8'
140
+ except Exception as e:
141
+ print(f"Encoding detection error: {e}")
142
+ return 'utf-8' # Default fallback
143
+
144
+ # -- Read file --
145
  def read_uploaded_file(file):
146
  if file is None:
147
  return "", ""
 
152
  df = pd.read_parquet(file.name, engine='pyarrow')
153
  content = df.head(10).to_markdown(index=False)
154
  return content, "parquet"
155
+
156
+ if file_ext == '.csv':
157
+ # First try to detect encoding
158
+ try:
159
+ encoding = detect_file_encoding(file.name)
160
+
161
+ # Try reading with different delimiters
162
+ delimiters = [',', ';', '\t', '|']
163
+ df = None
164
+ best_delimiter = ','
165
+ max_columns = 1
166
+
167
+ # First pass to find the best delimiter
168
+ for delimiter in delimiters:
169
+ try:
170
+ with open(file.name, 'r', encoding=encoding) as f:
171
+ first_line = f.readline()
172
+ current_columns = len(first_line.split(delimiter))
173
+ if current_columns > max_columns:
174
+ max_columns = current_columns
175
+ best_delimiter = delimiter
176
+ except:
177
+ continue
178
+
179
+ # Now read with the best found delimiter
180
  try:
181
+ df = pd.read_csv(
182
+ file.name,
183
+ encoding=encoding,
184
+ delimiter=best_delimiter,
185
+ on_bad_lines='warn',
186
+ engine='python',
187
+ quotechar='"'
188
+ )
189
+ except:
190
+ # Fallback to pandas auto-detection
191
+ df = pd.read_csv(file.name, encoding=encoding, on_bad_lines='warn')
192
+
193
+ if df is None or len(df.columns) < 1:
194
+ return "❌ Could not parse CSV file - no valid columns detected", "error"
195
+
196
+ # Generate comprehensive data summary
197
+ content = "πŸ“Š CSV Metadata:\n"
198
+ content += f"- Rows: {len(df):,}\n"
199
+ content += f"- Columns: {len(df.columns):,}\n"
200
+ content += f"- Missing Values: {df.isna().sum().sum():,}\n\n"
201
+
202
+ content += "πŸ” Column Details:\n"
203
+ for col in df.columns:
204
+ stats = get_column_stats(df, col)
205
+ content += f"### {col}\n"
206
+ content += f"- Type: {stats['type']}\n"
207
+ content += f"- Unique: {stats['unique']}\n"
208
+ content += f"- Missing: {stats['missing']}\n"
209
+
210
+ if 'examples' in stats:
211
+ content += f"- Examples: {stats['examples']}\n"
212
+ else:
213
+ content += (
214
+ f"- Range: {stats['min']} to {stats['max']}\n"
215
+ f"- Mean: {stats['mean']:.2f}\n"
216
+ )
217
+ content += "\n"
218
+
219
+ content += "πŸ“‹ Sample Data (First 3 Rows):\n"
220
+ content += df.head(3).to_markdown(index=False)
221
+
222
+ return content, "csv"
223
+ except Exception as e:
224
+ return f"❌ Error reading CSV file: {str(e)}", "error"
225
  else:
226
  encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
227
  for encoding in encodings:
 
243
  formatted_history.append({"role": "assistant", "content": assistant_msg})
244
  return formatted_history
245
 
246
+ def chat(message,
247
+ history,
248
+ uploaded_file,
249
+ system_message="",
250
+ max_tokens=4000,
251
+ temperature=0.3,
252
+ top_p=0.9,
253
+ selected_model="phi3:latest"):
254
+
255
+ system_prefix = """
256
+ You are an AI Data Scientist designed to provide expert guidance in data analysis, machine learning, and big data technologies, suitable for a wide range of users seeking data-driven insights and solutions.
257
+
258
+ Analyze the uploaded file in depth from the following perspectives:
259
+
260
+ 1. πŸ“‹ Overall file structure and format
261
+ 2. ⭐ Data Quality and completeness evaluation
262
+ 3. πŸ’‘ Suggested data fixes and improvements
263
+ 4. πŸ“ˆ Data characteristics, meaning and patterns
264
+ 5. πŸ“Š Key component analysis and potential segmentations
265
+ 6. 🎯 Insights and suggested persuasive storytelling
266
+
267
+ Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way.
268
+
269
+ Format the analysis results in Markdown and include specific examples where possible.
270
+ """
271
 
272
  if uploaded_file:
273
  content, file_type = read_uploaded_file(uploaded_file)
 
285
  message = f"""[Structure Analysis] {file_summary}
286
  Please provide detailed analysis from these perspectives:
287
  1. πŸ“‹ Overall file structure and format
288
+ 2. ⭐ Data Quality and completeness evaluation
289
+ 3. πŸ’‘ Suggested data fixes and improvements
290
+ 4. πŸ“ˆ Data characteristics, meaning and patterns
291
+ 5. πŸ“Š Key component analysis and potential segmentations
292
+ 6. 🎯 Insights and suggested persuasive storytelling"""
293
 
294
  messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
295
 
 
306
  messages.append({"role": "user", "content": message})
307
 
308
  try:
309
+ client = OllamaClient(model_name=selected_model)
310
  partial_message = ""
311
  current_history = []
312
 
313
+ for response in client.chat_completion(
314
  messages,
315
  max_tokens=max_tokens,
316
  stream=True,
317
  temperature=temperature,
318
  top_p=top_p,
319
  ):
320
+ token = response.get('content', '')
321
  if token:
322
  partial_message += token
323
  current_history = [
 
338
  footer {visibility: hidden}
339
  """
340
 
341
+ with gr.Blocks(theme="gstaff/xkcd",
342
+ css=css,
343
+ title="Offline Sensitive Survey Data Analysis") as demo:
344
  gr.HTML(
345
  """
346
  <div style="text-align: center; max-width: 800px; margin: 0 auto;">
347
+ <h1 style="font-size: 3em; font-weight: 600; margin: 0.5em;">Offline Sensitive Survey Data Analysis</h1>
348
+ <h3 style="font-size: 1.2em; margin: 1em;">Leveraging Ollama Inference Server</h3>
349
  </div>
350
  """
351
  )
352
 
353
+ # Store the current model in a state variable
354
+ current_model = gr.State("phi3:latest")
355
+
356
  with gr.Row():
357
  with gr.Column(scale=2):
358
  chatbot = gr.Chatbot(
359
+ height=500,
360
+ label="Chat Interface",
361
  type="messages"
362
  )
363
  msg = gr.Textbox(
364
  label="Type your message",
365
  show_label=False,
366
+ placeholder="Ask me anything about the uploaded data file... ",
367
  container=False
368
  )
369
  with gr.Row():
370
  clear = gr.ClearButton([msg, chatbot])
371
+ send = gr.Button("Send")
372
 
373
  with gr.Column(scale=1):
374
+ gr.Markdown("### Upload File \nSupport: CSV, Parquet files, Text")
375
  file_upload = gr.File(
376
  label="Upload File",
377
+ file_types=[".csv", ".parquet",".txt"],
378
  type="filepath"
379
  )
380
 
381
+ with gr.Accordion("Model Settings", open=False):
382
+ model_dropdown = gr.Dropdown(
383
+ label="Available Models",
384
+ choices=[],
385
+ interactive=True
386
+ )
387
+ refresh_models = gr.Button("Refresh List of Models")
388
+
389
  with gr.Accordion("Advanced Settings βš™οΈ", open=False):
390
+ system_message = gr.Textbox(label="Override System Message πŸ“", value="")
391
  max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens πŸ“Š")
392
+ temperature = gr.Slider(minimum=0, maximum=1, value=0.3, label="Temperature 🌑️")
393
  top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P πŸ“ˆ")
394
 
395
+ # Function to load available models
396
+ def load_models():
397
+ client = OllamaClient()
398
+ models = client.list_models()
399
+ return gr.Dropdown(choices=models, value=models[0] if models else "phi3:latest")
400
+
401
+ # Refresh models button click handler
402
+ refresh_models.click(
403
+ load_models,
404
+ outputs=model_dropdown
405
+ )
406
+
407
+ # Model dropdown change handler
408
+ model_dropdown.change(
409
+ lambda x: x,
410
+ inputs=model_dropdown,
411
+ outputs=current_model
412
+ )
413
+
414
+ # Load models when app starts
415
+ demo.load(
416
+ load_models,
417
+ outputs=model_dropdown
418
+ )
419
+
420
  # Event bindings
421
  msg.submit(
422
  chat,
423
+ inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
424
  outputs=[msg, chatbot],
425
  queue=True
426
  ).then(
 
431
 
432
  send.click(
433
  chat,
434
+ inputs=[msg, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
435
  outputs=[msg, chatbot],
436
  queue=True
437
  ).then(
 
440
  [msg]
441
  )
442
 
443
+ # Auto-analysis on file upload with this hidden component
444
+ auto_analyze_trigger = gr.Textbox(value="Analyze this file", visible=False)
445
  file_upload.change(
446
+ lambda: gr.Chatbot(value=[]), # Clear chat history
447
+ outputs=[chatbot],
448
+ queue=True
449
+ ).then(
450
  chat,
451
+ inputs=[auto_analyze_trigger, chatbot, file_upload, system_message, max_tokens, temperature, top_p, current_model],
452
  outputs=[msg, chatbot],
453
  queue=True
454
  )
455
 
456
+
457
  # Example queries
458
+ with gr.Column():
459
+ gr.Markdown("### Potential Follow-up Queries")
460
+ with gr.Row():
461
+ example_btns = [
462
+ gr.Button("Analyze open-ended responses for sentiment and recurring themes", size="lg", variant="secondary"),
463
+ gr.Button("Compare responses between different groups and identify potential segmentation or cluster analysis", size="lg", variant="secondary"),
464
+ gr.Button("Identify potential outcome variables and suggest a predicting model for it", size="lg", variant="secondary"),
465
+ gr.Button("Generate a Quarto notebook in Python to process this dataset", size="lg", variant="secondary"),
466
+ gr.Button("Generate a Rmd notebook in R to process this dataset", size="lg", variant="secondary"),
467
+
468
+ ]
469
+
470
+ # Add click handlers
471
+ for btn in example_btns:
472
+ btn.click(
473
+ lambda x: x,
474
+ inputs=[gr.Textbox(value=btn.value, visible=False)],
475
+ outputs=msg
476
+ )
477
+
478
 
479
  if __name__ == "__main__":
480
  demo.launch()