openfree committed on
Commit
fa41b98
·
verified ·
1 Parent(s): ac2ccfd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -34
app.py CHANGED
@@ -1,19 +1,26 @@
1
  import gradio as gr
2
  import pandas as pd
 
3
  from io import BytesIO
4
 
 
 
 
 
 
 
5
  def convert_file(input_file, conversion_type):
6
  # Check if a file was uploaded
7
  if input_file is None:
8
- raise ValueError("Please upload a file.")
9
-
10
- # Determine if input_file is a file-like object or a file path string.
11
  try:
12
  # Try reading from file-like object
13
  file_bytes = input_file.read()
14
  file_name = input_file.name
15
  except AttributeError:
16
- # If there's an AttributeError, treat input_file as a file path.
17
  file_name = input_file
18
  with open(file_name, "rb") as f:
19
  file_bytes = f.read()
@@ -22,34 +29,62 @@ def convert_file(input_file, conversion_type):
22
  df = None
23
  output_file = None
24
  converted_format = None
25
-
26
- # Conversion: CSV to Parquet
27
- if conversion_type == "CSV to Parquet":
28
- if file_extension != "csv":
29
- raise ValueError("For CSV to Parquet conversion, please upload a CSV file.")
30
- df = pd.read_csv(BytesIO(file_bytes))
31
- output_file = "output.parquet"
32
- df.to_parquet(output_file, index=False)
33
- converted_format = "Parquet"
34
- # Conversion: Parquet to CSV
35
- elif conversion_type == "Parquet to CSV":
36
- if file_extension != "parquet":
37
- raise ValueError("For Parquet to CSV conversion, please upload a Parquet file.")
38
- df = pd.read_parquet(BytesIO(file_bytes))
39
- output_file = "output.csv"
40
- df.to_csv(output_file, index=False)
41
- converted_format = "CSV"
42
- else:
43
- raise ValueError("Invalid conversion type selected.")
44
-
45
- # Generate a preview of the top 10 rows
46
- preview = df.head(10).to_string(index=False)
47
- info_message = (
48
- f"Input file: {file_name}\n"
49
- f"Converted file format: {converted_format}\n\n"
50
- f"Preview (Top 10 Rows):\n{preview}"
51
- )
52
- return output_file, info_message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  # Custom CSS for a modern and sleek look
55
  custom_css = """
@@ -93,7 +128,11 @@ with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
93
  with gr.Column(scale=1):
94
  input_file = gr.File(label="Upload CSV or Parquet File")
95
  with gr.Column(scale=1):
96
- conversion_type = gr.Radio(choices=["CSV to Parquet", "Parquet to CSV"], label="Conversion Type")
 
 
 
 
97
 
98
  convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
99
 
@@ -102,5 +141,13 @@ with gr.Blocks(css=custom_css, title="CSV <-> Parquet Converter") as demo:
102
  preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
103
 
104
  convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
 
 
 
 
 
 
 
105
 
106
- demo.launch()
 
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import chardet
4
  from io import BytesIO
5
 
6
def detect_encoding(file_bytes):
    """Guess the text encoding of raw file content.

    Args:
        file_bytes: The raw bytes of the file to inspect.

    Returns:
        The encoding name reported by chardet, or ``"utf-8"`` when chardet
        has no guess.
    """
    result = chardet.detect(file_bytes)
    # chardet reports an encoding of None when it cannot make a guess
    # (e.g. empty input); fall back to UTF-8 so callers always receive a
    # usable codec name instead of None.
    return result['encoding'] or "utf-8"
11
+
12
  def convert_file(input_file, conversion_type):
13
  # Check if a file was uploaded
14
  if input_file is None:
15
+ return None, "Please upload a file."
16
+
17
+ # Read the file content
18
  try:
19
  # Try reading from file-like object
20
  file_bytes = input_file.read()
21
  file_name = input_file.name
22
  except AttributeError:
23
+ # If there's an AttributeError, treat input_file as a file path
24
  file_name = input_file
25
  with open(file_name, "rb") as f:
26
  file_bytes = f.read()
 
29
  df = None
30
  output_file = None
31
  converted_format = None
32
+
33
+ try:
34
+ # Conversion: CSV to Parquet
35
+ if conversion_type == "CSV to Parquet":
36
+ if file_extension != "csv":
37
+ return None, "For CSV to Parquet conversion, please upload a CSV file."
38
+
39
+ # Detect the encoding of the CSV file
40
+ encoding = detect_encoding(file_bytes)
41
+
42
+ # Try to read with detected encoding
43
+ try:
44
+ df = pd.read_csv(BytesIO(file_bytes), encoding=encoding)
45
+ except Exception as e:
46
+ # If that fails, try with other common encodings
47
+ for enc in ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']:
48
+ try:
49
+ df = pd.read_csv(BytesIO(file_bytes), encoding=enc)
50
+ encoding = enc
51
+ break
52
+ except:
53
+ continue
54
+ if df is None:
55
+ return None, f"Failed to read CSV with any encoding. Error: {str(e)}"
56
+
57
+ output_file = "output.parquet"
58
+ df.to_parquet(output_file, index=False)
59
+ converted_format = "Parquet"
60
+
61
+ # Conversion: Parquet to CSV
62
+ elif conversion_type == "Parquet to CSV":
63
+ if file_extension != "parquet":
64
+ return None, "For Parquet to CSV conversion, please upload a Parquet file."
65
+
66
+ df = pd.read_parquet(BytesIO(file_bytes))
67
+ output_file = "output.csv"
68
+ df.to_csv(output_file, index=False, encoding='utf-8')
69
+ converted_format = "CSV"
70
+ else:
71
+ return None, "Invalid conversion type selected."
72
+
73
+ # Generate a preview of the top 10 rows
74
+ preview = df.head(10).to_string(index=False)
75
+ info_message = (
76
+ f"Input file: {file_name}\n"
77
+ f"Converted file format: {converted_format}\n"
78
+ )
79
+ if conversion_type == "CSV to Parquet":
80
+ info_message += f"Detected encoding: {encoding}\n"
81
+
82
+ info_message += f"\nPreview (Top 10 Rows):\n{preview}"
83
+
84
+ return output_file, info_message
85
+
86
+ except Exception as e:
87
+ return None, f"Error during conversion: {str(e)}"
88
 
89
  # Custom CSS for a modern and sleek look
90
  custom_css = """
 
128
  with gr.Column(scale=1):
129
  input_file = gr.File(label="Upload CSV or Parquet File")
130
  with gr.Column(scale=1):
131
+ conversion_type = gr.Radio(
132
+ choices=["CSV to Parquet", "Parquet to CSV"],
133
+ label="Conversion Type",
134
+ value="CSV to Parquet" # Set default value
135
+ )
136
 
137
  convert_button = gr.Button("Convert", elem_classes=["gradio-button"])
138
 
 
141
  preview = gr.Textbox(label="Preview (Top 10 Rows)", lines=15)
142
 
143
  convert_button.click(fn=convert_file, inputs=[input_file, conversion_type], outputs=[output_file, preview])
144
+
145
+ gr.Markdown("""
146
+ ### Notes:
147
+ - This converter can handle various CSV encodings
148
+ - Parquet files are always encoded in UTF-8
149
+ - The preview shows only the first 10 rows of data
150
+ """)
151
 
152
+ if __name__ == "__main__":
153
+ demo.launch()