Update pages/6.Data Collection.py
Browse files- pages/6.Data Collection.py +102 -1
pages/6.Data Collection.py
CHANGED
@@ -21,4 +21,105 @@ if data_type == "Structured":
|
|
21 |
format_selected = st.radio(
|
22 |
"Select a data format to learn more:", ["Excel", "CSV", "SQL Databases"]
|
23 |
)
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
format_selected = st.radio(
    "Select a data format to learn more:", ["Excel", "CSV", "SQL Databases"]
)


def _offer_guide_download(label, guide_path):
    """Render a download button for the notebook stored at ``guide_path``.

    The download button is rendered directly instead of behind an
    ``st.button`` gate: ``st.button`` is only truthy for the single rerun
    that follows its click, so a widget nested under it needs two clicks
    and vanishes again on the next rerun.

    Args:
        label: Text shown on the download button.
        guide_path: Path of the notebook to serve; also used as the
            suggested file name for the download.
    """
    try:
        # Read the bytes up front so no open file handle outlives the block.
        with open(guide_path, "rb") as guide_file:
            guide_bytes = guide_file.read()
    except FileNotFoundError:
        # A missing guide should not take the whole page down.
        st.warning(f"The guide file `{guide_path}` is not available.")
        return

    st.download_button(
        label=label,
        data=guide_bytes,
        file_name=guide_path,
        mime="application/octet-stream",
    )


# NOTE(review): no branch for "SQL Databases" is visible in this chunk, so
# selecting it renders nothing below the radio.

# Excel Format Section
if format_selected == "Excel":
    st.write("#### Excel Format")

    # Part (a) What it is
    st.subheader("What is Excel?")
    st.write("""
    Excel is a popular file format used for storing structured data in tabular form.
    It is commonly used in business and data analysis, and its file extensions include `.xlsx` and `.xls`.
    """)

    # Part (b) How to read these files
    st.subheader("How to Read Excel Files?")
    st.code("""
    import pandas as pd
    # Read an Excel file
    df = pd.read_excel("file.xlsx")
    print(df.head())
    """)

    # Part (c) Issues encountered
    st.subheader("Common Issues Encountered When Handling Excel Files")
    st.write("""
    - **Missing Data**: Some cells may contain empty or null values.
    - **Encoding Problems**: Files saved in non-standard formats may have encoding issues.
    - **File Corruption**: The file may become unreadable if improperly saved or transferred.
    - **Large Files**: Handling very large Excel files may exceed memory limits.
    """)

    # Part (d) How to overcome these errors/issues
    # The encoding tip no longer suggests `encoding=` for Excel:
    # `pd.read_excel` does not accept that argument.
    st.subheader("How to Overcome These Issues?")
    st.write("""
    - **Missing Data**: Use data imputation techniques to fill in missing values.
    - **Encoding Problems**: `pd.read_excel` has no `encoding` argument; re-save the workbook as `.xlsx`, or export it to CSV and read it with `pd.read_csv(..., encoding='utf-8')`.
    - **File Corruption**: Use repair tools or convert to a compatible format like CSV.
    - **Large Files**: Process the file in chunks using `pandas` or optimize it using external tools.
    """)

    # Downloadable Guide Button
    st.markdown("### Download Coding Guide:")
    # Ensure this file exists in the app directory
    _offer_guide_download("Download Excel Guide", "Excel_guide.ipynb")

# CSV Format Section
elif format_selected == "CSV":
    st.write("#### CSV Format")

    # Part (a) What it is
    st.subheader("What is CSV?")
    st.write("""
    CSV (Comma-Separated Values) is a lightweight text file format for structured data,
    where values are separated by commas. It is widely used for data exchange between systems.
    """)

    # Part (b) How to read these files
    st.subheader("How to Read CSV Files?")
    st.code("""
    import pandas as pd
    # Read a CSV file
    df = pd.read_csv("file.csv")
    print(df.head())
    """)

    # Part (c) Issues encountered
    st.subheader("Common Issues Encountered When Handling CSV Files")
    st.write("""
    - **Misaligned Rows**: Extra or missing delimiters can lead to misaligned rows.
    - **Encoding Problems**: Non-standard characters may cause encoding errors.
    - **Large Files**: Processing large CSV files can be resource-intensive.
    """)

    # Part (d) How to overcome these errors/issues
    st.subheader("How to Overcome These Issues?")
    st.write("""
    - **Misaligned Rows**: Use a consistent delimiter and validate the file before processing.
    - **Encoding Problems**: Explicitly specify the encoding format, e.g., `encoding='utf-8'`.
    - **Large Files**: Process the file in chunks using `pandas` (`chunksize` parameter).
    """)

    # Downloadable Guide Button
    st.markdown("### Download Coding Guide:")
    # Ensure this file exists in the app directory
    _offer_guide_download("Download CSV Guide", "CSV_guide.ipynb")
|
124 |
+
|
125 |
+
# Add similar sections for "Unstructured" and "Semi-Structured" data types as needed.
|