"""Streamlit app: an interactive tour of the machine-learning life cycle.

The landing page offers one button per implemented life-cycle stage (currently
only "Data Collection").  The Data Collection page explains structured,
unstructured and semi-structured data and, for every listed file format, shows
how to read it, what commonly goes wrong, and how to deal with it.
"""
import textwrap
from typing import Dict, NamedTuple, Tuple

import numpy as np
import pandas as pd
import streamlit as st


class FormatGuide(NamedTuple):
    """Everything shown for one file format on the Data Collection page."""

    description: str          # body of the "What is it?" paragraph
    snippet: str              # Python sample shown under "How to Read ... Files?"
    issues_label: str         # format name as spelled in the "Common Issues" title
    issues: Tuple[str, ...]   # one Markdown bullet per common issue
    fixes: Tuple[str, ...]    # one Markdown bullet per remedy
    button_label: str         # format name as spelled on the documentation button
    doc_stem: str             # file-name prefix of the notebook / PDF links


class DataTypeGuide(NamedTuple):
    """One data category together with the file formats listed under it."""

    subheader: str
    description: str
    formats: Dict[str, FormatGuide]  # radio label -> guide, in display order


def _snippet(text: str) -> str:
    """Return *text* without its common indentation or surrounding blank lines."""
    return textwrap.dedent(text).strip()


# Page content.  Dict insertion order is the order of the radio options.
DATA_TYPES: Dict[str, DataTypeGuide] = {
    "Structured": DataTypeGuide(
        subheader="Structured Data",
        description=(
            "Structured data is highly organized and easily searchable within databases. "
            "It includes rows and columns, such as in relational databases."
        ),
        formats={
            "Excel": FormatGuide(
                description=(
                    "Excel files are spreadsheets used to organize and analyze data in rows and columns. "
                    "They are widely used due to their user-friendly nature and support for various data types."
                ),
                snippet=_snippet(
                    """
                    import pandas as pd

                    # Reading an Excel file
                    df = pd.read_excel('file.xlsx')
                    print(df.head())
                    """
                ),
                issues_label="Excel",
                issues=(
                    "Missing or corrupted files",
                    "Version incompatibilities",
                    "Incorrect file paths",
                    "Handling large Excel files",
                ),
                fixes=(
                    "Use proper error handling with try-except.",
                    "Convert Excel files to CSV for better compatibility.",
                    "Use libraries like openpyxl or xlrd for specific Excel versions.",
                    "Break large files into smaller chunks for processing.",
                ),
                button_label="Excel",
                doc_stem="excel",
            ),
            "CSV": FormatGuide(
                description=(
                    "CSV (Comma-Separated Values) files store tabular data in plain text, where each line represents a record, "
                    "and fields are separated by commas."
                ),
                snippet=_snippet(
                    """
                    import pandas as pd

                    # Reading a CSV file
                    df = pd.read_csv('file.csv')
                    print(df.head())
                    """
                ),
                issues_label="CSV",
                issues=(
                    "Encoding issues (e.g., UTF-8, ISO-8859-1)",
                    "Inconsistent delimiters",
                    "Missing or corrupted files",
                    "Large file sizes causing memory errors",
                ),
                fixes=(
                    "Specify the correct encoding when reading files using encoding='utf-8' or similar.",
                    "Use libraries like csv or pandas to handle different delimiters.",
                    "Employ error handling to catch and manage missing/corrupted files.",
                    "Use chunking to read large files in smaller parts: pd.read_csv('file.csv', chunksize=1000).",
                ),
                button_label="CSV",
                doc_stem="csv",
            ),
        },
    ),
    "Unstructured": DataTypeGuide(
        subheader="Unstructured Data",
        description=(
            # The trailing space keeps the two sentences from running together.
            "Unstructured data refers to information that lacks a predefined format or organization, "
            "making it challenging to analyze using traditional tools. "
            "Examples include text, images, videos, audio, and social media posts."
        ),
        formats={
            "IMAGE": FormatGuide(
                description="Photos, medical scans, satellite images. ",
                snippet=_snippet(
                    """
                    from PIL import Image

                    image = Image.open('example.jpg')
                    image.show()
                    """
                ),
                issues_label="image",
                issues=(
                    "data augumentation and overfitting",
                    "image processing challenges",
                    "Data Imbalance",
                    "High Dimensionality",
                ),
                fixes=(
                    "Data Augumentaion.",
                    "Consistent image processing",
                    "Handling Class Imbalance.",
                    "Dimensionality Reduction and Feature Extraction",
                ),
                button_label="IMAGE",
                doc_stem="image",
            ),
            "VIDEO": FormatGuide(
                # Previously listed image formats (PNG, GIF, TIFF, ...) here.
                description="MP4,AVI,MOV,MKV,WMV ",
                snippet=_snippet(
                    """
                    # Install first (shell): pip install opencv-python
                    import cv2

                    # Open the video file
                    video_path = 'path_to_your_video.mp4'
                    cap = cv2.VideoCapture(video_path)
                    """
                ),
                issues_label="video",
                issues=(
                    "File not found or Corrupted.",
                    "Incompatible Codec or Format.",
                    "Performance Issues with Large Videos.",
                    "Frame Dropping or Skipping.",
                ),
                fixes=(
                    "Ensure Correct File Path and Handle Corrupted Files.",
                    "Install Missing Codecs or Use Supported Formats.",
                    "Optimize Performance for Large Videos",
                    "Control Frame Rate and Prevent Skipping",
                ),
                button_label="VIDEOS",
                doc_stem="videos",
            ),
            "AUDIO": FormatGuide(
                description="MP3,WAV,FLAC,AAC,OGG ",
                snippet=_snippet(
                    """
                    # Install first (shell): pip install librosa
                    import librosa

                    # Load the audio file
                    audio_path = 'path_to_audio_file.wav'
                    y, sr = librosa.load(audio_path, sr=None)  # sr=None to preserve the original sampling rate
                    """
                ),
                issues_label="audio",
                issues=(
                    "File not found or Corrupted.",
                    "Incompatible Codec or Format.",
                    "Memory Overload or Performance Issues with Large Audios.",
                    "Encoding or File Corruption Issues",
                ),
                fixes=(
                    "File Not Found or Corrupted: Always check if the file exists before attempting to load it. "
                    "Handle errors gracefully with try-except.",
                    "Incompatible Format or Codec: Use pydub or ffmpeg to handle multiple formats, "
                    "or convert the file to a more compatible format.",
                    "Memory Overload or Performance Issues: Process the audio in chunks or downsample large files "
                    "to reduce memory consumption.",
                    "Encoding or File Corruption Issues: Ensure proper encoding and re-encode files "
                    "using tools like ffmpeg if necessary.",
                ),
                button_label="AUDIO",
                doc_stem="audio",
            ),
        },
    ),
    "Semi-Structured": DataTypeGuide(
        subheader="Semi-structured Data",
        description=(
            "Semi-structured data is data that doesn’t fit into a rigid structure like relational databases "
            "but has some organizational properties, such as tags or key-value pairs, making it easier to analyze."
        ),
        formats={
            "JSON": FormatGuide(
                description=(
                    "JSON is a lightweight data-interchange format that uses key-value pairs. "
                    "It is commonly used in web services and APIs for exchanging data. "
                ),
                snippet=_snippet(
                    """
                    import json

                    # Open and read the JSON file
                    with open('data.json', 'r') as file:
                        data = json.load(file)
                    """
                ),
                issues_label="json",
                issues=(
                    "File Encoding Issues",
                    "Invalid JSON Syntax",
                    "Large JSON Files Causing Memory Issues",
                    "Inconsistent Data Structure",
                ),
                fixes=(
                    "Validate JSON Syntax: Use tools like JSONLint or json.decoder.JSONDecodeError in Python "
                    "to ensure valid JSON format.",
                    "Handle Encoding: Specify the encoding when opening the file in Python "
                    "(e.g., open('file.json', 'r', encoding='utf-8')).",
                    "Use Chunking or Streaming for Large Files: For large JSON files, load the file in chunks "
                    "or use libraries that support JSON streaming like ijson or jsonlines.",
                    "Consistent Structure: Ensure consistent data structure when creating JSON files, "
                    "or write code to handle missing or extra fields gracefully.",
                ),
                button_label="JSON",
                doc_stem="JSON",
            ),
            "XML": FormatGuide(
                description=(
                    "XML is a flexible, structured data format used to store and transport data, "
                    "utilizing tags to define elements, attributes, and hierarchical relationships "
                    "between different pieces of information. "
                ),
                snippet=_snippet(
                    """
                    import pandas as pd

                    pd.read_xml("Data_path")
                    """
                ),
                issues_label="XML",
                issues=(
                    "Invalid XML Syntax.",
                    "Encoding Issues.",
                    "Large XML Files.",
                    "Inconsistent Structure.",
                ),
                fixes=(
                    "Validate XML Syntax: Use XML validators and try-except blocks to catch and fix "
                    "syntax errors during parsing.",
                    "Handle Encoding Issues: Specify the encoding when reading files and use libraries "
                    "like chardet to detect encoding automatically.",
                    "Process Large Files Efficiently: Use streaming parsers (e.g., iterparse()) and "
                    "iterative parsing to handle large files without consuming too much memory.",
                    "Ensure Consistent Structure: Check for missing elements before accessing them and "
                    "handle inconsistencies with default values or conditional logic.",
                ),
                button_label="XML",
                doc_stem="XML",
            ),
        },
    ),
}


def _bullets(items: Tuple[str, ...]) -> str:
    """Format *items* as a Markdown bullet list, one item per line."""
    return "\n".join(f"- {item}" for item in items)


def _go_to(page: str) -> None:
    """Button callback: select which page the next script run renders."""
    st.session_state["page"] = page


def _render_format_guide(name: str, guide: FormatGuide) -> None:
    """Render the description, code sample, issues and remedies for one format."""
    st.subheader(f"{name} Data Format")
    st.write("*What is it?*")
    st.write(guide.description)

    st.write(f"*How to Read {name} Files?*")
    st.code(guide.snippet, language="python")

    st.write(f"*Common Issues When Handling {guide.issues_label} Files*")
    st.write(_bullets(guide.issues))

    st.write("*How to Overcome These Errors/Issues?*")
    st.write(_bullets(guide.fixes))

    # NOTE(review): the link targets are "path/to/..." placeholders; confirm
    # the real notebook/PDF locations before publishing.
    if st.button(f"Open {guide.button_label} Documentation"):
        st.write(
            f"Download the [documentation notebook](path/to/{guide.doc_stem}_notebook.ipynb) "
            f"or [PDF](path/to/{guide.doc_stem}_documentation.pdf)."
        )


def _render_home() -> None:
    """Render the landing page with one button per life-cycle stage."""
    st.subheader("Explore the Life Cycle Stages")
    # A callback updates the page *before* the rerun triggered by the click,
    # so the new page appears on the first click instead of the second.
    st.button("Data Collection", on_click=_go_to, args=("data_collection",))


def _render_data_collection() -> None:
    """Render the Data Collection stage: data types and their file formats."""
    st.title("Data Collection")
    st.button("Back to Home", on_click=_go_to, args=("home",))

    st.header("1. What is Data?")
    st.write(
        "Data refers to raw facts and figures that are collected, stored, and analyzed to derive insights. "
        "It serves as the foundation for any machine learning model."
    )

    st.header("2. Types of Data")
    data_type = st.radio("Select a type of data to learn more:", tuple(DATA_TYPES))
    type_guide = DATA_TYPES[data_type]

    st.subheader(type_guide.subheader)
    st.write(type_guide.description)
    st.write("Data Formats:")
    format_selected = st.radio(
        "Select a format to explore further:", tuple(type_guide.formats)
    )
    _render_format_guide(format_selected, type_guide.formats[format_selected])


# Maps the value stored in st.session_state["page"] to its renderer.
_PAGES = {
    "home": _render_home,
    "data_collection": _render_data_collection,
}

# Injects a single-space HTML string; presumably a placeholder for custom
# CSS/HTML -- TODO confirm whether it is still needed.
st.markdown(""" """, unsafe_allow_html=True)

# Navigation
st.title("Life Cycle of ML")
if "page" not in st.session_state:
    st.session_state["page"] = "home"

# Main Navigation: an unrecognised page value renders nothing below the title.
_render_page = _PAGES.get(st.session_state["page"])
if _render_page is not None:
    _render_page()