Spaces:

hi-paris
/

app-ai-ds-hec

Sleeping

App Files Files Community

laudavid commited on May 28, 2024

Commit

ce45b2a

1 Parent(s): ae65ca1

latest version of app

Browse files

Files changed (7) hide show

main_page.py +19 -17
pages/go_further.py +58 -40
pages/object_detection.py +8 -7
pages/recommendation_system.py +1 -1
pages/supervised_unsupervised_page.py +30 -22
pages/timeseries_analysis.py +3 -3
pages/topic_modeling.py +1 -1

main_page.py CHANGED Viewed

@@ -44,21 +44,21 @@ col1, col2 = st.columns([0.65,0.35], gap="medium")
 with col1:
     st.title("AI and Data Science Examples")
     st.subheader("HEC Paris, 2023-2024")
-    st.markdown("""**Course provided by Shirish C. SRIVASTAVA** <br>
-                **Hi! PARIS Engineering team**: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN""", unsafe_allow_html=True)
 #st.markdown("in collaboration with Hi! PARIS engineers: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN")
-with col2:
 #Hi! PARIS collaboration mention
-    st.markdown("  ")
-    st.markdown("  ")
-    st.markdown("  ")
-    image_hiparis = Image.open('images/hi-paris.png')
-    st.image(image_hiparis, width=150)
     url = "https://www.hi-paris.fr/"
     #st.markdown("This app was funded by the Hi! PARIS Center")
-    st.markdown("""###### **Made in collaboration with [Hi! PARIS](%s)** """ % url, unsafe_allow_html=True)
 st.markdown(" ")
@@ -114,23 +114,25 @@ show_pages(
 st.header("About the app")
-st.info("""The **AI and Data Science Examples** app was created as a tool to introduce students to the field of Data Science by showcasing real-life applications of AI.
-        It includes use cases using traditional Machine Learning algorithms on structured data, as well as models that analyze unstructured data (text, images,...).""")
 st.markdown(" ")
-st.markdown("""The app is structured into three sections:
-- 1️⃣ **Machine Learning**: This first section covers use cases where structured data (data in a tabular format) is fed to an AI model.
             You will find pages on *Supervised/Unsupervised Learning*, *Time Series Forecasting* and AI powered *Recommendation Systems*.
 - 2️⃣ **Natural Language Processing** (NLP): This second section showcases AI applications where large amounts of text data is analyzed using Deep Learning models.
-            Pages on *Topic Modeling* and *Sentiment Analysis*, which are types of NLP models, can be found in this section.
-- 3️⃣ **Computer Vision**: This final section covers a sub-field of AI called Computer Vision which deals with image/video data.
     The field of Computer Vision includes *Image classification* and *Object Detection*, which are both featured in this section.
             """)
 st.image("images/ML_domains.png",
-         caption="""This figure showcases a selection of sub-fields in Artificial Intelligence, such as traditional
-                    Machine Learning, NLP, Computer Vision and Robotics.""")
 # st.markdown(" ")

 with col1:
     st.title("AI and Data Science Examples")
     st.subheader("HEC Paris, 2023-2024")
+    # st.markdown("""**Course provided by Shirish C. SRIVASTAVA** <br>
+    #             **Hi! PARIS Engineering team**: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN""", unsafe_allow_html=True)
 #st.markdown("in collaboration with Hi! PARIS engineers: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN")
+# with col2:
 #Hi! PARIS collaboration mention
+    # st.markdown("  ")
+    # st.markdown("  ")
+    #st.markdown("  ")
     url = "https://www.hi-paris.fr/"
     #st.markdown("This app was funded by the Hi! PARIS Center")
+    st.markdown("""###### **The app was made in collaboration with [Hi! PARIS](%s)** """ % url, unsafe_allow_html=True)
+    image_hiparis = Image.open('images/hi-paris.png')
+    st.image(image_hiparis, width=150)
 st.markdown(" ")
 st.header("About the app")
+st.info("""The goal of the **AI and Data Science Examples** is to give an introduction to Data Science by showcasing real-life applications.
+        The app includes use cases using traditional Machine Learning algorithms on structured data, as well as models that analyze unstructured data (text, images,...).""")
 st.markdown(" ")
+st.markdown("""The app contains four sections:
+- 1️⃣ **Machine Learning**: This first section covers use cases where structured data (data in a tabular format) is used to train an AI model.
             You will find pages on *Supervised/Unsupervised Learning*, *Time Series Forecasting* and AI powered *Recommendation Systems*.
 - 2️⃣ **Natural Language Processing** (NLP): This second section showcases AI applications where large amounts of text data are analyzed using Deep Learning models.
+            Pages on *Topic Modeling* and *Sentiment Analysis*, which are different kinds of NLP models, can be found in this section.
+- 3️⃣ **Computer Vision**: This third section covers a sub-field of AI called Computer Vision, which deals with image/video data.
     The field of Computer Vision includes *Image classification* and *Object Detection*, which are both featured in this section.
+- 🚀 **Go further**: In the final section, you will gain a deeper understanding of AI models and how they function.
+            The page features multiple models to try, as well as different datasets to train a model on.
             """)
 st.image("images/ML_domains.png",
+         caption="""This figure showcases a selection of sub-fields of AI, which includes
+                    Machine Learning, NLP and Computer Vision.""")
 # st.markdown(" ")

pages/go_further.py CHANGED Viewed

@@ -43,11 +43,11 @@ def model_training(X, y, model_dict, _num_transformer=MinMaxScaler(),
         model_sklearn = KNeighborsClassifier(n_neighbors=param)
     if model == "Decision Tree 🌳":
-        model_sklearn = DecisionTreeClassifier(max_depth=param)
         explainability = True
     if model == "Random Forest 🏕️":
-        model_sklearn = RandomForestClassifier(max_depth=param)
         explainability = True
@@ -125,19 +125,15 @@ scores = np.diag(cm)
 st.image("images/ML_header.jpg")
 st.markdown("# Go further 🚀")
-st.markdown("""This page allows you to test and compare the results of different AI models, and gain a deeper understanding of how they function. <br>
             It includes three different types of **classification models** with Python code illustrations, as well as four datasets to choose from.
-            """, unsafe_allow_html=True)
-# st.markdown("""**Reminder**: Classification models are AI models that are trained to predict a finite number of values/categories.
-#         Examples can be found in the *Supervised vs Unsupervised* page with the credit score classification and customer churn prediction use cases.""")
-st.warning("""**Note**: Different types of models exists for most Machine Learning tasks.
-           Models tend to vary in complexity and picking which one to train for a specific use case isn't always straightforward.
-           Complex model might output better results but take longer to make predictions.
-           The model selection step requires a good amount of testing by practitioners.""")
-st.markdown("""All of the classification models used in this page come from `scikit-learn`, which is a popular Data Science library in Python.""")
 try:
     st.link_button("Go to the scikit-learn website", "https://scikit-learn.org/stable/index.html")
 except:
@@ -155,17 +151,20 @@ st.markdown("""**Reminder**: Classification models are AI models that are traine
 st.markdown("  ")
 st.markdown("  ")
 ########################## SELECT A DATASET ###############################
 st.markdown("### Select a dataset 📋")
-st.markdown("""To perform the classification task, you can choose between three different datasets: **Wine quality**, **Titanic** and **Car evaluation**. <br>
             Each dataset will be shown in its original format and will go through pre-processing steps to insure its quality and usability for the chosen model.
             """, unsafe_allow_html=True)
 st.warning("""**Note:** The performance of a Machine Learning model is sensitive to the data being used to train it.
     Data cleaning and pre-processing are usually as important as training the AI model. These steps can include removing missing values, identifying outliers and transforming columns from text to numbers.""")
-select_data = st.selectbox("Choose an option", ["Wine quality 🍷", "Titanic 🚢", "Car evaluation 🚙", "Diabetes 👩‍⚕️"]) #label_visibility="collapsed")
 st.markdown(" ")
 if select_data =="Wine quality 🍷":
@@ -259,7 +258,7 @@ if select_data == "Car evaluation 🚙":
 - **Evaluation**: Evaluation level (unacceptable, acceptable)""")
-if select_data == "Diabetes 👩‍⚕️":
     # Load data and clean it
     data = load_data_csv(path_data, "diabetes.csv")
     data["Outcome"] = data["Outcome"].map({1:"Yes", 0:"No"})
@@ -299,6 +298,8 @@ st.markdown(" ")
 st.markdown(" ")
 ########################## SELECT A MODEL ###############################
 st.markdown("### Select a model 📚")
@@ -306,6 +307,11 @@ st.markdown("""You can choose between three types of classification models: **K
     For each model, you will be given a short explanation as to how they function.
     """, unsafe_allow_html=True)
 select_model = st.selectbox("**Choose an option**", ["K-nearest-neighbor 🏘️", "Decision Tree 🌳", "Random Forest 🏕️"])
 st.markdown(" ")
@@ -313,19 +319,20 @@ st.markdown(" ")
 if select_model == "K-nearest-neighbor 🏘️":
     #st.markdown("#### Model: K-nearest-neighbor")
     st.info("""**About the model**: K-nearest-neighbor (or KNN) is a type of classification model that uses neighboring points to classify new data.
-            When trying to predict a class to new data points, the algorithm will look at points in close proximity (or in its neighborhood) to make a decision.
-            The most common class among its neighborhood will then be assigned to the data point.""")
     select_param = 6
     model_dict = {"model":select_model, "param":select_param}
-    learn_model = st.checkbox("Learn more", key="knn")
     if learn_model:
         st.markdown("""An important parameter in KNN algorithms is the number of points to choose as neighboors. <br>
                 The image below shows two cases where the number of neighboors (k) are equal to 3 and 6.
-- When k is equal to 3, the most common class is **Classe B**. The red point will then be predicted as Classe B.
-- When k is equal to 6, the  the most common class is **Classe A**. The red point will then be predicted as Classe A.""",
                 unsafe_allow_html=True)
         st.image("images/knn.png", width=600)
         st.markdown("""K-nearest-neighbor algorithm are popular for their simplicity. <br>
                         This can be a drawback for use cases/dataset that require a more complex approach to make accurate predictions.""", unsafe_allow_html=True)
@@ -339,15 +346,15 @@ if select_model == "Decision Tree 🌳":
     st.info("""**About the model**: Decision trees are classification model that split the prediction task into a succession of decisions, each with only two possible outcomes.
             These decisions can be visualized as a tree, with data points arriving from the top of the tree and landing at final "prediction regions".""")
-    select_param = None
     model_dict = {"model":select_model, "param":select_param}
-    learn_model = st.checkbox("Learn more", key="tree")
     if learn_model:
-        st.markdown("""The following image showcases a decision tree that was built to predict whether a **bank should give out a loan** to a client. <br>
                     The data used to train the model has each client's **age**, **salary** and **number of children**.""", unsafe_allow_html=True)
-        st.markdown("""To predict whether a client gets a loan, the client's data goes through each 'question' in the tree and **gets assigned the class of the region it fell into**. <br>
                     For example, a client that is under 30 years old and has a lower salary than 2500$ will not be awarded a loan by the model.""", unsafe_allow_html=True)
         st.image("images/decisiontree.png", width=800)
@@ -363,15 +370,15 @@ if select_model == "Decision Tree 🌳":
 if select_model == "Random Forest 🏕️":
     st.info("""**About the model:** Random Forest models generate multiple decision tree models to make predictions.
             The main drawback of decision trees is that their predictions can be unstable, meaning that their output often changes.
-            Random Forest models aggregate the predictions of multiple decision trees to reduce this unstability and improve robustness.""")
-    select_param = None
     model_dict = {"model":select_model, "param":select_param}
-    learn_model = st.checkbox("Learn more", key="tree")
     if learn_model:
-        st.markdown("""Random Forests classifiers aggregate results by apply **majority voting**, which means selecting the class that was most often predicted by trees as the final prediction.
-                    In the following image, the random forest model built four decision trees, who each have made their own final prediction. <br>"""
                     , unsafe_allow_html=True)
         st.markdown("""Class C was predicted twice, whereas Class B et D where only predicted once. <br>
@@ -401,28 +408,36 @@ st.markdown(f"""You've selected the **{select_data}** dataset and the **{select_
 run_model = st.button("Run model", type="primary")
 if run_model:
     score, feature_imp, feature_names, labels = model_training(X, y, model_dict, _num_transformer=StandardScaler())
     if select_model in ["Decision Tree 🌳", "Random Forest 🏕️"]: # show explainability for decision tree, random firest
-        tab1, tab2 = st.tabs(["Accuracy", "Explainability"])
         with tab1:
-            if select_data == "Diabetes 👩‍⚕️":
-                st.error("""**Important**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
                             This small number of patient data explains why the model's performance isn't optimal.
-                            Additional data collection should be conducted to improve results, as well as hyperparameter tuning (see explanation after graph).""")
             score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
-            fig = px.bar(score_df, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)
             st.plotly_chart(fig, use_container_width=True)
             st.warning("""**Note**: To improve the results of a model, practionners often conduct *hyperparameter tuning*.
                         It consists of trying different combination of the model's parameters to maximise the accuracy score.
                         Hyperparameter tuning wasn't conduct here in order to insure the app doesn't lag.""")
         with tab2:
             df_feature_imp = pd.DataFrame({"variable":feature_names, "importance":feature_imp})
             df_feature_imp = df_feature_imp.groupby("variable").mean().reset_index()
@@ -434,14 +449,16 @@ if run_model:
     else: # only show results for knn
         st.markdown("#### Results")
         st.markdown("""The K-nearest-neighbor algorithm doesn't have a built-in solution to compute model explainability with `scikit-learn`.
-                    You can use other python packages such as `SHAP` to compute explainability, which we didn't use  here since they usually take a long time to output results.""")
-        if select_data == "Diabetes 👩‍⚕️":
-                st.error("""**Important**: Note that Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
                             This small number of patient data explains why the model's performance isn't optimal.
-                            Additional data collection should be conducted to improve results, as well as hyperparameter tuning (see explanation after graph).""")
         score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
         fig = px.bar(score_df, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)
@@ -458,3 +475,4 @@ if run_model:

         model_sklearn = KNeighborsClassifier(n_neighbors=param)
     if model == "Decision Tree 🌳":
+        model_sklearn = DecisionTreeClassifier(max_depth=param, class_weight="balanced")
         explainability = True
     if model == "Random Forest 🏕️":
+        model_sklearn = RandomForestClassifier(max_depth=param, )#class_weight="balanced_subsample")
         explainability = True
 st.image("images/ML_header.jpg")
 st.markdown("# Go further 🚀")
+st.markdown("""This page allows you to test and compare results between different AI models, and gain a deeper understanding of how they make predictions. <br>
             It includes three different types of **classification models** with Python code illustrations, as well as four datasets to choose from.
+**Explainability** is also given for most models.
+These results give an indication on which variable had the most impact on the model's final prediction. <br>
+Note that each model has its own way of measuring explainability, which makes comparisons between model explainabilities difficult.
+All of the classification models used in this page come from `scikit-learn`, which is a popular Data Science library in Python.
+            """, unsafe_allow_html=True)
 try:
     st.link_button("Go to the scikit-learn website", "https://scikit-learn.org/stable/index.html")
 except:
 st.markdown("  ")
 st.markdown("  ")
 ########################## SELECT A DATASET ###############################
 st.markdown("### Select a dataset 📋")
+st.markdown("""To perform the classification task, you can choose between four different datasets: **Titanic**, **Car evaluation**, **Wine quality** and **Diabetes prevention**. <br>
             Each dataset will be shown in its original format and will go through pre-processing steps to ensure its quality and usability for the chosen model.
             """, unsafe_allow_html=True)
 st.warning("""**Note:** The performance of a Machine Learning model is sensitive to the data being used to train it.
     Data cleaning and pre-processing are usually as important as training the AI model. These steps can include removing missing values, identifying outliers and transforming columns from text to numbers.""")
+select_data = st.selectbox("Choose an option", ["Titanic 🚢", "Car evaluation 🚙", "Wine quality 🍷", "Diabetes prevention 👩‍⚕️"]) #label_visibility="collapsed")
 st.markdown(" ")
 if select_data =="Wine quality 🍷":
 - **Evaluation**: Evaluation level (unacceptable, acceptable)""")
+if select_data == "Diabetes prevention 👩‍⚕️":
     # Load data and clean it
     data = load_data_csv(path_data, "diabetes.csv")
     data["Outcome"] = data["Outcome"].map({1:"Yes", 0:"No"})
 st.markdown(" ")
 ########################## SELECT A MODEL ###############################
 st.markdown("### Select a model 📚")
     For each model, you will be given a short explanation as to how they function.
     """, unsafe_allow_html=True)
+st.warning("""**Note**: Different types of models exist for most Machine Learning tasks.
+           Models tend to vary in complexity and picking which one to train for a specific use case isn't always straightforward.
+           Complex models might output better results but take longer to make predictions.
+           The model selection step requires a good amount of testing by practitioners.""")
 select_model = st.selectbox("**Choose an option**", ["K-nearest-neighbor 🏘️", "Decision Tree 🌳", "Random Forest 🏕️"])
 st.markdown(" ")
 if select_model == "K-nearest-neighbor 🏘️":
     #st.markdown("#### Model: K-nearest-neighbor")
     st.info("""**About the model**: K-nearest-neighbor (or KNN) is a type of classification model that uses neighboring points to classify new data.
+            When trying to predict the class of a new data point, the algorithm will look at points in close proximity (or in its neighborhood) to make a decision.
+            The most common class in the point's neighborhood will then be chosen as the final prediction.""")
     select_param = 6
     model_dict = {"model":select_model, "param":select_param}
+    learn_model = st.checkbox("Learn more about the model", key="knn")
     if learn_model:
         st.markdown("""An important parameter in KNN algorithms is the number of points to choose as neighbors. <br>
                 The image below shows two cases where the number of neighbors (k) is equal to 3 and 6.
+- When k is equal to 3 (the small dotted circle in the image below), the most common class is **Class B**. The red point will then be predicted as Class B.
+- When k is equal to 6 (the large dotted circle in the image below), the most common class is **Class A**. The red point will then be predicted as Class A.""",
                 unsafe_allow_html=True)
         st.image("images/knn.png", width=600)
         st.markdown("""K-nearest-neighbor algorithms are popular for their simplicity. <br>
                         This can be a drawback for use cases/dataset that require a more complex approach to make accurate predictions.""", unsafe_allow_html=True)
     st.info("""**About the model**: Decision trees are classification models that split the prediction task into a succession of decisions, each with only two possible outcomes.
             These decisions can be visualized as a tree, with data points arriving from the top of the tree and landing at final "prediction regions".""")
+    select_param = 8
     model_dict = {"model":select_model, "param":select_param}
+    learn_model = st.checkbox("Learn more about the model", key="tree")
     if learn_model:
+        st.markdown("""The following image showcases a decision tree which predicts whether a **bank should give out a loan** to a client. <br>
                     The data used to train the model has each client's **age**, **salary** and **number of children**.""", unsafe_allow_html=True)
+        st.markdown("""To predict whether a client gets a loan, the client's data goes through each 'node' in the tree (nodes are the blue question boxes in the image below) and **gets assigned the class of the final leaf it fell into** (either Get loan or Don't get loan).
                     For example, a client that is under 30 years old and has a lower salary than 2500$ will not be awarded a loan by the model.""", unsafe_allow_html=True)
         st.image("images/decisiontree.png", width=800)
 if select_model == "Random Forest 🏕️":
     st.info("""**About the model:** Random Forest models generate multiple decision tree models to make predictions.
             The main drawback of decision trees is that their predictions can be unstable, meaning that their output often changes.
+            Random Forest models combine the predictions of multiple decision trees to reduce this instability and improve robustness.""")
+    select_param = 8
     model_dict = {"model":select_model, "param":select_param}
+    learn_model = st.checkbox("Learn more about the model", key="tree")
     if learn_model:
+        st.markdown("""Random Forests classifiers combine the results of multiple trees by applying **majority voting**, which means selecting the class that was most often predicted by trees as the final prediction.
+                    In the following image, the random forest model built four decision trees, each of which has made its own class prediction. <br>"""
                     , unsafe_allow_html=True)
         st.markdown("""Class C was predicted twice, whereas Classes B and D were only predicted once. <br>
 run_model = st.button("Run model", type="primary")
+st.markdown("  ")
 if run_model:
     score, feature_imp, feature_names, labels = model_training(X, y, model_dict, _num_transformer=StandardScaler())
     if select_model in ["Decision Tree 🌳", "Random Forest 🏕️"]: # show explainability for decision tree, random forest
+        tab1, tab2 = st.tabs(["Results", "Explainability"])
         with tab1:
+            st.markdown("#### Results")
+            st.markdown("""The values below represent the model's accuracy for each possible class.
+                    The lowest possible accuracy is 0 and the highest 100.""")
+            if select_data == "Diabetes prevention 👩‍⚕️":
+                st.warning("""**Note**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
                             This small number of patient data explains why the model's performance isn't optimal.
+                            Additional data collection as well as hyperparameter tuning can be conducted to improve results.""")
             score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
+            fig = px.bar(score_df, x="label", y="accuracy", color="label", text_auto=True)
             st.plotly_chart(fig, use_container_width=True)
             st.warning("""**Note**: To improve the results of a model, practitioners often conduct *hyperparameter tuning*.
                         It consists of trying different combinations of the model's parameters to maximise the accuracy score.
                         Hyperparameter tuning wasn't conducted here in order to ensure the app doesn't lag.""")
         with tab2:
+            st.markdown("#### Explainability")
+            st.markdown("""Variables with a high explainability score had the most impact on the model's predictions.
+                        Variables with a low explainability score had a much smaller impact.""")
             df_feature_imp = pd.DataFrame({"variable":feature_names, "importance":feature_imp})
             df_feature_imp = df_feature_imp.groupby("variable").mean().reset_index()
     else: # only show results for knn
         st.markdown("#### Results")
+        st.markdown("""The values below represent the model's accuracy for each possible class.
+                    The lowest possible accuracy is 0 and the highest 100.""")
         st.markdown("""The K-nearest-neighbor algorithm doesn't have a built-in solution to compute model explainability with `scikit-learn`.
+                    You can use other python packages such as `SHAP` to compute explainability, which we didn't use here since they usually take a long time to output results.""")
+        if select_data == "Diabetes prevention 👩‍⚕️":
+            st.warning("""**Note**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
                             This small number of patient data explains why the model's performance isn't optimal.
+                            Additional data collection as well as hyperparameter tuning can be conducted to improve results.""")
         score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
         fig = px.bar(score_df, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)

pages/object_detection.py CHANGED Viewed

@@ -170,10 +170,10 @@ st.divider()
 st.markdown("# Fashion Object Detection 👗")
 # st.info("""This use case showcases the application of **Object detection** to detect clothing items/features on images. <br>
 #             The images used were gathered from Dior's""")
-st.info("""**Object detection models** can very valuable for fashion retailers wishing to improve customer experience by providing, for example, **product recognition**, **visual search**
-        and even **virtual try-ons**.
-        In this use case, we are going to show an object detection model that as able to identify and locate different articles of clothings on fashipn show images.
-        """)
 st.markdown("  ")
 st.markdown("  ")
@@ -194,8 +194,8 @@ st.markdown("  ")
 st.markdown("### About the model 📚")
-st.markdown("""The object detection model was trained specifically to **detect clothing items** on images. <br>
-            It is able to detect <b>46</b> different types of clothing items.""", unsafe_allow_html=True)
 colors = ["#8ef", "#faa", "#afa", "#fea", "#8ef","#afa"]*7 + ["#8ef", "#faa", "#afa", "#fea"]
@@ -209,7 +209,7 @@ annotated_text([cats_annotated])
 #             'epaulette', 'sleeve', 'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon', 'rivet',
 #             'ruffle', 'sequin', 'tassel'""", unsafe_allow_html=True)
-st.markdown("Credits: https://huggingface.co/valentinafeve/yolos-fashionpedia")
 st.markdown("")
 st.markdown("")
@@ -294,6 +294,7 @@ dict_cats_final = {key:value for (key,value) in dict_cats.items() if value in se
 st.markdown("### Define a threshold for predictions 🔎")
 st.markdown("""In this section, you can select a threshold for the model's final predictions. <br>
             Objects that are given a lower score than the chosen threshold will be ignored in the final results""", unsafe_allow_html=True)
 st.info("""**Note**: Object detection models detect objects using bounding boxes as well as assign objects to specific classes.
         Each object is given a class based on a probability score computed by the model. A high probability signals that the model is confident in its prediction.
         On the contrary, a lower probability score signals a level of uncertainty.""")

 st.markdown("# Fashion Object Detection 👗")
 # st.info("""This use case showcases the application of **Object detection** to detect clothing items/features on images. <br>
 #             The images used were gathered from Dior's""")
+st.info("""**Object detection models** can be very valuable for fashion retailers wishing to improve customer experience. They can provide, for example, **product recognition**, **visual search**
+        and even **virtual try-ons**.""")
+st.markdown("In this use case, we are going to show an object detection model that is able to identify and locate different articles of clothing on fashion show images.")
 st.markdown("  ")
 st.markdown("  ")
 st.markdown("### About the model 📚")
+st.markdown("""The object detection model was trained to **detect specific clothing items** on images. <br>
+            Below is a list of the <b>46</b> different types of clothing items the model can identify and locate.""", unsafe_allow_html=True)
 colors = ["#8ef", "#faa", "#afa", "#fea", "#8ef","#afa"]*7 + ["#8ef", "#faa", "#afa", "#fea"]
 #             'epaulette', 'sleeve', 'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon', 'rivet',
 #             'ruffle', 'sequin', 'tassel'""", unsafe_allow_html=True)
+st.markdown("Credits for the model: https://huggingface.co/valentinafeve/yolos-fashionpedia")
 st.markdown("")
 st.markdown("")
 st.markdown("### Define a threshold for predictions 🔎")
 st.markdown("""In this section, you can select a threshold for the model's final predictions. <br>
             Objects that are given a lower score than the chosen threshold will be ignored in the final results""", unsafe_allow_html=True)
 st.info("""**Note**: Object detection models detect objects using bounding boxes as well as assign objects to specific classes.
         Each object is given a class based on a probability score computed by the model. A high probability signals that the model is confident in its prediction.
         On the contrary, a lower probability score signals a level of uncertainty.""")

pages/recommendation_system.py CHANGED Viewed

@@ -26,7 +26,7 @@ st.markdown("### What is a Recommendation System ?")
 st.info("""**Recommendation systems** are algorithms built to **suggest** or **recommend** **products** to consumers.
         They are very common in social media platforms such as TikTok, Youtube or Instagram or e-commerce websites as they help improve and personalize a consumer's experience.""")
-st.markdown("""There are two methods to build recommendation systems:
 - **Content-based filtering**: Recommendations are made based on the user's own preferences
 - **Collaborative filtering**: Recommendations are made based on the preferences and behavior of similar users""", unsafe_allow_html=True)

 st.info("""**Recommendation systems** are algorithms built to **suggest** or **recommend** **products** to consumers.
         They are very common in social media platforms such as TikTok, Youtube or Instagram or e-commerce websites as they help improve and personalize a consumer's experience.""")
+st.markdown("""There are two main types of recommendation systems:
 - **Content-based filtering**: Recommendations are made based on the user's own preferences
 - **Collaborative filtering**: Recommendations are made based on the preferences and behavior of similar users""", unsafe_allow_html=True)

pages/supervised_unsupervised_page.py CHANGED Viewed

@@ -26,8 +26,9 @@ st.set_page_config(layout="wide")
 #st.image("images/ML_header.jpg", use_column_width=True)
 st.markdown("# Supervised vs Unsupervised Learning 🔍")
-st.info("""There are two main types of models in the field of Data Science, **Supervised** and **Unsupervised learning** models.
-        Being able to distinguish which type of model fits your data is an essential step in building any AI project.""")
 st.markdown(" ")
 #st.markdown("## What are the differences between both ?")
@@ -38,7 +39,7 @@ with col1:
     st.markdown("## Supervised Learning")
     st.markdown("""Supervised learning models are trained by learning from **labeled data**. <br>
                 Labeled data provides to the model the desired output, which it will then use to learn relevant patterns and make predictions.
-- A model is first **trained** to make predictions using labeled data.
 - The trained model can then be used to **predict values** for new data.
                 """, unsafe_allow_html=True)
     st.markdown(" ")
@@ -57,7 +58,7 @@ with col2:
 st.markdown("  ")
-learning_type = st.selectbox("**Select a type of model**",
                              ["Supervised Learning",
                            "Unsupervised Learning"])
@@ -91,8 +92,11 @@ if learning_type == "Supervised Learning":
         ## Description of the use case
         st.divider()
         st.markdown("# Credit score classification 💯")
-        st.info("""**Classification** is a type of supervised learning where the goal is to categorize input data into predefined classes or categories.
-                In this case, we will build a **credit score classification** model that predicts if a client will have a **'Bad'**, **'Standard'** or **'Good'** credit score.""")
         st.markdown(" ")
         _, col, _ = st.columns([0.25,0.5,0.25])
@@ -101,7 +105,7 @@ if learning_type == "Supervised Learning":
         ## Learn about the data
         st.markdown("#### About the data 📋")
-        st.markdown("""To train the credit classification model, you were provided a **labeled** database with the bank and credit-related information of around 7600 clients. <br>
                     This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Credit_Score** variable.""",
                     unsafe_allow_html=True)
@@ -350,9 +354,12 @@ if learning_type == "Supervised Learning":
         ## Description of the use case
         st.divider()
         st.markdown("# Customer churn prediction ❌")
-        st.info(""" Classification is a type of supervised learning model whose goal is to categorize input data into predefined classes or categories.
-                In this example, we will build a **customer churn classification model** that can predict whether a customer is likely to leave a company's service in the future using historical data.
-        """)
         st.markdown(" ")
@@ -367,8 +374,8 @@ if learning_type == "Supervised Learning":
         ## Learn about the data
         st.markdown("#### About the data 📋")
-        st.markdown("""To train the customer churn classification model, you were provided a **labeled** database with around 7000 clients of a telecommunications company. <br>
-                    The data contains information on which services the customer has signed for, information on his account as well as whether the customer churned or not (our label here).""",
                     unsafe_allow_html=True)
         # st.markdown("This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Churn** variable.")
         st.info("**Note**: The variables that had two possible values (Yes or No) were transformed into binary variables (0 or 1) with 0 being 'No' and 1 being 'Yes'.")
@@ -660,7 +667,7 @@ def markdown_general_info(df):
 if learning_type == "Unsupervised Learning":
     usl_usecase = st.selectbox("**Choose a use case**",
-                          ["Customer segmentation 🧑‍🤝‍🧑"])
     #################################### CUSTOMER SEGMENTATION ##################################
@@ -668,16 +675,16 @@ if learning_type == "Unsupervised Learning":
     path_clustering = r"data/clustering"
     path_clustering_results = r"data/clustering/results"
-    if usl_usecase == "Customer segmentation 🧑‍🤝‍🧑":
         # st.divider()
         st.divider()
-        st.markdown("# Customer Segmentation 🧑‍🤝‍🧑")
-        st.info("""**Unsupervised learning** models are valulable tools for cases where you want your model to discover patterns by itself, without having to give it examples to learn from (especially if you don't have labeled data).
-                    In this use case, we will show how they can be useful for **Customer Segmentation** to detect unknown groups of clients in a company's customer base.
-                Using this previously unknown segmentation, companies can then create more targeted add campaigns based on their consumer's behavior and preferences.
-        """)
         st.markdown("  ")
         ## Show image
@@ -726,13 +733,14 @@ if learning_type == "Unsupervised Learning":
         st.info("""**Clustering** is a type of unsupervised learning method that learns how to group similar data points together into "clusters", without needing supervision.
                     In our case, a data point represents a customer that will be assigned to an unknown group.""")
-        st.markdown("""
-- The clustering algorithm used in this use case allows a specific number of groups to be identified, which isn't the case for all clustering models.
-- The number of clusters chosen by the user can have a strong impact on the quality of the segmentation. Try to run the model multiple times with different number of clusters and see which number leads to groups with more distinct customer behaviors/preferences.""")
         st.markdown(" ")
         st.markdown("Here is an example of grouped data using a clustering model.")
         st.image("images/clustering.webp")
         nb_groups = st.selectbox("Choose a number of customer groups to identify", np.arange(2,6))
         df_results = load_data_pickle(path_clustering_results, f"results_{nb_groups}_clusters.pkl")

 #st.image("images/ML_header.jpg", use_column_width=True)
 st.markdown("# Supervised vs Unsupervised Learning 🔍")
+st.info("""Data Science models are often split into two categories: **Supervised** and **Unsupervised Learning**.
+        The goal of this page is to present these two kinds of Data Science models, as well as give you multiple use cases to try them with.
+        Note that other kinds of AI models exist such as Reinforcement Learning or Federated Learning, which we won't cover in this app.""")
 st.markdown(" ")
 #st.markdown("## What are the differences between both ?")
     st.markdown("## Supervised Learning")
     st.markdown("""Supervised learning models are trained by learning from **labeled data**. <br>
                 Labeled data provides to the model the desired output, which it will then use to learn relevant patterns and make predictions.
+- A model is first **trained** to make predictions using labeled data, which contains the desired output.
 - The trained model can then be used to **predict values** for new data.
                 """, unsafe_allow_html=True)
     st.markdown(" ")
 st.markdown("  ")
+learning_type = st.selectbox("**Select an AI task**",
                              ["Supervised Learning",
                            "Unsupervised Learning"])
         ## Description of the use case
         st.divider()
         st.markdown("# Credit score classification 💯")
+        st.info("""**Classification models** are supervised learning models whose goal is to categorize data into predefined categories.
+                As opposed to unsupervised learning models, these categories are known beforehand.
+                Other types of supervised learning models include Regression models, which learn how to predict numerical values, instead of a set number of categories.""")
+        st.markdown("In this use case, we will build a **credit score classification model** which predicts whether a client has a 'Bad', 'Standard', or 'Good' credit score.")
         st.markdown(" ")
         _, col, _ = st.columns([0.25,0.5,0.25])
         ## Learn about the data
         st.markdown("#### About the data 📋")
+        st.markdown("""To train the credit classification model, you were provided a **labeled** database with 7600 clients and containing bank and credit-related client information. <br>
                     This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Credit_Score** variable.""",
                     unsafe_allow_html=True)
         ## Description of the use case
         st.divider()
         st.markdown("# Customer churn prediction ❌")
+        st.info("""**Classification models** are supervised learning models whose goal is to categorize data into predefined categories.
+                As opposed to unsupervised learning models, these categories are known beforehand.
+                Other types of supervised learning models include Regression models, which learn how to predict numerical values, instead of a set number of categories.""")
+        st.markdown("For this use case, we will build a **customer churn classification model** that can predict whether a person will stop being a customer using historical data.")
         st.markdown(" ")
         ## Learn about the data
         st.markdown("#### About the data 📋")
+        st.markdown("""To train the customer churn model, you were provided a **labeled** database with around 7000 clients of a telecommunications company. <br>
+                    The data contains information on which services the customer has signed for, account information as well as whether the customer churned or not (our label here).""",
                     unsafe_allow_html=True)
         # st.markdown("This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Churn** variable.")
         st.info("**Note**: The variables that had two possible values (Yes or No) were transformed into binary variables (0 or 1) with 0 being 'No' and 1 being 'Yes'.")
 if learning_type == "Unsupervised Learning":
     usl_usecase = st.selectbox("**Choose a use case**",
+                          ["Customer segmentation (clustering) 🧑‍🤝‍🧑"])
     #################################### CUSTOMER SEGMENTATION ##################################
     path_clustering = r"data/clustering"
     path_clustering_results = r"data/clustering/results"
+    if usl_usecase == "Customer segmentation (clustering) 🧑‍🤝‍🧑":
         # st.divider()
         st.divider()
+        st.markdown("# Customer Segmentation (clustering) 🧑‍🤝‍🧑")
+        st.markdown("""In this use case, we will use a clustering model, a type of Unsupervised Learning model, to perform **Customer Segmentation**. <br>
+                    Our model will allow similar groups of clients to be identified within a company's consumer database based on consumer habits and characteristics.
+        """, unsafe_allow_html=True)
         st.markdown("  ")
         ## Show image
         st.info("""**Clustering** is a type of unsupervised learning method that learns how to group similar data points together into "clusters", without needing supervision.
                     In our case, a data point represents a customer that will be assigned to an unknown group.""")
+#         st.markdown("""
+# - The clustering algorithm used in this use case allows a specific number of groups to be identified, which isn't the case for all clustering models.""")
         st.markdown(" ")
         st.markdown("Here is an example of grouped data using a clustering model.")
         st.image("images/clustering.webp")
+        st.warning("**Note**: The number of clusters chosen by the user can have a strong impact on the quality of the segmentation. Try to run the model multiple times with different numbers of clusters and see which number leads to groups with more distinct customer behaviors/preferences.")
         nb_groups = st.selectbox("Choose a number of customer groups to identify", np.arange(2,6))
         df_results = load_data_pickle(path_clustering_results, f"results_{nb_groups}_clusters.pkl")

pages/timeseries_analysis.py CHANGED Viewed

@@ -35,7 +35,7 @@ def forecast_prophet(train, test, col=None):
 st.markdown("# Time Series Forecasting 📈")
 st.markdown("### What is Time Series Forecasting ?")
-st.info("""Time series forecasting models are AI models built to make accurate predictions about future values using historical data.
             These types of models take into account temporal patterns, such as **trends** (long-term movements), **seasonality** (repeating patterns at fixed intervals), and **cyclic patterns** (repeating patterns not necessarily at fixed intervals)""")
             #unsafe_allow_html=True)
@@ -77,8 +77,8 @@ st.divider()
 st.markdown("# Power Consumption Forecasting ⚡")
 #st.markdown("  ")
-st.info("""In this use case, a time series forecasting model is used to predict the **energy consumption** (or **Global Active Power**) of a household using historical data.
-        A forecasting model can be a valuable tool to optimize resource planning and avoid overloads during peak demand periods.""")
 st.markdown(" ")

 st.markdown("# Time Series Forecasting 📈")
 st.markdown("### What is Time Series Forecasting ?")
+st.info("""Time series forecasting models are AI models built to make predictions about future values using historical data.
             These types of models take into account temporal patterns, such as **trends** (long-term movements), **seasonality** (repeating patterns at fixed intervals), and **cyclic patterns** (repeating patterns not necessarily at fixed intervals)""")
             #unsafe_allow_html=True)
 st.markdown("# Power Consumption Forecasting ⚡")
 #st.markdown("  ")
+st.info("""In this use case, a time series forecasting model learns how to accurately predict the **energy consumption** (or global active power in the dataset) of a household using historical data.
+        A forecasting model can be a valuable tool for energy consumption analysis as it can help **optimize resource planning** and **avoid overloads** during peak demand periods.""")
 st.markdown(" ")

pages/topic_modeling.py CHANGED Viewed

@@ -41,7 +41,7 @@ st.markdown("  ")
 st.divider()
 st.markdown("# Topic modeling on product descriptions 🛍️")
-st.info("""In this use case, we will use a topic model to categorize around 20 000 e-commerce products using text descriptions and identify
         the main types of products sold.""")
 _, col, _ = st.columns([0.2,0.6,0.2])

 st.divider()
 st.markdown("# Topic modeling on product descriptions 🛍️")
+st.markdown("""In this use case, we will use a **topic model** to categorize around **20 000 e-commerce products** as well as identify
         the main types of products sold.""")
 _, col, _ = st.columns([0.2,0.6,0.2])