Spaces:
Sleeping
Sleeping
latest version of app
Browse files- main_page.py +19 -17
- pages/go_further.py +58 -40
- pages/object_detection.py +8 -7
- pages/recommendation_system.py +1 -1
- pages/supervised_unsupervised_page.py +30 -22
- pages/timeseries_analysis.py +3 -3
- pages/topic_modeling.py +1 -1
main_page.py
CHANGED
@@ -44,21 +44,21 @@ col1, col2 = st.columns([0.65,0.35], gap="medium")
|
|
44 |
with col1:
|
45 |
st.title("AI and Data Science Examples")
|
46 |
st.subheader("HEC Paris, 2023-2024")
|
47 |
-
st.markdown("""**Course provided by Shirish C. SRIVASTAVA** <br>
|
48 |
-
|
49 |
#st.markdown("in collaboration with Hi! PARIS engineers: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN")
|
50 |
|
51 |
-
with col2:
|
52 |
#Hi! PARIS collaboration mention
|
53 |
-
st.markdown(" ")
|
54 |
-
st.markdown(" ")
|
55 |
-
st.markdown(" ")
|
56 |
-
image_hiparis = Image.open('images/hi-paris.png')
|
57 |
-
st.image(image_hiparis, width=150)
|
58 |
|
59 |
url = "https://www.hi-paris.fr/"
|
60 |
#st.markdown("This app was funded by the Hi! PARIS Center")
|
61 |
-
st.markdown("""###### **
|
|
|
|
|
62 |
|
63 |
|
64 |
st.markdown(" ")
|
@@ -114,23 +114,25 @@ show_pages(
|
|
114 |
st.header("About the app")
|
115 |
|
116 |
|
117 |
-
st.info("""The **AI and Data Science Examples**
|
118 |
-
|
119 |
|
120 |
st.markdown(" ")
|
121 |
|
122 |
-
st.markdown("""The app
|
123 |
-
- 1️⃣ **Machine Learning**: This first section covers use cases where structured data (data in a tabular format) is
|
124 |
You will find pages on *Supervised/Unsupervised Learning*, *Time Series Forecasting* and AI powered *Recommendation Systems*.
|
125 |
- 2️⃣ **Natural Language Processing** (NLP): This second section showcases AI applications where large amounts of text data is analyzed using Deep Learning models.
|
126 |
-
Pages on *Topic Modeling* and *Sentiment Analysis*, which are
|
127 |
-
- 3️⃣ **Computer Vision**: This
|
128 |
The field of Computer Vision includes *Image classification* and *Object Detection*, which are both featured in this section.
|
|
|
|
|
129 |
""")
|
130 |
|
131 |
st.image("images/ML_domains.png",
|
132 |
-
caption="""This figure showcases a selection of sub-fields
|
133 |
-
Machine Learning, NLP
|
134 |
|
135 |
|
136 |
# st.markdown(" ")
|
|
|
44 |
with col1:
|
45 |
st.title("AI and Data Science Examples")
|
46 |
st.subheader("HEC Paris, 2023-2024")
|
47 |
+
# st.markdown("""**Course provided by Shirish C. SRIVASTAVA** <br>
|
48 |
+
# **Hi! PARIS Engineering team**: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN""", unsafe_allow_html=True)
|
49 |
#st.markdown("in collaboration with Hi! PARIS engineers: Laurène DAVID, Salma HOUIDI and Maeva N'GUESSAN")
|
50 |
|
51 |
+
# with col2:
|
52 |
#Hi! PARIS collaboration mention
|
53 |
+
# st.markdown(" ")
|
54 |
+
# st.markdown(" ")
|
55 |
+
#st.markdown(" ")
|
|
|
|
|
56 |
|
57 |
url = "https://www.hi-paris.fr/"
|
58 |
#st.markdown("This app was funded by the Hi! PARIS Center")
|
59 |
+
st.markdown("""###### **The app was made in collaboration with [Hi! PARIS](%s)** """ % url, unsafe_allow_html=True)
|
60 |
+
image_hiparis = Image.open('images/hi-paris.png')
|
61 |
+
st.image(image_hiparis, width=150)
|
62 |
|
63 |
|
64 |
st.markdown(" ")
|
|
|
114 |
st.header("About the app")
|
115 |
|
116 |
|
117 |
+
st.info("""The goal of the **AI and Data Science Examples** is to give an introduction to Data Science by showcasing real-life applications.
|
118 |
+
The app includes use cases using traditional Machine Learning algorithms on structured data, as well as models that analyze unstructured data (text, images,...).""")
|
119 |
|
120 |
st.markdown(" ")
|
121 |
|
122 |
+
st.markdown("""The app contains four sections:
|
123 |
+
- 1️⃣ **Machine Learning**: This first section covers use cases where structured data (data in a tabular format) is used to train an AI model.
|
124 |
You will find pages on *Supervised/Unsupervised Learning*, *Time Series Forecasting* and AI powered *Recommendation Systems*.
|
125 |
- 2️⃣ **Natural Language Processing** (NLP): This second section showcases AI applications where large amounts of text data is analyzed using Deep Learning models.
|
126 |
+
Pages on *Topic Modeling* and *Sentiment Analysis*, which are different kinds of NLP models, can be found in this section.
|
127 |
+
- 3️⃣ **Computer Vision**: This third section covers a sub-field of AI called Computer Vision, which deals with image/video data.
|
128 |
The field of Computer Vision includes *Image classification* and *Object Detection*, which are both featured in this section.
|
129 |
+
- π **Go further**: In the final section, you will gain a deeper understanding of AI models and how they function.
|
130 |
+
The page features multiple models to try, as well as different datasets to train a model on.
|
131 |
""")
|
132 |
|
133 |
st.image("images/ML_domains.png",
|
134 |
+
caption="""This figure showcases a selection of sub-fields of AI, which includes
|
135 |
+
Machine Learning, NLP and Computer Vision.""")
|
136 |
|
137 |
|
138 |
# st.markdown(" ")
|
pages/go_further.py
CHANGED
@@ -43,11 +43,11 @@ def model_training(X, y, model_dict, _num_transformer=MinMaxScaler(),
|
|
43 |
model_sklearn = KNeighborsClassifier(n_neighbors=param)
|
44 |
|
45 |
if model == "Decision Tree π³":
|
46 |
-
model_sklearn = DecisionTreeClassifier(max_depth=param)
|
47 |
explainability = True
|
48 |
|
49 |
if model == "Random Forest ποΈ":
|
50 |
-
model_sklearn = RandomForestClassifier(max_depth=param)
|
51 |
explainability = True
|
52 |
|
53 |
|
@@ -125,19 +125,15 @@ scores = np.diag(cm)
|
|
125 |
|
126 |
st.image("images/ML_header.jpg")
|
127 |
st.markdown("# Go further π")
|
128 |
-
st.markdown("""This page allows you to test and compare
|
129 |
It includes three different types of **classification models** with Python code illustrations, as well as four datasets to choose from.
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
st.warning("""**Note**: Different types of models exists for most Machine Learning tasks.
|
136 |
-
Models tend to vary in complexity and picking which one to train for a specific use case isn't always straightforward.
|
137 |
-
Complex model might output better results but take longer to make predictions.
|
138 |
-
The model selection step requires a good amount of testing by practitioners.""")
|
139 |
|
140 |
-
|
|
|
141 |
try:
|
142 |
st.link_button("Go to the scikit-learn website", "https://scikit-learn.org/stable/index.html")
|
143 |
except:
|
@@ -155,17 +151,20 @@ st.markdown("""**Reminder**: Classification models are AI models that are traine
|
|
155 |
st.markdown(" ")
|
156 |
st.markdown(" ")
|
157 |
|
|
|
|
|
|
|
158 |
########################## SELECT A DATASET ###############################
|
159 |
|
160 |
st.markdown("### Select a dataset π")
|
161 |
-
st.markdown("""To perform the classification task, you can choose between three different datasets: **
|
162 |
Each dataset will be shown in its original format and will go through pre-processing steps to insure its quality and usability for the chosen model.
|
163 |
""", unsafe_allow_html=True)
|
164 |
|
165 |
st.warning("""**Note:** The performance of a Machine Learning model is sensitive to the data being used to train it.
|
166 |
Data cleaning and pre-processing are usually as important as training the AI model. These steps can include removing missing values, identifying outliers and transforming columns from text to numbers.""")
|
167 |
|
168 |
-
select_data = st.selectbox("Choose an option", ["
|
169 |
st.markdown(" ")
|
170 |
|
171 |
if select_data =="Wine quality π·":
|
@@ -259,7 +258,7 @@ if select_data == "Car evaluation π":
|
|
259 |
- **Evaluation**: Evaluation level (unacceptable, acceptable)""")
|
260 |
|
261 |
|
262 |
-
if select_data == "Diabetes π©ββοΈ":
|
263 |
# Load data and clean it
|
264 |
data = load_data_csv(path_data, "diabetes.csv")
|
265 |
data["Outcome"] = data["Outcome"].map({1:"Yes", 0:"No"})
|
@@ -299,6 +298,8 @@ st.markdown(" ")
|
|
299 |
st.markdown(" ")
|
300 |
|
301 |
|
|
|
|
|
302 |
########################## SELECT A MODEL ###############################
|
303 |
|
304 |
st.markdown("### Select a model π")
|
@@ -306,6 +307,11 @@ st.markdown("""You can choose between three types of classification models: **K
|
|
306 |
For each model, you will be given a short explanation as to how they function.
|
307 |
""", unsafe_allow_html=True)
|
308 |
|
|
|
|
|
|
|
|
|
|
|
309 |
select_model = st.selectbox("**Choose an option**", ["K-nearest-neighbor ποΈ", "Decision Tree π³", "Random Forest ποΈ"])
|
310 |
st.markdown(" ")
|
311 |
|
@@ -313,19 +319,20 @@ st.markdown(" ")
|
|
313 |
if select_model == "K-nearest-neighbor ποΈ":
|
314 |
#st.markdown("#### Model: K-nearest-neighbor")
|
315 |
st.info("""**About the model**: K-nearest-neighbor (or KNN) is a type of classification model that uses neighboring points to classify new data.
|
316 |
-
When trying to predict a class to new data
|
317 |
-
The most common class
|
318 |
|
319 |
select_param = 6
|
320 |
model_dict = {"model":select_model, "param":select_param}
|
321 |
|
322 |
-
learn_model = st.checkbox("Learn more", key="knn")
|
323 |
if learn_model:
|
324 |
st.markdown("""An important parameter in KNN algorithms is the number of points to choose as neighboors. <br>
|
325 |
The image below shows two cases where the number of neighboors (k) are equal to 3 and 6.
|
326 |
-
- When k is equal to 3, the most common class is **
|
327 |
-
- When k is equal to 6, the the most common class is **
|
328 |
unsafe_allow_html=True)
|
|
|
329 |
st.image("images/knn.png", width=600)
|
330 |
st.markdown("""K-nearest-neighbor algorithm are popular for their simplicity. <br>
|
331 |
This can be a drawback for use cases/dataset that require a more complex approach to make accurate predictions.""", unsafe_allow_html=True)
|
@@ -339,15 +346,15 @@ if select_model == "Decision Tree π³":
|
|
339 |
st.info("""**About the model**: Decision trees are classification model that split the prediction task into a succession of decisions, each with only two possible outcomes.
|
340 |
These decisions can be visualized as a tree, with data points arriving from the top of the tree and landing at final "prediction regions".""")
|
341 |
|
342 |
-
select_param =
|
343 |
model_dict = {"model":select_model, "param":select_param}
|
344 |
|
345 |
-
learn_model = st.checkbox("Learn more", key="tree")
|
346 |
if learn_model:
|
347 |
-
st.markdown("""The following image showcases a decision tree
|
348 |
The data used to train the model has each client's **age**, **salary** and **number of children**.""", unsafe_allow_html=True)
|
349 |
|
350 |
-
st.markdown("""To predict whether a client gets a loan, the client's data goes through each '
|
351 |
For example, a client that is under 30 years old and has a lower salary than 2500$ will not be awarded a loan by the model.""", unsafe_allow_html=True)
|
352 |
|
353 |
st.image("images/decisiontree.png", width=800)
|
@@ -363,15 +370,15 @@ if select_model == "Decision Tree π³":
|
|
363 |
if select_model == "Random Forest ποΈ":
|
364 |
st.info("""**About the model:** Random Forest models generate multiple decision tree models to make predictions.
|
365 |
The main drawback of decision trees is that their predictions can be unstable, meaning that their output often changes.
|
366 |
-
Random Forest models
|
367 |
|
368 |
-
select_param =
|
369 |
model_dict = {"model":select_model, "param":select_param}
|
370 |
|
371 |
-
learn_model = st.checkbox("Learn more", key="tree")
|
372 |
if learn_model:
|
373 |
-
st.markdown("""Random Forests classifiers
|
374 |
-
In the following image, the random forest model built four decision trees, who each have made their own
|
375 |
, unsafe_allow_html=True)
|
376 |
|
377 |
st.markdown("""Class C was predicted twice, whereas Class B et D where only predicted once. <br>
|
@@ -401,28 +408,36 @@ st.markdown(f"""You've selected the **{select_data}** dataset and the **{select_
|
|
401 |
|
402 |
|
403 |
run_model = st.button("Run model", type="primary")
|
|
|
404 |
|
405 |
if run_model:
|
406 |
score, feature_imp, feature_names, labels = model_training(X, y, model_dict, _num_transformer=StandardScaler())
|
407 |
|
408 |
if select_model in ["Decision Tree π³", "Random Forest ποΈ"]: # show explainability for decision tree, random firest
|
409 |
-
tab1, tab2 = st.tabs(["
|
410 |
|
411 |
with tab1:
|
412 |
-
|
413 |
-
|
|
|
|
|
|
|
414 |
This small number of patient data explains why the model's performance isn't optimal.
|
415 |
-
Additional data collection
|
416 |
|
417 |
score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
|
418 |
-
fig = px.bar(score_df, x="label", y="accuracy", color="label",
|
419 |
st.plotly_chart(fig, use_container_width=True)
|
420 |
-
|
421 |
st.warning("""**Note**: To improve the results of a model, practionners often conduct *hyperparameter tuning*.
|
422 |
It consists of trying different combination of the model's parameters to maximise the accuracy score.
|
423 |
Hyperparameter tuning wasn't conduct here in order to insure the app doesn't lag.""")
|
|
|
424 |
|
425 |
with tab2:
|
|
|
|
|
|
|
426 |
|
427 |
df_feature_imp = pd.DataFrame({"variable":feature_names, "importance":feature_imp})
|
428 |
df_feature_imp = df_feature_imp.groupby("variable").mean().reset_index()
|
@@ -434,14 +449,16 @@ if run_model:
|
|
434 |
|
435 |
else: # only show results for knn
|
436 |
st.markdown("#### Results")
|
|
|
|
|
437 |
|
438 |
st.markdown("""The K-nearest-neighbor algorithm doesn't have a built-in solution to compute model explainability with `scikit-learn`.
|
439 |
-
You can use other python packages such as `SHAP` to compute explainability, which we didn't use
|
440 |
|
441 |
-
if select_data == "Diabetes π©ββοΈ":
|
442 |
-
|
443 |
This small number of patient data explains why the model's performance isn't optimal.
|
444 |
-
Additional data collection
|
445 |
|
446 |
score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
|
447 |
fig = px.bar(score_df, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)
|
@@ -458,3 +475,4 @@ if run_model:
|
|
458 |
|
459 |
|
460 |
|
|
|
|
43 |
model_sklearn = KNeighborsClassifier(n_neighbors=param)
|
44 |
|
45 |
if model == "Decision Tree 🌳":
|
46 |
+
model_sklearn = DecisionTreeClassifier(max_depth=param, class_weight="balanced")
|
47 |
explainability = True
|
48 |
|
49 |
if model == "Random Forest ποΈ":
|
50 |
+
model_sklearn = RandomForestClassifier(max_depth=param, )#class_weight="balanced_subsample")
|
51 |
explainability = True
|
52 |
|
53 |
|
|
|
125 |
|
126 |
st.image("images/ML_header.jpg")
|
127 |
st.markdown("# Go further π")
|
128 |
+
st.markdown("""This page allows you to test and compare results between different AI models, and gain a deeper understanding of how they make predictions. <br>
|
129 |
It includes three different types of **classification models** with Python code illustrations, as well as four datasets to choose from.
|
130 |
+
|
131 |
+
**Explainability** is also given for most models.
|
132 |
+
These results give an indication on which variable had the most impact on the model's final prediction. <br>
|
133 |
+
Note that each model has its own way of measuring explainability, which makes comparisions between model explainabilities difficult.
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
+
All of the classification models used in this page come from `scikit-learn`, which is a popular Data Science library in Python.
|
136 |
+
""", unsafe_allow_html=True)
|
137 |
try:
|
138 |
st.link_button("Go to the scikit-learn website", "https://scikit-learn.org/stable/index.html")
|
139 |
except:
|
|
|
151 |
st.markdown(" ")
|
152 |
st.markdown(" ")
|
153 |
|
154 |
+
|
155 |
+
|
156 |
+
|
157 |
########################## SELECT A DATASET ###############################
|
158 |
|
159 |
st.markdown("### Select a dataset π")
|
160 |
+
st.markdown("""To perform the classification task, you can choose between three different datasets: **Titanic**, **Car evaluation**, **Wine quality** and **Diabetes prevention** <br>
|
161 |
Each dataset will be shown in its original format and will go through pre-processing steps to insure its quality and usability for the chosen model.
|
162 |
""", unsafe_allow_html=True)
|
163 |
|
164 |
st.warning("""**Note:** The performance of a Machine Learning model is sensitive to the data being used to train it.
|
165 |
Data cleaning and pre-processing are usually as important as training the AI model. These steps can include removing missing values, identifying outliers and transforming columns from text to numbers.""")
|
166 |
|
167 |
+
select_data = st.selectbox("Choose an option", ["Titanic π’", "Car evaluation π", "Wine quality π·", "Diabetes prevention π©ββοΈ"]) #label_visibility="collapsed")
|
168 |
st.markdown(" ")
|
169 |
|
170 |
if select_data =="Wine quality 🍷":
|
|
|
258 |
- **Evaluation**: Evaluation level (unacceptable, acceptable)""")
|
259 |
|
260 |
|
261 |
+
if select_data == "Diabetes prevention 👩‍⚕️":
|
262 |
# Load data and clean it
|
263 |
data = load_data_csv(path_data, "diabetes.csv")
|
264 |
data["Outcome"] = data["Outcome"].map({1:"Yes", 0:"No"})
|
|
|
298 |
st.markdown(" ")
|
299 |
|
300 |
|
301 |
+
|
302 |
+
|
303 |
########################## SELECT A MODEL ###############################
|
304 |
|
305 |
st.markdown("### Select a model π")
|
|
|
307 |
For each model, you will be given a short explanation as to how they function.
|
308 |
""", unsafe_allow_html=True)
|
309 |
|
310 |
+
st.warning("""**Note**: Different types of models exists for most Machine Learning tasks.
|
311 |
+
Models tend to vary in complexity and picking which one to train for a specific use case isn't always straightforward.
|
312 |
+
Complex model might output better results but take longer to make predictions.
|
313 |
+
The model selection step requires a good amount of testing by practitioners.""")
|
314 |
+
|
315 |
select_model = st.selectbox("**Choose an option**", ["K-nearest-neighbor ποΈ", "Decision Tree π³", "Random Forest ποΈ"])
|
316 |
st.markdown(" ")
|
317 |
|
|
|
319 |
if select_model == "K-nearest-neighbor ποΈ":
|
320 |
#st.markdown("#### Model: K-nearest-neighbor")
|
321 |
st.info("""**About the model**: K-nearest-neighbor (or KNN) is a type of classification model that uses neighboring points to classify new data.
|
322 |
+
When trying to predict a class to new data point, the algorithm will look at points in close proximity (or in its neighborhood) to make a decision.
|
323 |
+
The most common class in the points' neighborhood will then be chosen as the final prediction.""")
|
324 |
|
325 |
select_param = 6
|
326 |
model_dict = {"model":select_model, "param":select_param}
|
327 |
|
328 |
+
learn_model = st.checkbox("Learn more about the model", key="knn")
|
329 |
if learn_model:
|
330 |
st.markdown("""An important parameter in KNN algorithms is the number of points to choose as neighboors. <br>
|
331 |
The image below shows two cases where the number of neighboors (k) are equal to 3 and 6.
|
332 |
+
- When k is equal to 3 (the small dotted circle in the image below), the most common class is **Class B**. The red point will then be predicted as Classe B.
|
333 |
+
- When k is equal to 6 (the large dotted circle in the image below), the the most common class is **Class A**. The red point will then be predicted as Classe A.""",
|
334 |
unsafe_allow_html=True)
|
335 |
+
|
336 |
st.image("images/knn.png", width=600)
|
337 |
st.markdown("""K-nearest-neighbor algorithm are popular for their simplicity. <br>
|
338 |
This can be a drawback for use cases/dataset that require a more complex approach to make accurate predictions.""", unsafe_allow_html=True)
|
|
|
346 |
st.info("""**About the model**: Decision trees are classification model that split the prediction task into a succession of decisions, each with only two possible outcomes.
|
347 |
These decisions can be visualized as a tree, with data points arriving from the top of the tree and landing at final "prediction regions".""")
|
348 |
|
349 |
+
select_param = 8
|
350 |
model_dict = {"model":select_model, "param":select_param}
|
351 |
|
352 |
+
learn_model = st.checkbox("Learn more about the model", key="tree")
|
353 |
if learn_model:
|
354 |
+
st.markdown("""The following image showcases a decision tree which predicts whether a **bank should give out a loan** to a client. <br>
|
355 |
The data used to train the model has each client's **age**, **salary** and **number of children**.""", unsafe_allow_html=True)
|
356 |
|
357 |
+
st.markdown("""To predict whether a client gets a loan, the client's data goes through each 'leaf' in the tree (leaves are the blue box question in the image below) and **gets assigned the class of the final leaf it fell into** (either Get loan or Don't get loan).
|
358 |
For example, a client that is under 30 years old and has a lower salary than 2500$ will not be awarded a loan by the model.""", unsafe_allow_html=True)
|
359 |
|
360 |
st.image("images/decisiontree.png", width=800)
|
|
|
370 |
if select_model == "Random Forest ποΈ":
|
371 |
st.info("""**About the model:** Random Forest models generate multiple decision tree models to make predictions.
|
372 |
The main drawback of decision trees is that their predictions can be unstable, meaning that their output often changes.
|
373 |
+
Random Forest models combine the predictions of multiple decision trees to reduce this unstability and improve robustness.""")
|
374 |
|
375 |
+
select_param = 8
|
376 |
model_dict = {"model":select_model, "param":select_param}
|
377 |
|
378 |
+
learn_model = st.checkbox("Learn more about the model", key="tree")
|
379 |
if learn_model:
|
380 |
+
st.markdown("""Random Forests classifiers combine the results of multiple trees by apply **majority voting**, which means selecting the class that was most often predicted by trees as the final prediction.
|
381 |
+
In the following image, the random forest model built four decision trees, who each have made their own class prediction. <br>"""
|
382 |
, unsafe_allow_html=True)
|
383 |
|
384 |
st.markdown("""Class C was predicted twice, whereas Class B et D where only predicted once. <br>
|
|
|
408 |
|
409 |
|
410 |
run_model = st.button("Run model", type="primary")
|
411 |
+
st.markdown(" ")
|
412 |
|
413 |
if run_model:
|
414 |
score, feature_imp, feature_names, labels = model_training(X, y, model_dict, _num_transformer=StandardScaler())
|
415 |
|
416 |
if select_model in ["Decision Tree π³", "Random Forest ποΈ"]: # show explainability for decision tree, random firest
|
417 |
+
tab1, tab2 = st.tabs(["Results", "Explainability"])
|
418 |
|
419 |
with tab1:
|
420 |
+
st.markdown("#### Results")
|
421 |
+
st.markdown("""The values below represent the model's accuracy for each possible class.
|
422 |
+
The lowest possible accuracy is 0 and the highest 100.""")
|
423 |
+
if select_data == "Diabetes prevention 👩‍⚕️":
|
424 |
+
st.warning("""**Note**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
|
425 |
This small number of patient data explains why the model's performance isn't optimal.
|
426 |
+
Additional data collection as well as hyperparameter tuning can be conducted to improve results.""")
|
427 |
|
428 |
score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
|
429 |
+
fig = px.bar(score_df, x="label", y="accuracy", color="label", text_auto=True)
|
430 |
st.plotly_chart(fig, use_container_width=True)
|
431 |
+
|
432 |
st.warning("""**Note**: To improve the results of a model, practionners often conduct *hyperparameter tuning*.
|
433 |
It consists of trying different combination of the model's parameters to maximise the accuracy score.
|
434 |
Hyperparameter tuning wasn't conduct here in order to insure the app doesn't lag.""")
|
435 |
+
|
436 |
|
437 |
with tab2:
|
438 |
+
st.markdown("#### Explainability")
|
439 |
+
st.markdown("""Variables with a high explainability score had the most impact on the model's predictions.
|
440 |
+
Variables with a low explainability score had a much smaller impact.""")
|
441 |
|
442 |
df_feature_imp = pd.DataFrame({"variable":feature_names, "importance":feature_imp})
|
443 |
df_feature_imp = df_feature_imp.groupby("variable").mean().reset_index()
|
|
|
449 |
|
450 |
else: # only show results for knn
|
451 |
st.markdown("#### Results")
|
452 |
+
st.markdown("""The values below represent the model's accuracy for each possible class.
|
453 |
+
The lowest possible accuracy is 0 and the highest 100.""")
|
454 |
|
455 |
st.markdown("""The K-nearest-neighbor algorithm doesn't have a built-in solution to compute model explainability with `scikit-learn`.
|
456 |
+
You can use other python packages such as `SHAP` to compute explainability, which we didn't use here since they usually take a long time to output results.""")
|
457 |
|
458 |
+
if select_data == "Diabetes prevention 👩‍⚕️":
|
459 |
+
st.warning("""**Note**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
|
460 |
This small number of patient data explains why the model's performance isn't optimal.
|
461 |
+
Additional data collection as well as hyperparameter tuning can be conducted to improve results.""")
|
462 |
|
463 |
score_df = pd.DataFrame({"label":labels, "accuracy":np.round(score*100)})
|
464 |
fig = px.bar(score_df, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)
|
|
|
475 |
|
476 |
|
477 |
|
478 |
+
|
pages/object_detection.py
CHANGED
@@ -170,10 +170,10 @@ st.divider()
|
|
170 |
st.markdown("# Fashion Object Detection π")
|
171 |
# st.info("""This use case showcases the application of **Object detection** to detect clothing items/features on images. <br>
|
172 |
# The images used were gathered from Dior's""")
|
173 |
-
st.info("""**Object detection models** can very valuable for fashion retailers wishing to improve customer experience
|
174 |
-
and even **virtual try-ons**.
|
175 |
-
|
176 |
-
|
177 |
|
178 |
st.markdown(" ")
|
179 |
st.markdown(" ")
|
@@ -194,8 +194,8 @@ st.markdown(" ")
|
|
194 |
|
195 |
|
196 |
st.markdown("### About the model π")
|
197 |
-
st.markdown("""The object detection model was trained
|
198 |
-
|
199 |
|
200 |
colors = ["#8ef", "#faa", "#afa", "#fea", "#8ef","#afa"]*7 + ["#8ef", "#faa", "#afa", "#fea"]
|
201 |
|
@@ -209,7 +209,7 @@ annotated_text([cats_annotated])
|
|
209 |
# 'epaulette', 'sleeve', 'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon', 'rivet',
|
210 |
# 'ruffle', 'sequin', 'tassel'""", unsafe_allow_html=True)
|
211 |
|
212 |
-
st.markdown("Credits: https://huggingface.co/valentinafeve/yolos-fashionpedia")
|
213 |
st.markdown("")
|
214 |
st.markdown("")
|
215 |
|
@@ -294,6 +294,7 @@ dict_cats_final = {key:value for (key,value) in dict_cats.items() if value in se
|
|
294 |
st.markdown("### Define a threshold for predictions π")
|
295 |
st.markdown("""In this section, you can select a threshold for the model's final predictions. <br>
|
296 |
Objects that are given a lower score than the chosen threshold will be ignored in the final results""", unsafe_allow_html=True)
|
|
|
297 |
st.info("""**Note**: Object detection models detect objects using bounding boxes as well as assign objects to specific classes.
|
298 |
Each object is given a class based on a probability score computed by the model. A high probability signals that the model is confident in its prediction.
|
299 |
On the contrary, a lower probability score signals a level of uncertainty.""")
|
|
|
170 |
st.markdown("# Fashion Object Detection π")
|
171 |
# st.info("""This use case showcases the application of **Object detection** to detect clothing items/features on images. <br>
|
172 |
# The images used were gathered from Dior's""")
|
173 |
+
st.info("""**Object detection models** can very valuable for fashion retailers wishing to improve customer experience. They can provide, for example, **product recognition**, **visual search**
|
174 |
+
and even **virtual try-ons**.""")
|
175 |
+
|
176 |
+
st.markdown("In this use case, we are going to show an object detection model that as able to identify and locate different articles of clothings on fashion show images.")
|
177 |
|
178 |
st.markdown(" ")
|
179 |
st.markdown(" ")
|
|
|
194 |
|
195 |
|
196 |
st.markdown("### About the model π")
|
197 |
+
st.markdown("""The object detection model was trained to **detect specific clothing items** on images. <br>
|
198 |
+
Below is a list of the <b>46</b> different types of clothing items the model can identify and locate.""", unsafe_allow_html=True)
|
199 |
|
200 |
colors = ["#8ef", "#faa", "#afa", "#fea", "#8ef","#afa"]*7 + ["#8ef", "#faa", "#afa", "#fea"]
|
201 |
|
|
|
209 |
# 'epaulette', 'sleeve', 'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon', 'rivet',
|
210 |
# 'ruffle', 'sequin', 'tassel'""", unsafe_allow_html=True)
|
211 |
|
212 |
+
st.markdown("Credits for the model: https://huggingface.co/valentinafeve/yolos-fashionpedia")
|
213 |
st.markdown("")
|
214 |
st.markdown("")
|
215 |
|
|
|
294 |
st.markdown("### Define a threshold for predictions π")
|
295 |
st.markdown("""In this section, you can select a threshold for the model's final predictions. <br>
|
296 |
Objects that are given a lower score than the chosen threshold will be ignored in the final results""", unsafe_allow_html=True)
|
297 |
+
|
298 |
st.info("""**Note**: Object detection models detect objects using bounding boxes as well as assign objects to specific classes.
|
299 |
Each object is given a class based on a probability score computed by the model. A high probability signals that the model is confident in its prediction.
|
300 |
On the contrary, a lower probability score signals a level of uncertainty.""")
|
pages/recommendation_system.py
CHANGED
@@ -26,7 +26,7 @@ st.markdown("### What is a Recommendation System ?")
|
|
26 |
st.info("""**Recommendation systems** are algorithms built to **suggest** or **recommend** **products** to consumers.
|
27 |
They are very common in social media platforms such as TikTok, Youtube or Instagram or e-commerce websites as they help improve and personalize a consumer's experience.""")
|
28 |
|
29 |
-
st.markdown("""There are two
|
30 |
- **Content-based filtering**: Recommendations are made based on the user's own preferences
|
31 |
- **Collaborative filtering**: Recommendations are made based on the preferences and behavior of similar users""", unsafe_allow_html=True)
|
32 |
|
|
|
26 |
st.info("""**Recommendation systems** are algorithms built to **suggest** or **recommend** **products** to consumers.
|
27 |
They are very common in social media platforms such as TikTok, Youtube or Instagram or e-commerce websites as they help improve and personalize a consumer's experience.""")
|
28 |
|
29 |
+
st.markdown("""There are two main types of recommendation systems:
|
30 |
- **Content-based filtering**: Recommendations are made based on the user's own preferences
|
31 |
- **Collaborative filtering**: Recommendations are made based on the preferences and behavior of similar users""", unsafe_allow_html=True)
|
32 |
|
pages/supervised_unsupervised_page.py
CHANGED
@@ -26,8 +26,9 @@ st.set_page_config(layout="wide")
|
|
26 |
#st.image("images/ML_header.jpg", use_column_width=True)
|
27 |
st.markdown("# Supervised vs Unsupervised Learning π")
|
28 |
|
29 |
-
st.info("""
|
30 |
-
|
|
|
31 |
|
32 |
st.markdown(" ")
|
33 |
#st.markdown("## What are the differences between both ?")
|
@@ -38,7 +39,7 @@ with col1:
|
|
38 |
st.markdown("## Supervised Learning")
|
39 |
st.markdown("""Supervised learning models are trained by learning from **labeled data**. <br>
|
40 |
Labeled data provides the model with the desired output, which it will then use to learn relevant patterns and make predictions.
|
41 |
-
- A model is first **trained** to make predictions using labeled data.
|
42 |
- The trained model can then be used to **predict values** for new data.
|
43 |
""", unsafe_allow_html=True)
|
44 |
st.markdown(" ")
|
@@ -57,7 +58,7 @@ with col2:
|
|
57 |
|
58 |
st.markdown(" ")
|
59 |
|
60 |
-
learning_type = st.selectbox("**Select
|
61 |
["Supervised Learning",
|
62 |
"Unsupervised Learning"])
|
63 |
|
@@ -91,8 +92,11 @@ if learning_type == "Supervised Learning":
|
|
91 |
## Description of the use case
|
92 |
st.divider()
|
93 |
st.markdown("# Credit score classification π―")
|
94 |
-
st.info("""**Classification**
|
95 |
-
|
|
|
|
|
|
|
96 |
st.markdown(" ")
|
97 |
|
98 |
_, col, _ = st.columns([0.25,0.5,0.25])
|
@@ -101,7 +105,7 @@ if learning_type == "Supervised Learning":
|
|
101 |
|
102 |
## Learn about the data
|
103 |
st.markdown("#### About the data π")
|
104 |
-
st.markdown("""To train the credit classification model, you were provided a **labeled** database with
|
105 |
This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Credit_Score** variable.""",
|
106 |
unsafe_allow_html=True)
|
107 |
|
@@ -350,9 +354,12 @@ if learning_type == "Supervised Learning":
|
|
350 |
## Description of the use case
|
351 |
st.divider()
|
352 |
st.markdown("# Customer churn prediction β")
|
353 |
-
|
354 |
-
|
355 |
-
|
|
|
|
|
|
|
356 |
|
357 |
st.markdown(" ")
|
358 |
|
@@ -367,8 +374,8 @@ if learning_type == "Supervised Learning":
|
|
367 |
|
368 |
## Learn about the data
|
369 |
st.markdown("#### About the data π")
|
370 |
-
st.markdown("""To train the customer churn
|
371 |
-
The data contains information on which services the customer has signed for, information
|
372 |
unsafe_allow_html=True)
|
373 |
# st.markdown("This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Churn** variable.")
|
374 |
st.info("**Note**: The variables that had two possible values (Yes or No) were transformed into binary variables (0 or 1) with 0 being 'No' and 1 being 'Yes'.")
|
@@ -660,7 +667,7 @@ def markdown_general_info(df):
|
|
660 |
|
661 |
if learning_type == "Unsupervised Learning":
|
662 |
usl_usecase = st.selectbox("**Choose a use case**",
|
663 |
-
["Customer segmentation π§βπ€βπ§"])
|
664 |
|
665 |
|
666 |
#################################### CUSTOMER SEGMENTATION ##################################
|
@@ -668,16 +675,16 @@ if learning_type == "Unsupervised Learning":
|
|
668 |
path_clustering = r"data/clustering"
|
669 |
path_clustering_results = r"data/clustering/results"
|
670 |
|
671 |
-
if usl_usecase == "Customer segmentation π§βπ€βπ§":
|
672 |
|
673 |
# st.divider()
|
674 |
st.divider()
|
675 |
-
st.markdown("# Customer Segmentation π§βπ€βπ§")
|
676 |
|
677 |
-
st.
|
678 |
-
|
679 |
-
|
680 |
-
|
681 |
st.markdown(" ")
|
682 |
|
683 |
## Show image
|
@@ -726,13 +733,14 @@ if learning_type == "Unsupervised Learning":
|
|
726 |
st.info("""**Clustering** is a type of unsupervised learning method that learns how to group similar data points together into "clusters", without needing supervision.
|
727 |
In our case, a data point represents a customer that will be assigned to an unknown group.""")
|
728 |
|
729 |
-
|
730 |
-
- The clustering algorithm used in this use case allows a specific number of groups to be identified, which isn't the case for all clustering models.
|
731 |
-
|
732 |
st.markdown(" ")
|
733 |
st.markdown("Here is an example of grouped data using a clustering model.")
|
734 |
st.image("images/clustering.webp")
|
735 |
|
|
|
736 |
|
737 |
nb_groups = st.selectbox("Choose a number of customer groups to identify", np.arange(2,6))
|
738 |
df_results = load_data_pickle(path_clustering_results, f"results_{nb_groups}_clusters.pkl")
|
|
|
26 |
#st.image("images/ML_header.jpg", use_column_width=True)
|
27 |
st.markdown("# Supervised vs Unsupervised Learning π")
|
28 |
|
29 |
+
st.info("""Data Science models are often split into two categories: **Supervised** and **Unsupervised Learning**.
|
30 |
+
The goal of this page is to present these two kinds of Data Science models, as well as give you multiple use cases to try them with.
|
31 |
+
Note that other kinds of AI models exist such as Reinforcement Learning or Federated Learning, which we won't cover in this app.""")
|
32 |
|
33 |
st.markdown(" ")
|
34 |
#st.markdown("## What are the differences between both ?")
|
|
|
39 |
st.markdown("## Supervised Learning")
|
40 |
st.markdown("""Supervised learning models are trained by learning from **labeled data**. <br>
|
41 |
Labeled data provides the model with the desired output, which it will then use to learn relevant patterns and make predictions.
|
42 |
+
- A model is first **trained** to make predictions using labeled data, which contains the desired output.
|
43 |
- The trained model can then be used to **predict values** for new data.
|
44 |
""", unsafe_allow_html=True)
|
45 |
st.markdown(" ")
|
|
|
58 |
|
59 |
st.markdown(" ")
|
60 |
|
61 |
+
learning_type = st.selectbox("**Select an AI task**",
|
62 |
["Supervised Learning",
|
63 |
"Unsupervised Learning"])
|
64 |
|
|
|
92 |
## Description of the use case
|
93 |
st.divider()
|
94 |
st.markdown("# Credit score classification π―")
|
95 |
+
st.info("""**Classification models** are supervised learning models whose goal is to categorize data into predefined categories.
|
96 |
+
As opposed to unsupervised learning models, these categories are known beforehand.
|
97 |
+
Other types of supervised learning models include Regression models, which learn how to predict numerical values, instead of a set number of categories.""")
|
98 |
+
|
99 |
+
st.markdown("In this use case, we will build a **credit score classification model** which predicts whether a client has a 'Bad', 'Standard', or 'Good' credit score.")
|
100 |
st.markdown(" ")
|
101 |
|
102 |
_, col, _ = st.columns([0.25,0.5,0.25])
|
|
|
105 |
|
106 |
## Learn about the data
|
107 |
st.markdown("#### About the data π")
|
108 |
+
st.markdown("""To train the credit classification model, you were provided a **labeled** database with 7600 clients and containing bank and credit-related client information. <br>
|
109 |
This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Credit_Score** variable.""",
|
110 |
unsafe_allow_html=True)
|
111 |
|
|
|
354 |
## Description of the use case
|
355 |
st.divider()
|
356 |
st.markdown("# Customer churn prediction β")
|
357 |
+
|
358 |
+
st.info("""**Classification models** are supervised learning models whose goal is to categorize data into predefined categories.
|
359 |
+
As opposed to unsupervised learning models, these categories are known beforehand.
|
360 |
+
Other types of supervised learning models include Regression models, which learn how to predict numerical values, instead of a set number of categories.""")
|
361 |
+
|
362 |
+
st.markdown("For this use case, we will build a **customer churn classification model** that can predict whether a person will stop being a customer using historical data.")
|
363 |
|
364 |
st.markdown(" ")
|
365 |
|
|
|
374 |
|
375 |
## Learn about the data
|
376 |
st.markdown("#### About the data π")
|
377 |
+
st.markdown("""To train the customer churn model, you were provided a **labeled** database with around 7000 clients of a telecommunications company. <br>
|
378 |
+
The data contains information on which services the customer has signed for, account information as well as whether the customer churned or not (our label here).""",
|
379 |
unsafe_allow_html=True)
|
380 |
# st.markdown("This dataset is 'labeled' since it contains information on what we are trying to predict, which is the **Churn** variable.")
|
381 |
st.info("**Note**: The variables that had two possible values (Yes or No) were transformed into binary variables (0 or 1) with 0 being 'No' and 1 being 'Yes'.")
|
|
|
667 |
|
668 |
if learning_type == "Unsupervised Learning":
|
669 |
usl_usecase = st.selectbox("**Choose a use case**",
|
670 |
+
["Customer segmentation (clustering) π§βπ€βπ§"])
|
671 |
|
672 |
|
673 |
#################################### CUSTOMER SEGMENTATION ##################################
|
|
|
675 |
path_clustering = r"data/clustering"
|
676 |
path_clustering_results = r"data/clustering/results"
|
677 |
|
678 |
+
if usl_usecase == "Customer segmentation (clustering) π§βπ€βπ§":
|
679 |
|
680 |
# st.divider()
|
681 |
st.divider()
|
682 |
+
st.markdown("# Customer Segmentation (clustering) π§βπ€βπ§")
|
683 |
|
684 |
+
st.markdown("""In this use case, we will use a clustering model, a type of Unsupervised Learning model, to perform **Customer Segmentation**. <br>
|
685 |
+
Our model will allow similar groups of clients to be identified within a company's consumer database based on consumer habits and characteristics.
|
686 |
+
""", unsafe_allow_html=True)
|
687 |
+
|
688 |
st.markdown(" ")
|
689 |
|
690 |
## Show image
|
|
|
733 |
st.info("""**Clustering** is a type of unsupervised learning method that learns how to group similar data points together into "clusters", without needing supervision.
|
734 |
In our case, a data point represents a customer that will be assigned to an unknown group.""")
|
735 |
|
736 |
+
# st.markdown("""
|
737 |
+
# - The clustering algorithm used in this use case allows a specific number of groups to be identified, which isn't the case for all clustering models.""")
|
738 |
+
|
739 |
st.markdown(" ")
|
740 |
st.markdown("Here is an example of grouped data using a clustering model.")
|
741 |
st.image("images/clustering.webp")
|
742 |
|
743 |
+
st.warning("**Note**: The number of clusters chosen by the user can have a strong impact on the quality of the segmentation. Try to run the model multiple times with different numbers of clusters and see which number leads to groups with more distinct customer behaviors/preferences.")
|
744 |
|
745 |
nb_groups = st.selectbox("Choose a number of customer groups to identify", np.arange(2,6))
|
746 |
df_results = load_data_pickle(path_clustering_results, f"results_{nb_groups}_clusters.pkl")
|
pages/timeseries_analysis.py
CHANGED
@@ -35,7 +35,7 @@ def forecast_prophet(train, test, col=None):
|
|
35 |
st.markdown("# Time Series Forecasting π")
|
36 |
|
37 |
st.markdown("### What is Time Series Forecasting ?")
|
38 |
-
st.info("""Time series forecasting models are AI models built to make
|
39 |
These types of models take into account temporal patterns, such as **trends** (long-term movements), **seasonality** (repeating patterns at fixed intervals), and **cyclic patterns** (repeating patterns not necessarily at fixed intervals)""")
|
40 |
#unsafe_allow_html=True)
|
41 |
|
@@ -77,8 +77,8 @@ st.divider()
|
|
77 |
st.markdown("# Power Consumption Forecasting ⚡")
|
78 |
|
79 |
#st.markdown(" ")
|
80 |
-
st.info("""In this use case, a time series forecasting model
|
81 |
-
A forecasting model can be a valuable tool
|
82 |
|
83 |
st.markdown(" ")
|
84 |
|
|
|
35 |
st.markdown("# Time Series Forecasting π")
|
36 |
|
37 |
st.markdown("### What is Time Series Forecasting ?")
|
38 |
+
st.info("""Time series forecasting models are AI models built to make predictions about future values using historical data.
|
39 |
These types of models take into account temporal patterns, such as **trends** (long-term movements), **seasonality** (repeating patterns at fixed intervals), and **cyclic patterns** (repeating patterns not necessarily at fixed intervals)""")
|
40 |
#unsafe_allow_html=True)
|
41 |
|
|
|
77 |
st.markdown("# Power Consumption Forecasting ⚡")
|
78 |
|
79 |
#st.markdown(" ")
|
80 |
+
st.info("""In this use case, a time series forecasting model learns how to accurately predict the **energy consumption** (or global active power in the dataset) of a household using historical data.
|
81 |
+
A forecasting model can be a valuable tool for energy consumption analysis as it can help **optimize resource planning** and **avoid overloads** during peak demand periods.""")
|
82 |
|
83 |
st.markdown(" ")
|
84 |
|
pages/topic_modeling.py
CHANGED
@@ -41,7 +41,7 @@ st.markdown(" ")
|
|
41 |
st.divider()
|
42 |
|
43 |
st.markdown("# Topic modeling on product descriptions ποΈ")
|
44 |
-
st.
|
45 |
the main types of products sold.""")
|
46 |
|
47 |
_, col, _ = st.columns([0.2,0.6,0.2])
|
|
|
41 |
st.divider()
|
42 |
|
43 |
st.markdown("# Topic modeling on product descriptions ποΈ")
|
44 |
+
st.markdown("""In this use case, we will use a **topic model** to categorize around **20 000 e-commerce products** as well as identify
|
45 |
the main types of products sold.""")
|
46 |
|
47 |
_, col, _ = st.columns([0.2,0.6,0.2])
|