etrotta committed on
Commit 25e395f · 1 Parent(s): caac288

Update reactive plots comments

Files changed (1)
  1. polars/05_reactive_plots.py +125 -61
polars/05_reactive_plots.py CHANGED
@@ -4,14 +4,14 @@
4
  # "marimo",
5
  # "numpy==2.2.3",
6
  # "plotly[express]==6.0.0",
7
- # "polars==1.23.0",
8
  # "statsmodels==0.14.4",
9
  # ]
10
  # ///
11
 
12
  import marimo
13
 
14
- __generated_with = "0.11.16"
15
  app = marimo.App(width="medium")
16
 
17
 
@@ -30,9 +30,35 @@ def _(mo):
30
  For this tutorial, we will be using the [Spotify Tracks dataset](https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset).
31
 
32
  Note that it does not contain data about ***all*** tracks; you can try using a larger dataset such as [bigdata-pw/Spotify](https://huggingface.co/datasets/bigdata-pw/Spotify), but I'm sticking with the smaller one to keep the notebook size manageable for most users.
33
 
34
  You should always take a look at the data you are working with before doing any operations on it - for data coming from sources such as HuggingFace or Kaggle, you may want to look at it on their websites, then filter or do some transformations before downloading.
35
 
 
 
36
  Let's say that looking at it in the Data Viewer, we decided we do not want the Unnamed column (which appears to be the row index), nor do we care about the original ID, and we only want non-explicit tracks.
37
  """
38
  )
@@ -40,44 +66,36 @@ def _(mo):
40
 
41
 
42
  @app.cell
43
- def _(pl):
44
- # You can read directly from the Hugging Face dataset, in which case polars will only read the necessary data:
45
- #repo_id, branch, file_path = (
46
- # "maharshipandya/spotify-tracks-dataset",
47
- # "~parquet",
48
- # "default/train/0000.parquet",
49
- #)
50
- #URL = f"hf://datasets/{repo_id}@{branch}/{file_path}"
51
- #lz = pl.scan_parquet(URL)
52
- # Or save to a local file first if you want to avoid downloading it each time you run:
53
- file_path = "spotify-tracks.parquet"
54
- lz = pl.scan_parquet(file_path)
55
  df = (
56
  lz
57
  # Filter data we consider relevant (somewhat arbitrary in this example)
58
  .filter(pl.col("explicit") == False)
59
  .drop("Unnamed: 0", "track_id", "explicit")
60
  .with_columns(
61
- # Some random transformations for example,
62
- # Transform a String column with few unique values into Categorical to occupy less memory
63
- pl.col("track_genre").cast(pl.Categorical()),
64
  # Convert the duration from milliseconds to seconds (int)
65
  pl.col("duration_ms").floordiv(1_000).alias("duration_seconds"),
66
  # Convert the popularity from an integer 0 ~ 100 to a percentage 0 ~ 1.0
67
  pl.col("popularity").truediv(100),
68
  )
69
- # lastly, download and collect into memory
70
  .collect()
71
  )
72
  df
73
- return df, file_path, lz
74
 
75
 
76
  @app.cell(hide_code=True)
77
  def _(mo):
78
  mo.md(
79
  r"""
80
- We may want to start by investigating any values that seem weird, to verify if there could be issues in the data, in bugs in our pipelines, or if our understanding of it is wrong.
81
 
82
  For example, the "min" value for the duration column is zero, and the max is over an hour. Why is that?
83
  """
@@ -93,9 +111,20 @@ def _(df, pl):
93
  return
94
 
95
 
96
  @app.cell
97
  def _(df, mo, px):
98
- # Let's visualize it and get a feel for which region makes sense to focus on for our analysis
99
  duration_counts = df.group_by("duration_seconds").len("count")
100
  fig = px.bar(duration_counts, x="duration_seconds", y="count")
101
  fig.update_layout(selectdirection="h")
@@ -108,10 +137,10 @@ def _(df, mo, px):
108
  def _(mo):
109
  mo.md(
110
  """
111
- The previous cell set a default, but you can and should try moving it around a bit.
112
-
113
  Note how there are a few outliers with extremely short durations (less than 2 minutes) and a few with extremely long durations (more than 6 minutes).
114
 
 
 
115
  We will focus on those within that middle ground from around 120 seconds to 360 seconds, but you can play around with it a bit and see how the results change if you move the Selection region. Perhaps you can even find some Classical songs?
116
  """
117
  )
@@ -120,21 +149,24 @@ def _(mo):
120
 
121
  @app.cell
122
  def _(pl, plot):
123
- # We can see our selection and use it as a filter:
124
  pl.DataFrame(plot.value)
125
  return
126
 
127
 
128
  @app.cell
129
  def _(df, pl, plot):
130
- if plot.value:
131
  min_dur, max_dur = (
132
  min(row["duration_seconds"] for row in plot.value),
133
  max(row["duration_seconds"] for row in plot.value),
134
  )
135
- else:
136
- print("Could not find a selected region. Using default values instead, try clicking and dragging in the above plot to change them.")
137
- min_dur, max_dur = 120, 360
138
 
139
  # Calculate how many we are keeping vs throwing away with the filter
140
  duration_in_range = pl.col("duration_seconds").is_between(min_dur, max_dur)
@@ -142,7 +174,7 @@ def _(df, pl, plot):
142
  f"Filtering to keep rows between {min_dur}s and {max_dur}s duration - Throwing away {df.select(1 - duration_in_range.mean()).item():.2%} of the rows"
143
  )
144
 
145
- # Actually filter
146
  filtered_duration = df.filter(duration_in_range)
147
  filtered_duration
148
  return duration_in_range, filtered_duration, max_dur, min_dur
@@ -152,7 +184,7 @@ def _(df, pl, plot):
152
  def _(mo):
153
  mo.md(
154
  r"""
155
- Now that our data is clean, let's start making some more analises over it. Some example questions:
156
 
157
  - Which tracks or artists are the most popular? (Both globally as well as for each genre)
158
  - Which genres are the most popular? The loudest?
@@ -163,6 +195,7 @@ def _(mo):
163
  - Can you classify a song's genre based on its attributes?
164
 
165
  For brevity, we will not explore all of them - feel free to try some of the others yourself, or go deeper into the ones we do explore.
 
166
  """
167
  )
168
  return
@@ -174,24 +207,25 @@ def _(filter_genre, filtered_duration, mo, pl):
174
  most_popular_artists = (
175
  filtered_duration.lazy()
176
  .with_columns(pl.col("artists").str.split(";"))
177
- # Spoiler for the next cell! Remember that in marimo you can do things 'out of order'
178
  .filter(True if filter_genre.value is None else pl.col("track_genre").eq(filter_genre.value))
179
  .explode("artists")
180
  .group_by("artists")
181
  .agg(
182
- # Now, how we aggregate it is also a question.
183
  # Do we take the sum of each of their songs' popularity?
184
  # Do we just take their most popular song?
185
  # Do we take an average of their songs' popularity?
186
  # We'll proceed with the average of their top 10 most popular songs for now,
187
- # but that is something you may want to modify and experiment with.
188
  pl.col("popularity").top_k(10).mean(),
 
189
  # Let's also take some of their most popular songs and albums for reference:
190
  pl.col("track_name").sort_by("popularity").unique(maintain_order=True).top_k(5),
191
  pl.col("album_name").sort_by("popularity").unique(maintain_order=True).top_k(5),
192
  pl.col("track_genre").top_k_by("popularity", k=1).alias("Most popular genre"),
193
  # And for good measure, see how many total tracks they have
194
- pl.col("track_name").n_unique().alias("tracks_count")
195
  )
196
  .collect()
197
  )
@@ -209,7 +243,15 @@ def _(most_popular_artists, pl):
209
  @app.cell
210
  def _(filtered_duration, mo):
211
  # Recognize any of your favourite songs? Me neither. Let's try adding a filter by genre
212
- filter_genre = mo.ui.dropdown(options=filtered_duration["track_genre"].unique().sort().to_list(), allow_select_none=True, value=None, searchable=True, label="Filter by Track Genre:")
213
  filter_genre
214
  return (filter_genre,)
215
 
@@ -229,9 +271,11 @@ def _(mo):
229
  @app.cell
230
  def _(filtered_duration, pl, px):
231
  fig_dur_per_genre = px.scatter(
232
- filtered_duration.group_by("track_genre").agg(
 
233
  pl.col("duration_seconds", "popularity").mean().round(2),
234
- ).sort("track_genre", descending=True),
 
235
  hover_name="track_genre",
236
  y="duration_seconds",
237
  x="popularity",
@@ -279,8 +323,14 @@ def _(filtered_duration, mo):
279
  color = mo.ui.dropdown(options, value="loudness", allow_select_none=True, searchable=True, label="Color column")
280
  alpha = mo.ui.slider(start=0.01, stop=1.0, step=0.01, value=0.1, label="Alpha", show_value=True)
281
  include_trendline = mo.ui.checkbox(label="Trendline")
282
- # We *could* reuse the same filter_genre as above, but it would cause marimo to rerun both the table and the graph whenever we change either
283
- filter_genre2 = mo.ui.dropdown(options=filtered_duration["track_genre"].unique().sort().to_list(), allow_select_none=True, value=None, searchable=True, label="Filter by Track Genre:")
284
  x_axis, y_axis, color, alpha, include_trendline, filter_genre2
285
  return (
286
  alpha,
@@ -307,7 +357,9 @@ def _(
307
  y_axis,
308
  ):
309
  fig2 = px.scatter(
310
- filtered_duration.filter((pl.col("track_genre") == filter_genre2.value) if filter_genre2.value is not None else True),
 
 
311
  x=x_axis.value,
312
  y=y_axis.value,
313
  color=color.value,
@@ -334,14 +386,16 @@ def _(mo):
334
 
335
  @app.cell
336
  def _(chart2, filtered_duration, mo, pl):
337
- # Let's look at which sort of songs were included in that region
338
  if len(chart2.value) == 0:
339
  out = mo.md("No data found in selection")
340
  active_columns = column_order = None
341
  else:
342
  active_columns = list(chart2.value[0].keys())
343
  column_order = ["track_name", *active_columns, "album_name", "artists"]
344
- out = filtered_duration.join(pl.DataFrame(chart2.value).unique(), on=active_columns).select(pl.col(column_order), pl.exclude(*column_order))
 
 
345
  out
346
  return active_columns, column_order, out
347
 
@@ -363,8 +417,8 @@ def _(mo):
363
  @app.cell(disabled=True)
364
  def _(filtered_duration, mo, pl):
365
  # Note that we cannot use a dropdown because the number of elements is enormous:
366
- all_artists = filtered_duration.select(pl.col("artists").str.split(';').explode().unique().sort())['artists'].to_list()
367
- all_tracks = filtered_duration['track_name'].unique().sort().to_list()
368
  alternative_filter_artist = mo.ui.dropdown(all_artists, value=None, searchable=True)
369
  alternative_filter_track = mo.ui.dropdown(all_tracks, value=None, searchable=True)
370
  # So we just provide freeform text boxes and filter ourselves later
@@ -387,22 +441,29 @@ def _(filter_artist, filter_track, filtered_duration, mo, pl):
387
  string = string.casefold()
388
  return (
389
  # For a more professional use case, you might want to look into string distance functions
390
- # in the polars-dspolars-ds package or other polars plugins
391
- - col.str.len_chars().cast(pl.Int32())
392
  + pl.when(col.str.contains(string)).then(50).otherwise(0)
393
  + pl.when(col.str.starts_with(string)).then(50).otherwise(0)
394
  )
395
 
396
- filtered_artist_track = filtered_duration.select(
397
- pl.col("artists"),
398
- pl.col("track_name"),
399
- (score_match_text(pl.col("track_name"), filter_track.value)
400
- + pl.col('artists').str.split(';').list.eval(score_match_text(pl.element(), filter_artist.value)).list.sum()).alias("match_score"),
401
- pl.col("album_name"),
402
- pl.col("track_genre"),
403
- pl.col("popularity"),
404
- pl.col("duration_seconds"),
405
- ).filter(pl.col("match_score") > 0).sort("match_score", descending=True)
 
407
  mo.md("Filter a track based on its name or artist"), filter_artist, filter_track, filtered_artist_track
408
  return filtered_artist_track, score_match_text
@@ -412,22 +473,25 @@ def _(filter_artist, filter_track, filtered_duration, mo, pl):
412
  def _(filter_genre2, filtered_duration, mo, pl):
413
  # Artists combinations
414
  artist_combinations = (
415
- filtered_duration
416
- .lazy()
417
  .filter((pl.col("track_genre") == filter_genre2.value) if filter_genre2.value is not None else True)
418
- .with_columns(pl.col("artists").str.split(';'))
419
  .with_columns(pl.col("artists").alias("other_artist"))
420
  .explode("artists")
421
  .explode("other_artist")
422
  # Filter to:
423
  # 1) Remove an artist with themselves
424
- # 2) Remove duplicate combinations, otherwise we would have once row for (A, B) and one for (B, A)
425
  .filter(pl.col("artists") > pl.col("other_artist"))
426
  .group_by("artists", "other_artist")
427
  .len("count")
428
  .collect()
429
  )
430
- mo.md("Check which artists collaborate with others most often (reuses the last genre filter)"), filter_genre2, artist_combinations.sort("count", descending=True)
431
  return (artist_combinations,)
432
 
433
 
 
4
  # "marimo",
5
  # "numpy==2.2.3",
6
  # "plotly[express]==6.0.0",
7
+ # "polars==1.26.0",
8
  # "statsmodels==0.14.4",
9
  # ]
10
  # ///
11
 
12
  import marimo
13
 
14
+ __generated_with = "0.11.26"
15
  app = marimo.App(width="medium")
16
 
17
 
 
30
  For this tutorial, we will be using the [Spotify Tracks dataset](https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset).
31
 
32
  Note that it does not contain data about ***all*** tracks; you can try using a larger dataset such as [bigdata-pw/Spotify](https://huggingface.co/datasets/bigdata-pw/Spotify), but I'm sticking with the smaller one to keep the notebook size manageable for most users.
33
+ """
34
+ )
35
+ return
36
+
37
 
38
+ @app.cell
39
+ def _(pl):
40
+ # You can read directly from the Hugging Face dataset, in which case polars will only read the necessary data:
41
+ repo_id, branch, file_path = (
42
+ "maharshipandya/spotify-tracks-dataset",
43
+ "~parquet",
44
+ "default/train/0000.parquet",
45
+ )
46
+ URL = f"hf://datasets/{repo_id}@{branch}/{file_path}"
47
+ lz = pl.scan_parquet(URL)
48
+ # Or save to a local file first if you want to avoid downloading it each time you run:
49
+ # file_path = "spotify-tracks.parquet"
50
+ # lz = pl.scan_parquet(file_path)
51
+ return URL, branch, file_path, lz, repo_id
52
+
53
+
54
+ @app.cell(hide_code=True)
55
+ def _(mo):
56
+ mo.md(
57
+ """
58
  You should always take a look at the data you are working with before doing any operations on it - for data coming from sources such as HuggingFace or Kaggle, you may want to look at it on their websites, then filter or do some transformations before downloading.
59
 
60
+ The Polars lazy engine lets you define operations before loading the data, and Polars will optimize the plan to avoid doing unnecessary work.
61
+
62
  Let's say that looking at it in the Data Viewer, we decided we do not want the Unnamed column (which appears to be the row index), nor do we care about the original ID, and we only want non-explicit tracks.
63
  """
64
  )
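As a side note, here is a minimal sketch (not part of the notebook) of how you might peek at the schema, a few rows, and the optimized plan before collecting anything into memory. It reuses the same `hf://` path built in the cell above; `collect_schema`, `head`, and `explain` are standard Polars `LazyFrame` methods:

```python
import polars as pl

lz = pl.scan_parquet(
    "hf://datasets/maharshipandya/spotify-tracks-dataset@~parquet/default/train/0000.parquet"
)

# Column names and dtypes, resolved without materializing the full file
print(lz.collect_schema())

# Fetch only a handful of rows to eyeball the values
print(lz.head(5).collect())

# The plan Polars intends to execute - note the pushed-down filter and projection
print(lz.filter(pl.col("explicit") == False).drop("Unnamed: 0", "track_id").explain())
```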
 
66
 
67
 
68
  @app.cell
69
+ def _(lz, pl):
70
  df = (
71
  lz
72
  # Filter data we consider relevant (somewhat arbitrary in this example)
73
  .filter(pl.col("explicit") == False)
74
  .drop("Unnamed: 0", "track_id", "explicit")
75
  .with_columns(
76
+ # Perform whichever transformations you want (again somewhat arbitrary in this example)
 
 
77
  # Convert the duration from milliseconds to seconds (int)
78
  pl.col("duration_ms").floordiv(1_000).alias("duration_seconds"),
79
  # Convert the popularity from an integer 0 ~ 100 to a percentage 0 ~ 1.0
80
  pl.col("popularity").truediv(100),
81
  )
82
+ # lastly, download (if needed) and collect into memory
83
  .collect()
84
  )
85
  df
86
+ return (df,)
87
 
88
 
89
  @app.cell(hide_code=True)
90
  def _(mo):
91
  mo.md(
92
  r"""
93
+ When you start exploring a dataset, some of the first things to do may include:
94
+
95
+ - investigating any values that seem weird
96
+ - verifying if there could be issues in the data
97
+ - checking for potential bugs in your pipelines
98
+ - ensuring you understand the data correctly, including its relationships and edge cases
99
 
100
  For example, the "min" value for the duration column is zero, and the max is over an hour. Why is that?
101
  """
 
111
  return
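As a hedged illustration of that kind of sanity check (assuming the `df` and `pl` defined in the cells above), Polars' summary statistics surface these outliers quickly:

```python
# Per-column summary statistics: count, null_count, mean, std, min, max, quartiles
print(df.describe())

# Or zoom in on the suspicious column directly
print(
    df.select(
        pl.col("duration_seconds").min().alias("shortest_s"),
        pl.col("duration_seconds").max().alias("longest_s"),
    )
)
```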
112
 
113
 
114
+ @app.cell(hide_code=True)
115
+ def _(mo):
116
+ mo.md(
117
+ r"""
118
+ For this Notebook we will be using [plotly](https://plotly.com/python), but marimo also supports some other plotting libraries; read the documentation to learn more later.
119
+
120
+ Let's visualize it using a [bar chart](https://plotly.com/python/bar-charts/) and get a feel for which region makes sense to focus on for our analysis
121
+ """
122
+ )
123
+ return
124
+
125
+
126
  @app.cell
127
  def _(df, mo, px):
 
128
  duration_counts = df.group_by("duration_seconds").len("count")
129
  fig = px.bar(duration_counts, x="duration_seconds", y="count")
130
  fig.update_layout(selectdirection="h")
 
137
  def _(mo):
138
  mo.md(
139
  """
 
 
140
  Note how there are a few outliers with extremely short durations (less than 2 minutes) and a few with extremely long durations (more than 6 minutes).
141
 
142
+ You can select a region in the graph by clicking and dragging, which can later be used to filter or transform data. In this Notebook we set a default if there is no selection, but you should try selecting a region yourself.
143
+
144
  We will focus on those within that middle ground from around 120 seconds to 360 seconds, but you can play around with it a bit and see how the results change if you move the Selection region. Perhaps you can even find some Classical songs?
145
  """
146
  )
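The cell that turns the figure into a reactive element is not shown in this hunk. One plausible sketch uses marimo's `mo.ui.plotly` wrapper; the `plot` name and its `.value` attribute are taken from the cells below, but the exact wiring here is an assumption:

```python
import marimo as mo

# Wrapping the Plotly figure makes it reactive: a box selection on the chart
# is exposed to other cells as `plot.value` (a list of selected points)
plot = mo.ui.plotly(fig)  # `fig` is the bar chart built in the cell above
plot
```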
 
149
 
150
  @app.cell
151
  def _(pl, plot):
152
+ # Taking a look at the selection:
153
  pl.DataFrame(plot.value)
154
  return
155
 
156
 
157
  @app.cell
158
  def _(df, pl, plot):
159
+ if plot.value is None or len(plot.value) == 0:
160
+ print(
161
+ "Could not find a selected region. Using default values instead, try clicking and dragging in the above plot to change them."
162
+ )
163
+ min_dur, max_dur = 120, 360
164
+ else:
165
+ # We can retrieve it and use it as a filter:
166
  min_dur, max_dur = (
167
  min(row["duration_seconds"] for row in plot.value),
168
  max(row["duration_seconds"] for row in plot.value),
169
  )
 
 
 
170
 
171
  # Calculate how many we are keeping vs throwing away with the filter
172
  duration_in_range = pl.col("duration_seconds").is_between(min_dur, max_dur)
 
174
  f"Filtering to keep rows between {min_dur}s and {max_dur}s duration - Throwing away {df.select(1 - duration_in_range.mean()).item():.2%} of the rows"
175
  )
176
 
177
+ # Actually apply the filter
178
  filtered_duration = df.filter(duration_in_range)
179
  filtered_duration
180
  return duration_in_range, filtered_duration, max_dur, min_dur
 
184
  def _(mo):
185
  mo.md(
186
  r"""
187
+ Now that our data is clean, let's start coming up with and answering some questions about it. Some examples:
188
 
189
  - Which tracks or artists are the most popular? (Both globally as well as for each genre)
190
  - Which genres are the most popular? The loudest?
 
195
  - Can you classify a song's genre based on its attributes?
196
 
197
  For brevity, we will not explore all of them - feel free to try some of the others yourself, or go deeper into the ones we do explore; one of the skipped questions is sketched just below.
198
+ Make sure to come up with some questions of your own and explore them as well!
199
  """
200
  )
201
  return
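As a quick sketch for one of the skipped questions from the list above (the loudest genres), reusing `filtered_duration` and `pl` from the surrounding cells; the `loudness` column is the same one used later as the scatter-plot color default:

```python
# Average loudness per genre, loudest first (a sketch, not part of the notebook)
loudest_genres = (
    filtered_duration
    .group_by("track_genre")
    .agg(pl.col("loudness").mean().round(2).alias("mean_loudness"))
    .sort("mean_loudness", descending=True)
)
loudest_genres.head(10)
```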
 
207
  most_popular_artists = (
208
  filtered_duration.lazy()
209
  .with_columns(pl.col("artists").str.split(";"))
210
+ # Spoiler for a future cell! Remember that in marimo you can do things 'out of order'
211
  .filter(True if filter_genre.value is None else pl.col("track_genre").eq(filter_genre.value))
212
  .explode("artists")
213
  .group_by("artists")
214
  .agg(
215
+ # How to aggregate it is also a question,
216
  # Do we take the sum of each of their songs' popularity?
217
  # Do we just take their most popular song?
218
  # Do we take an average of their songs' popularity?
219
  # We'll proceed with the average of their top 10 most popular songs for now,
220
+ # but that is something you may want to modify and experiment with, or ask for input from stakeholders in real problems.
221
  pl.col("popularity").top_k(10).mean(),
222
+ # Say that after doing this you don't recognize them and want to know what their top hits are,
223
  # Let's also take some of their most popular albums songs for reference:
224
  pl.col("track_name").sort_by("popularity").unique(maintain_order=True).top_k(5),
225
  pl.col("album_name").sort_by("popularity").unique(maintain_order=True).top_k(5),
226
  pl.col("track_genre").top_k_by("popularity", k=1).alias("Most popular genre"),
227
  # And for good measure, see how many total tracks they have
228
+ pl.col("track_name").n_unique().alias("tracks_count"),
229
  )
230
  .collect()
231
  )
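For reference, the alternative aggregations mentioned in the comments above would each be a one-expression swap inside that `.agg(...)` call (a sketch, not part of the notebook):

```python
import polars as pl

# Alternatives to pl.col("popularity").top_k(10).mean() discussed in the comments:
total_popularity = pl.col("popularity").sum()   # sum over all of an artist's tracks
best_track_only = pl.col("popularity").max()    # just their single most popular track
plain_average = pl.col("popularity").mean()     # average over every track, not only the top 10
```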
 
243
  @app.cell
244
  def _(filtered_duration, mo):
245
  # Recognize any of your favourite songs? Me neither. Let's try adding a filter by genre
246
+ # While developing, you can add things out of order then go back to old cells and edit them,
247
+ # it's up to you whether to re-order them later or keep them in whichever order visually makes the most sense to you.
248
+ filter_genre = mo.ui.dropdown(
249
+ options=filtered_duration["track_genre"].unique().sort().to_list(),
250
+ allow_select_none=True,
251
+ value=None,
252
+ searchable=True,
253
+ label="Filter by Track Genre:",
254
+ )
255
  filter_genre
256
  return (filter_genre,)
257
 
 
271
  @app.cell
272
  def _(filtered_duration, pl, px):
273
  fig_dur_per_genre = px.scatter(
274
+ filtered_duration.group_by("track_genre")
275
+ .agg(
276
  pl.col("duration_seconds", "popularity").mean().round(2),
277
+ )
278
+ .sort("track_genre", descending=True),
279
  hover_name="track_genre",
280
  y="duration_seconds",
281
  x="popularity",
 
323
  color = mo.ui.dropdown(options, value="loudness", allow_select_none=True, searchable=True, label="Color column")
324
  alpha = mo.ui.slider(start=0.01, stop=1.0, step=0.01, value=0.1, label="Alpha", show_value=True)
325
  include_trendline = mo.ui.checkbox(label="Trendline")
326
+ # We *could* reuse the same filter_genre as above, but it would cause marimo to rerun both the table and the graph whenever we change it
327
+ filter_genre2 = mo.ui.dropdown(
328
+ options=filtered_duration["track_genre"].unique().sort().to_list(),
329
+ allow_select_none=True,
330
+ value=None,
331
+ searchable=True,
332
+ label="Filter by Track Genre:",
333
+ )
334
  x_axis, y_axis, color, alpha, include_trendline, filter_genre2
335
  return (
336
  alpha,
 
357
  y_axis,
358
  ):
359
  fig2 = px.scatter(
360
+ filtered_duration.filter(
361
+ (pl.col("track_genre") == filter_genre2.value) if filter_genre2.value is not None else True
362
+ ),
363
  x=x_axis.value,
364
  y=y_axis.value,
365
  color=color.value,
 
386
 
387
  @app.cell
388
  def _(chart2, filtered_duration, mo, pl):
389
+ # Looking at which sort of songs were included in that region
390
  if len(chart2.value) == 0:
391
  out = mo.md("No data found in selection")
392
  active_columns = column_order = None
393
  else:
394
  active_columns = list(chart2.value[0].keys())
395
  column_order = ["track_name", *active_columns, "album_name", "artists"]
396
+ out = filtered_duration.join(pl.DataFrame(chart2.value).unique(), on=active_columns).select(
397
+ pl.col(column_order), pl.exclude(*column_order)
398
+ )
399
  out
400
  return active_columns, column_order, out
401
 
 
417
  @app.cell(disabled=True)
418
  def _(filtered_duration, mo, pl):
419
  # Note that we cannot use a dropdown because the number of elements is enormous:
420
+ all_artists = filtered_duration.select(pl.col("artists").str.split(";").explode().unique().sort())["artists"].to_list()
421
+ all_tracks = filtered_duration["track_name"].unique().sort().to_list()
422
  alternative_filter_artist = mo.ui.dropdown(all_artists, value=None, searchable=True)
423
  alternative_filter_track = mo.ui.dropdown(all_tracks, value=None, searchable=True)
424
  # So we just provide freeform text boxes and filter ourselves later
 
441
  string = string.casefold()
442
  return (
443
  # For a more professional use case, you might want to look into string distance functions
444
+ # in the polars-ds package or other polars plugins
445
+ -col.str.len_chars().cast(pl.Int32())
446
  + pl.when(col.str.contains(string)).then(50).otherwise(0)
447
  + pl.when(col.str.starts_with(string)).then(50).otherwise(0)
448
  )
449
 
450
+
451
+ filtered_artist_track = (
452
+ filtered_duration.select(
453
+ pl.col("artists"),
454
+ pl.col("track_name"),
455
+ (
456
+ score_match_text(pl.col("track_name"), filter_track.value)
457
+ + pl.col("artists").str.split(";").list.eval(score_match_text(pl.element(), filter_artist.value)).list.sum()
458
+ ).alias("match_score"),
459
+ pl.col("album_name"),
460
+ pl.col("track_genre"),
461
+ pl.col("popularity"),
462
+ pl.col("duration_seconds"),
463
+ )
464
+ .filter(pl.col("match_score") > 0)
465
+ .sort("match_score", descending=True)
466
+ )
467
 
468
  mo.md("Filter a track based on its name or artist"), filter_artist, filter_track, filtered_artist_track
469
  return filtered_artist_track, score_match_text
 
473
  def _(filter_genre2, filtered_duration, mo, pl):
474
  # Artists combinations
475
  artist_combinations = (
476
+ filtered_duration.lazy()
 
477
  .filter((pl.col("track_genre") == filter_genre2.value) if filter_genre2.value is not None else True)
478
+ .with_columns(pl.col("artists").str.split(";"))
479
  .with_columns(pl.col("artists").alias("other_artist"))
480
  .explode("artists")
481
  .explode("other_artist")
482
  # Filter to:
483
  # 1) Remove an artist with themselves
484
+ # 2) Remove duplicate combinations, otherwise we would have one row for (A, B) and one for (B, A)
485
  .filter(pl.col("artists") > pl.col("other_artist"))
486
  .group_by("artists", "other_artist")
487
  .len("count")
488
  .collect()
489
  )
490
+ (
491
+ mo.md("Check which artists collaborate with others most often (reuses the last genre filter)"),
492
+ filter_genre2,
493
+ artist_combinations.sort("count", descending=True),
494
+ )
495
  return (artist_combinations,)
496
 
497