Aurelien-Morgan-Bot committed on
Commit 87ca194 · verified · 1 Parent(s): 7901f44

source-code for model version v0.1_20250311_013607311_UTC- retrain-pipelines 0.1.1

v0.1_20250311_013607311_UTC/requirements.txt ADDED
@@ -0,0 +1,625 @@
+ absl-py==1.4.0
+ accelerate==1.3.0
+ aiohappyeyeballs==2.5.0
+ aiohttp==3.11.13
+ aiosignal==1.3.2
+ alabaster==1.0.0
+ albucore==0.0.23
+ albumentations==2.0.5
+ ale-py==0.10.2
+ altair==5.5.0
+ annotated-types==0.7.0
+ anyio==3.7.1
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ array_record==0.6.0
+ arviz==0.20.0
+ astropy==7.0.1
+ astropy-iers-data==0.2025.3.3.0.34.45
+ astunparse==1.6.3
+ atpublic==4.1.0
+ attrs==25.1.0
+ audioread==3.0.1
+ autograd==1.7.0
+ babel==2.17.0
+ backcall==0.2.0
+ beautifulsoup4==4.13.3
+ betterproto==2.0.0b6
+ bigframes==1.38.0
+ bigquery-magics==0.6.0
+ bitsandbytes==0.45.3
+ bleach==6.2.0
+ blinker==1.9.0
+ blis==0.7.11
+ blosc2==3.2.0
+ bokeh==3.6.3
+ boto3==1.37.10
+ botocore==1.37.10
+ Bottleneck==1.4.2
+ bqplot==0.12.44
+ branca==0.8.1
+ CacheControl==0.14.2
+ cachetools==5.5.2
+ catalogue==2.0.10
+ certifi==2025.1.31
+ cffi==1.17.1
+ chardet==5.2.0
+ charset-normalizer==3.4.1
+ chex==0.1.89
+ clarabel==0.10.0
+ click==8.1.8
+ cloudpathlib==0.21.0
+ cloudpickle==3.1.1
+ cmake==3.31.6
+ cmdstanpy==1.2.5
+ colorama==0.4.6
+ colorcet==3.1.0
+ colorlover==0.3.0
+ colour==0.1.5
+ comm==0.2.2
+ community==1.0.0b1
+ confection==0.1.5
+ cons==0.4.6
+ contourpy==1.3.1
+ cramjam==2.9.1
+ cryptography==43.0.3
+ cuda-python==12.6.0
+ cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
+ cudf-polars-cu12==24.12.0
+ cufflinks==0.17.3
+ cuml-cu12==25.2.1
+ cupy-cuda12x==13.3.0
+ cut-cross-entropy==25.1.1
+ cuvs-cu12==25.2.1
+ cvxopt==1.3.2
+ cvxpy==1.6.2
+ cycler==0.12.1
+ cyipopt==1.5.0
+ cymem==2.0.11
+ Cython==3.0.12
+ dask==2024.12.1
+ dask-cuda==25.2.0
+ dask-cudf-cu12==25.2.2
+ dask-expr==1.1.21
+ datascience==0.17.6
+ datasets==3.1.0
+ db-dtypes==1.4.2
+ dbus-python==1.2.18
+ debugpy==1.8.0
+ decorator==4.4.2
+ defusedxml==0.7.1
+ Deprecated==1.2.18
+ diffusers==0.32.2
+ dill==0.3.8
+ distributed==2024.12.1
+ distributed-ucxx-cu12==0.42.0
+ distro==1.9.0
+ dlib==19.24.2
+ dm-tree==0.1.9
+ docker==7.1.0
+ docker-pycreds==0.4.0
+ docstring_parser==0.16
+ docutils==0.21.2
+ dopamine_rl==4.1.2
+ duckdb==1.1.3
+ earthengine-api==1.5.5
+ easydict==1.13
+ editdistance==0.8.1
+ eerepr==0.1.1
+ einops==0.8.1
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
+ entrypoints==0.4
+ et_xmlfile==2.0.0
+ etils==1.12.0
+ etuples==0.3.9
+ Farama-Notifications==0.0.4
+ fastai==2.7.18
+ fastapi==0.115.11
+ fastcore==1.7.29
+ fastdownload==0.0.7
+ fastjsonschema==2.21.1
+ fastprogress==1.0.3
+ fastrlock==0.8.3
+ filelock==3.17.0
+ firebase-admin==6.6.0
+ Flask==3.1.0
+ flatbuffers==25.2.10
+ flax==0.10.4
+ folium==0.19.5
+ fonttools==4.56.0
+ frozendict==2.4.6
+ frozenlist==1.5.0
+ fsspec==2024.9.0
+ future==1.0.0
+ gast==0.6.0
+ GDAL==3.6.4
+ gdown==5.2.0
+ geemap==0.35.3
+ gensim==4.3.3
+ geocoder==1.38.1
+ geographiclib==2.0
+ geopandas==1.0.1
+ geopy==2.4.1
+ gin-config==0.5.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ glob2==0.7
+ google==2.0.3
+ google-ai-generativelanguage==0.6.15
+ google-api-core==2.24.1
+ google-api-python-client==2.160.0
+ google-auth==2.38.0
+ google-auth-httplib2==0.2.0
+ google-auth-oauthlib==1.2.1
+ google-cloud-aiplatform==1.79.0
+ google-cloud-bigquery==3.29.0
+ google-cloud-bigquery-connection==1.18.1
+ google-cloud-bigquery-storage==2.28.0
+ google-cloud-bigtable==2.29.0
+ google-cloud-core==2.4.2
+ google-cloud-dataproc==5.18.0
+ google-cloud-datastore==2.20.2
+ google-cloud-firestore==2.20.1
+ google-cloud-functions==1.19.0
+ google-cloud-iam==2.18.1
+ google-cloud-language==2.16.0
+ google-cloud-pubsub==2.25.0
+ google-cloud-resource-manager==1.14.1
+ google-cloud-spanner==3.52.0
+ google-cloud-storage==2.19.0
+ google-cloud-translate==3.19.0
+ google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
+ google-crc32c==1.6.0
+ google-genai==1.4.0
+ google-generativeai==0.8.4
+ google-pasta==0.2.0
+ google-resumable-media==2.7.2
+ google-spark-connect==0.5.3
+ googleapis-common-protos==1.69.0
+ googledrivedownloader==1.1.0
+ graphviz==0.20.3
+ greenlet==3.1.1
+ grpc-google-iam-v1==0.14.1
+ grpc-interceptor==0.15.4
+ grpcio==1.70.0
+ grpcio-status==1.62.3
+ grpclib==0.4.7
+ gspread==6.1.4
+ gspread-dataframe==4.0.0
+ gym==0.25.2
+ gym-notices==0.0.8
+ gymnasium==1.1.0
+ h11==0.14.0
+ h2==4.2.0
+ h5netcdf==1.5.0
+ h5py==3.12.1
+ hdbscan==0.8.40
+ hf_transfer==0.1.9
+ highspy==1.9.0
+ holidays==0.68
+ holoviews==1.20.1
+ hpack==4.1.0
+ html5lib==1.1
+ httpcore==1.0.7
+ httpimport==1.4.1
+ httplib2==0.22.0
+ httptools==0.6.4
+ httpx==0.28.1
+ huggingface-hub==0.27.1
+ humanize==4.11.0
+ hyperframe==6.1.0
+ hyperopt==0.2.7
+ ibis-framework==9.2.0
+ idna==3.10
+ imageio==2.37.0
+ imageio-ffmpeg==0.6.0
+ imagesize==1.4.1
+ imbalanced-learn==0.13.0
+ imgaug==0.4.0
+ immutabledict==4.2.1
+ importlib_metadata==8.6.1
+ importlib_resources==6.5.2
+ imutils==0.5.4
+ inflect==7.5.0
+ iniconfig==2.0.0
+ intel-cmplr-lib-ur==2025.0.5
+ intel-openmp==2025.0.5
+ ipyevents==2.0.2
+ ipyfilechooser==0.6.0
+ ipykernel==6.29.5
+ ipyleaflet==0.19.2
+ ipyparallel==8.8.0
+ ipython==7.34.0
+ ipython-genutils==0.2.0
+ ipython-sql==0.5.0
+ ipytree==0.2.2
+ ipywidgets==7.7.1
+ itsdangerous==2.2.0
+ jax==0.4.33
+ jax-cuda12-pjrt==0.4.33
+ jax-cuda12-plugin==0.4.33
+ jaxlib==0.4.33
+ jedi==0.19.2
+ jeepney==0.7.1
+ jellyfish==1.1.0
+ jieba==0.42.1
+ Jinja2==3.1.4
+ jiter==0.8.2
+ jmespath==1.0.1
+ joblib==1.4.2
+ jsonpatch==1.33
+ jsonpickle==4.0.2
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ jupyter-client==6.1.12
+ jupyter-console==6.1.0
+ jupyter-leaflet==0.19.2
+ jupyter-server==1.24.0
+ jupyter_core==5.7.2
+ jupyterlab_pygments==0.3.0
+ jupyterlab_widgets==3.0.13
+ kaggle==1.6.17
+ kagglehub==0.3.10
+ keras==3.8.0
+ keras-hub==0.18.1
+ keras-nlp==0.18.1
+ keyring==23.5.0
+ kiwisolver==1.4.8
+ langchain==0.3.20
+ langchain-core==0.3.41
+ langchain-text-splitters==0.3.6
+ langcodes==3.5.0
+ langsmith==0.3.11
+ language_data==1.3.0
+ launchpadlib==1.10.16
+ lazr.restfulclient==0.14.4
+ lazr.uri==1.0.6
+ lazy_loader==0.4
+ libclang==18.1.1
+ libcudf-cu12==24.12.0
+ libcugraph-cu12==25.2.0
+ libcuml-cu12==25.2.1
+ libcuvs-cu12==25.2.1
+ libkvikio-cu12==24.12.1
+ libraft-cu12==25.2.0
+ librosa==0.10.2.post1
+ libucx-cu12==1.18.0
+ libucxx-cu12==0.42.0
+ lightgbm==4.5.0
+ linkify-it-py==2.0.3
+ litserve==0.2.6
+ llvmlite==0.43.0
+ locket==1.0.0
+ logical-unification==0.4.6
+ lxml==5.3.0
+ marisa-trie==1.2.1
+ Markdown==3.7
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.9.2
+ matplotlib-inline==0.1.7
+ matplotlib-venn==1.1.2
+ mdit-py-plugins==0.4.2
+ mdurl==0.1.2
+ metaflow==2.10.0
+ metaflow-card-html==1.0.2
+ miniKanren==1.0.3
+ missingno==0.5.2
+ mistune==3.1.2
+ mizani==0.13.1
+ mkl==2025.0.1
+ ml-dtypes==0.4.1
+ mlxtend==0.23.4
+ more-itertools==10.6.0
+ moviepy==1.0.3
+ mpmath==1.3.0
+ msgpack==1.1.0
+ multidict==6.1.0
+ multipledispatch==1.0.0
+ multiprocess==0.70.16
+ multitasking==0.0.11
+ murmurhash==1.0.12
+ music21==9.3.0
+ namex==0.0.8
+ narwhals==1.29.1
+ natsort==8.4.0
+ nbclassic==1.2.0
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ ndindex==1.9.2
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ nibabel==5.3.2
+ nltk==3.9.1
+ notebook==6.5.5
+ notebook_shim==0.2.4
+ numba==0.60.0
+ numba-cuda==0.2.0
+ numexpr==2.10.2
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvcc-cu12==12.5.82
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-ml-py==12.570.86
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvcomp-cu12==4.1.0.6
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ nvtx==0.2.11
+ nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-25.2.0-py3-none-any.whl
+ oauth2client==4.1.3
+ oauthlib==3.2.2
+ openai==1.61.1
+ opencv-contrib-python==4.11.0.86
+ opencv-python==4.11.0.86
+ opencv-python-headless==4.11.0.86
+ openpyxl==3.1.5
+ opentelemetry-api==1.16.0
+ opentelemetry-sdk==1.16.0
+ opentelemetry-semantic-conventions==0.37b0
+ opt_einsum==3.4.0
+ optax==0.2.4
+ optree==0.14.1
+ orbax-checkpoint==0.6.4
+ orjson==3.10.15
+ osqp==0.6.7.post3
+ packaging==24.2
+ pandas==2.2.2
+ pandas-datareader==0.10.0
+ pandas-gbq==0.28.0
+ pandas-stubs==2.2.2.240909
+ pandocfilters==1.5.1
+ panel==1.6.1
+ param==2.2.0
+ parso==0.8.4
+ parsy==2.1
+ partd==1.4.2
+ pathlib==1.0.1
+ patsy==1.0.1
+ peewee==3.17.9
+ peft==0.14.0
+ pexpect==4.9.0
+ pickleshare==0.7.5
+ pillow==11.1.0
+ platformdirs==4.3.6
+ plotly==5.24.1
+ plotnine==0.14.5
+ pluggy==1.5.0
+ ply==3.11
+ polars==1.11.0
+ pooch==1.8.2
+ portpicker==1.5.2
+ preshed==3.0.9
+ prettytable==3.15.1
+ proglog==0.1.10
+ progressbar2==4.5.0
+ prometheus_client==0.21.1
+ promise==2.3
+ prompt_toolkit==3.0.50
+ propcache==0.3.0
+ prophet==1.1.6
+ proto-plus==1.26.0
+ protobuf==3.20.3
+ psutil==5.9.5
+ psycopg2==2.9.10
+ ptyprocess==0.7.0
+ py-cpuinfo==9.0.0
+ py4j==0.10.9.7
+ pyarrow==17.0.0
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.1
+ pycocotools==2.0.8
+ pycparser==2.22
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydata-google-auth==1.9.1
+ pydot==1.4.2
+ pydotplus==2.0.2
+ PyDrive==1.3.1
+ PyDrive2==1.21.3
+ pyerfa==2.0.1.5
+ pygame==2.6.1
+ pygit2==1.17.0
+ Pygments==2.18.0
+ PyGObject==3.42.1
+ PyJWT==2.10.1
+ pylibcudf-cu12==24.12.0
+ pylibcugraph-cu12==25.2.0
+ pylibraft-cu12==25.2.0
+ pymc==5.20.1
+ pymystem3==0.2.0
+ pynndescent==0.5.13
+ pynvjitlink-cu12==0.5.0
+ pynvml==12.0.0
+ pyogrio==0.10.0
+ Pyomo==6.8.2
+ PyOpenGL==3.1.9
+ pyOpenSSL==24.2.1
+ pyparsing==3.2.1
+ pyperclip==1.9.0
+ pyproj==3.7.1
+ pyshp==2.3.1
+ PySocks==1.7.1
+ pyspark==3.5.5
+ pytensor==2.27.1
+ pytest==8.3.3
+ python-apt==0.0.0
+ python-box==7.3.2
+ python-dateutil==2.8.2
+ python-dotenv==1.0.1
+ python-louvain==0.16
+ python-multipart==0.0.20
+ python-slugify==8.0.4
+ python-snappy==0.7.3
+ python-utils==3.9.1
+ pytz==2025.1
+ pyviz_comms==3.0.4
+ PyYAML==6.0.2
+ pyzmq==24.0.1
+ qdldl==0.1.7.post5
+ raft-dask-cu12==25.2.0
+ rapids-dask-dependency==25.2.0
+ ratelim==0.1.6
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ requests-oauthlib==2.0.0
+ requests-toolbelt==1.0.0
+ requirements-parser==0.9.0
+ retrain_pipelines @ git+https://github.com/aurelienmorgan/retrain-pipelines.git@9271679f5b6fc26b890a33d0b65501702b24012d#subdirectory=pkg_src
+ rich==13.9.4
+ rmm-cu12==24.12.0
+ rpds-py==0.23.1
+ rpy2==3.5.17
+ rsa==4.9
+ s3transfer==0.11.4
+ safetensors==0.5.3
+ scikit-image==0.25.2
+ scikit-learn==1.6.1
+ scipy==1.13.1
+ scooby==0.10.0
+ scs==3.2.7.post2
+ seaborn==0.13.2
+ SecretStorage==3.3.1
+ Send2Trash==1.8.3
+ sentence-transformers==3.4.1
+ sentencepiece==0.2.0
+ sentry-sdk==2.22.0
+ setproctitle==1.3.5
+ shap==0.46.0
+ shapely==2.0.7
+ shellingham==1.5.4
+ shtab==1.7.1
+ simple-parsing==0.1.7
+ simsimd==6.2.1
+ six==1.17.0
+ sklearn-compat==0.1.3
+ sklearn-pandas==2.2.0
+ slicer==0.0.8
+ smart-open==7.1.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ snowballstemmer==2.2.0
+ sortedcontainers==2.4.0
+ soundfile==0.13.1
+ soupsieve==2.6
+ soxr==0.5.0.post1
+ spacy==3.7.5
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ spanner-graph-notebook==1.1.1
+ Sphinx==8.1.3
+ sphinxcontrib-applehelp==2.0.0
+ sphinxcontrib-devhelp==2.0.0
+ sphinxcontrib-htmlhelp==2.1.0
+ sphinxcontrib-jsmath==1.0.1
+ sphinxcontrib-qthelp==2.0.0
+ sphinxcontrib-serializinghtml==2.0.0
+ SQLAlchemy==2.0.38
+ sqlglot==25.6.1
+ sqlparse==0.5.3
+ srsly==2.5.1
+ stanio==0.5.1
+ starlette==0.46.1
+ statsmodels==0.14.4
+ stringzilla==3.12.2
+ sympy==1.13.1
+ tables==3.10.2
+ tabulate==0.9.0
+ tbb==2022.0.0
+ tblib==3.0.0
+ tcmlib==1.2.0
+ tenacity==9.0.0
+ tensorboard==2.18.0
+ tensorboard-data-server==0.7.2
+ tensorflow==2.18.0
+ tensorflow-datasets==4.9.7
+ tensorflow-hub==0.16.1
+ tensorflow-io-gcs-filesystem==0.37.1
+ tensorflow-metadata==1.16.1
+ tensorflow-probability==0.25.0
+ tensorflow-text==2.18.1
+ tensorstore==0.1.72
+ termcolor==2.5.0
+ terminado==0.18.1
+ text-unidecode==1.3
+ textblob==0.19.0
+ tf-slim==1.1.0
+ tf_keras==2.18.0
+ thinc==8.2.5
+ threadpoolctl==3.5.0
+ tifffile==2025.2.18
+ timm==1.0.15
+ tinycss2==1.4.0
+ tokenizers==0.21.0
+ toml==0.10.2
+ toolz==0.12.1
+ torch==2.5.0
+ torchsummary==1.5.1
+ torchvision==0.20.0
+ tornado==6.4.2
+ tqdm==4.67.1
+ traitlets==5.7.1
+ traittypes==0.2.1
+ transformers==4.48.3
+ treelite==4.4.1
+ treescope==0.1.9
+ triton==3.1.0
+ trl==0.14.0
+ tweepy==4.15.0
+ typeguard==4.4.2
+ typer==0.15.2
+ types-pytz==2025.1.0.20250204
+ types-setuptools==75.8.2.20250305
+ typing_extensions==4.12.2
+ tyro==0.9.16
+ tzdata==2025.1
+ tzlocal==5.3.1
+ uc-micro-py==1.0.3
+ ucx-py-cu12==0.42.0
+ ucxx-cu12==0.42.0
+ umap-learn==0.5.7
+ umf==0.9.1
+ unsloth @ git+https://github.com/unslothai/unsloth.git@512fec6a7b77a930b85a5b5685bf056fbb29ff5e
+ unsloth_zoo==2025.3.8
+ uritemplate==4.1.1
+ urllib3==2.3.0
+ uvicorn==0.34.0
+ uvloop==0.21.0
+ vega-datasets==0.9.0
+ wadllib==1.3.6
+ wandb==0.19.8
+ wasabi==1.1.3
+ watchfiles==1.0.4
+ wcwidth==0.2.13
+ weasel==0.4.1
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==14.2
+ Werkzeug==3.1.3
+ widgetsnbextension==3.6.10
+ wordcloud==1.9.4
+ wrapt==1.17.2
+ xarray==2025.1.2
+ xarray-einstats==0.8.0
+ xformers==0.0.28.post2
+ xgboost==2.1.4
+ xlrd==2.0.1
+ xxhash==3.5.0
+ xyzservices==2025.1.0
+ yarl==1.18.3
+ yellowbrick==1.5
+ yfinance==0.2.54
+ zict==3.0.0
+ zipp==3.21.0
+ zstandard==0.23.0
v0.1_20250311_013607311_UTC/retraining_pipeline.py ADDED
@@ -0,0 +1,2137 @@
1
+
2
+ from unsloth import FastLanguageModel, \
3
+ is_bfloat16_supported, UnslothTrainer, \
4
+ UnslothTrainingArguments
5
+
6
+ import torch
7
+
8
+ import os
9
+ import sys
10
+
11
+ import gc
12
+ import json
13
+ import time
14
+ import shutil
15
+ import logging
16
+ import traceback
17
+ import subprocess
18
+ import importlib.util
19
+ from enum import Enum
20
+ from io import StringIO
21
+ from textwrap import dedent
22
+ from datetime import datetime
23
+ from contextlib import redirect_stdout
24
+
25
+ import numpy as np
26
+ import pandas as pd
27
+
28
+ import polars as pl
29
+ from polars.exceptions import ComputeError
30
+
31
+ import matplotlib
32
+ import matplotlib.pyplot as plt
33
+
34
+ from jinja2 import Environment, FileSystemLoader
35
+
36
+ from metaflow import FlowSpec, step, Parameter, JSONType, \
37
+ IncludeFile, current, metaflow_config as mf_config, \
38
+ resources, Flow, Task, card
39
+ from metaflow.current import Current
40
+ from metaflow.cards import Image, Table, Markdown, \
41
+ Artifact, get_cards
42
+
43
+ from datasets import load_dataset, Dataset, DatasetDict
44
+ from datasets.config import HF_DATASETS_CACHE, HF_CACHE_HOME
45
+ from huggingface_hub import list_repo_commits
46
+ from transformers import AutoTokenizer
47
+ from transformers.utils import logging as hf_logging
48
+
49
+ from retrain_pipelines import __version__
50
+ from retrain_pipelines.dataset.hf_utils import get_lazy_df, \
51
+ get_column_info, iterable_dataset_multi_buffer_sampler, \
52
+ push_dataset_version_to_hub
53
+ from retrain_pipelines.dataset.tool_calls import \
54
+ get_unique_tools, count_tool_occurrences, \
55
+ plot_tools_occurences, column_words_stats, \
56
+ plot_words_count
57
+ from retrain_pipelines.utils.hf_utils import \
58
+ get_new_repo_minor_version, push_files_to_hub_repo_branch
59
+ from retrain_pipelines.utils import create_requirements
60
+
61
+
62
+ class LocalServeReadinessEnum(Enum):
63
+ """
64
+ tracking local-serve (infra-validation)
65
+ status using a "3+"-states enum :
66
+ - "-1" for "not applicable"
67
+ (i.e. "model version not blessed"),
68
+ - "0/1" bool for failure/success.
69
+ """
70
+ NOT_APPLICABLE = -1
71
+ FAILURE = 0
72
+ FAILURE_NO_DOCKER = 2
73
+ SUCCESS = 1
74
+
75
+
76
+ class UnslothFuncCallFlow(FlowSpec):
77
+ """
78
+ Training pipeline
79
+ """
80
+ # @see https://github.com/unslothai/unsloth/wiki
81
+
82
+ #--- flow parameters -------------------------------------------------------
83
+
84
+ RETRAIN_PIPELINE_TYPE = "mf_unsloth_func_call_litserve"
85
+ # in order to share the config across subprocesses
86
+ os.environ["retrain_pipeline_type"] = RETRAIN_PIPELINE_TYPE
87
+
88
+ hf_dataset = Parameter(
89
+ "hf_dataset",
90
+ help="dict with 'repo_id' and 'commit_hash' keys. " + \
91
+ "if 'commit_hash' is None, falls back to latest version " +\
92
+ "of the dataset available in parquet format.\n" +
93
+ "Note that there are 3 required 'attributes' of type " + \
94
+ "str, list[str], list[str]",
95
+ type=JSONType,
96
+ default=dedent("""{
97
+ "repo_id": "Salesforce/xlam-function-calling-60k",
98
+ "config_name": "",
99
+ "commit_hash": "",
100
+ "attributes": {
101
+ "query_attr": "query",
102
+ "answers_attr": "answers",
103
+ "tools_attr": "tools"
104
+ }
105
+ }""").replace("'", '"').strip('"')
106
+ )
107
+
108
+ augmentation_rate = Parameter(
109
+ "augmentation_rate",
110
+ type=float,
111
+ default=.05,
112
+ help="proportion of records to be augmented "+\
113
+ "(x% of original dataset is created"+\
114
+ " as additional augmented datapoints), i.e. "+\
115
+ "truncated queries to serve as negative examples, "+\
116
+ "meaning they trigger no tool call "+\
117
+ "due to info incompleteness."
118
+ )
119
+
120
+ hf_enrich_dataset = Parameter(
121
+ "hf_enrich_dataset",
122
+ help="dict with 'repo_id', 'config_name' and 'commit_hash', "+\
123
+ "'query_attribute' and 'query_attribute_handler' keys. "+\
124
+ "if 'commit_hash' is None, falls back to latest version "+\
125
+ "of the dataset available in parquet format."+\
126
+ "'query_attribute' depicts the dataset attribute "+\
127
+ "from which 'queries' are to be sampled."+\
128
+ "'query_attribute_handler' serves for attributes "+\
129
+ "that have complex structure, "+\
130
+ "other than 'string' datatype.",
131
+ type=JSONType,
132
+ # @see https://huggingface.co/datasets/google-research-datasets/natural_questions
133
+ default=dedent("""{
134
+ "repo_id": "lighteval/natural_questions_clean",
135
+ "config_name": "",
136
+ "commit_hash": "",
137
+ "query_attribute": "question",
138
+ "query_attribute_handler": "lambda x: x"
139
+ }""").replace("'", '"').strip('"')
140
+ )
141
+
142
+ enrichment_rate = Parameter(
143
+ "enrichment_rate",
144
+ type=float,
145
+ default=.1,
146
+ help="proportion of records "+\
147
+ "to be added from the 'hf_enrich_dataset'"+\
148
+ "(x% of original dataset is sampled and"+\
149
+ " added as enriching datapoints), i.e. "+\
150
+ "queries to serve as negative examples, "+\
151
+ "due to their complete disconnexion "+\
152
+ "to tool calling situations."
153
+ )
154
+
155
+ dataset_repo_id = Parameter(
156
+ "dataset_repo_id",
157
+ type=str,
158
+ default="retrain-pipelines/func_calls",
159
+ help="The 'repo_id' to be used " + \
160
+ "for the Hugging Face dataset version push " + \
161
+ "(will be created at runtime" + \
162
+ " if doesn't already exist)."
163
+ )
164
+
165
+ hf_base_model = Parameter(
166
+ "hf_base_model",
167
+ help="dict with 'repo_id' and 'commit_hash' keys."+\
168
+ "if 'commit_hash is None, falls back "+\
169
+ "to latest available version of the model.",
170
+ type=JSONType,
171
+ default=dedent("""{
172
+ "repo_id": "unsloth/Qwen2.5-1.5B",
173
+ "commit_hash": ""
174
+ }""").replace("'", '"').strip('"')
175
+ )
176
+
177
+ cpt_training_args = Parameter(
178
+ "cpt_training_args",
179
+ help="dict with `TrainingArguments` params "+\
180
+ "for the CPT job.",
181
+ type=JSONType,
182
+ default=dedent("""{
183
+ "warmup_ratio": 0.1,
184
+ "num_train_epochs": 1
185
+ }""").replace("'", '"').strip('"')
186
+ )
187
+
188
+ sft_training_args = Parameter(
189
+ "sft_training_args",
190
+ help="dict with `TrainingArguments` params "+\
191
+ "for the SFT job.",
192
+ type=JSONType,
193
+ default=dedent("""{
194
+ "warmup_ratio": 0.1,
195
+ "num_train_epochs": 1
196
+ }""").replace("'", '"').strip('"')
197
+ )
198
+
199
+ model_repo_id = Parameter(
200
+ "model_repo_id",
201
+ type=str,
202
+ default="retrain-pipelines/function_caller",
203
+ help="The 'repo_id' to be used " + \
204
+ "for the Hugging Face model version push " + \
205
+ "(will be created at runtime" + \
206
+ " if doesn't already exist)."
207
+ )
208
+
209
+ default_pipeline_card_module_dir = \
210
+ os.path.dirname(
211
+ importlib.util.find_spec(
212
+ f"retrain_pipelines.pipeline_card."+
213
+ f"{RETRAIN_PIPELINE_TYPE}"
214
+ ).origin)
215
+ pipeline_card_artifacts_path = Parameter(
216
+ "pipeline_card_artifacts_path",
217
+ type=str,
218
+ default=default_pipeline_card_module_dir,
219
+ help="pipeline_card artifacts location "+\
220
+ "(i.e. dir hosting your optional " + \
221
+ " custom documentation files :" + \
222
+ " 'pipeline_card.py' and/or 'template.html'"+\
223
+ " and/or 'model_readme.py'"+\
224
+ " and/or 'model_readme_template.md'," +\
225
+ " and/or 'dataset_readme.py'"+\
226
+ " and/or 'dataset_readme_template.md' file), " +\
227
+ "if different from default."
228
+ )
229
+ @staticmethod
230
+ def copy_default_dataset_readme_module(
231
+ target_dir: str,
232
+ exists_ok: bool = False
233
+ ) -> None:
234
+ os.makedirs(target_dir, exist_ok=True)
235
+ if (
236
+ not exists_ok and
237
+ os.path.exists(os.path.join(target_dir, "dataset_readme.py"))
238
+ ):
239
+ print("File already exists. Skipping copy.")
240
+ else:
241
+ filefullname = os.path.join(
242
+ UnslothFuncCallFlow.default_pipeline_card_module_dir,
243
+ "dataset_readme.py"
244
+ )
245
+ shutil.copy(filefullname, target_dir)
246
+ print(filefullname)
247
+ @staticmethod
248
+ def copy_default_dataset_readme_template(
249
+ target_dir: str,
250
+ exists_ok: bool = False
251
+ ) -> None:
252
+ os.makedirs(target_dir, exist_ok=True)
253
+ if (
254
+ not exists_ok and
255
+ os.path.exists(os.path.join(target_dir,
256
+ "dataset_readme_template.md"))
257
+ ):
258
+ print("File already exists. Skipping copy.")
259
+ else:
260
+ filefullname = os.path.join(
261
+ UnslothFuncCallFlow.default_pipeline_card_module_dir,
262
+ "dataset_readme_template.md")
263
+ shutil.copy(filefullname, target_dir)
264
+ print(filefullname)
265
+ @staticmethod
266
+ def copy_default_model_readme_module(
267
+ target_dir: str,
268
+ exists_ok: bool = False
269
+ ) -> None:
270
+ os.makedirs(target_dir, exist_ok=True)
271
+ if (
272
+ not exists_ok and
273
+ os.path.exists(os.path.join(target_dir, "model_readme.py"))
274
+ ):
275
+ print("File already exists. Skipping copy.")
276
+ else:
277
+ filefullname = os.path.join(
278
+ UnslothFuncCallFlow.default_pipeline_card_module_dir,
279
+ "model_readme.py"
280
+ )
281
+ shutil.copy(filefullname, target_dir)
282
+ print(filefullname)
283
+ @staticmethod
284
+ def copy_default_model_readme_template(
285
+ target_dir: str,
286
+ exists_ok: bool = False
287
+ ) -> None:
288
+ os.makedirs(target_dir, exist_ok=True)
289
+ if (
290
+ not exists_ok and
291
+ os.path.exists(os.path.join(target_dir,
292
+ "model_readme_template.md"))
293
+ ):
294
+ print("File already exists. Skipping copy.")
295
+ else:
296
+ filefullname = os.path.join(
297
+ UnslothFuncCallFlow.default_pipeline_card_module_dir,
298
+ "model_readme_template.md")
299
+ shutil.copy(filefullname, target_dir)
300
+ print(filefullname)
301
+ @staticmethod
302
+ def copy_default_pipeline_card_module(
303
+ target_dir: str,
304
+ exists_ok: bool = False
305
+ ) -> None:
306
+ os.makedirs(target_dir, exist_ok=True)
307
+ if (
308
+ not exists_ok and
309
+ os.path.exists(os.path.join(target_dir, "pipeline_card.py"))
310
+ ):
311
+ print("File already exists. Skipping copy.")
312
+ else:
313
+ filefullname = os.path.join(
314
+ UnslothFuncCallFlow.default_pipeline_card_module_dir,
315
+ "pipeline_card.py"
316
+ )
317
+ shutil.copy(filefullname, target_dir)
318
+ print(filefullname)
319
+ @staticmethod
320
+ def copy_default_pipeline_card_html_template(
321
+ target_dir: str,
322
+ exists_ok: bool = False
323
+ ) -> None:
324
+ os.makedirs(target_dir, exist_ok=True)
325
+ if (
326
+ not exists_ok and
327
+ os.path.exists(os.path.join(target_dir, "template.html"))
328
+ ):
329
+ print("File already exists. Skipping copy.")
330
+ else:
331
+ filefullname = os.path.join(
332
+ UnslothFuncCallFlow.default_pipeline_card_module_dir,
333
+ "template.html")
334
+ shutil.copy(filefullname, target_dir)
335
+ print(filefullname)
336
+
337
+ del RETRAIN_PIPELINE_TYPE
338
+
339
+ #---------------------------------------------------------------------------
340
+
341
+ @step
342
+ def start(self):
343
+ print(f"{current.flow_name} - {current.run_id}")
344
+
345
+ # GPU availability
346
+ print(torch.cuda.get_device_name(0))
347
+ print(torch.__version__)
348
+ self.engine = "gpu" if torch.cuda.is_available() else "cpu"
349
+
350
+ # hf_dataset
351
+ hf_dataset_dict = \
352
+ get_lazy_df(
353
+ repo_id=self.hf_dataset["repo_id"],
354
+ commit_hash=self.hf_dataset["commit_hash"],
355
+ files_filter=(
356
+ self.hf_dataset['config_name']+"/.*\\.parquet"
357
+ if (
358
+ self.hf_dataset["config_name"] and
359
+ "" < self.hf_dataset["config_name"]
360
+ ) else ".*\\.parquet"
361
+ ),
362
+ hf_token=os.getenv("HF_TOKEN", None)
363
+ )
364
+ try:
365
+ print(hf_dataset_dict["repo_id"], ", ",
366
+ hf_dataset_dict["commit_hash"], " - ",
367
+ hf_dataset_dict["commit_datetime"], "\n",
368
+ hf_dataset_dict["lazy_df"].explain())
369
+ except ComputeError as ex:
370
+ if "HF_TOKEN" not in os.environ:
371
+ print("Does the Hugging Face-hosted dataset " +
372
+ "require authentication ?",
373
+ file=sys.stderr, flush=True)
374
+ raise ex
375
+ self.hf_dataset_dict = hf_dataset_dict
376
+
377
+ # hf_enrich_dataset
378
+ print(self.hf_enrich_dataset)
379
+ hf_enrich_dataset_dict = \
380
+ get_lazy_df(
381
+ repo_id=self.hf_enrich_dataset["repo_id"],
382
+ commit_hash=self.hf_enrich_dataset["commit_hash"],
383
+ files_filter=(
384
+ self.hf_enrich_dataset['config_name']+"/.*\\.parquet"
385
+ if (
386
+ self.hf_enrich_dataset["config_name"] and
387
+ "" < self.hf_enrich_dataset["config_name"]
388
+ ) else ".*\\.parquet"
389
+ ),
390
+ hf_token=os.getenv("HF_TOKEN", None)
391
+ )
392
+ print(' ; '.join(f"{k}: {hf_enrich_dataset_dict[k]}"
393
+ for k in ['commit_hash',
394
+ 'commit_datetime']))
395
+ self.hf_enrich_dataset_dict = hf_enrich_dataset_dict
396
+
397
+ # hf_base_model
398
+ hf_base_model_commits = list_repo_commits(
399
+ repo_id=self.hf_base_model["repo_id"],
400
+ revision=(
401
+ None if (rev_commit_hash:=self.hf_base_model["commit_hash"]) == ""
402
+ else rev_commit_hash
403
+ ),
404
+ repo_type="model",
405
+ token=os.getenv("HF_TOKEN", None))
406
+ self.hf_base_model_dict = {
407
+ "repo_id": self.hf_base_model["repo_id"],
408
+ "commit_hash": hf_base_model_commits[0].commit_id,
409
+ "commit_datetime": \
410
+ hf_base_model_commits[0].created_at
411
+ }
412
+
413
+ self.model_version_blessed = False
414
+ self.current_blessed_run = None
415
+ self.current_blessed_version_dict = None
416
+ current.run.remove_tag("model_version_blessed")
417
+
418
+ self.retrain_pipelines = f"retrain-pipelines {__version__}"
419
+ self.retrain_pipeline_type = os.environ["retrain_pipeline_type"]
420
+
421
+ self.serving_artifacts_local_folder = \
422
+ os.path.realpath(os.path.join(
423
+ os.path.dirname(__file__),
424
+ '..', '..', 'serving_artifacts',
425
+ os.path.sep.join(current.run.path_components)
426
+ ))
427
+
428
+ if not os.path.exists(self.serving_artifacts_local_folder):
429
+ os.makedirs(self.serving_artifacts_local_folder)
430
+
431
+ self.unsloth_dir = os.path.join(
432
+ self.serving_artifacts_local_folder,
433
+ "Unsloth"
434
+ )
435
+ print(f"unsloth_dir : {self.unsloth_dir}")
436
+ self.cpt_model_dir = os.path.join(
437
+ self.unsloth_dir, "cpt_model")
438
+ self.sft_model_dir = os.path.join(
439
+ self.unsloth_dir, "sft_model")
440
+
441
+ self.next(self.eda)
442
+
443
+
444
+ @step
445
+ def eda(self):
446
+ """
447
+ exploratory data analysis.
448
+ """
449
+
450
+ ############################
451
+ # features and label #
452
+ # basic counts #
453
+ ############################
454
+ self.records_count = self.hf_dataset_dict["lazy_df"] \
455
+ .select(pl.len()).collect(engine=self.engine).item()
456
+ self.data_schema = get_column_info(
457
+ self.hf_dataset_dict["lazy_df"], engine=self.engine)
458
+ ############################
459
+
460
+ ############################
461
+ # Answers #
462
+ # tools count #
463
+ ############################
464
+ struct_schema = pl.Struct([
465
+ pl.Field("name",
466
+ pl.String
467
+ ),
468
+ pl.Field("arguments",
469
+ pl.List(pl.String) # we retrieve list of args names
470
+ # (without assigned values)
471
+ )
472
+ ])
473
+ tool_answer_occurrences_df = \
474
+ count_tool_occurrences(
475
+ self.hf_dataset_dict["lazy_df"],
476
+ self.hf_dataset["attributes"]["answers_attr"],
477
+ struct_schema) \
478
+ .collect(engine=self.engine)
479
+ print(f"{tool_answer_occurrences_df['occurrences'].sum():,} " +
480
+ f"query/tool-calls pairs")
481
+ fig = plot_tools_occurences(tool_answer_occurrences_df,
482
+ title_prefix="Dataset answers - ")
483
+ self.answers_tools_count_fig = fig
484
+ ############################
485
+
486
+ ############################
487
+ # Query #
488
+ # words count #
489
+ ############################
490
+ queries_max_length = self.hf_dataset_dict["lazy_df"].select(
491
+ pl.col(
492
+ self.hf_dataset["attributes"]["query_attr"]
493
+ ).str.len_chars().max().alias("max_query_length")
494
+ ).collect(engine=self.engine)
495
+ print(f"longest query counts " +
496
+ f"{queries_max_length['max_query_length'][0]:,} characters")
497
+
498
+ # queries length quartiles
499
+ self.query_words_stats = \
500
+ column_words_stats(
501
+ self.hf_dataset_dict["lazy_df"],
502
+ self.hf_dataset["attributes"]["query_attr"]
503
+ ).collect(engine=self.engine)
504
+ print(self.query_words_stats.to_pandas().to_string(index=False))
505
+ print("Two thirds of the records have a query with less than " +
506
+ f"{self.query_words_stats['q3'][0]} words.")
507
+
508
+ fig = plot_words_count(
509
+ self.hf_dataset_dict["lazy_df"],
510
+ column_name=self.hf_dataset["attributes"]["query_attr"],
511
+ engine=self.engine)
512
+ self.words_count_fig = fig
513
+ ############################
514
+
515
+ ############################
516
+ # hf_enrich_dataset #
517
+ # Query words count #
518
+ ############################
519
+ enrich_question_words_stats = \
520
+ column_words_stats(
521
+ self.hf_enrich_dataset_dict['lazy_df'],
522
+ self.hf_enrich_dataset["query_attribute"],
523
+ column_attr_handler=eval(
524
+ self.hf_enrich_dataset["query_attribute_handler"])
525
+ ).collect(engine=self.engine)
526
+ print(enrich_question_words_stats.to_pandas()
527
+ .to_string(index=False))
528
+ del enrich_question_words_stats
529
+ ############################
530
+
531
+ self.next(self.augment_data)
532
+
533
+
534
+ @step
535
+ def augment_data(self):
536
+ """
537
+ Add 'negative' examples, where
538
+ queries do not trigger any tool call.
539
+ To achieve that, we sample long user queries,
540
+ truncate them at half their word count, and
541
+ associate the result with an empty list of tool-calls.
542
+ """
543
+ """
544
+ We only consider :
545
+ - records with longest queries,
546
+ i.e. queries in the last quartile
547
+ of "queries with most word-counts"
548
+ (this is to avoid that 'truncated' queries
549
+ get really short)
550
+ - records with answers consisting
551
+ in a single tool-call
552
+ (in order to minimize the risk
553
+ that truncating actually gives
554
+ a valid answer with
555
+ one tool-call [or more])
556
+
557
+ Note on flow 'augmentation_rate' :
558
+ we add that many records (at most),
559
+ as quartiles size permits.
560
+ """
561
+
562
+ print("Sampling within the population with more than " +
563
+ str(self.query_words_stats['q3'][0]) +
564
+ " words (longest queries quartile) =>")
565
+
566
+ samples_count = \
567
+ int(self.records_count * self.augmentation_rate)
568
+ print(f"would represent {samples_count:,.0f} " +
569
+ f"records to be sampled")
570
+
571
+ eligible_records_df = \
572
+ self.hf_dataset_dict["lazy_df"].filter(
573
+ pl.col(
574
+ self.hf_dataset["attributes"]["query_attr"]
575
+ )
576
+ .str.extract_all(r"\w+")
577
+ .map_elements(
578
+ lambda arr: len(arr),
579
+ return_dtype=pl.Int16)
580
+ .gt(self.query_words_stats['q3'][0])
581
+ & pl.col("answers")
582
+ .map_elements(
583
+ lambda x: len(json.loads(x)) == 1
584
+ if isinstance(x, str)
585
+ else False,
586
+ return_dtype=pl.Boolean)
587
+ ) \
588
+ .collect(engine=self.engine)
589
+ eligible_records_count = \
590
+ eligible_records_df.select(pl.len())["len"][0]
591
+ print(f"eligible_records_count : " +
592
+ f"{eligible_records_count:,.0f}")
593
+ samples_count = min(samples_count, eligible_records_count)
594
+ self.actual_augmentation_rate = \
595
+ samples_count / self.records_count
596
+ print("actual augmentation rate : " +
597
+ f"{self.actual_augmentation_rate:.1%}")
598
+ sampled_records_df = eligible_records_df.sample(
599
+ n=samples_count
600
+ )
601
+
602
+ self.augmented_records_df = \
603
+ sampled_records_df.with_columns(
604
+ pl.col("query")
605
+ .map_elements(
606
+ lambda query:
607
+ " ".join(
608
+ query.split()[
609
+ :len(query.split()) // 2]),
610
+ return_dtype=pl.Utf8)
611
+ .alias("truncated_query")
612
+ ).select([
613
+ pl.col("truncated_query").alias("query"),
614
+ pl.lit("[]").alias("answers")
615
+ ])
616
+ print(self.augmented_records_df.height,
617
+ self.augmented_records_df.columns)
618
+
619
+ self.next(self.enrich_data)
620
+
621
+
622
+ @step
623
+ def enrich_data(self):
624
+ """
625
+ Further enrich our dataset with 'negative' records from
626
+ another dataset (can be general-purpose text dataset)
627
+ as specified by the flow 'hf_enrich_dataset' argument.
628
+ """
629
+ """
630
+ Note : we here use the Hugging Face `datasets` library
631
+ in 'streaming' mode for records sampling.
632
+ """
633
+
634
+ hf_enrich_ds = load_dataset(
635
+ path=self.hf_enrich_dataset["repo_id"],
636
+ name=self.hf_enrich_dataset["config_name"],
637
+ revision=self.hf_enrich_dataset_dict["commit_hash"],
638
+ streaming=True)
639
+ print(hf_enrich_ds["train"])
640
+
641
+ samples_count = \
642
+ int(self.records_count * self.enrichment_rate)
643
+ print(f"Sampling {samples_count:,.0f} records")
644
+
645
+ query_attribute_handler = \
646
+ eval(self.hf_enrich_dataset["query_attribute_handler"])
647
+ samples_iterator = iterable_dataset_multi_buffer_sampler(
648
+ hf_enrich_ds["train"],
649
+ total_samples=samples_count,
650
+ attributes_selector=\
651
+ (lambda x:query_attribute_handler(
652
+ x[self.hf_enrich_dataset["query_attribute"]])),
653
+ buffer_size=3_000,
654
+ num_passes=3,
655
+ seed=None
656
+ )
657
+ # Capitalize and add end punctuation if missing
658
+ start_time = time.time()
659
+ print("Starting to sample enriching records, " +
660
+ "this may take some time if the source dataset " +
661
+ "has a complex structure..")
662
+ samples_list = [
663
+ s.capitalize() + ("" if s[-1] in ".!?" else "?")
664
+ for s in samples_iterator]
665
+ elapsed_time = time.time() - start_time
666
+ print(f".. sampling completed " +
667
+ f"({int(elapsed_time // 3_600)}h:" +
668
+ f"{int((elapsed_time % 3_600) // 60)}m:" +
669
+ f"{int(elapsed_time % 60)}s).")
670
+ enriched_records_df = pl.DataFrame(
671
+ {"query": samples_list,
672
+ "answers": \
673
+ ["[]"] * \
674
+ len(samples_list)}
675
+ )
676
+ self.enriched_records_df = enriched_records_df
677
+
678
+ self.next(self.dataset_to_hub)
679
+
680
+
681
+ @step
682
+ def dataset_to_hub(self):
683
+ """
684
+ Push to hub dataset version
685
+ - continued pre-training dataset
686
+ - training and validation splits of the
687
+ augmented and enriched
688
+ supervised finetuning dataset
689
+ - readme with versioning info
690
+ """
691
+
692
+ #############################
693
+ # case of user-provided #
694
+ # documentation artifact(s) #
695
+ #############################
696
+ # note that user can provide either
697
+ # 'pipeline_card.py' or 'template.html'
698
+ # or 'dataset_readme.py'
699
+ # or 'dataset_readme_template.md'
700
+ # or 'model_readme.py'
701
+ # or 'model_readme_template.md'
702
+ # or any combination of those
703
+ # when specifying custom
704
+ # 'pipeline_card_artifacts_path'
705
+ if (
706
+ "dataset_readme_template.md" in
707
+ os.listdir(self.pipeline_card_artifacts_path)
708
+ ):
709
+ template_dir = self.pipeline_card_artifacts_path
710
+ else:
711
+ template_dir = os.path.dirname(
712
+ importlib.util.find_spec(
713
+ f"retrain_pipelines.pipeline_card."+
714
+ f"{os.getenv('retrain_pipeline_type')}"
715
+ ).origin)
716
+ print(f"template_dir : '{template_dir}'")
717
+ #############################
718
+ if "dataset_readme.py" in os.listdir(
719
+ self.pipeline_card_artifacts_path):
720
+ from retrain_pipelines.utils import \
721
+ get_get_dataset_readme_content
722
+ get_dataset_readme_content = \
723
+ get_get_dataset_readme_content(
724
+ self.pipeline_card_artifacts_path)
725
+ else:
726
+ from retrain_pipelines.pipeline_card import \
727
+ get_dataset_readme_content
728
+ #############################
729
+
730
+
731
+ #############################
732
+ # augmented & enriched #
733
+ # finetuning dataset #
734
+ #############################
735
+ merged_df = pl.concat([
736
+ # dataset
737
+ self.hf_dataset_dict["lazy_df"].select([
738
+ self.hf_dataset["attributes"]["query_attr"],
739
+ self.hf_dataset["attributes"]["answers_attr"]
740
+ ]).collect(engine=self.engine),
741
+ # truncated queries augmentation
742
+ self.augmented_records_df,
743
+ # enriching dataset
744
+ self.enriched_records_df
745
+ ]).sample(
746
+ # shuffling
747
+ fraction=1,
748
+ shuffle=True,
749
+ with_replacement=False
750
+ )
751
+ merged_df = merged_df.sample(fraction=1, shuffle=True)
752
+ merged_df.rechunk()
753
+ print(("merged_df", f"{merged_df.shape[0]:,.0F}",
754
+ merged_df.columns))
755
+
756
+ pandas_df = merged_df.to_pandas()
757
+ train_size = int(0.8 * len(pandas_df))
758
+ print(f"validation : {len(pandas_df) - train_size}")
759
+ sft_dataset = DatasetDict({
760
+ "train": Dataset.from_pandas(pandas_df[:train_size]),
761
+ "validation": Dataset.from_pandas(pandas_df[train_size:])
762
+ })
763
+ #############################
764
+
765
+ #############################
766
+ # continued pre-training #
767
+ # dataset #
768
+ #############################
769
+ struct_schema = pl.Struct([
770
+ pl.Field("name", pl.String),
771
+ pl.Field("description", pl.String),
772
+ pl.Field(
773
+ "parameters",
774
+ pl.String # Use String to allow
775
+ # for varying structures
776
+ # (different tools indeed having
777
+ # different sets of parameters
778
+ # i.e. different parameters counts,
779
+ # datatypes and names)
780
+ # so parsing must be tolerant.
781
+ )
782
+ ])
783
+ unique_tools_df = get_unique_tools(
784
+ self.hf_dataset_dict["lazy_df"],
785
+ tools_attr_name=\
786
+ self.hf_dataset["attributes"]["tools_attr"],
787
+ struct_schema=struct_schema
788
+ ).collect(engine=self.engine)
789
+ unique_tools_arrow_table = unique_tools_df.to_arrow()
790
+ self.unique_tools_dataset = \
791
+ Dataset(unique_tools_arrow_table)
792
+ print(self.unique_tools_dataset)
793
+ #############################
794
+
795
+ #############################
796
+ # DatasetDict #
797
+ # with multiple tables #
798
+ #############################
799
+ dataset_dict = DatasetDict({
800
+ "continued_pre_training": \
801
+ self.unique_tools_dataset,
802
+ "supervised_finetuning": sft_dataset
803
+ })
804
+ print(dataset_dict, flush=True)
805
+ #############################
806
+
807
+ #############################
808
+ # dataset README #
809
+ # from template #
810
+ #############################
811
+ commit_datetime = datetime.utcnow()
812
+ new_dataset_version_label = get_new_repo_minor_version(
813
+ repo_id=self.dataset_repo_id,
814
+ repo_type="dataset",
815
+ hf_token=os.getenv("HF_TOKEN", None))
816
+ readme_content = get_dataset_readme_content(
817
+ template_folder=template_dir,
818
+
819
+ hf_dataset_dict=self.hf_dataset_dict,
820
+ hf_enrich_dataset_dict=self.hf_enrich_dataset_dict,
821
+ dataset_dict=dataset_dict,
822
+
823
+ augmentation_rate=self.actual_augmentation_rate,
824
+ enrichment_rate=self.enrichment_rate,
825
+
826
+ version_label=new_dataset_version_label,
827
+ commit_datetime=commit_datetime,
828
+
829
+ mf_flow_name=current.flow_name,
830
+ mf_run_id=current.run.id,
831
+ engine=self.engine
832
+ )
833
+ #############################
834
+
835
+ dataset_commit_hash = push_dataset_version_to_hub(
836
+ repo_id=self.dataset_repo_id,
837
+ version_label=new_dataset_version_label,
838
+ timestamp_str=commit_datetime.strftime(
839
+ "%Y-%m-%d %H:%M:%S UTC"),
840
+ dataset_dict=dataset_dict,
841
+ dataset_readme_content=readme_content,
842
+ hf_token=os.getenv("HF_TOKEN", None)
843
+ )
844
+ if not dataset_commit_hash:
845
+ raise Exception(
846
+ "Failed to publish dataset version.")
847
+ print(f"https://huggingface.co/datasets/{self.dataset_repo_id}" +
848
+ f"/blob/{dataset_commit_hash}/README.md")
849
+ self.dataset_commit_dict = {
850
+ "repo_id": self.dataset_repo_id,
851
+ "commit_hash": dataset_commit_hash,
852
+ "version_label": new_dataset_version_label,
853
+ "commit_datetime": commit_datetime,
854
+ }
855
+
856
+ self.next(self.continued_pre_training)
857
+
858
+
859
+ @step
860
+ def continued_pre_training(self):
861
+ """
862
+ Gives the base model some additional intrinsic knowledge
863
+ through continued pre-training.
864
+ See unsloth.ai/blog/contpretraining
865
+ """
866
+ from retrain_pipelines.model.hf_utils import \
867
+ plot_log_history
868
+
869
+ #######################################
870
+ # base-model and associated tokenizer #
871
+ # from Hub (or local cache) #
872
+ #######################################
873
+ self.max_seq_length = 2048
874
+ model, tokenizer = FastLanguageModel.from_pretrained(
875
+ model_name=self.hf_base_model_dict["repo_id"],
876
+ revision=self.hf_base_model_dict["commit_hash"],
877
+ max_seq_length=self.max_seq_length,
878
+ dtype=None,
879
+ load_in_4bit=False,
880
+ # case of a gated or private base-model
881
+ token=os.getenv("HF_TOKEN", None)
882
+ )
883
+ #######################################
884
+
885
+ #######################################
886
+ # dataset prompt_template mapping #
887
+ #######################################
888
+ tools_dataset = DatasetDict(
889
+ {"train": self.unique_tools_dataset})
890
+ print(tools_dataset)
891
+ tool_prompt_template = "tool: {}"
892
+ def formatting_prompts_func(tools_batch):
893
+ tools_batch = tools_batch["tool"]
894
+ outputs = []
895
+ for tool in tools_batch:
896
+ # Must add EOS_TOKEN,
897
+ # otherwise generation will go on forever!
898
+ text = tool_prompt_template.format(tool) + \
899
+ tokenizer.eos_token
900
+ outputs.append(text)
901
+ return { "tools" : outputs, }
902
+ cpt_dataset = tools_dataset["train"].map(
903
+ formatting_prompts_func, batched=True,)
904
+ #######################################
905
+
906
+ #######################################
907
+ # PEFT adapter #
908
+ # for continued pre-training #
909
+ #######################################
910
+ model = FastLanguageModel.get_peft_model(
911
+ model,
912
+ r = 128, # any number >0 ; 8, 16, 32, 64, 128, 256
913
+ target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
914
+ "gate_proj", "up_proj", "down_proj",
915
+ # Add for continued pretraining
916
+ "embed_tokens", "lm_head",],
917
+ lora_alpha = 32,
918
+ lora_dropout = 0, # Supports any, 0 is optimized
919
+ bias = "none", # Supports any, "none" is optimized
920
+ # True or "unsloth" for very long context
921
+ use_gradient_checkpointing = "unsloth",
922
+ use_rslora = True, # rank-stabilized LoRA
923
+ loftq_config = None, # LoftQ
924
+ #random_state = 3407,
925
+ )
926
+ #######################################
927
+
928
+ #######################################
929
+ # cpt_trainer #
930
+ #######################################
931
+ if (
932
+ "records_cap" in self.cpt_training_args and
933
+ self.cpt_training_args["records_cap"] is not None and
934
+ isinstance(self.cpt_training_args["records_cap"], int)
935
+ ):
936
+ cpt_dataset = cpt_dataset.take(
937
+ self.cpt_training_args["records_cap"])
938
+ print(f"cpt_dataset : {cpt_dataset}")
939
+
940
+ train_args = UnslothTrainingArguments(
941
+ # https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.save_strategy
942
+ per_device_train_batch_size=2,
943
+ gradient_accumulation_steps=8,
944
+
945
+ **{k: v for k, v in self.cpt_training_args.items()
946
+ if k != "records_cap"},
947
+
948
+ # 2 to 10x smaller learning rate
949
+ # for the embedding matrices
950
+ learning_rate=5e-5,
951
+ embedding_learning_rate=1e-5,
952
+
953
+ fp16=not is_bfloat16_supported(),
954
+ bf16=is_bfloat16_supported(),
955
+ logging_steps=1,
956
+ optim="adamw_8bit",
957
+ weight_decay=0.01,
958
+ lr_scheduler_type="linear",
959
+ #seed=3407,
960
+
961
+ output_dir=os.path.join(
962
+ self.unsloth_dir, "outputs", "cpt"),
963
+ save_total_limit = 2,
964
+
965
+ report_to="tensorboard",
966
+ logging_dir=os.path.join(
967
+ self.sft_model_dir,
968
+ "runs", "cpt")
969
+ )
970
+
971
+ trainer = UnslothTrainer(
972
+ model=model, tokenizer=tokenizer,
973
+ train_dataset=cpt_dataset,
974
+ dataset_text_field="tools",
975
+ max_seq_length=self.max_seq_length,
976
+ dataset_num_proc=2,
977
+ args=train_args,
978
+ )
979
+ #######################################
980
+
981
+ #######################################
982
+ # Show current memory stats #
983
+ #######################################
984
+ torch.cuda.ipc_collect()
985
+ torch.cuda.empty_cache()
986
+ gc.collect()
987
+
988
+ gpu_stats = torch.cuda.get_device_properties(0)
989
+ self.start_gpu_memory = \
990
+ round(torch.cuda.max_memory_reserved()
991
+ / 1024 / 1024 / 1024, 3)
992
+ self.max_memory = \
993
+ round(gpu_stats.total_memory
994
+ / 1024 / 1024 / 1024, 3)
995
+ print(f"GPU = {gpu_stats.name}. " +
996
+ f"Max memory = {self.max_memory} GB.")
997
+ print(f"{self.start_gpu_memory} GB of memory reserved.")
998
+ #######################################
999
+
1000
+ self.cpt_traces_file_fullname = os.path.join(
1001
+ self.unsloth_dir, "cpt_trainer_traces.txt")
1002
+ print("Training started. " +
1003
+ f"Check {self.cpt_traces_file_fullname} for live traces.",
1004
+ flush=True)
1005
+ with open(self.cpt_traces_file_fullname, 'w') as f:
1006
+ with redirect_stdout(f):
1007
+ hf_logging.set_verbosity_error()
1008
+ trainer_stats = trainer.train()
1009
+ print(f"{trainer_stats.metrics['train_runtime']} " +
1010
+ f"seconds used for training " +
1011
+ f"({round(trainer_stats.metrics['train_runtime']/60, 2)}" +
1012
+ f" minutes).")
1013
+
1014
+ self.cpt_log_history = trainer.state.log_history
1015
+ # print(self.cpt_log_history)
1016
+ self.cpt_log_history_fig = \
1017
+ plot_log_history(
1018
+ self.cpt_log_history,
1019
+ title="Continued pretraining loss"
1020
+ )
1021
+
1022
+ model.save_pretrained_merged(
1023
+ save_directory=self.cpt_model_dir,
1024
+ tokenizer=tokenizer,
1025
+ save_method="lora"
1026
+ )
1027
+ print(f"cpt_model_dir : {self.cpt_model_dir}\n")
1028
+
1029
+ self.next(self.supervised_finetuning)
1030
+
1031
+
1032
+ @step
1033
+ def supervised_finetuning(self):
1034
+ """
1035
+ Trains the model on tool-calling
1036
+ task specialization.
1037
+ """
1038
+ from retrain_pipelines.model.hf_utils import \
1039
+ plot_log_history
1040
+
1041
+ torch.cuda.ipc_collect()
1042
+ torch.cuda.empty_cache()
1043
+ gc.collect()
1044
+
1045
+ model, tokenizer = FastLanguageModel.from_pretrained(
1046
+ model_name=self.cpt_model_dir,
1047
+ max_seq_length=self.max_seq_length,
1048
+ dtype=None,
1049
+ load_in_4bit=False,
1050
+ )
1051
+ # !!!! bug fix BEGIN !!!!
1052
+ # otherwise, 'embed_tokens' and 'lm_head'
1053
+ # trained during CPT are "ignored",
1054
+ # i.e. not saved after SFT
1055
+ # (note that, alternatively, we could also
1056
+ # do this fix after sft-training and
1057
+ # just before saving ;
1058
+ # which would be equivalent to
1059
+ # freezing embeddings during finetuning
1060
+ # for better pretrained knowledge retention)
1061
+ # @see https://www.reddit.com/r/unsloth/comments/1dtzcd6/fastlanguagemodelpatch_peft_model_changing/
1062
+ model.model.model.embed_tokens.modules_to_save.default.to(
1063
+ device="cuda:0",
1064
+ dtype=torch.float32,
1065
+ non_blocking=True)
1066
+ model.model.model.embed_tokens.modules_to_save.default \
1067
+ .requires_grad_(True)
1068
+ model.model.lm_head.modules_to_save.default.to(
1069
+ device="cuda:0",
1070
+ dtype=torch.float32,
1071
+ non_blocking=True)
1072
+ model.model.lm_head.modules_to_save.default \
1073
+ .requires_grad_(True)
1074
+ # !!!! bug fix END !!!!
1075
+
1076
+ #######################################
1077
+ # dataset prompt_template mapping #
1078
+ #######################################
1079
+ # download from Hub (or get from local cache)
1080
+ queries_dataset = load_dataset(
1081
+ path=self.dataset_commit_dict["repo_id"],
1082
+ name="supervised_finetuning",
1083
+ revision=self.dataset_commit_dict["commit_hash"],
1084
+ token=os.getenv("HF_TOKEN", None))
1085
+ print(f"HF_DATASETS_CACHE : {HF_DATASETS_CACHE}") # HF_CACHE_HOME
1086
+ self.sft_prompt_template = dedent("""
1087
+ You specialize in generating tool calls. Given a query, your task is to return a list of tool calls based on your knowledge of known tools.
1088
+
1089
+ Rules:
1090
+ 1. You can only use tools you know. Do not create new tools under any circumstances.
1091
+ 2. If a query does not match any known tool, return an empty list ([]).
1092
+ 3. If information is missing to use a known tool, do not attempt to use it.
1093
+ 4. Your response must always be a valid JSON array, and nothing else.
1094
+
1095
+ Be precise and do not guess.
1096
+
1097
+ # query:
1098
+ {}
1099
+ # response:
1100
+ {}
1101
+ """).strip()
1102
+ tokenizer.chat_template = self.sft_prompt_template
1103
+
1104
+ EOS_TOKEN = tokenizer.eos_token
1105
+ def formatting_prompts_func(records):
1106
+ query = records["query"]
1107
+ tools = records["answers"]
1108
+ outputs = []
1109
+ for query, tools in zip(query, tools):
1110
+ # Must add EOS_TOKEN,
1111
+ # otherwise your generation will go on forever
1112
+ text = self.sft_prompt_template.format(query, tools) \
1113
+ + EOS_TOKEN
1114
+ outputs.append(text)
1115
+ return { "text" : outputs, }
1116
+ sft_train_dataset = queries_dataset["train"].map(
1117
+ formatting_prompts_func, batched=True)
1118
+ sft_valid_dataset = queries_dataset["validation"].map(
1119
+ formatting_prompts_func, batched=True,)
1120
+ #######################################
1121
+
1122
+ #######################################
1123
+ # PEFT adapter #
1124
+ # for supervised finetuning #
1125
+ #######################################
1126
+ # for cases where CPT has been merged into overall model
1127
+ # otherwise, keep on training the current LoRA adapter
1128
+ # model = FastLanguageModel.get_peft_model(
1129
+ # model,
1130
+ # r = 128, # any number >0 ; 8, 16, 32, 64, 128, 256
1131
+ # target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
1132
+ # "gate_proj", "up_proj", "down_proj"],
1133
+ # lora_alpha = 32,
1134
+ # lora_dropout = 0, # Supports any, but = 0 is optimized
1135
+ # bias = "none", # Supports any, but = "none" is optimized
1136
+ # # True or "unsloth" for very long context
1137
+ # use_gradient_checkpointing = "unsloth",
1138
+ # random_state = 3407,
1139
+ # use_rslora = True, # rank stabilized LoRA
1140
+ # loftq_config = None, # LoftQ
1141
+ # )
1142
+ #######################################
1143
+
1144
+ #######################################
1145
+ # sft_trainer #
1146
+ #######################################
1147
+ split = sft_train_dataset.train_test_split(
1148
+ test_size=1000,
1149
+ #seed=42
1150
+ )
1151
+ train_dataset = split['train']
1152
+ eval_dataset = split['test']
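+ # optional cap on record counts (e.g. for quick
+ # smoke-test runs), passed via sft_training_args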
1153
+ if (
1154
+ "records_cap" in self.sft_training_args and
1155
+ self.sft_training_args["records_cap"] is not None and
1156
+ isinstance(self.sft_training_args["records_cap"], int)
1157
+ ):
1158
+ train_dataset = train_dataset.take(
1159
+ self.sft_training_args["records_cap"])
1160
+ eval_dataset = eval_dataset.take(
1161
+ self.sft_training_args["records_cap"])
1162
+ print(f"train_dataset : {train_dataset}")
1163
+ print(f"eval_dataset : {eval_dataset}")
1164
+
1165
+ train_args = UnslothTrainingArguments(
1166
+ per_device_train_batch_size=2,
1167
+ gradient_accumulation_steps=8,
1168
+
1169
+ **{k: v for k, v in self.sft_training_args.items()
1170
+ if k != "records_cap"},
1171
+
1172
+ per_device_eval_batch_size=2,
1173
+ eval_steps=200,
1174
+ eval_strategy="steps",
1175
+ do_eval=True,
1176
+
1177
+ learning_rate=5e-5,
1178
+ # embedding_learning_rate=1e-5, # Optionally here
1179
+
1180
+ fp16=not is_bfloat16_supported(),
1181
+ bf16=is_bfloat16_supported(),
1182
+
1183
+ optim="adamw_8bit",
1184
+ weight_decay=0.00,
1185
+ lr_scheduler_type="linear",
1186
+ #seed=3407,
1187
+
1188
+ output_dir=os.path.join(
1189
+ self.unsloth_dir, "outputs", "sft"),
1190
+ save_total_limit=2,
1191
+
1192
+ logging_steps=1,
1193
+ report_to="tensorboard",
1194
+ logging_dir=os.path.join(
1195
+ self.sft_model_dir,
1196
+ "runs", "sft")
1197
+ )
1198
+
1199
+ trainer = UnslothTrainer(
1200
+ model=model, tokenizer=tokenizer,
1201
+ train_dataset=train_dataset,
1202
+ dataset_text_field="text",
1203
+ eval_dataset=eval_dataset,
1204
+ max_seq_length=self.max_seq_length,
1205
+ dataset_num_proc=8,
1206
+ args=train_args
1207
+ )
1208
+ trainer.can_return_loss = True
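+ # ^ lets the trainer return a loss during evaluation,
+ # presumably so eval_steps report an eval loss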
1209
+ #######################################
1210
+
1211
+ #######################################
1212
+ # Show current memory stats #
1213
+ #######################################
1214
+ torch.cuda.ipc_collect()
1215
+ torch.cuda.empty_cache()
1216
+ gc.collect()
1217
+
1218
+ used_memory = \
1219
+ round(torch.cuda.max_memory_reserved()
1220
+ /1024/1024/1024, 3)
1221
+ used_memory_for_lora = \
1222
+ round(used_memory-self.start_gpu_memory, 3)
1223
+ used_percentage = \
1224
+ round(used_memory/self.max_memory*100, 3)
1225
+ lora_percentage = \
1226
+ round(used_memory_for_lora/self.max_memory*100,
1227
+ 3)
1228
+ print(f"Peak reserved memory = " +
1229
+ f"{used_memory} GB.")
1230
+ print(f"Peak reserved memory for " +
1231
+ f"training = {used_memory_for_lora} " +
1232
+ f"GB.")
1233
+ print(f"Peak reserved memory % of " +
1234
+ f"max memory = {used_percentage} %.")
1235
+ print(f"Peak reserved memory for training " +
1236
+ f"% of max memory = {lora_percentage} %.")
1237
+ #######################################
1238
+
1239
+ self.sft_traces_file_fullname = os.path.join(
1240
+ self.unsloth_dir, "sft_trainer_traces.txt")
1241
+ print("Training started. " +
1242
+ f"Check {self.sft_traces_file_fullname} for live traces.",
1243
+ flush=True)
1244
+ with open(self.sft_traces_file_fullname, 'w') as f:
1245
+ with redirect_stdout(f):
1246
+ hf_logging.set_verbosity_error()
1247
+ trainer_stats = trainer.train()
1248
+ print(f"{trainer_stats.metrics['train_runtime']} " +
1249
+ f"seconds used for training " +
1250
+ f"({round(trainer_stats.metrics['train_runtime']/60, 2)}" +
1251
+ f" minutes).")
1252
+
1253
+ self.sft_log_history = trainer.state.log_history
1254
+ self.sft_log_history_fig = \
1255
+ plot_log_history(
1256
+ self.sft_log_history,
1257
+ title="Supervised finetuning loss"
1258
+ )
1259
+
1260
+ model.save_pretrained_merged(
1261
+ self.sft_model_dir, tokenizer,
1262
+ save_method = "lora"
1263
+ )
1264
+ print(f"sft_model_dir : {self.sft_model_dir}\n")
1265
+
1266
+ self.next(self.evaluate_model)
1267
+
1268
+
1269
+ @step
1270
+ def evaluate_model(self):
1271
+ """
1272
+ Batch inference on the SFT validation dataset.
1273
+ """
1274
+ from retrain_pipelines.model import \
1275
+ infer_validation, compute_counts_n_metrics, \
1276
+ plot_validation_completions
1277
+
1278
+ torch.cuda.ipc_collect()
1279
+ torch.cuda.empty_cache()
1280
+ gc.collect()
1281
+
1282
+
1283
+ ######################################################
1284
+ # loading trained adapter #
1285
+ ######################################################
1286
+ # Unsloth (when loading model & tokenizer at once, #
+ # as we did in prior tasks) enforces strict #
+ # chat_template format requirements once #
+ # tokenizer.chat_template is set in the tokenizer #
+ # config ; hence the split loading below : #
+ # base model via Unsloth, adapter & tokenizer apart #
1292
+ ######################################################
1293
+ # load base from cache
1294
+ # (with base tokenizer, which we ignore)
1295
+ model, _ = FastLanguageModel.from_pretrained(
1296
+ model_name=self.hf_base_model_dict["repo_id"],
1297
+ revision=self.hf_base_model_dict["commit_hash"],
1298
+ max_seq_length=self.max_seq_length,
1299
+ dtype=None,
1300
+ load_in_4bit=False,
1301
+ # case of a gated or private base-model
1302
+ token=os.getenv("HF_TOKEN", None)
1303
+ )
1304
+ model = FastLanguageModel.for_inference(model)
1305
+ # load our CPT+SFT trained & locally-saved adapter
1306
+ model.load_adapter(peft_model_id=self.sft_model_dir)
1307
+ # Separately load our (potentially trained &)
1308
+ # locally-saved adapter-tokenizer
1309
+ # (loading it below via HF and not Unsloth)
1310
+ tokenizer = AutoTokenizer.from_pretrained(
1311
+ pretrained_model_name_or_path=self.sft_model_dir
1312
+ )
1313
+ ######################################################
1314
+
1315
+ ######################################################
1316
+ # validation dataset #
1317
+ ######################################################
1318
+ # download from Hub (or get from local cache)
1319
+ queries_dataset = load_dataset(
1320
+ path=self.dataset_commit_dict["repo_id"],
1321
+ name="supervised_finetuning",
1322
+ revision=self.dataset_commit_dict["commit_hash"],
1323
+ token=os.getenv("HF_TOKEN", None))
1324
+ if (
1325
+ "records_cap" in self.sft_training_args and
1326
+ self.sft_training_args["records_cap"] is not None and
1327
+ isinstance(self.sft_training_args["records_cap"], int)
1328
+ ):
1329
+ validation_data = queries_dataset["validation"].take(
1330
+ self.sft_training_args["records_cap"])
1331
+ else:
1332
+ validation_data = queries_dataset["validation"]
1333
+ print(validation_data)
1334
+ ######################################################
1335
+
1336
+ self.max_new_tokens = 400
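+ # ^ generation budget per completion for batch
+ # validation inference ; also reused for the
+ # serving config in infra_validator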
1337
+ start_time = time.time()
1338
+ validation_results = infer_validation(
1339
+ tokenizer=tokenizer,
1340
+ model=model,
1341
+ validation_data=validation_data,
1342
+ prompt_template=tokenizer.chat_template,
1343
+ batch_size=32, # 64,
1344
+ queries_attr_name=\
1345
+ self.hf_dataset["attributes"]["query_attr"],
1346
+ answers_attr_name=\
1347
+ self.hf_dataset["attributes"]["answers_attr"],
1348
+ max_new_tokens=self.max_new_tokens,
1349
+ device="cuda"
1350
+ )
1351
+ print("infer_validation - Elapsed time: " +
1352
+ f"{(time.time() - start_time):.2f} seconds")
1353
+ self.validation_results = validation_results # <= to artifacts store
1354
+
1355
+ eval_df = pl.LazyFrame(validation_results)
1356
+
1357
+ records = eval_df.with_columns(
1358
+ (pl.col("answer") == pl.col("completion")) \
1359
+ .alias("is_ground_truth_identical")
1360
+ ).collect() #engine=self.engine)
1361
+ print("perfect characters-match accuracy : " +
1362
+ str(records['is_ground_truth_identical'].mean()))
1363
+
1364
+ eval_metrics_df = compute_counts_n_metrics(
1365
+ eval_df, is_format_fault_tolerant=True)
1366
+ overall_metrics_df = eval_metrics_df.select([
1367
+ pl.col("precision").mean(),
1368
+ pl.col("recall").mean(),
1369
+ pl.col("f1").mean(),
1370
+ pl.col("jaccard").mean()
1371
+ ]).collect() #engine=self.engine)
1372
+ self.perf_metrics = overall_metrics_df.row(0, named=True)
1373
+ print(self.perf_metrics)
1374
+
1375
+ self.validation_completions_fig = \
1376
+ plot_validation_completions(
1377
+ eval_metrics_df, engine=self.engine)
1378
+
1379
+ del model
1380
+ del tokenizer
1381
+ torch.cuda.ipc_collect()
1382
+ torch.cuda.empty_cache()
1383
+ gc.collect()
1384
+
1385
+ self.next(self.model_version_blessing)
1386
+
1387
+
1388
+ @step
1389
+ def model_version_blessing(self):
1390
+ """
1391
+ """
1392
+ from retrain_pipelines.model.hf_utils import \
1393
+ current_blessed_model_version_dict
1394
+
1395
+ main_perf_metric_name = "jaccard"
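+ # ^ single blessing criterion : the mean jaccard
+ # score from evaluate_model's perf_metrics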
1396
+
1397
+ current_blessed_version_dict = \
1398
+ current_blessed_model_version_dict(
1399
+ repo_id=self.model_repo_id,
1400
+ hf_token=os.getenv("HF_TOKEN", None)
1401
+ )
1402
+ print("current_blessed_version_dict : " +
1403
+ str(current_blessed_version_dict))
1404
+
1405
+ if current_blessed_version_dict is None:
1406
+ print("case 'no prior blessed model version found"
1407
+ " => blessing.'")
1408
+ self.model_version_blessed = True
1409
+
1410
+ elif (
1411
+ main_perf_metric_name in
1412
+ current_blessed_version_dict["perf_metrics"]
1413
+ ):
1414
+ current_blessed_run_id = \
1415
+ current_blessed_version_dict["mf_run_id"]
1416
+ current_blessed_metric_value = \
1417
+ current_blessed_version_dict[
1418
+ "perf_metrics"][main_perf_metric_name]
1419
+
1420
+ self.model_version_blessed = (
1421
+ self.perf_metrics[main_perf_metric_name] >=
1422
+ current_blessed_metric_value
1423
+ )
1424
+
1425
+ if not self.model_version_blessed:
1426
+ self.current_blessed_version_dict = \
1427
+ current_blessed_version_dict
1428
+ for run in Flow(self.__class__.__name__):
1429
+ if str(run.id) == current_blessed_run_id:
1430
+ self.current_blessed_run = run
1431
+ break
1432
+ if not self.current_blessed_run:
1433
+ raise Exception(
1434
+ "Couldn't find blessed run " +
1435
+ f"{current_blessed_run_id} !")
1436
+
1437
+ print("new : " +
1438
+ str(self.perf_metrics[main_perf_metric_name]) +
1439
+ " - previous best : " +
1440
+ str(current_blessed_metric_value) +
1441
+ " - model_version_blessing : " +
1442
+ str(self.model_version_blessed))
1443
+
1444
+ else:
1445
+ raise Exception(
1446
+ "Performance metric '" +
1447
+ main_perf_metric_name +
1448
+ "' can't be found in eval results " +
1449
+ "from blessed run " +
1450
+ str(current_blessed_version_dict[
1451
+ "mf_run_id"]) + " !")
1452
+
1453
+ # self.model_version_blessed = True ### DEBUG - DELETE ###
1454
+
1455
+ self.next(self.model_to_hub)
1456
+
1457
+
1458
+ @step
1459
+ def model_to_hub(self):
1460
+ """
1461
+ Push to hub model version, including
1462
+ readme with versioning info.
1463
+ """
1464
+
1465
+ #############################
1466
+ # case of user-provided #
1467
+ # documentation artifact(s) #
1468
+ #############################
1469
+ # note that user can provide either
1470
+ # 'pipeline_card.py' or 'template.html'
1471
+ # or 'dataset_readme.py'
1472
+ # or 'dataset_readme_template.md'
1473
+ # or 'model_readme.py'
1474
+ # or 'model_readme_template.md'
1475
+ # or any combination of those
1476
+ # when specifying custom
1477
+ # 'pipeline_card_artifacts_path'
1478
+ if (
1479
+ "model_readme_template.md" in
1480
+ os.listdir(self.pipeline_card_artifacts_path)
1481
+ ):
1482
+ template_dir = self.pipeline_card_artifacts_path
1483
+ else:
1484
+ template_dir = os.path.dirname(
1485
+ importlib.util.find_spec(
1486
+ f"retrain_pipelines.pipeline_card."+
1487
+ f"{os.getenv('retrain_pipeline_type')}"
1488
+ ).origin)
1489
+ print(f"template_dir : '{template_dir}'")
1490
+ #############################
1491
+ if "model_readme.py" in os.listdir(
1492
+ self.pipeline_card_artifacts_path):
1493
+ from retrain_pipelines.utils import \
1494
+ get_get_model_readme_content
1495
+ get_model_readme_content = \
1496
+ get_get_model_readme_content(
1497
+ self.pipeline_card_artifacts_path)
1498
+ else:
1499
+ from retrain_pipelines.pipeline_card import \
1500
+ get_model_readme_content
1501
+ #############################
1502
+ from retrain_pipelines.model.hf_utils import \
1503
+ push_model_version_to_hub
1504
+
1505
+ #############################
1506
+ # model README #
1507
+ # from template #
1508
+ #############################
1509
+ commit_datetime = datetime.utcnow()
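+ # next minor version label for the model repo
+ # (e.g., hypothetically, "0.1" -> "0.2")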
1510
+ new_model_version_label = get_new_repo_minor_version(
1511
+ repo_id=self.model_repo_id,
1512
+ repo_type="model",
1513
+ hf_token=os.getenv("HF_TOKEN", None))
1514
+ readme_content = get_model_readme_content(
1515
+ template_folder=template_dir,
1516
+
1517
+ model_repo_id=self.model_repo_id,
1518
+
1519
+ base_model_dict=self.hf_base_model_dict,
1520
+ training_dataset_dict=self.dataset_commit_dict,
1521
+
1522
+ version_label=new_model_version_label,
1523
+ commit_datetime=commit_datetime,
1524
+ perf_metrics=self.perf_metrics,
1525
+
1526
+ mf_flow_name=current.flow_name,
1527
+ mf_run_id=current.run.id
1528
+ )
1529
+ #############################
1530
+
1531
+ print("Pushing model version to HF hub " +
1532
+ ("(blessed). " if self.model_version_blessed
1533
+ else "(not blessed). ") +
1534
+ "May take a while..",
1535
+ flush=True)
1536
+ model_commit_hash = push_model_version_to_hub(
1537
+ repo_id=self.model_repo_id,
1538
+ model_version_blessed=\
1539
+ self.model_version_blessed,
1540
+ version_label=new_model_version_label,
1541
+ timestamp_str=commit_datetime.strftime(
1542
+ "%Y-%m-%d %H:%M:%S UTC"),
1543
+ model_dir=self.sft_model_dir,
1544
+ model_readme_content=readme_content,
1545
+ hf_token=os.getenv("HF_TOKEN", None)
1546
+ )
1547
+ if not model_commit_hash:
1548
+ raise Exception(
1549
+ "Failed to publish model version.")
1550
+ print("Push of model version to HF hub completed.",
1551
+ flush=True)
1552
+ print(f"https://huggingface.co/{self.model_repo_id}" +
1553
+ f"/blob/{model_commit_hash}/README.md")
1554
+
1555
+ self.model_commit_dict = {
1556
+ "repo_id": self.model_repo_id,
1557
+ "commit_hash": model_commit_hash,
1558
+ "version_label": new_model_version_label,
1559
+ "commit_datetime": commit_datetime,
1560
+ }
1561
+
1562
+ self.next(self.infra_validator)
1563
+
1564
+
1565
+ @step
1566
+ def infra_validator(self):
1567
+ """
1568
+ If the trained model version is blessed,
1569
+ validate serving.
1570
+ """
1571
+ """
1572
+ Note that running this step in an isolated virtual
+ env (e.g. via the @conda task decorator) would be
+ advisable, so as not to ship the whole pipeline's
+ dependencies into the local server.
+ We don't here, for educational purposes :
+ it keeps things "simple" to grasp
+ and avoids forcing conda (for instance miniconda)
+ on the user as a virtual-environment manager.
1582
+ """
1583
+ """
1584
+ Note : we load the base model from the HF cache
+ (mounted as the /huggingface_hub_cache docker
+ volume) and the adapter from a local dir
+ (mounted as the /FuncCallAdapter docker volume).
1588
+ """
1589
+
1590
+ self.local_serve_is_ready = LocalServeReadinessEnum.NOT_APPLICABLE
1591
+
1592
+ if self.model_version_blessed:
1593
+ from retrain_pipelines.utils.docker import \
1594
+ env_has_docker
1595
+
1596
+ if env_has_docker():
1597
+ model_module_dir = \
1598
+ os.path.dirname(
1599
+ importlib.util.find_spec(
1600
+ "retrain_pipelines.model." +
1601
+ os.getenv('retrain_pipeline_type')
1602
+ ).origin)
1603
+
1604
+ # server & data-model & server-config modules artifacts
1605
+ files_to_copy = [
1606
+ "litserve_server.py",
1607
+ "litserve_datamodel.py",
1608
+ "litserve_serverconfig.py",
1609
+ ".dockerignore" # docker context loading
1610
+ # at image-build time,
1611
+ # exclude model weights
1612
+ ]
1613
+ for filename in files_to_copy:
1614
+ shutil.copy(
1615
+ os.path.join(model_module_dir, "litserve",
1616
+ filename),
1617
+ os.path.join(self.serving_artifacts_local_folder,
1618
+ filename)
1619
+ )
1620
+
1621
+ # save dependencies as artifact
1622
+ create_requirements(self.serving_artifacts_local_folder,
1623
+ exclude=["cudf-polars-.*", "cuda-python",
1624
+ "nvidia-.*", "(py)?libcudf-.*",
1625
+ "nvtx", "rmm-.*", "litserve",
1626
+ ".*retrain-pipelines.*"]
1627
+ )
1628
+
1629
+ # server config yaml
1630
+ env = Environment(loader=FileSystemLoader(
1631
+ os.path.join(model_module_dir, "litserve")))
1632
+ template = env.get_template(
1633
+ "litserve_serverconfig_template.yaml")
1634
+ server_config_data = {
1635
+ "port": "8000",
1636
+ "max_seq_length": self.max_seq_length,
1637
+ "max_new_token": self.max_new_tokens,
1638
+ "base_model": {
1639
+ "repo_id": self.hf_base_model_dict["repo_id"],
1640
+ "revision": self.hf_base_model_dict["commit_hash"]
1641
+ },
1642
+ "adapters": [
1643
+ {
1644
+ "name": "func_caller",
1645
+ "path": "/FuncCallAdapter"
1646
+ }
1647
+ ]
1648
+ }
1649
+ server_config_yaml = template.render(server_config_data)
1650
+ print(server_config_yaml)
1651
+ with open(os.path.join(
1652
+ self.serving_artifacts_local_folder,
1653
+ "litserve_serverconfig.yaml"), 'w'
1654
+ ) as output_file:
1655
+ output_file.write(server_config_yaml)
1656
+
1657
+ # Dockerfile
1658
+ env = Environment(loader=FileSystemLoader(
1659
+ os.path.join(model_module_dir)))
1660
+ template = env.get_template(
1661
+ "Dockerfile.litserve_template")
1662
+ # Change CUDA version here from available list
1663
+ # @see https://hub.docker.com/r/nvidia/cuda/tags
1664
+ dockerfile_content = template.render(
1665
+ {"cuda_version": "12.0.0"})
1666
+ with open(os.path.join(
1667
+ self.serving_artifacts_local_folder,
1668
+ "Dockerfile.litserve"), 'w'
1669
+ ) as output_file:
1670
+ output_file.write(dockerfile_content)
1671
+
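+ # make sure the health-check and inference calls to
+ # the local endpoint below bypass any configured proxy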
1672
+ os.environ["no_proxy"] = "localhost,127.0.0.1,0.0.0.0"
1673
+
1674
+ ############################################
1675
+ # actually deploy the inference service #
1676
+ ############################################
1677
+ start_time = time.time()
1678
+ from retrain_pipelines.utils.docker import \
1679
+ build_and_run_docker, print_container_log_tail, \
1680
+ cleanup_docker
1681
+ from retrain_pipelines.model.litserve import \
1682
+ endpoint_started, endpoint_is_ready
1683
+
1684
+ self.port = 8765
1685
+ HF_HUB_CACHE = os.path.realpath(os.path.expanduser(
1686
+ os.getenv(
1687
+ "HF_HUB_CACHE",
1688
+ os.path.join(os.getenv("HF_HOME",
1689
+ "~/.cache/huggingface"),
1690
+ "hub")
1691
+ )))
1692
+ print(f"HF_HUB_CACHE : {HF_HUB_CACHE}")
1693
+ image_name = container_name = "litserve-model"
1694
+
1695
+ serving_container = build_and_run_docker(
1696
+ image_name=image_name, image_tag="1.0",
1697
+ build_path=self.serving_artifacts_local_folder,
1698
+ dockerfile="Dockerfile.litserve",
1699
+ ports_publish_dict={'8000/tcp': self.port},
1700
+ env_vars_dict={
1701
+ "HF_HUB_CACHE": "/huggingface_hub_cache",
1702
+ "HF_TOKEN": os.getenv("HF_TOKEN")
1703
+ },
1704
+ volumes_dict={
1705
+ self.sft_model_dir:
1706
+ {"bind": "/FuncCallAdapter",
1707
+ "mode": "ro"},
1708
+ HF_HUB_CACHE:
1709
+ {"bind": "/huggingface_hub_cache",
1710
+ "mode": "ro"}
1711
+ }
1712
+ )
1713
+
1714
+ if not serving_container:
1715
+ print("failed spinning the LitServe container",
1716
+ file=sys.stderr)
1717
+ self.local_serve_is_ready = \
1718
+ LocalServeReadinessEnum.FAILURE
1719
+ try:
1720
+ cleanup_docker(
1721
+ container_name=container_name,
1722
+ image_name=f"{image_name}:1.0",
1723
+ no_pruning=True # for intermediate layers recycling
1724
+ # (during later re-runs)
1725
+ # to avoid long rebuild time
1726
+ # of exactly the same.
1727
+ )
1728
+ except Exception as cleanup_ex:
1729
+ # fail silently
1730
+ pass
1731
+ else:
1732
+ print("Awaiting endpoint launch..")
1733
+ start_time = time.time()
1734
+ if not endpoint_started(
1735
+ container_name, port=self.port, timeout=10*60
1736
+ ):
1737
+ print(
1738
+ f"The endpoint '{container_name}' " +
1739
+ f"did not start.")
1740
+ self.local_serve_is_ready = \
1741
+ LocalServeReadinessEnum.FAILURE
1742
+ # health check on the spun-up endpoint
1743
+ elif endpoint_is_ready(port=self.port):
1744
+ self.local_serve_is_ready = \
1745
+ LocalServeReadinessEnum.SUCCESS
1746
+ elapsed_time = time.time() - start_time
1747
+ print("deploy_local - Elapsed time: " +
1748
+ f"{elapsed_time:.2f} seconds")
1749
+ ############################################
1750
+ else:
1751
+ # env doesn't have docker
1752
+ self.local_serve_is_ready = \
1753
+ LocalServeReadinessEnum.FAILURE_NO_DOCKER
1754
+
1755
+ if LocalServeReadinessEnum.SUCCESS == self.local_serve_is_ready:
1756
+ from retrain_pipelines.model.litserve.litserve_datamodel \
1757
+ import Response
1758
+
1759
+ import requests
1760
+
1761
+ url = f"http://localhost:{self.port}/predict"
1762
+ headers = {"accept": "application/x-www-form-urlencoded"}
1763
+
1764
+ try:
1765
+ start_time = time.time()
1766
+ data = {
1767
+ "adapter_name": "func_caller",
1768
+ "queries": '["Hello.", "Is 49 a perfect square?"]'
1769
+ }
1770
+ print(f"inference test - data: {data}")
1771
+ response = requests.post(url, headers=headers, data=data)
1772
+ parsed_response = Response(**{"output": response.json()})
1773
+ elapsed_time = time.time() - start_time
1774
+ print("parsed_response ('func_caller' adapter ON) :" +
1775
+ str(parsed_response) +
1776
+ f"\t-\tElapsed time: {elapsed_time:.2f} seconds")
1777
+
1778
+ start_time = time.time()
1779
+ data = {
1780
+ "queries": '["Hello.", "Is 49 a perfect square?"]'
1781
+ }
1782
+ print(f"inference test - data: {data}")
1783
+ response = requests.post(url, headers=headers, data=data)
1784
+ parsed_response = Response(**{"output": response.json()})
1785
+ elapsed_time = time.time() - start_time
1786
+ print(f"parsed_response (no adapter) : {parsed_response}" +
1787
+ f"\t-\tElapsed time: {elapsed_time:.2f} seconds")
1788
+
1789
+ except Exception as ex:
1790
+ print(ex, file=sys.stderr)
1791
+ traceback.print_tb(ex.__traceback__, file=sys.stderr)
1792
+ self.local_serve_is_ready = \
1793
+ LocalServeReadinessEnum.FAILURE
1794
+ pass
1795
+
1796
+ try:
1797
+ cleanup_docker(
1798
+ container_name=container_name,
1799
+ image_name=f"{image_name}:1.0",
1800
+ no_pruning=True # for intermediate layers recycling
1801
+ # (during later re-runs)
1802
+ # to avoid long rebuild time
1803
+ # of exactly the same.
1804
+ )
1805
+ except Exception as cleanup_ex:
1806
+ # fail silently
1807
+ pass
1808
+
1809
+ self.next(self.pipeline_card)
1810
+
1811
+
1812
+ @card(id='default')
1813
+ @card(type='html', id='custom')
1814
+ @step
1815
+ def pipeline_card(self):
1816
+ import re
1817
+ import datetime
1818
+ import importlib.metadata
1819
+
1820
+ #############################
1821
+ # case of user-provided #
1822
+ # documentation artifact(s) #
1823
+ #############################
1824
+ # note that user can provide either
1825
+ # 'pipeline_card.py' or 'template.html'
1826
+ # or 'dataset_readme.py'
1827
+ # or 'dataset_readme_template.md'
1828
+ # or 'model_readme.py'
1829
+ # or 'model_readme_template.md'
1830
+ # or any combination of those
1831
+ # when specifying custom
1832
+ # 'pipeline_card_artifacts_path'
1833
+ if "template.html" in os.listdir(
1834
+ self.pipeline_card_artifacts_path
1835
+ ):
1836
+ template_dir = self.pipeline_card_artifacts_path
1837
+ else:
1838
+ template_dir = os.path.dirname(
1839
+ importlib.util.find_spec(
1840
+ f"retrain_pipelines.pipeline_card."+
1841
+ f"{os.getenv('retrain_pipeline_type')}"
1842
+ ).origin)
1843
+ #############################
1844
+ if "pipeline_card.py" in os.listdir(
1845
+ self.pipeline_card_artifacts_path
1846
+ ):
1847
+ from retrain_pipelines.utils import get_get_html
1848
+ get_html = \
1849
+ get_get_html(self.pipeline_card_artifacts_path)
1850
+ else:
1851
+ from retrain_pipelines.pipeline_card import \
1852
+ get_html
1853
+ from retrain_pipelines.pipeline_card.helpers import \
1854
+ mf_dag_svg
1855
+ #############################
1856
+
1857
+
1858
+ #############################
1859
+ ## "default" card ##
1860
+ #############################
1861
+ self.metadata = {
1862
+ "name": "TabNet Model",
1863
+ "version": "1.0",
1864
+ "retrain_pipelines": f"retrain-pipelines {__version__}",
1865
+ "retrain_pipeline_type": os.environ["retrain_pipeline_type"],
1866
+ "description": "A PyTorch TabNet model retrained",
1867
+ "authors": [current.username],
1868
+ "tags": ["classification", "tabnet"],
1869
+ "license": "MIT License",
1870
+ "data_augmentation": [
1871
+ {
1872
+ "name": "Augmentation",
1873
+ "description": "Truncating queries and " + \
1874
+ "associate those to " + \
1875
+ "no tool-call answers. " + \
1876
+ "Intent being to instruct on " + \
1877
+ "not hallucinating missing " + \
1878
+ "tool-calls parameters values."
1879
+ },
1880
+ {
1881
+ "name": "Enrichment",
1882
+ "description": "Addition of records " + \
1883
+ "from an external data-source. " + \
1884
+ "Here to instruct on no tool-call."
1885
+ }
1886
+ ],
1887
+ "references": [
1888
+ {
1889
+ "title": "Base model",
1890
+ "link": f"https://hf.co/{self.hf_base_model_dict['repo_id']}"
1891
+ },
1892
+ {
1893
+ "title": "Function-calling dataset",
1894
+ "link": f"https://hf.co/{self.hf_dataset_dict['repo_id']}"
1895
+ },
1896
+ {
1897
+ "title": "Data-enrichment dataset",
1898
+ "link": f"https://hf.co/{self.hf_enrich_dataset_dict['repo_id']}"
1899
+ },
1900
+ {
1901
+ "title": "Unsloth",
1902
+ "link": "https://unsloth.ai/blog/contpretraining"
1903
+ }
1904
+ ]
1905
+ }
1906
+
1907
+ current.card['default'].append(Markdown(
1908
+ "model_version_blessed : **%s**" % str(self.model_version_blessed)))
1909
+ current.card['default'].append(Artifact(
1910
+ {"model_version_blessed": self.model_version_blessed}))
1911
+
1912
+ current.card['default'].append(
1913
+ Image.from_matplotlib(self.sft_log_history_fig))
1914
+ current.card['default'].append(
1915
+ Image.from_matplotlib(self.validation_completions_fig))
1916
+ #############################
1917
+
1918
+ #############################
1919
+ ## html "custom" card ##
1920
+ #############################
1921
+ dt = datetime.datetime.now(tz=datetime.timezone.utc)
1922
+ formatted_dt = dt.strftime("%A %b %d %Y %I:%M:%S %p %Z")
1923
+ task_obj_python_cmd = f"metaflow.Task(" + \
1924
+ f"\"{current.pathspec}\", " + \
1925
+ f"attempt={str(current.retry_count)})"
1926
+ params={
1927
+ 'template_dir': template_dir,
1928
+ 'title': f"{current.flow_name}",
1929
+ "subtitle": f"(flow run # {len(list(current.run.parent.runs()))}," + \
1930
+ f" run_id: {str(current.run.id)} - {formatted_dt})",
1931
+ 'model_version_blessed': self.model_version_blessed,
1932
+ 'current_blessed_run': self.current_blessed_run,
1933
+ 'current_blessed_model_commit_hash': (
1934
+ self.current_blessed_version_dict["commit_hash"]
1935
+ if self.current_blessed_version_dict
1936
+ else None
1937
+ ),
1938
+ 'LocalServeReadinessEnum': LocalServeReadinessEnum,
1939
+ 'local_serve_is_ready': self.local_serve_is_ready,
1940
+ # EDA
1941
+ 'main_dataset_repo_id': self.hf_dataset['repo_id'],
1942
+ 'main_dataset_commit_hash': self.hf_dataset_dict['commit_hash'],
1943
+ 'main_dataset_commit_datetime': \
1944
+ self.hf_dataset_dict['commit_datetime'],
1945
+
1946
+ 'records_count': self.records_count,
1947
+ 'data_schema': self.data_schema,
1948
+ 'answers_tools_count_fig': self.answers_tools_count_fig,
1949
+ 'words_count_fig': self.words_count_fig,
1950
+
1951
+ # model training
1952
+ 'dataset_repo_id': self.dataset_repo_id,
1953
+ 'dataset_version_label': self.dataset_commit_dict["version_label"],
1954
+ 'dataset_commit_datetime': self.dataset_commit_dict["commit_datetime"],
1955
+ 'dataset_commit_hash': self.dataset_commit_dict["commit_hash"],
1956
+ 'dataset_augmentation_rate': self.actual_augmentation_rate,
1957
+ 'dataset_enrichment_rate': self.enrichment_rate,
1958
+
1959
+ 'model_repo_id': self.model_repo_id,
1960
+ 'model_version_label': self.model_commit_dict["version_label"],
1961
+ 'model_commit_datetime': self.model_commit_dict["commit_datetime"],
1962
+ 'model_commit_hash': self.model_commit_dict["commit_hash"],
1963
+
1964
+ 'cpt_log_history_fig': self.cpt_log_history_fig,
1965
+ 'sft_log_history_fig': self.sft_log_history_fig,
1966
+
1967
+ 'validation_completions_fig': self.validation_completions_fig,
1968
+
1969
+ 'pipeline_parameters_dict': {"cpt": self.cpt_training_args,
1970
+ "sft": self.sft_training_args},
1971
+
1972
+ 'metrics_dict': self.perf_metrics,
1973
+
1974
+ 'task_obj_python_cmd': task_obj_python_cmd,
1975
+ 'dag_svg': mf_dag_svg(self)
1976
+ }
1977
+ self.html = get_html(params)
1978
+ #############################
1979
1980
+ #############################
1981
+
1982
+ self.next(self.pipeline_to_hub)
1983
+
1984
+
1985
+ @step
1986
+ def pipeline_to_hub(self):
1987
+ """
1988
+ Publish the versioned source-code and pipeline-card
+ for this run on the Hugging Face Hub.
1990
+ """
1991
+
1992
+ model_commit_datetime = \
1993
+ self.model_commit_dict["commit_datetime"]
1994
+ timestamp_str = \
1995
+ "{:%Y%m%d_%H%M%S}".format(model_commit_datetime) + \
1996
+ "{:03d}".format(model_commit_datetime.microsecond//1000) + \
1997
+ "_UTC"
1998
+ subfolder_name = \
1999
+ "v" + self.model_commit_dict["version_label"] + \
2000
+ "_" + timestamp_str
2001
+ commit_datetime = datetime.utcnow()
2002
+
2003
+ ###############################
2004
+ # source-code #
2005
+ ###############################
2006
+ # We upload only herein file #
2007
+ # plus user-provided versions #
2008
+ # of the customizable ones #
2009
+ # (if any). #
2010
+ ###############################
2011
+ custom_source_files = [os.path.abspath(__file__)]
2012
+ if (
2013
+ self.pipeline_card_artifacts_path != \
2014
+ self.default_pipeline_card_module_dir
2015
+ ):
2016
+ candidate_source_files = [
2017
+ "pipeline_card.py",
2018
+ "template.html",
2019
+ "dataset_readme.py",
2020
+ "dataset_readme_template.md",
2021
+ "model_readme.py",
2022
+ "model_readme_template.md"
2023
+ ]
2024
+ for candidate_source_file in candidate_source_files:
2025
+ file_fullpath = os.path.join(
2026
+ self.pipeline_card_artifacts_path,
2027
+ candidate_source_file)
2028
+ if os.path.exists(file_fullpath):
2029
+ custom_source_files.append(file_fullpath)
2030
+
2031
+ source_code_commit_hash = \
2032
+ push_files_to_hub_repo_branch(
2033
+ repo_id=self.model_repo_id,
2034
+ branch_name="retrain-pipelines_source-code",
2035
+ file_fullnames=custom_source_files,
2036
+ include_requirements_txt=True,
2037
+ path_in_repo=subfolder_name,
2038
+ commit_message=\
2039
+ "source-code for model version " + \
2040
+ subfolder_name + \
2041
+ f"- retrain-pipelines {__version__}",
2042
+ repo_type="model",
2043
+ hf_token=os.getenv("HF_TOKEN", None)
2044
+ )
2045
+ print(source_code_commit_hash)
2046
+ self.source_code_commit_dict = {
2047
+ "repo_id": self.model_repo_id,
2048
+ "branch_name": "retrain-pipelines_source-code",
2049
+ "commit_datetime": commit_datetime,
2050
+ "commit_hash": source_code_commit_hash
2051
+ }
2052
+ ###############################
2053
+
2054
+ ###############################
2055
+ # pipeline-card #
2056
+ ###############################
2057
+ pipeline_card_fullname = None
2058
+ for run_step in current.run.steps():
2059
+ task = list(run_step.tasks())[0]
2060
+ task_name = task.path_components[2]
2061
+ if "pipeline_card" == task_name:
2062
+ pipeline_card = get_cards(
2063
+ task, id='custom', type='html')[0]
2064
+ pipeline_card_fullname = os.path.realpath(
2065
+ os.path.join(
2066
+ task.metadata_dict.get("ds-root", None),
2067
+ mf_config.CARD_SUFFIX, pipeline_card.path
2068
+ ))
2069
+ print(pipeline_card_fullname)
2070
+ break
2071
+ pipeline_card_commit_hash = \
2072
+ push_files_to_hub_repo_branch(
2073
+ repo_id=self.model_repo_id,
2074
+ branch_name="retrain-pipelines_pipeline-card",
2075
+ file_fullnames=[pipeline_card_fullname],
2076
+ path_in_repo=subfolder_name,
2077
+ commit_message=\
2078
+ "pipeline-card for model version " + \
2079
+ subfolder_name + \
2080
+ f"- retrain-pipelines {__version__}",
2081
+ repo_type="model",
2082
+ hf_token=os.getenv("HF_TOKEN", None)
2083
+ )
2084
+ print(pipeline_card_commit_hash)
2085
+ self.pipeline_card_commit_dict = {
2086
+ "repo_id": self.model_repo_id,
2087
+ "branch_name": "retrain-pipelines_pipeline-card",
2088
+ "commit_datetime": commit_datetime,
2089
+ "commit_hash": pipeline_card_commit_hash
2090
+ }
2091
+ ###############################
2092
+
2093
+ self.next(self.deploy)
2094
+
2095
+
2096
+ @step
2097
+ def deploy(self):
2098
+ """
2099
+ placeholder for the serving SDK deploy call
2100
+ (on the target production platform).
2101
+ Include any artifact you want ;
+ consider including the portable pipeline-card
2103
+ itself !
2104
+ """
2105
+
2106
+ if (
2107
+ self.model_version_blessed and
2108
+ (self.local_serve_is_ready == LocalServeReadinessEnum.SUCCESS)
2109
+ ):
2110
+ pass # your code here
2111
+
2112
+ self.next(self.load_test)
2113
+
2114
+
2115
+ @step
2116
+ def load_test(self):
2117
+ """
2118
+ placeholder
2119
+ """
2120
+
2121
+ if (
2122
+ self.model_version_blessed and
2123
+ (self.local_serve_is_ready == LocalServeReadinessEnum.SUCCESS)
2124
+ ):
2125
+ pass # your code here
2126
+
2127
+ self.next(self.end)
2128
+
2129
+
2130
+ @step
2131
+ def end(self):
2132
+ pass
2133
+
2134
+
2135
+ if __name__ == "__main__":
2136
+ UnslothFuncCallFlow()
2137
+