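# Each entry below describes one Hugging Face Hub dataset shard used for base
# (pre-)training. Recognized keys, inferred from the entries themselves:
#   kind     - always 'base' here
#   path     - Hugging Face Hub dataset id
#   name     - optional dataset config name
#   revision - optional Hub revision (e.g. 'refs/convert/parquet')
#   split    - optional split selector; percent slices like 'train[0%:5%]'
#              shard large datasets into smaller, independently loadable chunks
#   format   - a callable mapping a row to text, or a str.format-style template
# A usage sketch follows the list.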
base_datasets = [
#
# general
#
# 3.35 GB, 1,000,000 - Curated RefinedWeb with medium context length (2048 <= ctx_len <= 8192)
*[
{'kind': 'base', 'path': 'vilm/refinedweb-1m-medium', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
for i in range(0, 100, 5)
],
# 4.01 GB, 1,360,929
*[
{'kind': 'base', 'path': 'deatos/fineweb-edu-mini-combined', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
for i in range(0, 100, 5)
],
#
# multilingual
#
# 3.17 GB, 2,226,907
*[
{'kind': 'base', 'path': 'ontocord/fineweb-permissive-multilingual-2m', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
for i in range(0, 100, 5)
],
# 1.64 GB, 1,001,000
*[
{'kind': 'base', 'path': 'distily/c4_multilingual_1M', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
for i in range(0, 100, 5)
],
# 742 MB, 321,697
*[
        {'kind': 'base', 'path': 'data-silence/sumnews', 'split': split, 'format': lambda n, field=field: n[field]}  # default arg binds field per iteration; a plain closure would make every lambda read the final 'news'
for split in ['train', 'test']
for field in ['title', 'resume', 'news']
],
# 193 MB, 1,141,967
*[
{'kind': 'base', 'path': 'xu-song/cc100-samples', 'name': name, 'split': 'train', 'format': lambda n: n['text']}
for name in [
'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'bn_rom', 'br',
'bs', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en', 'eo', 'es',
'et', 'eu', 'fa', 'ff', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl',
'gn', 'gu', 'ha', 'he', 'hi', 'hi_rom', 'hr', 'ht', 'hu',
'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km',
'kn', 'ko', 'ku', 'ky', 'la', 'lg', 'li', 'ln', 'lo', 'lt',
'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms', 'my', 'my_zaw',
'ne', 'nl', 'no', 'ns', 'om', 'or', 'pa', 'pl', 'ps', 'pt',
'qu', 'rm', 'ro', 'ru', 'sa', 'si', 'sc', 'sd', 'sk', 'sl',
'so', 'sq', 'sr', 'ss', 'su', 'sv', 'sw', 'ta', 'ta_rom',
'te', 'te_rom', 'th', 'tl', 'tn', 'tr', 'ug', 'uk', 'ur',
'ur_rom', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo',
'zh-Hans', 'zh-Hant', 'zu',
]
],
#
# misc
#
# 472 KB, 5,034
{'kind': 'base', 'path': 'badrex/llm-emoji-dataset', 'format': '{short description}. {LLM description}. {character}'},
#
# math
#
# 7.1 MB, 400,000
*[
{'kind': 'base', 'path': 'garrethlee/simple-arithmetic-problems', 'name': name, 'split': split, 'format': lambda n: n['question'].strip() + ' ' + n['answer'].strip()}
for name in [
'very_easy', 'very_easy_use_commas',
'easy', 'easy_use_commas',
'medium', 'medium_use_commas',
'hard', 'hard_use_commas',
'very_hard', 'very_hard_use_commas',
]
for split in [
'int_add_train', 'int_add_test',
'float_add_train', 'float_add_test',
'int_subtract_train', 'int_subtract_test',
'float_subtract_train', 'float_subtract_test',
'int_multiply_train', 'int_multiply_test',
'float_multiply_train', 'float_multiply_test',
'int_divide_train', 'int_divide_test',
'float_divide_train', 'float_divide_test',
]
],
# 12.2 MB, 500,000
{'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{instruction} = {output}'},
{'kind': 'base', 'path': 'fblgit/simple-math', 'revision': 'refs/convert/parquet', 'split': 'test', 'format': '{instruction} = {output}'},
# 125 MB, 1,000,000
{'kind': 'base', 'path': 'Gusarich/math-expressions-1m', 'revision': 'refs/convert/parquet', 'split': 'train', 'format': '{expression} = {result}'},
#
# stem
#
# 1.44 GB, 63,357
*[
{'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['abstract']}
for i in range(0, 100, 10)
],
*[
{'kind': 'base', 'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['markdown']}
for i in range(0, 100, 10)
],
#
# code
#
# 36.8 MB, 79,013 - Rosetta Code currently has 1,203 tasks, 389 draft tasks, and is aware of 883 languages
{'kind': 'base', 'path': 'christopher/rosetta-code', 'format': lambda n: n['code']},
# 1.62 GB, 1,632,309 - Python, TypeScript, JavaScript, Ruby, Julia, Rust, C++, Bash, Java, C#, and Go; SQL, Cypher
*[
{'kind': 'base', 'path': 'nampdn-ai/tiny-codes', 'split': f'train[{i}%:{i + 10}%]', 'format': '{prompt} {response}'}
for i in range(0, 100, 10)
],
#
# math / code
#
# 2.23 GB, 719,244
*[
{'kind': 'base', 'path': 'MathGenie/MathCode-Pile', 'split': f'train[{i}%:{i + 10}%]', 'format': lambda n: n['text']}
for i in range(0, 100, 10)
],
#
# general knowledge
#
# 4.03 GB, 6,035,374
*[
{'kind': 'base', 'path': 'TAWGCreatology/en-wiki-paraphrased-cleaned', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['paraphrase']}
for i in range(0, 100, 5)
],
# 3.18 GB, 1,010,500 - uncompressed 6GB
*[
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['text']}
for i in range(0, 100, 5)
],
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'validation', 'format': lambda n: n['text']},
{'kind': 'base', 'path': 'JeanKaddour/minipile', 'split': 'test', 'format': lambda n: n['text']},
#
# light instructions
#
# 44.3 MB, 51,760
{'kind': 'base', 'path': 'yahma/alpaca-cleaned', 'split': 'train', 'format': '{instruction}\n{input}\n{output}'},
# 11 MB, 12,564
{'kind': 'base', 'path': 'Cleanlab/databricks-dolly-15k-cleanset', 'split': 'train', 'format': '{instruction}\n{context}\n{response}'},
# 15.6 MB, 24,926
{'kind': 'base', 'path': 'garage-bAInd/Open-Platypus', 'split': 'train', 'format': '{instruction}\n{output}'},
]
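
# A minimal usage sketch, assuming the Hugging Face `datasets` library; the
# helper name `iter_formatted` is illustrative and not part of this file.
# Callable `format` values receive the raw row; string values are applied as
# str.format-style templates via `format_map` (which tolerates field names
# containing spaces, e.g. '{short description}').
def iter_formatted(entry):
    from datasets import load_dataset

    kwargs = {key: entry[key] for key in ('name', 'revision') if key in entry}
    # entries without an explicit split are assumed to default to 'train'
    dataset = load_dataset(entry['path'], split=entry.get('split', 'train'), **kwargs)
    fmt = entry['format']

    for row in dataset:
        yield fmt(row) if callable(fmt) else fmt.format_map(row)

# e.g.: for text in iter_formatted(base_datasets[0]): ...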