File size: 9,053 Bytes
463cc27
1
2
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2024-09-22T15:43:44.884747Z","iopub.status.busy":"2024-09-22T15:43:44.884016Z","iopub.status.idle":"2024-09-22T15:43:53.003699Z","shell.execute_reply":"2024-09-22T15:43:53.002880Z","shell.execute_reply.started":"2024-09-22T15:43:44.884711Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"10615553125b47bbb283c6d15b9d8ac3","version_major":2,"version_minor":0},"text/plain":["Downloading readme:   0%|          | 0.00/624 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"6ee918c192bf476fb66c29742418b4ca","version_major":2,"version_minor":0},"text/plain":["Downloading data:   0%|          | 0.00/86.1M [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"7d7dcb663d2747d5bd509b4931809910","version_major":2,"version_minor":0},"text/plain":["Downloading data:   0%|          | 0.00/94.2k [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"1a364f3f533f4edfa042201e1825d207","version_major":2,"version_minor":0},"text/plain":["Generating train split:   0%|          | 0/2397414 [00:00<?, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"76d84909e197482ea32cdd4a4e035ee3","version_major":2,"version_minor":0},"text/plain":["Generating validation split:   0%|          | 0/2804 [00:00<?, ? 
examples/s]"]},"metadata":{},"output_type":"display_data"}],"source":["from datasets import load_dataset\n","\n","ds = load_dataset(\"Saugatkafley/Nepali-Roman-Transliteration\")"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T15:48:28.037465Z","iopub.status.busy":"2024-09-22T15:48:28.036523Z","iopub.status.idle":"2024-09-22T15:48:28.042501Z","shell.execute_reply":"2024-09-22T15:48:28.041586Z","shell.execute_reply.started":"2024-09-22T15:48:28.037419Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["DatasetDict({\n","    train: Dataset({\n","        features: ['unique_identifier', 'native word', 'english word'],\n","        num_rows: 2397414\n","    })\n","    validation: Dataset({\n","        features: ['unique_identifier', 'native word', 'english word'],\n","        num_rows: 2804\n","    })\n","})\n"]}],"source":["print(ds)"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T15:51:52.459691Z","iopub.status.busy":"2024-09-22T15:51:52.458718Z","iopub.status.idle":"2024-09-22T15:51:52.495039Z","shell.execute_reply":"2024-09-22T15:51:52.493900Z","shell.execute_reply.started":"2024-09-22T15:51:52.459633Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"d15ba4abe5c342d5afe42cc8959365bb","version_major":2,"version_minor":0},"text/plain":["VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"]},"metadata":{},"output_type":"display_data"}],"source":["# %pip install huggingface_hub\n","\n","from huggingface_hub import 
notebook_login\n","\n","notebook_login()"]},{"cell_type":"code","execution_count":15,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T16:06:46.498144Z","iopub.status.busy":"2024-09-22T16:06:46.497197Z","iopub.status.idle":"2024-09-22T16:09:05.290568Z","shell.execute_reply":"2024-09-22T16:09:05.288867Z","shell.execute_reply.started":"2024-09-22T16:06:46.498097Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["100%|██████████| 2397414/2397414 [02:11<00:00, 18234.37it/s]\n","100%|██████████| 2804/2804 [00:00<00:00, 19079.34it/s]\n"]}],"source":["from datasets import DatasetDict, Dataset\n","from tqdm import tqdm\n","\n","\n","def transform_dataset(dataset):\n","    \"\"\"Convert one split to the id/translation record layout.\n","\n","    Args:\n","        dataset: A datasets.Dataset with the columns 'unique_identifier',\n","            'english word' and 'native word'.\n","\n","    Returns:\n","        A datasets.Dataset with the columns 'id' and 'translation', where\n","        'translation' is a dict with the keys 'roman' and 'nepali'.\n","    \"\"\"\n","    def to_translation(example):\n","        return {\n","            'id': example['unique_identifier'],\n","            'translation': {\n","                'roman': example['english word'],\n","                'nepali': example['native word'],\n","            },\n","        }\n","\n","    # Let Dataset.map build the new table instead of first collecting every\n","    # transformed row in a Python list (the train split has ~2.4M rows).\n","    # remove_columns drops the source columns so only the new ones remain.\n","    return dataset.map(to_translation, remove_columns=dataset.column_names)\n","\n","transformed_train = transform_dataset(ds['train'])\n","transformed_validation = transform_dataset(ds['validation'])"]},{"cell_type":"code","execution_count":16,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T16:09:19.328280Z","iopub.status.busy":"2024-09-22T16:09:19.327739Z","iopub.status.idle":"2024-09-22T16:09:19.350833Z","shell.execute_reply":"2024-09-22T16:09:19.349569Z","shell.execute_reply.started":"2024-09-22T16:09:19.328241Z"},"trusted":true},"outputs":[],"source":["transformed_dataset = DatasetDict({\n","    'train': 
transformed_train,\n","    'validation': transformed_validation\n","})"]},{"cell_type":"code","execution_count":17,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T16:09:23.069359Z","iopub.status.busy":"2024-09-22T16:09:23.068241Z","iopub.status.idle":"2024-09-22T16:09:23.074713Z","shell.execute_reply":"2024-09-22T16:09:23.073749Z","shell.execute_reply.started":"2024-09-22T16:09:23.069316Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["DatasetDict({\n","    train: Dataset({\n","        features: ['id', 'translation'],\n","        num_rows: 2397414\n","    })\n","    validation: Dataset({\n","        features: ['id', 'translation'],\n","        num_rows: 2804\n","    })\n","})\n"]}],"source":["print(transformed_dataset)"]},{"cell_type":"code","execution_count":18,"metadata":{"execution":{"iopub.execute_input":"2024-09-22T16:09:33.982265Z","iopub.status.busy":"2024-09-22T16:09:33.981439Z","iopub.status.idle":"2024-09-22T16:09:42.214228Z","shell.execute_reply":"2024-09-22T16:09:42.213022Z","shell.execute_reply.started":"2024-09-22T16:09:33.982224Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"c0871bd3cfd84bbfa54346500196cc28","version_major":2,"version_minor":0},"text/plain":["Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"a1d23f24909544b19842206be23976a5","version_major":2,"version_minor":0},"text/plain":["Creating parquet from Arrow format:   0%|          | 0/2398 [00:00<?, ?ba/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"641a18429d8c4fcfacdd568aa5e9e9ad","version_major":2,"version_minor":0},"text/plain":["Uploading the dataset shards:   0%|          | 0/1 [00:00<?, 
?it/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"3b6e0b99110644a29d97aba3ee927aa3","version_major":2,"version_minor":0},"text/plain":["Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"081765380fb64a8eb0e6122590532a0f","version_major":2,"version_minor":0},"text/plain":["README.md:   0%|          | 0.00/683 [00:00<?, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["{'id': 'nep1', 'translation': {'nepali': 'मुस्कुराउँदै', 'roman': 'muskuraundai'}}\n","{'id': 'nep1', 'translation': {'nepali': 'सर्वसाधारणसम्मले', 'roman': 'sarwasadharansammale'}}\n"]}],"source":["# Upload the transformed dataset to the Hugging Face Hub\n","transformed_dataset.push_to_hub('syubraj/roman2nepali-transliteration')\n","\n","# Sanity check: print the first example of each split\n","print(transformed_dataset['train'][0])\n","print(transformed_dataset['validation'][0])"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30761,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.0"}},"nbformat":4,"nbformat_minor":4}