Upload 12 files
Browse files- .gitattributes +2 -0
- app.py +28 -0
- requirements.txt +6 -0
- translate_model_finetune.zip +3 -0
- translate_model_finetune/config.json +56 -0
- translate_model_finetune/generation_config.json +16 -0
- translate_model_finetune/model.safetensors +3 -0
- translate_model_finetune/source.spm +3 -0
- translate_model_finetune/special_tokens_map.json +5 -0
- translate_model_finetune/target.spm +3 -0
- translate_model_finetune/tokenizer_config.json +39 -0
- translate_model_finetune/vocab.json +0 -0
- translater.ipynb +1035 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
translate_model_finetune/source.spm filter=lfs diff=lfs merge=lfs -text
|
37 |
+
translate_model_finetune/target.spm filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from transformers import MarianMTModel, AutoTokenizer,pipeline
|
3 |
+
import torch
|
4 |
+
|
5 |
+
st.title("Translate Fine tunning model")

# Directory holding the fine-tuned Marian checkpoint and its tokenizer files.
model_dir = "translate_model_finetune"


@st.cache_resource
def _load_translator(path):
    """Load the tokenizer and model stored under *path*.

    Streamlit re-executes this script from the top on every widget
    interaction. ``st.cache_resource`` keeps the loaded objects alive across
    those reruns so the checkpoint is not read from disk again each time.

    Args:
        path: Directory containing the saved model and tokenizer files.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(path)
    loaded_model = MarianMTModel.from_pretrained(path)
    return loaded_tokenizer, loaded_model


# Keep the original module-level names; translate() reads them as globals.
tokenizer, model = _load_translator(model_dir)
|
11 |
+
|
12 |
+
|
13 |
+
def translate(text, src_lang="en", tgt_lang="hi", max_length=50):
    """Translate *text* using the module-level ``tokenizer`` and ``model``.

    Args:
        text: Source text to translate.
        src_lang: Unused; kept so existing callers keep working. The
            translation direction is determined by the loaded checkpoint.
        tgt_lang: Unused; see ``src_lang``.
        max_length: Length limit forwarded to ``model.generate``. Defaults
            to 50, the value that was previously hard-coded.

    Returns:
        The decoded first generated sequence with special tokens removed,
        or ``""`` when *text* is empty or contains only whitespace.
    """
    if not text or not text.strip():
        # Nothing to translate; skip the model call entirely.
        return ""

    # truncation=True clips over-long input to the tokenizer's configured
    # maximum length rather than passing an arbitrarily long sequence on.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)

    translated_tokens = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
# Text box pre-filled with a short sample phrase.
inputs_text = st.text_input("Please enter the text", value="good boy")

# Translate only after an explicit button press, then show the result.
submitted = st.button("submit")
if submitted:
    st.write(translate(inputs_text))
|
28 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
pandas==2.2.2
|
3 |
+
torch==2.5.1
|
4 |
+
transformers==4.48.3
|
5 |
+
streamlit==1.41.1
|
6 |
+
sentencepiece==0.2.0
|
translate_model_finetune.zip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90a8938cc7e46df59e74f8a9bc0862299629d5d1800ae0622f12031afb002198
|
3 |
+
size 282898174
|
translate_model_finetune/config.json
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Helsinki-NLP/opus-mt-en-hi",
|
3 |
+
"activation_dropout": 0.0,
|
4 |
+
"activation_function": "swish",
|
5 |
+
"add_bias_logits": false,
|
6 |
+
"add_final_layer_norm": false,
|
7 |
+
"architectures": [
|
8 |
+
"MarianMTModel"
|
9 |
+
],
|
10 |
+
"attention_dropout": 0.0,
|
11 |
+
"bos_token_id": 0,
|
12 |
+
"classif_dropout": 0.0,
|
13 |
+
"classifier_dropout": 0.0,
|
14 |
+
"d_model": 512,
|
15 |
+
"decoder_attention_heads": 8,
|
16 |
+
"decoder_ffn_dim": 2048,
|
17 |
+
"decoder_layerdrop": 0.0,
|
18 |
+
"decoder_layers": 6,
|
19 |
+
"decoder_start_token_id": 61949,
|
20 |
+
"decoder_vocab_size": 61950,
|
21 |
+
"dropout": 0.1,
|
22 |
+
"encoder_attention_heads": 8,
|
23 |
+
"encoder_ffn_dim": 2048,
|
24 |
+
"encoder_layerdrop": 0.0,
|
25 |
+
"encoder_layers": 6,
|
26 |
+
"eos_token_id": 0,
|
27 |
+
"extra_pos_embeddings": 61950,
|
28 |
+
"forced_eos_token_id": 0,
|
29 |
+
"id2label": {
|
30 |
+
"0": "LABEL_0",
|
31 |
+
"1": "LABEL_1",
|
32 |
+
"2": "LABEL_2"
|
33 |
+
},
|
34 |
+
"init_std": 0.02,
|
35 |
+
"is_encoder_decoder": true,
|
36 |
+
"label2id": {
|
37 |
+
"LABEL_0": 0,
|
38 |
+
"LABEL_1": 1,
|
39 |
+
"LABEL_2": 2
|
40 |
+
},
|
41 |
+
"max_length": null,
|
42 |
+
"max_position_embeddings": 512,
|
43 |
+
"model_type": "marian",
|
44 |
+
"normalize_before": false,
|
45 |
+
"normalize_embedding": false,
|
46 |
+
"num_beams": null,
|
47 |
+
"num_hidden_layers": 6,
|
48 |
+
"pad_token_id": 61949,
|
49 |
+
"scale_embedding": true,
|
50 |
+
"share_encoder_decoder_embeddings": true,
|
51 |
+
"static_position_embeddings": true,
|
52 |
+
"torch_dtype": "float32",
|
53 |
+
"transformers_version": "4.48.3",
|
54 |
+
"use_cache": true,
|
55 |
+
"vocab_size": 61950
|
56 |
+
}
|
translate_model_finetune/generation_config.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bad_words_ids": [
|
3 |
+
[
|
4 |
+
61949
|
5 |
+
]
|
6 |
+
],
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"decoder_start_token_id": 61949,
|
9 |
+
"eos_token_id": 0,
|
10 |
+
"forced_eos_token_id": 0,
|
11 |
+
"max_length": 512,
|
12 |
+
"num_beams": 4,
|
13 |
+
"pad_token_id": 61949,
|
14 |
+
"renormalize_logits": true,
|
15 |
+
"transformers_version": "4.48.3"
|
16 |
+
}
|
translate_model_finetune/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02a4a427718bb73a61f1c2a6f035da3a79eac4e53c7c019ec5a00c199b7af5ea
|
3 |
+
size 303704440
|
translate_model_finetune/source.spm
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd4e951487aed00bae6a6c2ee4ef5d8d1db05fd098b19b608046c9334b58d24d
|
3 |
+
size 812240
|
translate_model_finetune/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eos_token": "</s>",
|
3 |
+
"pad_token": "<pad>",
|
4 |
+
"unk_token": "<unk>"
|
5 |
+
}
|
translate_model_finetune/target.spm
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f5529d3a72f8c1d5f7e357f1b6fd30e3cf58f6e1ba0401db135a118ac92f4a76
|
3 |
+
size 1067935
|
translate_model_finetune/tokenizer_config.json
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "</s>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "<unk>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"61949": {
|
20 |
+
"content": "<pad>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"clean_up_tokenization_spaces": false,
|
29 |
+
"eos_token": "</s>",
|
30 |
+
"extra_special_tokens": {},
|
31 |
+
"model_max_length": 512,
|
32 |
+
"pad_token": "<pad>",
|
33 |
+
"separate_vocabs": false,
|
34 |
+
"source_lang": "eng",
|
35 |
+
"sp_model_kwargs": {},
|
36 |
+
"target_lang": "hin",
|
37 |
+
"tokenizer_class": "MarianTokenizer",
|
38 |
+
"unk_token": "<unk>"
|
39 |
+
}
|
translate_model_finetune/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
translater.ipynb
ADDED
@@ -0,0 +1,1035 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"nbformat": 4,
|
3 |
+
"nbformat_minor": 0,
|
4 |
+
"metadata": {
|
5 |
+
"colab": {
|
6 |
+
"provenance": [],
|
7 |
+
"machine_shape": "hm",
|
8 |
+
"gpuType": "A100"
|
9 |
+
},
|
10 |
+
"kernelspec": {
|
11 |
+
"name": "python3",
|
12 |
+
"display_name": "Python 3"
|
13 |
+
},
|
14 |
+
"language_info": {
|
15 |
+
"name": "python"
|
16 |
+
},
|
17 |
+
"accelerator": "GPU"
|
18 |
+
},
|
19 |
+
"cells": [
|
20 |
+
{
|
21 |
+
"cell_type": "code",
|
22 |
+
"source": [
|
23 |
+
"!pip install datasets"
|
24 |
+
],
|
25 |
+
"metadata": {
|
26 |
+
"colab": {
|
27 |
+
"base_uri": "https://localhost:8080/"
|
28 |
+
},
|
29 |
+
"id": "y4FB2eje4ClO",
|
30 |
+
"outputId": "d75dfce9-32fc-4825-c91a-01a99ee338af"
|
31 |
+
},
|
32 |
+
"execution_count": 1,
|
33 |
+
"outputs": [
|
34 |
+
{
|
35 |
+
"output_type": "stream",
|
36 |
+
"name": "stdout",
|
37 |
+
"text": [
|
38 |
+
"Collecting datasets\n",
|
39 |
+
" Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)\n",
|
40 |
+
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from datasets) (3.17.0)\n",
|
41 |
+
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (1.26.4)\n",
|
42 |
+
"Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (18.1.0)\n",
|
43 |
+
"Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
|
44 |
+
" Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
|
45 |
+
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets) (2.2.2)\n",
|
46 |
+
"Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.11/dist-packages (from datasets) (2.32.3)\n",
|
47 |
+
"Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.11/dist-packages (from datasets) (4.67.1)\n",
|
48 |
+
"Collecting xxhash (from datasets)\n",
|
49 |
+
" Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
|
50 |
+
"Collecting multiprocess<0.70.17 (from datasets)\n",
|
51 |
+
" Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)\n",
|
52 |
+
"Requirement already satisfied: fsspec<=2024.12.0,>=2023.1.0 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets) (2024.10.0)\n",
|
53 |
+
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from datasets) (3.11.13)\n",
|
54 |
+
"Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.28.1)\n",
|
55 |
+
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (24.2)\n",
|
56 |
+
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)\n",
|
57 |
+
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (2.4.6)\n",
|
58 |
+
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.3.2)\n",
|
59 |
+
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (25.1.0)\n",
|
60 |
+
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.5.0)\n",
|
61 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (6.1.0)\n",
|
62 |
+
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (0.3.0)\n",
|
63 |
+
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->datasets) (1.18.3)\n",
|
64 |
+
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.24.0->datasets) (4.12.2)\n",
|
65 |
+
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.4.1)\n",
|
66 |
+
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.10)\n",
|
67 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2.3.0)\n",
|
68 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2025.1.31)\n",
|
69 |
+
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2.8.2)\n",
|
70 |
+
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.1)\n",
|
71 |
+
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.1)\n",
|
72 |
+
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n",
|
73 |
+
"Downloading datasets-3.3.2-py3-none-any.whl (485 kB)\n",
|
74 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m485.4/485.4 kB\u001b[0m \u001b[31m33.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
75 |
+
"\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
|
76 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
77 |
+
"\u001b[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)\n",
|
78 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.5/143.5 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
79 |
+
"\u001b[?25hDownloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
|
80 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.8/194.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
81 |
+
"\u001b[?25hInstalling collected packages: xxhash, dill, multiprocess, datasets\n",
|
82 |
+
"Successfully installed datasets-3.3.2 dill-0.3.8 multiprocess-0.70.16 xxhash-3.5.0\n"
|
83 |
+
]
|
84 |
+
}
|
85 |
+
]
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"cell_type": "code",
|
89 |
+
"execution_count": 16,
|
90 |
+
"metadata": {
|
91 |
+
"colab": {
|
92 |
+
"base_uri": "https://localhost:8080/"
|
93 |
+
},
|
94 |
+
"id": "XGlb6yNNwmSR",
|
95 |
+
"outputId": "69c617f8-a7a7-4c60-9597-74fa8b1a1b8d"
|
96 |
+
},
|
97 |
+
"outputs": [
|
98 |
+
{
|
99 |
+
"output_type": "stream",
|
100 |
+
"name": "stdout",
|
101 |
+
"text": [
|
102 |
+
"Dataset({\n",
|
103 |
+
" features: ['translation'],\n",
|
104 |
+
" num_rows: 2507\n",
|
105 |
+
"})\n"
|
106 |
+
]
|
107 |
+
}
|
108 |
+
],
|
109 |
+
"source": [
|
110 |
+
"from datasets import load_dataset\n",
|
111 |
+
"\n",
|
112 |
+
"dataset = load_dataset(\"cfilt/iitb-english-hindi\")[\"test\"] # English-Hindi dataset\n",
|
113 |
+
"print(dataset)\n"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"cell_type": "code",
|
118 |
+
"source": [
|
119 |
+
"from transformers import MarianMTModel, MarianTokenizer\n",
|
120 |
+
"\n",
|
121 |
+
"model_name = \"Helsinki-NLP/opus-mt-en-hi\" # Change to \"opus-mt-hi-en\" for Hindi to English\n",
|
122 |
+
"model = MarianMTModel.from_pretrained(model_name)\n",
|
123 |
+
"tokenizer = MarianTokenizer.from_pretrained(model_name)"
|
124 |
+
],
|
125 |
+
"metadata": {
|
126 |
+
"colab": {
|
127 |
+
"base_uri": "https://localhost:8080/"
|
128 |
+
},
|
129 |
+
"id": "NQJD04Y_4i99",
|
130 |
+
"outputId": "fc6ae6da-e727-4375-c32e-2ccc401e4b15"
|
131 |
+
},
|
132 |
+
"execution_count": 17,
|
133 |
+
"outputs": [
|
134 |
+
{
|
135 |
+
"output_type": "stream",
|
136 |
+
"name": "stderr",
|
137 |
+
"text": [
|
138 |
+
"/usr/local/lib/python3.11/dist-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses.\n",
|
139 |
+
" warnings.warn(\"Recommended: pip install sacremoses.\")\n"
|
140 |
+
]
|
141 |
+
}
|
142 |
+
]
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"cell_type": "code",
|
146 |
+
"source": [
|
147 |
+
"en=[]\n",
|
148 |
+
"hi=[]\n",
|
149 |
+
"for i in dataset['translation']:\n",
|
150 |
+
" en.append(i[\"en\"])\n",
|
151 |
+
" hi.append(i[\"hi\"])"
|
152 |
+
],
|
153 |
+
"metadata": {
|
154 |
+
"id": "zZsvE-9P43DB"
|
155 |
+
},
|
156 |
+
"execution_count": 18,
|
157 |
+
"outputs": []
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"cell_type": "code",
|
161 |
+
"source": [
|
162 |
+
"en[0],hi[0]"
|
163 |
+
],
|
164 |
+
"metadata": {
|
165 |
+
"colab": {
|
166 |
+
"base_uri": "https://localhost:8080/"
|
167 |
+
},
|
168 |
+
"id": "VncVkICMt9uX",
|
169 |
+
"outputId": "10dacfb6-6653-40cc-db92-f8221608617a"
|
170 |
+
},
|
171 |
+
"execution_count": 19,
|
172 |
+
"outputs": [
|
173 |
+
{
|
174 |
+
"output_type": "execute_result",
|
175 |
+
"data": {
|
176 |
+
"text/plain": [
|
177 |
+
"('A black box in your car?', 'आपकी कार में ब्लैक बॉक्स?')"
|
178 |
+
]
|
179 |
+
},
|
180 |
+
"metadata": {},
|
181 |
+
"execution_count": 19
|
182 |
+
}
|
183 |
+
]
|
184 |
+
},
|
185 |
+
{
|
186 |
+
"cell_type": "code",
|
187 |
+
"source": [
|
188 |
+
"import pandas as pd\n",
|
189 |
+
"df=pd.DataFrame({'en':en,'hi':hi})\n",
|
190 |
+
"df.drop(index=0, inplace=True)\n",
|
191 |
+
"df.head()\n"
|
192 |
+
],
|
193 |
+
"metadata": {
|
194 |
+
"colab": {
|
195 |
+
"base_uri": "https://localhost:8080/",
|
196 |
+
"height": 206
|
197 |
+
},
|
198 |
+
"id": "iFCfdlvHuCpj",
|
199 |
+
"outputId": "40fcdf49-cbbe-4989-ca98-c53b7b19fe7c"
|
200 |
+
},
|
201 |
+
"execution_count": 20,
|
202 |
+
"outputs": [
|
203 |
+
{
|
204 |
+
"output_type": "execute_result",
|
205 |
+
"data": {
|
206 |
+
"text/plain": [
|
207 |
+
" en \\\n",
|
208 |
+
"1 As America's road planners struggle to find th... \n",
|
209 |
+
"2 The devices, which track every mile a motorist... \n",
|
210 |
+
"3 The usually dull arena of highway planning has... \n",
|
211 |
+
"4 Libertarians have joined environmental groups ... \n",
|
212 |
+
"5 The tea party is aghast. \n",
|
213 |
+
"\n",
|
214 |
+
" hi \n",
|
215 |
+
"1 जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए... \n",
|
216 |
+
"2 यह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्... \n",
|
217 |
+
"3 आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी अचानक ... \n",
|
218 |
+
"4 आपने द्वारा ड्राइव किए गए मील, तथा संभवतः ड्रा... \n",
|
219 |
+
"5 चाय पार्टी भौचक्की है। "
|
220 |
+
],
|
221 |
+
"text/html": [
|
222 |
+
"\n",
|
223 |
+
" <div id=\"df-00637f8f-74bf-4de7-989f-f55e94b7e8a5\" class=\"colab-df-container\">\n",
|
224 |
+
" <div>\n",
|
225 |
+
"<style scoped>\n",
|
226 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
227 |
+
" vertical-align: middle;\n",
|
228 |
+
" }\n",
|
229 |
+
"\n",
|
230 |
+
" .dataframe tbody tr th {\n",
|
231 |
+
" vertical-align: top;\n",
|
232 |
+
" }\n",
|
233 |
+
"\n",
|
234 |
+
" .dataframe thead th {\n",
|
235 |
+
" text-align: right;\n",
|
236 |
+
" }\n",
|
237 |
+
"</style>\n",
|
238 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
239 |
+
" <thead>\n",
|
240 |
+
" <tr style=\"text-align: right;\">\n",
|
241 |
+
" <th></th>\n",
|
242 |
+
" <th>en</th>\n",
|
243 |
+
" <th>hi</th>\n",
|
244 |
+
" </tr>\n",
|
245 |
+
" </thead>\n",
|
246 |
+
" <tbody>\n",
|
247 |
+
" <tr>\n",
|
248 |
+
" <th>1</th>\n",
|
249 |
+
" <td>As America's road planners struggle to find th...</td>\n",
|
250 |
+
" <td>जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए...</td>\n",
|
251 |
+
" </tr>\n",
|
252 |
+
" <tr>\n",
|
253 |
+
" <th>2</th>\n",
|
254 |
+
" <td>The devices, which track every mile a motorist...</td>\n",
|
255 |
+
" <td>यह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्...</td>\n",
|
256 |
+
" </tr>\n",
|
257 |
+
" <tr>\n",
|
258 |
+
" <th>3</th>\n",
|
259 |
+
" <td>The usually dull arena of highway planning has...</td>\n",
|
260 |
+
" <td>आम तौर पर हाईवे नियोजन जैसा उबाऊ काम भी अचानक ...</td>\n",
|
261 |
+
" </tr>\n",
|
262 |
+
" <tr>\n",
|
263 |
+
" <th>4</th>\n",
|
264 |
+
" <td>Libertarians have joined environmental groups ...</td>\n",
|
265 |
+
" <td>आपने द्वारा ड्राइव किए गए मील, तथा संभवतः ड्रा...</td>\n",
|
266 |
+
" </tr>\n",
|
267 |
+
" <tr>\n",
|
268 |
+
" <th>5</th>\n",
|
269 |
+
" <td>The tea party is aghast.</td>\n",
|
270 |
+
" <td>चाय पार्टी भौचक्की है।</td>\n",
|
271 |
+
" </tr>\n",
|
272 |
+
" </tbody>\n",
|
273 |
+
"</table>\n",
|
274 |
+
"</div>\n",
|
275 |
+
" <div class=\"colab-df-buttons\">\n",
|
276 |
+
"\n",
|
277 |
+
" <div class=\"colab-df-container\">\n",
|
278 |
+
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-00637f8f-74bf-4de7-989f-f55e94b7e8a5')\"\n",
|
279 |
+
" title=\"Convert this dataframe to an interactive table.\"\n",
|
280 |
+
" style=\"display:none;\">\n",
|
281 |
+
"\n",
|
282 |
+
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
|
283 |
+
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
|
284 |
+
" </svg>\n",
|
285 |
+
" </button>\n",
|
286 |
+
"\n",
|
287 |
+
" <style>\n",
|
288 |
+
" .colab-df-container {\n",
|
289 |
+
" display:flex;\n",
|
290 |
+
" gap: 12px;\n",
|
291 |
+
" }\n",
|
292 |
+
"\n",
|
293 |
+
" .colab-df-convert {\n",
|
294 |
+
" background-color: #E8F0FE;\n",
|
295 |
+
" border: none;\n",
|
296 |
+
" border-radius: 50%;\n",
|
297 |
+
" cursor: pointer;\n",
|
298 |
+
" display: none;\n",
|
299 |
+
" fill: #1967D2;\n",
|
300 |
+
" height: 32px;\n",
|
301 |
+
" padding: 0 0 0 0;\n",
|
302 |
+
" width: 32px;\n",
|
303 |
+
" }\n",
|
304 |
+
"\n",
|
305 |
+
" .colab-df-convert:hover {\n",
|
306 |
+
" background-color: #E2EBFA;\n",
|
307 |
+
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
308 |
+
" fill: #174EA6;\n",
|
309 |
+
" }\n",
|
310 |
+
"\n",
|
311 |
+
" .colab-df-buttons div {\n",
|
312 |
+
" margin-bottom: 4px;\n",
|
313 |
+
" }\n",
|
314 |
+
"\n",
|
315 |
+
" [theme=dark] .colab-df-convert {\n",
|
316 |
+
" background-color: #3B4455;\n",
|
317 |
+
" fill: #D2E3FC;\n",
|
318 |
+
" }\n",
|
319 |
+
"\n",
|
320 |
+
" [theme=dark] .colab-df-convert:hover {\n",
|
321 |
+
" background-color: #434B5C;\n",
|
322 |
+
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
323 |
+
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
324 |
+
" fill: #FFFFFF;\n",
|
325 |
+
" }\n",
|
326 |
+
" </style>\n",
|
327 |
+
"\n",
|
328 |
+
" <script>\n",
|
329 |
+
" const buttonEl =\n",
|
330 |
+
" document.querySelector('#df-00637f8f-74bf-4de7-989f-f55e94b7e8a5 button.colab-df-convert');\n",
|
331 |
+
" buttonEl.style.display =\n",
|
332 |
+
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
333 |
+
"\n",
|
334 |
+
" async function convertToInteractive(key) {\n",
|
335 |
+
" const element = document.querySelector('#df-00637f8f-74bf-4de7-989f-f55e94b7e8a5');\n",
|
336 |
+
" const dataTable =\n",
|
337 |
+
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
338 |
+
" [key], {});\n",
|
339 |
+
" if (!dataTable) return;\n",
|
340 |
+
"\n",
|
341 |
+
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
342 |
+
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
343 |
+
" + ' to learn more about interactive tables.';\n",
|
344 |
+
" element.innerHTML = '';\n",
|
345 |
+
" dataTable['output_type'] = 'display_data';\n",
|
346 |
+
" await google.colab.output.renderOutput(dataTable, element);\n",
|
347 |
+
" const docLink = document.createElement('div');\n",
|
348 |
+
" docLink.innerHTML = docLinkHtml;\n",
|
349 |
+
" element.appendChild(docLink);\n",
|
350 |
+
" }\n",
|
351 |
+
" </script>\n",
|
352 |
+
" </div>\n",
|
353 |
+
"\n",
|
354 |
+
"\n",
|
355 |
+
"<div id=\"df-ef251f92-f2d2-487f-87d8-a7264719c349\">\n",
|
356 |
+
" <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-ef251f92-f2d2-487f-87d8-a7264719c349')\"\n",
|
357 |
+
" title=\"Suggest charts\"\n",
|
358 |
+
" style=\"display:none;\">\n",
|
359 |
+
"\n",
|
360 |
+
"<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
|
361 |
+
" width=\"24px\">\n",
|
362 |
+
" <g>\n",
|
363 |
+
" <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
|
364 |
+
" </g>\n",
|
365 |
+
"</svg>\n",
|
366 |
+
" </button>\n",
|
367 |
+
"\n",
|
368 |
+
"<style>\n",
|
369 |
+
" .colab-df-quickchart {\n",
|
370 |
+
" --bg-color: #E8F0FE;\n",
|
371 |
+
" --fill-color: #1967D2;\n",
|
372 |
+
" --hover-bg-color: #E2EBFA;\n",
|
373 |
+
" --hover-fill-color: #174EA6;\n",
|
374 |
+
" --disabled-fill-color: #AAA;\n",
|
375 |
+
" --disabled-bg-color: #DDD;\n",
|
376 |
+
" }\n",
|
377 |
+
"\n",
|
378 |
+
" [theme=dark] .colab-df-quickchart {\n",
|
379 |
+
" --bg-color: #3B4455;\n",
|
380 |
+
" --fill-color: #D2E3FC;\n",
|
381 |
+
" --hover-bg-color: #434B5C;\n",
|
382 |
+
" --hover-fill-color: #FFFFFF;\n",
|
383 |
+
" --disabled-bg-color: #3B4455;\n",
|
384 |
+
" --disabled-fill-color: #666;\n",
|
385 |
+
" }\n",
|
386 |
+
"\n",
|
387 |
+
" .colab-df-quickchart {\n",
|
388 |
+
" background-color: var(--bg-color);\n",
|
389 |
+
" border: none;\n",
|
390 |
+
" border-radius: 50%;\n",
|
391 |
+
" cursor: pointer;\n",
|
392 |
+
" display: none;\n",
|
393 |
+
" fill: var(--fill-color);\n",
|
394 |
+
" height: 32px;\n",
|
395 |
+
" padding: 0;\n",
|
396 |
+
" width: 32px;\n",
|
397 |
+
" }\n",
|
398 |
+
"\n",
|
399 |
+
" .colab-df-quickchart:hover {\n",
|
400 |
+
" background-color: var(--hover-bg-color);\n",
|
401 |
+
" box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
402 |
+
" fill: var(--button-hover-fill-color);\n",
|
403 |
+
" }\n",
|
404 |
+
"\n",
|
405 |
+
" .colab-df-quickchart-complete:disabled,\n",
|
406 |
+
" .colab-df-quickchart-complete:disabled:hover {\n",
|
407 |
+
" background-color: var(--disabled-bg-color);\n",
|
408 |
+
" fill: var(--disabled-fill-color);\n",
|
409 |
+
" box-shadow: none;\n",
|
410 |
+
" }\n",
|
411 |
+
"\n",
|
412 |
+
" .colab-df-spinner {\n",
|
413 |
+
" border: 2px solid var(--fill-color);\n",
|
414 |
+
" border-color: transparent;\n",
|
415 |
+
" border-bottom-color: var(--fill-color);\n",
|
416 |
+
" animation:\n",
|
417 |
+
" spin 1s steps(1) infinite;\n",
|
418 |
+
" }\n",
|
419 |
+
"\n",
|
420 |
+
" @keyframes spin {\n",
|
421 |
+
" 0% {\n",
|
422 |
+
" border-color: transparent;\n",
|
423 |
+
" border-bottom-color: var(--fill-color);\n",
|
424 |
+
" border-left-color: var(--fill-color);\n",
|
425 |
+
" }\n",
|
426 |
+
" 20% {\n",
|
427 |
+
" border-color: transparent;\n",
|
428 |
+
" border-left-color: var(--fill-color);\n",
|
429 |
+
" border-top-color: var(--fill-color);\n",
|
430 |
+
" }\n",
|
431 |
+
" 30% {\n",
|
432 |
+
" border-color: transparent;\n",
|
433 |
+
" border-left-color: var(--fill-color);\n",
|
434 |
+
" border-top-color: var(--fill-color);\n",
|
435 |
+
" border-right-color: var(--fill-color);\n",
|
436 |
+
" }\n",
|
437 |
+
" 40% {\n",
|
438 |
+
" border-color: transparent;\n",
|
439 |
+
" border-right-color: var(--fill-color);\n",
|
440 |
+
" border-top-color: var(--fill-color);\n",
|
441 |
+
" }\n",
|
442 |
+
" 60% {\n",
|
443 |
+
" border-color: transparent;\n",
|
444 |
+
" border-right-color: var(--fill-color);\n",
|
445 |
+
" }\n",
|
446 |
+
" 80% {\n",
|
447 |
+
" border-color: transparent;\n",
|
448 |
+
" border-right-color: var(--fill-color);\n",
|
449 |
+
" border-bottom-color: var(--fill-color);\n",
|
450 |
+
" }\n",
|
451 |
+
" 90% {\n",
|
452 |
+
" border-color: transparent;\n",
|
453 |
+
" border-bottom-color: var(--fill-color);\n",
|
454 |
+
" }\n",
|
455 |
+
" }\n",
|
456 |
+
"</style>\n",
|
457 |
+
"\n",
|
458 |
+
" <script>\n",
|
459 |
+
" async function quickchart(key) {\n",
|
460 |
+
" const quickchartButtonEl =\n",
|
461 |
+
" document.querySelector('#' + key + ' button');\n",
|
462 |
+
" quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n",
|
463 |
+
" quickchartButtonEl.classList.add('colab-df-spinner');\n",
|
464 |
+
" try {\n",
|
465 |
+
" const charts = await google.colab.kernel.invokeFunction(\n",
|
466 |
+
" 'suggestCharts', [key], {});\n",
|
467 |
+
" } catch (error) {\n",
|
468 |
+
" console.error('Error during call to suggestCharts:', error);\n",
|
469 |
+
" }\n",
|
470 |
+
" quickchartButtonEl.classList.remove('colab-df-spinner');\n",
|
471 |
+
" quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
|
472 |
+
" }\n",
|
473 |
+
" (() => {\n",
|
474 |
+
" let quickchartButtonEl =\n",
|
475 |
+
" document.querySelector('#df-ef251f92-f2d2-487f-87d8-a7264719c349 button');\n",
|
476 |
+
" quickchartButtonEl.style.display =\n",
|
477 |
+
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
478 |
+
" })();\n",
|
479 |
+
" </script>\n",
|
480 |
+
"</div>\n",
|
481 |
+
"\n",
|
482 |
+
" </div>\n",
|
483 |
+
" </div>\n"
|
484 |
+
],
|
485 |
+
"application/vnd.google.colaboratory.intrinsic+json": {
|
486 |
+
"type": "dataframe",
|
487 |
+
"variable_name": "df",
|
488 |
+
"summary": "{\n \"name\": \"df\",\n \"rows\": 2506,\n \"fields\": [\n {\n \"column\": \"en\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2505,\n \"samples\": [\n \"Thereafter, some people invested between two to five lakh rupees.\",\n \"O'Malley said Frontier's website and check-in procedures are being changed to make sure passengers know about the fee before they get to the gate.\",\n \"Chadvir said that, based on the popularity gained by the Chief Minister by the way of landmark decisions and for activities for the people's welfare, the public would also elect Hooda for the third time.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"hi\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2505,\n \"samples\": [\n \"\\u0907\\u0938\\u0915\\u0947 \\u092c\\u093e\\u0926 \\u0915\\u093f\\u0938\\u0940 \\u0938\\u0947 \\u0926\\u094b \\u0932\\u093e\\u0916 \\u0924\\u094b \\u0915\\u093f\\u0938\\u0940 \\u0938\\u0947 \\u092a\\u093e\\u0902\\u091a \\u0932\\u093e\\u0916 \\u0930\\u0941\\u092a\\u090f \\u0924\\u0915 \\u091c\\u092e\\u093e \\u0915\\u0930\\u0935\\u093e\\u090f \\u091c\\u093e\\u0924\\u0947 \\u0925\\u0947\\u0964\",\n \"\\u0913 '\\u092e\\u0948\\u0932\\u0940 \\u0928\\u0947 \\u0915\\u0939\\u093e \\u0915\\u093f Frontier \\u0915\\u0940 \\u0935\\u0947\\u092c\\u0938\\u093e\\u0907\\u091f \\u0914\\u0930 \\u091a\\u0947\\u0915 \\u0907\\u0928 \\u0915\\u0947 \\u092a\\u094d\\u0930\\u0915\\u094d\\u0930\\u093f\\u092f\\u093e\\u0913\\u0902 \\u0915\\u094b \\u092c\\u0926\\u0932 \\u0926\\u093f\\u092f\\u093e \\u091c\\u093e \\u0930\\u0939\\u093e \\u0939\\u0948 \\u0917\\u0947\\u091f \\u0915\\u094b \\u092a\\u093e\\u0928\\u0947 \\u0938\\u0947 \\u092a\\u0939\\u0932\\u0947 \\u092f\\u093e\\u0924\\u094d\\u0930\\u093f\\u092f\\u094b\\u0902 \\u0915\\u094b \\u092b\\u0940\\u0938 \\u0915\\u0947 \\u092c\\u093e\\u0930\\u0947 \\u092e\\u0947\\u0902 \\u092a\\u0924\\u093e \\u0939\\u0948 \\u092f\\u0939 
\\u0938\\u0941\\u0928\\u093f\\u0936\\u094d\\u091a\\u093f\\u0924 \\u0915\\u0930\\u0928\\u0947 \\u0915\\u0947 \\u0932\\u093f\\u090f.\",\n \"\\u091a\\u093e\\u0926\\u0935\\u0940\\u0930 \\u0928\\u0947 \\u0915\\u0939\\u093e \\u0915\\u093f \\u092e\\u0941\\u0916\\u094d\\u092f\\u092e\\u0902\\u0924\\u094d\\u0930\\u0940 \\u0928\\u0947 \\u092a\\u094d\\u0930\\u0926\\u0947\\u0936 \\u092e\\u0947\\u0902 \\u0910\\u0924\\u093f\\u0939\\u093e\\u0938\\u093f\\u0915 \\u0928\\u093f\\u0930\\u094d\\u0923\\u092f\\u094b\\u0902 \\u090f\\u0935\\u0902 \\u091c\\u0928\\u0939\\u093f\\u0924 \\u0915\\u093e\\u0930\\u094d\\u092f\\u094b \\u0915\\u0940 \\u0926\\u093f\\u0936\\u093e \\u092e\\u0947\\u0902 \\u091c\\u094b \\u0932\\u094b\\u0915\\u092a\\u094d\\u0930\\u093f\\u092f\\u0924\\u093e \\u0939\\u093e\\u0938\\u093f\\u0932 \\u0915\\u0940 \\u0939\\u0948, \\u0907\\u0938\\u0940 \\u0915\\u0947 \\u092c\\u0932\\u092c\\u0942\\u0924\\u0947 \\u092a\\u0930 \\u092a\\u094d\\u0930\\u0926\\u0947\\u0936 \\u0915\\u0940 \\u091c\\u0928\\u0924\\u093e \\u0939\\u0941\\u0921\\u094d\\u0921\\u093e \\u0915\\u094b \\u0938\\u0924\\u094d\\u0924\\u093e \\u0915\\u093e \\u0924\\u093e\\u091c \\u0924\\u0940\\u0938\\u0930\\u0940 \\u092c\\u093e\\u0930 \\u092d\\u0940 \\u092a\\u0939\\u0928\\u093e\\u090f\\u0917\\u0940\\u0964\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
|
489 |
+
}
|
490 |
+
},
|
491 |
+
"metadata": {},
|
492 |
+
"execution_count": 20
|
493 |
+
}
|
494 |
+
]
|
495 |
+
},
|
496 |
+
{
|
497 |
+
"cell_type": "code",
|
498 |
+
"source": [
|
499 |
+
"from sklearn.model_selection import train_test_split\n",
|
500 |
+
"train,test=train_test_split(df,test_size=0.2,random_state=42)\n",
|
501 |
+
"train.reset_index(drop=True,inplace=True)\n",
|
502 |
+
"test.reset_index(drop=True,inplace=True)\n",
|
503 |
+
"train.head(),test.head()"
|
504 |
+
],
|
505 |
+
"metadata": {
|
506 |
+
"colab": {
|
507 |
+
"base_uri": "https://localhost:8080/"
|
508 |
+
},
|
509 |
+
"id": "rwI5aHjrvE4h",
|
510 |
+
"outputId": "766b26b3-5ec4-4743-a007-b7d7c46ba5f3"
|
511 |
+
},
|
512 |
+
"execution_count": 21,
|
513 |
+
"outputs": [
|
514 |
+
{
|
515 |
+
"output_type": "execute_result",
|
516 |
+
"data": {
|
517 |
+
"text/plain": [
|
518 |
+
"( en \\\n",
|
519 |
+
" 0 Renamo wanted to \"warn the international commu... \n",
|
520 |
+
" 1 On the occasion of Mahavir Jayanti, the god Ha... \n",
|
521 |
+
" 2 Already the stamp duty take for residential pr... \n",
|
522 |
+
" 3 But as the evening comes, our minds change and... \n",
|
523 |
+
" 4 \"No,\" Mr Tripodi replied. \n",
|
524 |
+
" \n",
|
525 |
+
" hi \n",
|
526 |
+
" 0 रे नमो \"चीजें मोजाम्बिक में अच्छी तरह से नहीं... \n",
|
527 |
+
" 1 देविका तट पर महावीर जयंती के उपलक्ष्य पर भगवान... \n",
|
528 |
+
" 2 कर वर्ष 2012/13 में वेस्टमिन्स्टर तथा केन्सिंग... \n",
|
529 |
+
" 3 लेकिन शाम होते ही हमारा मन बदल जाता हैं और हम ... \n",
|
530 |
+
" 4 \"नहीं,\" श्री त्रिपोडी ने उत्तर दिया। ,\n",
|
531 |
+
" en \\\n",
|
532 |
+
" 0 Due to financial constraints Dhirubhai had to ... \n",
|
533 |
+
" 1 Other places are hoping to sell the concept to... \n",
|
534 |
+
" 2 There's also money for walk-in clinics in Hoba... \n",
|
535 |
+
" 3 He wanted the party to become strong. \n",
|
536 |
+
" 4 It was simply nothing I could have imagined. \n",
|
537 |
+
" \n",
|
538 |
+
" hi \n",
|
539 |
+
" 0 आर्थिक तंगी के कारण धीरूभाई को हाईस्कूल के बाद... \n",
|
540 |
+
" 1 अन्य स्थानों को आशा है कि डिवाइस में कम की बजा... \n",
|
541 |
+
" 2 इसमें होबार्ट और लॉनसेस्टशन में वॉक-इन क्लिनिक... \n",
|
542 |
+
" 3 वे चाहते हैं कि पार्टी मजबूत हो। \n",
|
543 |
+
" 4 सरल तौर कहें तो में इसमें से किसी भी कल्पना भी... )"
|
544 |
+
]
|
545 |
+
},
|
546 |
+
"metadata": {},
|
547 |
+
"execution_count": 21
|
548 |
+
}
|
549 |
+
]
|
550 |
+
},
|
551 |
+
{
|
552 |
+
"cell_type": "code",
|
553 |
+
"source": [
|
554 |
+
"source_train=train['en']\n",
|
555 |
+
"target_train=train['hi']\n",
|
556 |
+
"source_test=test['en']\n",
|
557 |
+
"target_test=test['hi']\n",
|
558 |
+
"\n"
|
559 |
+
],
|
560 |
+
"metadata": {
|
561 |
+
"id": "vB6Pqm2ovOBi"
|
562 |
+
},
|
563 |
+
"execution_count": 22,
|
564 |
+
"outputs": []
|
565 |
+
},
|
566 |
+
{
|
567 |
+
"cell_type": "code",
|
568 |
+
"source": [
|
569 |
+
"source_texts = [ex for ex in train['en']]\n",
|
570 |
+
"source_texts[0]"
|
571 |
+
],
|
572 |
+
"metadata": {
|
573 |
+
"colab": {
|
574 |
+
"base_uri": "https://localhost:8080/",
|
575 |
+
"height": 52
|
576 |
+
},
|
577 |
+
"id": "YmIW9z6XwqFs",
|
578 |
+
"outputId": "b8394f24-a813-4ce2-82d8-8173567bc7d0"
|
579 |
+
},
|
580 |
+
"execution_count": 23,
|
581 |
+
"outputs": [
|
582 |
+
{
|
583 |
+
"output_type": "execute_result",
|
584 |
+
"data": {
|
585 |
+
"text/plain": [
|
586 |
+
"'Renamo wanted to \"warn the international community that things were not going well in Mozambique,\" Mr Mazanga said.'"
|
587 |
+
],
|
588 |
+
"application/vnd.google.colaboratory.intrinsic+json": {
|
589 |
+
"type": "string"
|
590 |
+
}
|
591 |
+
},
|
592 |
+
"metadata": {},
|
593 |
+
"execution_count": 23
|
594 |
+
}
|
595 |
+
]
|
596 |
+
},
|
597 |
+
{
|
598 |
+
"cell_type": "code",
|
599 |
+
"source": [
|
600 |
+
"def preprocess_function(examples):\n",
|
601 |
+
" source_texts = examples['en']\n",
|
602 |
+
" target_texts = examples['hi']\n",
|
603 |
+
"\n",
|
604 |
+
" model_inputs = tokenizer(source_texts.tolist(), truncation=True, padding=\"max_length\", max_length=128)\n",
|
605 |
+
" labels = tokenizer(target_texts.tolist(), truncation=True, padding=\"max_length\", max_length=128)\n",
|
606 |
+
" model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
|
607 |
+
"\n",
|
608 |
+
" return model_inputs\n",
|
609 |
+
"\n",
|
610 |
+
"tokenized_train = preprocess_function(train)\n",
|
611 |
+
"tokenized_test = preprocess_function(test)\n"
|
612 |
+
],
|
613 |
+
"metadata": {
|
614 |
+
"id": "2AGcbIzA4KMe"
|
615 |
+
},
|
616 |
+
"execution_count": 24,
|
617 |
+
"outputs": []
|
618 |
+
},
|
619 |
+
{
|
620 |
+
"cell_type": "code",
|
621 |
+
"source": [
|
622 |
+
"# tokenized_train[0]"
|
623 |
+
],
|
624 |
+
"metadata": {
|
625 |
+
"id": "e15igWNy9XJ1"
|
626 |
+
},
|
627 |
+
"execution_count": 25,
|
628 |
+
"outputs": []
|
629 |
+
},
|
630 |
+
{
|
631 |
+
"cell_type": "code",
|
632 |
+
"source": [
|
633 |
+
"print(len(tokenized_train[\"input_ids\"]))\n",
|
634 |
+
"print(len(tokenized_train['attention_mask']))\n",
|
635 |
+
"print(len(tokenized_train[\"labels\"]))\n",
|
636 |
+
"print(len(tokenized_test[\"input_ids\"]))\n",
|
637 |
+
"print(len(tokenized_test['attention_mask']))\n",
|
638 |
+
"print(len(tokenized_test[\"labels\"]))"
|
639 |
+
],
|
640 |
+
"metadata": {
|
641 |
+
"colab": {
|
642 |
+
"base_uri": "https://localhost:8080/"
|
643 |
+
},
|
644 |
+
"id": "AndJk2s537y7",
|
645 |
+
"outputId": "080ad2de-5116-4331-f300-c4f61b46332a"
|
646 |
+
},
|
647 |
+
"execution_count": 26,
|
648 |
+
"outputs": [
|
649 |
+
{
|
650 |
+
"output_type": "stream",
|
651 |
+
"name": "stdout",
|
652 |
+
"text": [
|
653 |
+
"2004\n",
|
654 |
+
"2004\n",
|
655 |
+
"2004\n",
|
656 |
+
"502\n",
|
657 |
+
"502\n",
|
658 |
+
"502\n"
|
659 |
+
]
|
660 |
+
}
|
661 |
+
]
|
662 |
+
},
|
663 |
+
{
|
664 |
+
"cell_type": "code",
|
665 |
+
"source": [
|
666 |
+
"from datasets import Dataset\n",
|
667 |
+
"tokenized_train_dataset = Dataset.from_dict(tokenized_train)\n",
|
668 |
+
"tokenized_test_dataset = Dataset.from_dict(tokenized_test)"
|
669 |
+
],
|
670 |
+
"metadata": {
|
671 |
+
"id": "wb6wy_TV-UZu"
|
672 |
+
},
|
673 |
+
"execution_count": 27,
|
674 |
+
"outputs": []
|
675 |
+
},
|
676 |
+
{
|
677 |
+
"cell_type": "code",
|
678 |
+
"source": [
|
679 |
+
"from transformers import TrainingArguments\n",
|
680 |
+
"\n",
|
681 |
+
"training_args = TrainingArguments(\n",
|
682 |
+
" output_dir=\"./translator_en_hi\",\n",
|
683 |
+
" evaluation_strategy=\"epoch\",\n",
|
684 |
+
" save_strategy=\"epoch\",\n",
|
685 |
+
" per_device_train_batch_size=8,\n",
|
686 |
+
" per_device_eval_batch_size=8,\n",
|
687 |
+
" num_train_epochs=1,\n",
|
688 |
+
" learning_rate=5e-5,\n",
|
689 |
+
" weight_decay=0.01,\n",
|
690 |
+
" logging_dir=\"./logs\",\n",
|
691 |
+
" logging_steps=500,\n",
|
692 |
+
" save_total_limit=2,\n",
|
693 |
+
" load_best_model_at_end=True,\n",
|
694 |
+
")\n"
|
695 |
+
],
|
696 |
+
"metadata": {
|
697 |
+
"colab": {
|
698 |
+
"base_uri": "https://localhost:8080/"
|
699 |
+
},
|
700 |
+
"id": "_OGG0eBl8S45",
|
701 |
+
"outputId": "645db96c-0552-4fac-99d6-3a8277b81715"
|
702 |
+
},
|
703 |
+
"execution_count": 28,
|
704 |
+
"outputs": [
|
705 |
+
{
|
706 |
+
"output_type": "stream",
|
707 |
+
"name": "stderr",
|
708 |
+
"text": [
|
709 |
+
"/usr/local/lib/python3.11/dist-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
710 |
+
" warnings.warn(\n"
|
711 |
+
]
|
712 |
+
}
|
713 |
+
]
|
714 |
+
},
|
715 |
+
{
|
716 |
+
"cell_type": "code",
|
717 |
+
"source": [
|
718 |
+
"from transformers import Trainer\n",
|
719 |
+
"trainer = Trainer(\n",
|
720 |
+
" model=model,\n",
|
721 |
+
" args=training_args,\n",
|
722 |
+
" train_dataset=tokenized_train_dataset,\n",
|
723 |
+
" eval_dataset=tokenized_test_dataset\n",
|
724 |
+
")\n",
|
725 |
+
"trainer.train()"
|
726 |
+
],
|
727 |
+
"metadata": {
|
728 |
+
"colab": {
|
729 |
+
"base_uri": "https://localhost:8080/",
|
730 |
+
"height": 508
|
731 |
+
},
|
732 |
+
"id": "FzNYxonF97mR",
|
733 |
+
"outputId": "267a5ab2-d0ed-4212-bfc1-3f994905c9a1"
|
734 |
+
},
|
735 |
+
"execution_count": 29,
|
736 |
+
"outputs": [
|
737 |
+
{
|
738 |
+
"output_type": "stream",
|
739 |
+
"name": "stderr",
|
740 |
+
"text": [
|
741 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.\n",
|
742 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.\n"
|
743 |
+
]
|
744 |
+
},
|
745 |
+
{
|
746 |
+
"output_type": "display_data",
|
747 |
+
"data": {
|
748 |
+
"text/plain": [
|
749 |
+
"<IPython.core.display.Javascript object>"
|
750 |
+
],
|
751 |
+
"application/javascript": [
|
752 |
+
"\n",
|
753 |
+
" window._wandbApiKey = new Promise((resolve, reject) => {\n",
|
754 |
+
" function loadScript(url) {\n",
|
755 |
+
" return new Promise(function(resolve, reject) {\n",
|
756 |
+
" let newScript = document.createElement(\"script\");\n",
|
757 |
+
" newScript.onerror = reject;\n",
|
758 |
+
" newScript.onload = resolve;\n",
|
759 |
+
" document.body.appendChild(newScript);\n",
|
760 |
+
" newScript.src = url;\n",
|
761 |
+
" });\n",
|
762 |
+
" }\n",
|
763 |
+
" loadScript(\"https://cdn.jsdelivr.net/npm/postmate/build/postmate.min.js\").then(() => {\n",
|
764 |
+
" const iframe = document.createElement('iframe')\n",
|
765 |
+
" iframe.style.cssText = \"width:0;height:0;border:none\"\n",
|
766 |
+
" document.body.appendChild(iframe)\n",
|
767 |
+
" const handshake = new Postmate({\n",
|
768 |
+
" container: iframe,\n",
|
769 |
+
" url: 'https://wandb.ai/authorize'\n",
|
770 |
+
" });\n",
|
771 |
+
" const timeout = setTimeout(() => reject(\"Couldn't auto authenticate\"), 5000)\n",
|
772 |
+
" handshake.then(function(child) {\n",
|
773 |
+
" child.on('authorize', data => {\n",
|
774 |
+
" clearTimeout(timeout)\n",
|
775 |
+
" resolve(data)\n",
|
776 |
+
" });\n",
|
777 |
+
" });\n",
|
778 |
+
" })\n",
|
779 |
+
" });\n",
|
780 |
+
" "
|
781 |
+
]
|
782 |
+
},
|
783 |
+
"metadata": {}
|
784 |
+
},
|
785 |
+
{
|
786 |
+
"output_type": "stream",
|
787 |
+
"name": "stderr",
|
788 |
+
"text": [
|
789 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n",
|
790 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n",
|
791 |
+
"wandb: Paste an API key from your profile and hit enter:"
|
792 |
+
]
|
793 |
+
},
|
794 |
+
{
|
795 |
+
"name": "stdout",
|
796 |
+
"output_type": "stream",
|
797 |
+
"text": [
|
798 |
+
" ··········\n"
|
799 |
+
]
|
800 |
+
},
|
801 |
+
{
|
802 |
+
"output_type": "stream",
|
803 |
+
"name": "stderr",
|
804 |
+
"text": [
|
805 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n",
|
806 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
|
807 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: No netrc file found, creating one.\n",
|
808 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n",
|
809 |
+
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmummina-premkumar\u001b[0m (\u001b[33mmummina-premkumar-yami-technology\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
|
810 |
+
]
|
811 |
+
},
|
812 |
+
{
|
813 |
+
"output_type": "display_data",
|
814 |
+
"data": {
|
815 |
+
"text/plain": [
|
816 |
+
"<IPython.core.display.HTML object>"
|
817 |
+
],
|
818 |
+
"text/html": [
|
819 |
+
"Tracking run with wandb version 0.19.7"
|
820 |
+
]
|
821 |
+
},
|
822 |
+
"metadata": {}
|
823 |
+
},
|
824 |
+
{
|
825 |
+
"output_type": "display_data",
|
826 |
+
"data": {
|
827 |
+
"text/plain": [
|
828 |
+
"<IPython.core.display.HTML object>"
|
829 |
+
],
|
830 |
+
"text/html": [
|
831 |
+
"Run data is saved locally in <code>/content/wandb/run-20250302_193401-s8hefz7l</code>"
|
832 |
+
]
|
833 |
+
},
|
834 |
+
"metadata": {}
|
835 |
+
},
|
836 |
+
{
|
837 |
+
"output_type": "display_data",
|
838 |
+
"data": {
|
839 |
+
"text/plain": [
|
840 |
+
"<IPython.core.display.HTML object>"
|
841 |
+
],
|
842 |
+
"text/html": [
|
843 |
+
"Syncing run <strong><a href='https://wandb.ai/mummina-premkumar-yami-technology/huggingface/runs/s8hefz7l' target=\"_blank\">./translator_en_hi</a></strong> to <a href='https://wandb.ai/mummina-premkumar-yami-technology/huggingface' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
|
844 |
+
]
|
845 |
+
},
|
846 |
+
"metadata": {}
|
847 |
+
},
|
848 |
+
{
|
849 |
+
"output_type": "display_data",
|
850 |
+
"data": {
|
851 |
+
"text/plain": [
|
852 |
+
"<IPython.core.display.HTML object>"
|
853 |
+
],
|
854 |
+
"text/html": [
|
855 |
+
" View project at <a href='https://wandb.ai/mummina-premkumar-yami-technology/huggingface' target=\"_blank\">https://wandb.ai/mummina-premkumar-yami-technology/huggingface</a>"
|
856 |
+
]
|
857 |
+
},
|
858 |
+
"metadata": {}
|
859 |
+
},
|
860 |
+
{
|
861 |
+
"output_type": "display_data",
|
862 |
+
"data": {
|
863 |
+
"text/plain": [
|
864 |
+
"<IPython.core.display.HTML object>"
|
865 |
+
],
|
866 |
+
"text/html": [
|
867 |
+
" View run at <a href='https://wandb.ai/mummina-premkumar-yami-technology/huggingface/runs/s8hefz7l' target=\"_blank\">https://wandb.ai/mummina-premkumar-yami-technology/huggingface/runs/s8hefz7l</a>"
|
868 |
+
]
|
869 |
+
},
|
870 |
+
"metadata": {}
|
871 |
+
},
|
872 |
+
{
|
873 |
+
"output_type": "display_data",
|
874 |
+
"data": {
|
875 |
+
"text/plain": [
|
876 |
+
"<IPython.core.display.HTML object>"
|
877 |
+
],
|
878 |
+
"text/html": [
|
879 |
+
"\n",
|
880 |
+
" <div>\n",
|
881 |
+
" \n",
|
882 |
+
" <progress value='251' max='251' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
883 |
+
" [251/251 00:21, Epoch 1/1]\n",
|
884 |
+
" </div>\n",
|
885 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
886 |
+
" <thead>\n",
|
887 |
+
" <tr style=\"text-align: left;\">\n",
|
888 |
+
" <th>Epoch</th>\n",
|
889 |
+
" <th>Training Loss</th>\n",
|
890 |
+
" <th>Validation Loss</th>\n",
|
891 |
+
" </tr>\n",
|
892 |
+
" </thead>\n",
|
893 |
+
" <tbody>\n",
|
894 |
+
" <tr>\n",
|
895 |
+
" <td>1</td>\n",
|
896 |
+
" <td>No log</td>\n",
|
897 |
+
" <td>1.876339</td>\n",
|
898 |
+
" </tr>\n",
|
899 |
+
" </tbody>\n",
|
900 |
+
"</table><p>"
|
901 |
+
]
|
902 |
+
},
|
903 |
+
"metadata": {}
|
904 |
+
},
|
905 |
+
{
|
906 |
+
"output_type": "stream",
|
907 |
+
"name": "stderr",
|
908 |
+
"text": [
|
909 |
+
"/usr/local/lib/python3.11/dist-packages/transformers/modeling_utils.py:2758: UserWarning: Moving the following attributes in the config to the generation config: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]]}. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.\n",
|
910 |
+
" warnings.warn(\n",
|
911 |
+
"There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].\n"
|
912 |
+
]
|
913 |
+
},
|
914 |
+
{
|
915 |
+
"output_type": "execute_result",
|
916 |
+
"data": {
|
917 |
+
"text/plain": [
|
918 |
+
"TrainOutput(global_step=251, training_loss=2.212838830225971, metrics={'train_runtime': 45.16, 'train_samples_per_second': 44.376, 'train_steps_per_second': 5.558, 'total_flos': 67932323315712.0, 'train_loss': 2.212838830225971, 'epoch': 1.0})"
|
919 |
+
]
|
920 |
+
},
|
921 |
+
"metadata": {},
|
922 |
+
"execution_count": 29
|
923 |
+
}
|
924 |
+
]
|
925 |
+
},
|
926 |
+
{
|
927 |
+
"cell_type": "code",
|
928 |
+
"source": [
|
929 |
+
"def translate(text, src_lang=\"en\", tgt_lang=\"hi\"):\n",
|
930 |
+
" inputs = tokenizer(text, return_tensors=\"pt\")\n",
|
931 |
+
" # Move inputs to the same device as the model\n",
|
932 |
+
" # Assuming model is on cuda:0 or default cuda device\n",
|
933 |
+
" for k in inputs:\n",
|
934 |
+
" inputs[k] = inputs[k].to(model.device)\n",
|
935 |
+
" translated_tokens = model.generate(**inputs, max_length=50)\n",
|
936 |
+
" return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)\n",
|
937 |
+
"\n",
|
938 |
+
"print(translate(\"good man\"))"
|
939 |
+
],
|
940 |
+
"metadata": {
|
941 |
+
"colab": {
|
942 |
+
"base_uri": "https://localhost:8080/"
|
943 |
+
},
|
944 |
+
"id": "OqV0Fiud09tt",
|
945 |
+
"outputId": "fcd79a5e-7dbe-4138-b7c8-226318c4ac7c"
|
946 |
+
},
|
947 |
+
"execution_count": 30,
|
948 |
+
"outputs": [
|
949 |
+
{
|
950 |
+
"output_type": "stream",
|
951 |
+
"name": "stdout",
|
952 |
+
"text": [
|
953 |
+
"अच्छा आदमी\n"
|
954 |
+
]
|
955 |
+
}
|
956 |
+
]
|
957 |
+
},
|
958 |
+
{
|
959 |
+
"cell_type": "code",
|
960 |
+
"source": [
|
961 |
+
"save_directory = \"translate_model\"\n",
|
962 |
+
"\n",
|
963 |
+
"model.save_pretrained(save_directory)\n",
|
964 |
+
"\n",
|
965 |
+
"tokenizer.save_pretrained(save_directory)"
|
966 |
+
],
|
967 |
+
"metadata": {
|
968 |
+
"colab": {
|
969 |
+
"base_uri": "https://localhost:8080/"
|
970 |
+
},
|
971 |
+
"id": "5pDS9v4lBynb",
|
972 |
+
"outputId": "652888cf-660d-40cd-fe5a-a53d27880b0a"
|
973 |
+
},
|
974 |
+
"execution_count": 33,
|
975 |
+
"outputs": [
|
976 |
+
{
|
977 |
+
"output_type": "execute_result",
|
978 |
+
"data": {
|
979 |
+
"text/plain": [
|
980 |
+
"('translate_model/tokenizer_config.json',\n",
|
981 |
+
" 'translate_model/special_tokens_map.json',\n",
|
982 |
+
" 'translate_model/vocab.json',\n",
|
983 |
+
" 'translate_model/source.spm',\n",
|
984 |
+
" 'translate_model/target.spm',\n",
|
985 |
+
" 'translate_model/added_tokens.json')"
|
986 |
+
]
|
987 |
+
},
|
988 |
+
"metadata": {},
|
989 |
+
"execution_count": 33
|
990 |
+
}
|
991 |
+
]
|
992 |
+
},
|
993 |
+
{
|
994 |
+
"cell_type": "code",
|
995 |
+
"source": [
|
996 |
+
"import shutil\n",
|
997 |
+
"\n",
|
998 |
+
"\n",
|
999 |
+
"folder_path = \"/content/translate_model\"\n",
|
1000 |
+
"zip_name = \"translate_model_finetune.zip\"\n",
|
1001 |
+
"\n",
|
1002 |
+
"\n",
|
1003 |
+
"shutil.make_archive(zip_name.replace('.zip', ''), 'zip', folder_path)\n",
|
1004 |
+
"\n",
|
1005 |
+
"print(f\"Folder '{folder_path}' has been zipped as '{zip_name}'.\")"
|
1006 |
+
],
|
1007 |
+
"metadata": {
|
1008 |
+
"colab": {
|
1009 |
+
"base_uri": "https://localhost:8080/"
|
1010 |
+
},
|
1011 |
+
"id": "9SuQLZ8KMnLl",
|
1012 |
+
"outputId": "27e6b557-ee86-425a-a47c-7d0f8ca812ab"
|
1013 |
+
},
|
1014 |
+
"execution_count": 35,
|
1015 |
+
"outputs": [
|
1016 |
+
{
|
1017 |
+
"output_type": "stream",
|
1018 |
+
"name": "stdout",
|
1019 |
+
"text": [
|
1020 |
+
"Folder '/content/translate_model' has been zipped as 'translate_model_finetune.zip'.\n"
|
1021 |
+
]
|
1022 |
+
}
|
1023 |
+
]
|
1024 |
+
},
|
1025 |
+
{
|
1026 |
+
"cell_type": "code",
|
1027 |
+
"source": [],
|
1028 |
+
"metadata": {
|
1029 |
+
"id": "qP6EM7VnWXaM"
|
1030 |
+
},
|
1031 |
+
"execution_count": null,
|
1032 |
+
"outputs": []
|
1033 |
+
}
|
1034 |
+
]
|
1035 |
+
}
|