Spaces:

neelimapreeti297
/

GermanToEnglish

Runtime error

App Files Files Community

neelimapreeti297 committed on Apr 7, 2024

Commit

6a13d05

verified ·

1 Parent(s): cced00b

Update germantoenglish.py

Browse files

Files changed (1) hide show

germantoenglish.py +8 -92

germantoenglish.py CHANGED Viewed

@@ -13,8 +13,6 @@ from torchtext.datasets import multi30k, Multi30k
 from typing import Iterable, List
-# We need to modify the URLs for the dataset since the links to the original dataset are broken
-# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
 multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
 multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
@@ -55,14 +53,12 @@ special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
     # Training data Iterator
     train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
-    # Create torchtext's Vocab object
     vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                     min_freq=1,
                                                     specials=special_symbols,
                                                     special_first=True)
-# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
-# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
   vocab_transform[ln].set_default_index(UNK_IDX)
@@ -73,7 +69,6 @@ from torch.nn import Transformer
 import math
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
 class PositionalEncoding(nn.Module):
     def __init__(self,
                  emb_size: int,
@@ -93,7 +88,7 @@ class PositionalEncoding(nn.Module):
     def forward(self, token_embedding: Tensor):
         return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
-# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
 class TokenEmbedding(nn.Module):
     def __init__(self, vocab_size: int, emb_size):
         super(TokenEmbedding, self).__init__()
@@ -157,7 +152,6 @@ from torch.nn import Transformer
 import math
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
 class PositionalEncoding(nn.Module):
     def __init__(self,
                  emb_size: int,
@@ -177,7 +171,6 @@ class PositionalEncoding(nn.Module):
     def forward(self, token_embedding: Tensor):
         return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
-# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
 class TokenEmbedding(nn.Module):
     def __init__(self, vocab_size: int, emb_size):
         super(TokenEmbedding, self).__init__()
@@ -285,13 +278,11 @@ def sequential_transforms(*transforms):
         return txt_input
     return func
-# function to add BOS/EOS and create tensor for input sequence indices
 def tensor_transform(token_ids: List[int]):
     """Wrap a list of token indices with BOS/EOS markers as one 1-D tensor.

     Args:
         token_ids: Vocabulary indices for a single sentence, without
             boundary markers. May be empty.

     Returns:
         A 1-D tensor holding ``BOS_IDX``, then ``token_ids`` in order,
         then ``EOS_IDX``.
     """
     # torch.tensor([]) defaults to float32, which would turn the concatenated
     # result into a non-integer tensor for an empty sentence. Pinning the
     # dtype keeps the output usable as integer indices.
     # NOTE(review): assumes BOS_IDX / EOS_IDX are plain ints defined earlier
     # in the file (not visible in this hunk) - confirm.
     ids = torch.tensor(token_ids, dtype=torch.long)
     return torch.cat((torch.tensor([BOS_IDX]), ids, torch.tensor([EOS_IDX])))
-# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
 text_transform = {}
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
     text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
@@ -372,22 +363,19 @@ for epoch in range(1, NUM_EPOCHS+1):
     val_loss = evaluate(transformer)
     print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
-model =torch.save(transformer.state_dict(), '/gdrive/My Drive/transformer_model.pth')
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                            NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
-model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth', map_location=device))
 model.to(device)
 model.eval()
 def greedy_decode(model,src, src_mask, max_len, start_symbol):
-    #DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     src = src.to(DEVICE)
     src_mask = src_mask.to(DEVICE)
-    #model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
-    #model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth',map_location= DEVICE))
     memory = model.encode(src, src_mask)
     ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
@@ -407,16 +395,10 @@ def greedy_decode(model,src, src_mask, max_len, start_symbol):
             break
     return ys
-# Load the saved model
-#loaded_model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
-                                 # NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
-#loaded_model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth'))
-#loaded_model.eval()  # Make sure to set the model in evaluation mode
 def translate(src_sentence: str):
     model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
-    #DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth'))
     model.to(DEVICE)
     model.eval()
     src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
@@ -424,70 +406,4 @@ def translate(src_sentence: str):
     src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
     tgt_tokens = greedy_decode(
         model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
-    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
-print(translate("Eine Gruppe von Menschen steht vor einem Iglu ."))
-#!pip install transformers
-from transformers.modeling_utils import PreTrainedModel ,PretrainedConfig
-class Seq2SeqTransformer(PreTrainedModel):
-    def __init__(self,config):
-        super(Seq2SeqTransformer, self).__init__(config)
-        self.transformer = Transformer(d_model=config.emb_size,
-                                       nhead=config.nhead,
-                                       num_encoder_layers=config.num_encoder_layers,
-                                       num_decoder_layers=config.num_decoder_layers,
-                                       dim_feedforward=config.dim_feedforward,
-                                       dropout=config.dropout)
-        self.generator = nn.Linear(config.emb_size, config.tgt_vocab_size)
-        self.src_tok_emb = TokenEmbedding(config.src_vocab_size, config.emb_size)
-        self.tgt_tok_emb = TokenEmbedding(config.tgt_vocab_size, config.emb_size)
-        self.positional_encoding = PositionalEncoding(
-            config.emb_size, dropout=config.dropout)
-config = PretrainedConfig(
-      # Specify your vocabulary size
-    dim_feedforward =512,
-    dropout= 0.1,
-    emb_size= 512,
-    num_decoder_layers= 3,
-    num_encoder_layers= 3,
-    nhead= 8,
-    src_vocab_size= 19214,
-    tgt_vocab_size= 10837
-)
-model = Seq2SeqTransformer(config)
-model.to(DEVICE)
-model.save_pretrained('/gdrive/My Drive')
-#!pip install -q gradio==3.48.0
-import gradio as gr
-import torch
-from torchtext.data.utils import get_tokenizer
-from torchtext.vocab import build_vocab_from_iterator
-from torchtext.datasets import Multi30k
-from torch import Tensor
-from typing import Iterable, List
-if __name__ == "__main__":
-    # Create the Gradio interface
-    iface = gr.Interface(
-        fn=translate,  # Specify the translation function as the main function
-        inputs=[
-            gr.components.Textbox(label="Text")
-    ],
-        outputs=["text"],
-        cache_examples=False,  # Disable caching of examples
-        title="germanToenglish",  # Set the title of the interface
-        #description="This is a translator app for arabic and english. Currently supports only english to arabic."  # Add a description of the interface
-    )
-    # Launch the interface
-    iface.launch(share=True)

 from typing import Iterable, List
 multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
 multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
     # Training data Iterator
     train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
     vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                     min_freq=1,
                                                     specials=special_symbols,
                                                     special_first=True)
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
   vocab_transform[ln].set_default_index(UNK_IDX)
 import math
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 class PositionalEncoding(nn.Module):
     def __init__(self,
                  emb_size: int,
     def forward(self, token_embedding: Tensor):
         return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
 class TokenEmbedding(nn.Module):
     def __init__(self, vocab_size: int, emb_size):
         super(TokenEmbedding, self).__init__()
 import math
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 class PositionalEncoding(nn.Module):
     def __init__(self,
                  emb_size: int,
     def forward(self, token_embedding: Tensor):
         return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
 class TokenEmbedding(nn.Module):
     def __init__(self, vocab_size: int, emb_size):
         super(TokenEmbedding, self).__init__()
         return txt_input
     return func
 def tensor_transform(token_ids: List[int]):
     """Wrap a list of token indices with BOS/EOS markers as one 1-D tensor.

     Args:
         token_ids: Vocabulary indices for a single sentence, without
             boundary markers. May be empty.

     Returns:
         A 1-D tensor holding ``BOS_IDX``, then ``token_ids`` in order,
         then ``EOS_IDX``.
     """
     # torch.tensor([]) defaults to float32, which would turn the concatenated
     # result into a non-integer tensor for an empty sentence. Pinning the
     # dtype keeps the output usable as integer indices.
     # NOTE(review): assumes BOS_IDX / EOS_IDX are plain ints defined earlier
     # in the file (not visible in this hunk) - confirm.
     ids = torch.tensor(token_ids, dtype=torch.long)
     return torch.cat((torch.tensor([BOS_IDX]), ids, torch.tensor([EOS_IDX])))
 text_transform = {}
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
     text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
     val_loss = evaluate(transformer)
     print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
+model =torch.save(transformer.state_dict(), './transformer_model.pth')
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                            NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
+model.load_state_dict(torch.load('./transformer_model.pth', map_location=device))
 model.to(device)
 model.eval()
 def greedy_decode(model,src, src_mask, max_len, start_symbol):
     src = src.to(DEVICE)
     src_mask = src_mask.to(DEVICE)
     memory = model.encode(src, src_mask)
     ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
             break
     return ys
 def translate(src_sentence: str):
     model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
+    model.load_state_dict(torch.load('./transformer_model.pth'))
     model.to(DEVICE)
     model.eval()
     src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
     src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
     tgt_tokens = greedy_decode(
         model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
+    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")