neelimapreeti297 committed
Commit 6a13d05 · verified · 1 Parent(s): cced00b

Update germantoenglish.py

Files changed (1):
  1. germantoenglish.py +8 -92
germantoenglish.py CHANGED
@@ -13,8 +13,6 @@ from torchtext.datasets import multi30k, Multi30k
 from typing import Iterable, List
 
 
-# We need to modify the URLs for the dataset since the links to the original dataset are broken
-# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
 multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
 multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
 
@@ -55,14 +53,12 @@ special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
     # Training data Iterator
     train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
-    # Create torchtext's Vocab object
+
     vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                     min_freq=1,
                                                     specials=special_symbols,
                                                     special_first=True)
 
-# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
-# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
     vocab_transform[ln].set_default_index(UNK_IDX)
 
@@ -73,7 +69,6 @@ from torch.nn import Transformer
 import math
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
 class PositionalEncoding(nn.Module):
     def __init__(self,
                  emb_size: int,
@@ -93,7 +88,7 @@ class PositionalEncoding(nn.Module):
     def forward(self, token_embedding: Tensor):
         return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
 
-# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
+
 class TokenEmbedding(nn.Module):
     def __init__(self, vocab_size: int, emb_size):
         super(TokenEmbedding, self).__init__()
@@ -157,7 +152,6 @@ from torch.nn import Transformer
 import math
 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
 class PositionalEncoding(nn.Module):
     def __init__(self,
                  emb_size: int,
@@ -177,7 +171,6 @@ class PositionalEncoding(nn.Module):
     def forward(self, token_embedding: Tensor):
         return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
 
-# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
 class TokenEmbedding(nn.Module):
     def __init__(self, vocab_size: int, emb_size):
         super(TokenEmbedding, self).__init__()
@@ -285,13 +278,11 @@ def sequential_transforms(*transforms):
         return txt_input
     return func
 
-# function to add BOS/EOS and create tensor for input sequence indices
 def tensor_transform(token_ids: List[int]):
     return torch.cat((torch.tensor([BOS_IDX]),
                       torch.tensor(token_ids),
                       torch.tensor([EOS_IDX])))
 
-# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
 text_transform = {}
 for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
     text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
@@ -372,22 +363,19 @@ for epoch in range(1, NUM_EPOCHS+1):
     val_loss = evaluate(transformer)
     print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
 
-model =torch.save(transformer.state_dict(), '/gdrive/My Drive/transformer_model.pth')
+model =torch.save(transformer.state_dict(), './transformer_model.pth')
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                            NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
-model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth', map_location=device))
+model.load_state_dict(torch.load('./transformer_model.pth', map_location=device))
 model.to(device)
 model.eval()
 
 def greedy_decode(model,src, src_mask, max_len, start_symbol):
-    #DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
     src = src.to(DEVICE)
     src_mask = src_mask.to(DEVICE)
-    #model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
-
-    #model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth',map_location= DEVICE))
 
     memory = model.encode(src, src_mask)
     ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
@@ -407,16 +395,10 @@ def greedy_decode(model,src, src_mask, max_len, start_symbol):
             break
     return ys
 
-# Load the saved model
-#loaded_model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
-#                                  NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
-#loaded_model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth'))
-#loaded_model.eval()  # Make sure to set the model in evaluation mode
-
 def translate(src_sentence: str):
     model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
-    #DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth'))
+
+    model.load_state_dict(torch.load('./transformer_model.pth'))
     model.to(DEVICE)
     model.eval()
     src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
@@ -424,70 +406,4 @@ def translate(src_sentence: str):
     src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
     tgt_tokens = greedy_decode(
         model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
-    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
-
-print(translate("Eine Gruppe von Menschen steht vor einem Iglu ."))
-
-#!pip install transformers
-
-from transformers.modeling_utils import PreTrainedModel, PretrainedConfig
-
-class Seq2SeqTransformer(PreTrainedModel):
-    def __init__(self, config):
-        super(Seq2SeqTransformer, self).__init__(config)
-        self.transformer = Transformer(d_model=config.emb_size,
-                                       nhead=config.nhead,
-                                       num_encoder_layers=config.num_encoder_layers,
-                                       num_decoder_layers=config.num_decoder_layers,
-                                       dim_feedforward=config.dim_feedforward,
-                                       dropout=config.dropout)
-        self.generator = nn.Linear(config.emb_size, config.tgt_vocab_size)
-        self.src_tok_emb = TokenEmbedding(config.src_vocab_size, config.emb_size)
-        self.tgt_tok_emb = TokenEmbedding(config.tgt_vocab_size, config.emb_size)
-        self.positional_encoding = PositionalEncoding(
-            config.emb_size, dropout=config.dropout)
-
-config = PretrainedConfig(
-    # Specify your vocabulary size
-    dim_feedforward=512,
-    dropout=0.1,
-    emb_size=512,
-    num_decoder_layers=3,
-    num_encoder_layers=3,
-    nhead=8,
-    src_vocab_size=19214,
-    tgt_vocab_size=10837
-)
-
-model = Seq2SeqTransformer(config)
-model.to(DEVICE)
-
-
-model.save_pretrained('/gdrive/My Drive')
-
-#!pip install -q gradio==3.48.0
-
-import gradio as gr
-import torch
-from torchtext.data.utils import get_tokenizer
-from torchtext.vocab import build_vocab_from_iterator
-from torchtext.datasets import Multi30k
-from torch import Tensor
-from typing import Iterable, List
-
-if __name__ == "__main__":
-    # Create the Gradio interface
-    iface = gr.Interface(
-        fn=translate,  # Specify the translation function as the main function
-        inputs=[
-            gr.components.Textbox(label="Text")
-
-        ],
-        outputs=["text"],
-        cache_examples=False,  # Disable caching of examples
-        title="germanToenglish",  # Set the title of the interface
-        #description="This is a translator app for arabic and english. Currently supports only english to arabic."  # Add a description of the interface
-    )
-
-    # Launch the interface
-    iface.launch(share=True)
+    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
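
A note on the checkpoint-path change above: torch.save returns None, so the committed line `model =torch.save(...)` leaves `model` set to None until it is rebuilt two lines later; only the file write matters there. A minimal sketch of the intended save/load round-trip with the new local path, assuming the script's own Seq2SeqTransformer class, hyperparameter constants, and translate function are in scope (nothing here is beyond what the file already defines):

import torch

# Persist only the trained weights (the state_dict), not the module object.
torch.save(transformer.state_dict(), './transformer_model.pth')

# Rebuild the same architecture, then restore the weights onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                           NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
model.load_state_dict(torch.load('./transformer_model.pth', map_location=device))
model.to(device)
model.eval()  # disable dropout before greedy decoding

print(translate("Eine Gruppe von Menschen steht vor einem Iglu ."))

Loading with map_location keeps a checkpoint saved on GPU usable on a CPU-only machine, which is why the diff's reload line passes it explicitly.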