neelimapreeti297 committed on
Commit 31da773 · verified · 1 parent: 342fc29

Upload germantoenglish.py

Files changed (1): germantoenglish.py (+493, -0)
germantoenglish.py ADDED
@@ -0,0 +1,493 @@
# -*- coding: utf-8 -*-
"""germanToEnglish.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1UI02YcWdG9ErJd18evuYmF1vNiqz7geo
"""

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List


# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

#from google.colab import drive
#drive.mount('/gdrive')

#!pip install -U torchdata
#!pip install -U spacy

#!python -m spacy download en_core_web_sm
#!python -m spacy download de_core_news_sm

#!pip install portalocker>=2.0.0

token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab_transform[ln].set_default_index(UNK_IDX)
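
# Note: each torchtext Vocab object is callable and maps a list of token strings to a
# list of integer indices, e.g. vocab_transform[SRC_LANGUAGE](['ein', 'mann']) returns
# something like [5, 13] (illustrative values); tokens unseen during training fall back
# to UNK_IDX because of set_default_index above.
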
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
            self.tgt_tok_emb(tgt)), memory,
            tgt_mask)
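
# The positional encoding above is the standard sinusoidal scheme:
#   PE(pos, 2i)   = sin(pos / 10000**(2i / emb_size))
#   PE(pos, 2i+1) = cos(pos / 10000**(2i / emb_size))
# `den` precomputes 1 / 10000**(2i / emb_size) as exp(-2i * log(10000) / emb_size), and
# TokenEmbedding scales embeddings by sqrt(emb_size) as in "Attention Is All You Need".
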
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
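
# Mask semantics: the square "subsequent" mask is additive, with -inf above the diagonal,
# so a target position can only attend to itself and earlier positions; src_mask is all
# False (no masking of source positions); the padding masks have shape (batch, seq_len)
# and are True wherever the token is PAD_IDX, so attention ignores padding.
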
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors of indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln],  # Tokenization
                                               vocab_transform[ln],  # Numericalization
                                               tensor_transform)     # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch
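
# collate_fn stacks variable-length examples into (max_seq_len, batch_size) tensors,
# padding the shorter sequences with PAD_IDX; this sequence-first layout is what
# nn.Transformer expects with its default batch_first=False.
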
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]  # decoder input: target sequence without its last token

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]  # training target: target sequence shifted left by one (drops <bos>)
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

from timeit import default_timer as timer
NUM_EPOCHS = 10

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
           f"Epoch time = {(end_time - start_time):.3f}s"))

# Save the trained weights (torch.save returns None, so its result should not be assigned).
torch.save(transformer.state_dict(), '/gdrive/My Drive/transformer_model.pth')

# Reload the weights into a fresh model instance for inference.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                           NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth', map_location=device))
model.to(device)
model.eval()

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    #DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    #model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
    #model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth', map_location=DEVICE))

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len - 1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

# Load the saved model
#loaded_model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
#                                  NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
#loaded_model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth'))
#loaded_model.eval()  # Make sure to set the model in evaluation mode

def translate(src_sentence: str):
    model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
    #DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # map_location keeps the load working on CPU-only machines
    model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth', map_location=DEVICE))
    model.to(DEVICE)
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

print(translate("Eine Gruppe von Menschen steht vor einem Iglu ."))
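
# greedy_decode runs the encoder once, then repeatedly feeds the tokens generated so far
# to the decoder and appends the highest-probability next token until <eos> or max_len is
# reached; translate() numericalises the source sentence, decodes up to num_tokens + 5
# target tokens, and maps the resulting indices back to words with the target vocabulary.
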
#!pip install transformers

from transformers.modeling_utils import PreTrainedModel, PretrainedConfig

# Hugging Face wrapper around the same architecture; a distinct class name is used so it
# does not shadow the Seq2SeqTransformer class that translate() instantiates above.
class Seq2SeqTransformerHF(PreTrainedModel):
    def __init__(self, config):
        super(Seq2SeqTransformerHF, self).__init__(config)
        self.transformer = Transformer(d_model=config.emb_size,
                                       nhead=config.nhead,
                                       num_encoder_layers=config.num_encoder_layers,
                                       num_decoder_layers=config.num_decoder_layers,
                                       dim_feedforward=config.dim_feedforward,
                                       dropout=config.dropout)
        self.generator = nn.Linear(config.emb_size, config.tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(config.src_vocab_size, config.emb_size)
        self.tgt_tok_emb = TokenEmbedding(config.tgt_vocab_size, config.emb_size)
        self.positional_encoding = PositionalEncoding(
            config.emb_size, dropout=config.dropout)

config = PretrainedConfig(
    dim_feedforward=512,
    dropout=0.1,
    emb_size=512,
    num_decoder_layers=3,
    num_encoder_layers=3,
    nhead=8,
    src_vocab_size=19214,   # vocabulary sizes taken from the Multi30k vocabs built above
    tgt_vocab_size=10837
)

model = Seq2SeqTransformerHF(config)
model.to(DEVICE)


model.save_pretrained('/gdrive/My Drive')
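
# save_pretrained is expected to write the wrapper's weights together with a config.json
# describing the PretrainedConfig fields above into the target directory, so the model
# could later be restored with from_pretrained.
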
#!pip install -q gradio==3.48.0

import gradio as gr
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from torch import Tensor
from typing import Iterable, List

if __name__ == "__main__":
    # Create the Gradio interface
    iface = gr.Interface(
        fn=translate,  # Specify the translation function as the main function
        inputs=[
            gr.components.Textbox(label="Text")
        ],
        outputs=["text"],
        cache_examples=False,  # Disable caching of examples
        title="germanToenglish",  # Set the title of the interface
        #description="This is a translator app for German and English. Currently it supports only German to English."  # Description of the interface
    )

    # Launch the interface
    iface.launch(share=True)