Spaces:
Runtime error
Runtime error
Update germantoenglish.py
Browse files- germantoenglish.py +8 -92
germantoenglish.py
CHANGED
@@ -13,8 +13,6 @@ from torchtext.datasets import multi30k, Multi30k
|
|
13 |
from typing import Iterable, List
|
14 |
|
15 |
|
16 |
-
# We need to modify the URLs for the dataset since the links to the original dataset are broken
|
17 |
-
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
|
18 |
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
|
19 |
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
|
20 |
|
@@ -55,14 +53,12 @@ special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
|
|
55 |
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
|
56 |
# Training data Iterator
|
57 |
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
|
58 |
-
|
59 |
vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
|
60 |
min_freq=1,
|
61 |
specials=special_symbols,
|
62 |
special_first=True)
|
63 |
|
64 |
-
# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
|
65 |
-
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
|
66 |
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
|
67 |
vocab_transform[ln].set_default_index(UNK_IDX)
|
68 |
|
@@ -73,7 +69,6 @@ from torch.nn import Transformer
|
|
73 |
import math
|
74 |
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
75 |
|
76 |
-
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
|
77 |
class PositionalEncoding(nn.Module):
|
78 |
def __init__(self,
|
79 |
emb_size: int,
|
@@ -93,7 +88,7 @@ class PositionalEncoding(nn.Module):
|
|
93 |
def forward(self, token_embedding: Tensor):
|
94 |
return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
|
95 |
|
96 |
-
|
97 |
class TokenEmbedding(nn.Module):
|
98 |
def __init__(self, vocab_size: int, emb_size):
|
99 |
super(TokenEmbedding, self).__init__()
|
@@ -157,7 +152,6 @@ from torch.nn import Transformer
|
|
157 |
import math
|
158 |
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
159 |
|
160 |
-
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
|
161 |
class PositionalEncoding(nn.Module):
|
162 |
def __init__(self,
|
163 |
emb_size: int,
|
@@ -177,7 +171,6 @@ class PositionalEncoding(nn.Module):
|
|
177 |
def forward(self, token_embedding: Tensor):
|
178 |
return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
|
179 |
|
180 |
-
# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
|
181 |
class TokenEmbedding(nn.Module):
|
182 |
def __init__(self, vocab_size: int, emb_size):
|
183 |
super(TokenEmbedding, self).__init__()
|
@@ -285,13 +278,11 @@ def sequential_transforms(*transforms):
|
|
285 |
return txt_input
|
286 |
return func
|
287 |
|
288 |
-
# function to add BOS/EOS and create tensor for input sequence indices
|
289 |
def tensor_transform(token_ids: List[int]):
|
290 |
return torch.cat((torch.tensor([BOS_IDX]),
|
291 |
torch.tensor(token_ids),
|
292 |
torch.tensor([EOS_IDX])))
|
293 |
|
294 |
-
# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
|
295 |
text_transform = {}
|
296 |
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
|
297 |
text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
|
@@ -372,22 +363,19 @@ for epoch in range(1, NUM_EPOCHS+1):
|
|
372 |
val_loss = evaluate(transformer)
|
373 |
print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
|
374 |
|
375 |
-
model =torch.save(transformer.state_dict(), '
|
376 |
|
377 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
378 |
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
|
379 |
NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
|
380 |
-
model.load_state_dict(torch.load('
|
381 |
model.to(device)
|
382 |
model.eval()
|
383 |
|
384 |
def greedy_decode(model,src, src_mask, max_len, start_symbol):
|
385 |
-
|
386 |
src = src.to(DEVICE)
|
387 |
src_mask = src_mask.to(DEVICE)
|
388 |
-
#model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
|
389 |
-
|
390 |
-
#model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth',map_location= DEVICE))
|
391 |
|
392 |
memory = model.encode(src, src_mask)
|
393 |
ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
|
@@ -407,16 +395,10 @@ def greedy_decode(model,src, src_mask, max_len, start_symbol):
|
|
407 |
break
|
408 |
return ys
|
409 |
|
410 |
-
# Load the saved model
|
411 |
-
#loaded_model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
|
412 |
-
# NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
|
413 |
-
#loaded_model.load_state_dict(torch.load('/gdrive/My Drive/transformer_model.pth'))
|
414 |
-
#loaded_model.eval() # Make sure to set the model in evaluation mode
|
415 |
-
|
416 |
def translate(src_sentence: str):
|
417 |
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
|
418 |
-
|
419 |
-
model.load_state_dict(torch.load('
|
420 |
model.to(DEVICE)
|
421 |
model.eval()
|
422 |
src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
|
@@ -424,70 +406,4 @@ def translate(src_sentence: str):
|
|
424 |
src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
|
425 |
tgt_tokens = greedy_decode(
|
426 |
model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
|
427 |
-
return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
|
428 |
-
|
429 |
-
print(translate("Eine Gruppe von Menschen steht vor einem Iglu ."))
|
430 |
-
|
431 |
-
#!pip install transformers
|
432 |
-
|
433 |
-
from transformers.modeling_utils import PreTrainedModel ,PretrainedConfig
|
434 |
-
|
435 |
-
class Seq2SeqTransformer(PreTrainedModel):
|
436 |
-
def __init__(self,config):
|
437 |
-
super(Seq2SeqTransformer, self).__init__(config)
|
438 |
-
self.transformer = Transformer(d_model=config.emb_size,
|
439 |
-
nhead=config.nhead,
|
440 |
-
num_encoder_layers=config.num_encoder_layers,
|
441 |
-
num_decoder_layers=config.num_decoder_layers,
|
442 |
-
dim_feedforward=config.dim_feedforward,
|
443 |
-
dropout=config.dropout)
|
444 |
-
self.generator = nn.Linear(config.emb_size, config.tgt_vocab_size)
|
445 |
-
self.src_tok_emb = TokenEmbedding(config.src_vocab_size, config.emb_size)
|
446 |
-
self.tgt_tok_emb = TokenEmbedding(config.tgt_vocab_size, config.emb_size)
|
447 |
-
self.positional_encoding = PositionalEncoding(
|
448 |
-
config.emb_size, dropout=config.dropout)
|
449 |
-
|
450 |
-
config = PretrainedConfig(
|
451 |
-
# Specify your vocabulary size
|
452 |
-
dim_feedforward =512,
|
453 |
-
dropout= 0.1,
|
454 |
-
emb_size= 512,
|
455 |
-
num_decoder_layers= 3,
|
456 |
-
num_encoder_layers= 3,
|
457 |
-
nhead= 8,
|
458 |
-
src_vocab_size= 19214,
|
459 |
-
tgt_vocab_size= 10837
|
460 |
-
)
|
461 |
-
|
462 |
-
model = Seq2SeqTransformer(config)
|
463 |
-
model.to(DEVICE)
|
464 |
-
|
465 |
-
|
466 |
-
model.save_pretrained('/gdrive/My Drive')
|
467 |
-
|
468 |
-
#!pip install -q gradio==3.48.0
|
469 |
-
|
470 |
-
import gradio as gr
|
471 |
-
import torch
|
472 |
-
from torchtext.data.utils import get_tokenizer
|
473 |
-
from torchtext.vocab import build_vocab_from_iterator
|
474 |
-
from torchtext.datasets import Multi30k
|
475 |
-
from torch import Tensor
|
476 |
-
from typing import Iterable, List
|
477 |
-
|
478 |
-
if __name__ == "__main__":
|
479 |
-
# Create the Gradio interface
|
480 |
-
iface = gr.Interface(
|
481 |
-
fn=translate, # Specify the translation function as the main function
|
482 |
-
inputs=[
|
483 |
-
gr.components.Textbox(label="Text")
|
484 |
-
|
485 |
-
],
|
486 |
-
outputs=["text"],
|
487 |
-
cache_examples=False, # Disable caching of examples
|
488 |
-
title="germanToenglish", # Set the title of the interface
|
489 |
-
#description="This is a translator app for arabic and english. Currently supports only english to arabic." # Add a description of the interface
|
490 |
-
)
|
491 |
-
|
492 |
-
# Launch the interface
|
493 |
-
iface.launch(share=True)
|
|
|
13 |
from typing import Iterable, List
|
14 |
|
15 |
|
|
|
|
|
16 |
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
|
17 |
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
|
18 |
|
|
|
53 |
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
|
54 |
# Training data Iterator
|
55 |
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
|
56 |
+
|
57 |
vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
|
58 |
min_freq=1,
|
59 |
specials=special_symbols,
|
60 |
special_first=True)
|
61 |
|
|
|
|
|
62 |
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
|
63 |
vocab_transform[ln].set_default_index(UNK_IDX)
|
64 |
|
|
|
69 |
import math
|
70 |
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
71 |
|
|
|
72 |
class PositionalEncoding(nn.Module):
|
73 |
def __init__(self,
|
74 |
emb_size: int,
|
|
|
88 |
def forward(self, token_embedding: Tensor):
|
89 |
return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
|
90 |
|
91 |
+
|
92 |
class TokenEmbedding(nn.Module):
|
93 |
def __init__(self, vocab_size: int, emb_size):
|
94 |
super(TokenEmbedding, self).__init__()
|
|
|
152 |
import math
|
153 |
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
154 |
|
|
|
155 |
class PositionalEncoding(nn.Module):
|
156 |
def __init__(self,
|
157 |
emb_size: int,
|
|
|
171 |
def forward(self, token_embedding: Tensor):
|
172 |
return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
|
173 |
|
|
|
174 |
class TokenEmbedding(nn.Module):
|
175 |
def __init__(self, vocab_size: int, emb_size):
|
176 |
super(TokenEmbedding, self).__init__()
|
|
|
278 |
return txt_input
|
279 |
return func
|
280 |
|
|
|
281 |
def tensor_transform(token_ids: List[int]):
|
282 |
return torch.cat((torch.tensor([BOS_IDX]),
|
283 |
torch.tensor(token_ids),
|
284 |
torch.tensor([EOS_IDX])))
|
285 |
|
|
|
286 |
text_transform = {}
|
287 |
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
|
288 |
text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
|
|
|
363 |
val_loss = evaluate(transformer)
|
364 |
print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
|
365 |
|
366 |
+
model =torch.save(transformer.state_dict(), './transformer_model.pth')
|
367 |
|
368 |
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
369 |
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
|
370 |
NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
|
371 |
+
model.load_state_dict(torch.load('./transformer_model.pth', map_location=device))
|
372 |
model.to(device)
|
373 |
model.eval()
|
374 |
|
375 |
def greedy_decode(model,src, src_mask, max_len, start_symbol):
|
376 |
+
|
377 |
src = src.to(DEVICE)
|
378 |
src_mask = src_mask.to(DEVICE)
|
|
|
|
|
|
|
379 |
|
380 |
memory = model.encode(src, src_mask)
|
381 |
ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
|
|
|
395 |
break
|
396 |
return ys
|
397 |
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
def translate(src_sentence: str):
|
399 |
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
|
400 |
+
|
401 |
+
model.load_state_dict(torch.load('./transformer_model.pth'))
|
402 |
model.to(DEVICE)
|
403 |
model.eval()
|
404 |
src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
|
|
|
406 |
src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
|
407 |
tgt_tokens = greedy_decode(
|
408 |
model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
|
409 |
+
return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|