In [1]:
import os
# Move the working directory up one level so that the relative paths used by
# later cells (the `src.` package imports and the config / artifact paths)
# resolve — presumably against the project root; confirm against where this
# notebook is stored.
# NOTE: the path is relative to the *current* directory, so every re-execution
# of this cell moves up one more level. Run it exactly once per kernel session.
os.chdir('../')

In [10]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class BanTokenTrainConfig:
    """Immutable bundle of settings for the tokenizer-training stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.

    Attributes:
        root_dir: Working directory of the stage.
        input_file_dir: Path of the training text. Despite the ``_dir``
            suffix, the trainer opens this as a UTF-8 text *file*.
        save_file: Output location. Despite the name, the trainer uses it as
            the *directory* that ``model_prefix`` is joined onto.
        model_prefix: Base name for the generated model / vocab files.
        model_type: Model type forwarded to the tokenizer trainer.
        vocab_size: Vocabulary size forwarded to the tokenizer trainer.
    """

    # Filesystem locations
    root_dir: Path
    input_file_dir: Path
    save_file: Path

    # Tokenizer-trainer settings
    model_prefix: str
    model_type: str
    vocab_size: int

In [11]:
from src.benglasummarization.constants import  *
from src.benglasummarization.utils.common import create_directories, read_yaml

class ConfigurationManager:
    """Loads the YAML config and params files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the config YAML; defaults to
                ``CONFIG_FILE_PATH``.
            params_filepath: Location of the params YAML; defaults to
                ``PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Every stage writes beneath the artifacts root, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_train_token_config(self) -> BanTokenTrainConfig:
        """Build the tokenizer-training configuration.

        Reads the ``train_tokenize`` section of both the config and the params
        files, creates the stage's ``root_dir``, and packs the values into a
        ``BanTokenTrainConfig``.

        Returns:
            The populated, immutable ``BanTokenTrainConfig``.
        """
        path_settings = self.config.train_tokenize
        trainer_settings = self.params.train_tokenize
        create_directories([path_settings.root_dir])

        # Paths come from the config file; trainer settings come from params.
        return BanTokenTrainConfig(
            root_dir=path_settings.root_dir,
            input_file_dir=path_settings.input_file_dir,
            save_file=path_settings.save_file,
            model_prefix=trainer_settings.model_prefix,
            model_type=trainer_settings.model_type,
            vocab_size=trainer_settings.vocab_size,
        )

In [20]:
import sentencepiece as spm
from src.benglasummarization.logging import logger
from tqdm.notebook import tqdm
import os

class TrainTokenize:
    """Trains a SentencePiece tokenizer from a ``BanTokenTrainConfig``."""

    def __init__(self, config: BanTokenTrainConfig):
        """Store the training configuration.

        Args:
            config: Paths and trainer settings for this run.
        """
        self.config = config

    def train_tokenizer(self):
        """Train the tokenizer and write ``<prefix>.model`` and ``<prefix>.vocab``.

        ``<prefix>`` is ``config.save_file`` joined with ``config.model_prefix``.

        Raises:
            FileNotFoundError: If ``config.input_file_dir`` does not exist.
            UnicodeDecodeError: If the input file is not valid UTF-8.
        """
        # Pre-flight pass: fail fast on a missing or undecodable input file
        # before handing the path to the trainer. One pass is enough; the
        # progress bar simply counts lines as they are read.
        with open(self.config.input_file_dir, 'r', encoding='utf-8') as f:
            for _ in tqdm(f, desc='Preparing Sentence for Training', unit='lines'):
                pass

        # `save_file` is the output *directory* (model_prefix is joined onto it
        # below), so create that directory itself. Creating only its dirname
        # would leave the real target missing and would raise when `save_file`
        # has no parent component.
        os.makedirs(self.config.save_file, exist_ok=True)

        # Training arguments (names follow the SentencePiece trainer options).
        train_params = {
            'input': str(self.config.input_file_dir),
            'model_prefix': os.path.join(self.config.save_file, self.config.model_prefix),
            'vocab_size': self.config.vocab_size,
            'model_type': self.config.model_type,
            # 1.0 = keep every character seen in the corpus in the vocabulary.
            'character_coverage': 1.0,
            # Cap on the number of sentences the trainer loads; combined with
            # shuffling, larger corpora are randomly sampled down to this size.
            'input_sentence_size': 1000000,
            'shuffle_input_sentence': True
        }

        spm.SentencePieceTrainer.train(**train_params)
        logger.info(f'Tokenizer model saved to {train_params["model_prefix"]}.model')
        logger.info(f'Tokenizer vocabulary saved to {train_params["model_prefix"]}.vocab')
    
    

In [21]:
# Run the tokenizer-training stage end to end: load the YAML configuration,
# build the typed training config, then train the tokenizer.
try:
    config = ConfigurationManager()  # NOTE: holds the manager, not a config section
    train_token_config = config.get_train_token_config()
    train_config = TrainTokenize(config=train_token_config)  # NOTE: a trainer instance, despite the name
    train_config.train_tokenizer()
except Exception as e:
    # Record the failure in the project log, then re-raise so the cell still
    # fails visibly instead of silently continuing.
    logger.error(f"An error occurred: {str(e)}")
    raise e

[2024-10-16 20:25:26,476: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-16 20:25:26,477: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-16 20:25:26,478: INFO: common: created directory at: artifacts]
[2024-10-16 20:25:26,480: INFO: common: created directory at: artifacts/train_tokenization]


Preparing Sentence for Training:   0%|          | 0/160000 [00:00<?, ?lines/s]

[2024-10-16 20:26:03,153: INFO: 489807411: Tokenizer model saved to artifacts/train_tokenization\cbengali_tokenizer.model]
[2024-10-16 20:26:03,154: INFO: 489807411: Tokenizer vocabulary saved to artifacts/train_tokenization\cbengali_tokenizer.vocab]
