In [1]:
import os

# The modules imported further down live under `src/`, so the working
# directory has to be the folder that contains it. Only step up when `src`
# is not already visible: re-running this cell then stays put instead of
# climbing one more level on every execution.
if not os.path.isdir("src"):
    os.chdir("..")

In [2]:
%pwd

'e:\\bengla text summarization\\train-pegasus-model-on-bengali-text-summarization-using-mlops'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class BanTokenizationConfig:
    """Read-only settings for the step that merges CSV text columns into one file.

    Instances are frozen: assigning to a field after construction raises.
    """

    # Stage directory; ConfigurationManager creates it before building this object.
    root_dir: Path
    # Handed straight to ``pd.read_csv`` by BanTokenization, so despite the
    # name it is used as the path of a CSV file.
    source_dir: Path
    # Directory that receives the combined text file (created on demand).
    save_dir: Path
    # File name appended to ``save_dir`` to form the output path.
    output_file: str
    


In [4]:
from src.benglasummarization.constants import *
from src.benglasummarization.utils.common import  create_directories, read_yaml

class ConfigurationManager:
    """Load the project's YAML config and params and build per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_ben_tokenization_config(self) -> BanTokenizationConfig:
        """Build the settings object for the text-combination stage.

        Reads the ``ban_tokenization`` section of the config and the
        ``pre_tokenize`` section of the params, and creates the stage's
        ``root_dir`` as a side effect.

        Returns:
            BanTokenizationConfig: Frozen settings whose path fields are
            ``pathlib.Path`` objects, as the dataclass declares.
        """
        config = self.config.ban_tokenization
        params = self.params.pre_tokenize
        create_directories([config.root_dir])

        # Values come straight from YAML (presumably plain strings), while the
        # dataclass annotates these fields as Path; convert here so consumers
        # get the declared type.
        # NOTE(review): assumes source_dir is a local filesystem path, not a URL.
        return BanTokenizationConfig(
            root_dir=Path(config.root_dir),
            source_dir=Path(config.source_dir),
            save_dir=Path(config.save_dir),
            output_file=params.output_file,
        )

  

In [5]:
import pandas as pd
from pathlib import Path
from src.benglasummarization.logging import logger
from tqdm.notebook import tqdm

class BanTokenization:
    """Flatten selected text columns of a CSV file into a single UTF-8 text file.

    The CSV location, the output directory and the output file name are all
    taken from the configuration object supplied at construction time.
    """

    def __init__(self, config: BanTokenizationConfig):
        """Keep the configuration for later use.

        Args:
            config: Object exposing ``source_dir`` (passed to ``pd.read_csv``),
                ``save_dir`` and ``output_file``.
        """
        self.config = config

    def combine_text_columns(self, text_columns=('main',)):
        """Write the requested columns of every CSV row as one record of text.

        For each row the values of ``text_columns`` are converted with
        ``str()``, joined with a single space and written followed by a
        newline. Missing cells (NaN/None/NA) are left out instead of being
        written as the literal text ``nan``; a row with no values at all
        yields an empty line, so the number of records still matches the
        number of rows. Newlines inside a cell are written as-is.

        Args:
            text_columns: A column name, or an iterable of column names, to
                combine in the given order. Defaults to ``('main',)``.

        Raises:
            ValueError: If ``text_columns`` is empty.
            KeyError: If a requested column is not present in the CSV. This is
                checked before the output file is opened, so an existing
                output file is not truncated by a failed call.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(text_columns, str):
            columns = [text_columns]
        else:
            columns = list(text_columns)
        if not columns:
            raise ValueError("text_columns must name at least one column")

        df = pd.read_csv(self.config.source_dir)

        # Fail early, with a readable message, before touching the output file.
        missing = [col for col in columns if col not in df.columns]
        if missing:
            raise KeyError(
                f"Column(s) {missing} not found in {self.config.source_dir}"
            )

        # Ensure save_dir is a Path object and exists on disk
        save_dir = Path(self.config.save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        # Combine save_dir and output_file to form the output path
        output_txt_file = save_dir / self.config.output_file

        # Plain tuples are much cheaper than the per-row Series built by
        # iterrows(), and each column keeps its own dtype.
        rows = df[columns].itertuples(index=False, name=None)

        with open(output_txt_file, 'w', encoding='utf-8') as f:
            # The row iterator has no length, so tqdm needs the total explicitly.
            for values in tqdm(rows, total=len(df)):
                combined_text = ' '.join(
                    str(value) for value in values if not pd.isna(value)
                )
                f.write(combined_text + '\n')

        # Log the success of the operation
        logger.info(f"All text data has been combined into {output_txt_file}")

In [7]:
# Run the stage end to end: load the YAML configuration, build the stage
# settings, then write the combined text file.
# No try/except here: a handler that only re-raises adds nothing, and letting
# the exception propagate keeps its original traceback intact.
config = ConfigurationManager()
prepare_ben_tok_config = config.get_ben_tokenization_config()
ben_data_tok = BanTokenization(config=prepare_ben_tok_config)
ben_data_tok.combine_text_columns()

[2024-10-16 19:09:09,141: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-16 19:09:09,143: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-16 19:09:09,145: INFO: common: created directory at: artifacts]
[2024-10-16 19:09:09,146: INFO: common: created directory at: artifacts/ban_tokenization]


  0%|          | 0/160000 [00:00<?, ?it/s]

[2024-10-16 19:10:00,660: INFO: 206824922: All text data has been combined into artifacts\ban_tokenization\combined_text.txt]
