logicsame committed on
Commit
9050a12
·
1 Parent(s): f4b830d

data ingestion added

Browse files
config/config.yaml CHANGED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ artifacts_root: artifacts
2
+
3
+ data_ingestion:
4
+ root_dir : artifacts/data_ingestion
5
+ source_dir : E:\\bengla text summarization\BanSum.zip
6
+ local_data_file : artifacts/data_ingestion/BanSum.zip
7
+ unzip_dir : artifacts/data_ingestion
8
+
main.py CHANGED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.benglasummarization.logging import logger
2
+ from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
3
+
4
# Label interpolated into the start/complete log banners below.
STAGE_NAME = 'Data Ingestion Stage'

# Run the data-ingestion stage. Any failure is logged together with its
# traceback and then propagated to whoever executed this module.
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_ingestion = DataIngestionPipeline()
    data_ingestion.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    # A bare ``raise`` re-raises the active exception as-is, without adding
    # an extra re-raise frame the way ``raise e`` does.
    raise
params.yaml CHANGED
@@ -0,0 +1 @@
 
 
1
+ key : val
research/data_ingestion.ipynb ADDED
File without changes
src/benglasummarization/components/__init__.py ADDED
File without changes
src/benglasummarization/components/data_ingestion.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ from src.benglasummarization.logging import logger
4
+ from tqdm.notebook import tqdm
5
+ from dataclasses import replace
6
+ from src.benglasummarization.entity.config_entity import DataIngestionConfig
7
+
8
class DataIngestion:
    """Locate a zip archive on disk and extract it into a target directory.

    Attributes:
        config: The ``DataIngestionConfig`` currently in use. ``load_file``
            swaps it for a copy whose ``local_data_file`` equals
            ``source_dir``.
    """

    def __init__(self, config : DataIngestionConfig):
        """Keep a reference to the ingestion settings.

        Args:
            config: Paths describing where the archive lives and where it
                should be unpacked.
        """
        self.config = config

    def load_file(self):
        """Point ``local_data_file`` at the source archive if it exists.

        Nothing is copied. ``self.config`` is rebound to a new config object
        built with ``dataclasses.replace`` in which ``local_data_file`` is
        set to ``source_dir``, so extraction later reads the archive from
        its original location.

        Raises:
            FileNotFoundError: If nothing exists at ``config.source_dir``.
        """
        source = self.config.source_dir

        # Guard clause: a missing archive is a failure, so report it at
        # error level before raising.
        if not os.path.exists(source):
            logger.error(f'File not found at: {source}')
            raise FileNotFoundError(f'No file found at: {source}')

        self.config = replace(self.config, local_data_file=source)
        logger.info(f'File found at: {self.config.local_data_file}')

    def extract_zip_file(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        The target directory is created if needed. Members are extracted
        one at a time so a progress bar can be shown.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            # Fetch the member list once and reuse it for both the progress
            # total and the iteration.
            members = zip_ref.infolist()
            for member in tqdm(iterable=members, total=len(members), desc='Extracting Files'):
                zip_ref.extract(member=member, path=unzip_path)

        logger.info(f"Extracted {self.config.local_data_file} to {unzip_path}")
30
+
31
+
32
+
33
+
34
+
35
+
src/benglasummarization/config/configuration.py CHANGED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.benglasummarization.constants import *
2
+ from src.benglasummarization.utils.common import read_yaml, create_directories
3
+ from benglasummarization.entity.config_entity import DataIngestionConfig
4
class ConfigurationManager:
    """Read the project's YAML settings and build per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path handed to ``read_yaml`` for the main config.
            params_filepath: Path handed to ``read_yaml`` for the params file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the data-ingestion settings, creating its root dir first.

        Returns:
            A ``DataIngestionConfig`` populated from the ``data_ingestion``
            section of the loaded config.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_dir=section.source_dir,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )
src/benglasummarization/entity/config_entity.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of paths used by the data-ingestion stage."""

    # Stage working directory.
    root_dir: Path
    # Location of the source zip archive.
    source_dir: Path
    # Path of the archive that gets opened for extraction.
    local_data_file: Path
    # Directory the archive members are extracted into.
    unzip_dir: Path
10
+
src/benglasummarization/pipeline/stage01_data_ingestion.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.benglasummarization.components.data_ingestion import DataIngestion
2
+ from src.benglasummarization.config.configuration import ConfigurationManager
3
+ from src.benglasummarization.config.configuration import ConfigurationManager
4
+
5
class DataIngestionPipeline:
    """Pipeline stage that wires configuration into ``DataIngestion``."""

    def __init__(self):
        """Nothing to initialise; all work happens in ``main``."""

    def main(self):
        """Build the ingestion config, then load and extract the archive."""
        ingestion_cfg = ConfigurationManager().get_data_ingestion_config()
        ingestion = DataIngestion(config=ingestion_cfg)
        ingestion.load_file()
        ingestion.extract_zip_file()