Spaces:
Sleeping
Sleeping
logicsame
committed on
Commit
·
9050a12
1
Parent(s):
f4b830d
data ingestion added
Browse files- config/config.yaml +8 -0
- main.py +13 -0
- params.yaml +1 -0
- research/data_ingestion.ipynb +0 -0
- src/benglasummarization/components/__init__.py +0 -0
- src/benglasummarization/components/data_ingestion.py +35 -0
- src/benglasummarization/config/configuration.py +26 -0
- src/benglasummarization/entity/config_entity.py +10 -0
- src/benglasummarization/pipeline/stage01_data_ingestion.py +14 -0
config/config.yaml
CHANGED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
artifacts_root: artifacts
|
2 |
+
|
3 |
+
data_ingestion:
|
4 |
+
root_dir : artifacts/data_ingestion
|
5 |
+
source_dir : E:\\bengla text summarization\BanSum.zip
|
6 |
+
local_data_file : artifacts/data_ingestion/BanSum.zip
|
7 |
+
unzip_dir : artifacts/data_ingestion
|
8 |
+
|
main.py
CHANGED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.benglasummarization.logging import logger
|
2 |
+
from src.benglasummarization.pipeline.stage01_data_ingestion import DataIngestionPipeline
|
3 |
+
|
4 |
+
# Human-readable stage label used in the start/completion log lines below.
STAGE_NAME = 'Data Ingestion Stage'

# Run the data-ingestion stage: build the pipeline object, execute it, and
# bracket the run with log messages. Any failure is logged with its traceback
# and then propagated to the caller.
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_ingestion = DataIngestionPipeline()
    data_ingestion.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    # Bare `raise` re-raises the exception currently being handled as-is.
    raise
|
params.yaml
CHANGED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
key : val
|
research/data_ingestion.ipynb
ADDED
File without changes
|
src/benglasummarization/components/__init__.py
ADDED
File without changes
|
src/benglasummarization/components/data_ingestion.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import zipfile
|
3 |
+
from src.benglasummarization.logging import logger
|
4 |
+
from tqdm.notebook import tqdm
|
5 |
+
from dataclasses import replace
|
6 |
+
from src.benglasummarization.entity.config_entity import DataIngestionConfig
|
7 |
+
|
8 |
+
class DataIngestion:
    """Locate the source archive and unpack it into the configured directory.

    The instance holds a ``DataIngestionConfig``; ``load_file`` may swap it
    for an updated copy, so ``extract_zip_file`` should be called afterwards.
    """

    def __init__(self, config : DataIngestionConfig):
        """Store the ingestion settings used by the other methods."""
        self.config = config

    def load_file(self):
        """Point ``local_data_file`` at the source archive if it exists.

        Raises:
            FileNotFoundError: when ``config.source_dir`` does not exist.
        """
        source_path = self.config.source_dir

        # Guard clause: report and bail out when the source is missing.
        if not os.path.exists(source_path):
            logger.info(f'File not found at: {self.config.source_dir}')
            raise FileNotFoundError(f'No file found at: {self.config.source_dir}')

        # The config may be immutable, so build an updated copy via
        # dataclasses.replace rather than assigning to the field.
        self.config = replace(self.config, local_data_file=source_path)
        logger.info(f'File found at: {self.config.local_data_file}')

    def extract_zip_file(self):
        """Extract every member of ``local_data_file`` into ``unzip_dir``.

        The destination directory is created when missing, and a progress
        bar is shown while members are extracted one by one.
        """
        destination = self.config.unzip_dir
        os.makedirs(destination, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file, 'r') as archive:
            members = archive.infolist()
            progress = tqdm(
                iterable=members,
                total=len(members),
                desc='Extracting Files',
            )
            for member in progress:
                archive.extract(member=member, path=destination)

        logger.info(f"Extracted {self.config.local_data_file} to {destination}")
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
|
35 |
+
|
src/benglasummarization/config/configuration.py
CHANGED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.benglasummarization.constants import *
|
2 |
+
from src.benglasummarization.utils.common import read_yaml, create_directories
|
3 |
+
from benglasummarization.entity.config_entity import DataIngestionConfig
|
4 |
+
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects.

    NOTE(review): this module imports ``DataIngestionConfig`` from
    ``benglasummarization...`` while its other imports use the
    ``src.benglasummarization...`` prefix -- confirm both resolve in the
    target environment.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH
    ):
        """Read both YAML files and prepare the artifacts root.

        Args:
            config_filepath: location of the main config file; defaults to
                ``CONFIG_FILE_PATH``.
            params_filepath: location of the params file; defaults to
                ``PARAMS_FILE_PATH``.
        """
        # The loaded config is accessed by attribute below, so read_yaml is
        # expected to return an attribute-accessible object.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Presumably creates the directory if it is missing -- confirm
        # against utils.common.create_directories.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        Returns:
            A ``DataIngestionConfig`` populated with the section's
            ``root_dir``, ``source_dir``, ``local_data_file`` and
            ``unzip_dir`` values, passed through as loaded.
        """
        section = self.config.data_ingestion
        create_directories([section.root_dir])

        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_dir=section.source_dir,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )
|
src/benglasummarization/entity/config_entity.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of paths for the data-ingestion stage.

    Instances are frozen; derive a modified copy with
    ``dataclasses.replace`` instead of assigning to a field.
    """

    # Working directory of the ingestion stage.
    root_dir: Path
    # Location of the source archive (a file path, despite the name).
    source_dir: Path
    # Path of the archive that gets opened for extraction.
    local_data_file: Path
    # Directory that receives the extracted files.
    unzip_dir: Path
|
10 |
+
|
src/benglasummarization/pipeline/stage01_data_ingestion.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.benglasummarization.components.data_ingestion import DataIngestion
|
2 |
+
from src.benglasummarization.config.configuration import ConfigurationManager
|
3 |
+
from src.benglasummarization.config.configuration import ConfigurationManager
|
4 |
+
|
5 |
+
class DataIngestionPipeline:
    """Stage 01: wire configuration into ``DataIngestion`` and run it."""

    def __init__(self):
        """No state is kept; everything is built inside ``main``."""
        pass

    def main(self):
        """Run the ingestion stage end to end.

        Builds a ``ConfigurationManager``, asks it for the data-ingestion
        config, then locates the source archive and extracts it.
        """
        manager = ConfigurationManager()
        ingestion_settings = manager.get_data_ingestion_config()

        ingestion = DataIngestion(config=ingestion_settings)
        # Order matters: load_file sets the archive path that
        # extract_zip_file opens.
        ingestion.load_file()
        ingestion.extract_zip_file()
|