Nguyen17 committed on
Commit
36703bb
·
1 Parent(s): 4873385
Files changed (1) hide show
  1. app.py +97 -4
app.py CHANGED
@@ -13,9 +13,102 @@ from llama_index.core import Prompt, Settings, VectorStoreIndex
13
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
14
 
15
  import gradio as gr
16
- from . import prepare_data
17
- from prepare_data import *
18
- from prepare_data import main
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
 
@@ -186,7 +279,7 @@ def main1(config_path):
186
  logger.exception("An error occurred: %s", e)
187
 
188
  if __name__ == "__main__":
189
- main()
190
  parser = argparse.ArgumentParser(description='Process some configurations.')
191
  parser.add_argument('--config', type=str, default='config.yaml', help='Path to the configuration file')
192
  args = parser.parse_args()
 
13
  from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
14
 
15
  import gradio as gr
16
+ import os
17
+ import shutil
18
+ from pathlib import Path
19
+ from docx.api import Document
20
+ from types import SimpleNamespace
21
+ from llama_index.core import SimpleDirectoryReader
22
+ from utils.process_tables import extract_and_replace_docx_tables
23
+
24
+
25
+
26
# Configure logging: INFO and above goes both to "script.log" and to the
# default stream handler, using a timestamp/level/message line format.
logging.basicConfig(
    handlers=[logging.FileHandler("script.log"), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by every function below.
logger = logging.getLogger(__name__)
36
+
37
def load_config(file_path='config.yaml'):
    """Read a YAML config file and expose it through attribute access.

    Top-level values that are mappings are wrapped in ``SimpleNamespace``
    (one level deep only); every other value is kept as parsed. The result
    is itself a ``SimpleNamespace`` built from the top-level keys.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        A ``SimpleNamespace`` holding the parsed configuration.

    Raises:
        Exception: Any failure while opening, parsing or wrapping the file
            is logged and then re-raised unchanged.
    """
    logger.info('Loading config file ...')
    try:
        with open(file_path, 'r') as stream:
            parsed = yaml.safe_load(stream)
        wrapped = {
            key: SimpleNamespace(**value) if isinstance(value, dict) else value
            for key, value in parsed.items()
        }
        logger.info('Config file loaded successfully.')
        return SimpleNamespace(**wrapped)
    except Exception as e:
        logger.error(f'Error loading config file: {e}')
        raise
50
+
51
+ cfg = load_config()
52
+
53
def process_docx_files(data_dir=Path(cfg.dataset.data_dir),
                       processed_data_dir=Path(cfg.dataset.processed_data_dir),
                       chunk_marker=cfg.dataset.chunk_marker):
    """Process every DOCX file in ``data_dir`` and save the results.

    Each file in ``data_dir`` whose name ends with ``.docx`` is passed to
    ``extract_and_replace_docx_tables``; the document it returns is saved
    into ``processed_data_dir`` as ``processed_<original file name>``.

    Args:
        data_dir: Directory holding the source DOCX files (``str`` or
            ``Path``).
        processed_data_dir: Output directory (``str`` or ``Path``). If it
            already exists it is deleted with all of its contents, then it
            is created again empty.
        chunk_marker: Forwarded unchanged to
            ``extract_and_replace_docx_tables``.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        # Accept plain strings as well as Path objects; the "/" joins below
        # need Path operands.
        data_dir = Path(data_dir)
        processed_data_dir = Path(processed_data_dir)

        # Start from an empty output directory. The previous guard was
        # inverted: it called rmtree only when the directory did NOT exist
        # (which raises FileNotFoundError) and never created the directory,
        # so saving had no valid target on a fresh run.
        if processed_data_dir.exists():
            shutil.rmtree(processed_data_dir)
        processed_data_dir.mkdir(parents=True, exist_ok=True)

        docx_files = [file for file in os.listdir(data_dir) if file.endswith('.docx')]
        logger.info(f'Found {len(docx_files)} DOCX files to process.')

        for fname in docx_files:
            # The second return value is not needed here.
            document, _html_chunked_tables = extract_and_replace_docx_tables(
                docx_file=data_dir / fname,
                chunk_marker=chunk_marker
            )
            document.save(processed_data_dir / f'processed_{fname}')
            logger.info(f'Processed and saved {fname}')
    except Exception as e:
        logger.error(f'Error processing DOCX files: {e}')
        raise
73
+
74
def load_processed_data(processed_data_dir=Path(cfg.dataset.processed_data_dir)):
    """Load the processed files from ``processed_data_dir``.

    A ``SimpleDirectoryReader`` is built over the directory, restricted to
    the extension given by ``cfg.dataset.required_exts``, and its
    ``load_data()`` result is returned.

    Args:
        processed_data_dir: Directory to read from.

    Returns:
        Whatever ``SimpleDirectoryReader.load_data()`` returns.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        reader = SimpleDirectoryReader(
            input_dir=processed_data_dir,
            required_exts=[cfg.dataset.required_exts],
        )
        loaded = reader.load_data()
        logger.info('Processed data loaded successfully.')
        return loaded
    except Exception as e:
        logger.error(f'Error loading processed data: {e}')
        raise
85
+
86
def get_chunks(documents, chunk_marker=cfg.dataset.chunk_marker):
    """Split the text of each document on ``chunk_marker``.

    Every piece is stripped of surrounding whitespace; pieces that are
    empty after stripping are dropped. Order follows the documents and the
    position of each piece within its document.

    Args:
        documents: Iterable of objects exposing a ``text`` string.
        chunk_marker: Separator string the text is split on.

    Returns:
        A flat list of non-empty, stripped chunk strings.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    try:
        chunks = []
        for doc in documents:
            for piece in doc.text.split(chunk_marker):
                cleaned = piece.strip()
                if cleaned:
                    chunks.append(cleaned)
        logger.info(f'Extracted {len(chunks)} chunks from documents.')
        return chunks
    except Exception as e:
        logger.error(f'Error extracting chunks: {e}')
        raise
94
+
95
def main_prepare():
    """Run the document preparation pipeline end to end.

    Steps, in order: process the DOCX files, load the processed data,
    split it into chunks, and write the chunks to
    ``processed_chunks.pickle`` as a one-column (``chunk``) DataFrame.

    Raises:
        Exception: Any failure in any step is logged and then re-raised
            unchanged.
    """
    logger.info('Starting document processing ...')
    try:
        process_docx_files()

        chunks = get_chunks(load_processed_data())
        logger.info(f'Total number of chunks: {len(chunks)}')

        pd.DataFrame({'chunk': chunks}).to_pickle('processed_chunks.pickle')
        logger.info('All chunks saved to processed_chunks.pickle')
    except Exception as e:
        logger.error(f'Error in main processing: {e}')
        raise
111
+
112
 
113
 
114
 
 
279
  logger.exception("An error occurred: %s", e)
280
 
281
  if __name__ == "__main__":
282
+ main_prepare()
283
  parser = argparse.ArgumentParser(description='Process some configurations.')
284
  parser.add_argument('--config', type=str, default='config.yaml', help='Path to the configuration file')
285
  args = parser.parse_args()