Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
@@ -13,9 +13,102 @@ from llama_index.core import Prompt, Settings, VectorStoreIndex
|
|
13 |
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
|
14 |
|
15 |
import gradio as gr
|
16 |
-
|
17 |
-
|
18 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
|
21 |
|
@@ -186,7 +279,7 @@ def main1(config_path):
|
|
186 |
logger.exception("An error occurred: %s", e)
|
187 |
|
188 |
if __name__ == "__main__":
|
189 |
-
|
190 |
parser = argparse.ArgumentParser(description='Process some configurations.')
|
191 |
parser.add_argument('--config', type=str, default='config.yaml', help='Path to the configuration file')
|
192 |
args = parser.parse_args()
|
|
|
13 |
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
|
14 |
|
15 |
import gradio as gr
|
16 |
+
import os
|
17 |
+
import shutil
|
18 |
+
from pathlib import Path
|
19 |
+
from docx.api import Document
|
20 |
+
from types import SimpleNamespace
|
21 |
+
from llama_index.core import SimpleDirectoryReader
|
22 |
+
from utils.process_tables import extract_and_replace_docx_tables
|
23 |
+
|
24 |
+
|
25 |
+
|
26 |
+
# Logging setup: records at INFO and above are sent to the file "script.log"
# and to a default StreamHandler, both using the same timestamped format.
_log_handlers = [
    logging.FileHandler("script.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)
|
36 |
+
|
37 |
+
def load_config(file_path='config.yaml'):
    """Load a YAML configuration file into attribute-style namespaces.

    Each top-level value that is a mapping is wrapped in a ``SimpleNamespace``
    so it can be read as ``cfg.section.key``. Only the first level is
    converted; mappings nested deeper stay plain dicts.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        SimpleNamespace: One attribute per top-level key of the file.

    Raises:
        ValueError: If the file is empty or its top level is not a mapping.
        Exception: Any other error raised while opening or parsing the file
            is logged and re-raised unchanged.
    """
    logger.info('Loading config file ...')
    try:
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            raw = yaml.safe_load(file)

        # safe_load yields None for an empty document and a scalar/list for
        # non-mapping content; fail with a clear message instead of an
        # AttributeError on ``.items()``.
        if not isinstance(raw, dict):
            raise ValueError(
                f'Config file {file_path!r} must contain a top-level mapping, '
                f'got {type(raw).__name__}'
            )

        sections = {
            key: SimpleNamespace(**value) if isinstance(value, dict) else value
            for key, value in raw.items()
        }
        logger.info('Config file loaded successfully.')
        return SimpleNamespace(**sections)
    except Exception as e:
        logger.error(f'Error loading config file: {e}')
        raise
|
50 |
+
|
51 |
+
# Load the configuration once at import time. The default arguments of the
# functions defined after this line are computed from `cfg` when those
# functions are defined, so this statement must run before them.
cfg = load_config()
|
52 |
+
|
53 |
+
def process_docx_files(data_dir=Path(cfg.dataset.data_dir),
                       processed_data_dir=Path(cfg.dataset.processed_data_dir),
                       chunk_marker=cfg.dataset.chunk_marker):
    """Run table extraction over every ``.docx`` file in ``data_dir``.

    Each file is passed to ``extract_and_replace_docx_tables`` and the
    returned document is saved as ``processed_<original name>`` inside
    ``processed_data_dir``.

    Args:
        data_dir: Directory containing the source ``.docx`` files
            (``str`` or ``Path``).
        processed_data_dir: Output directory (``str`` or ``Path``). Any
            existing directory at this path is deleted and recreated, so
            results from a previous run never mix with the current one.
        chunk_marker: Value forwarded unchanged to
            ``extract_and_replace_docx_tables``.

    Raises:
        Exception: Any error raised while listing, processing or saving is
            logged and re-raised unchanged.
    """
    try:
        # Accept plain strings as well as Path objects; the `/` joins below
        # need Path operands.
        data_dir = Path(data_dir)
        processed_data_dir = Path(processed_data_dir)

        # Start from an empty output directory. The earlier check was
        # inverted: it called rmtree only when the directory was missing
        # (raising FileNotFoundError) and never created it before saving.
        if processed_data_dir.exists():
            shutil.rmtree(processed_data_dir)
        processed_data_dir.mkdir(parents=True, exist_ok=True)

        docx_files = [file for file in os.listdir(data_dir) if file.endswith('.docx')]
        logger.info(f'Found {len(docx_files)} DOCX files to process.')

        for fname in docx_files:
            # The helper returns two values; only the document is used here.
            document, _html_chunked_tables = extract_and_replace_docx_tables(
                docx_file=data_dir / fname,
                chunk_marker=chunk_marker
            )
            document.save(processed_data_dir / f'processed_{fname}')
            logger.info(f'Processed and saved {fname}')
    except Exception as e:
        logger.error(f'Error processing DOCX files: {e}')
        raise
|
73 |
+
|
74 |
+
def load_processed_data(processed_data_dir=Path(cfg.dataset.processed_data_dir)):
    """Load the processed files from ``processed_data_dir``.

    Builds a ``SimpleDirectoryReader`` over the directory, restricted to the
    extension configured in ``cfg.dataset.required_exts``, and returns
    whatever its ``load_data()`` call produces.

    Args:
        processed_data_dir: Directory to read from.

    Returns:
        The result of ``SimpleDirectoryReader.load_data()``.

    Raises:
        Exception: Any error raised while reading is logged and re-raised.
    """
    try:
        # The configured value is wrapped in a one-element list.
        # NOTE(review): assumes cfg.dataset.required_exts is a single
        # extension string rather than a list - confirm against config.yaml.
        reader = SimpleDirectoryReader(
            input_dir=processed_data_dir,
            required_exts=[cfg.dataset.required_exts],
        )
        loaded_docs = reader.load_data()
        logger.info('Processed data loaded successfully.')
        return loaded_docs
    except Exception as e:
        logger.error(f'Error loading processed data: {e}')
        raise
|
85 |
+
|
86 |
+
def get_chunks(documents, chunk_marker=cfg.dataset.chunk_marker):
    """Split each document's text on ``chunk_marker`` into stripped chunks.

    Args:
        documents: Iterable of objects exposing a ``text`` string attribute.
        chunk_marker: Separator string passed to ``str.split``.

    Returns:
        list[str]: Every non-empty piece, with surrounding whitespace
        removed, in document order.

    Raises:
        Exception: Any error raised while splitting is logged and re-raised.
    """
    try:
        pieces = []
        for document in documents:
            for fragment in document.text.split(chunk_marker):
                trimmed = fragment.strip()
                # Drop pieces that are empty or whitespace-only.
                if trimmed:
                    pieces.append(trimmed)
        logger.info(f'Extracted {len(pieces)} chunks from documents.')
        return pieces
    except Exception as e:
        logger.error(f'Error extracting chunks: {e}')
        raise
|
94 |
+
|
95 |
+
def main_prepare():
    """Run the full preparation pipeline and pickle the resulting chunks.

    Steps, in order: process the DOCX files, load the processed output,
    split it into chunks, and write a one-column (``chunk``) DataFrame to
    ``processed_chunks.pickle`` in the current working directory.

    Raises:
        Exception: Any error from a pipeline step is logged and re-raised.
    """
    logger.info('Starting document processing ...')
    try:
        process_docx_files()

        chunks = get_chunks(load_processed_data())
        logger.info(f'Total number of chunks: {len(chunks)}')

        pd.DataFrame({'chunk': chunks}).to_pickle('processed_chunks.pickle')
        logger.info('All chunks saved to processed_chunks.pickle')
    except Exception as e:
        logger.error(f'Error in main processing: {e}')
        raise
|
111 |
+
|
112 |
|
113 |
|
114 |
|
|
|
279 |
logger.exception("An error occurred: %s", e)
|
280 |
|
281 |
if __name__ == "__main__":
|
282 |
+
main_prepare()
|
283 |
parser = argparse.ArgumentParser(description='Process some configurations.')
|
284 |
parser.add_argument('--config', type=str, default='config.yaml', help='Path to the configuration file')
|
285 |
args = parser.parse_args()
|