File size: 5,104 Bytes
d777f1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import time
import mammoth
from bs4 import BeautifulSoup
from docx.api import Document
import re

def extract_and_replace_docx_tables(docx_file, chunk_marker):
    """Swap every table of a .docx document for a paragraph of chunked HTML.

    The file is loaded with python-docx and, independently, converted to HTML
    with mammoth. The HTML tables are turned into chunked, fenced HTML strings
    by ``get_html_table_chunks``; each string becomes the text of a paragraph
    that replaces the next remaining table of the document, in order.

    Args:
        docx_file: Location of the .docx file. It is passed both to
            ``Document`` and to ``open``.
        chunk_marker: Marker text forwarded to ``get_html_table_chunks``.

    Returns:
        A ``(document, html_chunked_tables)`` tuple: the modified python-docx
        document and the list of chunked HTML strings, one per HTML table.
    """
    # Upper bound on loop iterations once replacements start failing.
    max_attempts = 200

    document = Document(docx_file)
    total_tables = len(document.tables)

    # The stream gets its own name so the ``docx_file`` argument is not rebound.
    with open(docx_file, "rb") as docx_stream:
        html = mammoth.convert_to_html(docx_stream).value

    tables = extract_html_tables(html)
    html_chunked_tables = get_html_table_chunks(tables, chunk_marker=chunk_marker)

    # A single scratch document is enough to manufacture the paragraph
    # elements; they are moved into ``document`` when they replace a table.
    scratch_document = Document()
    html_tables = []
    for table in html_chunked_tables:
        html_table = scratch_document.add_paragraph(table)._element
        # NOTE(review): presumably requests left alignment (0) — confirm
        # against the python-docx version in use.
        html_table.alignment = 0
        html_tables.append(html_table)

    track = 0
    while len(document.tables) > 0:
        track += 1

        if not html_tables:
            # More document tables than HTML chunks: nothing left to insert,
            # so report once instead of failing on every remaining attempt.
            print(f'{track} of {total_tables} | Fail: no replacement paragraph left')
            break

        try:
            document.element.body.replace(document.tables[0]._element, html_tables[0])
        except Exception as e:
            print(f'{track} of {total_tables} | Fail: {e}')
            if track >= max_attempts:
                break
        else:
            # Consume the replacement only after it was inserted successfully.
            del html_tables[0]

    return document, html_chunked_tables

def extract_html_tables(html):
    """Parse *html* with the ``html.parser`` backend and return all ``<table>`` elements."""
    return BeautifulSoup(html, 'html.parser').find_all('table')

def _split_table_lines(table_html):
    """Insert line breaks around rows and sections of *table_html* and return its lines."""
    for tag, replacement in (
        ('<table>', '<table>\n'),
        ('<tr>', '\n<tr>\n'),
        ('</tr>', '\n</tr>'),
        ('</thead>', '\n</thead>\n'),
        ('</tbody>', '\n</tbody>\n'),
    ):
        table_html = table_html.replace(tag, replacement)

    # Treat every newline flavour as a line break, as reading the text back
    # from a file in universal-newlines mode would.
    table_html = table_html.replace('\r\n', '\n').replace('\r', '\n')

    lines = table_html.split('\n')
    if lines[-1] == '':
        # A trailing newline terminates the last line; it does not start a new one.
        lines.pop()
    return lines


def _flag_chunk_boundary(line, chunk_marker, html_chunk_marker):
    """Strip the bold marker from *line* and append the plain marker as a split point."""
    marker_at = line.find(html_chunk_marker)

    # Only look for a wrapping <p>…</p> when the bold marker is present and
    # there is room for '<p>' before it; otherwise the slice start would go
    # negative and wrap around to the end of the line.
    if marker_at >= len('<p>'):
        wrapped = line[marker_at - len('<p>'):marker_at + len(html_chunk_marker) + len('</p>')]
        if wrapped.startswith('<p>') and wrapped.endswith('</p>'):
            # The marker sits in its own paragraph: drop the paragraph tags
            # of the whole line.
            line = line.replace('<p>', '').replace('</p>', '')

    line = line.replace(html_chunk_marker, '')
    line = line.replace(' </td>', '</td>').strip()
    return line + chunk_marker


def get_html_table_chunks(tables, chunk_marker):
    """Split each HTML table at *chunk_marker* and return fenced HTML per table.

    Every table is rendered with ``str``, broken into lines around its rows,
    and each line containing *chunk_marker* becomes a split point. The pieces
    after the first are prefixed with the opening table line, the first row
    of the table and the opening ``<tbody>`` (when the table has one); every
    piece except the last is closed with ``</tbody>`` (when present) and the
    closing table line. Each piece is wrapped with ``wrap_signal`` and the
    pieces of one table are joined with the marker on its own paragraph.

    Args:
        tables: Iterable of table objects whose ``str()`` is their HTML, such
            as the result of ``extract_html_tables``.
        chunk_marker: Text that marks a chunk boundary. In the HTML it is
            expected inside ``<strong>…</strong>``.

    Returns:
        A list with one string per table.
    """
    html_chunk_marker = '<strong>' + chunk_marker + '</strong>'
    separator = f'\n\n{chunk_marker}\n\n'

    html_table_chunks = []

    for table_soup in tables:
        table_html = str(table_soup)
        lines = _split_table_lines(table_html)

        # First and last line carry the opening and closing table markup.
        start_table = lines[0].strip()
        end_table = lines[-1].strip()

        start_tbody = '<tbody>' if '<tbody>' in table_html else ''
        end_tbody = '</tbody>' if '</tbody>' in table_html else ''

        processed_lines = []
        for line in lines:
            if chunk_marker in line:
                line = _flag_chunk_boundary(line, chunk_marker, html_chunk_marker)
            processed_lines.append(line.strip())

        html_chunks = ''.join(processed_lines).split(chunk_marker)

        if len(html_chunks) == 1:
            # No split point: the table is already complete, so closing it
            # again would duplicate the end tags.
            processed_html_chunks = list(html_chunks)
        else:
            first_chunk = html_chunks[0] + end_tbody + end_table

            # The first row of the table is repeated at the top of every
            # later chunk.
            header_source = first_chunk.replace(end_table, '')
            start = header_source.find('<tr>')
            end = header_source.find('</tr>') + len('</tr>')
            headers = header_source[start:end]

            prefix = start_table + headers + start_tbody

            processed_html_chunks = [first_chunk]
            # Chunks are classified by position, not by comparing their text
            # with the last chunk, so identical chunks are handled correctly.
            processed_html_chunks.extend(
                prefix + chunk + end_tbody + end_table
                for chunk in html_chunks[1:-1]
            )
            # The last chunk still ends with the table's own closing tags.
            processed_html_chunks.append(prefix + html_chunks[-1])

        html_table_chunks.append(
            separator.join(
                wrap_signal(html_chunk, signal_type='html')
                for html_chunk in processed_html_chunks
            )
        )

    return html_table_chunks

def wrap_signal(data, signal_type):
    """Return *data* inside a Markdown code fence labelled with *signal_type*."""
    opening_fence = f"```{signal_type}"
    return f"{opening_fence}\n{data}\n```"