# Page-extraction residue, not part of the module: Spaces status "Sleeping";
# file size 5,104 bytes; revision d777f1b; line-number gutter (1-141) omitted.
import time
import mammoth
from bs4 import BeautifulSoup
from docx.api import Document
import re
def extract_and_replace_docx_tables(docx_file, chunk_marker, max_attempts=200):
    """Replace each table of a .docx with a paragraph holding its chunked HTML.

    The file is loaded twice: once with python-docx (the document that gets
    modified and returned) and once through mammoth, whose HTML output is
    the source of the table markup. The HTML tables are chunked with
    ``get_html_table_chunks`` and each resulting string is placed in a new
    paragraph; the document's tables are then swapped, first to last, for
    those paragraphs in order.

    Args:
        docx_file: Path of the .docx file. It is handed to ``Document`` and
            to ``open``, so it must be something ``open`` accepts.
        chunk_marker: Marker text forwarded to ``get_html_table_chunks``.
        max_attempts: Number of loop iterations after which a failed
            replacement stops the loop instead of being retried.

    Returns:
        A tuple ``(document, html_chunked_tables)``: the modified python-docx
        document and the list of chunked HTML strings, one per HTML table.
    """
    document = Document(docx_file)
    total_tables = len(document.tables)

    # The handle gets its own name so the ``docx_file`` argument is not
    # rebound to a (soon closed) file object.
    with open(docx_file, "rb") as docx_stream:
        result = mammoth.convert_to_html(docx_stream)
    html = result.value

    tables = extract_html_tables(html)
    html_chunked_tables = get_html_table_chunks(tables, chunk_marker=chunk_marker)

    # Build one stand-alone paragraph element per chunked table; a scratch
    # document is used only as a factory for the element.
    html_tables = []
    for table in html_chunked_tables:
        temp_document = Document()
        html_table = temp_document.add_paragraph(table)._element
        html_table.alignment = 0  # presumably left alignment - TODO confirm
        html_tables.append(html_table)

    track = 0
    while document.tables:
        track += 1
        try:
            html_table = html_tables[0]
            document.element.body.replace(document.tables[0]._element, html_table)
            html_tables.pop(0)
        except Exception as e:
            # Best effort: report and retry. A successful replacement always
            # shrinks ``document.tables``, so only repeated failures can keep
            # the loop alive; the cap is therefore checked on this path.
            print(f'{track} of {total_tables} | Fail: {e}')
            if track >= max_attempts:
                break
    return document, html_chunked_tables
def extract_html_tables(html):
    """Parse ``html`` with BeautifulSoup's 'html.parser' and return all ``<table>`` elements.

    The result is whatever ``find_all('table')`` yields for the parsed
    document, in document order.
    """
    return BeautifulSoup(html, 'html.parser').find_all('table')
def get_html_table_chunks(tables, chunk_marker):
    """Split each HTML table into fenced chunks wherever ``chunk_marker`` occurs.

    For every table the markup is broken into lines around its structural
    tags, the lines that contain ``chunk_marker`` are cleaned up and used as
    split points, and every piece is turned back into a table of its own by
    re-adding the opening tag, the first row of the first piece (used as the
    header row) and the closing tags. A table without any marker is passed
    through as a single, unmodified chunk.

    Args:
        tables: Iterable of objects whose ``str()`` is the table's HTML
            (the caller passes the result of ``extract_html_tables``).
        chunk_marker: Text that marks a split point. It is expected in bold
            (``<strong>marker</strong>``), optionally alone in a ``<p>``.

    Returns:
        A list with one string per table. Each string holds that table's
        chunks, each wrapped by ``wrap_signal(..., signal_type='html')`` and
        separated by ``"\\n\\n" + chunk_marker + "\\n\\n"``.
    """
    html_chunk_marker = '<strong>' + chunk_marker + '</strong>'
    paragraph_marker = '<p>' + html_chunk_marker + '</p>'
    separator = f'\n\n{chunk_marker}\n\n'
    # Line breaks around the structural tags leave the cells of each row on
    # a line of their own, so a marker can be located per row.
    line_breaks = (
        ('<table>', '<table>\n'),
        ('<tr>', '\n<tr>\n'),
        ('</tr>', '\n</tr>'),
        ('</thead>', '\n</thead>\n'),
        ('</tbody>', '\n</tbody>\n'),
    )

    html_table_chunks = []
    for table_soup in tables:
        html_table_string = str(table_soup)
        for tag, replacement in line_breaks:
            html_table_string = html_table_string.replace(tag, replacement)

        # Split in memory; no scratch file is written to the working directory.
        lines = html_table_string.split('\n')
        if len(lines) > 1 and not lines[-1]:
            lines.pop()  # a trailing newline does not start another line

        # The first and last line double as the table's opening and closing
        # markup (the last line may carry more than just ``</table>``).
        start_table = lines[0].strip()
        end_table = lines[-1].strip()
        start_tbody = '<tbody>' if '<tbody>' in html_table_string else ''
        end_tbody = '</tbody>' if '</tbody>' in html_table_string else ''

        processed_lines = []
        for line in lines:
            if chunk_marker in line:
                marker_at = line.find(html_chunk_marker)
                # When the first bold marker sits alone in a paragraph, all
                # paragraph tags on the line are dropped.
                if marker_at >= len('<p>') and line.startswith(
                        paragraph_marker, marker_at - len('<p>')):
                    line = line.replace('<p>', '').replace('</p>', '')
                line = line.replace(html_chunk_marker, '')
                line = line.replace(' </td>', '</td>').strip()
                # Re-append the bare marker as the split point.
                # NOTE(review): it lands before the row's closing </tr>, so
                # that tag becomes the start of the following chunk.
                line += chunk_marker
            processed_lines.append(line.strip())

        html_chunks = ''.join(processed_lines).split(chunk_marker)
        last_index = len(html_chunks) - 1

        headers = ''
        processed_html_chunks = []
        for index, chunk in enumerate(html_chunks):
            if last_index == 0:
                # No marker in this table: it is already complete, so no
                # closing tags are appended a second time.
                pass
            elif index == 0:
                chunk += end_tbody + end_table
                # The first row of the first chunk is repeated as the header
                # of every later chunk. The closing markup is removed before
                # searching so an appended </tr> is not taken for a row end.
                first_chunk = chunk.replace(end_table, '')
                row_start = first_chunk.find('<tr>')
                row_end = first_chunk.find('</tr>', row_start) if row_start != -1 else -1
                if row_end != -1:
                    headers = first_chunk[row_start:row_end + len('</tr>')]
            elif index == last_index:
                # The last chunk still ends with the table's own closing tags.
                chunk = start_table + headers + start_tbody + chunk
            else:
                chunk = start_table + headers + start_tbody + chunk + end_tbody + end_table
            processed_html_chunks.append(chunk)

        fenced_chunks = [
            wrap_signal(html_chunk, signal_type='html')
            for html_chunk in processed_html_chunks
        ]
        html_table_chunks.append(separator.join(fenced_chunks))
    return html_table_chunks
def wrap_signal(data, signal_type):
    """Wrap ``data`` in a Markdown code fence labelled with ``signal_type``.

    Args:
        data: Text to place inside the fence.
        signal_type: Label written directly after the opening backticks.

    Returns:
        The string "```<signal_type>", a newline, ``data``, a newline and
        the closing "```".
    """
    return f"```{signal_type}\n{data}\n```"