Chatbot_Mining

Sleeping

App Files Files Community

Chatbot_Mining / utils /process_tables.py

Nguyen17

update

d777f1b 10 months ago

raw

history blame contribute delete

5.1 kB

	import time
	import mammoth
	from bs4 import BeautifulSoup
	from docx.api import Document
	import re

	def extract_and_replace_docx_tables(docx_file, chunk_marker):
	    """Swap each table of a .docx file for a paragraph holding its chunked HTML.

	    The document is opened with python-docx, converted to HTML with mammoth,
	    and every ``<table>`` in that HTML is turned into fenced HTML chunks by
	    ``get_html_table_chunks``.  The n-th table of the python-docx document is
	    then replaced by a paragraph containing the n-th chunked HTML string.

	    Args:
	        docx_file: Path of the .docx file (it is passed to both ``Document``
	            and ``open``).
	        chunk_marker: Marker text forwarded to ``get_html_table_chunks``.

	    Returns:
	        A ``(document, html_chunked_tables)`` tuple: the modified python-docx
	        document and the list of chunked HTML strings, one per HTML table.

	    Replacement is best-effort: a failure is printed and stops the loop,
	    leaving the remaining tables in the document untouched.
	    """
	    document = Document(docx_file)
	    docx_tables = document.tables
	    total_tables = len(docx_tables)

	    # A distinct name for the handle keeps the path argument from being shadowed.
	    with open(docx_file, "rb") as docx_stream:
	        html = mammoth.convert_to_html(docx_stream).value

	    # NOTE(review): BeautifulSoup's find_all presumably also yields tables nested
	    # inside cells, while python-docx documents ``Document.tables`` as top-level
	    # only; documents with nested tables may therefore pair up incorrectly --
	    # confirm against real inputs.
	    tables = extract_html_tables(html)
	    html_chunked_tables = get_html_table_chunks(tables, chunk_marker=chunk_marker)

	    # One scratch document is enough to manufacture all replacement paragraphs;
	    # each paragraph element is moved into ``document`` when it is substituted.
	    scratch_document = Document()
	    html_tables = []
	    for table in html_chunked_tables:
	        html_table = scratch_document.add_paragraph(table)._element
	        html_table.alignment = 0
	        html_tables.append(html_table)

	    body = document.element.body
	    for track, docx_table in enumerate(docx_tables, start=1):
	        if track > len(html_tables):
	            # Fewer HTML tables than docx tables: retrying cannot help, so
	            # report the shortfall once and leave the rest as they are.
	            print(f'{track} of {total_tables} | Fail: no HTML table left to substitute')
	            break
	        try:
	            body.replace(docx_table._element, html_tables[track - 1])
	        except Exception as e:
	            # Repeating the same call would fail the same way; stop here.
	            print(f'{track} of {total_tables} | Fail: {e}')
	            break

	    return document, html_chunked_tables

	def extract_html_tables(html):
	    """Parse *html* with ``html.parser`` and return its ``<table>`` elements.

	    Args:
	        html: HTML markup to parse.

	    Returns:
	        The result of BeautifulSoup's ``find_all('table')`` on the parsed markup.
	    """
	    return BeautifulSoup(html, 'html.parser').find_all('table')

	def get_html_table_chunks(tables, chunk_marker):
	    """Split each HTML table into fenced chunks at every chunk marker.

	    Each table is rendered with ``str()``, broken into lines around its row,
	    ``thead`` and ``tbody`` tags, and cut wherever a line contains
	    *chunk_marker*.  The first chunk receives the table's closing tags; every
	    later chunk is prefixed with the opening table tag, the first row of the
	    first chunk (reused as a header row) and the opening ``<tbody>`` tag when
	    the table has one.  Chunks are wrapped with ``wrap_signal(..., 'html')``
	    and joined with ``\\n\\n<chunk_marker>\\n\\n``.

	    Args:
	        tables: Iterable of objects whose ``str()`` is the table's HTML.
	        chunk_marker: Marker text; in the HTML it is expected as
	            ``<strong>chunk_marker</strong>``.

	    Returns:
	        A list with one string per input table.  A table that contains no
	        marker yields a single fenced block holding the table as-is (with
	        per-line whitespace stripped).
	    """
	    html_chunk_marker = '<strong>' + chunk_marker + '</strong>'
	    wrapped_marker = '<p>' + html_chunk_marker + '</p>'
	    separator = f'\n\n{chunk_marker}\n\n'

	    html_table_chunks = []
	    for table_soup in tables:
	        html_table_string = (
	            str(table_soup)
	            .replace('<table>', '<table>\n')
	            .replace('<tr>', '\n<tr>\n')
	            .replace('</tr>', '\n</tr>')
	            .replace('</thead>', '\n</thead>\n')
	            .replace('</tbody>', '\n</tbody>\n')
	        )

	        # Split in memory on \n, \r\n and \r (what a text-mode file round trip
	        # would do) instead of writing a scratch file to the working directory.
	        lines = re.split(r'\r\n|\r|\n', html_table_string)
	        if lines and lines[-1] == '':
	            lines.pop()  # a trailing newline does not start another line

	        start_table = lines[0].strip()
	        end_table = lines[-1].strip()
	        start_tbody = '<tbody>' if '<tbody>' in html_table_string else ''
	        end_tbody = '</tbody>' if '</tbody>' in html_table_string else ''

	        processed_lines = []
	        for line in lines:
	            if chunk_marker in line:
	                start_index = line.find(html_chunk_marker)
	                is_wrapped = (
	                    start_index >= len('<p>')
	                    and line.startswith(wrapped_marker, start_index - len('<p>'))
	                )
	                if is_wrapped:
	                    # NOTE: this removes every <p>/</p> on the line, not only
	                    # the pair around the marker.
	                    line = line.replace('<p>', '').replace('</p>', '')

	                line = line.replace(html_chunk_marker, '')
	                line = line.replace(' </td>', '</td>').strip()
	                line += chunk_marker  # plain marker = split point below
	            processed_lines.append(line.strip())

	        html_chunks = ''.join(processed_lines).split(chunk_marker)
	        last_index = len(html_chunks) - 1  # decide "last" by position, not by value

	        headers = ''
	        processed_html_chunks = []
	        for index, chunk in enumerate(html_chunks):
	            if index == 0:
	                # A table without any marker is already complete; appending the
	                # closing tags again would duplicate them.
	                if last_index > 0:
	                    chunk += end_tbody + end_table
	                    headers = _first_table_row(chunk.replace(end_table, ''))
	            elif index == last_index:
	                # The final chunk still carries the table's own closing tags.
	                chunk = start_table + headers + start_tbody + chunk
	            else:
	                chunk = start_table + headers + start_tbody + chunk + end_tbody + end_table
	            processed_html_chunks.append(chunk)

	        html_table_chunks.append(
	            separator.join(
	                wrap_signal(html_chunk, signal_type='html')
	                for html_chunk in processed_html_chunks
	            )
	        )

	    return html_table_chunks


	def _first_table_row(html):
	    """Return the first ``<tr>...</tr>`` span of *html*, or ``''`` if there is none."""
	    start = html.find('<tr>')
	    if start == -1:
	        return ''
	    end = html.find('</tr>', start)
	    if end == -1:
	        return ''
	    return html[start:end + len('</tr>')]

	def wrap_signal(data, signal_type):
	    """Return *data* enclosed in a Markdown code fence tagged *signal_type*.

	    Args:
	        data: Content placed between the fence lines.
	        signal_type: Label written directly after the opening fence.

	    Returns:
	        The string "```<signal_type>", a newline, *data*, a newline, "```".
	    """
	    fence = "```"
	    opening = f"{fence}{signal_type}"
	    return f"{opening}\n{data}\n{fence}"