Chatbot_Mining

Sleeping

File size: 5,104 Bytes

d777f1b

import time
import mammoth
from bs4 import BeautifulSoup
from docx.api import Document
import re

def extract_and_replace_docx_tables(docx_file, chunk_marker):
    start_time = time.time()  # Record start time

    document = Document(docx_file)
    docx_tables = document.tables
    total_tables = len(document.tables)

    with open(docx_file, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
        html = result.value 

    tables = extract_html_tables(html)
    
    html_chunked_tables = get_html_table_chunks(tables, chunk_marker=chunk_marker)
    
    
    html_tables = []
    for table in html_chunked_tables:
        temp_document = Document()
        html_table = temp_document.add_paragraph(table)._element
        html_table.alignment = 0
        html_tables.append(html_table)


    track = 0    
    while len(document.tables) > 0:
        track += 1
        try:
            html_table = html_tables[0]
            document.element.body.replace(document.tables[0]._element, html_table)
            html_tables.remove(html_table)
            end_time = time.time()  # Record end time
            # print(f'{track} of {total_tables} | Success | Time: {end_time - start_time:.2f} seconds')

        except Exception as e:
            print(f'{track} of {total_tables} | Fail: {e}')
            if track >= 200:
                break
            
    return document, html_chunked_tables

def extract_html_tables(html):
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table')
    return tables

def get_html_table_chunks(tables, chunk_marker):
    
    html_chunk_marker = '<strong>' + chunk_marker + '</strong>'

    html_table_chunks = []
    
    for table_soup in tables:

        html_table_string = str(table_soup)
        html_table_string = html_table_string.replace('<table>', '<table>\n')
        html_table_string = html_table_string.replace('<tr>', '\n<tr>\n')
        html_table_string = html_table_string.replace('</tr>', '\n</tr>')
        html_table_string = html_table_string.replace('<thead>', '<thead>')
        html_table_string = html_table_string.replace('</thead>', '\n</thead>\n')
        html_table_string = html_table_string.replace('<tbody>', '<tbody>')
        html_table_string = html_table_string.replace('</tbody>', '\n</tbody>\n')

        with open('table_html.txt', mode='w', encoding='utf8') as f:
            f.write(html_table_string)

        with open('table_html.txt', mode='r', encoding='utf8') as f:
            lines = f.readlines()

        start_table = lines[0].strip()
        end_table = lines[-1].strip()
        
        # Get start and end tags for tbody
        start_tbody = '<tbody>' if '<tbody>' in html_table_string else ''
        end_tbody = '</tbody>' if '</tbody>' in html_table_string else ''

        # Extract and clean headers if present
        headers = str(table_soup.find('thead')) if 'thead' in html_table_string else ''
        headers = re.sub(r'>\n\s*<', '><', headers)

        processed_lines = []
        for line in lines:
            if chunk_marker in line:
                start_index = line.find(html_chunk_marker)
                chunk_start = start_index - len('<p>')
                chunk_end = start_index + len(html_chunk_marker) + len('</p>')

                chunk_html = line[chunk_start:chunk_end]
                if chunk_html.startswith('<p>') & chunk_html.endswith('</p>'):
                    line = line.replace('<p>', '')
                    line = line.replace('</p>', '')
                else:
                    pass

                line = line.replace(html_chunk_marker, '')
                line = line.replace(' </td>', '</td>').strip()
                line += chunk_marker

            processed_lines.append(line)

        processed_lines = [line.strip() for line in processed_lines]
        html_table = ''.join(processed_lines)
        html_chunks = html_table.split(chunk_marker)

        proccessed_html_chunks = []
        for index, chunk in enumerate(html_chunks):
            if index == 0:
                chunk += (end_tbody + end_table)
                first_chunk = chunk.replace(end_table, '')
                start = first_chunk.find('<tr>')
                end = first_chunk.find('</tr>') + len('</tr>')
                headers = first_chunk[start:end]

                    
            elif chunk == html_chunks[-1]:
                chunk = start_table + headers + start_tbody + chunk 

            else:
                chunk = start_table + headers + start_tbody + chunk + end_tbody + end_table

            proccessed_html_chunks.append(chunk)

        chunks_to_html = ''
        for html_chunk in proccessed_html_chunks:
                chunks_to_html += wrap_signal(html_chunk, signal_type='html')
                if html_chunk != proccessed_html_chunks[-1]:
                    chunks_to_html += f'\n\n{chunk_marker}\n\n'
        
        html_table_chunks.append(chunks_to_html)
        
    return html_table_chunks

def wrap_signal(data, signal_type):
    data = f"```{signal_type}\n{data}\n```"
    return data