Spaces:

MicroHealth
/

auto-wiki

Paused

App Files Community

bluenevus committed 14 days ago

Commit

ee2db4c

verified ·

1 Parent(s): eb7c4fb

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -2

app.py CHANGED Viewed

@@ -5,10 +5,12 @@ import zipfile
 from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
 import dash_bootstrap_components as dbc
 from docx import Document
 import markdown
 import threading
 import time
 import PyPDF2
 app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
@@ -45,8 +47,26 @@ def process_docx(contents, filename):
     decoded = base64.b64decode(content_string)
     doc = Document(io.BytesIO(decoded))
     full_text = []
     for para in doc.paragraphs:
-        full_text.append(para.text)
     return '\n\n'.join(full_text)
 def process_pdf(contents, filename):
@@ -55,8 +75,24 @@ def process_pdf(contents, filename):
     pdf_file = io.BytesIO(decoded)
     pdf_reader = PyPDF2.PdfReader(pdf_file)
     full_text = []
     for page in pdf_reader.pages:
-        full_text.append(page.extract_text())
     return '\n\n'.join(full_text)
 def process_files(contents, filenames):

 from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
 import dash_bootstrap_components as dbc
 from docx import Document
+from docx.enum.style import WD_STYLE_TYPE
 import markdown
 import threading
 import time
 import PyPDF2
+import re
 app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
     decoded = base64.b64decode(content_string)
     doc = Document(io.BytesIO(decoded))
     full_text = []
     for para in doc.paragraphs:
+        if para.style.name.startswith('Heading'):
+            level = int(para.style.name[-1])
+            full_text.append(f"{'#' * level} {para.text}")
+        else:
+            text = para.text
+            for run in para.runs:
+                if run.bold:
+                    text = text.replace(run.text, f"**{run.text}**")
+                if run.italic:
+                    text = text.replace(run.text, f"*{run.text}*")
+            if para.style.name == 'List Bullet':
+                full_text.append(f"- {text}")
+            elif para.style.name == 'List Number':
+                full_text.append(f"1. {text}")
+            else:
+                full_text.append(text)
     return '\n\n'.join(full_text)
 def process_pdf(contents, filename):
     pdf_file = io.BytesIO(decoded)
     pdf_reader = PyPDF2.PdfReader(pdf_file)
     full_text = []
     for page in pdf_reader.pages:
+        text = page.extract_text()
+        # Basic formatting detection (this is a simplified approach and may not catch all formatting)
+        text = re.sub(r'\*\*(.*?)\*\*', r'**\1**', text)  # Bold
+        text = re.sub(r'_(.*?)_', r'*\1*', text)  # Italic
+        text = re.sub(r'^(\d+\.)\s', r'\1 ', text, flags=re.MULTILINE)  # Numbered lists
+        text = re.sub(r'^[•●○]\s', '- ', text, flags=re.MULTILINE)  # Bullet points
+        # Detect potential headers (simplified approach)
+        lines = text.split('\n')
+        for i, line in enumerate(lines):
+            if i == 0 or (i > 0 and len(line) < 50 and line.strip() and line.strip()[0].isupper()):
+                lines[i] = f"## {line}"
+        full_text.append('\n'.join(lines))
     return '\n\n'.join(full_text)
 def process_files(contents, filenames):