bluenevus committed on
Commit
ee2db4c
·
verified ·
1 Parent(s): eb7c4fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -2
app.py CHANGED
@@ -5,10 +5,12 @@ import zipfile
5
  from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
6
  import dash_bootstrap_components as dbc
7
  from docx import Document
 
8
  import markdown
9
  import threading
10
  import time
11
  import PyPDF2
 
12
 
13
  app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
14
 
@@ -45,8 +47,26 @@ def process_docx(contents, filename):
45
  decoded = base64.b64decode(content_string)
46
  doc = Document(io.BytesIO(decoded))
47
  full_text = []
 
48
  for para in doc.paragraphs:
49
- full_text.append(para.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  return '\n\n'.join(full_text)
51
 
52
  def process_pdf(contents, filename):
@@ -55,8 +75,24 @@ def process_pdf(contents, filename):
55
  pdf_file = io.BytesIO(decoded)
56
  pdf_reader = PyPDF2.PdfReader(pdf_file)
57
  full_text = []
 
58
  for page in pdf_reader.pages:
59
- full_text.append(page.extract_text())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  return '\n\n'.join(full_text)
61
 
62
  def process_files(contents, filenames):
 
5
  from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
6
  import dash_bootstrap_components as dbc
7
  from docx import Document
8
+ from docx.enum.style import WD_STYLE_TYPE
9
  import markdown
10
  import threading
11
  import time
12
  import PyPDF2
13
+ import re
14
 
15
  app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
16
 
 
47
  decoded = base64.b64decode(content_string)
48
  doc = Document(io.BytesIO(decoded))
49
  full_text = []
50
+
51
  for para in doc.paragraphs:
52
+ if para.style.name.startswith('Heading'):
53
+ level = int(para.style.name[-1])
54
+ full_text.append(f"{'#' * level} {para.text}")
55
+ else:
56
+ text = para.text
57
+ for run in para.runs:
58
+ if run.bold:
59
+ text = text.replace(run.text, f"**{run.text}**")
60
+ if run.italic:
61
+ text = text.replace(run.text, f"*{run.text}*")
62
+
63
+ if para.style.name == 'List Bullet':
64
+ full_text.append(f"- {text}")
65
+ elif para.style.name == 'List Number':
66
+ full_text.append(f"1. {text}")
67
+ else:
68
+ full_text.append(text)
69
+
70
  return '\n\n'.join(full_text)
71
 
72
  def process_pdf(contents, filename):
 
75
  pdf_file = io.BytesIO(decoded)
76
  pdf_reader = PyPDF2.PdfReader(pdf_file)
77
  full_text = []
78
+
79
  for page in pdf_reader.pages:
80
+ text = page.extract_text()
81
+
82
+ # Basic formatting detection (this is a simplified approach and may not catch all formatting)
83
+ text = re.sub(r'\*\*(.*?)\*\*', r'**\1**', text) # Bold
84
+ text = re.sub(r'_(.*?)_', r'*\1*', text) # Italic
85
+ text = re.sub(r'^(\d+\.)\s', r'\1 ', text, flags=re.MULTILINE) # Numbered lists
86
+ text = re.sub(r'^[•●○]\s', '- ', text, flags=re.MULTILINE) # Bullet points
87
+
88
+ # Detect potential headers (simplified approach)
89
+ lines = text.split('\n')
90
+ for i, line in enumerate(lines):
91
+ if i == 0 or (i > 0 and len(line) < 50 and line.strip() and line.strip()[0].isupper()):
92
+ lines[i] = f"## {line}"
93
+
94
+ full_text.append('\n'.join(lines))
95
+
96
  return '\n\n'.join(full_text)
97
 
98
  def process_files(contents, filenames):