bluenevus committed on
Commit
1bb1cee
·
verified ·
1 Parent(s): fe02558

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -62
app.py CHANGED
@@ -4,13 +4,9 @@ import os
4
  import zipfile
5
  from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
6
  import dash_bootstrap_components as dbc
7
- from docx import Document
8
- from docx.enum.style import WD_STYLE_TYPE
9
- import markdown
10
  import threading
11
  import time
12
- import PyPDF2
13
- import re
14
 
15
  app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
16
 
@@ -42,70 +38,22 @@ app.layout = dbc.Container([
42
  dcc.Download(id="download-zip")
43
  ])
44
 
45
- def process_docx(contents, filename):
46
  content_type, content_string = contents.split(',')
47
  decoded = base64.b64decode(content_string)
48
- doc = Document(io.BytesIO(decoded))
49
- full_text = []
50
 
51
- for para in doc.paragraphs:
52
- if para.style.name.startswith('Heading'):
53
- level = int(para.style.name[-1])
54
- full_text.append(f"{'#' * level} {para.text}")
55
- else:
56
- text = para.text
57
- for run in para.runs:
58
- if run.bold:
59
- text = text.replace(run.text, f"**{run.text}**")
60
- if run.italic:
61
- text = text.replace(run.text, f"*{run.text}*")
62
-
63
- if para.style.name == 'List Bullet':
64
- full_text.append(f"- {text}")
65
- elif para.style.name == 'List Number':
66
- full_text.append(f"1. {text}")
67
- else:
68
- full_text.append(text)
69
-
70
- return '\n\n'.join(full_text)
71
-
72
- def process_pdf(contents, filename):
73
- content_type, content_string = contents.split(',')
74
- decoded = base64.b64decode(content_string)
75
- pdf_file = io.BytesIO(decoded)
76
- pdf_reader = PyPDF2.PdfReader(pdf_file)
77
- full_text = []
78
-
79
- for page in pdf_reader.pages:
80
- text = page.extract_text()
81
-
82
- # Basic formatting detection (this is a simplified approach and may not catch all formatting)
83
- text = re.sub(r'\*\*(.*?)\*\*', r'**\1**', text) # Bold
84
- text = re.sub(r'_(.*?)_', r'*\1*', text) # Italic
85
- text = re.sub(r'^(\d+\.)\s', r'\1 ', text, flags=re.MULTILINE) # Numbered lists
86
- text = re.sub(r'^[•●○]\s', '- ', text, flags=re.MULTILINE) # Bullet points
87
-
88
- # Detect potential headers (simplified approach)
89
- lines = text.split('\n')
90
- for i, line in enumerate(lines):
91
- if i == 0 or (i > 0 and len(line) < 50 and line.strip() and line.strip()[0].isupper()):
92
- lines[i] = f"## {line}"
93
-
94
- full_text.append('\n'.join(lines))
95
-
96
- return '\n\n'.join(full_text)
97
 
98
  def process_files(contents, filenames):
99
  processed_files = []
100
  for c, n in zip(contents, filenames):
101
- if n.lower().endswith('.docx'):
102
- text = process_docx(c, n)
103
- elif n.lower().endswith('.pdf'):
104
- text = process_pdf(c, n)
105
- else:
106
- continue # Skip unsupported file types
107
- md = markdown.markdown(text)
108
- processed_files.append((n.replace('.docx', '.md').replace('.pdf', '.md'), md))
109
  time.sleep(0.1) # Simulate processing time
110
 
111
  zip_buffer = io.BytesIO()
 
4
  import zipfile
5
  from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update
6
  import dash_bootstrap_components as dbc
 
 
 
7
  import threading
8
  import time
9
+ import pypandoc
 
10
 
11
  app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
12
 
 
38
  dcc.Download(id="download-zip")
39
  ])
40
 
41
def process_file(contents, filename):
    """Convert one uploaded document to Markdown text using pandoc.

    Args:
        contents: Upload payload of the form ``"<header>,<base64 data>"``.
        filename: Client-supplied name of the upload. Only its extension is
            used, so pandoc can infer the input format from the temp file.

    Returns:
        The Markdown text produced by ``pypandoc.convert_file``.

    Raises:
        ValueError: If ``contents`` has no comma separating header and data.
        Whatever ``pypandoc.convert_file`` raises when conversion fails.
    """
    import tempfile

    # Split on the first comma only: everything after it is the base64 payload.
    _content_type, content_string = contents.split(',', 1)
    decoded = base64.b64decode(content_string)

    # Never write to the client-supplied name: it may contain path components
    # or collide with another concurrent upload. Keep just the extension.
    suffix = os.path.splitext(os.path.basename(filename))[1]
    fd, tmp_path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'wb') as f:
            f.write(decoded)
        # NOTE(review): pandoc does not appear to list PDF as an input format,
        # so '.pdf' uploads presumably fail here - confirm against the caller.
        return pypandoc.convert_file(tmp_path, 'md')
    finally:
        # Clean up the temporary file even when the conversion raises.
        os.remove(tmp_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  def process_files(contents, filenames):
52
  processed_files = []
53
  for c, n in zip(contents, filenames):
54
+ if n.lower().endswith(('.docx', '.pdf')):
55
+ text = process_file(c, n)
56
+ processed_files.append((n.rsplit('.', 1)[0] + '.md', text))
 
 
 
 
 
57
  time.sleep(0.1) # Simulate processing time
58
 
59
  zip_buffer = io.BytesIO()