Upload 5 files
Browse files- README.md +26 -13
- Spacefile +10 -0
- app.py +81 -446
- requirements.txt +3 -8
- runtime.txt +1 -0
README.md
CHANGED
@@ -1,13 +1,26 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# PDF to Markdown Converter
|
2 |
+
|
3 |
+
This application converts PDF documents to Markdown format. It uses the `docling` library for document conversion and provides a simple Streamlit interface.
|
4 |
+
|
5 |
+
## Features
|
6 |
+
|
7 |
+
- Upload PDF files directly
|
8 |
+
- Convert PDFs from URLs
|
9 |
+
- Download the resulting Markdown file
|
10 |
+
- Clean, user-friendly interface
|
11 |
+
|
12 |
+
## How to Use
|
13 |
+
|
14 |
+
1. Upload a PDF file using the file uploader or enter a URL to a PDF document
|
15 |
+
2. Click the "Convert to Markdown" button
|
16 |
+
3. Once conversion is complete, download the Markdown file
|
17 |
+
|
18 |
+
## Technical Details
|
19 |
+
|
20 |
+
Built with:
|
21 |
+
- Streamlit 1.29.0
|
22 |
+
- Docling 2.7.0
|
23 |
+
|
24 |
+
## Deployment
|
25 |
+
|
26 |
+
This application is deployed on Hugging Face Spaces.
|
Spacefile
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Spacefile Docs: https://huggingface.co/docs/hub/spaces-config-reference
|
2 |
+
title: PDF to Markdown Converter
|
3 |
+
emoji: 📄
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: green
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.29.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
python_version: 3.12
|
app.py
CHANGED
@@ -1,302 +1,12 @@
|
|
1 |
import streamlit as st
|
|
|
2 |
import tempfile
|
3 |
import os
|
4 |
-
import time
|
5 |
import logging
|
6 |
-
import sys
|
7 |
-
from io import BytesIO
|
8 |
-
from pathlib import Path
|
9 |
-
from urllib.parse import urlparse
|
10 |
-
import requests
|
11 |
-
from PIL import Image
|
12 |
-
import fitz # PyMuPDF for PDF processing
|
13 |
-
|
14 |
-
# Set environment variables for vLLM
|
15 |
-
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
|
16 |
-
os.environ["CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
|
17 |
-
os.environ["VLLM_USE_CUDA"] = os.environ.get("VLLM_USE_CUDA", "1")
|
18 |
-
|
19 |
-
try:
|
20 |
-
from vllm import LLM, SamplingParams
|
21 |
-
from docling_core.types.doc import DoclingDocument
|
22 |
-
from docling_core.types.doc.document import DocTagsDocument
|
23 |
-
VLLM_AVAILABLE = True
|
24 |
-
except ImportError as e:
|
25 |
-
VLLM_AVAILABLE = False
|
26 |
-
st.error(f"Error importing vLLM or docling_core: {str(e)}")
|
27 |
-
st.info("Falling back to standard Transformers if available.")
|
28 |
-
try:
|
29 |
-
import torch
|
30 |
-
from transformers import AutoProcessor, AutoModelForVision2Seq
|
31 |
-
except ImportError:
|
32 |
-
st.error("Neither vLLM nor Transformers are available. Please check your installation.")
|
33 |
|
34 |
# Configure logging
|
35 |
-
logging.basicConfig(level=logging.
|
36 |
logger = logging.getLogger(__name__)
|
37 |
-
logger.info("SmolDocling OCR App starting up...")
|
38 |
-
|
39 |
-
# Set up cache directory
|
40 |
-
CACHE_DIR = os.environ.get("CACHE_DIR", "/tmp/smoldocling_cache")
|
41 |
-
os.makedirs(CACHE_DIR, exist_ok=True)
|
42 |
-
logger.info(f"Cache directory set to: {CACHE_DIR}")
|
43 |
-
|
44 |
-
# Custom DocumentConverter class that uses vLLM for fast inference
|
45 |
-
class VLLMDocumentConverter:
|
46 |
-
def __init__(self, model_name="ds4sd/SmolDocling-256M-preview"):
|
47 |
-
"""
|
48 |
-
Initialize the converter with vLLM for fast inference
|
49 |
-
|
50 |
-
Args:
|
51 |
-
model_name: The name of the model to use
|
52 |
-
"""
|
53 |
-
logger.info("Loading SmolDocling model with vLLM...")
|
54 |
-
try:
|
55 |
-
# Initialize vLLM with explicit device configuration
|
56 |
-
self.model_path = model_name
|
57 |
-
|
58 |
-
# Check if CUDA is available through torch
|
59 |
-
cuda_available = False
|
60 |
-
try:
|
61 |
-
import torch
|
62 |
-
cuda_available = torch.cuda.is_available()
|
63 |
-
if cuda_available:
|
64 |
-
logger.info(f"CUDA is available. Found {torch.cuda.device_count()} device(s).")
|
65 |
-
for i in range(torch.cuda.device_count()):
|
66 |
-
logger.info(f"Device {i}: {torch.cuda.get_device_name(i)}")
|
67 |
-
else:
|
68 |
-
logger.info("CUDA is not available. Using CPU.")
|
69 |
-
except:
|
70 |
-
logger.info("Could not check CUDA availability through torch.")
|
71 |
-
|
72 |
-
# Print CUDA environment variables for debugging
|
73 |
-
for env_var in os.environ:
|
74 |
-
if "CUDA" in env_var or "VLLM" in env_var:
|
75 |
-
logger.info(f"Environment variable: {env_var}={os.environ[env_var]}")
|
76 |
-
|
77 |
-
# Try multiple initialization approaches
|
78 |
-
initialization_methods = [
|
79 |
-
# Method 1: Standard GPU initialization
|
80 |
-
lambda: LLM(
|
81 |
-
model=self.model_path,
|
82 |
-
limit_mm_per_prompt={"image": 1},
|
83 |
-
tensor_parallel_size=1,
|
84 |
-
dtype="float16",
|
85 |
-
gpu_memory_utilization=0.7
|
86 |
-
),
|
87 |
-
# Method 2: With trust_remote_code and enforce_eager
|
88 |
-
lambda: LLM(
|
89 |
-
model=self.model_path,
|
90 |
-
limit_mm_per_prompt={"image": 1},
|
91 |
-
trust_remote_code=True,
|
92 |
-
enforce_eager=True,
|
93 |
-
dtype="float16"
|
94 |
-
),
|
95 |
-
# Method 3: With explicit device specification
|
96 |
-
lambda: LLM(
|
97 |
-
model=self.model_path,
|
98 |
-
limit_mm_per_prompt={"image": 1},
|
99 |
-
dtype="float16",
|
100 |
-
max_model_len=8192,
|
101 |
-
device="cuda:0" if cuda_available else "cpu"
|
102 |
-
),
|
103 |
-
# Method 4: CPU only as last resort
|
104 |
-
lambda: LLM(
|
105 |
-
model=self.model_path,
|
106 |
-
limit_mm_per_prompt={"image": 1},
|
107 |
-
trust_remote_code=True,
|
108 |
-
enforce_eager=True,
|
109 |
-
cpu_only=True
|
110 |
-
)
|
111 |
-
]
|
112 |
-
|
113 |
-
# Try each initialization method until one works
|
114 |
-
last_error = None
|
115 |
-
for i, init_method in enumerate(initialization_methods):
|
116 |
-
try:
|
117 |
-
logger.info(f"Trying vLLM initialization method {i+1}/{len(initialization_methods)}")
|
118 |
-
self.llm = init_method()
|
119 |
-
logger.info(f"Successfully initialized vLLM with method {i+1}")
|
120 |
-
break
|
121 |
-
except Exception as e:
|
122 |
-
last_error = e
|
123 |
-
logger.warning(f"Method {i+1} failed: {str(e)}")
|
124 |
-
continue
|
125 |
-
|
126 |
-
# If all methods failed, raise the last error
|
127 |
-
if not hasattr(self, 'llm'):
|
128 |
-
logger.error("All vLLM initialization methods failed")
|
129 |
-
raise last_error
|
130 |
-
|
131 |
-
self.sampling_params = SamplingParams(
|
132 |
-
temperature=0.0,
|
133 |
-
max_tokens=8192
|
134 |
-
)
|
135 |
-
logger.info("Model loaded successfully with vLLM")
|
136 |
-
except Exception as e:
|
137 |
-
logger.error(f"Error loading model: {str(e)}")
|
138 |
-
# Print detailed error information
|
139 |
-
import traceback
|
140 |
-
logger.error(traceback.format_exc())
|
141 |
-
raise
|
142 |
-
|
143 |
-
def load_image_from_path(self, file_path):
|
144 |
-
"""Load image from a path, handling both images and PDFs"""
|
145 |
-
logger.debug(f"Loading from path: {file_path}")
|
146 |
-
try:
|
147 |
-
# Check if it's a PDF
|
148 |
-
if file_path.lower().endswith('.pdf'):
|
149 |
-
return self.convert_pdf_to_images(file_path)
|
150 |
-
else:
|
151 |
-
# It's an image
|
152 |
-
pil_image = Image.open(file_path).convert("RGB")
|
153 |
-
logger.debug(f"Image loaded successfully: {pil_image.size}")
|
154 |
-
return [pil_image] # Return as a list for consistency
|
155 |
-
except Exception as e:
|
156 |
-
logger.error(f"Error loading file: {str(e)}")
|
157 |
-
raise
|
158 |
-
|
159 |
-
def convert_pdf_to_images(self, pdf_path):
|
160 |
-
"""Convert PDF to a list of images"""
|
161 |
-
logger.debug(f"Converting PDF to images: {pdf_path}")
|
162 |
-
try:
|
163 |
-
images = []
|
164 |
-
with fitz.open(pdf_path) as doc:
|
165 |
-
logger.debug(f"PDF has {len(doc)} pages")
|
166 |
-
for page_num, page in enumerate(doc):
|
167 |
-
logger.debug(f"Processing page {page_num+1}")
|
168 |
-
# Render page to an image with higher resolution
|
169 |
-
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
|
170 |
-
img_data = pix.tobytes("png")
|
171 |
-
img = Image.open(BytesIO(img_data)).convert("RGB")
|
172 |
-
images.append(img)
|
173 |
-
|
174 |
-
logger.debug(f"Converted {len(images)} pages to images")
|
175 |
-
return images
|
176 |
-
except Exception as e:
|
177 |
-
logger.error(f"Error converting PDF to images: {str(e)}")
|
178 |
-
raise
|
179 |
-
|
180 |
-
def load_image_from_url(self, url):
|
181 |
-
"""Load image from a URL, handling both images and PDFs"""
|
182 |
-
logger.debug(f"Loading from URL: {url}")
|
183 |
-
try:
|
184 |
-
response = requests.get(url, stream=True, timeout=10)
|
185 |
-
response.raise_for_status()
|
186 |
-
|
187 |
-
# Check if it's a PDF
|
188 |
-
content_type = response.headers.get('Content-Type', '').lower()
|
189 |
-
if content_type == 'application/pdf' or url.lower().endswith('.pdf'):
|
190 |
-
# Save PDF to a temporary file
|
191 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
192 |
-
tmp_file.write(response.content)
|
193 |
-
tmp_path = tmp_file.name
|
194 |
-
|
195 |
-
try:
|
196 |
-
# Convert PDF to images
|
197 |
-
images = self.convert_pdf_to_images(tmp_path)
|
198 |
-
return images
|
199 |
-
finally:
|
200 |
-
# Clean up temporary file
|
201 |
-
if os.path.exists(tmp_path):
|
202 |
-
os.unlink(tmp_path)
|
203 |
-
else:
|
204 |
-
# It's an image
|
205 |
-
pil_image = Image.open(BytesIO(response.content)).convert("RGB")
|
206 |
-
logger.debug(f"Image loaded successfully: {pil_image.size}")
|
207 |
-
return [pil_image] # Return as a list for consistency
|
208 |
-
except Exception as e:
|
209 |
-
logger.error(f"Error loading from URL: {str(e)}")
|
210 |
-
raise
|
211 |
-
|
212 |
-
def process_images(self, images, prompt="Convert page to Docling."):
|
213 |
-
"""Process images using vLLM and return doctags outputs"""
|
214 |
-
logger.debug(f"Processing {len(images)} images with prompt: {prompt}")
|
215 |
-
|
216 |
-
start_time = time.time()
|
217 |
-
all_outputs = []
|
218 |
-
|
219 |
-
# Create chat template
|
220 |
-
chat_template = f"<|im_start|>User:<image>{prompt}<end_of_utterance>\nAssistant:"
|
221 |
-
|
222 |
-
# Process each image
|
223 |
-
for i, image in enumerate(images):
|
224 |
-
logger.debug(f"Processing image {i+1} of {len(images)}")
|
225 |
-
|
226 |
-
# Prepare input for vLLM
|
227 |
-
llm_input = {"prompt": chat_template, "multi_modal_data": {"image": image}}
|
228 |
-
|
229 |
-
# Generate output
|
230 |
-
output = self.llm.generate([llm_input], sampling_params=self.sampling_params)[0]
|
231 |
-
doctags = output.outputs[0].text
|
232 |
-
|
233 |
-
all_outputs.append(doctags)
|
234 |
-
logger.debug(f"Generated doctags for image {i+1} (length: {len(doctags)})")
|
235 |
-
|
236 |
-
logger.debug(f"Total processing time: {time.time() - start_time:.2f} seconds")
|
237 |
-
return all_outputs
|
238 |
-
|
239 |
-
def convert_to_markdown(self, images, prompt="Convert page to Docling."):
|
240 |
-
"""Convert images to markdown using vLLM"""
|
241 |
-
logger.debug(f"Converting {len(images)} images to markdown with prompt: {prompt}")
|
242 |
-
try:
|
243 |
-
# Process images
|
244 |
-
all_outputs = self.process_images(images, prompt)
|
245 |
-
|
246 |
-
# Populate document with all pages
|
247 |
-
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(all_outputs, images)
|
248 |
-
# Create a docling document
|
249 |
-
doc = DoclingDocument(name="ConvertedDocument")
|
250 |
-
doc.load_from_doctags(doctags_doc)
|
251 |
-
|
252 |
-
# Export as markdown
|
253 |
-
markdown_text = doc.export_to_markdown()
|
254 |
-
logger.debug(f"Combined markdown text length: {len(markdown_text)}")
|
255 |
-
|
256 |
-
return doc
|
257 |
-
except Exception as e:
|
258 |
-
logger.error(f"Error converting to markdown: {str(e)}")
|
259 |
-
raise
|
260 |
-
|
261 |
-
def convert(self, source, prompt="Convert page to Docling.", max_pages=None):
|
262 |
-
"""
|
263 |
-
Convert a PDF/image to markdown
|
264 |
-
|
265 |
-
Args:
|
266 |
-
source: Either a path to a file or a URL
|
267 |
-
prompt: The prompt to use for conversion
|
268 |
-
max_pages: Maximum number of pages to process
|
269 |
-
|
270 |
-
Returns:
|
271 |
-
An object with a document attribute that has an export_to_markdown method
|
272 |
-
"""
|
273 |
-
logger.debug(f"Converting source: {source}")
|
274 |
-
try:
|
275 |
-
# Check if source is a URL
|
276 |
-
if urlparse(source).scheme != "":
|
277 |
-
images = self.load_image_from_url(source)
|
278 |
-
else:
|
279 |
-
# Check if it's a PDF or image
|
280 |
-
images = self.load_image_from_path(source)
|
281 |
-
|
282 |
-
# Limit the number of pages if specified
|
283 |
-
if max_pages and max_pages < len(images):
|
284 |
-
logger.debug(f"Limiting processing to {max_pages} pages out of {len(images)}")
|
285 |
-
images = images[:max_pages]
|
286 |
-
|
287 |
-
# Convert to markdown
|
288 |
-
doc = self.convert_to_markdown(images, prompt)
|
289 |
-
|
290 |
-
# Return the document
|
291 |
-
return ConversionResult(doc)
|
292 |
-
except Exception as e:
|
293 |
-
logger.error(f"Error in convert method: {str(e)}")
|
294 |
-
raise
|
295 |
-
|
296 |
-
class ConversionResult:
|
297 |
-
"""A simple class to mimic the interface of the original DocumentConverter result"""
|
298 |
-
def __init__(self, document):
|
299 |
-
self.document = document
|
300 |
|
301 |
# Custom CSS for better layout
|
302 |
st.markdown("""
|
@@ -333,164 +43,89 @@ st.markdown("""
|
|
333 |
</style>
|
334 |
""", unsafe_allow_html=True)
|
335 |
|
336 |
-
|
337 |
-
logger.info("Starting SmolDocling OCR App main function")
|
338 |
-
|
339 |
-
st.title("PDF to Markdown Converter")
|
340 |
-
st.subheader("Using SmolDocling OCR with vLLM")
|
341 |
-
|
342 |
-
# Add a sidebar for model and processing settings
|
343 |
-
st.sidebar.title("Settings")
|
344 |
-
|
345 |
-
# Model settings
|
346 |
-
st.sidebar.subheader("Model Settings")
|
347 |
-
model_name = st.sidebar.text_input(
|
348 |
-
"Model Name",
|
349 |
-
value="ds4sd/SmolDocling-256M-preview",
|
350 |
-
help="Enter the name of the model to use for PDF to Markdown conversion"
|
351 |
-
)
|
352 |
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
|
|
|
|
|
|
|
|
377 |
try:
|
378 |
-
with st.spinner(
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
logger.debug(f"Creating VLLMDocumentConverter instance with model: {model_name}")
|
385 |
-
st.session_state.converter = VLLMDocumentConverter(model_name=model_name)
|
386 |
-
logger.debug("Converter successfully created")
|
387 |
-
st.sidebar.success(f"Model {model_name} loaded successfully!")
|
388 |
-
except Exception as e:
|
389 |
-
error_msg = str(e)
|
390 |
-
logger.error(f"Error creating converter: {error_msg}")
|
391 |
-
st.error(f"Error creating converter: {error_msg}")
|
392 |
-
|
393 |
-
if 'converter' not in st.session_state:
|
394 |
-
st.stop()
|
395 |
-
|
396 |
-
# Main upload area
|
397 |
-
uploaded_file = st.file_uploader(
|
398 |
-
"Upload your PDF or image file",
|
399 |
-
type=['pdf', 'png', 'jpg', 'jpeg'],
|
400 |
-
key='file_uploader',
|
401 |
-
help="Drag and drop or click to select a file (max 200MB)"
|
402 |
-
)
|
403 |
-
|
404 |
-
# URL input area with spacing
|
405 |
-
st.markdown("<br>", unsafe_allow_html=True)
|
406 |
-
url = st.text_input("Or enter a PDF/image URL")
|
407 |
-
|
408 |
-
# Prompt input
|
409 |
-
prompt = st.text_input("Conversion prompt (optional)", value="Convert page to Docling.")
|
410 |
-
|
411 |
-
# Unified convert button
|
412 |
-
convert_clicked = st.button("Convert to Markdown", type="primary")
|
413 |
-
|
414 |
-
# Process either uploaded file or URL
|
415 |
-
if convert_clicked:
|
416 |
-
if uploaded_file is not None:
|
417 |
-
try:
|
418 |
-
with st.spinner('Converting file...'):
|
419 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{uploaded_file.name.split(".")[-1]}') as tmp_file:
|
420 |
-
tmp_file.write(uploaded_file.getvalue())
|
421 |
-
tmp_path = tmp_file.name
|
422 |
-
logger.debug(f"Temporary file created at: {tmp_path}")
|
423 |
-
|
424 |
-
try:
|
425 |
-
logger.debug(f"Converting file: {uploaded_file.name}")
|
426 |
-
# Convert the file
|
427 |
-
result = st.session_state.converter.convert(
|
428 |
-
tmp_path,
|
429 |
-
prompt=prompt,
|
430 |
-
max_pages=max_pages
|
431 |
-
)
|
432 |
-
markdown_text = result.document.export_to_markdown()
|
433 |
-
logger.debug(f"Markdown text length: {len(markdown_text)}")
|
434 |
-
|
435 |
-
output_filename = os.path.splitext(uploaded_file.name)[0] + '.md'
|
436 |
-
|
437 |
-
st.success("Conversion completed!")
|
438 |
-
st.download_button(
|
439 |
-
label="Download Markdown file",
|
440 |
-
data=markdown_text,
|
441 |
-
file_name=output_filename,
|
442 |
-
mime="text/markdown"
|
443 |
-
)
|
444 |
-
|
445 |
-
# Display the markdown
|
446 |
-
st.subheader("Preview:")
|
447 |
-
st.markdown(markdown_text)
|
448 |
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
# Convert from URL
|
467 |
-
result = st.session_state.converter.convert(
|
468 |
-
url,
|
469 |
-
prompt=prompt,
|
470 |
-
max_pages=max_pages
|
471 |
-
)
|
472 |
-
markdown_text = result.document.export_to_markdown()
|
473 |
-
logger.debug(f"Markdown text length: {len(markdown_text)}")
|
474 |
-
|
475 |
-
output_filename = url.split('/')[-1].split('.')[0] + '.md'
|
476 |
-
|
477 |
-
st.success("Conversion completed!")
|
478 |
-
st.download_button(
|
479 |
-
label="Download Markdown file",
|
480 |
-
data=markdown_text,
|
481 |
-
file_name=output_filename,
|
482 |
-
mime="text/markdown"
|
483 |
-
)
|
484 |
|
485 |
-
|
486 |
-
|
487 |
-
|
|
|
488 |
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
494 |
|
495 |
-
|
496 |
-
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from docling.document_converter import DocumentConverter
|
3 |
import tempfile
|
4 |
import os
|
|
|
5 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
# Configure logging
|
8 |
+
logging.basicConfig(level=logging.DEBUG)
|
9 |
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
# Custom CSS for better layout
|
12 |
st.markdown("""
|
|
|
43 |
</style>
|
44 |
""", unsafe_allow_html=True)
|
45 |
|
46 |
+
st.title("PDF to Markdown Converter")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
+
# Initialize session state if it doesn't exist
|
49 |
+
if 'converter' not in st.session_state:
|
50 |
+
try:
|
51 |
+
st.session_state.converter = DocumentConverter()
|
52 |
+
logger.debug("Converter successfully created")
|
53 |
+
except Exception as e:
|
54 |
+
logger.error(f"Error creating converter: {str(e)}")
|
55 |
+
st.error(f"Error creating converter: {str(e)}")
|
56 |
+
st.stop()
|
57 |
+
|
58 |
+
# Main upload area
|
59 |
+
uploaded_file = st.file_uploader(
|
60 |
+
"Upload your PDF file",
|
61 |
+
type=['pdf'],
|
62 |
+
key='pdf_uploader',
|
63 |
+
help="Drag and drop or click to select a PDF file (max 200MB)"
|
64 |
+
)
|
65 |
+
|
66 |
+
# URL input area with spacing
|
67 |
+
st.markdown("<br>", unsafe_allow_html=True)
|
68 |
+
url = st.text_input("Or enter a PDF URL")
|
69 |
+
|
70 |
+
# Unified convert button
|
71 |
+
convert_clicked = st.button("Convert to Markdown", type="primary")
|
72 |
+
|
73 |
+
# Process either uploaded file or URL
|
74 |
+
if convert_clicked:
|
75 |
+
if uploaded_file is not None:
|
76 |
try:
|
77 |
+
with st.spinner('Converting file...'):
|
78 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
|
79 |
+
tmp_file.write(uploaded_file.getvalue())
|
80 |
+
tmp_path = tmp_file.name
|
81 |
+
logger.debug(f"Temporary file created at: {tmp_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
+
try:
|
84 |
+
result = st.session_state.converter.convert(tmp_path)
|
85 |
+
markdown_text = result.document.export_to_markdown()
|
86 |
|
87 |
+
output_filename = os.path.splitext(uploaded_file.name)[0] + '.md'
|
88 |
+
|
89 |
+
st.success("Conversion completed!")
|
90 |
+
st.download_button(
|
91 |
+
label="Download Markdown file",
|
92 |
+
data=markdown_text,
|
93 |
+
file_name=output_filename,
|
94 |
+
mime="text/markdown"
|
95 |
+
)
|
96 |
+
|
97 |
+
except Exception as e:
|
98 |
+
logger.error(f"Error converting file: {str(e)}")
|
99 |
+
st.error(f"Error converting file: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
+
finally:
|
102 |
+
if os.path.exists(tmp_path):
|
103 |
+
os.unlink(tmp_path)
|
104 |
+
logger.debug("Temporary file deleted")
|
105 |
|
106 |
+
except Exception as e:
|
107 |
+
logger.error(f"Error processing file: {str(e)}")
|
108 |
+
st.error(f"Error processing file: {str(e)}")
|
109 |
+
|
110 |
+
elif url:
|
111 |
+
try:
|
112 |
+
with st.spinner('Converting from URL...'):
|
113 |
+
logger.debug(f"Converting from URL: {url}")
|
114 |
+
result = st.session_state.converter.convert(url)
|
115 |
+
markdown_text = result.document.export_to_markdown()
|
116 |
+
|
117 |
+
output_filename = url.split('/')[-1].split('.')[0] + '.md'
|
118 |
+
|
119 |
+
st.success("Conversion completed!")
|
120 |
+
st.download_button(
|
121 |
+
label="Download Markdown file",
|
122 |
+
data=markdown_text,
|
123 |
+
file_name=output_filename,
|
124 |
+
mime="text/markdown"
|
125 |
+
)
|
126 |
|
127 |
+
except Exception as e:
|
128 |
+
logger.error(f"Error converting from URL: {str(e)}")
|
129 |
+
st.error(f"Error converting from URL: {str(e)}")
|
130 |
+
else:
|
131 |
+
st.warning("Please upload a file or enter a URL first")
|
requirements.txt
CHANGED
@@ -1,8 +1,3 @@
|
|
1 |
-
streamlit==1.
|
2 |
-
|
3 |
-
|
4 |
-
Pillow==10.1.0
|
5 |
-
PyMuPDF==1.23.8
|
6 |
-
requests==2.31.0
|
7 |
-
vllm==0.3.0
|
8 |
-
docling_core
|
|
|
1 |
+
streamlit==1.29.0
|
2 |
+
docling==2.7.0
|
3 |
+
watchdog==2.3.1
|
|
|
|
|
|
|
|
|
|
runtime.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
python-3.12
|