hellorahulk committed on
Commit
ec3f76a
·
1 Parent(s): fdbfd73

Add URL input support for document processing

Browse files
Files changed (2) hide show
  1. app.py +77 -13
  2. requirements.txt +2 -1
app.py CHANGED
@@ -6,17 +6,20 @@ from dockling_parser.exceptions import ParserError, UnsupportedFormatError
6
  import tempfile
7
  import mimetypes
8
  import traceback
 
 
9
 
10
  TITLE = "📄 Smart Document Parser"
11
  DESCRIPTION = """
12
  A powerful document parsing application that automatically extracts structured information from various document formats.
13
- Upload any document (PDF, DOCX, TXT, HTML, Markdown) and get structured information extracted automatically.
14
  """
15
 
16
  ARTICLE = """
17
  ## 🚀 Features
18
 
19
  - Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
 
20
  - Rich Information Extraction
21
  - Smart Processing with Confidence Scoring
22
  - Automatic Format Detection
@@ -25,16 +28,30 @@ Made with ❤️ using Docling and Gradio
25
  """
26
 
27
  ERROR_MESSAGES = {
28
- "no_file": (
29
- "⚠️ No file uploaded",
30
- "Please upload a document to process.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  "No sections available",
32
  "No entities available",
33
  "Confidence Score: 0.0"
34
  ),
35
  "unsupported_format": (
36
  "⚠️ Unsupported file format",
37
- "Please upload a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
38
  "No sections available",
39
  "No entities available",
40
  "Confidence Score: 0.0"
@@ -51,14 +68,45 @@ ERROR_MESSAGES = {
51
  # Initialize the document parser
52
  parser = DocumentParser()
53
 
54
- def process_document(file_path):
55
- """Process uploaded document and return structured information"""
56
- if file_path is None:
57
- return ERROR_MESSAGES["no_file"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
 
 
 
 
 
 
59
  try:
60
- # Parse the document directly using the file path
61
- result = parser.parse(file_path)
 
 
 
 
 
 
 
 
62
 
63
  # Prepare the outputs
64
  metadata_df = pd.DataFrame([{
@@ -110,6 +158,13 @@ def process_document(file_path):
110
  "No entities available",
111
  "Confidence Score: 0.0"
112
  )
 
 
 
 
 
 
 
113
 
114
  # Create Gradio interface
115
  with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
@@ -123,6 +178,10 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
123
  file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
124
  type="filepath"
125
  )
 
 
 
 
126
  submit_btn = gr.Button("Process Document", variant="primary")
127
 
128
  with gr.Column():
@@ -158,8 +217,8 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
158
 
159
  # Handle file submission
160
  submit_btn.click(
161
- fn=process_document,
162
- inputs=[file_input],
163
  outputs=[
164
  content_output,
165
  metadata_output,
@@ -176,6 +235,11 @@ with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
176
  - Text Files (*.txt)
177
  - HTML Files (*.html)
178
  - Markdown Files (*.md)
 
 
 
 
 
179
  """)
180
 
181
  gr.Markdown(ARTICLE)
 
6
  import tempfile
7
  import mimetypes
8
  import traceback
9
+ import requests
10
+ from urllib.parse import urlparse
11
 
12
  TITLE = "📄 Smart Document Parser"
13
  DESCRIPTION = """
14
  A powerful document parsing application that automatically extracts structured information from various document formats.
15
+ Upload a document or provide a URL (PDF, DOCX, TXT, HTML, Markdown) and get structured information automatically.
16
  """
17
 
18
  ARTICLE = """
19
  ## 🚀 Features
20
 
21
  - Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
22
+ - Support for File Upload and URLs
23
  - Rich Information Extraction
24
  - Smart Processing with Confidence Scoring
25
  - Automatic Format Detection
 
28
  """
29
 
30
  ERROR_MESSAGES = {
31
+ "no_input": (
32
+ "⚠️ No input provided",
33
+ "Please upload a document or provide a URL.",
34
+ "No sections available",
35
+ "No entities available",
36
+ "Confidence Score: 0.0"
37
+ ),
38
+ "invalid_url": (
39
+ "⚠️ Invalid URL",
40
+ "Please provide a valid URL to a document.",
41
+ "No sections available",
42
+ "No entities available",
43
+ "Confidence Score: 0.0"
44
+ ),
45
+ "download_error": (
46
+ "⚠️ Failed to download document",
47
+ "Could not download the document from the provided URL.",
48
  "No sections available",
49
  "No entities available",
50
  "Confidence Score: 0.0"
51
  ),
52
  "unsupported_format": (
53
  "⚠️ Unsupported file format",
54
+ "Please provide a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
55
  "No sections available",
56
  "No entities available",
57
  "Confidence Score: 0.0"
 
68
  # Initialize the document parser
69
  parser = DocumentParser()
70
 
71
def download_file(url: str) -> str:
    """Download the document at ``url`` into a temporary file and return its path.

    The temporary file is created with ``delete=False``, so the caller is
    responsible for removing it once it is no longer needed. The file suffix
    is taken from the last path component of the URL; when the URL path has
    no file name, the suffix of the default name ``document.pdf`` is used.

    Args:
        url: HTTP or HTTPS address of the document to fetch.

    Returns:
        Path of the temporary file holding the downloaded bytes.

    Raises:
        Exception: If the URL is not an http/https URL with a host, if the
            request fails or returns an error status, or if the file cannot
            be written. The message starts with "Failed to download file: "
            and the underlying error is chained as ``__cause__``.
    """
    try:
        parsed_url = urlparse(url)
        # Fail fast on anything that is not a fetchable web URL instead of
        # handing it to the HTTP client.
        if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
            raise ValueError(f"unsupported or malformed URL: {url!r}")

        # Extract filename from URL, falling back to a default name
        filename = os.path.basename(parsed_url.path) or "document.pdf"
        suffix = os.path.splitext(filename)[1]

        # A timeout keeps a stalled server from blocking the caller forever.
        response = requests.get(url, allow_redirects=True, timeout=30)
        response.raise_for_status()

        # Save to temporary file; remove it again if the write fails so a
        # failed download does not leave an orphaned file behind.
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            with tmp_file:
                tmp_file.write(response.content)
        except BaseException:
            try:
                os.unlink(tmp_file.name)
            except OSError:
                pass
            raise
        return tmp_file.name

    except Exception as e:
        # Keep the original message format, but chain the cause for debugging.
        raise Exception(f"Failed to download file: {str(e)}") from e
91
 
92
+ def process_input(file_input, url_input):
93
+ """Process either uploaded file or URL input"""
94
+ # Check if we have any input
95
+ if file_input is None and not url_input:
96
+ return ERROR_MESSAGES["no_input"]
97
+
98
+ temp_file = None
99
  try:
100
+ # Handle URL input if provided
101
+ if url_input:
102
+ try:
103
+ temp_file = download_file(url_input)
104
+ result = parser.parse(temp_file)
105
+ except Exception as e:
106
+ return ERROR_MESSAGES["download_error"]
107
+ # Handle file upload
108
+ else:
109
+ result = parser.parse(file_input)
110
 
111
  # Prepare the outputs
112
  metadata_df = pd.DataFrame([{
 
158
  "No entities available",
159
  "Confidence Score: 0.0"
160
  )
161
+ finally:
162
+ # Cleanup temporary file if it was created
163
+ if temp_file and os.path.exists(temp_file):
164
+ try:
165
+ os.unlink(temp_file)
166
+ except:
167
+ pass
168
 
169
  # Create Gradio interface
170
  with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
 
178
  file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
179
  type="filepath"
180
  )
181
+ url_input = gr.Textbox(
182
+ label="Or Enter Document URL",
183
+ placeholder="https://example.com/document.pdf"
184
+ )
185
  submit_btn = gr.Button("Process Document", variant="primary")
186
 
187
  with gr.Column():
 
217
 
218
  # Handle file submission
219
  submit_btn.click(
220
+ fn=process_input,
221
+ inputs=[file_input, url_input],
222
  outputs=[
223
  content_output,
224
  metadata_output,
 
235
  - Text Files (*.txt)
236
  - HTML Files (*.html)
237
  - Markdown Files (*.md)
238
+
239
+ ### 🔗 Example URLs
240
+ - ArXiv PDFs: https://arxiv.org/pdf/2408.08921.pdf
241
+ - Research Papers
242
+ - Documentation
243
  """)
244
 
245
  gr.Markdown(ARTICLE)
requirements.txt CHANGED
@@ -9,4 +9,5 @@ gradio>=4.44.1
9
  pandas>=1.5.0
10
  huggingface-hub>=0.19.0
11
  python-magic-bin>=0.4.14; platform_system == "Windows"
12
- libmagic; platform_system == "Linux"
 
 
9
  pandas>=1.5.0
10
  huggingface-hub>=0.19.0
11
  python-magic-bin>=0.4.14; platform_system == "Windows"
12
+ libmagic; platform_system == "Linux"
13
+ requests>=2.31.0