Spaces:
Running
Running
Add application file
Browse files- .gitignore +3 -0
- README.md +2 -2
- app.py +432 -0
- requirements.txt +5 -0
- webui.bat +73 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.vs
|
2 |
+
venv
|
3 |
+
tmp
|
README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
---
|
2 |
title: HtmlToMarkdown
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.29.0
|
|
|
1 |
---
|
2 |
title: HtmlToMarkdown
|
3 |
+
emoji: π
|
4 |
+
colorFrom: green
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
sdk_version: 5.29.0
|
app.py
ADDED
@@ -0,0 +1,432 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE: the original file started with a mis-encoded UTF-8 BOM ("ο»Ώ") fused to
# the first import; it has been removed so the file parses cleanly everywhere.
import gradio as gr
import os
import shutil
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from queue import Queue
import time
import zipfile
import tempfile
import sys
import logging
import traceback
import pypandoc

# --- Configuration & Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Present as a regular desktop browser so documentation hosts don't reject the crawler.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
REQUEST_TIMEOUT = 20  # seconds
POLITENESS_DELAY = 0.3  # seconds between requests
# --- Pandoc Check ---
def check_pandoc_available():
    """Return True when pypandoc can locate a usable Pandoc executable.

    Logs an actionable error message and returns False when either the
    Pandoc binary or the pypandoc library itself is missing.
    """
    try:
        location = pypandoc.get_pandoc_path()
    except OSError:
        # pypandoc raises OSError when no pandoc binary can be found on PATH.
        logging.error("pypandoc could not find Pandoc executable.")
        logging.error("Please ensure Pandoc is installed OR install 'pypandoc_binary' (`pip install pypandoc_binary`)")
        return False
    except ImportError:
        logging.error("pypandoc library not found. Please install it (`pip install pypandoc_binary`).")
        return False
    logging.info(f"pypandoc found Pandoc executable at: {location}")
    return True
# --- Core Functions ---
def fetch_html(url):
    """Download *url* and return its HTML text, or None on any failure.

    Failures (timeout, HTTP error, connection problem, anything unexpected)
    are logged and reported to the caller as None rather than raised.
    """
    try:
        response = requests.get(
            url,
            timeout=REQUEST_TIMEOUT,
            headers={'User-Agent': USER_AGENT},
        )
        response.raise_for_status()
        # Prefer the encoding sniffed from the body; fall back to UTF-8
        # when requests cannot detect one.
        response.encoding = response.apparent_encoding or 'utf-8'
        logging.info(f"Successfully fetched: {url}")
        return response.text
    except requests.exceptions.Timeout:
        logging.error(f"Timeout fetching URL: {url}")
        return None
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching URL {url}: {e}")
        return None
    except Exception as e:
        logging.error(f"Unexpected error fetching {url}: {e}")
        return None
def convert_html_to_md(html_content: str, output_md_path: str, pandoc_output_format: str, pandoc_extra_args: list) -> bool:
    """
    Converts HTML content string to a Markdown file using pypandoc
    with specified format and arguments.

    Args:
        html_content: Raw HTML source of one page.
        output_md_path: Destination path for the generated .md file.
        pandoc_output_format: Pandoc writer spec, e.g. 'gfm+hard_line_breaks'.
        pandoc_extra_args: Extra CLI arguments passed through to pandoc.

    Returns:
        True when the Markdown file was written successfully, False otherwise.
    """
    if not html_content:
        logging.warning(f"Empty HTML content for {output_md_path}. Conversion skipped.")
        return False
    # Using html+smart enables better handling of typographic characters in source HTML
    input_format = 'html+smart' # Keep input format consistent

    try:
        logging.debug(f"pypandoc converting to {pandoc_output_format} with args: {pandoc_extra_args}")
        # Use pypandoc.convert_text to convert the HTML string
        # Specify input format ('html'), output format ('gfm'), and output file
        # pypandoc handles invoking pandoc correctly with the string input
        output = pypandoc.convert_text(
            source=html_content,
            to=pandoc_output_format,
            format=input_format,
            outputfile=output_md_path,
            extra_args=pandoc_extra_args,
            encoding='utf-8'
        )

        # When using outputfile, convert_text returns an empty string on success
        if output == "":
            logging.info(f"Successfully converted using pypandoc -> {os.path.basename(output_md_path)}")
            return True
        else:
            logging.error(f"pypandoc conversion to {output_md_path} returned unexpected non-empty output.")
            if os.path.exists(output_md_path) and os.path.getsize(output_md_path) == 0:
                logging.warning(f"Output file {output_md_path} was created but is empty.")
            return False

    except Exception as e:
        logging.error(f"Error during pypandoc conversion for {output_md_path}: {e}")
        logging.error(traceback.format_exc())
        # Best-effort cleanup: remove a zero-byte file left behind by the failed run
        # so it does not end up in the final ZIP.
        if os.path.exists(output_md_path) and os.path.getsize(output_md_path) == 0:
            try:
                os.remove(output_md_path)
                logging.info(f"Removed empty/failed output file: {os.path.basename(output_md_path)}")
            except OSError as remove_err:
                logging.warning(f"Could not remove empty/failed output file {output_md_path}: {remove_err}")
        return False
def create_zip_archive(source_dir, output_zip_path):
    """Pack every file under *source_dir* into a ZIP at *output_zip_path*.

    Returns True on success, False (with a logged error) on any failure.
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for folder, _subdirs, filenames in os.walk(source_dir):
                for name in filenames:
                    absolute_path = os.path.join(folder, name)
                    # Store entries relative to source_dir so the archive
                    # carries no temp-directory prefix inside it.
                    archive.write(absolute_path, os.path.relpath(absolute_path, source_dir))
    except Exception as e:
        logging.error(f"Failed to create ZIP archive {output_zip_path}: {e}")
        return False
    logging.info(f"Successfully created ZIP archive: {output_zip_path}")
    return True
# --- Main Gradio Function ---
def process_conversion_request(start_url_str, restrict_path, use_aggressive_conversion, progress=gr.Progress(track_tqdm=True)):
    """The main function triggered by the Gradio interface.

    Crawls internal HTML links starting from ``start_url_str``, converts each
    page to Markdown with pypandoc, and packages the results into a ZIP.

    Args:
        start_url_str: Starting URL (typically an index.html) to crawl from.
        restrict_path: When True, only follow links under the starting URL's
            first path segment (e.g. '/main/...').
        use_aggressive_conversion: When True, use a stricter Pandoc target
            that strips raw HTML and forces ATX headings.
        progress: Gradio progress tracker (injected by Gradio).

    Yields:
        (log_text, zip_path_or_None) tuples for the two Gradio outputs.

    BUG FIX: this function contains ``yield`` and is therefore a generator.
    The original early exits used ``return "Error...", None``; in a generator
    that value is attached to StopIteration and silently discarded, so the
    user never saw validation/Pandoc errors. Every exit path now *yields*
    its final (message, None) pair before returning.
    """

    # --- 0. Check Pandoc via pypandoc ---
    if not check_pandoc_available():
        yield "Error: pypandoc could not find a Pandoc executable. Please ensure Pandoc is installed or install `pypandoc_binary`.", None
        return

    # --- 1. Validate URL and Determine Restriction Path ---
    start_url_str = start_url_str.strip()
    start_path_dir_for_restriction = None  # Initialize restriction path base

    if not start_url_str:
        yield "Error: Starting URL cannot be empty.", None
        return
    try:
        parsed_start_url = urlparse(start_url_str)
        if not parsed_start_url.scheme or not parsed_start_url.netloc:
            raise ValueError("Invalid URL format (missing scheme or domain).")
        base_netloc = parsed_start_url.netloc
        base_scheme = parsed_start_url.scheme

        # Calculate the base directory path for comparison if restriction is enabled
        start_path_cleaned = parsed_start_url.path.strip('/')
        if start_path_cleaned:  # If not root path
            # dirname('main/index.html') -> 'main', but dirname('main') -> '',
            # so a bare first-level segment without a dot is treated as the
            # restriction directory itself.
            if '/' not in start_path_cleaned and '.' not in start_path_cleaned:
                start_path_dir_for_restriction = start_path_cleaned  # e.g. 'main'
            else:
                start_path_dir_for_restriction = os.path.dirname(start_path_cleaned)  # e.g. 'main' from main/index.html, or '' from /index.html
                if start_path_dir_for_restriction == '':  # Handle case like /index.html correctly
                    start_path_dir_for_restriction = None  # Treat like root, don't restrict path based on this

    except ValueError as e:
        yield f"Error: Invalid starting URL '{start_url_str}': {e}", None
        return

    # Log restriction status
    restriction_msg = f"Path restriction enabled: limiting to paths starting like '{start_path_dir_for_restriction}/'." if restrict_path and start_path_dir_for_restriction else "Path restriction disabled or starting from root."

    # --- Determine Pandoc Settings based on Checkbox ---
    # --wrap=none prevents pandoc from re-wrapping lines at a fixed column.
    if use_aggressive_conversion:
        pandoc_format_to_use = 'gfm-raw_html+hard_line_breaks'
        pandoc_args_to_use = ['--wrap=none', '--markdown-headings=atx']
        conversion_mode_msg = "Using aggressive Markdown conversion (less raw HTML, ATX headers)."
    else:
        # gfm+hard_line_breaks keeps GitHub compatibility and respects single newlines
        pandoc_format_to_use = 'gfm+hard_line_breaks'
        pandoc_args_to_use = ['--wrap=none']
        conversion_mode_msg = "Using standard Markdown conversion (may preserve more raw HTML)."

    logging.info(conversion_mode_msg)  # Log the mode

    # --- 2. Setup Temporary Directory & Crawler ---
    staging_dir = tempfile.mkdtemp(prefix="md_convert_")
    logging.info(f"Created temporary staging directory: {staging_dir}")

    urls_to_process = Queue()
    processed_urls = set()  # Avoid enqueueing the same URL twice
    failed_urls = set()
    converted_count = 0
    url_count_estimate = 1  # Total unique URLs discovered so far (starts with the first one)
    dequeued_count = 0

    urls_to_process.put(start_url_str)
    processed_urls.add(start_url_str)  # Add start URL here

    log_messages = ["Process started...", restriction_msg, conversion_mode_msg]

    try:
        # --- 3. Crawl and Convert Loop ---
        while not urls_to_process.empty():
            current_url = urls_to_process.get()
            dequeued_count += 1  # Increment when an item is taken for processing

            # --- Update Progress Bar ---
            # Progress is dequeued items over the total unique URLs seen so far;
            # the denominator grows as new links are discovered.
            denominator = max(1, url_count_estimate)
            current_progress_value = dequeued_count / denominator
            progress(current_progress_value, desc=f"Processing {dequeued_count}/{url_count_estimate}. Queue: {urls_to_process.qsize()}")

            log_message = f"\nProcessing ({dequeued_count}/{url_count_estimate}): {current_url}"
            logging.info(log_message)
            log_messages.append(log_message)

            # --- 3a. Fetch HTML ---
            time.sleep(POLITENESS_DELAY)  # be polite to the docs host
            html_content = fetch_html(current_url)
            if not html_content:
                failed_urls.add(current_url)
                log_message = f" -> Failed to fetch content."
                logging.warning(log_message)
                log_messages.append(log_message)
                continue

            # --- 3b. Determine Output Path ---
            parsed_current_url = urlparse(current_url)
            url_path_segment = parsed_current_url.path.strip('/')  # e.g. "main/index.html", ""
            # Domain root (https://example.com/) maps to 'index'
            if not url_path_segment:
                path_in_zip_base = 'index'
            else:
                path_in_zip_base = url_path_segment

            # Map the URL path to a .md path inside the ZIP:
            #   foo/bar.html -> foo/bar.md
            #   foo/         -> foo/index.md (defensive; strip('/') makes this rare)
            #   foo          -> foo/index.md (directory-style URL)
            #   foo/file.ext -> foo/file.ext.md (non-HTML file-like path)
            if path_in_zip_base.lower().endswith('.html'):
                relative_md_filename = os.path.splitext(path_in_zip_base)[0] + ".md"
            elif path_in_zip_base.endswith('/'):
                relative_md_filename = os.path.join(path_in_zip_base, "index.md")
            else:
                basename = os.path.basename(path_in_zip_base)
                if '.' in basename:  # Looks like a file without .html extension
                    relative_md_filename = path_in_zip_base + ".md"
                else:  # Assume it's a directory reference
                    relative_md_filename = os.path.join(path_in_zip_base, "index.md")

            output_md_full_path = os.path.join(staging_dir, relative_md_filename)
            output_md_dir = os.path.dirname(output_md_full_path)

            try:
                if output_md_dir and not os.path.exists(output_md_dir):
                    os.makedirs(output_md_dir)
            except OSError as e:
                log_message = f" -> Error creating directory {output_md_dir}: {e}. Skipping conversion for this URL."
                logging.error(log_message)
                log_messages.append(log_message)
                failed_urls.add(current_url)
                continue  # Skip to next URL

            # --- 3c. Convert HTML to Markdown ---
            if convert_html_to_md(html_content, output_md_full_path, pandoc_format_to_use, pandoc_args_to_use):
                converted_count += 1
                log_message = f" -> Converted successfully to {os.path.relpath(output_md_full_path, staging_dir)}"
                logging.info(log_message)
                log_messages.append(log_message)
            else:
                failed_urls.add(current_url)
                log_message = f" -> Conversion failed."
                logging.warning(log_message)
                log_messages.append(log_message)

            # --- 3d. Find and Add New Links ---
            try:
                soup = BeautifulSoup(html_content, 'lxml')
                links_found_this_page = 0
                links_skipped_due_to_path = 0
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    absolute_url = urljoin(current_url, href)
                    # Drop #fragments so the same page isn't queued repeatedly
                    absolute_url = urlparse(absolute_url)._replace(fragment="").geturl()
                    parsed_absolute_url = urlparse(absolute_url)

                    # Basic filtering: same scheme+host, and path that is empty,
                    # root, .html, or directory-like (no dot in last segment)
                    is_valid_target = (
                        parsed_absolute_url.scheme == base_scheme and
                        parsed_absolute_url.netloc == base_netloc and
                        (not parsed_absolute_url.path or
                         parsed_absolute_url.path == '/' or
                         parsed_absolute_url.path.lower().endswith('.html') or
                         '.' not in os.path.basename(parsed_absolute_url.path.rstrip('/'))
                        )
                    )

                    if not is_valid_target:
                        continue  # Skip invalid links early

                    # --- Path Restriction Check ---
                    path_restricted = False
                    # Only apply if checkbox is checked AND we derived a non-root restriction path
                    if restrict_path and start_path_dir_for_restriction is not None:
                        candidate_path_clean = parsed_absolute_url.path.strip('/')
                        # Accept paths under the restriction dir, or the dir itself
                        if not (candidate_path_clean.startswith(start_path_dir_for_restriction + '/') or
                                candidate_path_clean == start_path_dir_for_restriction):
                            path_restricted = True
                            links_skipped_due_to_path += 1
                    # --- End Path Restriction Check ---

                    # Add to queue only if NOT restricted and NOT already processed
                    if not path_restricted and absolute_url not in processed_urls:
                        processed_urls.add(absolute_url)  # Add to set immediately
                        urls_to_process.put(absolute_url)
                        links_found_this_page += 1
                        url_count_estimate += 1

                # Log link discovery summary for the page
                log_links_msg = f" -> Found {links_found_this_page} new link(s) to process."
                if links_skipped_due_to_path > 0:
                    log_links_msg += f" Skipped {links_skipped_due_to_path} link(s) due to path restriction."
                logging.info(log_links_msg)
                log_messages.append(log_links_msg)
            except Exception as e:
                log_message = f" -> Error parsing links on {current_url}: {e}"
                logging.error(log_message)
                log_messages.append(log_message)

        # --- 4. Create ZIP Archive ---
        progress(1.0, desc="Zipping files...")
        log_messages.append("\nCrawling complete. Creating ZIP file...")
        yield "\n".join(log_messages), None

        # delete=False: the file must outlive this function so Gradio can serve it
        with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as temp_zip:
            output_zip_path = temp_zip.name

        if create_zip_archive(staging_dir, output_zip_path):
            log_messages.append(f"\nProcess finished successfully!")
            log_messages.append(f"Converted {converted_count} pages using {'aggressive' if use_aggressive_conversion else 'standard'} mode.")
            if failed_urls:
                log_messages.append(f"Failed to process {len(failed_urls)} URLs (check logs).")
            log_messages.append(f"ZIP file ready: {os.path.basename(output_zip_path)}")
            yield "\n".join(log_messages), output_zip_path
        else:
            log_messages.append("\nError: Failed to create the final ZIP archive.")
            yield "\n".join(log_messages), None

    except KeyboardInterrupt:
        log_messages.append("\nProcess interrupted by user.")
        yield "\n".join(log_messages), None
    except Exception as e:
        log_messages.append(f"\nAn unexpected error occurred: {e}")
        logging.error("Unhandled exception in process_conversion_request:")
        logging.error(traceback.format_exc())
        yield "\n".join(log_messages), None
    finally:
        # --- 5. Cleanup ---
        if os.path.exists(staging_dir):
            try:
                shutil.rmtree(staging_dir)
                logging.info(f"Cleaned up temporary directory: {staging_dir}")
            except Exception as e:
                logging.error(f"Error cleaning up temporary directory {staging_dir}: {e}")
# Custom CSS: give multi-line textareas (the progress-log box) a thin,
# consistently styled scrollbar in both Firefox (scrollbar-width) and
# WebKit browsers (::-webkit-scrollbar).
css = """
textarea[rows]:not([rows="1"]) {
    overflow-y: auto !important;
    scrollbar-width: thin !important;
}
textarea[rows]:not([rows="1"])::-webkit-scrollbar {
    all: initial !important;
    background: #f1f1f1 !important;
}
textarea[rows]:not([rows="1"])::-webkit-scrollbar-thumb {
    all: initial !important;
    background: #a8a8a8 !important;
}
"""
# --- Gradio UI Definition ---
with gr.Blocks(title="HTML Docs to Markdown Converter", css=css) as demo:
    gr.Markdown(
        """
        # HTML Documentation to Markdown Converter (via pypandoc)
        Enter the starting `index.html` URL of an online documentation site.
        The script will crawl internal HTML links, convert pages to Markdown, and package results into a ZIP file.
        **Requires `pip install pypandoc_binary`**.
        """
    )

    # Crawl starting point
    with gr.Row():
        url_input = gr.Textbox(
            label="Starting Index HTML URL",
            placeholder="e.g., https://dghs-imgutils.deepghs.org/main/index.html"
        )

    # Crawl/conversion options
    with gr.Row():
        restrict_path_checkbox = gr.Checkbox(
            label="Restrict crawl to starting path structure (e.g., if start is '/main/index.html', only crawl '/main/...' URLs)",
            value=True  # Default to restricting path
        )
        aggressive_md_checkbox = gr.Checkbox(
            label="Aggressive Markdown conversion (disable raw HTML, use ATX headers)",
            value=True  # Default to aggressive conversion
        )

    with gr.Row():
        start_button = gr.Button("Start Conversion", variant="primary")

    # Streaming log output (process_conversion_request yields incremental updates)
    with gr.Row():
        log_output = gr.Textbox(label="Progress Logs", lines=15, interactive=False, show_copy_button=True)

    # Final ZIP download slot
    with gr.Row():
        zip_output = gr.File(label="Download Markdown ZIP")

    start_button.click(
        fn=process_conversion_request,
        inputs=[url_input, restrict_path_checkbox, aggressive_md_checkbox],
        outputs=[log_output, zip_output],
        show_progress="full"
    )
# --- Launch App ---
if __name__ == "__main__":
    # queue() is required so the generator-based click handler can stream
    # incremental (log, file) updates to the browser.
    demo.queue()
    demo.launch(inbrowser=True)  # open the local URL in the default browser
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==5.29.0
|
2 |
+
requests
|
3 |
+
beautifulsoup4
|
4 |
+
lxml
|
5 |
+
pypandoc_binary
|
webui.bat
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@echo off

:: The source of the webui.bat file is stable-diffusion-webui
:: set COMMANDLINE_ARGS=--whisper_implementation faster-whisper --input_audio_max_duration -1 --default_model_name large-v2 --auto_parallel True --output_dir output --vad_max_merge_size 90 --save_downloaded_files --autolaunch

if not defined PYTHON (set PYTHON=python)
if not defined VENV_DIR (set "VENV_DIR=%~dp0%venv")

mkdir tmp 2>NUL

:: Sanity-check that python can run at all
%PYTHON% -c "" >tmp/stdout.txt 2>tmp/stderr.txt
if %ERRORLEVEL% == 0 goto :check_pip
echo Couldn't launch python
goto :show_stdout_stderr

:check_pip
%PYTHON% -mpip --help >tmp/stdout.txt 2>tmp/stderr.txt
if %ERRORLEVEL% == 0 goto :start_venv
if "%PIP_INSTALLER_LOCATION%" == "" goto :show_stdout_stderr
%PYTHON% "%PIP_INSTALLER_LOCATION%" >tmp/stdout.txt 2>tmp/stderr.txt
if %ERRORLEVEL% == 0 goto :start_venv
echo Couldn't install pip
goto :show_stdout_stderr

:start_venv
if ["%VENV_DIR%"] == ["-"] goto :skip_venv
if ["%SKIP_VENV%"] == ["1"] goto :skip_venv

:: Reuse an existing venv when its interpreter is present
dir "%VENV_DIR%\Scripts\Python.exe" >tmp/stdout.txt 2>tmp/stderr.txt
if %ERRORLEVEL% == 0 goto :activate_venv

for /f "delims=" %%i in ('CALL %PYTHON% -c "import sys; print(sys.executable)"') do set PYTHON_FULLNAME="%%i"
echo Creating venv in directory %VENV_DIR% using python %PYTHON_FULLNAME%
%PYTHON_FULLNAME% -m venv "%VENV_DIR%" >tmp/stdout.txt 2>tmp/stderr.txt
if %ERRORLEVEL% == 0 goto :activate_venv
echo Unable to create venv in directory "%VENV_DIR%"
goto :show_stdout_stderr

:activate_venv
set PYTHON="%VENV_DIR%\Scripts\Python.exe"
echo venv %PYTHON%

:skip_venv
goto :launch

:launch
%PYTHON% app.py %COMMANDLINE_ARGS% %*
pause
exit /b

:show_stdout_stderr

echo.
echo exit code: %errorlevel%

for /f %%i in ("tmp\stdout.txt") do set size=%%~zi
if %size% equ 0 goto :show_stderr
echo.
echo stdout:
type tmp\stdout.txt

:show_stderr
for /f %%i in ("tmp\stderr.txt") do set size=%%~zi
:: BUG FIX: was `goto :show_stderr`, which jumps back to this same label and
:: loops forever whenever stderr.txt is empty; skip to the exit message instead.
if %size% equ 0 goto :endofscript
echo.
echo stderr:
type tmp\stderr.txt

:endofscript

echo.
echo Launch unsuccessful. Exiting.
pause