chore: log temp file, MIME type and guessed extension; fall back to ".tmp" when no extension can be guessed
Browse files- docsifer/service.py +9 -2
docsifer/service.py
CHANGED
@@ -111,18 +111,25 @@ class DocsiferService:
|
|
111 |
# Use a temp directory so MarkItDown sees the real file extension
|
112 |
with tempfile.TemporaryDirectory() as tmpdir:
|
113 |
mime_type = magic.from_file(str(src), mime=True)
|
|
|
114 |
if not mime_type:
|
115 |
logger.warning(f"Could not detect file type for: {src}")
|
116 |
new_filename = src.name
|
117 |
else:
|
118 |
logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
|
119 |
-
guessed_ext = mimetypes.guess_extension(mime_type) or ""
|
120 |
new_filename = f"{src.stem}{guessed_ext}"
|
121 |
tmp_path = Path(tmpdir) / new_filename
|
122 |
tmp_path.write_bytes(src.read_bytes())
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
# If it's HTML and cleanup is requested
|
125 |
-
if cleanup and
|
126 |
self._maybe_cleanup_html(tmp_path)
|
127 |
|
128 |
# Decide whether to use LLM or basic
|
|
|
111 |
# Use a temp directory so MarkItDown sees the real file extension
|
112 |
with tempfile.TemporaryDirectory() as tmpdir:
|
113 |
mime_type = magic.from_file(str(src), mime=True)
|
114 |
+
guessed_ext = mimetypes.guess_extension(mime_type) or ".tmp"
|
115 |
if not mime_type:
|
116 |
logger.warning(f"Could not detect file type for: {src}")
|
117 |
new_filename = src.name
|
118 |
else:
|
119 |
logger.debug(f"Detected MIME type '{mime_type}' for: {src}")
|
|
|
120 |
new_filename = f"{src.stem}{guessed_ext}"
|
121 |
tmp_path = Path(tmpdir) / new_filename
|
122 |
tmp_path.write_bytes(src.read_bytes())
|
123 |
|
124 |
+
logger.info(
|
125 |
+
"Using temp file: %s, MIME type: %s, Guessed ext: %s",
|
126 |
+
tmp_path,
|
127 |
+
mime_type,
|
128 |
+
guessed_ext,
|
129 |
+
)
|
130 |
+
|
131 |
# If it's HTML and cleanup is requested
|
132 |
+
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
133 |
self._maybe_cleanup_html(tmp_path)
|
134 |
|
135 |
# Decide whether to use LLM or basic
|