# NOTE: the "Spaces: / Sleeping / Sleeping" lines here were HuggingFace Space
# status-badge residue from a web scrape, not code; removed.
"""FastAPI micro-service: fetch a web page and return it as Markdown."""

import os
import tempfile
from urllib.parse import unquote

import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import PlainTextResponse
from markitdown import MarkItDown
from scrapling import Fetcher

app = FastAPI()
# auto_match lets scrapling tolerate minor page-structure drift between fetches.
fetcher = Fetcher(auto_match=True)
md = MarkItDown()
def scraper(url):
    """Fetch *url* with the module-level scrapling Fetcher and return the
    page as prettified (indented) HTML text."""
    page = fetcher.get(url)
    return page.prettify()
def convert_html_to_md(html):
    """Convert an HTML string to Markdown via MarkItDown.

    MarkItDown's ``convert()`` takes a file path, so the HTML is first
    written to a temporary ``.html`` file.  ``delete=False`` is needed so
    the closed file can be reopened by MarkItDown (required on Windows);
    the ``finally`` block guarantees the file is removed even if the
    conversion raises, so no temp files are leaked.

    :param html: HTML document as a ``str``.
    :return: extracted Markdown text (``str``).
    """
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
        # Exiting the ``with`` closes (and therefore flushes) the file
        # before MarkItDown reads it — no explicit flush() needed.
        temp_file.write(html.encode("utf-8"))
        temp_file_path = temp_file.name
    try:
        return md.convert(temp_file_path).text_content
    finally:
        os.remove(temp_file_path)
# GET endpoint at /read/{url:path}: the target URL is everything after /read/.
@app.get("/read/{url:path}")
async def get_markdown_get(url: str, request: Request):
    """Scrape the URL embedded in the request path and return it as Markdown.

    The target is taken from the raw request URL (not the decoded path
    parameter) so query strings survive, then percent-decoded with
    ``unquote``.  A bare host like ``example.com`` gets an ``http://``
    prefix.  Any failure is reported as a 500 with the error message.
    """
    try:
        # Raw URL so the target's own query string is preserved; maxsplit=1
        # keeps targets that themselves contain "/read/" intact.
        full_url = str(request.url).split("/read/", 1)[1]
        full_url = unquote(full_url)
        # Default to http:// when the caller omitted the scheme.
        if not full_url.startswith(("http://", "https://")):
            full_url = f"http://{full_url}"
        markdown_output = convert_html_to_md(scraper(full_url))
        return PlainTextResponse(markdown_output)
    except HTTPException:
        # Don't re-wrap deliberate HTTP errors as opaque 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}")
if __name__ == "__main__":
    # Bind all interfaces; 7860 is the conventional HuggingFace Spaces port.
    uvicorn.run(app, host="0.0.0.0", port=7860)