Spaces:
Sleeping
Sleeping
File size: 1,590 Bytes
be31cd7 169e42a dd5e687 169e42a 51bbabe 169e42a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import Fetcher
from markitdown import MarkItDown
import tempfile
import os
# ASGI application object; the route below is registered on it and it is
# handed to uvicorn in the __main__ guard.
app = FastAPI()
# Shared scrapling fetcher used by scraper() for every request.
# NOTE(review): auto_match=True is a scrapling option; confirm its effect
# against the installed scrapling version.
fetcher = Fetcher(auto_match=True)
# Shared MarkItDown converter used by convert_html_to_md().
md = MarkItDown()
def scraper(url):
    """Fetch *url* with the module-level ``fetcher`` and return ``prettify()`` of the result."""
    return fetcher.get(url).prettify()
def convert_html_to_md(html):
    """Convert an HTML string to text using the module-level ``md`` converter.

    The HTML is written to a temporary ``.html`` file, that file's path is
    passed to ``md.convert``, and the ``text_content`` of the result is
    returned. The temporary file is always removed, including when the write
    or the conversion raises.

    Args:
        html: HTML document as a ``str``; it is stored on disk as UTF-8.

    Returns:
        The ``text_content`` attribute of the object returned by
        ``md.convert``.

    Raises:
        Any exception raised while writing the file or by ``md.convert``
        propagates unchanged after cleanup.
    """
    # delete=False: the file must outlive the handle so the converter can
    # reopen it by path; removal is done explicitly in ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    temp_file_path = temp_file.name
    try:
        # Close (and thereby flush) the handle before the converter opens
        # the path, so the full content is on disk and the file is not held
        # open twice.
        with temp_file:
            temp_file.write(html.encode("utf-8"))
        return md.convert(temp_file_path).text_content
    finally:
        os.remove(temp_file_path)
# GET endpoint at /read/{url:path}; the target URL is carried in the path.
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the page named after ``/read/`` and return its converted text.

    Args:
        request: Incoming request; the target address is taken from its
            full URL (everything after the first ``/read/``).
        url: Path parameter captured by the route. It is required for the
            route to match but is not read here.
            NOTE(review): presumably ``request.url`` is used instead so the
            target's query string is kept; confirm.

    Returns:
        A ``PlainTextResponse`` whose body is the output of
        ``convert_html_to_md(scraper(target))``.

    Raises:
        HTTPException: status 500 with the underlying error message when
            fetching or converting fails.
    """
    # Split on the FIRST "/read/" only. Splitting on every occurrence and
    # taking element [1] would truncate targets that contain "/read/"
    # themselves (e.g. https://site.com/read/article).
    _, _, target_url = str(request.url).partition("/read/")

    # Default to plain http when the caller omitted the scheme.
    if not target_url.startswith(("http://", "https://")):
        target_url = f"http://{target_url}"

    # NOTE(review): target_url is caller-supplied and fetched server-side
    # with no host restrictions; consider an allow-list if this service is
    # publicly reachable.
    try:
        markdown_output = convert_html_to_md(scraper(target_url))
    except Exception as e:
        # Endpoint boundary: report any failure as a 500, keeping the cause
        # chained for server-side tracebacks.
        raise HTTPException(
            status_code=500, detail=f"Error processing URL: {e}"
        ) from e
    else:
        return PlainTextResponse(markdown_output)
if __name__ == "__main__":
    # Script entry point: serve ``app`` with uvicorn on all interfaces,
    # port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)