reader-api / app.py
bcci's picture
Update app.py
51bbabe verified
raw
history blame
1.57 kB
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import Fetcher
from markitdown import MarkItDown
import tempfile
import os
# ASGI application object; the /read route is registered on it and uvicorn serves it.
app = FastAPI()
# Module-level fetcher, built once at import time and shared by every call to scraper().
fetcher = Fetcher(auto_match=True)
# Module-level MarkItDown converter shared by every call to convert_html_to_md().
md = MarkItDown()
def scraper(url):
    """Download *url* with the shared fetcher and return the page pretty-printed.

    Args:
        url: Address of the page to fetch; passed unchanged to ``fetcher.get``.

    Returns:
        The result of calling ``prettify()`` on the fetched page.
    """
    return fetcher.get(url).prettify()
def convert_html_to_md(html):
    """Convert an HTML string to Markdown by way of a temporary ``.html`` file.

    The converter is called with a file path, so the HTML is first written to
    a named temporary file. That file is always removed before returning,
    including when writing or converting raises.

    Args:
        html: HTML document as a ``str``; it is stored on disk as UTF-8.

    Returns:
        The ``text_content`` attribute of the conversion result.

    Raises:
        Exception: Whatever the write or ``md.convert`` raises is propagated
            unchanged once the temporary file has been deleted.
    """
    # delete=False: the file has to outlive its handle so it can be reopened
    # by path for the conversion; we take over deletion ourselves below.
    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    temp_file_path = temp_file.name
    try:
        # Leaving the ``with`` closes the handle, which flushes the bytes to
        # disk before the converter reads the file.
        with temp_file:
            temp_file.write(html.encode("utf-8"))
        return md.convert(temp_file_path).text_content
    finally:
        # Runs on success and on failure, so no temp file is left behind.
        os.remove(temp_file_path)
# GET endpoint at /read/{url:path}: everything after /read/ is the page to convert
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(url: str, request: Request):
    """Fetch the page addressed after ``/read/`` and return it as Markdown.

    Args:
        url: Path parameter captured by the route. The target is re-derived
            from the full request URL instead, so that a query string on the
            target is not lost.
        request: Incoming request, injected by FastAPI; only its ``url`` is
            read.

    Returns:
        A ``PlainTextResponse`` whose body is the Markdown conversion of the
        fetched page.

    Raises:
        HTTPException: Status 500 with the underlying error message when
            fetching or converting fails for any reason.
    """
    # NOTE(review): scraper() and convert_html_to_md() are plain synchronous
    # calls inside an async handler — confirm blocking the event loop here is
    # acceptable for the expected load.
    try:
        # Full URL of this request, including any query string
        full_url = str(request.url)
        # Keep everything after the first "/read/"; maxsplit=1 preserves a
        # later "/read/" that belongs to the target URL itself.
        full_url = full_url.split("/read/", 1)[1]
        # Default to plain HTTP when the caller omitted the scheme
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = convert_html_to_md(scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Top-level boundary: report any failure as a 500, keeping the cause
        # chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
# Script entry point: when run directly, serve the app on all interfaces, port 7860.
if __name__ == "__main__":
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)