# reader-api / app.py
# Hugging Face file-view metadata captured together with the source:
#   bcci's picture | Create app.py | 169e42a verified | raw | history blame | 1.97 kB
import os
import tempfile
from urllib.parse import unquote

import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import PlainTextResponse
from markitdown import MarkItDown
from scrapling import Fetcher

# ASGI application object; the route decorators further down register handlers on it.
app = FastAPI()
# Shared scrapling fetcher used by scraper(); built once at import time.
fetcher = Fetcher(auto_match=True)
# Shared MarkItDown converter used by convert_html_to_md().
md = MarkItDown()
def scraper(url):
    """Fetch *url* with the module-level ``fetcher`` and return the prettified page.

    The object returned by ``fetcher.get(url)`` is rendered through its
    ``prettify()`` method and that result is returned unchanged.
    Callers in this file treat the result as ``str`` (it is UTF-8 encoded
    before conversion).
    """
    return fetcher.get(url).prettify()
def convert_html_to_md(html):
    """Convert an HTML string to Markdown using the module-level ``md`` converter.

    The converter is called with a file path, so the HTML is first written to
    a temporary ``.html`` file. That file is always removed before this
    function returns, including when writing or conversion fails.

    Args:
        html: HTML document as ``str``; it is encoded as UTF-8 on disk.

    Returns:
        The ``text_content`` attribute of the conversion result.

    Raises:
        Any exception raised while writing the file or by ``md.convert``.
    """
    # delete=False: the file must outlive the handle so it can be reopened by path.
    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    temp_file_path = temp_file.name
    try:
        # Close (and thereby flush) the handle before the converter opens the path.
        with temp_file:
            temp_file.write(html.encode("utf-8"))
        print(temp_file_path)
        return md.convert(temp_file_path).text_content
    finally:
        # Runs on success and on failure, so a failed conversion no longer leaks the file.
        os.remove(temp_file_path)
# POST endpoint to /reader expecting URL in JSON body
@app.post("/reader", response_class=PlainTextResponse)
async def get_markdown_post(request: Request):
    """Return the Markdown rendering of the page named in the JSON request body.

    The body must be a JSON object with a non-empty string under ``"url"``,
    e.g. ``{"url": "https://example.com"}``.

    Args:
        request: Incoming request; its body is parsed as JSON.

    Returns:
        A ``PlainTextResponse`` containing the Markdown text.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no non-empty string ``url``; 500 when fetching or
            converting the page fails.
    """
    usage_hint = "Please provide a URL in the request body as JSON: {'url': 'your_url'}"
    try:
        try:
            request_data = await request.json()
        except ValueError as exc:
            # json.JSONDecodeError / UnicodeDecodeError are ValueErrors: a bad
            # body is a client error, not a server failure.
            raise HTTPException(status_code=400, detail=usage_hint) from exc

        # A JSON array/string/number has no .get(); treat it like a missing URL.
        url = request_data.get("url") if isinstance(request_data, dict) else None
        if not url or not isinstance(url, str):
            raise HTTPException(status_code=400, detail=usage_hint)

        # NOTE(review): unquote() percent-decodes the JSON-supplied URL, which
        # can alter URLs that intentionally contain %-escapes; kept as-is for
        # backward compatibility.
        decoded_url = unquote(url)
        # NOTE(review): scraper() and convert_html_to_md() are synchronous and
        # are called directly from this coroutine.
        markdown_output = convert_html_to_md(scraper(decoded_url))
        return PlainTextResponse(markdown_output)
    except HTTPException:
        # Let deliberate HTTP errors (the 400s above) through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
# GET endpoint to /read/{url:path} expecting URL in path
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(url: str):
    """Return the Markdown rendering of the page whose URL follows ``/read/``.

    Args:
        url: Everything after ``/read/`` in the request path.

    Returns:
        A ``PlainTextResponse`` containing the Markdown text.

    Raises:
        HTTPException: 400 when no URL is given in the path; 500 when
            fetching or converting the page fails.
    """
    try:
        if not url:
            # Mirror the POST endpoint: a missing URL is a client error.
            raise HTTPException(
                status_code=400,
                detail="Please provide a URL in the path: /read/your_url",
            )
        # NOTE(review): only the path is captured here; a raw "?query" on the
        # target URL looks like it would be dropped unless the client
        # percent-encodes it. Confirm against real traffic.
        decoded_url = unquote(url)  # URL in path needs unquoting as well
        # NOTE(review): scraper() and convert_html_to_md() are synchronous and
        # are called directly from this coroutine.
        markdown_output = convert_html_to_md(scraper(decoded_url))
        return PlainTextResponse(markdown_output)
    except HTTPException:
        # Let the deliberate 400 above through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)