# Spaces: bcci/reader-api (Sleeping)
# File size: 1,590 Bytes

from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import Fetcher
from markitdown import MarkItDown
import tempfile
import os

# ASGI application object; the route handler below registers itself on it.
app = FastAPI()

# Module-level instances created once at import time and shared by all requests.
fetcher = Fetcher(auto_match=True)  # used by scraper() to download pages
md = MarkItDown()  # used by convert_html_to_md() to convert HTML files

def scraper(url):
    """Fetch *url* with the module-level ``fetcher`` and return ``prettify()`` of the result.

    The returned value is what ``convert_html_to_md`` later receives as its
    HTML input.
    """
    return fetcher.get(url).prettify()

def convert_html_to_md(html):
    """Convert an HTML string to text via the module-level ``md`` converter.

    The HTML is written to a temporary ``.html`` file and the converter is
    handed that file's path. The temporary file is always removed before this
    function returns, whether or not writing or conversion succeeds.

    Args:
        html: The HTML document as a ``str``; it is stored UTF-8 encoded.

    Returns:
        The ``text_content`` attribute of the conversion result.

    Raises:
        Whatever ``md.convert`` or the file operations raise; the temporary
        file is cleaned up first.
    """
    # delete=False keeps the file on disk after it is closed, so the converter
    # can open it by path; removal is then handled explicitly below.
    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    try:
        # Close the file before converting: this guarantees the content is
        # fully written and avoids reopening a still-open temp file, which
        # some platforms (e.g. Windows) do not allow.
        with temp_file:
            temp_file.write(html.encode("utf-8"))
        return md.convert(temp_file.name).text_content
    finally:
        # Runs on success and on failure, so a failed write or conversion
        # does not leave a stray file in the temp directory.
        os.remove(temp_file.name)

# GET endpoint to /read/{url:path} expecting URL in path
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the page whose address follows ``/read/`` and return it as plain text.

    Args:
        request: The incoming request. The target address is taken from the
            string form of ``request.url`` rather than from ``url``
            (presumably so that a query string on the target is retained —
            confirm against the framework's URL handling).
        url: Path parameter captured by the route. It is required by the
            route pattern but not read by this handler.

    Returns:
        A ``PlainTextResponse`` containing the converted page.

    Raises:
        HTTPException: Status 500 with the underlying error message when any
            step (URL extraction, fetching, conversion) fails.
    """
    # NOTE(review): the target address is caller-controlled and fetched
    # server-side; consider restricting schemes/hosts to limit SSRF exposure.
    # NOTE(review): scraper() and convert_html_to_md() are synchronous calls
    # made directly inside this async handler.
    try:
        # Split on the FIRST "/read/" only: the target itself may contain that
        # segment (e.g. https://example.com/read/post), and everything after
        # the route prefix belongs to the target.
        full_url = str(request.url).split("/read/", 1)[1]

        # Default to plain http when the caller omitted the scheme.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = convert_html_to_md(scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Top-level boundary: report any failure as a 500, keeping the
        # original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


# Start the server only when this file is executed directly (not on import).
if __name__ == "__main__":
    # host 0.0.0.0 listens on all network interfaces; the port is fixed at 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)