File size: 2,759 Bytes
be31cd7
 
169e42a
 
0b3c590
169e42a
 
 
 
 
 
 
0b3c590
169e42a
 
0b3c590
 
 
 
169e42a
 
6149ec1
169e42a
 
 
 
 
 
 
 
 
 
 
 
 
dd5e687
169e42a
51bbabe
 
 
 
 
 
 
 
 
 
 
169e42a
 
 
 
0b3c590
 
6149ec1
0b3c590
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169e42a
 
0b3c590
fe8c37e
 
adf38ca
fe8c37e
 
 
0b3c590
 
169e42a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import Fetcher, StealthyFetcher
from markitdown import MarkItDown
import tempfile
import os

app = FastAPI()

# auto_match lets scrapling re-locate elements when a page's structure changes
fetcher = Fetcher(auto_match=True)
# Browser-based fetcher for bot-protected pages (requires camoufox binaries)
stealthy_fetcher = StealthyFetcher()
# HTML-file -> Markdown converter
md = MarkItDown()

def stealthy_scraper(url):
    """Fetch *url* with the stealthy (browser-based) fetcher.

    Returns:
        The page's HTML source as a str.
    """
    # Bug fix: previously returned the fetch response object itself, but
    # convert_html_to_md() calls .encode() on the result and therefore
    # requires a string — extract .html_content, mirroring scraper().
    return stealthy_fetcher.fetch(url).html_content

def scraper(url):
    """Fetch *url* with the plain HTTP fetcher and return its HTML source as a str."""
    return fetcher.get(url).html_content

def convert_html_to_md(html):
    """Convert an HTML string to Markdown text.

    MarkItDown consumes file paths, so the HTML is written to a temporary
    .html file, converted, and the file is removed afterwards.

    Args:
        html: page source as a str.

    Returns:
        The Markdown rendition of *html* as a str.
    """
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
        temp_file.write(html.encode('utf-8'))
        temp_file.flush()
        temp_file_path = temp_file.name
    try:
        # Convert after the file handle is closed (safer on Windows, where an
        # open NamedTemporaryFile cannot be reopened by another reader).
        markdown = md.convert(temp_file_path).text_content
    finally:
        # Bug fix: the original leaked the temp file when convert() raised;
        # also dropped a leftover debug print of the temp path.
        os.remove(temp_file_path)
    return markdown

# GET endpoint to /read/{url:path} expecting URL in path
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the URL embedded after /read/ and return the page as Markdown.

    The target is taken from the raw request URL (not the `url` path
    parameter) so that any query string on the target survives.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        full_url = str(request.url)

        # Keep everything after the FIRST `/read/` — maxsplit=1 so a target
        # URL that itself contains `/read/` is not truncated. unquote undoes
        # percent-encoding applied to the path segment.
        full_url = unquote(full_url.split("/read/", 1)[1])

        # Default to http:// when no scheme was supplied.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = convert_html_to_md(scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e

# GET endpoint to /reader/{url:path} expecting URL in path
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
def get_markdown_reader(request: Request, url: str):
    """Fetch the URL embedded after /reader/ with the stealthy (browser-based)
    fetcher and return the page as Markdown.

    Kept as a sync `def` on purpose: FastAPI runs it in a threadpool, so the
    blocking browser fetch does not stall the event loop.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    # Renamed from `get_markdown_get`, which shadowed the /read handler's
    # function in the module namespace (duplicate definition).
    try:
        full_url = str(request.url)

        # Keep everything after the FIRST `/reader/` — maxsplit=1 so a target
        # URL that itself contains `/reader/` is not truncated. unquote undoes
        # percent-encoding applied to the path segment.
        full_url = unquote(full_url.split("/reader/", 1)[1])

        # Default to http:// when no scheme was supplied.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = convert_html_to_md(stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


if __name__ == "__main__":
    import subprocess

    # Download camoufox's browser binaries before serving — StealthyFetcher
    # needs them. Failure is deliberately non-fatal so the plain /read
    # endpoint still works without the stealthy one.
    try:
        subprocess.run(['camoufox', 'fetch'], check=True)
        print("Command executed successfully!")
    except subprocess.CalledProcessError as e:
        # check=True raises this when the command exits non-zero.
        print(f"An unexpected error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    # uvicorn is already imported at the top of the file; the original
    # re-imported it here redundantly.
    uvicorn.run(app, host="0.0.0.0", port=7860)