# Source: Hugging Face Space "bcci/reader-api" (Space status when captured: Sleeping).
# File size: 2,806 bytes.
# The following commit hashes appeared alongside the file in the page capture
# (they look like per-line blame annotations and are not part of the program):
# be31cd7, 169e42a, 6dc195c, 0b3c590, 69ebfb0, 6149ec1, dd5e687, 51bbabe, fe8c37e, adf38ca

from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import AsyncFetcher, StealthyFetcher
from markitdown import MarkItDown
import tempfile
import os

app = FastAPI()

fetcher = AsyncFetcher(auto_match=True)
stealthy_fetcher = StealthyFetcher()
md = MarkItDown()

async def stealthy_scraper(url):
    """Fetch *url* with the module-level ``stealthy_fetcher`` and return its HTML.

    This is a coroutine: it awaits ``stealthy_fetcher.async_fetch`` and must
    itself be awaited by the caller.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The ``html_content`` attribute of the fetched page.
    """
    page = await stealthy_fetcher.async_fetch(url)
    return page.html_content


async def scraper(url):
    """Fetch *url* with the module-level ``fetcher`` and return its HTML.

    This is a coroutine: it awaits ``fetcher.get`` and must itself be awaited
    by the caller.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The ``html_content`` attribute of the fetched page.
    """
    page = await fetcher.get(url)
    return page.html_content


def convert_html_to_md(html):
    """Convert an HTML string to Markdown text using the module-level ``md``.

    The HTML is written to a temporary ``.html`` file because the converter is
    invoked with a file path. The file is always removed afterwards, even when
    the conversion raises.

    Args:
        html: HTML document as a ``str``.

    Returns:
        The ``text_content`` of the conversion result.
    """
    # delete=False so the file survives the ``with`` block; it is closed (and
    # therefore fully flushed) before the converter opens it by path.
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
        temp_file.write(html.encode('utf-8'))
        temp_file_path = temp_file.name
    print(temp_file_path)
    try:
        return md.convert(temp_file_path).text_content
    finally:
        # Clean up on both the success and the failure path.
        os.remove(temp_file_path)


def _target_url_from_request(request, route_prefix):
    """Return the URL to fetch: everything after the first *route_prefix*.

    The full request URL is used rather than the ``url`` path parameter,
    presumably so that the target's query string is kept (confirm against the
    framework's path-parameter behaviour).

    Args:
        request: The incoming request; only ``request.url`` is read.
        route_prefix: Route marker such as ``"/read/"`` or ``"/reader/"``.

    Returns:
        The target URL, prefixed with ``http://`` when it carries no
        ``http://`` or ``https://`` scheme.
    """
    raw_url = str(request.url)
    # maxsplit=1: a target URL may itself contain the route marker
    # (e.g. ".../read/https://example.com/read/post"); only the first
    # occurrence separates the route from the target.
    target = raw_url.split(route_prefix, 1)[1]
    if not target.startswith(('http://', 'https://')):
        target = f"http://{target}"
    return target


# GET endpoint to /read/{url:path} expecting URL in path
# NOTE(review): this fetches caller-supplied URLs from the server side; consider
# whether the set of reachable targets needs to be restricted.
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the page named after ``/read/`` and return it as Markdown.

    Uses the plain ``scraper``.

    Raises:
        HTTPException: status 500 with the underlying error message if URL
            extraction, fetching, or conversion fails.
    """
    try:
        full_url = _target_url_from_request(request, "/read/")
        markdown_output = convert_html_to_md(await scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


# GET endpoint to /reader/{url:path} expecting URL in path
# NOTE: this handler reuses the function name of the /read/ handler above; both
# routes stay registered because the decorator runs at definition time, but the
# module-level name ends up bound to this second function.
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the page named after ``/reader/`` and return it as Markdown.

    Uses ``stealthy_scraper`` instead of the plain ``scraper``.

    Raises:
        HTTPException: status 500 with the underlying error message if URL
            extraction, fetching, or conversion fails.
    """
    try:
        full_url = _target_url_from_request(request, "/reader/")
        markdown_output = convert_html_to_md(await stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


if __name__ == "__main__":
    import subprocess

    # Run `camoufox fetch` before starting the server (presumably this prepares
    # the browser used by the stealth fetcher -- confirm). This step is best
    # effort: any failure is printed and the server is started regardless.
    camoufox_command = ['camoufox', 'fetch']
    try:
        subprocess.run(camoufox_command, check=True)
        print("Command executed successfully!")
    except Exception as exc:
        print(f"An unexpected error occurred: {exc}")

    # Serve the FastAPI app on all interfaces, port 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)