# Spaces: Sleeping  (page-header residue captured with the source; not part of the program)
import os
import tempfile
from urllib.parse import unquote

import uvicorn
from fastapi import FastAPI, HTTPException, Request
# PlainTextResponse lives in fastapi.responses; it is not exported from the
# top-level fastapi package.
from fastapi.responses import PlainTextResponse
from markitdown import MarkItDown
from scrapling import Fetcher
# Module-level singletons shared by every request handler below.
app = FastAPI()
# NOTE(review): auto_match=True is passed straight to scrapling's Fetcher;
# confirm the installed scrapling version still accepts this argument.
fetcher = Fetcher(auto_match=True)
md = MarkItDown()
def scraper(url: str) -> str:
    """Fetch *url* with the shared ``fetcher`` and return its prettified markup.

    Args:
        url: Address of the page to download.

    Returns:
        The result of calling ``prettify()`` on the fetched page. Presumably
        an HTML string, since ``convert_html_to_md`` calls ``.encode`` on it.
    """
    page = fetcher.get(url)
    return page.prettify()
def convert_html_to_md(html: str) -> str:
    """Convert an HTML string to Markdown using the shared ``md`` converter.

    The HTML is written to a temporary ``.html`` file, which is handed to
    ``md.convert`` by path. The file is always removed afterwards, including
    when writing or conversion raises.

    Args:
        html: HTML document text; it is encoded as UTF-8 before being written.

    Returns:
        The ``text_content`` of the conversion result.
    """
    # delete=False so the file survives being closed; it is removed explicitly
    # in the ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    try:
        # Close the handle before converting: closing flushes the data and
        # lets the converter reopen the path on platforms that forbid a
        # second open of a still-open temporary file.
        with temp_file:
            temp_file.write(html.encode("utf-8"))
        return md.convert(temp_file.name).text_content
    finally:
        os.remove(temp_file.name)
# POST endpoint to /reader expecting URL in JSON body
@app.post("/reader")
async def get_markdown_post(request: Request):
    """Return the Markdown rendering of the page named in the JSON body.

    Expects a JSON object of the form ``{"url": "<address>"}``.

    Args:
        request: Incoming request whose body carries the JSON payload.

    Returns:
        A ``PlainTextResponse`` containing the generated Markdown.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no non-empty string ``url``; 500 when fetching or
            converting the page fails.
    """
    bad_request_detail = (
        "Please provide a URL in the request body as JSON: {'url': 'your_url'}"
    )

    try:
        request_data = await request.json()
    except ValueError as exc:
        # Malformed JSON is a client error, not a server failure.
        raise HTTPException(status_code=400, detail=bad_request_detail) from exc

    # A JSON array/number/string body has no "url" key to read.
    url = request_data.get("url") if isinstance(request_data, dict) else None
    if not url or not isinstance(url, str):
        raise HTTPException(status_code=400, detail=bad_request_detail)

    try:
        decoded_url = unquote(url)
        # NOTE: scraper() and convert_html_to_md() are synchronous and run
        # directly inside this coroutine.
        markdown_output = convert_html_to_md(scraper(decoded_url))
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error processing URL: {e}"
        ) from e
    return PlainTextResponse(markdown_output)
# GET endpoint to /read/{url:path} expecting URL in path
@app.get("/read/{url:path}")
async def get_markdown_get(url: str):
    """Return the Markdown rendering of the page whose URL follows ``/read/``.

    Args:
        url: Target address taken from the request path; it is
            percent-decoded before being fetched.

    Returns:
        A ``PlainTextResponse`` containing the generated Markdown.

    Raises:
        HTTPException: 500 when fetching or converting the page fails.
    """
    # NOTE(review): only the path portion is captured here; presumably a
    # query string on the target URL must be percent-encoded by the caller
    # to reach this handler. Confirm against real clients.
    try:
        decoded_url = unquote(url)  # URL in path needs unquoting as well
        markdown_output = convert_html_to_md(scraper(decoded_url))
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error processing URL: {e}"
        ) from e
    return PlainTextResponse(markdown_output)
if __name__ == "__main__":
    # Serve the app on every network interface, port 7860, when this file is
    # executed directly as a script.
    uvicorn.run(app, host="0.0.0.0", port=7860)