File size: 1,590 Bytes
be31cd7
 
169e42a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd5e687
169e42a
51bbabe
 
 
 
 
 
 
 
 
 
 
169e42a
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import Fetcher
from markitdown import MarkItDown
import tempfile
import os

# ASGI application object; the route below is registered on it.
app = FastAPI()

# Module-level instances reused by every call to scraper() / convert_html_to_md().
# NOTE(review): the effect of auto_match=True is defined by scrapling — confirm against its docs.
fetcher = Fetcher(auto_match=True)
md = MarkItDown()

def scraper(url):
    """Fetch *url* with the shared fetcher and return its prettified markup.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``prettify()`` yields for the fetched page
        (presumably the HTML as a string — verify against scrapling).
    """
    page = fetcher.get(url)
    pretty_markup = page.prettify()
    return pretty_markup

def convert_html_to_md(html):
    """Convert an HTML string to Markdown using the shared MarkItDown instance.

    The HTML is written to a temporary ``.html`` file because the converter is
    called with a file path. The temporary file is removed afterwards, whether
    or not the conversion succeeds.

    Args:
        html: HTML document as a ``str``; it is encoded as UTF-8 on disk.

    Returns:
        The ``text_content`` attribute of the conversion result.

    Raises:
        Any exception raised while writing the file or by ``md.convert`` is
        propagated after the temporary file has been cleaned up.
    """
    # delete=False: the file must outlive its handle so it can be reopened by
    # path once closed (reopening while still open is not portable).
    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    temp_file_path = temp_file.name
    try:
        # Close the handle before converting so all bytes are on disk.
        with temp_file:
            temp_file.write(html.encode("utf-8"))
        return md.convert(temp_file_path).text_content
    finally:
        # Always clean up, even when writing or converting fails.
        os.remove(temp_file_path)

# GET endpoint to /read/{url:path} expecting URL in path
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the page named after ``/read/`` and return it as Markdown.

    Args:
        request: Incoming request; its full URL is used to recover the target.
        url: Path parameter captured by the route. It is not used directly;
            the target is taken from ``request.url`` instead (presumably so
            the target's query string is preserved — confirm).

    Returns:
        A ``PlainTextResponse`` whose body is the Markdown conversion.

    Raises:
        HTTPException: With status 500 when fetching or converting fails.

    NOTE(review): this fetches arbitrary caller-supplied URLs; consider
    restricting targets if the service is exposed publicly (SSRF).
    """
    try:
        # Retrieve the full URL of the request
        full_url = str(request.url)

        # Extract everything after the FIRST `/read/`. maxsplit=1 keeps target
        # URLs that themselves contain "/read/" intact instead of truncating.
        full_url = full_url.split("/read/", 1)[1]

        # Default to http:// when the caller omitted the scheme
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = convert_html_to_md(scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Top-level boundary: report any failure as a 500, keeping the cause chained.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


# When executed as a script, serve the app with uvicorn on all interfaces, port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)