File size: 2,759 Bytes
be31cd7
 
169e42a
 
0b3c590
169e42a
 
 
 
 
 
 
0b3c590
169e42a
 
0b3c590
 
 
 
169e42a
 
6149ec1
169e42a
 
 
 
 
 
 
 
 
 
 
 
 
dd5e687
169e42a
51bbabe
 
 
 
 
 
 
 
 
 
 
169e42a
 
 
 
0b3c590
 
6149ec1
0b3c590
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169e42a
 
0b3c590
fe8c37e
 
adf38ca
fe8c37e
 
 
0b3c590
 
169e42a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import Fetcher, StealthyFetcher
from markitdown import MarkItDown
import tempfile
import os

app = FastAPI()

# auto_match lets scrapling re-locate elements when a page's structure changes
fetcher = Fetcher(auto_match=True)
# Browser-based fetcher for bot-protected pages (requires camoufox binaries)
stealthy_fetcher = StealthyFetcher()
# HTML-file -> Markdown converter
md = MarkItDown()

def stealthy_scraper(url):
    """Fetch *url* with the stealthy (browser-based) fetcher.

    Returns:
        The page's HTML source as a str.
    """
    # Bug fix: previously returned the fetch response object itself, but
    # convert_html_to_md() calls .encode() on the result and therefore
    # requires a string — extract .html_content, mirroring scraper().
    return stealthy_fetcher.fetch(url).html_content

def scraper(url):
    """Fetch *url* with the plain HTTP fetcher and return its HTML source as a str."""
    return fetcher.get(url).html_content

def convert_html_to_md(html):
    """Convert an HTML string to Markdown text.

    MarkItDown consumes file paths, so the HTML is written to a temporary
    .html file, converted, and the file is removed afterwards.

    Args:
        html: page source as a str.

    Returns:
        The Markdown rendition of *html* as a str.
    """
    with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file:
        temp_file.write(html.encode('utf-8'))
        temp_file.flush()
        temp_file_path = temp_file.name
    try:
        # Convert after the file handle is closed (safer on Windows, where an
        # open NamedTemporaryFile cannot be reopened by another reader).
        markdown = md.convert(temp_file_path).text_content
    finally:
        # Bug fix: the original leaked the temp file when convert() raised;
        # also dropped a leftover debug print of the temp path.
        os.remove(temp_file_path)
    return markdown

# GET endpoint to /read/{url:path} expecting URL in path
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the URL embedded after /read/ and return the page as Markdown.

    The target is taken from the raw request URL (not the `url` path
    parameter) so that any query string on the target survives.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        full_url = str(request.url)

        # Keep everything after the FIRST `/read/` — maxsplit=1 so a target
        # URL that itself contains `/read/` is not truncated. unquote undoes
        # percent-encoding applied to the path segment.
        full_url = unquote(full_url.split("/read/", 1)[1])

        # Default to http:// when no scheme was supplied.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = convert_html_to_md(scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e

# GET endpoint to /reader/{url:path} expecting URL in path
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
def get_markdown_reader(request: Request, url: str):
    """Fetch the URL embedded after /reader/ with the stealthy (browser-based)
    fetcher and return the page as Markdown.

    Kept as a sync `def` on purpose: FastAPI runs it in a threadpool, so the
    blocking browser fetch does not stall the event loop.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    # Renamed from `get_markdown_get`, which shadowed the /read handler's
    # function in the module namespace (duplicate definition).
    try:
        full_url = str(request.url)

        # Keep everything after the FIRST `/reader/` — maxsplit=1 so a target
        # URL that itself contains `/reader/` is not truncated. unquote undoes
        # percent-encoding applied to the path segment.
        full_url = unquote(full_url.split("/reader/", 1)[1])

        # Default to http:// when no scheme was supplied.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"

        markdown_output = convert_html_to_md(stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


if __name__ == "__main__":
    import subprocess

    # Download camoufox's browser binaries before serving — StealthyFetcher
    # needs them. Failure is deliberately non-fatal so the plain /read
    # endpoint still works without the stealthy one.
    try:
        subprocess.run(['camoufox', 'fetch'], check=True)
        print("Command executed successfully!")
    except subprocess.CalledProcessError as e:
        # check=True raises this when the command exits non-zero.
        print(f"An unexpected error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    # uvicorn is already imported at the top of the file; the original
    # re-imported it here redundantly.
    uvicorn.run(app, host="0.0.0.0", port=7860)