reader-api / app.py
bcci's picture
Update app.py
adf38ca verified
raw
history blame
2.76 kB
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
from urllib.parse import unquote
import uvicorn
from scrapling import Fetcher, StealthyFetcher
from markitdown import MarkItDown
import tempfile
import os
# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()
# Shared fetcher used by scraper() for the /read/ endpoint.
fetcher = Fetcher(auto_match=True)
# Shared fetcher used by stealthy_scraper() for the /reader/ endpoint.
stealthy_fetcher = StealthyFetcher()
# Shared converter used by convert_html_to_md() to turn HTML files into Markdown.
md = MarkItDown()
def stealthy_scraper(url):
    """Fetch *url* with the shared ``StealthyFetcher`` and return its HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``html_content`` of the fetched page, matching what ``scraper``
        returns for the plain fetcher.
    """
    page = stealthy_fetcher.fetch(url)
    # convert_html_to_md() calls ``.encode`` on its argument, so hand back
    # the markup text (as scraper() does) rather than the response object.
    return page.html_content
def scraper(url):
    """Fetch *url* with the shared ``Fetcher`` and return the page's HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``html_content`` attribute of the response from ``fetcher.get``.
    """
    return fetcher.get(url).html_content
def convert_html_to_md(html):
    """Convert an HTML string to Markdown text.

    The markup is written to a temporary ``.html`` file, the file's path is
    handed to the shared ``md`` converter, and the file is removed again
    whether or not the conversion succeeds.

    Args:
        html: HTML markup as a string.

    Returns:
        The ``text_content`` of the conversion result.
    """
    # Encode before creating the file so bad input cannot leave a stray file.
    payload = html.encode('utf-8')
    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    temp_file_path = temp_file.name
    try:
        # Close the handle before converting so the data is fully on disk.
        with temp_file:
            temp_file.write(payload)
        print(temp_file_path)
        return md.convert(temp_file_path).text_content
    finally:
        # delete=False means cleanup is our job, including on failure.
        os.remove(temp_file_path)
# GET endpoint to /read/{url:path} expecting URL in path
@app.get("/read/{url:path}", response_class=PlainTextResponse)
async def get_markdown_get(request: Request, url: str):
    """Fetch the page named after ``/read/`` and return it as Markdown.

    The target is taken from the full request URL rather than from the
    ``url`` path parameter (presumably so the target's query string is
    kept; confirm). A target without an ``http://`` or ``https://`` prefix
    is given ``http://``.

    Args:
        request: Incoming request; its full URL is parsed for the target.
        url: Path parameter captured by the route (not used directly).

    Returns:
        A ``PlainTextResponse`` containing the Markdown for the page.

    Raises:
        HTTPException: Status 500 if extraction, fetching or conversion fails.
    """
    # NOTE(review): scraper() looks synchronous; inside an ``async def``
    # handler it would block the event loop. Confirm and consider a plain
    # ``def`` handler.
    try:
        # Split only on the FIRST "/read/": without maxsplit, a target that
        # itself contains "/read/" would be cut off at that point.
        full_url = str(request.url).split("/read/", 1)[1]
        # Default to http:// when the caller omitted the scheme.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = convert_html_to_md(scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
# GET endpoint to /reader/{url:path} expecting URL in path
@app.get("/reader/{url:path}", response_class=PlainTextResponse)
def get_markdown_get(request: Request, url: str):
    """Fetch the page named after ``/reader/`` and return it as Markdown.

    Same flow as the ``/read/`` endpoint, but the page is fetched through
    ``stealthy_scraper``. The target is taken from the full request URL
    rather than from the ``url`` path parameter, and a target without an
    ``http://`` or ``https://`` prefix is given ``http://``.

    Args:
        request: Incoming request; its full URL is parsed for the target.
        url: Path parameter captured by the route (not used directly).

    Returns:
        A ``PlainTextResponse`` containing the Markdown for the page.

    Raises:
        HTTPException: Status 500 if extraction, fetching or conversion fails.
    """
    try:
        # Split only on the FIRST "/reader/": without maxsplit, a target
        # that itself contains "/reader/" would be cut off at that point.
        full_url = str(request.url).split("/reader/", 1)[1]
        # Default to http:// when the caller omitted the scheme.
        if not full_url.startswith(('http://', 'https://')):
            full_url = f"http://{full_url}"
        markdown_output = convert_html_to_md(stealthy_scraper(full_url))
        return PlainTextResponse(markdown_output)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
if __name__ == "__main__":
    import subprocess

    # Run ``camoufox fetch`` before starting the server. A failure is only
    # reported on stdout; the server is started regardless.
    fetch_command = ['camoufox', 'fetch']
    try:
        subprocess.run(fetch_command, check=True)
        print("Command executed successfully!")
    except Exception as err:
        print(f"An unexpected error occurred: {err}")

    import uvicorn

    # Serve the app on all interfaces, port 7860.
    bind_host, bind_port = "0.0.0.0", 7860
    uvicorn.run(app, host=bind_host, port=bind_port)