Spaces:

bcci
/

reader-api

Sleeping

reader-api / app.py

Update app.py

51bbabe verified 3 months ago

1.57 kB

	from fastapi import FastAPI, Request, HTTPException
	from fastapi.responses import PlainTextResponse
	from urllib.parse import unquote
	import uvicorn
	from scrapling import Fetcher
	from markitdown import MarkItDown
	import tempfile
	import os

	# FastAPI application object; the /read route in this file is registered on it.
	app = FastAPI()

	# Shared fetcher instance used by scraper().
	# NOTE(review): auto_match=True is passed through as-is — confirm its effect
	# against the scrapling documentation.
	fetcher = Fetcher(auto_match=True)
	# Shared MarkItDown converter instance used by convert_html_to_md().
	md = MarkItDown()

	def scraper(url):
	    """Fetch *url* with the module-level ``fetcher`` and return ``prettify()`` of the result."""
	    return fetcher.get(url).prettify()

	def convert_html_to_md(html):
	    """Convert an HTML string to Markdown text by way of a temporary file.

	    The HTML is written UTF-8 encoded to a named temporary ``.html`` file,
	    the module-level ``md`` converter is run on that file's path, and the
	    ``text_content`` of the conversion result is returned.

	    The temporary file is closed before it is converted and is always
	    removed afterwards, including when writing or converting raises.
	    """
	    # delete=False because the converter is handed the path, not the handle;
	    # cleanup is therefore done explicitly in the ``finally`` below.
	    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
	    temp_file_path = temp_file.name
	    try:
	        # Leaving the ``with`` closes the handle, so the content is fully
	        # flushed before the converter reads the file.
	        with temp_file:
	            temp_file.write(html.encode('utf-8'))
	        # The path is echoed to stdout, as the original code did.
	        print(temp_file_path)
	        return md.convert(temp_file_path).text_content
	    finally:
	        os.remove(temp_file_path)

	# GET endpoint /read/{url:path}: everything after /read/ is the target URL
	@app.get("/read/{url:path}", response_class=PlainTextResponse)
	async def get_markdown_get(url: str, request: Request):
	    """Fetch the page named after ``/read/`` and return it as Markdown text.

	    Args:
	        url: Path parameter captured by the route. It is required by the
	            route template but the target is taken from ``request`` instead.
	        request: The incoming request, injected by FastAPI.

	    Returns:
	        A ``PlainTextResponse`` whose body is the converted Markdown.

	    Raises:
	        HTTPException: Status 500 with the underlying error text if
	            extracting, fetching or converting the target fails.
	    """
	    # NOTE(review): this fetches an arbitrary caller-supplied URL from the
	    # server; consider restricting the allowed hosts if the service is public.
	    try:
	        # Work from the full request URL.
	        # NOTE(review): presumably preferred over ``url`` so that the
	        # target's query string is kept — confirm.
	        # maxsplit=1 keeps any later "/read/" inside the target URL intact.
	        target = str(request.url).split("/read/", 1)[1]

	        # Default to plain HTTP when the caller omitted the scheme.
	        if not target.startswith(('http://', 'https://')):
	            target = f"http://{target}"

	        markdown_output = convert_html_to_md(scraper(target))
	        return PlainTextResponse(markdown_output)
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e


	# Start the server only when this file is executed as a script.
	if __name__ == "__main__":
	    uvicorn.run(
	        app,
	        host="0.0.0.0",
	        port=7860,
	    )