Spaces:

bcci
/

reader-api

Sleeping

App Files Files Community

reader-api / app.py

bcci

Create app.py

169e42a verified 4 months ago

raw

history blame

1.97 kB

	import os
	import tempfile
	from urllib.parse import unquote

	import uvicorn
	from fastapi import FastAPI, HTTPException, Request
	from fastapi.responses import PlainTextResponse
	from markitdown import MarkItDown
	from scrapling import Fetcher

	# ASGI application object; the route decorators further down register their
	# handlers on it, and it is what uvicorn serves when run as a script.
	app = FastAPI()

	# Module-level instances shared by every request: the page fetcher used by
	# scraper() and the converter used by convert_html_to_md().
	fetcher = Fetcher(auto_match=True)
	md = MarkItDown()

	def scraper(url):
	    """Fetch *url* with the shared ``fetcher`` and return ``prettify()`` of the result.

	    Any exception raised while fetching propagates to the caller.
	    """
	    return fetcher.get(url).prettify()

	def convert_html_to_md(html):
	    """Convert an HTML string to Markdown text via the shared ``md`` converter.

	    The HTML is UTF-8 encoded into a temporary ``.html`` file whose path is
	    handed to ``md.convert``. The file is always removed afterwards, including
	    when writing or converting raises.

	    Args:
	        html: The HTML document as a ``str``.

	    Returns:
	        The ``text_content`` attribute of the conversion result.
	    """
	    # delete=False: the file must survive being closed so it can be reopened
	    # by path; removal is therefore done explicitly in ``finally``.
	    temp_file = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
	    try:
	        with temp_file:
	            # Leaving the ``with`` block closes (and thereby flushes) the file.
	            temp_file.write(html.encode("utf-8"))
	        return md.convert(temp_file.name).text_content
	    finally:
	        os.remove(temp_file.name)

	# POST endpoint to /reader expecting URL in JSON body
	@app.post("/reader", response_class=PlainTextResponse)
	async def get_markdown_post(request: Request):
	    """Return the Markdown rendering of the page named in the JSON request body.

	    The body must be a JSON object with a non-empty string ``url`` field, e.g.
	    ``{"url": "https://example.com"}``. The URL is percent-decoded, fetched
	    with ``scraper`` and converted with ``convert_html_to_md``.

	    Raises:
	        HTTPException: 400 when the body is not valid JSON, is not a JSON
	            object, or lacks a non-empty string ``url``; 500 when fetching or
	            converting the page fails.
	    """
	    usage = "Please provide a URL in the request body as JSON: {'url': 'your_url'}"
	    try:
	        request_data = await request.json()
	    except ValueError:
	        # Undecodable / malformed JSON is a client error, not a server failure.
	        raise HTTPException(status_code=400, detail=usage) from None

	    # A JSON array, string or number has no "url" member to look up.
	    url = request_data.get("url") if isinstance(request_data, dict) else None
	    if not url or not isinstance(url, str):
	        raise HTTPException(status_code=400, detail=usage)

	    try:
	        markdown_output = convert_html_to_md(scraper(unquote(url)))
	    except HTTPException:
	        # Already an HTTP error: pass it through unchanged.
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
	    return PlainTextResponse(markdown_output)

	# GET endpoint to /read/{url:path} expecting URL in path
	@app.get("/read/{url:path}", response_class=PlainTextResponse)
	async def get_markdown_get(url: str):
	    """Return the Markdown rendering of the page whose URL follows ``/read/``.

	    The captured path is percent-decoded, fetched with ``scraper`` and
	    converted with ``convert_html_to_md``.

	    Raises:
	        HTTPException: 400 when no URL follows ``/read/``; 500 when fetching
	            or converting the page fails.
	    """
	    decoded_url = unquote(url)  # URL in path needs unquoting as well
	    if not decoded_url.strip():
	        # Reject an empty target up front instead of letting the fetcher fail.
	        raise HTTPException(status_code=400, detail="Please provide a URL in the path: /read/{url}")

	    try:
	        markdown_output = convert_html_to_md(scraper(decoded_url))
	    except HTTPException:
	        # Already an HTTP error: pass it through unchanged.
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Error processing URL: {e}") from e
	    return PlainTextResponse(markdown_output)


	if __name__ == "__main__":
	    # Run as a script: serve ``app`` with uvicorn on all interfaces, port 7860.
	    uvicorn.run(
	        app,
	        host="0.0.0.0",
	        port=7860,
	    )