Spaces:

DeepLearning101
/

Multimodal-RAG-Agent

Running

App Files Files Community

Multimodal-RAG-Agent / app.py

DeepLearning101

Update app.py

996ce1d verified about 14 hours ago

raw

history blame contribute delete

7.29 kB

	import gradio as gr
	import requests
	import mimetypes
	import json, os
	import asyncio
	import aiohttp

	# Backend settings are read from the environment; each is None when the
	# corresponding variable is unset.
	LLM_API = os.getenv("LLM_API")  # sent as the Bearer token on every request
	LLM_URL = os.getenv("LLM_URL")  # base URL; request paths are appended to it
	# Value sent as the "user" field of chat and upload requests.
	USER_ID = "HuggingFace Space"

	def _answer_from_stream_line(raw_line):
	    """Return the answer string carried by one streamed response line, or None."""
	    try:
	        text = raw_line.decode("utf-8").strip()
	    except UnicodeDecodeError:
	        return None
	    # Drop the marker only at the start of the line; removing it everywhere
	    # would silently alter answers whose text contains "data: ".
	    if text.startswith("data:"):
	        text = text[len("data:"):].lstrip()
	    if not text:
	        return None
	    try:
	        node = json.loads(text)
	    except json.JSONDecodeError:
	        return None
	    # Walk event["data"]["outputs"]["answer"], tolerating missing or
	    # non-object levels instead of relying on a blanket exception handler.
	    for key in ("data", "outputs", "answer"):
	        if not isinstance(node, dict):
	            return None
	        node = node.get(key)
	    return node if isinstance(node, str) and node else None


	async def send_chat_message(LLM_URL, LLM_API, user_input, file_id):
	    """Send a chat query that references an uploaded image and return the answer.

	    The request asks for a streaming response. The body is read line by line
	    and reading stops at the first line whose JSON carries a non-empty string
	    at ``data.outputs.answer``.

	    Args:
	        LLM_URL: Base URL; ``/chat-messages`` is appended to it.
	        LLM_API: Token sent in the ``Authorization: Bearer`` header.
	        user_input: Text sent as the ``query`` field.
	        file_id: Identifier sent as ``upload_file_id`` of the attached image.

	    Returns:
	        The answer with surrounding whitespace removed, or a string starting
	        with ``"Error:"`` when the endpoint responds with an HTTP error
	        status or no answer is found in the stream.
	    """
	    payload = {
	        "inputs": {},
	        "query": user_input,
	        "response_mode": "streaming",
	        "conversation_id": "",
	        "user": USER_ID,
	        "files": [{
	            "type": "image",
	            "transfer_method": "local_file",
	            "upload_file_id": file_id,
	        }],
	    }

	    answer = None
	    async with aiohttp.ClientSession() as session:
	        async with session.post(
	            f"{LLM_URL}/chat-messages",
	            headers={"Authorization": f"Bearer {LLM_API}"},
	            json=payload,
	        ) as response:
	            if response.status == 404:
	                return "Error: Endpoint not found (404)"
	            # Report other HTTP failures explicitly rather than letting them
	            # surface as a misleading "No answer found".
	            if response.status >= 400:
	                return f"Error: Chat request failed (HTTP {response.status})"
	            async for raw_line in response.content:
	                answer = _answer_from_stream_line(raw_line)
	                if answer is not None:
	                    break
	    return answer.strip() if answer else "Error: No answer found."

	async def upload_file(LLM_URL, LLM_API, file_path, user_id):
	    """Upload a local file to the backend and return the parsed JSON reply.

	    Args:
	        LLM_URL: Base URL; ``/files/upload`` is appended to it.
	        LLM_API: Token sent in the ``Authorization: Bearer`` header.
	        file_path: Path of the file to upload; may be ``None`` or empty when
	            the caller has no file.
	        user_id: Value sent as the ``user`` form field.

	    Returns:
	        The decoded JSON object (a ``dict``) on success, otherwise a string
	        starting with ``"Error:"`` describing what went wrong.
	    """
	    # An empty image input yields no path; os.path functions would raise
	    # TypeError on None, so report it as an ordinary error instead.
	    if not file_path:
	        return "Error: No file provided"
	    # isfile (not exists) so that a directory is rejected here rather than
	    # failing inside open().
	    if not os.path.isfile(file_path):
	        return f"Error: File {file_path} not found"

	    mime_type, _ = mimetypes.guess_type(file_path)
	    with open(file_path, "rb") as f:
	        form_data = aiohttp.FormData()
	        # Send only the base name so the local directory layout is not
	        # exposed to the remote service.
	        form_data.add_field(
	            "file",
	            f,
	            filename=os.path.basename(file_path),
	            content_type=mime_type,
	        )
	        form_data.add_field("user", user_id)
	        async with aiohttp.ClientSession() as session:
	            async with session.post(
	                f"{LLM_URL}/files/upload",
	                headers={"Authorization": f"Bearer {LLM_API}"},
	                data=form_data,
	            ) as response:
	                if response.status == 404:
	                    return "Error: Upload endpoint not found"
	                if response.status >= 400:
	                    return f"Error: Upload failed (HTTP {response.status})"
	                text = await response.text()

	    try:
	        json_resp = json.loads(text)
	    except json.JSONDecodeError:
	        return "Error: Upload returned invalid JSON"
	    # Callers look up keys on the result, so anything but an object is an error.
	    if not isinstance(json_resp, dict):
	        return "Error: Upload returned unexpected JSON"
	    return json_resp

	async def handle_input(file_path, user_input):
	    """Upload the selected image, then ask the backend about it.

	    Args:
	        file_path: Local path of the image; ``None`` or empty when no image
	            was supplied.
	        user_input: Question text forwarded with the image.

	    Returns:
	        The answer text, or a string starting with ``"Error:"`` if the image
	        is missing, the upload fails, or the chat request fails.
	    """
	    # Fail early with a readable message instead of passing None down to
	    # the path checks in the upload step.
	    if not file_path:
	        return "Error: No image provided"

	    upload_response = await upload_file(LLM_URL, LLM_API, file_path, USER_ID)
	    # Anything that is not a JSON object cannot carry a file id; strings are
	    # the upload step's own error messages and are passed through as-is.
	    if not isinstance(upload_response, dict):
	        if isinstance(upload_response, str):
	            return upload_response
	        return "Error: Unexpected upload response"

	    file_id = upload_response.get("id")
	    if not file_id:
	        return "Error: No file ID from upload"
	    return await send_chat_message(LLM_URL, LLM_API, user_input, file_id)

	# --- Gradio UI configuration --- page title and subtitle markup
	TITLE = """<h1>Multimodal RAG Playground 💬 輸入工地照片，生成工地場景及相關法規和缺失描述</h1>"""
	# Links are separated by a plain "|". A backslash before it is not a valid
	# Python escape: it triggers an invalid-escape warning and shows up as a
	# literal backslash in the rendered page.
	SUBTITLE = """<h2><a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D.</a> | <a href='https://www.twman.org/AI' target='_blank'> AI </a> | <a href='https://blog.twman.org/p/deeplearning101.html' target='_blank'>手把手帶你一起踩AI坑</a><br></h2>"""
	# HTML list of related links shown under the subtitle. The first row uses a
	# plain "|" between links; a backslash before it is not a valid Python
	# escape and would be rendered literally.
	LINKS = """
	<a href='https://github.com/Deep-Learning-101' target='_blank'>Deep Learning 101 Github</a> | <a href='http://deeplearning101.twman.org' target='_blank'>Deep Learning 101</a> | <a href='https://www.facebook.com/groups/525579498272187/' target='_blank'>台灣人工智慧社團 FB</a> | <a href='https://www.youtube.com/c/DeepLearning101' target='_blank'>YouTube</a><br>
	<a href='https://blog.twman.org/2025/03/AIAgent.html' target='_blank'>避開 AI Agent 開發陷阱：常見問題、挑戰與解決方案 (那些 AI Agent 實戰踩過的坑)</a>：探討多種 AI 代理人工具的應用經驗與挑戰，分享實用經驗與工具推薦。<br>
	<a href='https://blog.twman.org/2024/08/LLM.html' target='_blank'>白話文手把手帶你科普 GenAI</a>：淺顯介紹生成式人工智慧核心概念，強調硬體資源和數據的重要性。<br>
	<a href='https://blog.twman.org/2024/09/LLM.html' target='_blank'>大型語言模型直接就打完收工？</a>：回顧 LLM 領域探索歷程，討論硬體升級對 AI 開發的重要性。<br>
	<a href='https://blog.twman.org/2024/07/RAG.html' target='_blank'>檢索增強生成 (Retrieval-Augmented Generation, RAG) 不是萬靈丹之優化挑戰技巧</a>：探討 RAG 技術應用與挑戰，提供實用經驗分享和工具建議。<br>
	<a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>大型語言模型 (LLM) 入門完整指南：原理、應用與未來</a>：探討多種 LLM 工具的應用與挑戰，強調硬體資源的重要性。<br>
	<a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型，它是什麼？想要嗎？(Large Language Model，LLM)</a>：探討 LLM 的發展與應用，強調硬體資源在開發中的關鍵作用。。<br>
	<a href='https://blog.twman.org/2024/11/diffusion.html' target='_blank'>ComfyUI + Stable Diffuision</a>：深入探討影像生成與分割技術的應用，強調硬體資源的重要性。<br>
	<a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>ASR/TTS 開發避坑指南：語音辨識與合成的常見挑戰與對策</a>：探討 ASR 和 TTS 技術應用中的問題，強調數據質量的重要性。<br>
	<a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a>：分享 NLP 領域的實踐經驗，強調數據質量對模型效果的影響。<br>
	<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a>：分享語音處理領域的實務經驗，強調資料品質對模型效果的影響。<br>
	<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a><br>
	<a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
	"""

	# Sample rows for the examples widget: each row is [image path, text for the
	# question box]. Every text value is the image number followed by a code;
	# NOTE(review): what the code after the dash refers to is not shown here.
	examples = [
	    [f"DEMO/DEMO_{image_no}.jpg", f"{image_no}-{code}"]
	    for image_no, code in (
	        ("0004", "51"),
	        ("0005", "92"),
	        ("0006", "281"),
	        ("0008", "281"),
	        ("0011", "108"),
	    )
	]

	# Build the Gradio page and start serving it.
	# NOTE(review): this copy of the file has lost its indentation, so which of
	# the statements below nest under gr.Blocks() / gr.Row() cannot be read off
	# here — confirm against the original file before editing the layout.
	with gr.Blocks() as demo:
	# Static HTML header: title, subtitle, and the list of related links.
	gr.HTML(TITLE)
	gr.HTML(SUBTITLE)
	gr.HTML(LINKS)

	# Inputs: an image (delivered to the handler as a file path) and the
	# question text, pre-filled with a default prompt.
	with gr.Row():
	image_input = gr.Image(label='📷 上傳照片', type='filepath')
	text_input = gr.Textbox(label='💬 輸入問題描述', value="分析一下這張工地場景照片")

	# Output area for the returned text.
	output_box = gr.Textbox(label="📝 回應結果", lines=8)

	submit_button = gr.Button("🚀 開始分析")

	# Clicking the button calls handle_input(image path, question text) and
	# shows its return value in the output box.
	submit_button.click(
	fn=handle_input,
	inputs=[image_input, text_input],
	outputs=[output_box]
	)

	# Predefined example rows mapped onto the same two inputs; no fn is passed
	# here.
	gr.Examples(
	examples=examples,
	inputs=[image_input, text_input],
	outputs=[output_box],
	label="點擊以下範例自動帶入"
	)

	# Start the app.
	demo.launch()