# Hugging Face Space: running on ZeroGPU hardware
import re
import threading
import gc
import os
import torch
import gradio as gr
import spaces
import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
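
# The `spaces` package enables Hugging Face ZeroGPU support (its spaces.GPU
# decorator is the usual way to mark GPU-bound functions on ZeroGPU hardware).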

# Settings for model memory management and optimization
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
MAX_GPU_MEMORY = 80 * 1024 * 1024 * 1024  # based on an 80GB A100 (usable memory is less than this)

# List of available models, filtered to those that run efficiently on an A100
available_models = {
    "meta-llama/Llama-3.2-3B-Instruct": "Llama 3.2 (3B)",
    "NousResearch/Hermes-3-Llama-3.1-8B": "Hermes 3 Llama 3.1 (8B)",
    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1": "Nvidia Nemotron Nano (8B)",
    "mistralai/Mistral-Small-3.1-24B-Instruct-2503": "Mistral Small 3.1 (24B)",
    "google/gemma-3-27b-it": "Google Gemma 3 (27B)",
    "Qwen/Qwen2.5-Coder-32B-Instruct": "Qwen 2.5 Coder (32B)",
    "open-r1/OlympicCoder-32B": "Olympic Coder (32B)"
}
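
# The keys are Hub repo ids and the values are display names shown in the UI;
# get_model_names() further below inverts this mapping to turn the radio
# selection back into a repo id before loading.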

# Global variables used for model loading
pipe = None
current_model_name = None

# Try to log in with the Hugging Face token
try:
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
        print("Successfully logged in to Hugging Face.")
    else:
        print("Warning: the HF_TOKEN environment variable is not set.")
except Exception as e:
    print(f"Hugging Face login error: {str(e)}")

# Marker used to detect the final answer
ANSWER_MARKER = "**Answer**"

# Sentences that start each step of the reasoning
rethink_prepends = [
    "Okay, now I need to figure out the following: ",
    "I think ",
    "Wait a moment, I think ",
    "Let me check whether the following is correct: ",
    "I should also remember that ",
    "Another point worth noting is ",
    "And I also remember the following fact: ",
    "Now I think I understand the problem well enough. ",
    "Based on the information so far, I will answer the original question in the language it was asked in:"
    "\n{question}\n"
    f"\n{ANSWER_MARKER}\n",
]
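
# Each prepend is appended to the running message in bot() to force one
# reasoning step per generation call; the last entry re-injects the original
# question together with ANSWER_MARKER, which is how the final-answer step is
# detected and given the larger final_num_tokens budget.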

# Settings to fix formula rendering issues
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]

# Size-based model configuration: optimal settings per model size category
MODEL_CONFIG = {
    "small": {  # <10B
        "max_memory": {0: "20GiB"},
        "offload": False,
        "quantization": None
    },
    "medium": {  # 10B-30B
        "max_memory": {0: "40GiB"},
        "offload": False,
        "quantization": "4bit"
    },
    "large": {  # >30B
        "max_memory": {0: "70GiB"},
        "offload": True,
        "quantization": "4bit"
    }
}
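
# Note: these per-GPU memory budgets assume a single 80GB A100 (see
# MAX_GPU_MEMORY above); adjust max_memory, offload and quantization when
# running on different hardware.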

def get_model_size_category(model_name):
    """Determine the model size category from the model name."""
    name = model_name.upper()
    if "3B" in name or "8B" in name:
        return "small"
    elif "24B" in name or "27B" in name:
        return "medium"
    elif "32B" in name or "70B" in name:
        return "large"
    else:
        # Fall back to medium by default
        return "medium"

def clear_gpu_memory():
    """Release the current model and clear GPU memory."""
    global pipe

    if pipe is not None:
        del pipe
        pipe = None

    # Clear the CUDA cache
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

def reformat_math(text):
    """Rewrite MathJax delimiters to the Gradio (KaTeX) syntax."""
    text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
    text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
    return text
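
# Note: reformat_math() turns r"\[ ... \]" into "$$ ... $$" and r"\( ... \)"
# into "$ ... $", matching the latex_delimiters configured on the Chatbot.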

def user_input(message, history: list):
    """Append the user input to the history and clear the input textbox."""
    return "", history + [
        gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
    ]

def rebuild_messages(history: list):
    """Rebuild the messages the model will use from the history, without the earlier intermediate thinking steps."""
    messages = []
    for h in history:
        if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
            messages.append(h)
        elif (
            isinstance(h, gr.ChatMessage)
            and h.metadata.get("title")
            and isinstance(h.content, str)
        ):
            messages.append({"role": h.role, "content": h.content})
    return messages
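
# Completed turns typically come back from the Chatbot component as plain dicts,
# and any dict whose metadata carries a "title" (a collapsed thinking bubble
# from an earlier turn) is dropped; the gr.ChatMessage placeholder appended in
# bot() for the current turn is converted to a plain dict and kept, so the
# reasoning prepends attach to that in-progress assistant turn.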

def load_model(model_names):
    """Load the model selected by name, using settings optimized for an A100."""
    global pipe, current_model_name

    # Clean up any previously loaded model
    clear_gpu_memory()

    # Fall back to a default if no model was selected
    if not model_names:
        model_name = "meta-llama/Llama-3.2-3B-Instruct"  # use a smaller model as the default
    else:
        # Use the first selected model
        model_name = model_names[0]

    # Determine the model size category
    size_category = get_model_size_category(model_name)
    config = MODEL_CONFIG[size_category]

    # Load the model with settings optimized for its size
    try:
        # Check the HF_TOKEN environment variable
        hf_token = os.getenv("HF_TOKEN")

        # Parameters shared by all loading paths
        common_params = {
            "token": hf_token,  # token for gated models
            "trust_remote_code": True,
        }

        # Use BF16 precision (optimized for the A100)
        if config["quantization"]:
            # Load with quantization
            from transformers import BitsAndBytesConfig

            quantization_config = BitsAndBytesConfig(
                load_in_4bit=config["quantization"] == "4bit",
                bnb_4bit_compute_dtype=DTYPE
            )

            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                max_memory=config["max_memory"],
                torch_dtype=DTYPE,
                quantization_config=quantization_config,
                offload_folder="offload" if config["offload"] else None,
                **common_params
            )
            tokenizer = AutoTokenizer.from_pretrained(model_name, **common_params)

            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=DTYPE,
                device_map="auto"
            )
        else:
            # Load without quantization
            pipe = pipeline(
                "text-generation",
                model=model_name,
                device_map="auto",
                torch_dtype=DTYPE,
                **common_params
            )

        current_model_name = model_name
        return f"Model '{model_name}' loaded successfully (optimization: {size_category} category)."

    except Exception as e:
        return f"Failed to load model: {str(e)}"

def bot(
    history: list,
    max_num_tokens: int,
    final_num_tokens: int,
    do_sample: bool,
    temperature: float,
):
    """Make the model answer the question, one reasoning step at a time."""
    global pipe

    # Show an error message if no model has been loaded yet
    if pipe is None:
        history.append(
            gr.ChatMessage(
                role="assistant",
                content="No model is loaded. Please select at least one model.",
            )
        )
        yield history
        return

    # Automatically adjust the token budgets based on the model size
    size_category = get_model_size_category(current_model_name)

    # For large models, reduce the token counts to improve memory efficiency
    if size_category == "large":
        max_num_tokens = min(max_num_tokens, 1000)
        final_num_tokens = min(final_num_tokens, 1500)

    # Used to stream tokens out of the generation thread later on
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,
        skip_special_tokens=True,
        skip_prompt=True,
    )
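
    # pipe() runs in a worker thread (started below) and pushes generated text
    # into this streamer, while the loop in this function consumes it so that
    # partial output can be yielded to the Chatbot as it is produced.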

    # Keep the question so it can be re-inserted into the reasoning when needed
    question = history[-1]["content"]

    # Prepare the assistant message
    history.append(
        gr.ChatMessage(
            role="assistant",
            content="",
            metadata={"title": "🧠 Thinking...", "status": "pending"},
        )
    )

    # The reasoning process that will be shown in the current chat
    messages = rebuild_messages(history)

    try:
        for i, prepend in enumerate(rethink_prepends):
            if i > 0:
                messages[-1]["content"] += "\n\n"
            messages[-1]["content"] += prepend.format(question=question)

            num_tokens = int(
                max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
            )

            # Run the model in a separate thread
            t = threading.Thread(
                target=pipe,
                args=(messages,),
                kwargs=dict(
                    max_new_tokens=num_tokens,
                    streamer=streamer,
                    do_sample=do_sample,
                    temperature=temperature,
                    # Extra parameters for memory efficiency
                    repetition_penalty=1.2,  # discourage repetition
                    use_cache=True,          # use the KV cache
                ),
            )
            t.start()

            # Rebuild the history with the new content
            history[-1].content += prepend.format(question=question)
            if ANSWER_MARKER in prepend:
                history[-1].metadata = {"title": "💭 Thought process", "status": "done"}
                # Thinking is done; what follows is the answer (no metadata for intermediate steps)
                history.append(gr.ChatMessage(role="assistant", content=""))

            # Stream the generated tokens
            for token in streamer:
                history[-1].content += token
                history[-1].content = reformat_math(history[-1].content)
                yield history
            t.join()

            # For large models, partially free memory after each step
            if size_category == "large" and torch.cuda.is_available():
                torch.cuda.empty_cache()

    except Exception as e:
        # Notify the user if an error occurs
        if len(history) > 0 and history[-1].role == "assistant":
            history[-1].content += f"\n\n⚠️ An error occurred during processing: {str(e)}"
            yield history

    yield history

# Helper that reports the available GPUs
def get_gpu_info():
    if not torch.cuda.is_available():
        return "No GPU available."

    gpu_info = []
    for i in range(torch.cuda.device_count()):
        gpu_name = torch.cuda.get_device_name(i)
        total_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
        gpu_info.append(f"GPU {i}: {gpu_name} ({total_memory:.1f} GB)")

    return "\n".join(gpu_info)

# Gradio interface
with gr.Blocks(fill_height=True, title="ThinkFlow - Step-by-step Reasoning Service") as demo:
    # Title and description at the top
    gr.Markdown("""
    # ThinkFlow
    ## A thought amplification service that implants step-by-step reasoning abilities into LLMs without model modification
    """)

    with gr.Row(scale=1):
        with gr.Column(scale=5):
            # Chat interface
            chatbot = gr.Chatbot(
                scale=1,
                type="messages",
                latex_delimiters=latex_delimiters,
                height=600,
            )
            msg = gr.Textbox(
                submit_btn=True,
                label="",
                show_label=False,
                placeholder="Type your question here.",
                autofocus=True,
            )
        with gr.Column(scale=1):
            # Display hardware information
            gpu_info = gr.Markdown(f"**Available hardware:**\n{get_gpu_info()}")

            # Model selection section
            gr.Markdown("""## Model Selection""")
            model_selector = gr.Radio(
                choices=list(available_models.values()),
                value=available_models["meta-llama/Llama-3.2-3B-Instruct"],  # small model as the default
                label="Select the LLM model to use",
            )

            # Model load button
            load_model_btn = gr.Button("Load Model", variant="primary")
            model_status = gr.Textbox(label="Model status", interactive=False)

            # GPU memory cleanup button
            clear_memory_btn = gr.Button("Clear GPU Memory", variant="secondary")

            gr.Markdown("""## Parameter Settings""")
            with gr.Accordion("Advanced settings", open=False):
                num_tokens = gr.Slider(
                    50,
                    2000,
                    1000,  # reduced default
                    step=50,
                    label="Max tokens per reasoning step",
                    interactive=True,
                )
                final_num_tokens = gr.Slider(
                    50,
                    3000,
                    1500,  # reduced default
                    step=50,
                    label="Max tokens for the final answer",
                    interactive=True,
                )
                do_sample = gr.Checkbox(True, label="Use sampling")
                temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")

    # Wire up loading of the selected model
    def get_model_names(selected_model):
        # Map the display name back to the original model name
        inverse_map = {v: k for k, v in available_models.items()}
        return [inverse_map[selected_model]] if selected_model else []

    load_model_btn.click(
        lambda selected: load_model(get_model_names(selected)),
        inputs=[model_selector],
        outputs=[model_status]
    )

    # Wire up GPU memory cleanup
    def clear_memory_and_report():
        clear_gpu_memory()
        return "GPU memory has been cleared."

    clear_memory_btn.click(
        clear_memory_and_report,
        inputs=[],
        outputs=[model_status]
    )

    # When the user submits a message, the bot responds
    msg.submit(
        user_input,
        [msg, chatbot],  # inputs
        [msg, chatbot],  # outputs
    ).then(
        bot,
        [
            chatbot,
            num_tokens,
            final_num_tokens,
            do_sample,
            temperature,
        ],  # in effect, the "history" input
        chatbot,  # save the new history in the output
    )

if __name__ == "__main__":
    # Print debugging information
    print(f"GPU available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"Number of available GPUs: {torch.cuda.device_count()}")
        print(f"Current GPU: {torch.cuda.current_device()}")
        print(f"GPU name: {torch.cuda.get_device_name(0)}")

    # Check the HF_TOKEN environment variable
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        print("The HF_TOKEN environment variable is set.")
    else:
        print("Warning: the HF_TOKEN environment variable is not set. Gated models may not be accessible.")

    # Enable the queue and launch the app
    demo.queue(max_size=10).launch()
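
# A rough local-run note (assumptions: this file is saved as app.py, a CUDA GPU
# with enough memory is available, and HF_TOKEN is exported for gated models):
#   HF_TOKEN=hf_... python app.py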