# Command that installs the required transformers branch.
# This part runs once, at the start of script execution.
import os

print("Installing required transformers branch...")
os.system("pip install git+https://github.com/shumingma/transformers.git")
print("Installation complete.")

# Import the required libraries.
import threading

import torch
import torch._dynamo
import gradio as gr
import spaces  # Hugging Face Spaces utilities (provides the GPU decorator)

# torch._dynamo setting (optional): suppress compilation errors and fall back
# to eager execution instead of raising.
torch._dynamo.config.suppress_errors = True

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)

# --- Model loading ---

# Model path (a Hugging Face model ID)
model_id = "microsoft/bitnet-b1.58-2B-4T"

# Lower the logging level to minimize warnings while the model loads.
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# Load AutoModelForCausalLM and AutoTokenizer.
# trust_remote_code=True is required; device_map="auto" places the model on the
# available device automatically.
try:
    print(f"Loading model: {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,  # use bf16 (GPU recommended)
        device_map="auto",           # place the model on the available device automatically
        trust_remote_code=True,
    )
    print(f"Model device: {model.device}")
    print("Model load complete.")
except Exception as e:
    print(f"Error while loading model: {e}")
    tokenizer = None
    model = None
    print("Model loading failed. The application may not work properly.")

# --- Text generation function (for gr.ChatInterface) ---
# Mark this function as using GPU resources on Hugging Face Spaces.
@spaces.GPU
def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    if model is None or tokenizer is None:
        yield "Cannot generate text because the model failed to load."
        return  # a bare return ends the generator
    try:
        # Build the message list to match the model's chat template.
        messages = [{"role": "system", "content": system_message}]
        for user_msg, bot_msg in history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg:
                messages.append({"role": "assistant", "content": bot_msg})
        messages.append({"role": "user", "content": message})

        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Set up a streamer for incremental text output.
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=True
        )
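        # TextIteratorStreamer acts as a queue: model.generate() (run in the
        # thread below) pushes decoded text fragments into it, and iterating
        # over it blocks until the next fragment arrives or generation ends.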
        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,  # sampling must be enabled for temperature/top_p to apply
            pad_token_id=tokenizer.eos_token_id,  # use EOS as the padding token
        )

        # Run model generation in a separate thread.
        thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        # Read text from the streamer as it is produced and yield it.
        response = ""
        for new_text in streamer:
            response += new_text
            yield response  # stream the partial response to the Gradio interface
        thread.join()  # ensure the generation thread has finished
    except Exception as e:
        yield f"Error during text generation: {e}"
        # Optionally, add cleanup logic here for the generation thread on error.
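
# A quick local check of the generator, kept commented out; the argument
# values below are examples only:
# for partial in respond("Hello!", [], "You are a helpful assistant.", 64, 0.7, 0.95):
#     print(partial)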

# --- Gradio interface setup ---
if model is not None and tokenizer is not None:
    demo = gr.ChatInterface(
        fn=respond,
        title="Bitnet-b1.58-2B-4T Chatbot",
        description="A chat demo of the Microsoft Bitnet-b1.58-2B-4T model.",
        examples=[
            [
                "Hello! Please introduce yourself.",
                "You are a capable AI assistant.",  # example system message
                512,   # example max new tokens
                0.7,   # example temperature
                0.95,  # example top-p
            ],
            [
                "Show me code for a simple web server in Python.",
                "You are a capable AI developer.",  # example system message
                1024,  # example max new tokens
                0.8,   # example temperature
                0.9,   # example top-p
            ],
        ],
        additional_inputs=[
            gr.Textbox(
                value="You are a capable AI assistant.",  # default system message
                label="System message",
                lines=1,
            ),
            gr.Slider(
                minimum=1,
                maximum=4096,  # keep within the model's maximum context length
                value=512,
                step=1,
                label="Max new tokens",
            ),
            gr.Slider(
                minimum=0.1,
                maximum=2.0,  # adjust the temperature range if needed
                value=0.7,
                step=0.1,
                label="Temperature",
            ),
            gr.Slider(
                minimum=0.0,  # adjust the top-p range if needed
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-p (nucleus sampling)",
            ),
        ],
    )
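
    # Each row in `examples` supplies [message, system_message, max_tokens,
    # temperature, top_p]: the chat message first, then one value for each
    # entry in `additional_inputs`, in order.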

    # Launch the Gradio app.
    # On Hugging Face Spaces the app is served automatically; share=True is not needed.
    # debug=True prints more detailed logs.
    demo.launch(debug=True)
else:
    print("Cannot launch the Gradio interface because model loading failed.")