kimhyunwoo committed on
Commit 4bf6d97 · verified · 1 Parent(s): 529d051

Update app.py

Files changed (1)
  1. app.py +127 -54
app.py CHANGED
@@ -1,7 +1,25 @@
- import gradio as gr
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
+ # Install the required library.
+ # This step runs once, at the start of the script.
  import os
+ print("Installing required transformers branch...")
+ os.system("pip install git+https://github.com/shumingma/transformers.git")
+ print("Installation complete.")
+
+ # Import the required libraries.
+ import threading
+ import torch
+ import torch._dynamo
+ import gradio as gr
+ import spaces  # Hugging Face Spaces utilities
+
+ # torch._dynamo setting (optional; attempts to improve performance)
+ torch._dynamo.config.suppress_errors = True
+
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+ )
 
  # --- Load the model ---
  # Set the model path (Hugging Face model ID)
@@ -11,28 +29,18 @@ model_id = "microsoft/bitnet-b1.58-2B-4T"
  os.environ["TRANSFORMERS_VERBOSITY"] = "error"
 
  # Load AutoModelForCausalLM and AutoTokenizer.
- # The BitNet model requires trust_remote_code=True.
- # Uses the transformers build installed from the specific GitHub branch.
+ # trust_remote_code=True is required; device_map="auto" picks the device automatically.
  try:
      print(f"Loading model: {model_id}...")
-     # Use bf16 when a GPU is available
-     if torch.cuda.is_available():
-         # Set torch_dtype explicitly to try to avoid load errors
-         model = AutoModelForCausalLM.from_pretrained(
-             model_id,
-             torch_dtype=torch.bfloat16,
-             trust_remote_code=True
-         ).to("cuda")  # Move the model to the GPU
-         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-         print("Model loaded on GPU.")
-     else:
-         # On CPU, omit torch_dtype or use float32
-         model = AutoModelForCausalLM.from_pretrained(
-             model_id,
-             trust_remote_code=True
-         )
-         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-         print("Model loaded on CPU. Performance may be slow.")
+     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         torch_dtype=torch.bfloat16,  # Use bf16 (GPU recommended)
+         device_map="auto",           # Place the model on available devices automatically
+         trust_remote_code=True
+     )
+     print(f"Model device: {model.device}")
+     print("Model loaded.")
 
  except Exception as e:
      print(f"Error while loading the model: {e}")
@@ -41,54 +49,119 @@ except Exception as e:
      print("Model loading failed. The application may not work correctly.")
 
 
- # --- Text generation function ---
- def generate_text(prompt, max_length=100, temperature=0.7):
+ # --- Text generation function (for the Gradio ChatInterface) ---
+ @spaces.GPU  # Mark this function as using GPU resources (Hugging Face Spaces)
+ def respond(
+     message: str,
+     history: list[tuple[str, str]],
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+ ):
      if model is None or tokenizer is None:
-         return "Text generation is unavailable because the model failed to load."
+         yield "Text generation is unavailable because the model failed to load."
+         return  # This is a generator function, so just return
 
      try:
-         # Tokenize the prompt
-         inputs = tokenizer(prompt, return_tensors="pt")
-         # Move the inputs to the GPU when available
-         if torch.cuda.is_available():
-             inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
-         # Generate text
-         # The model uses the LLaMA 3 tokenizer, so a chat template can be applied (optional)
-         # The code below feeds the prompt in directly, without the message format
-         outputs = model.generate(
+         # Build the message list to match the model's chat template
+         messages = [{"role": "system", "content": system_message}]
+         for user_msg, bot_msg in history:
+             if user_msg:
+                 messages.append({"role": "user", "content": user_msg})
+             if bot_msg:
+                 messages.append({"role": "assistant", "content": bot_msg})
+         messages.append({"role": "user", "content": message})
+
+         prompt = tokenizer.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+         # Set up a streamer for token-by-token text streaming
+         streamer = TextIteratorStreamer(
+             tokenizer, skip_prompt=True, skip_special_tokens=True
+         )
+         generate_kwargs = dict(
              **inputs,
-             max_new_tokens=max_length,
+             streamer=streamer,
+             max_new_tokens=max_tokens,
              temperature=temperature,
-             do_sample=True,  # Enable sampling
-             pad_token_id=tokenizer.eos_token_id  # Set the padding token ID (if needed)
+             top_p=top_p,
+             do_sample=True,
+             pad_token_id=tokenizer.eos_token_id  # Set the padding token ID
          )
 
-         # Decode the generated text
-         # Decode only the newly generated part, excluding the input prompt
-         generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
+         # Run model generation in a separate thread
+         thread = threading.Thread(target=model.generate, kwargs=generate_kwargs)
+         thread.start()
 
-         return generated_text
+         # Read the generated text from the streamer and yield it
+         response = ""
+         for new_text in streamer:
+             response += new_text
+             yield response  # Stream the partial response to the Gradio interface
 
      except Exception as e:
-         return f"Error during text generation: {e}"
+         yield f"Error during text generation: {e}"
+         # Thread cleanup on error could be added here (optional)
+
 
  # --- Gradio interface setup ---
  if model is not None and tokenizer is not None:
-     interface = gr.Interface(
-         fn=generate_text,
-         inputs=[
-             gr.Textbox(lines=2, placeholder="Enter text...", label="Input prompt"),
-             gr.Slider(minimum=10, maximum=500, value=100, label="Max generation length"),
-             gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature (creativity)")
+     demo = gr.ChatInterface(
+         fn=respond,
+         title="Bitnet-b1.58-2B-4T Chatbot",
+         description="A chat demo using the Microsoft Bitnet-b1.58-2B-4T model.",
+         examples=[
+             [
+                 "Hello! Please introduce yourself.",
+                 "You are a helpful AI assistant.",  # Example system message
+                 512,   # Example max new tokens
+                 0.7,   # Example temperature
+                 0.95,  # Example top-p
+             ],
+             [
+                 "Show me code for a simple web server in Python.",
+                 "You are a skilled AI developer.",  # Example system message
+                 1024,  # Example max new tokens
+                 0.8,   # Example temperature
+                 0.9,   # Example top-p
+             ],
+         ],
+         additional_inputs=[
+             gr.Textbox(
+                 value="You are a helpful AI assistant.",  # Default system message
+                 label="System message",
+                 lines=1
+             ),
+             gr.Slider(
+                 minimum=1,
+                 maximum=4096,  # Consider the model's maximum context length (or set higher)
+                 value=512,
+                 step=1,
+                 label="Max new tokens"
+             ),
+             gr.Slider(
+                 minimum=0.1,
+                 maximum=2.0,  # Adjust the temperature range if needed
+                 value=0.7,
+                 step=0.1,
+                 label="Temperature"
+             ),
+             gr.Slider(
+                 minimum=0.0,  # Adjust the top-p range if needed
+                 maximum=1.0,
+                 value=0.95,
+                 step=0.05,
+                 label="Top-p (nucleus sampling)"
+             ),
          ],
-         outputs=gr.Textbox(label="Generated text"),
-         title="BitNet b1.58-2B-4T text generation demo",
-         description="Generates text using the BitNet b1.58-2B-4T model."
     )
 
      # Launch the Gradio app
      # On Hugging Face Spaces, share=True is set automatically.
-     interface.launch()
+     # Set debug=True to see detailed logs.
+     demo.launch(debug=True)
  else:
      print("Cannot launch the Gradio interface because the model failed to load.")
 
 
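Note on the prompt construction: respond() flattens gr.ChatInterface's (user, assistant) history tuples into a messages list and lets the tokenizer's chat template do the formatting. A standalone sketch of that step, assuming any tokenizer that ships a chat template behaves the same way (the zephyr tokenizer below is only an illustration, not what the Space loads):

    from transformers import AutoTokenizer

    # Illustrative tokenizer; the Space uses the BitNet model's own tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

    history = [("Hi!", "Hello! How can I help?")]  # gr.ChatInterface-style tuples
    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": "What can you do?"})

    # tokenize=False returns the formatted prompt string; add_generation_prompt=True
    # appends the assistant header so the model continues as the assistant.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print(prompt)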
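Note on the streaming pattern: model.generate() blocks until generation finishes, so the new code runs it in a worker thread and iterates over TextIteratorStreamer on the main thread, yielding partial text as it arrives. A runnable sketch of the same pattern with the small public gpt2 checkpoint (chosen only so the sketch runs on CPU; the Space applies the identical pattern to the BitNet model):

    import threading
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    inputs = tokenizer("The quick brown fox", return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks, so run it in a background thread and consume decoded
    # text chunks from the streamer on the main thread as they are produced.
    thread = threading.Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=30,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        ),
    )
    thread.start()

    for piece in streamer:
        print(piece, end="", flush=True)
    thread.join()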
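Note on the interface wiring: gr.ChatInterface passes each component in additional_inputs to the callback positionally, after (message, history), so the component order must match respond()'s remaining parameters, and each row in examples supplies values for those same inputs in the same order. A model-free sketch of that contract (echo is a hypothetical stand-in for respond):

    import gradio as gr

    # Hypothetical stand-in for respond(): same signature, no model needed.
    def echo(message, history, system_message, max_tokens, temperature, top_p):
        yield f"[system={system_message!r}, max_tokens={max_tokens}] {message}"

    demo = gr.ChatInterface(
        fn=echo,
        # Order must match the parameters after (message, history) above.
        additional_inputs=[
            gr.Textbox(value="You are a helpful AI assistant.", label="System message"),
            gr.Slider(1, 4096, value=512, step=1, label="Max new tokens"),
            gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(0.0, 1.0, value=0.95, step=0.05, label="Top-p"),
        ],
    )

    if __name__ == "__main__":
        demo.launch()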