# Install the required transformers fork.
# This runs once, at the very start of the script, before transformers is imported.
import os
import subprocess
import sys

print("Installing required transformers branch...")
# Run pip through the current interpreter so the package is installed into the
# environment that is executing this script, and inspect the exit status so a
# failed install is not reported as complete.
_install_result = subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "git+https://github.com/shumingma/transformers.git",
    ],
    check=False,  # best-effort: keep going with whatever transformers is present
)
if _install_result.returncode == 0:
    print("Installation complete.")
else:
    print(
        f"Installation failed (pip exit code {_install_result.returncode}); "
        "continuing with the currently installed transformers."
    )

# 필요한 라이브러리들을 import 합니다.
import threading
import torch
import torch._dynamo
import gradio as gr
import spaces # Hugging Face Spaces 관련 유틸리티

# torch._dynamo configuration (optional).
# NOTE(review): suppress_errors is presumably meant to let TorchDynamo compile
# failures fall back to eager execution instead of raising — confirm against
# the PyTorch docs for the installed version.
torch._dynamo.config.suppress_errors = True

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)

# --- Model loading ---
# Model path (Hugging Face model ID).
model_id = "microsoft/bitnet-b1.58-2B-4T"

# Ask transformers to log errors only. The environment variable is kept for
# any child process that imports transformers later.
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# Load the tokenizer and the causal-LM weights. trust_remote_code=True is
# passed to both loaders, and device_map="auto" lets the library pick devices.
# On any failure both globals are set to None so the rest of the script can
# detect that the model is unavailable.
try:
    # transformers reads TRANSFORMERS_VERBOSITY while it is being imported,
    # which has already happened by this point, so the assignment above does
    # not affect this process. Set the verbosity explicitly as well.
    from transformers.utils import logging as hf_logging

    hf_logging.set_verbosity_error()

    print(f"모델 로딩 중: {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,  # bf16 weights (GPU recommended)
        device_map="auto",  # place the model on the available device(s)
        trust_remote_code=True,
    )
    print(f"모델 디바이스: {model.device}")
    print("모델 로드 완료.")

except Exception as e:
    # Top-level boundary: report the failure and continue without a model.
    print(f"모델 로드 중 오류 발생: {e}")
    tokenizer = None
    model = None
    print("모델 로드에 실패했습니다. 애플리케이션이 제대로 동작하지 않을 수 있습니다.")


# --- Text generation function (for Gradio ChatInterface) ---
def _build_chat_messages(message, history, system_message):
    """Return the chat-template message list for one request.

    The list starts with the system message, is followed by the prior turns
    in order, and ends with the new user message. ``history`` may hold
    ``(user, assistant)`` pairs or ``{"role": ..., "content": ...}`` dicts;
    empty entries are skipped.
    """
    messages = [{"role": "system", "content": system_message}]
    for turn in history or []:
        if isinstance(turn, dict):
            # "messages"-style history: keep only plain-text user/assistant turns.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
        else:
            # Pair-style history: (user message, assistant message).
            user_msg, bot_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg:
                messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU  # Mark this function as using GPU resources (Hugging Face Spaces)
def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The new user message.
        history: Prior conversation turns, either ``(user, assistant)`` pairs
            or ``{"role", "content"}`` dicts.
        system_message: Content of the leading system message.
        max_tokens: Upper bound passed to ``generate`` as ``max_new_tokens``.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.

    Yields:
        The reply accumulated so far, once per chunk received from the
        streamer. If the model is not loaded, or an error occurs, a single
        error message is yielded instead.
    """
    if model is None or tokenizer is None:
        yield "모델 로드에 실패하여 텍스트 생성을 할 수 없습니다."
        return  # a bare return simply ends the generator

    try:
        # Build the prompt in the model's chat-template format.
        messages = _build_chat_messages(message, history, system_message)
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Streamer that hands decoded text chunks from the worker thread to us.
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        generate_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=int(max_tokens),  # UI sliders may deliver a float
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,  # use EOS as the padding token
        )

        generation_errors = []

        def _run_generation():
            # If generate() raises, nothing would ever end the stream and the
            # consumer loop below would block forever. Record the error and
            # end the stream so the consumer can report it.
            try:
                model.generate(**generate_kwargs)
            except Exception as exc:
                generation_errors.append(exc)
                streamer.end()

        # Run generation in a separate thread so chunks can be yielded as they arrive.
        thread = threading.Thread(target=_run_generation)
        thread.start()

        # Forward the accumulated reply to the Gradio interface in real time.
        response = ""
        for new_text in streamer:
            response += new_text
            yield response

        thread.join()
        if generation_errors:
            # Surface worker-thread failures through the handler below.
            raise generation_errors[0]

    except Exception as e:
        yield f"텍스트 생성 중 오류 발생: {e}"
        # 오류 발생 시 스레드 처리 로직 추가 고려 필요 (선택 사항)


# --- Gradio interface setup ---
# The chat UI is only built and launched when both the model and the tokenizer
# were loaded; otherwise a message is printed and nothing is launched.
if model is None or tokenizer is None:
    print("모델 로드 실패로 인해 Gradio 인터페이스를 실행할 수 없습니다.")
else:
    # Each example row is:
    # [user message, system message, max new tokens, temperature, top-p]
    example_rows = [
        ["안녕하세요! 자기소개 해주세요.", "당신은 유능한 AI 비서입니다.", 512, 0.7, 0.95],
        [
            "파이썬으로 간단한 웹 server 만드는 코드 알려줘".replace("server", "서버"),
            "당신은 유능한 AI 개발자입니다.",
            1024,
            0.8,
            0.9,
        ],
    ]

    # Extra controls, in the order respond() receives them after the history.
    extra_controls = [
        # Default system message.
        gr.Textbox(value="당신은 유능한 AI 비서입니다.", label="System message", lines=1),
        # Generation length limit.
        gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
        # Sampling temperature.
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        # Nucleus-sampling threshold.
        gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ]

    demo = gr.ChatInterface(
        fn=respond,
        title="Bitnet-b1.58-2B-4T Chatbot",
        description="Microsoft Bitnet-b1.58-2B-4T 모델을 사용한 채팅 데모입니다.",
        examples=example_rows,
        additional_inputs=extra_controls,
    )

    # Start the Gradio app; debug=True is passed to surface detailed logs.
    demo.launch(debug=True)