Spaces:

kimhyunwoo
/

freetestn

Running

File size: 10,820 Bytes

25851fb
73d76b4
 
61bf4d9
 
 
c9ceb74
25851fb
c9ceb74
73d76b4
2673193
 
61bf4d9
c9ceb74
61bf4d9
 
 
 
 
 
2673193
 
25851fb
c9ceb74
61bf4d9
 
 
2673193
 
c9ceb74
8a7a11f
2673193
73d76b4
c9ceb74
73d76b4
 
2673193
 
8a7a11f
2673193
 
 
8a7a11f
73d76b4
2673193
c9ceb74
 
 
61bf4d9
c9ceb74
61bf4d9
8a7a11f
61bf4d9
8a7a11f
 
c9ceb74
 
61bf4d9
8a7a11f
61bf4d9
 
c9ceb74
 
 
 
 
8a7a11f
61bf4d9
 
 
73d76b4
61bf4d9
c9ceb74
 
61bf4d9
8a7a11f
c9ceb74
61bf4d9
 
c9ceb74
61bf4d9
2673193
61bf4d9
 
8a7a11f
61bf4d9
 
25851fb
c9ceb74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a7a11f
 
 
 
 
 
 
 
 
 
 
 
c9ceb74
8a7a11f
 
 
c9ceb74
 
 
 
 
 
 
 
 
 
 
 
8a7a11f
c9ceb74
 
73d76b4
c9ceb74
8a7a11f
 
c9ceb74
 
 
 
61bf4d9
 
8a7a11f
73d76b4
8a7a11f
61bf4d9
73d76b4
8a7a11f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73d76b4
25851fb
2673193
 
 
73d76b4
 
 
2673193
73d76b4
 
8a7a11f
61bf4d9
 
25851fb
61bf4d9
 
 
25851fb
73d76b4
61bf4d9
c9ceb74
8a7a11f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73d76b4
8a7a11f
 
c9ceb74
 
61bf4d9
73d76b4
61bf4d9
2673193
61bf4d9
73d76b4
61bf4d9
25851fb
c9ceb74
8a7a11f
dc10a73
c9ceb74
 
 
 
 
 
 
 
73d76b4
c9ceb74
dc10a73
 
2673193
61bf4d9
73d76b4
 
 
c9ceb74
61bf4d9
 
8a7a11f
 
61bf4d9
 
 
 
 
 
 
25851fb
8a7a11f
 
25851fb
8a7a11f
 
61bf4d9
73d76b4
61bf4d9
 
8a7a11f
61bf4d9
73d76b4
61bf4d9
8a7a11f
2673193
25851fb
 
c9ceb74
25851fb
c9ceb74
 
 
 
 
61bf4d9
c9ceb74
8a7a11f
 
c9ceb74

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc
import os
import datetime
import time

# --- Configuration ---
MODEL_ID = "naver-hyperclovax/HyperCLOVAX-SEED-Text-Instruct-0.5B"
MAX_NEW_TOKENS = 512
CPU_THREAD_COUNT = 4 # 필요시 조절

# --- Optional: Set CPU Threads ---
# torch.set_num_threads(CPU_THREAD_COUNT)
# os.environ["OMP_NUM_THREADS"] = str(CPU_THREAD_COUNT)
# os.environ["MKL_NUM_THREADS"] = str(CPU_THREAD_COUNT)

print("--- Environment Setup ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Running on device: cpu")
print(f"Torch Threads: {torch.get_num_threads()}")

# --- Model and Tokenizer Loading ---
print(f"--- Loading Model: {MODEL_ID} ---")
print("This might take a few minutes, especially on the first launch...")

model = None
tokenizer = None
load_successful = False
stop_token_ids_list = [] # Initialize stop_token_ids_list

try:
    start_load_time = time.time()
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        # force_download=True # Keep commented unless cache issues reappear
    )
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        # force_download=True # Keep commented
    )
    model.eval()
    load_time = time.time() - start_load_time
    print(f"--- Model and Tokenizer Loaded Successfully on CPU in {load_time:.2f} seconds ---")
    load_successful = True

    # --- Stop Token Configuration ---
    stop_token_strings = ["<|endofturn|>", "<|stop|>"]
    temp_stop_ids = [tokenizer.convert_tokens_to_ids(token) for token in stop_token_strings]

    if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in temp_stop_ids:
        temp_stop_ids.append(tokenizer.eos_token_id)
    elif tokenizer.eos_token_id is None:
         print("Warning: tokenizer.eos_token_id is None. Cannot add to stop tokens.")

    stop_token_ids_list = [tid for tid in temp_stop_ids if tid is not None] # Assign to the global scope variable

    if not stop_token_ids_list:
        print("Warning: Could not find any stop token IDs. Using default EOS if available, otherwise generation might not stop correctly.")
        if tokenizer.eos_token_id is not None:
            stop_token_ids_list = [tokenizer.eos_token_id]
        else:
             print("Error: No stop tokens found, including default EOS. Generation may run indefinitely.")
             # Consider raising an error or setting a default if this is critical

    print(f"Using Stop Token IDs: {stop_token_ids_list}")

except Exception as e:
    print(f"!!! Error loading model: {e}")
    if 'model' in locals() and model is not None: del model
    if 'tokenizer' in locals() and tokenizer is not None: del tokenizer
    gc.collect()
    # Raise Gradio error to display in the Space UI if loading fails
    raise gr.Error(f"Failed to load the model {MODEL_ID}. Cannot start the application. Error: {e}")


# --- System Prompt Definition ---
def get_system_prompt():
    current_date = datetime.datetime.now().strftime("%Y-%m-%d (%A)")
    return (
        f"- AI 언어모델의 이름은 \"CLOVA X\" 이며 네이버에서 만들었다.\n"
        # f"- 오늘은 {current_date}이다.\n" # Uncomment if needed
        f"- 사용자의 질문에 대해 친절하고 자세하게 한국어로 답변해야 한다."
    )

# --- Warm-up Function ---
def warmup_model():
    if not load_successful or model is None or tokenizer is None:
        print("Skipping warmup: Model not loaded successfully.")
        return

    print("--- Starting Model Warm-up ---")
    try:
        start_warmup_time = time.time()
        warmup_message = "안녕하세요"
        system_prompt = get_system_prompt()
        warmup_chat = [
            {"role": "tool_list", "content": ""},
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": warmup_message}
        ]

        inputs = tokenizer.apply_chat_template(
            warmup_chat,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")

        # Check if stop_token_ids_list is empty and handle appropriately
        gen_kwargs = {
            "max_new_tokens": 10,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": False
        }
        if stop_token_ids_list:
            gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
            print("Warmup Warning: No stop tokens defined for generation.")


        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        # Optional: Decode warmup response for verification
        # response = tokenizer.decode(output_ids[0, inputs['input_ids'].shape[1]:], skip_special_tokens=True)
        # print(f"Warm-up response (decoded): {response}")

        del inputs
        del output_ids
        gc.collect()
        warmup_time = time.time() - start_warmup_time
        print(f"--- Model Warm-up Completed in {warmup_time:.2f} seconds ---")

    except Exception as e:
        print(f"!!! Error during model warm-up: {e}")
    finally:
        gc.collect()

# --- Inference Function ---
def predict(message, history):
    """
    Generates response using HyperCLOVAX.
    Assumes 'history' is in the Gradio 'messages' format: List[Dict].
    """
    if model is None or tokenizer is None:
         return "오류: 모델이 로드되지 않았습니다."

    system_prompt = get_system_prompt()

    # Start with system prompt
    chat_history_formatted = [
        {"role": "tool_list", "content": ""}, # As required by model card
        {"role": "system", "content": system_prompt}
    ]

    # Append history (List of {'role': 'user'/'assistant', 'content': '...'})
    if isinstance(history, list): # Check if history is a list
        for turn in history:
             # Validate turn format
            if isinstance(turn, dict) and "role" in turn and "content" in turn:
                 chat_history_formatted.append(turn)
            # Handle potential older tuple format if necessary (though less likely now)
            elif isinstance(turn, (list, tuple)) and len(turn) == 2:
                 print(f"Warning: Received history item in tuple format: {turn}. Converting to messages format.")
                 chat_history_formatted.append({"role": "user", "content": turn[0]})
                 if turn[1]: # Ensure assistant message exists
                      chat_history_formatted.append({"role": "assistant", "content": turn[1]})
            else:
                print(f"Warning: Skipping unexpected history format item: {turn}")


    # Append the latest user message
    chat_history_formatted.append({"role": "user", "content": message})

    inputs = None
    output_ids = None

    try:
        inputs = tokenizer.apply_chat_template(
            chat_history_formatted,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to("cpu")
        input_length = inputs['input_ids'].shape[1]
        print(f"\nInput tokens: {input_length}")

    except Exception as e:
        print(f"!!! Error applying chat template: {e}")
        return f"오류: 입력 형식을 처리하는 중 문제가 발생했습니다. ({e})"

    try:
        print("Generating response...")
        generation_start_time = time.time()

        # Prepare generation arguments, handling empty stop_token_ids_list
        gen_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "pad_token_id": tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.pad_token_id,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
        }
        if stop_token_ids_list:
             gen_kwargs["eos_token_id"] = stop_token_ids_list
        else:
             print("Generation Warning: No stop tokens defined.")


        with torch.no_grad():
            output_ids = model.generate(**inputs, **gen_kwargs)

        generation_time = time.time() - generation_start_time
        print(f"Generation complete in {generation_time:.2f} seconds.")

    except Exception as e:
        print(f"!!! Error during model generation: {e}")
        if inputs is not None: del inputs
        if output_ids is not None: del output_ids
        gc.collect()
        return f"오류: 응답을 생성하는 중 문제가 발생했습니다. ({e})"

    # Decode the response
    response = "오류: 응답 생성에 실패했습니다."
    if output_ids is not None:
        try:
            new_tokens = output_ids[0, input_length:]
            response = tokenizer.decode(new_tokens, skip_special_tokens=True)
            print(f"Output tokens: {len(new_tokens)}")
            del new_tokens
        except Exception as e:
            print(f"!!! Error decoding response: {e}")
            response = "오류: 응답을 디코딩하는 중 문제가 발생했습니다."

    # Clean up memory
    if inputs is not None: del inputs
    if output_ids is not None: del output_ids
    gc.collect()
    print("Memory cleaned.")

    return response

# --- Gradio Interface Setup ---
print("--- Setting up Gradio Interface ---")

# No need to create a separate Chatbot component beforehand
# chatbot_component = gr.Chatbot(...) # REMOVED

examples = [
    ["네이버 클로바X는 무엇인가요?"],
    ["슈뢰딩거 방정식과 양자역학의 관계를 설명해주세요."],
    ["딥러닝 모델 학습 과정을 단계별로 알려줘."],
    ["제주도 여행 계획을 세우고 있는데, 3박 4일 추천 코스 좀 짜줄래?"],
]

# Let ChatInterface manage its own internal Chatbot component
# Remove the chatbot=... argument
demo = gr.ChatInterface(
    fn=predict,                 # Link the prediction function
    # chatbot=chatbot_component,  # REMOVED
    title="🇰🇷 네이버 HyperCLOVA X SEED (0.5B) 데모",
    description=(
        f"**모델:** {MODEL_ID}\n"
        f"**환경:** Hugging Face 무료 CPU (16GB RAM)\n"
        f"**주의:** CPU에서 실행되므로 응답 생성에 다소 시간이 걸릴 수 있습니다. (웜업 완료)\n"
        f"최대 생성 토큰 수는 {MAX_NEW_TOKENS}개로 제한됩니다."
    ),
    examples=examples,
    cache_examples=False,
    theme="soft",
)

# --- Application Launch ---
if __name__ == "__main__":
    if load_successful:
        warmup_model()
    else:
        print("Skipping warm-up because model loading failed.")

    print("--- Launching Gradio App ---")
    demo.queue().launch(
        # share=True # Uncomment for public link
        # server_name="0.0.0.0" # Uncomment for local network access
    )