import transformers
import torch

# Specify the model you want to use
model_id = "nvidia/Llama-3.1-Nemotron-8B-UltraLong-4M-Instruct"

# Set up the text-generation pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},  # Load the weights in bfloat16 to reduce memory use
    device_map="auto",  # Place the model on available hardware (GPU or CPU)
)

# Alternative for CPU-only machines: set an explicit device instead of device_map
# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model_id,
#     model_kwargs={"torch_dtype": torch.bfloat16},
#     device=-1,  # Use CPU (do not combine with device_map)
# )

# Define the conversation/messages you want the model to handle
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Use the pipeline to generate a response; pass the full message list so the
# system prompt is applied through the model's chat template
outputs = pipeline(
    messages,
    max_new_tokens=256,  # Limit the number of newly generated tokens
)

# With chat-format input, generated_text holds the whole conversation;
# the assistant's reply is the last message
print(outputs[0]["generated_text"][-1]["content"])
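If you want to see what the pipeline does under the hood, here is a minimal standalone sketch using the lower-level AutoTokenizer/AutoModelForCausalLM API. It assumes the checkpoint ships a chat template (standard for instruct models); generation settings here are illustrative, not the model's recommended defaults.

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "nvidia/Llama-3.1-Nemotron-8B-UltraLong-4M-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# apply_chat_template renders the messages with the model's chat template and
# appends the assistant header so generation continues as the assistant
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=256)

# Decode only the newly generated tokens, skipping the prompt
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))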