import transformers
import torch

# Specify the model you want to use
model_id = "nvidia/Llama-3.1-Nemotron-8B-UltraLong-4M-Instruct"

# Set up the text-generation pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},  # Load the weights in bfloat16 to reduce memory use
    device_map="auto",  # Place the model on available hardware (GPU or CPU)
)

# Alternative for CPU-only machines: set an explicit device instead of device_map
# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model_id,
#     model_kwargs={"torch_dtype": torch.bfloat16},
#     device=-1,  # Use CPU (do not combine with device_map)
# )

# Define the conversation/messages you want the model to handle
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Use the pipeline to generate a response; pass the full message list so the
# system prompt is applied through the model's chat template
outputs = pipeline(
    messages,
    max_new_tokens=256,  # Limit the number of newly generated tokens
)

# With chat-format input, generated_text holds the whole conversation;
# the assistant's reply is the last message
print(outputs[0]["generated_text"][-1]["content"])
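If you want to see what the pipeline does under the hood, here is a minimal standalone sketch using the lower-level AutoTokenizer/AutoModelForCausalLM API. It assumes the checkpoint ships a chat template (standard for instruct models); generation settings here are illustrative, not the model's recommended defaults.

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "nvidia/Llama-3.1-Nemotron-8B-UltraLong-4M-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# apply_chat_template renders the messages with the model's chat template and
# appends the assistant header so generation continues as the assistant
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=256)

# Decode only the newly generated tokens, skipping the prompt
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))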