# Inference configuration: text-generation settings, trainer settings, and
# the model / data paths used for the run.
# NOTE(review): the nesting below was reconstructed from key order after the
# file had been collapsed onto one line -- confirm `batch_size` belongs under
# `inference` rather than at the top level.

inference:
  # Use greedy decoding when true; otherwise sample from the distribution.
  greedy: false
  # The number of highest-probability vocabulary tokens to keep for
  # top-k filtering.
  top_k: 0
  # If set to a float < 1, only the most probable tokens with probabilities
  # that add up to top_p or higher are kept for generation.
  top_p: 0.9
  temperature: 1.0  # sampling temperature
  add_BOS: true  # add the BOS token at the beginning of the prompt
  # Number of tokens to generate (presumably the upper bound on generated
  # length; the lower bound is min_tokens_to_generate).
  tokens_to_generate: 30
  all_probs: false  # whether to return the log prob for all tokens in vocab
  repetition_penalty: 1.2  # repetition penalty; 1.0 means no penalty
  min_tokens_to_generate: 0  # minimum length of the generated sequence
  # Flag used to compute the logprob of all the input text; a very special
  # case of running inference. Default: false.
  compute_logprob: false
  batch_size: 1

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: false  # logger provided by exp_manager
  precision: 16  # 16, 32, or bf16

tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
gpt_model_file: null  # GPT nemo file path
# Path to a MegatronGPTPromptLearningModel model if you want to use soft
# prompts. `???` marks the value as mandatory (must be supplied by the user).
virtual_prompt_model_file: ???
# Path where model predictions will be written (mandatory).
pred_file_path: ???
# Paths to the .jsonl files you want to perform inference on.
data_paths: null
num_workers: 8