---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
---

This tiny model is for debugging. It is randomly initialized, using a config adapted from [meta-llama/Llama-4-Maverick-17B-128E-Instruct](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct).

### Example usage:

```python
import torch
from transformers import AutoProcessor, Llama4ForConditionalGeneration

model_id = "yujiepan/llama-4-8E-tiny-random"
processor = AutoProcessor.from_pretrained(model_id)
model = Llama4ForConditionalGeneration.from_pretrained(
    model_id,
    attn_implementation="sdpa",  # flex attention / flash_attention_2 do not work, debugging...
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": url1},
            {"type": "image", "url": url2},
            {"type": "text", "text": "Can you describe how these two images are similar, and how they differ?"},
        ],
    },
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=32,
)
response = processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0]
print(response)
print(outputs[0])
```

### Code to create this repo:

```python
import json

import torch
from huggingface_hub import hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    Llama4ForConditionalGeneration,
    pipeline,
    set_seed,
)

source_model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
save_folder = "/tmp/yujiepan/llama-4-8E-tiny-random"

processor = AutoProcessor.from_pretrained(source_model_id)
processor.save_pretrained(save_folder)

# Shrink the text model while keeping the Llama-4 architectural features.
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r') as f:
    config_json = json.load(f)
config_json["text_config"]["num_hidden_layers"] = 4  # enough layers to trigger both no-rope and MoE layers
config_json["text_config"]["hidden_size"] = 32
config_json["text_config"]["head_dim"] = 32  # vllm requires dim >= 32
config_json["text_config"]["num_attention_heads"] = 1
config_json["text_config"]["num_key_value_heads"] = 1
config_json["text_config"]["use_qk_norm"] = True
config_json["text_config"]["intermediate_size"] = 64
config_json["text_config"]["intermediate_size_mlp"] = 128
config_json["text_config"]["num_local_experts"] = 8
config_json["text_config"]["tie_word_embeddings"] = True

# Shrink the vision tower accordingly.
config_json["vision_config"]["num_hidden_layers"] = 2
config_json["vision_config"]["hidden_size"] = 32
config_json["vision_config"]["intermediate_size"] = 128
assert config_json["vision_config"]["intermediate_size"] == int(
    config_json["vision_config"]["hidden_size"] // config_json["vision_config"]["pixel_shuffle_ratio"] ** 2
)
config_json["vision_config"]["num_attention_heads"] = 1
config_json["vision_config"]["projector_input_dim"] = 32
config_json["vision_config"]["projector_output_dim"] = 32
config_json["vision_config"]["vision_output_dim"] = 32

with open(f"{save_folder}/config.json", "w") as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(save_folder)
print(config)

# Instantiate the model directly in bfloat16, then restore the default dtype.
torch.set_default_dtype(torch.bfloat16)
model = Llama4ForConditionalGeneration(config)
torch.set_default_dtype(torch.float32)
model.generation_config = GenerationConfig.from_pretrained(
    source_model_id, trust_remote_code=True,
)

# Re-initialize every parameter from a fixed seed so the checkpoint is reproducible.
set_seed(42)
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.5)
        print(name, p.shape)
model.save_pretrained(save_folder)
```
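
### Quick sanity check:

Since the weights are random, the only thing worth verifying is that the checkpoint loads and generates. The snippet below is a minimal text-only smoke test, not part of the creation script; the prompt, `max_new_tokens`, and the parameter-count printout are arbitrary choices.

```python
import torch
from transformers import AutoProcessor, Llama4ForConditionalGeneration

model_id = "yujiepan/llama-4-8E-tiny-random"
processor = AutoProcessor.from_pretrained(model_id)
model = Llama4ForConditionalGeneration.from_pretrained(
    model_id,
    attn_implementation="sdpa",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# The whole point of the repo is to be tiny; print the parameter count as a sanity check.
print(f"total parameters: {sum(p.numel() for p in model.parameters()):,}")

# Text-only chat request (no images); the reply is meaningless because the weights are random.
messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=16)
print(processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])[0])
```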