from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import PeftModel
import gradio as gr
from huggingface_hub import login
import os

# Retrieve the Hugging Face token from the environment variable and log in
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None:
    raise ValueError("HF_TOKEN environment variable not found. Please check your Space secrets.")
login(token=hf_token)

# Define model paths
base_model_name = "meta-llama/Llama-3.2-3B-Instruct"
lora_adapter_path = "agilan1102/eysflow_adapters"

# Load the tokenizer and base model, then attach the LoRA adapter.
# (`token=` replaces the deprecated `use_auth_token=` argument.)
tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    token=hf_token,
)
model_with_adapter = PeftModel.from_pretrained(base_model, lora_adapter_path, token=hf_token)
model_with_adapter.eval()


def generate_text_adapter(prompt):
    # Tokenize the prompt, generate a completion, and decode it back to text
    inputs = tokenizer(prompt, return_tensors="pt").to(model_with_adapter.device)
    with torch.no_grad():
        outputs = model_with_adapter.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Create the Gradio interface
demo = gr.Interface(
    fn=generate_text_adapter,
    inputs="text",
    outputs="text",
    title="My Finetuned LLM API",
)

# Launch the interface
demo.launch()
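
# --- Client usage sketch (assumption) ---
# A minimal sketch of calling this app remotely with `gradio_client`, assuming
# the script is deployed as a public Hugging Face Space. "your-username/your-space"
# is a hypothetical Space id (not given in this file), and gr.Interface exposes
# the wrapped function under its default "/predict" endpoint.
#
#   from gradio_client import Client
#
#   client = Client("your-username/your-space")  # hypothetical Space id
#   result = client.predict("Explain LoRA in one sentence.", api_name="/predict")
#   print(result)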