from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch

# Check if CUDA is available for faster inference
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model once, outside of the function
huggingface_token = os.environ.get("KEY2")
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    use_auth_token=huggingface_token
)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    use_auth_token=huggingface_token
).to(device)


def modelFeedback(ats_score, resume_data, job_description):
    """Generate feedback on a resume given its ATS score and a job description."""
    # Compose the prompt from the ATS score, resume text, and job description
    input_prompt = (
        f"ATS score: {ats_score}\n\n"
        f"Resume:\n{resume_data}\n\n"
        f"Job description:\n{job_description}\n\n"
        "Provide concrete feedback on how to improve this resume for the job above."
    )

    try:
        # Tokenize the input
        input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(device)

        # Disable gradient calculation for faster inference
        with torch.no_grad():
            # Generate the output
            output = model.generate(
                input_ids,
                max_length=1500,           # total length: prompt tokens + generated tokens
                do_sample=True,            # sampling must be enabled for temperature to take effect
                temperature=0.01,          # near-greedy, largely deterministic output
                pad_token_id=tokenizer.eos_token_id  # ensure padding works properly
            )

        # Decode the output, dropping special tokens
        response_text = tokenizer.decode(output[0], skip_special_tokens=True)
        return response_text

    except Exception as e:
        print(f"Error during generation: {e}")
        return None
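

# Minimal usage sketch. The resume text, job description, and ATS score below
# are placeholder values for illustration; in practice they would come from an
# upstream parsing/scoring step that is not shown here.
if __name__ == "__main__":
    sample_resume = "Jane Doe - Data Analyst with 4 years of SQL and Python experience."
    sample_job = "Seeking a data analyst proficient in SQL, Python, and dashboarding."
    sample_score = 72  # hypothetical ATS match score out of 100

    feedback = modelFeedback(sample_score, sample_resume, sample_job)
    print(feedback)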