# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Model Info
# model_path = '/Users/heykalsayid/Desktop/skill-academy/projects/ai-porto/deployment/app/model/eleutherai-finetuned'
model_path_hf = 'paacamo/EleutherAI-pythia-1b-finetuned-nvidia-faq'

# Loaded once at import time so every call to text_generation reuses them.
tokenizer = AutoTokenizer.from_pretrained(model_path_hf)
model = AutoModelForCausalLM.from_pretrained(model_path_hf)


def text_generation(text, model=model, tokenizer=tokenizer, max_input_token=300, max_output_token=100):
    """Generate a sampled continuation of *text* with a causal language model.

    The prompt is tokenized (keeping at most ``max_input_token`` tokens,
    dropping tokens from the left when it is longer), passed to
    ``model.generate`` with sampling enabled (``top_p=0.95``,
    ``temperature=0.7``), and only the newly generated tokens are decoded.

    Args:
        text: Prompt to continue. Only the first sequence of the generation
            output is decoded, so pass a single prompt.
        model: Causal LM used for generation; defaults to the module-level
            model. It is moved to CUDA when available, otherwise to CPU.
        tokenizer: Tokenizer matching ``model``; defaults to the module-level
            tokenizer.
        max_input_token: Maximum number of prompt tokens kept after
            truncation.
        max_output_token: Maximum number of new tokens to generate.

    Returns:
        The decoded generated text, without the prompt and without special
        tokens.
    """
    # padding=True requires a pad token; fall back to EOS when the tokenizer
    # does not define one. This matches the pad_token_id passed to generate().
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize. Truncate from the left so the end of a long prompt (the part
    # closest to where generation starts) is what gets kept. The tokenizer is
    # shared, so its previous truncation side is restored afterwards.
    previous_truncation_side = tokenizer.truncation_side
    tokenizer.truncation_side = 'left'
    try:
        input_encoded = tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_input_token,
        )
    finally:
        tokenizer.truncation_side = previous_truncation_side

    # Pass the attention mask explicitly to generate().
    input_ids = input_encoded['input_ids']
    attention_mask = input_encoded['attention_mask']

    # Generate on GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    output_ids = model.generate(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        max_new_tokens=max_output_token,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_p=0.95,
        temperature=0.7,
    )

    # Decode only the tokens that follow the prompt in the first sequence.
    prompt_length = input_ids.shape[-1]
    generated_text_answer = tokenizer.decode(
        output_ids[0][prompt_length:],
        skip_special_tokens=True,
    )
    return generated_text_answer