Spaces:
Runtime error
Runtime error
File size: 3,784 Bytes
6cfbf56 03f374b 6cfbf56 f47c9d2 6cfbf56 f47c9d2 6cfbf56 290e051 6cfbf56 03f374b 6cfbf56 b730d3a 6cfbf56 b730d3a 6cfbf56 b730d3a 6cfbf56 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# -*- coding: utf-8 -*-
"""Copy of assessment3_Elina_Hemink.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW
## Create embeddings of the email dataset and store in a chromadb database
"""
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
import email
from sklearn.model_selection import train_test_split
# Load the emails.csv dataset and preview the first rows
emails = pd.read_csv('emails.csv')
print(emails.head())

# Extract the body of every raw message and collect the cleaned text in a list
content_text = []
for item in emails.message:
    text = email.message_from_string(item)
    if text.is_multipart():
        # For multipart mail get_payload() returns a list of sub-messages,
        # which has no .replace(); concatenate the leaf parts' text instead.
        message = "".join(
            part.get_payload()
            for part in text.walk()
            if not part.is_multipart()
        )
    else:
        message = text.get_payload()
    # Strip line breaks first, then the quoted-reply marker left on one line
    cleaned_message = message.replace("\n", "").replace("\r", "").replace("> >>> > >", "")
    content_text.append(cleaned_message)
# Take a small sample of the dataset: it is too large to complete the
# embedding step on every email
train, test = train_test_split(content_text, train_size=0.001)

# Ids for the ChromaDB collection, one per sampled email: "id1", "id2", ...
# Built with a comprehension so no module-level name shadows the builtin id().
ids = [f'id{position}' for position in range(1, len(train) + 1)]
# Create the "Enron_emails" collection and load the sampled emails into it
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")
# Only the raw documents and their ids are supplied in this call
collection.add(documents=train, ids=ids)
"""## Fine-tune a Language Model on the Dataset"""
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Load pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Register a padding token so the padding=True call below can pad the batch
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# The added token enlarges the vocabulary; grow the model's embedding matrix
# to match, otherwise the new token id has no embedding row.
model.resize_token_embeddings(len(tokenizer))

# Tokenize the sampled emails with truncation and padding enabled
tokenized_emails = tokenizer(train, truncation=True, padding=True)

# Extract token IDs from the BatchEncoding object
token_ids_list = tokenized_emails['input_ids']

# Save token IDs to a text file: one email per line, ids separated by spaces
# NOTE(review): TextDataset below is given this file of id numbers as its text
# corpus, so the model appears to be fine-tuned on digit strings rather than
# on the email text itself -- confirm this is intended (question_answer parses
# generated numbers back into token ids).
with open('tokenized_emails.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        ' '.join(map(str, token_ids)) + '\n'
        for token_ids in token_ids_list
    )

# Initialize TextDataset with the file path
dataset = TextDataset(tokenizer=tokenizer, file_path='tokenized_emails.txt', block_size=128)
# Training configuration: output directory, epoch count and per-device batch size
training_args = TrainingArguments(output_dir='./output', num_train_epochs=3, per_device_train_batch_size=8)

# Language-modeling collator with masked-LM mode switched off (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hand model, arguments, dataset and collator to a Trainer, then fine-tune
trainer = Trainer(model=model, args=training_args, train_dataset=dataset, data_collator=data_collator)
trainer.train()

# Store the fine-tuned model and its tokenizer in the same directory
model.save_pretrained("/fine_tuned_model")
tokenizer.save_pretrained("/fine_tuned_model")
"""## Create a Gradio Interface"""
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Directory the fine-tuned model and tokenizer are reloaded from
model_dir = "/fine_tuned_model"
model = AutoModelForCausalLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Text-generation pipeline built on the reloaded model and tokenizer
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)
# Define question_answering function
def question_answer(question):
    """Generate a continuation of *question* and return it as readable text.

    The module-level ``text_gen`` pipeline produces the continuation. When
    the continuation is a whitespace-separated run of plain decimal numbers
    it is treated as token ids and decoded with the module-level
    ``tokenizer``; ids outside the tokenizer's vocabulary are dropped. Any
    other output is returned as stripped plain text instead of raising
    ``ValueError`` from ``int()``.

    Args:
        question: Prompt text passed to the generation pipeline.

    Returns:
        The decoded answer string (possibly empty).
    """
    generated = text_gen(question, max_length=200, num_return_sequences=1)
    full_text = generated[0]['generated_text']

    # Remove the echoed prompt only where it leads the output; a blanket
    # replace() would also delete later repetitions of the question.
    if question and full_text.startswith(question):
        continuation = full_text[len(question):]
    else:
        continuation = full_text
    continuation = continuation.strip()

    pieces = continuation.split()
    # isascii() guards against digit-like characters (e.g. superscripts)
    # that pass isdigit() but are rejected by int().
    looks_like_ids = bool(pieces) and all(
        piece.isascii() and piece.isdigit() for piece in pieces
    )
    if not looks_like_ids:
        return continuation

    # Keep only ids the tokenizer can actually map back to a token.
    vocab_size = len(tokenizer)
    generated_token_ids = [
        token_id for token_id in map(int, pieces) if token_id < vocab_size
    ]
    return tokenizer.decode(generated_token_ids)
# Build the Gradio UI: text in, text out, answered by question_answer
iface = gr.Interface(
    fn=question_answer,
    inputs='text',
    outputs='text',
    title='Fine-tuned Enron Question Answering',
    description='Ask a question regarding the Enron case',
)
# Start serving the interface
iface.launch()
"""## Deploy the Gradio Interface in a Huggingface Space"""
|