Question Answering
sanjudebnath committed on
Commit
4743e80
verified
1 Parent(s): 9515c2a

Upload 8 files

Files changed (8)
  1. application.py +70 -0
  2. distilbert.ipynb +981 -0
  3. distilbert.py +175 -0
  4. load_data.ipynb +1209 -0
  5. qa_model.py +532 -0
  6. question_answering.ipynb +2403 -0
  7. requirements.txt +168 -0
  8. util.py +134 -0
application.py ADDED
@@ -0,0 +1,70 @@
+ import streamlit as st
+ import numpy as np
+ import torch
+ from transformers import DistilBertTokenizer, DistilBertForMaskedLM
+
+ from qa_model import ReuseQuestionDistilBERT
+
+ @st.cache(allow_output_mutation=True)
+ def load_model():
+     mod = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased").distilbert
+     m = ReuseQuestionDistilBERT(mod)
+     m.load_state_dict(torch.load("distilbert_reuse.model", map_location=torch.device('cpu')))
+     model = m
+     del mod
+     del m
+     tokenizer = DistilBertTokenizer.from_pretrained('qa_tokenizer')
+     return model, tokenizer
+
+
+ def get_answer(question, text, tokenizer, model):
+     question = [question.strip()]
+     text = [text.strip()]
+
+     inputs = tokenizer(
+         question,
+         text,
+         max_length=512,
+         truncation="only_second",
+         padding="max_length",
+     )
+     input_ids = torch.tensor(inputs['input_ids'])
+     outputs = model(input_ids, attention_mask=torch.tensor(inputs['attention_mask']), start_positions=None, end_positions=None)
+
+     start = torch.argmax(outputs['start_logits'])
+     end = torch.argmax(outputs['end_logits'])
+
+     ans_tokens = input_ids[0][start: end + 1]
+
+     answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
+     predicted = tokenizer.convert_tokens_to_string(answer_tokens)
+     return predicted
+
+
+ def main():
+     st.set_page_config(page_title="Question Answering Tool", page_icon=":mag_right:")
+
+     st.write("# Question Answering Tool \n"
+              "This tool will help you find answers to your questions about the text you provide. \n"
+              "Please enter your question and the text you want to search in the boxes below.")
+     model, tokenizer = load_model()
+
+     with st.form("qa_form"):
+         # define a streamlit textarea
+         text = st.text_area("Enter your text here", on_change=None)
+
+         # define a streamlit input
+         question = st.text_input("Enter your question here")
+
+         if st.form_submit_button("Submit"):
+             data_load_state = st.text('Let me think about that...')
+             # call the function to get the answer
+             answer = get_answer(question, text, tokenizer, model)
+             # display the answer
+             if answer == "":
+                 data_load_state.text("Sorry but I don't know the answer to that question")
+             else:
+                 data_load_state.text(answer)
+
+
+ main()
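
For reference, the app above is launched with `streamlit run application.py`. The same inference path can also be exercised from plain Python; the following is a minimal sketch that mirrors `load_model`/`get_answer` from `application.py` and assumes the `distilbert_reuse.model` weights and the `qa_tokenizer/` directory from this repository are present in the working directory.

```python
# Minimal sketch (assumes distilbert_reuse.model and qa_tokenizer/ exist locally).
import torch
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
from qa_model import ReuseQuestionDistilBERT

# rebuild the fine-tuned QA model exactly as load_model() in application.py does
base = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased").distilbert
model = ReuseQuestionDistilBERT(base)
model.load_state_dict(torch.load("distilbert_reuse.model", map_location="cpu"))
model.eval()
tokenizer = DistilBertTokenizer.from_pretrained("qa_tokenizer")

question = "When was the Eiffel Tower completed?"
context = "The Eiffel Tower was completed in 1889 and is located in Paris."

inputs = tokenizer([question], [context], max_length=512,
                   truncation="only_second", padding="max_length")
input_ids = torch.tensor(inputs["input_ids"])
with torch.no_grad():
    out = model(input_ids,
                attention_mask=torch.tensor(inputs["attention_mask"]),
                start_positions=None, end_positions=None)

# take the most likely start/end positions and decode the answer span
start = torch.argmax(out["start_logits"])
end = torch.argmax(out["end_logits"])
tokens = tokenizer.convert_ids_to_tokens(input_ids[0][start:end + 1], skip_special_tokens=True)
print(tokenizer.convert_tokens_to_string(tokens))
```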
distilbert.ipynb ADDED
@@ -0,0 +1,981 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "47700837",
6
+ "metadata": {},
7
+ "source": [
8
+ "# DistilBERT Base Model\n",
9
+ "The following contains the code to create and train a DistilBERT model using the Huggingface library. It works quite well for a moderate amount of data, but the runtime increases quite drastically with data.\n",
10
+ "\n",
11
+ "I decided to take the pretrained model after all, still, creating the model myself was quite interesting!"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 4,
17
+ "id": "c09fa906",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "from pathlib import Path\n",
22
+ "import torch\n",
23
+ "import time\n",
24
+ "from pathlib import Path\n",
25
+ "from transformers import DistilBertTokenizerFast\n",
26
+ "import os\n",
27
+ "from transformers import DistilBertConfig\n",
28
+ "from transformers import DistilBertForMaskedLM\n",
29
+ "from tokenizers import BertWordPieceTokenizer\n",
30
+ "from tqdm.auto import tqdm\n",
31
+ "from torch.optim import AdamW\n",
32
+ "import torchtest\n",
33
+ "from transformers import pipeline\n",
34
+ "\n",
35
+ "\n",
36
+ "from distilbert import test_model\n",
37
+ "from distilbert import Dataset\n",
38
+ "\n",
39
+ "import numpy as np"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "markdown",
44
+ "id": "3b773fac",
45
+ "metadata": {},
46
+ "source": [
47
+ "## Tokeniser\n",
48
+ "We need a way to convert the strings we get as the input to numerical tokens, that we can give to the neual network. Hence, we take a BertWorkPieceTokenizer (works for DistilBERT too) and create tokens from our words."
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 5,
54
+ "id": "24277c5b",
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stdout",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "Tokeniser created\n"
62
+ ]
63
+ }
64
+ ],
65
+ "source": [
66
+ "fit_new_tokenizer = True\n",
67
+ "\n",
68
+ "if fit_new_tokenizer:\n",
69
+ " paths = [str(x) for x in Path('data/original').glob('**/*.txt')]\n",
70
+ "\n",
71
+ " tokenizer = BertWordPieceTokenizer(\n",
72
+ " clean_text=True,\n",
73
+ " handle_chinese_chars=False,\n",
74
+ " strip_accents=False,\n",
75
+ " lowercase=True\n",
76
+ " )\n",
77
+ " print(\"Tokeniser created\")"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 6,
83
+ "id": "beacf3e3",
84
+ "metadata": {},
85
+ "outputs": [
86
+ {
87
+ "name": "stdout",
88
+ "output_type": "stream",
89
+ "text": [
90
+ "\n",
91
+ "\n",
92
+ "\n"
93
+ ]
94
+ }
95
+ ],
96
+ "source": [
97
+ "# fit the tokenizer\n",
98
+ "if fit_new_tokenizer:\n",
99
+ " tokenizer.train(files=paths[:10], vocab_size=30_000, min_frequency=2,\n",
100
+ " limit_alphabet=1000, wordpieces_prefix='##',\n",
101
+ " special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 7,
107
+ "id": "0d462cc5",
108
+ "metadata": {},
109
+ "outputs": [
110
+ {
111
+ "ename": "FileExistsError",
112
+ "evalue": "[Errno 17] File exists: './tokeniser'",
113
+ "output_type": "error",
114
+ "traceback": [
115
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
116
+ "\u001b[0;31mFileExistsError\u001b[0m Traceback (most recent call last)",
117
+ "Cell \u001b[0;32mIn [7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m fit_new_tokenizer:\n\u001b[0;32m----> 2\u001b[0m os\u001b[38;5;241m.\u001b[39mmkdir(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./tokeniser\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 3\u001b[0m tokenizer\u001b[38;5;241m.\u001b[39msave_model(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtokeniser\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTokeniser saved\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
118
+ "\u001b[0;31mFileExistsError\u001b[0m: [Errno 17] File exists: './tokeniser'"
119
+ ]
120
+ }
121
+ ],
122
+ "source": [
123
+ "if fit_new_tokenizer:\n",
124
+ " os.mkdir('./tokeniser')\n",
125
+ " tokenizer.save_model('tokeniser')\n",
126
+ " print(\"Tokeniser saved\")"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "markdown",
131
+ "id": "7eaa1667",
132
+ "metadata": {},
133
+ "source": [
134
+ "After having created a basic tokeniser, we use the model to initialise a DistilBert tokenizer, that we need for the model architecture later on. We save the tokeniser separately."
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 8,
140
+ "id": "f4dd0684",
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "data": {
145
+ "text/plain": [
146
+ "('distilbert_tokenizer/tokenizer_config.json',\n",
147
+ " 'distilbert_tokenizer/special_tokens_map.json',\n",
148
+ " 'distilbert_tokenizer/vocab.txt',\n",
149
+ " 'distilbert_tokenizer/added_tokens.json',\n",
150
+ " 'distilbert_tokenizer/tokenizer.json')"
151
+ ]
152
+ },
153
+ "execution_count": 8,
154
+ "metadata": {},
155
+ "output_type": "execute_result"
156
+ }
157
+ ],
158
+ "source": [
159
+ "tokenizer = DistilBertTokenizerFast.from_pretrained('tokeniser', max_len=512)\n",
160
+ "tokenizer.save_pretrained(\"distilbert_tokenizer\")"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "markdown",
165
+ "id": "bfcafcde",
166
+ "metadata": {},
167
+ "source": [
168
+ "### Testing\n",
169
+ "We now test the created tokenizer. We take a simple example and tokenise the input. It can be seen that we add a special token in the beginning and end ('CLS' and 'SEP'), which is how the BERT model was defined.\n",
170
+ "\n",
171
+ "When we translate the input back, we can see that we get the same, except for the first and last token. Also, we can see that questionmarks and commas are encoded separately."
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 9,
177
+ "id": "37e7f6a8",
178
+ "metadata": {},
179
+ "outputs": [
180
+ {
181
+ "name": "stdout",
182
+ "output_type": "stream",
183
+ "text": [
184
+ "{'input_ids': [2, 10958, 16, 2175, 1993, 1965, 35, 3], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "tokens = tokenizer('Hello, how are you?')\n",
190
+ "print(tokens)"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": 10,
196
+ "id": "bbd0c4b1",
197
+ "metadata": {},
198
+ "outputs": [
199
+ {
200
+ "data": {
201
+ "text/plain": [
202
+ "'[CLS] hello, how are you? [SEP]'"
203
+ ]
204
+ },
205
+ "execution_count": 10,
206
+ "metadata": {},
207
+ "output_type": "execute_result"
208
+ }
209
+ ],
210
+ "source": [
211
+ "tokenizer.decode(tokens['input_ids'])"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 11,
217
+ "id": "4ab6e506",
218
+ "metadata": {},
219
+ "outputs": [
220
+ {
221
+ "name": "stdout",
222
+ "output_type": "stream",
223
+ "text": [
224
+ "[CLS]\n",
225
+ "hello\n",
226
+ ",\n",
227
+ "how\n",
228
+ "are\n",
229
+ "you\n",
230
+ "?\n",
231
+ "[SEP]\n"
232
+ ]
233
+ }
234
+ ],
235
+ "source": [
236
+ "for tok in tokens['input_ids']:\n",
237
+ " print(tokenizer.decode(tok))"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": 12,
243
+ "id": "c75d3255",
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "assert len(tokenizer.vocab) == 30_000"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "markdown",
252
+ "id": "dd114355",
253
+ "metadata": {},
254
+ "source": [
255
+ "## Dataset\n",
256
+ "We now define a function to mask some of the tokens. In particular, we create a Dataset class, that automates loading the data and tokenising it for us. Lastly, we use a DataLoader to load the data step by step into memory.\n",
257
+ "\n",
258
+ "The big problem with the limited resources we have is memory. In particular, I am loading the data sequentially, file by file, keeping track how many samples have been read. Shuffling wouldn't work here (it would also not make a lot of sense for this dataset)."
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 10,
264
+ "id": "bff9ea54",
265
+ "metadata": {},
266
+ "outputs": [],
267
+ "source": [
268
+ "# create dataset and dataloader \n",
269
+ "dataset = Dataset(paths = [str(x) for x in Path('data/original').glob('**/*.txt')][50:70], tokenizer=tokenizer)\n",
270
+ "loader = torch.utils.data.DataLoader(dataset, batch_size=8)\n",
271
+ "\n",
272
+ "test_dataset = Dataset(paths = [str(x) for x in Path('data/original').glob('**/*.txt')][10:12], tokenizer=tokenizer)\n",
273
+ "test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4)"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "markdown",
278
+ "id": "6bbe6e63",
279
+ "metadata": {},
280
+ "source": [
281
+ "### Testing\n",
282
+ "The randomisation makes it a bit difficult to test. But altogether, we see that the input ids, masks and labels have the same shape. Also, as we mask 15% of the samples, when decoding a given sample, we can see that some samples are now '[MASK]'."
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 11,
288
+ "id": "436ab745",
289
+ "metadata": {},
290
+ "outputs": [],
291
+ "source": [
292
+ "i = iter(dataset)"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "code",
297
+ "execution_count": 12,
298
+ "id": "330e599d",
299
+ "metadata": {},
300
+ "outputs": [
301
+ {
302
+ "name": "stdout",
303
+ "output_type": "stream",
304
+ "text": [
305
+ "Passed\n"
306
+ ]
307
+ }
308
+ ],
309
+ "source": [
310
+ "for j in range(10):\n",
311
+ " sample = next(i)\n",
312
+ " \n",
313
+ " input_ids = sample['input_ids']\n",
314
+ " attention_masks = sample['attention_mask']\n",
315
+ " labels = sample['labels']\n",
316
+ " \n",
317
+ " # check if the dimensions are right\n",
318
+ " assert input_ids.shape[0] == (512)\n",
319
+ " assert attention_masks.shape[0] == (512)\n",
320
+ " assert labels.shape[0] == (512)\n",
321
+ " \n",
322
+ " # if the input ids are not masked, the labels are the same as the input ids\n",
323
+ " assert np.array_equal(input_ids[input_ids != 4].numpy(),labels[input_ids != 4].numpy())\n",
324
+ " # input ids are zero if the attention masks are zero\n",
325
+ " assert np.all(input_ids[attention_masks == 0].numpy()==0)\n",
326
+ " # check if input contains masked tokens (we can't guarantee this 100% but this will apply) most likely\n",
327
+ " assert np.any(input_ids.numpy() == 4)\n",
328
+ "print(\"Passed\")"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "markdown",
333
+ "id": "08db6d22",
334
+ "metadata": {},
335
+ "source": [
336
+ "## Model\n",
337
+ "In the following section, we intialise and train a model."
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 13,
343
+ "id": "7803bda6",
344
+ "metadata": {},
345
+ "outputs": [],
346
+ "source": [
347
+ "config = DistilBertConfig(\n",
348
+ " vocab_size=30000,\n",
349
+ " max_position_embeddings=514\n",
350
+ ")"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": 14,
356
+ "id": "8ca03f6a",
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "model = DistilBertForMaskedLM(config)"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 15,
366
+ "id": "4da22bff",
367
+ "metadata": {
368
+ "scrolled": false
369
+ },
370
+ "outputs": [
371
+ {
372
+ "name": "stderr",
373
+ "output_type": "stream",
374
+ "text": [
375
+ "/home/sanju/anaconda3/envs/myenv/lib/python3.10/site-packages/torch/cuda/__init__.py:83: UserWarning: CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:109.)\n",
376
+ " return torch._C._cuda_getDeviceCount() > 0\n"
377
+ ]
378
+ },
379
+ {
380
+ "data": {
381
+ "text/plain": [
382
+ "DistilBertForMaskedLM(\n",
383
+ " (activation): GELUActivation()\n",
384
+ " (distilbert): DistilBertModel(\n",
385
+ " (embeddings): Embeddings(\n",
386
+ " (word_embeddings): Embedding(30000, 768, padding_idx=0)\n",
387
+ " (position_embeddings): Embedding(514, 768)\n",
388
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
389
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
390
+ " )\n",
391
+ " (transformer): Transformer(\n",
392
+ " (layer): ModuleList(\n",
393
+ " (0): TransformerBlock(\n",
394
+ " (attention): MultiHeadSelfAttention(\n",
395
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
396
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
397
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
398
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
399
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
400
+ " )\n",
401
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
402
+ " (ffn): FFN(\n",
403
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
404
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
405
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
406
+ " (activation): GELUActivation()\n",
407
+ " )\n",
408
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
409
+ " )\n",
410
+ " (1): TransformerBlock(\n",
411
+ " (attention): MultiHeadSelfAttention(\n",
412
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
413
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
414
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
415
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
416
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
417
+ " )\n",
418
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
419
+ " (ffn): FFN(\n",
420
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
421
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
422
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
423
+ " (activation): GELUActivation()\n",
424
+ " )\n",
425
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
426
+ " )\n",
427
+ " (2): TransformerBlock(\n",
428
+ " (attention): MultiHeadSelfAttention(\n",
429
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
430
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
431
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
432
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
433
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
434
+ " )\n",
435
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
436
+ " (ffn): FFN(\n",
437
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
438
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
439
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
440
+ " (activation): GELUActivation()\n",
441
+ " )\n",
442
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
443
+ " )\n",
444
+ " (3): TransformerBlock(\n",
445
+ " (attention): MultiHeadSelfAttention(\n",
446
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
447
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
448
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
449
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
450
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
451
+ " )\n",
452
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
453
+ " (ffn): FFN(\n",
454
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
455
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
456
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
457
+ " (activation): GELUActivation()\n",
458
+ " )\n",
459
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
460
+ " )\n",
461
+ " (4): TransformerBlock(\n",
462
+ " (attention): MultiHeadSelfAttention(\n",
463
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
464
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
465
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
466
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
467
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
468
+ " )\n",
469
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
470
+ " (ffn): FFN(\n",
471
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
472
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
473
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
474
+ " (activation): GELUActivation()\n",
475
+ " )\n",
476
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
477
+ " )\n",
478
+ " (5): TransformerBlock(\n",
479
+ " (attention): MultiHeadSelfAttention(\n",
480
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
481
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
482
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
483
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
484
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
485
+ " )\n",
486
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
487
+ " (ffn): FFN(\n",
488
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
489
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
490
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
491
+ " (activation): GELUActivation()\n",
492
+ " )\n",
493
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
494
+ " )\n",
495
+ " )\n",
496
+ " )\n",
497
+ " )\n",
498
+ " (vocab_transform): Linear(in_features=768, out_features=768, bias=True)\n",
499
+ " (vocab_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
500
+ " (vocab_projector): Linear(in_features=768, out_features=30000, bias=True)\n",
501
+ " (mlm_loss_fct): CrossEntropyLoss()\n",
502
+ ")"
503
+ ]
504
+ },
505
+ "execution_count": 15,
506
+ "metadata": {},
507
+ "output_type": "execute_result"
508
+ }
509
+ ],
510
+ "source": [
511
+ "# if we have a GPU - train on gpu\n",
512
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
513
+ "model.to(device)"
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "markdown",
518
+ "id": "6fb8c2e2",
519
+ "metadata": {},
520
+ "source": [
521
+ "### Testing the model\n",
522
+ "I stumbled across some Medium articles on how to test DeepLearning models beforehand \n",
523
+ "* https://thenerdstation.medium.com/how-to-unit-test-machine-learning-code-57cf6fd81765: the package is however deprecated\n",
524
+ "* https://towardsdatascience.com/testing-your-pytorch-models-with-torcheck-cb689ecbc08c: released a package (torcheck)\n",
525
+ "* https://github.com/suriyadeepan/torchtest: I found this package, which is the PyTorch version of the first one and is still maintained.\n",
526
+ "\n",
527
+ "Essentially, testing a model is inherently difficult, because we do not know the result in the beginning. Still, the following four conditions should be satisfied in every model (see second reference above):\n",
528
+ "1. The parameters should change during training (if they are not frozen).\n",
529
+ "2. The parameters should not change if they are frozen.\n",
530
+ "3. The range of the ouput should be in a predefined range.\n",
531
+ "4. The parameters should never contain NaN. The same goes for the outputs too.\n",
532
+ "\n",
533
+ "I tried using the packages, but they do not trivially apply for models with multiple inputs (we have input ids and attention masks). The following is partly adapted from the torchtest package (https://github.com/suriyadeepan/torchtest/blob/master/torchtest/torchtest.py)."
534
+ ]
535
+ },
536
+ {
537
+ "cell_type": "code",
538
+ "execution_count": 16,
539
+ "id": "cfd33fa1",
540
+ "metadata": {},
541
+ "outputs": [],
542
+ "source": [
543
+ "# get smaller dataset\n",
544
+ "test_ds = Dataset(paths = [str(x) for x in Path('data/original').glob('**/*.txt')][:2], tokenizer=tokenizer)\n",
545
+ "test_ds_loader = torch.utils.data.DataLoader(test_ds, batch_size=2)\n",
546
+ "optim=torch.optim.Adam(model.parameters())"
547
+ ]
548
+ },
549
+ {
550
+ "cell_type": "code",
551
+ "execution_count": 17,
552
+ "id": "907db815",
553
+ "metadata": {},
554
+ "outputs": [
555
+ {
556
+ "name": "stdout",
557
+ "output_type": "stream",
558
+ "text": [
559
+ "Passed\n"
560
+ ]
561
+ }
562
+ ],
563
+ "source": [
564
+ "from distilbert import test_model\n",
565
+ "\n",
566
+ "test_model(model, optim, test_ds_loader, device)"
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "markdown",
571
+ "id": "c02c9c4b",
572
+ "metadata": {},
573
+ "source": [
574
+ "### Training the model\n",
575
+ "We use AdamW as the optimiser and train for 10 epochs.\n",
576
+ "\n",
577
+ "Taking the whole dataset, takes about 100 hours per epoch for me, so I wasn't able to do that."
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": 18,
583
+ "id": "178914f8",
584
+ "metadata": {},
585
+ "outputs": [
586
+ {
587
+ "data": {
588
+ "text/plain": [
589
+ "DistilBertForMaskedLM(\n",
590
+ " (activation): GELUActivation()\n",
591
+ " (distilbert): DistilBertModel(\n",
592
+ " (embeddings): Embeddings(\n",
593
+ " (word_embeddings): Embedding(30000, 768, padding_idx=0)\n",
594
+ " (position_embeddings): Embedding(514, 768)\n",
595
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
596
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
597
+ " )\n",
598
+ " (transformer): Transformer(\n",
599
+ " (layer): ModuleList(\n",
600
+ " (0): TransformerBlock(\n",
601
+ " (attention): MultiHeadSelfAttention(\n",
602
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
603
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
604
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
605
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
606
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
607
+ " )\n",
608
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
609
+ " (ffn): FFN(\n",
610
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
611
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
612
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
613
+ " (activation): GELUActivation()\n",
614
+ " )\n",
615
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
616
+ " )\n",
617
+ " (1): TransformerBlock(\n",
618
+ " (attention): MultiHeadSelfAttention(\n",
619
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
620
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
621
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
622
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
623
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
624
+ " )\n",
625
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
626
+ " (ffn): FFN(\n",
627
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
628
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
629
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
630
+ " (activation): GELUActivation()\n",
631
+ " )\n",
632
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
633
+ " )\n",
634
+ " (2): TransformerBlock(\n",
635
+ " (attention): MultiHeadSelfAttention(\n",
636
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
637
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
638
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
639
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
640
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
641
+ " )\n",
642
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
643
+ " (ffn): FFN(\n",
644
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
645
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
646
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
647
+ " (activation): GELUActivation()\n",
648
+ " )\n",
649
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
650
+ " )\n",
651
+ " (3): TransformerBlock(\n",
652
+ " (attention): MultiHeadSelfAttention(\n",
653
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
654
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
655
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
656
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
657
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
658
+ " )\n",
659
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
660
+ " (ffn): FFN(\n",
661
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
662
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
663
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
664
+ " (activation): GELUActivation()\n",
665
+ " )\n",
666
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
667
+ " )\n",
668
+ " (4): TransformerBlock(\n",
669
+ " (attention): MultiHeadSelfAttention(\n",
670
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
671
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
672
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
673
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
674
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
675
+ " )\n",
676
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
677
+ " (ffn): FFN(\n",
678
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
679
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
680
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
681
+ " (activation): GELUActivation()\n",
682
+ " )\n",
683
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
684
+ " )\n",
685
+ " (5): TransformerBlock(\n",
686
+ " (attention): MultiHeadSelfAttention(\n",
687
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
688
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
689
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
690
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
691
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
692
+ " )\n",
693
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
694
+ " (ffn): FFN(\n",
695
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
696
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
697
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
698
+ " (activation): GELUActivation()\n",
699
+ " )\n",
700
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
701
+ " )\n",
702
+ " )\n",
703
+ " )\n",
704
+ " )\n",
705
+ " (vocab_transform): Linear(in_features=768, out_features=768, bias=True)\n",
706
+ " (vocab_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
707
+ " (vocab_projector): Linear(in_features=768, out_features=30000, bias=True)\n",
708
+ " (mlm_loss_fct): CrossEntropyLoss()\n",
709
+ ")"
710
+ ]
711
+ },
712
+ "execution_count": 18,
713
+ "metadata": {},
714
+ "output_type": "execute_result"
715
+ }
716
+ ],
717
+ "source": [
718
+ "model = DistilBertForMaskedLM(config)\n",
719
+ "# if we have a GPU - train on gpu\n",
720
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
721
+ "model.to(device)"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": 19,
727
+ "id": "bb6532be",
728
+ "metadata": {},
729
+ "outputs": [],
730
+ "source": [
731
+ "# we use AdamW as the optimiser\n",
732
+ "optim = AdamW(model.parameters(), lr=1e-4)"
733
+ ]
734
+ },
735
+ {
736
+ "cell_type": "code",
737
+ "execution_count": 20,
738
+ "id": "2fd5d609",
739
+ "metadata": {},
740
+ "outputs": [
741
+ {
742
+ "data": {
743
+ "application/vnd.jupyter.widget-view+json": {
744
+ "model_id": "c3386dc78c65490a96d11ade635d522f",
745
+ "version_major": 2,
746
+ "version_minor": 0
747
+ },
748
+ "text/plain": [
749
+ " 0%| | 0/23750 [00:00<?, ?it/s]"
750
+ ]
751
+ },
752
+ "metadata": {},
753
+ "output_type": "display_data"
754
+ }
755
+ ],
756
+ "source": [
757
+ "epochs = 10\n",
758
+ "\n",
759
+ "for epoch in range(epochs):\n",
760
+ " loop = tqdm(loader, leave=True)\n",
761
+ " \n",
762
+ " # set model to training mode\n",
763
+ " model.train()\n",
764
+ " losses = []\n",
765
+ " \n",
766
+ " # iterate over dataset\n",
767
+ " for batch in loop:\n",
768
+ " optim.zero_grad()\n",
769
+ " \n",
770
+ " # copy input to device\n",
771
+ " input_ids = batch['input_ids'].to(device)\n",
772
+ " attention_mask = batch['attention_mask'].to(device)\n",
773
+ " labels = batch['labels'].to(device)\n",
774
+ " \n",
775
+ " # predict\n",
776
+ " outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
777
+ " \n",
778
+ " # update weights\n",
779
+ " loss = outputs.loss\n",
780
+ " loss.backward()\n",
781
+ " \n",
782
+ " optim.step()\n",
783
+ " \n",
784
+ " # output current loss\n",
785
+ " loop.set_description(f'Epoch {epoch}')\n",
786
+ " loop.set_postfix(loss=loss.item())\n",
787
+ " losses.append(loss.item())\n",
788
+ " \n",
789
+ " del input_ids\n",
790
+ " del attention_mask\n",
791
+ " del labels\n",
792
+ " \n",
793
+ " print(\"Mean Training Loss\", np.mean(losses))\n",
794
+ " losses = []\n",
795
+ " loop = tqdm(test_loader, leave=True)\n",
796
+ " \n",
797
+ " # set model to evaluation mode\n",
798
+ " model.eval()\n",
799
+ " \n",
800
+ " # iterate over dataset\n",
801
+ " for batch in loop:\n",
802
+ " # copy input to device\n",
803
+ " input_ids = batch['input_ids'].to(device)\n",
804
+ " attention_mask = batch['attention_mask'].to(device)\n",
805
+ " labels = batch['labels'].to(device)\n",
806
+ " \n",
807
+ " # predict\n",
808
+ " outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n",
809
+ " \n",
810
+ " # update weights\n",
811
+ " loss = outputs.loss\n",
812
+ " \n",
813
+ " # output current loss\n",
814
+ " loop.set_description(f'Epoch {epoch}')\n",
815
+ " loop.set_postfix(loss=loss.item())\n",
816
+ " losses.append(loss.item())\n",
817
+ " \n",
818
+ " del input_ids\n",
819
+ " del attention_mask\n",
820
+ " del labels\n",
821
+ " print(\"Mean Test Loss\", np.mean(losses))"
822
+ ]
823
+ },
824
+ {
825
+ "cell_type": "code",
826
+ "execution_count": 22,
827
+ "id": "03c23c3e",
828
+ "metadata": {},
829
+ "outputs": [],
830
+ "source": [
831
+ "# save the pretrained model\n",
832
+ "torch.save(model, \"distilbert.model\")"
833
+ ]
834
+ },
835
+ {
836
+ "cell_type": "code",
837
+ "execution_count": 25,
838
+ "id": "9b18d3e3",
839
+ "metadata": {},
840
+ "outputs": [],
841
+ "source": [
842
+ "model = torch.load(\"distilbert.model\")"
843
+ ]
844
+ },
845
+ {
846
+ "cell_type": "markdown",
847
+ "id": "e6ad94db",
848
+ "metadata": {},
849
+ "source": [
850
+ "### Testing\n",
851
+ "Huggingface provides a library to quickly be able to see what word the model would predict for our masked token."
852
+ ]
853
+ },
854
+ {
855
+ "cell_type": "code",
856
+ "execution_count": 27,
857
+ "id": "7c8582d2",
858
+ "metadata": {},
859
+ "outputs": [],
860
+ "source": [
861
+ "fill = pipeline(\"fill-mask\", model='distilbert', config=config, tokenizer='distilbert_tokenizer')"
862
+ ]
863
+ },
864
+ {
865
+ "cell_type": "code",
866
+ "execution_count": 28,
867
+ "id": "d309e57f",
868
+ "metadata": {},
869
+ "outputs": [
870
+ {
871
+ "data": {
872
+ "text/plain": [
873
+ "[{'score': 0.19730663299560547,\n",
874
+ " 'token': 2965,\n",
875
+ " 'token_str': 'change',\n",
876
+ " 'sequence': 'it seems important to tackle the climate change.'},\n",
877
+ " {'score': 0.12946806848049164,\n",
878
+ " 'token': 5215,\n",
879
+ " 'token_str': 'crisis',\n",
880
+ " 'sequence': 'it seems important to tackle the climate crisis.'},\n",
881
+ " {'score': 0.05868387222290039,\n",
882
+ " 'token': 3688,\n",
883
+ " 'token_str': 'issues',\n",
884
+ " 'sequence': 'it seems important to tackle the climate issues.'},\n",
885
+ " {'score': 0.047418754547834396,\n",
886
+ " 'token': 3406,\n",
887
+ " 'token_str': 'issue',\n",
888
+ " 'sequence': 'it seems important to tackle the climate issue.'},\n",
889
+ " {'score': 0.027855267748236656,\n",
890
+ " 'token': 2629,\n",
891
+ " 'token_str': 'here',\n",
892
+ " 'sequence': 'it seems important to tackle the climate here.'}]"
893
+ ]
894
+ },
895
+ "execution_count": 28,
896
+ "metadata": {},
897
+ "output_type": "execute_result"
898
+ }
899
+ ],
900
+ "source": [
901
+ "fill(f'It seems important to tackle the climate {fill.tokenizer.mask_token}.')"
902
+ ]
903
+ },
904
+ {
905
+ "cell_type": "code",
906
+ "execution_count": null,
907
+ "id": "94e3e623",
908
+ "metadata": {},
909
+ "outputs": [],
910
+ "source": []
911
+ }
912
+ ],
913
+ "metadata": {
914
+ "kernelspec": {
915
+ "display_name": "Python 3.10.8 ('venv': venv)",
916
+ "language": "python",
917
+ "name": "python3"
918
+ },
919
+ "language_info": {
920
+ "codemirror_mode": {
921
+ "name": "ipython",
922
+ "version": 3
923
+ },
924
+ "file_extension": ".py",
925
+ "mimetype": "text/x-python",
926
+ "name": "python",
927
+ "nbconvert_exporter": "python",
928
+ "pygments_lexer": "ipython3",
929
+ "version": "3.10.16"
930
+ },
931
+ "toc": {
932
+ "base_numbering": 1,
933
+ "nav_menu": {},
934
+ "number_sections": true,
935
+ "sideBar": true,
936
+ "skip_h1_title": false,
937
+ "title_cell": "Table of Contents",
938
+ "title_sidebar": "Contents",
939
+ "toc_cell": false,
940
+ "toc_position": {},
941
+ "toc_section_display": true,
942
+ "toc_window_display": false
943
+ },
944
+ "varInspector": {
945
+ "cols": {
946
+ "lenName": 16,
947
+ "lenType": 16,
948
+ "lenVar": 40
949
+ },
950
+ "kernels_config": {
951
+ "python": {
952
+ "delete_cmd_postfix": "",
953
+ "delete_cmd_prefix": "del ",
954
+ "library": "var_list.py",
955
+ "varRefreshCmd": "print(var_dic_list())"
956
+ },
957
+ "r": {
958
+ "delete_cmd_postfix": ") ",
959
+ "delete_cmd_prefix": "rm(",
960
+ "library": "var_list.r",
961
+ "varRefreshCmd": "cat(var_dic_list()) "
962
+ }
963
+ },
964
+ "types_to_exclude": [
965
+ "module",
966
+ "function",
967
+ "builtin_function_or_method",
968
+ "instance",
969
+ "_Feature"
970
+ ],
971
+ "window_display": false
972
+ },
973
+ "vscode": {
974
+ "interpreter": {
975
+ "hash": "85bf9c14e9ba73b783ed1274d522bec79eb0b2b739090180d8ce17bb11aff4aa"
976
+ }
977
+ }
978
+ },
979
+ "nbformat": 4,
980
+ "nbformat_minor": 5
981
+ }
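
For quick checks after training, the artefacts saved by this notebook can be reloaded outside Jupyter. The following is a minimal sketch, assuming `distilbert.model` and the `distilbert_tokenizer/` directory were produced by the cells above.

```python
# Minimal sketch: reload the trained MLM and query it, mirroring the last cells of distilbert.ipynb.
# Assumes distilbert.model (saved via torch.save(model, ...)) and distilbert_tokenizer/ exist locally.
import torch
from transformers import pipeline

model = torch.load("distilbert.model")
fill = pipeline("fill-mask", model=model, tokenizer="distilbert_tokenizer")

# prints the top predictions for the masked word
print(fill(f"It seems important to tackle the climate {fill.tokenizer.mask_token}."))
```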
distilbert.py ADDED
@@ -0,0 +1,175 @@
+ import torch
+
+ class Dataset(torch.utils.data.Dataset):
+     """
+     This class loads and preprocesses the given text data
+     """
+     def __init__(self, paths, tokenizer):
+         """
+         This function initialises the object. It takes the given paths and tokeniser.
+         """
+         # the last file might not have 10000 samples, which makes it difficult to get the total length of the ds
+         self.paths = paths[:len(paths)-1]
+         self.tokenizer = tokenizer
+         self.data = self.read_file(self.paths[0])
+         self.current_file = 1
+         self.remaining = len(self.data)
+         self.encodings = self.get_encodings(self.data)
+
+     def __len__(self):
+         """
+         returns the length of the ds
+         """
+         return 10000*len(self.paths)
+
+     def read_file(self, path):
+         """
+         reads a given file
+         """
+         with open(path, 'r', encoding='utf-8') as f:
+             lines = f.read().split('\n')
+         return lines
+
+     def get_encodings(self, lines_all):
+         """
+         Creates encodings for a given text input
+         """
+         # tokenise all text
+         batch = self.tokenizer(lines_all, max_length=512, padding='max_length', truncation=True)
+
+         # Ground Truth
+         labels = torch.tensor(batch['input_ids'])
+         # Attention Masks
+         mask = torch.tensor(batch['attention_mask'])
+
+         # Input to be masked
+         input_ids = labels.detach().clone()
+         rand = torch.rand(input_ids.shape)
+
+         # with a probability of 15%, mask a given word, leave out CLS, SEP and PAD
+         mask_arr = (rand < .15) * (input_ids != 0) * (input_ids != 2) * (input_ids != 3)
+         # assign token 4 (=MASK)
+         input_ids[mask_arr] = 4
+
+         return {'input_ids':input_ids, 'attention_mask':mask, 'labels':labels}
+
+     def __getitem__(self, i):
+         """
+         returns item i
+         Note: do not use shuffling for this dataset
+         """
+         # if we have looked at all items in the file - take next
+         if self.remaining == 0:
+             self.data = self.read_file(self.paths[self.current_file])
+             self.current_file += 1
+             self.remaining = len(self.data)
+             self.encodings = self.get_encodings(self.data)
+
+         # if we are at the end of the dataset, start over again
+         if self.current_file == len(self.paths):
+             self.current_file = 0
+
+         self.remaining -= 1
+         return {key: tensor[i%10000] for key, tensor in self.encodings.items()}
+
+ def test_model(model, optim, test_ds_loader, device):
+     """
+     This function tests that the parameters of the model that are not frozen change during a training step, that frozen parameters do not change,
+     and that no parameter becomes NaN or Inf
+     :param model: model to be tested
+     :param optim: optimiser used for training
+     :param test_ds_loader: dataset to perform the forward pass on
+     :param device: current device
+     :raises Exception: if any of the above conditions are not met
+     """
+     ## Check if non-frozen parameters changed and frozen ones did not
+
+     # get initial parameters to check against
+     params = [ np for np in model.named_parameters() if np[1].requires_grad ]
+     initial_params = [ (name, p.clone()) for (name, p) in params ]
+
+     params_frozen = [ np for np in model.named_parameters() if not np[1].requires_grad ]
+     initial_params_frozen = [ (name, p.clone()) for (name, p) in params_frozen ]
+
+     optim.zero_grad()
+
+     # get data
+     batch = next(iter(test_ds_loader))
+
+     input_ids = batch['input_ids'].to(device)
+     attention_mask = batch['attention_mask'].to(device)
+     labels = batch['labels'].to(device)
+
+     # forward pass and backpropagation
+     outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+     loss = outputs.loss
+     loss.backward()
+     optim.step()
+
+     # check if variables have changed
+     for (_, p0), (name, p1) in zip(initial_params, params):
+         # check different than initial
+         try:
+             assert not torch.equal(p0.to(device), p1.to(device))
+         except AssertionError:
+             raise Exception(
+                 "{var_name} {msg}".format(
+                     var_name=name,
+                     msg='did not change!'
+                 )
+             )
+         # check not NaN
+         try:
+             assert not torch.isnan(p1).byte().any()
+         except AssertionError:
+             raise Exception(
+                 "{var_name} {msg}".format(
+                     var_name=name,
+                     msg='is NaN!'
+                 )
+             )
+         # check finite
+         try:
+             assert torch.isfinite(p1).byte().all()
+         except AssertionError:
+             raise Exception(
+                 "{var_name} {msg}".format(
+                     var_name=name,
+                     msg='is Inf!'
+                 )
+             )
+
+     # check that frozen weights have not changed
+     for (_, p0), (name, p1) in zip(initial_params_frozen, params_frozen):
+         # should be the same
+         try:
+             assert torch.equal(p0.to(device), p1.to(device))
+         except AssertionError:
+             raise Exception(
+                 "{var_name} {msg}".format(
+                     var_name=name,
+                     msg='changed!'
+                 )
+             )
+         # check not NaN
+         try:
+             assert not torch.isnan(p1).byte().any()
+         except AssertionError:
+             raise Exception(
+                 "{var_name} {msg}".format(
+                     var_name=name,
+                     msg='is NaN!'
+                 )
+             )
+
+         # check finite numbers
+         try:
+             assert torch.isfinite(p1).byte().all()
+         except AssertionError:
+             raise Exception(
+                 "{var_name} {msg}".format(
+                     var_name=name,
+                     msg='is Inf!'
+                 )
+             )
+     print("Passed")
load_data.ipynb ADDED
@@ -0,0 +1,1209 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "12d87b30",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Load Data\n",
9
+ "This notebook loads and preproceses all necessary data, namely the following.\n",
10
+ "* OpenWebTextCorpus: for base DistilBERT model\n",
11
+ "* SQuAD datasrt: for Q&A\n",
12
+ "* Natural Questions (needs to be downloaded externally but is preprocessed here): for Q&A\n",
13
+ "* HotPotQA: for Q&A"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 4,
19
+ "id": "7c82d7fa",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "from tqdm.auto import tqdm\n",
24
+ "from datasets import load_dataset\n",
25
+ "import os\n",
26
+ "import pandas as pd\n",
27
+ "import random"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "markdown",
32
+ "id": "1737f219",
33
+ "metadata": {},
34
+ "source": [
35
+ "## Distilbert Data\n",
36
+ "In the following, we download the english openwebtext dataset from huggingface (https://huggingface.co/datasets/openwebtext). The dataset is provided by Aaron Gokaslan and Vanya Cohen from Brown University (https://skylion007.github.io/OpenWebTextCorpus/).\n",
37
+ "\n",
38
+ "We first load the data, investigate the structure and write the dataset into files of each 10 000 texts."
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "cce7623c",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "ds = load_dataset(\"openwebtext\")"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 4,
54
+ "id": "678a5e86",
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "data": {
59
+ "text/plain": [
60
+ "DatasetDict({\n",
61
+ " train: Dataset({\n",
62
+ " features: ['text'],\n",
63
+ " num_rows: 8013769\n",
64
+ " })\n",
65
+ "})"
66
+ ]
67
+ },
68
+ "execution_count": 4,
69
+ "metadata": {},
70
+ "output_type": "execute_result"
71
+ }
72
+ ],
73
+ "source": [
74
+ "# we have a text-only training dataset with 8 million entries\n",
75
+ "ds"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 5,
81
+ "id": "b141bce7",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "# create necessary folders\n",
86
+ "os.mkdir('data')\n",
87
+ "os.mkdir('data/original')"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": null,
93
+ "id": "ca94f995",
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "# save text in chunks of 10000 samples\n",
98
+ "text = []\n",
99
+ "i = 0\n",
100
+ "\n",
101
+ "for sample in tqdm(ds['train']):\n",
102
+ " # replace all newlines\n",
103
+ " sample = sample['text'].replace('\\n','')\n",
104
+ " \n",
105
+ " # append cleaned sample to all texts\n",
106
+ " text.append(sample)\n",
107
+ " \n",
108
+ " # if we processed 10000 samples, write them to a file and start over\n",
109
+ " if len(text) == 10000:\n",
110
+ " with open(f\"data/original/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
111
+ " f.write('\\n'.join(text))\n",
112
+ " text = []\n",
113
+ " i += 1 \n",
114
+ "\n",
115
+ "# write remaining samples to a file\n",
116
+ "with open(f\"data/original/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
117
+ " f.write('\\n'.join(text))"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "markdown",
122
+ "id": "f131dcfc",
123
+ "metadata": {},
124
+ "source": [
125
+ "### Testing\n",
126
+ "If we load the first file, we should get a file that is 10000 lines long and has one column\n",
127
+ "\n",
128
+ "As we do not preprocess the data in any way, but just write the read text into the file, this is all testing necessary"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": 13,
134
+ "id": "df50af74",
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "with open(\"data/original/text_0.txt\", 'r', encoding='utf-8') as f:\n",
139
+ " lines = f.read().split('\\n')\n",
140
+ "lines = pd.DataFrame(lines)"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 14,
146
+ "id": "8ddb0085",
147
+ "metadata": {},
148
+ "outputs": [
149
+ {
150
+ "name": "stdout",
151
+ "output_type": "stream",
152
+ "text": [
153
+ "Passed\n"
154
+ ]
155
+ }
156
+ ],
157
+ "source": [
158
+ "assert lines.shape==(10000,1)\n",
159
+ "print(\"Passed\")"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "markdown",
164
+ "id": "1a65b268",
165
+ "metadata": {},
166
+ "source": [
167
+ "## SQuAD Data\n",
168
+ "In the following, we download the SQuAD dataset from huggingface (https://huggingface.co/datasets/squad). It was initially provided by Rajpurkar et al. from Stanford University.\n",
169
+ "\n",
170
+ "We again load the dataset and store it in chunks of 1000 into files."
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 6,
176
+ "id": "6750ce6e",
177
+ "metadata": {},
178
+ "outputs": [
179
+ {
180
+ "ename": "AssertionError",
181
+ "evalue": "",
182
+ "output_type": "error",
183
+ "traceback": [
184
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
185
+ "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
186
+ "Cell \u001b[0;32mIn [6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m load_dataset(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msquad\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
187
+ "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:1670\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)\u001b[0m\n\u001b[1;32m 1667\u001b[0m ignore_verifications \u001b[38;5;241m=\u001b[39m ignore_verifications \u001b[38;5;129;01mor\u001b[39;00m save_infos\n\u001b[1;32m 1669\u001b[0m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[0;32m-> 1670\u001b[0m builder_instance \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1671\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1672\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1673\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1674\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1675\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1676\u001b[0m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1677\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1678\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1679\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1680\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1681\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1682\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1684\u001b[0m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n",
188
+ "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:1447\u001b[0m, in \u001b[0;36mload_dataset_builder\u001b[0;34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)\u001b[0m\n\u001b[1;32m 1445\u001b[0m download_config \u001b[38;5;241m=\u001b[39m download_config\u001b[38;5;241m.\u001b[39mcopy() \u001b[38;5;28;01mif\u001b[39;00m download_config \u001b[38;5;28;01melse\u001b[39;00m DownloadConfig()\n\u001b[1;32m 1446\u001b[0m download_config\u001b[38;5;241m.\u001b[39muse_auth_token \u001b[38;5;241m=\u001b[39m use_auth_token\n\u001b[0;32m-> 1447\u001b[0m dataset_module \u001b[38;5;241m=\u001b[39m \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1448\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1449\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1450\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1451\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1452\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1453\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1454\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1456\u001b[0m \u001b[38;5;66;03m# Get dataset builder class from the processing script\u001b[39;00m\n\u001b[1;32m 1457\u001b[0m builder_cls \u001b[38;5;241m=\u001b[39m import_main_class(dataset_module\u001b[38;5;241m.\u001b[39mmodule_path)\n",
189
+ "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:1172\u001b[0m, in \u001b[0;36mdataset_module_factory\u001b[0;34m(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)\u001b[0m\n\u001b[1;32m 1167\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e1, \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m):\n\u001b[1;32m 1168\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[1;32m 1169\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find a dataset script at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(combined_path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m or any data file in the same directory. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1170\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m on the Hugging Face Hub either: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(e1)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me1\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1171\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[0;32m-> 1172\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e1 \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 1173\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1174\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[1;32m 1175\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find a dataset script at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(combined_path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m or any data file in the same directory.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1176\u001b[0m )\n",
190
+ "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:1151\u001b[0m, in \u001b[0;36mdataset_module_factory\u001b[0;34m(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)\u001b[0m\n\u001b[1;32m 1143\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m HubDatasetModuleFactoryWithScript(\n\u001b[1;32m 1144\u001b[0m path,\n\u001b[1;32m 1145\u001b[0m revision\u001b[38;5;241m=\u001b[39mrevision,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1148\u001b[0m dynamic_modules_path\u001b[38;5;241m=\u001b[39mdynamic_modules_path,\n\u001b[1;32m 1149\u001b[0m )\u001b[38;5;241m.\u001b[39mget_module()\n\u001b[1;32m 1150\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mHubDatasetModuleFactoryWithoutScript\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1152\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1153\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1154\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1156\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1157\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1158\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mget_module()\n\u001b[1;32m 1159\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e1: \u001b[38;5;66;03m# noqa: all the attempts failed, before raising the error we should check if the module is already cached.\u001b[39;00m\n\u001b[1;32m 1160\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
191
+ "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:744\u001b[0m, in \u001b[0;36mHubDatasetModuleFactoryWithoutScript.__init__\u001b[0;34m(self, name, revision, data_dir, data_files, download_config, download_mode)\u001b[0m\n\u001b[1;32m 742\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload_config \u001b[38;5;241m=\u001b[39m download_config \u001b[38;5;129;01mor\u001b[39;00m DownloadConfig()\n\u001b[1;32m 743\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload_mode \u001b[38;5;241m=\u001b[39m download_mode\n\u001b[0;32m--> 744\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname\u001b[38;5;241m.\u001b[39mcount(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 745\u001b[0m increase_load_count(name, resource_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdataset\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
192
+ "\u001b[0;31mAssertionError\u001b[0m: "
193
+ ]
194
+ }
195
+ ],
196
+ "source": [
197
+ "dataset = load_dataset(\"squad\")"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": null,
203
+ "id": "65a7ee23",
204
+ "metadata": {},
205
+ "outputs": [
206
+ {
207
+ "ename": "",
208
+ "evalue": "",
209
+ "output_type": "error",
210
+ "traceback": [
211
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
212
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
213
+ ]
214
+ },
215
+ {
216
+ "ename": "",
217
+ "evalue": "",
218
+ "output_type": "error",
219
+ "traceback": [
220
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
221
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
222
+ ]
223
+ }
224
+ ],
225
+ "source": [
226
+ "os.mkdir(\"data/training_squad\")\n",
227
+ "os.mkdir(\"data/test_squad\")"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": null,
233
+ "id": "f6ebf63e",
234
+ "metadata": {},
235
+ "outputs": [
236
+ {
237
+ "ename": "",
238
+ "evalue": "",
239
+ "output_type": "error",
240
+ "traceback": [
241
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
242
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
243
+ ]
244
+ },
245
+ {
246
+ "ename": "",
247
+ "evalue": "",
248
+ "output_type": "error",
249
+ "traceback": [
250
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
251
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
252
+ ]
253
+ }
254
+ ],
255
+ "source": [
256
+ "# we already have a training and test split. Each sample has an id, title, context, question and answers.\n",
257
+ "dataset"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "id": "f67ae448",
264
+ "metadata": {},
265
+ "outputs": [
266
+ {
267
+ "ename": "",
268
+ "evalue": "",
269
+ "output_type": "error",
270
+ "traceback": [
271
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
272
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
273
+ ]
274
+ },
275
+ {
276
+ "ename": "",
277
+ "evalue": "",
278
+ "output_type": "error",
279
+ "traceback": [
280
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
281
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
282
+ ]
283
+ }
284
+ ],
285
+ "source": [
286
+ "# answers are provided like that - we need to extract answer_end for the model\n",
287
+ "dataset['train']['answers'][0]"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "id": "101cd650",
294
+ "metadata": {},
295
+ "outputs": [
296
+ {
297
+ "ename": "",
298
+ "evalue": "",
299
+ "output_type": "error",
300
+ "traceback": [
301
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
302
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
303
+ ]
304
+ },
305
+ {
306
+ "ename": "",
307
+ "evalue": "",
308
+ "output_type": "error",
309
+ "traceback": [
310
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
311
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
312
+ ]
313
+ }
314
+ ],
315
+ "source": [
316
+ "# column contains the split (either train or validation), save_dir is the directory\n",
317
+ "def save_samples(column, save_dir):\n",
318
+ " text = []\n",
319
+ " i = 0\n",
320
+ "\n",
321
+ " for sample in tqdm(dataset[column]):\n",
322
+ " \n",
323
+ " # preprocess the context and question by removing the newlines\n",
324
+ " context = sample['context'].replace('\\n','')\n",
325
+ " question = sample['question'].replace('\\n','')\n",
326
+ "\n",
327
+ " # get the answer as text and start character index\n",
328
+ " answer_text = sample['answers']['text'][0]\n",
329
+ " answer_start = str(sample['answers']['answer_start'][0])\n",
330
+ " \n",
331
+ " text.append([context, question, answer_text, answer_start])\n",
332
+ "\n",
333
+ " # we choose chunks of 1000\n",
334
+ " if len(text) == 1000:\n",
335
+ " with open(f\"data/{save_dir}/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
336
+ " f.write(\"\\n\".join([\"\\t\".join(t) for t in text]))\n",
337
+ " text = []\n",
338
+ " i += 1\n",
339
+ "\n",
340
+ " # save remaining\n",
341
+ " with open(f\"data/{save_dir}/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
342
+ " f.write(\"\\n\".join([\"\\t\".join(t) for t in text]))\n",
343
+ "\n",
344
+ "save_samples(\"train\", \"training_squad\")\n",
345
+ "save_samples(\"validation\", \"test_squad\")\n",
346
+ " "
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "markdown",
351
+ "id": "67044d13",
352
+ "metadata": {
353
+ "collapsed": false,
354
+ "jupyter": {
355
+ "outputs_hidden": false
356
+ }
357
+ },
358
+ "source": [
359
+ "### Testing\n",
360
+ "If we load a file, we should get a file with 10000 lines and 4 columns\n",
361
+ "\n",
362
+ "Also, we want to assure the correct interval. Hence, the second test."
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": null,
368
+ "id": "446281cf",
369
+ "metadata": {},
370
+ "outputs": [
371
+ {
372
+ "ename": "",
373
+ "evalue": "",
374
+ "output_type": "error",
375
+ "traceback": [
376
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
377
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
378
+ ]
379
+ },
380
+ {
381
+ "ename": "",
382
+ "evalue": "",
383
+ "output_type": "error",
384
+ "traceback": [
385
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
386
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
387
+ ]
388
+ }
389
+ ],
390
+ "source": [
391
+ "with open(\"data/training_squad/text_0.txt\", 'r', encoding='utf-8') as f:\n",
392
+ " lines = f.read().split('\\n')\n",
393
+ " \n",
394
+ "lines = pd.DataFrame([line.split(\"\\t\") for line in lines], columns=[\"context\", \"question\", \"answer\", \"answer_start\"])"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "id": "ccd5c650",
401
+ "metadata": {},
402
+ "outputs": [
403
+ {
404
+ "ename": "",
405
+ "evalue": "",
406
+ "output_type": "error",
407
+ "traceback": [
408
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
409
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
410
+ ]
411
+ },
412
+ {
413
+ "ename": "",
414
+ "evalue": "",
415
+ "output_type": "error",
416
+ "traceback": [
417
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
418
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
419
+ ]
420
+ }
421
+ ],
422
+ "source": [
423
+ "assert lines.shape==(1000,4)\n",
424
+ "print(\"Passed\")"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": null,
430
+ "id": "2c9e4b70",
431
+ "metadata": {},
432
+ "outputs": [
433
+ {
434
+ "ename": "",
435
+ "evalue": "",
436
+ "output_type": "error",
437
+ "traceback": [
438
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
439
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
440
+ ]
441
+ },
442
+ {
443
+ "ename": "",
444
+ "evalue": "",
445
+ "output_type": "error",
446
+ "traceback": [
447
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
448
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
449
+ ]
450
+ }
451
+ ],
452
+ "source": [
453
+ "# we assert that we have the right interval\n",
454
+ "for ind, line in lines.iterrows():\n",
455
+ " sample = line\n",
456
+ " answer_start = int(sample['answer_start'])\n",
457
+ " assert sample['context'][answer_start:answer_start+len(sample['answer'])] == sample['answer']\n",
458
+ "print(\"Passed\")"
459
+ ]
460
+ },
461
+ {
462
+ "cell_type": "markdown",
463
+ "id": "02265ace",
464
+ "metadata": {},
465
+ "source": [
466
+ "## Natural Questions Dataset\n",
467
+ "* Download from https://ai.google.com/research/NaturalQuestions via gsutil (the one from huggingface has 134.92GB, the one from google cloud is in archives)\n",
468
+ "* Use gunzip to get some samples - we then get `.jsonl`files\n",
469
+ "* The dataset is a lot more messy, as it is just wikipedia articles with all web artifacts\n",
470
+ " * I cleaned the html tags\n",
471
+ " * Also I chose a random interval (containing the answer) from the dataset\n",
472
+ " * We can't send the whole text into the model anyways"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": null,
478
+ "id": "f3bce0c1",
479
+ "metadata": {},
480
+ "outputs": [
481
+ {
482
+ "ename": "",
483
+ "evalue": "",
484
+ "output_type": "error",
485
+ "traceback": [
486
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
487
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
488
+ ]
489
+ },
490
+ {
491
+ "ename": "",
492
+ "evalue": "",
493
+ "output_type": "error",
494
+ "traceback": [
495
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
496
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
497
+ ]
498
+ }
499
+ ],
500
+ "source": [
501
+ "from pathlib import Path\n",
502
+ "paths = [str(x) for x in Path('data/natural_questions/v1.0/train/').glob('**/*.jsonl')]"
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": null,
508
+ "id": "e9c58c00",
509
+ "metadata": {},
510
+ "outputs": [
511
+ {
512
+ "ename": "",
513
+ "evalue": "",
514
+ "output_type": "error",
515
+ "traceback": [
516
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
517
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
518
+ ]
519
+ },
520
+ {
521
+ "ename": "",
522
+ "evalue": "",
523
+ "output_type": "error",
524
+ "traceback": [
525
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
526
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
527
+ ]
528
+ }
529
+ ],
530
+ "source": [
531
+ "os.mkdir(\"data/natural_questions_train\")"
532
+ ]
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "execution_count": null,
537
+ "id": "0ed7ba6c",
538
+ "metadata": {},
539
+ "outputs": [
540
+ {
541
+ "ename": "",
542
+ "evalue": "",
543
+ "output_type": "error",
544
+ "traceback": [
545
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
546
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
547
+ ]
548
+ },
549
+ {
550
+ "ename": "",
551
+ "evalue": "",
552
+ "output_type": "error",
553
+ "traceback": [
554
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
555
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
556
+ ]
557
+ }
558
+ ],
559
+ "source": [
560
+ "import re\n",
561
+ "\n",
562
+ "# clean html tags\n",
563
+ "CLEANR = re.compile('<.+?>')\n",
564
+ "# clean multiple spaces\n",
565
+ "CLEANMULTSPACE = re.compile('(\\s)+')\n",
566
+ "\n",
567
+ "# the function takes an html documents and removes artifacts\n",
568
+ "def cleanhtml(raw_html):\n",
569
+ " # tags\n",
570
+ " cleantext = re.sub(CLEANR, '', raw_html)\n",
571
+ " # newlines\n",
572
+ " cleantext = cleantext.replace(\"\\n\", '')\n",
573
+ " # tabs\n",
574
+ " cleantext = cleantext.replace(\"\\t\", '')\n",
575
+ " # character encodings\n",
576
+ " cleantext = cleantext.replace(\"&#39;\", \"'\")\n",
577
+ " cleantext = cleantext.replace(\"&amp;\", \"'\")\n",
578
+ " cleantext = cleantext.replace(\"&quot;\", '\"')\n",
579
+ " # multiple spaces\n",
580
+ " cleantext = re.sub(CLEANMULTSPACE, ' ', cleantext)\n",
581
+ " # documents end with this tags, if it is present in the string, cut it off\n",
582
+ " idx = cleantext.find(\"<!-- NewPP limit\")\n",
583
+ " if idx > -1:\n",
584
+ " cleantext = cleantext[:idx]\n",
585
+ " return cleantext.strip()"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": null,
591
+ "id": "66ca19ac",
592
+ "metadata": {},
593
+ "outputs": [
594
+ {
595
+ "ename": "",
596
+ "evalue": "",
597
+ "output_type": "error",
598
+ "traceback": [
599
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
600
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
601
+ ]
602
+ },
603
+ {
604
+ "ename": "",
605
+ "evalue": "",
606
+ "output_type": "error",
607
+ "traceback": [
608
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
609
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
610
+ ]
611
+ }
612
+ ],
613
+ "source": [
614
+ "import json\n",
615
+ "\n",
616
+ "# file count\n",
617
+ "i = 0\n",
618
+ "data = []\n",
619
+ "\n",
620
+ "# iterate over all json files\n",
621
+ "for path in paths:\n",
622
+ " print(path)\n",
623
+ " # read file and store as list (this requires much memory, as the files are huge)\n",
624
+ " with open(path, 'r') as json_file:\n",
625
+ " json_list = list(json_file)\n",
626
+ " \n",
627
+ " # process every context, question, answer pair\n",
628
+ " for json_str in json_list:\n",
629
+ " result = json.loads(json_str)\n",
630
+ "\n",
631
+ " # append a question mark - SQuAD questions end with a qm too\n",
632
+ " question = result['question_text'] + \"?\"\n",
633
+ " \n",
634
+ " # some question do not contain an answer - we do not need them\n",
635
+ " if(len(result['annotations'][0]['short_answers'])==0):\n",
636
+ " continue\n",
637
+ "\n",
638
+ " # get true start/end byte\n",
639
+ " true_start = result['annotations'][0]['short_answers'][0]['start_byte']\n",
640
+ " true_end = result['annotations'][0]['short_answers'][0]['end_byte']\n",
641
+ "\n",
642
+ " # convert to bytes\n",
643
+ " byte_encoding = bytes(result['document_html'], encoding='utf-8')\n",
644
+ " \n",
645
+ " # the document is the whole wikipedia article, we randomly choose an appropriate part (containing the\n",
646
+ " # answer): we have 512 tokens as the input for the model - 4000 bytes lead to a good length\n",
647
+ " max_back = 3500 if true_start >= 3500 else true_start\n",
648
+ " first = random.randint(int(true_start)-max_back, int(true_start))\n",
649
+ " end = first + 3500 + true_end - true_start\n",
650
+ " \n",
651
+ " # get chosen context\n",
652
+ " cleanbytes = byte_encoding[first:end]\n",
653
+ " # decode back to text - if our end byte is the middle of a word, we ignore it and cut it off\n",
654
+ " cleantext = bytes.decode(cleanbytes, errors='ignore')\n",
655
+ " # clean html tags\n",
656
+ " cleantext = cleanhtml(cleantext)\n",
657
+ "\n",
658
+ " # find the true answer\n",
659
+ " answer_start = cleanbytes.find(byte_encoding[true_start:true_end])\n",
660
+ " true_answer = bytes.decode(cleanbytes[answer_start:answer_start+(true_end-true_start)])\n",
661
+ " \n",
662
+ " # clean html tags\n",
663
+ " true_answer = cleanhtml(true_answer)\n",
664
+ " \n",
665
+ " start_ind = cleantext.find(true_answer)\n",
666
+ " \n",
667
+ " # If cleaning the string makes the answer not findable skip it\n",
668
+ " # this hardly ever happens, except if there is an emense amount of web artifacts\n",
669
+ " if start_ind == -1:\n",
670
+ " continue\n",
671
+ " \n",
672
+ " data.append([cleantext, question, true_answer, str(start_ind)])\n",
673
+ "\n",
674
+ " if len(data) == 1000:\n",
675
+ " with open(f\"data/natural_questions_train/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
676
+ " f.write(\"\\n\".join([\"\\t\".join(t) for t in data]))\n",
677
+ " i += 1\n",
678
+ " data = []\n",
679
+ "with open(f\"data/natural_questions_train/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
680
+ " f.write(\"\\n\".join([\"\\t\".join(t) for t in data]))"
681
+ ]
682
+ },
683
+ {
684
+ "cell_type": "markdown",
685
+ "id": "30f26b4e",
686
+ "metadata": {},
687
+ "source": [
688
+ "### Testing\n",
689
+ "In the following, we first check if the shape of the file is correct.\n",
690
+ "\n",
691
+ "Then we iterate over the file and check if the answers according to the file are the same as in the original file."
692
+ ]
693
+ },
694
+ {
695
+ "cell_type": "code",
696
+ "execution_count": null,
697
+ "id": "490ac0db",
698
+ "metadata": {},
699
+ "outputs": [
700
+ {
701
+ "ename": "",
702
+ "evalue": "",
703
+ "output_type": "error",
704
+ "traceback": [
705
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
706
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
707
+ ]
708
+ },
709
+ {
710
+ "ename": "",
711
+ "evalue": "",
712
+ "output_type": "error",
713
+ "traceback": [
714
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
715
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
716
+ ]
717
+ }
718
+ ],
719
+ "source": [
720
+ "with open(\"data/natural_questions_train/text_0.txt\", 'r', encoding='utf-8') as f:\n",
721
+ " lines = f.read().split('\\n')\n",
722
+ " \n",
723
+ "lines = pd.DataFrame([line.split(\"\\t\") for line in lines], columns=[\"context\", \"question\", \"answer\", \"answer_start\"])"
724
+ ]
725
+ },
726
+ {
727
+ "cell_type": "code",
728
+ "execution_count": null,
729
+ "id": "0d7cc3ee",
730
+ "metadata": {},
731
+ "outputs": [
732
+ {
733
+ "ename": "",
734
+ "evalue": "",
735
+ "output_type": "error",
736
+ "traceback": [
737
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
738
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
739
+ ]
740
+ },
741
+ {
742
+ "ename": "",
743
+ "evalue": "",
744
+ "output_type": "error",
745
+ "traceback": [
746
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
747
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
748
+ ]
749
+ }
750
+ ],
751
+ "source": [
752
+ "assert lines.shape == (1000, 4)\n",
753
+ "print(\"Passed\")"
754
+ ]
755
+ },
756
+ {
757
+ "cell_type": "code",
758
+ "execution_count": null,
759
+ "id": "0fd8a854",
760
+ "metadata": {},
761
+ "outputs": [
762
+ {
763
+ "ename": "",
764
+ "evalue": "",
765
+ "output_type": "error",
766
+ "traceback": [
767
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
768
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
769
+ ]
770
+ },
771
+ {
772
+ "ename": "",
773
+ "evalue": "",
774
+ "output_type": "error",
775
+ "traceback": [
776
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
777
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
778
+ ]
779
+ }
780
+ ],
781
+ "source": [
782
+ "with open(\"data/natural_questions/v1.0/train/nq-train-00.jsonl\", 'r') as json_file:\n",
783
+ " json_list = list(json_file)[:500]\n",
784
+ "del json_file"
785
+ ]
786
+ },
787
+ {
788
+ "cell_type": "code",
789
+ "execution_count": null,
790
+ "id": "170bff30",
791
+ "metadata": {},
792
+ "outputs": [
793
+ {
794
+ "ename": "",
795
+ "evalue": "",
796
+ "output_type": "error",
797
+ "traceback": [
798
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
799
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
800
+ ]
801
+ },
802
+ {
803
+ "ename": "",
804
+ "evalue": "",
805
+ "output_type": "error",
806
+ "traceback": [
807
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
808
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
809
+ ]
810
+ }
811
+ ],
812
+ "source": [
813
+ "lines_index = 0\n",
814
+ "for i in range(len(json_list)):\n",
815
+ " result = json.loads(json_list[i])\n",
816
+ " \n",
817
+ " if(len(result['annotations'][0]['short_answers'])==0):\n",
818
+ " pass\n",
819
+ " else: \n",
820
+ " # assert that the question text is the same\n",
821
+ " assert result['question_text'] + \"?\" == lines.loc[lines_index, 'question']\n",
822
+ " true_start = result['annotations'][0]['short_answers'][0]['start_byte']\n",
823
+ " true_end = result['annotations'][0]['short_answers'][0]['end_byte']\n",
824
+ " true_answer = bytes.decode(bytes(result['document_html'], encoding='utf-8')[true_start:true_end])\n",
825
+ " \n",
826
+ " processed_answer = lines.loc[lines_index, 'answer']\n",
827
+ " # assert that the answer is the same\n",
828
+ " assert cleanhtml(true_answer) == processed_answer\n",
829
+ " \n",
830
+ " start_ind = int(lines.loc[lines_index, 'answer_start'])\n",
831
+ " # assert that the answer (according to the index) is the same\n",
832
+ " assert cleanhtml(true_answer) == lines.loc[lines_index, 'context'][start_ind:start_ind+len(processed_answer)]\n",
833
+ " \n",
834
+ " lines_index += 1\n",
835
+ " \n",
836
+ " if lines_index == len(lines):\n",
837
+ " break\n",
838
+ "print(\"Passed\")"
839
+ ]
840
+ },
841
+ {
842
+ "cell_type": "markdown",
843
+ "id": "78e6e737",
844
+ "metadata": {},
845
+ "source": [
846
+ "## Hotpot QA"
847
+ ]
848
+ },
849
+ {
850
+ "cell_type": "code",
851
+ "execution_count": null,
852
+ "id": "27efcc8c",
853
+ "metadata": {},
854
+ "outputs": [
855
+ {
856
+ "ename": "",
857
+ "evalue": "",
858
+ "output_type": "error",
859
+ "traceback": [
860
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
861
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
862
+ ]
863
+ },
864
+ {
865
+ "ename": "",
866
+ "evalue": "",
867
+ "output_type": "error",
868
+ "traceback": [
869
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
870
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
871
+ ]
872
+ }
873
+ ],
874
+ "source": [
875
+ "ds = load_dataset(\"hotpot_qa\", 'fullwiki')"
876
+ ]
877
+ },
878
+ {
879
+ "cell_type": "code",
880
+ "execution_count": null,
881
+ "id": "1493f21f",
882
+ "metadata": {},
883
+ "outputs": [
884
+ {
885
+ "ename": "",
886
+ "evalue": "",
887
+ "output_type": "error",
888
+ "traceback": [
889
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
890
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
891
+ ]
892
+ },
893
+ {
894
+ "ename": "",
895
+ "evalue": "",
896
+ "output_type": "error",
897
+ "traceback": [
898
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
899
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
900
+ ]
901
+ }
902
+ ],
903
+ "source": [
904
+ "ds"
905
+ ]
906
+ },
907
+ {
908
+ "cell_type": "code",
909
+ "execution_count": null,
910
+ "id": "2a047946",
911
+ "metadata": {},
912
+ "outputs": [
913
+ {
914
+ "ename": "",
915
+ "evalue": "",
916
+ "output_type": "error",
917
+ "traceback": [
918
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
919
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
920
+ ]
921
+ },
922
+ {
923
+ "ename": "",
924
+ "evalue": "",
925
+ "output_type": "error",
926
+ "traceback": [
927
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
928
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
929
+ ]
930
+ }
931
+ ],
932
+ "source": [
933
+ "os.mkdir('data/hotpotqa_training')\n",
934
+ "os.mkdir('data/hotpotqa_test')"
935
+ ]
936
+ },
937
+ {
938
+ "cell_type": "code",
939
+ "execution_count": null,
940
+ "id": "e65b6485",
941
+ "metadata": {},
942
+ "outputs": [
943
+ {
944
+ "ename": "",
945
+ "evalue": "",
946
+ "output_type": "error",
947
+ "traceback": [
948
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
949
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
950
+ ]
951
+ },
952
+ {
953
+ "ename": "",
954
+ "evalue": "",
955
+ "output_type": "error",
956
+ "traceback": [
957
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
958
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
959
+ ]
960
+ }
961
+ ],
962
+ "source": [
963
+ "# column contains the split (either train or validation), save_dir is the directory\n",
964
+ "def save_samples(column, save_dir):\n",
965
+ " text = []\n",
966
+ " i = 0\n",
967
+ "\n",
968
+ " for sample in tqdm(ds[column]):\n",
969
+ " \n",
970
+ " # preprocess the context and question by removing the newlines\n",
971
+ " context = sample['context']['sentences']\n",
972
+ " context = \" \".join([\"\".join(sentence) for sentence in context])\n",
973
+ " question = sample['question'].replace('\\n','')\n",
974
+ " \n",
975
+ " # get the answer as text and start character index\n",
976
+ " answer_text = sample['answer']\n",
977
+ " answer_start = context.find(answer_text)\n",
978
+ " if answer_start == -1:\n",
979
+ " continue\n",
980
+ " \n",
981
+ " \n",
982
+ " \n",
983
+ " if answer_start > 1500:\n",
984
+ " first = random.randint(answer_start-1500, answer_start)\n",
985
+ " end = first + 1500 + len(answer_text)\n",
986
+ " \n",
987
+ " context = context[first:end+1]\n",
988
+ " answer_start = context.find(answer_text)\n",
989
+ " \n",
990
+ " if answer_start == -1:continue\n",
991
+ " \n",
992
+ " text.append([context, question, answer_text, str(answer_start)])\n",
993
+ "\n",
994
+ " # we choose chunks of 1000\n",
995
+ " if len(text) == 1000:\n",
996
+ " with open(f\"data/{save_dir}/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
997
+ " f.write(\"\\n\".join([\"\\t\".join(t) for t in text]))\n",
998
+ " text = []\n",
999
+ " i += 1\n",
1000
+ "\n",
1001
+ " # save remaining\n",
1002
+ " with open(f\"data/{save_dir}/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
1003
+ " f.write(\"\\n\".join([\"\\t\".join(t) for t in text]))\n",
1004
+ "\n",
1005
+ "save_samples(\"train\", \"hotpotqa_training\")\n",
1006
+ "save_samples(\"validation\", \"hotpotqa_test\")"
1007
+ ]
1008
+ },
1009
+ {
1010
+ "cell_type": "markdown",
1011
+ "id": "97cc358f",
1012
+ "metadata": {},
1013
+ "source": [
1014
+ "## Testing"
1015
+ ]
1016
+ },
1017
+ {
1018
+ "cell_type": "code",
1019
+ "execution_count": null,
1020
+ "id": "f321483c",
1021
+ "metadata": {},
1022
+ "outputs": [
1023
+ {
1024
+ "ename": "",
1025
+ "evalue": "",
1026
+ "output_type": "error",
1027
+ "traceback": [
1028
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
1029
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1030
+ ]
1031
+ },
1032
+ {
1033
+ "ename": "",
1034
+ "evalue": "",
1035
+ "output_type": "error",
1036
+ "traceback": [
1037
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
1038
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1039
+ ]
1040
+ }
1041
+ ],
1042
+ "source": [
1043
+ "with open(\"data/hotpotqa_training/text_0.txt\", 'r', encoding='utf-8') as f:\n",
1044
+ " lines = f.read().split('\\n')\n",
1045
+ " \n",
1046
+ "lines = pd.DataFrame([line.split(\"\\t\") for line in lines], columns=[\"context\", \"question\", \"answer\", \"answer_start\"])"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "cell_type": "code",
1051
+ "execution_count": null,
1052
+ "id": "72a96e78",
1053
+ "metadata": {},
1054
+ "outputs": [
1055
+ {
1056
+ "ename": "",
1057
+ "evalue": "",
1058
+ "output_type": "error",
1059
+ "traceback": [
1060
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
1061
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1062
+ ]
1063
+ },
1064
+ {
1065
+ "ename": "",
1066
+ "evalue": "",
1067
+ "output_type": "error",
1068
+ "traceback": [
1069
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
1070
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1071
+ ]
1072
+ }
1073
+ ],
1074
+ "source": [
1075
+ "assert lines.shape == (1000, 4)\n",
1076
+ "print(\"Passed\")"
1077
+ ]
1078
+ },
1079
+ {
1080
+ "cell_type": "code",
1081
+ "execution_count": null,
1082
+ "id": "c32c2f16",
1083
+ "metadata": {},
1084
+ "outputs": [
1085
+ {
1086
+ "ename": "",
1087
+ "evalue": "",
1088
+ "output_type": "error",
1089
+ "traceback": [
1090
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
1091
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1092
+ ]
1093
+ },
1094
+ {
1095
+ "ename": "",
1096
+ "evalue": "",
1097
+ "output_type": "error",
1098
+ "traceback": [
1099
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
1100
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1101
+ ]
1102
+ }
1103
+ ],
1104
+ "source": [
1105
+ "# we assert that we have the right interval\n",
1106
+ "for ind, line in lines.iterrows():\n",
1107
+ " sample = line\n",
1108
+ " answer_start = int(sample['answer_start'])\n",
1109
+ " assert sample['context'][answer_start:answer_start+len(sample['answer'])] == sample['answer']\n",
1110
+ "print(\"Passed\")"
1111
+ ]
1112
+ },
1113
+ {
1114
+ "cell_type": "code",
1115
+ "execution_count": null,
1116
+ "id": "bc36fe7d",
1117
+ "metadata": {},
1118
+ "outputs": [
1119
+ {
1120
+ "ename": "",
1121
+ "evalue": "",
1122
+ "output_type": "error",
1123
+ "traceback": [
1124
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
1125
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1126
+ ]
1127
+ },
1128
+ {
1129
+ "ename": "",
1130
+ "evalue": "",
1131
+ "output_type": "error",
1132
+ "traceback": [
1133
+ "\u001b[1;31mnotebook controller is DISPOSED. \n",
1134
+ "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1135
+ ]
1136
+ }
1137
+ ],
1138
+ "source": []
1139
+ }
1140
+ ],
1141
+ "metadata": {
1142
+ "kernelspec": {
1143
+ "display_name": "Python 3 (ipykernel)",
1144
+ "language": "python",
1145
+ "name": "python3"
1146
+ },
1147
+ "language_info": {
1148
+ "codemirror_mode": {
1149
+ "name": "ipython",
1150
+ "version": 3
1151
+ },
1152
+ "file_extension": ".py",
1153
+ "mimetype": "text/x-python",
1154
+ "name": "python",
1155
+ "nbconvert_exporter": "python",
1156
+ "pygments_lexer": "ipython3",
1157
+ "version": "3.10.16"
1158
+ },
1159
+ "toc": {
1160
+ "base_numbering": 1,
1161
+ "nav_menu": {},
1162
+ "number_sections": true,
1163
+ "sideBar": true,
1164
+ "skip_h1_title": false,
1165
+ "title_cell": "Table of Contents",
1166
+ "title_sidebar": "Contents",
1167
+ "toc_cell": false,
1168
+ "toc_position": {},
1169
+ "toc_section_display": true,
1170
+ "toc_window_display": false
1171
+ },
1172
+ "varInspector": {
1173
+ "cols": {
1174
+ "lenName": 16,
1175
+ "lenType": 16,
1176
+ "lenVar": 40
1177
+ },
1178
+ "kernels_config": {
1179
+ "python": {
1180
+ "delete_cmd_postfix": "",
1181
+ "delete_cmd_prefix": "del ",
1182
+ "library": "var_list.py",
1183
+ "varRefreshCmd": "print(var_dic_list())"
1184
+ },
1185
+ "r": {
1186
+ "delete_cmd_postfix": ") ",
1187
+ "delete_cmd_prefix": "rm(",
1188
+ "library": "var_list.r",
1189
+ "varRefreshCmd": "cat(var_dic_list()) "
1190
+ }
1191
+ },
1192
+ "types_to_exclude": [
1193
+ "module",
1194
+ "function",
1195
+ "builtin_function_or_method",
1196
+ "instance",
1197
+ "_Feature"
1198
+ ],
1199
+ "window_display": false
1200
+ },
1201
+ "vscode": {
1202
+ "interpreter": {
1203
+ "hash": "85bf9c14e9ba73b783ed1274d522bec79eb0b2b739090180d8ce17bb11aff4aa"
1204
+ }
1205
+ }
1206
+ },
1207
+ "nbformat": 4,
1208
+ "nbformat_minor": 5
1209
+ }
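For reference, a minimal standalone sketch of the random-window selection used in the Natural Questions preprocessing above (not part of the commit; the function name and the example byte offsets are illustrative). It makes explicit why a window chosen this way always still contains the answer span.

import random

def choose_window(true_start: int, true_end: int, back: int = 3500) -> tuple[int, int]:
    # never step back further than the beginning of the document
    max_back = back if true_start >= back else true_start
    # pick a random window start at or before the answer start
    first = random.randint(true_start - max_back, true_start)
    # extend the window by the answer length so the full span always fits
    end = first + back + (true_end - true_start)
    return first, end

first, end = choose_window(true_start=10_000, true_end=10_020)
assert first <= 10_000 and 10_020 <= end  # the answer bytes are always inside [first, end)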
qa_model.py ADDED
@@ -0,0 +1,532 @@
1
+ from torch import nn
2
+ import torch
3
+ from typing import Optional
4
+ import copy
5
+ import pandas as pd
6
+
7
+ """
8
+ This module contains the implementation of the QA model. We define three different models and a dataset class.
9
+ The structure is based on the Hugging Face implementations.
10
+ https://huggingface.co/docs/transformers/model_doc/distilbert
11
+ """
12
+
13
+ class SimpleQuestionDistilBERT(nn.Module):
14
+ """
15
+ This class implements a simple version of the distilbert question answering model, following the implementation of Hugging Face,
16
+ https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/distilbert/modeling_distilbert.py#L805
17
+
18
+ It basically fine-tunes a given distilbert model. We only add one linear layer on top, which determines the start and end logits.
19
+ """
20
+ def __init__(self, distilbert, dropout=0.1):
21
+ """
22
+ Creates and initialises model
23
+ """
24
+ super(SimpleQuestionDistilBERT, self).__init__()
25
+
26
+ self.distilbert = distilbert
27
+
28
+ self.dropout = nn.Dropout(dropout)
29
+
30
+ self.classifier = nn.Linear(768, 2)
31
+
32
+ # initialise weights
33
+ def init_weights(m):
34
+ if isinstance(m, nn.Linear):
35
+ nn.init.xavier_uniform_(m.weight)
36
+ m.bias.data.fill_(0.01)
37
+ self.classifier.apply(init_weights)
38
+
39
+
40
+ def forward(self,
41
+ input_ids: Optional[torch.Tensor] = None,
42
+ attention_mask: Optional[torch.Tensor] = None,
43
+ head_mask: Optional[torch.Tensor] = None,
44
+ inputs_embeds: Optional[torch.Tensor] = None,
45
+ start_positions: Optional[torch.Tensor] = None,
46
+ end_positions: Optional[torch.Tensor] = None,
47
+ output_attentions: Optional[bool] = None,
48
+ output_hidden_states: Optional[bool] = None,
49
+ return_dict: Optional[bool] = None):
50
+ """
51
+ This function implements the forward pass of the model. It takes the input_ids and attention_mask and returns the start and end logits.
52
+ """
53
+ # make predictions on base model
54
+ distilbert_output = self.distilbert(
55
+ input_ids=input_ids,
56
+ attention_mask=attention_mask,
57
+ inputs_embeds=inputs_embeds,
58
+ output_attentions=output_attentions,
59
+ output_hidden_states=output_hidden_states,
60
+ return_dict=return_dict,
61
+ )
62
+
63
+ # retrieve hidden states
64
+ hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
65
+ hidden_states = self.dropout(hidden_states)
66
+
67
+ # make predictions on head
68
+ logits = self.classifier(hidden_states)
69
+ start_logits, end_logits = logits.split(1, dim=-1)
70
+ start_logits = start_logits.squeeze(-1).contiguous() # (bs, max_query_len)
71
+ end_logits = end_logits.squeeze(-1).contiguous() # (bs, max_query_len)
72
+
73
+ # calculate loss
74
+ total_loss = None
75
+ if start_positions is not None and end_positions is not None:
76
+ if len(start_positions.size()) > 1:
77
+ start_positions = start_positions.squeeze(-1)
78
+ if len(end_positions.size()) > 1:
79
+ end_positions = end_positions.squeeze(-1)
80
+
81
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
82
+ ignored_index = start_logits.size(1)
83
+ start_positions = start_positions.clamp(0, ignored_index)
84
+ end_positions = end_positions.clamp(0, ignored_index)
85
+
86
+ loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
87
+ start_loss = loss_fct(start_logits, start_positions)
88
+ end_loss = loss_fct(end_logits, end_positions)
89
+ total_loss = (start_loss + end_loss) / 2
90
+
91
+ return {"loss": total_loss,
92
+ "start_logits": start_logits,
93
+ "end_logits": end_logits,
94
+ "hidden_states": distilbert_output.hidden_states,
95
+ "attentions": distilbert_output.attentions}
96
+
97
+
98
+ class QuestionDistilBERT(nn.Module):
99
+ """
100
+ This class implements the distilbert question answering model. We fix all layers of the base model and only fine-tune the head.
101
+ The head consists of a transformer encoder with three layers and a classifier on top.
102
+ """
103
+ def __init__(self, distilbert, dropout=0.1):
104
+ """
105
+         Creates and initialises a QuestionDistilBERT instance
106
+ """
107
+ super(QuestionDistilBERT, self).__init__()
108
+
109
+ # fix parameters for base model
110
+ for param in distilbert.parameters():
111
+ param.requires_grad = False
112
+
113
+ self.distilbert = distilbert
114
+ self.relu = nn.ReLU()
115
+
116
+ self.dropout = nn.Dropout(dropout)
117
+ self.te = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=768, nhead=12), num_layers=3)
118
+
119
+ # create custom head
120
+ self.classifier = nn.Sequential(
121
+ nn.Dropout(dropout),
122
+ nn.ReLU(),
123
+ nn.Linear(768, 512),
124
+ nn.Dropout(dropout),
125
+ nn.ReLU(),
126
+ nn.Linear(512, 256),
127
+ nn.Dropout(dropout),
128
+ nn.ReLU(),
129
+ nn.Linear(256, 128),
130
+ nn.Dropout(dropout),
131
+ nn.ReLU(),
132
+ nn.Linear(128, 64),
133
+ nn.Dropout(dropout),
134
+ nn.ReLU(),
135
+ nn.Linear(64, 2)
136
+ )
137
+
138
+ # initialise weights of the linear layers
139
+ def init_weights(m):
140
+ if isinstance(m, nn.Linear):
141
+ nn.init.xavier_uniform_(m.weight)
142
+ m.bias.data.fill_(0.01)
143
+
144
+ self.classifier.apply(init_weights)
145
+
146
+ def forward(self,
147
+ input_ids: Optional[torch.Tensor] = None,
148
+ attention_mask: Optional[torch.Tensor] = None,
149
+ head_mask: Optional[torch.Tensor] = None,
150
+ inputs_embeds: Optional[torch.Tensor] = None,
151
+ start_positions: Optional[torch.Tensor] = None,
152
+ end_positions: Optional[torch.Tensor] = None,
153
+ output_attentions: Optional[bool] = None,
154
+ output_hidden_states: Optional[bool] = None,
155
+ return_dict: Optional[bool] = None):
156
+ """
157
+ This function implements the forward pass of the model. It takes the input_ids and attention_mask and returns the start and end logits.
158
+ """
159
+ # make predictions on base model
160
+ distilbert_output = self.distilbert(
161
+ input_ids=input_ids,
162
+ attention_mask=attention_mask,
163
+ inputs_embeds=inputs_embeds,
164
+ output_attentions=output_attentions,
165
+ output_hidden_states=output_hidden_states,
166
+ return_dict=return_dict,
167
+ )
168
+
169
+ # retrieve hidden states
170
+ hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
171
+ hidden_states = self.dropout(hidden_states)
172
+ attn_output = self.te(hidden_states)
173
+
174
+ # make predictions on head
175
+ logits = self.classifier(attn_output)
176
+ start_logits, end_logits = logits.split(1, dim=-1)
177
+ start_logits = start_logits.squeeze(-1).contiguous()
178
+ end_logits = end_logits.squeeze(-1).contiguous()
179
+
180
+ # calculate loss
181
+ total_loss = None
182
+ if start_positions is not None and end_positions is not None:
183
+ if len(start_positions.size()) > 1:
184
+ start_positions = start_positions.squeeze(-1)
185
+ if len(end_positions.size()) > 1:
186
+ end_positions = end_positions.squeeze(-1)
187
+
188
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
189
+ ignored_index = start_logits.size(1)
190
+ start_positions = start_positions.clamp(0, ignored_index)
191
+ end_positions = end_positions.clamp(0, ignored_index)
192
+
193
+ loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
194
+ start_loss = loss_fct(start_logits, start_positions)
195
+ end_loss = loss_fct(end_logits, end_positions)
196
+ total_loss = (start_loss + end_loss) / 2
197
+
198
+ return {"loss": total_loss,
199
+ "start_logits": start_logits,
200
+ "end_logits": end_logits,
201
+ "hidden_states": distilbert_output.hidden_states,
202
+ "attentions": distilbert_output.attentions}
203
+
204
+
205
+ class ReuseQuestionDistilBERT(nn.Module):
206
+ """
207
+     This class implements a model where all layers of the base distilbert model are fixed.
208
+ Instead of training a completely new head, we copy the last two layers of the base model and add a classifier on top.
209
+ """
210
+ def __init__(self, distilbert, dropout=0.15):
211
+ """
212
+         Creates and initialises a ReuseQuestionDistilBERT instance
213
+ """
214
+ super(ReuseQuestionDistilBERT, self).__init__()
215
+ self.te = copy.deepcopy(list(list(distilbert.children())[1].children())[0][-2:])
216
+ # fix parameters for base model
217
+ for param in distilbert.parameters():
218
+ param.requires_grad = False
219
+
220
+ self.distilbert = distilbert
221
+ self.relu = nn.ReLU()
222
+
223
+ self.dropout = nn.Dropout(dropout)
224
+
225
+ # create custom head
226
+ self.classifier = nn.Linear(768, 2)
227
+
228
+ def init_weights(m):
229
+ if isinstance(m, nn.Linear):
230
+ nn.init.xavier_uniform_(m.weight)
231
+ m.bias.data.fill_(0.01)
232
+ self.classifier.apply(init_weights)
233
+
234
+ def forward(self,
235
+ input_ids: Optional[torch.Tensor] = None,
236
+ attention_mask: Optional[torch.Tensor] = None,
237
+ head_mask: Optional[torch.Tensor] = None,
238
+ inputs_embeds: Optional[torch.Tensor] = None,
239
+ start_positions: Optional[torch.Tensor] = None,
240
+ end_positions: Optional[torch.Tensor] = None,
241
+ output_attentions: Optional[bool] = None,
242
+ output_hidden_states: Optional[bool] = None,
243
+ return_dict: Optional[bool] = None):
244
+ """
245
+ This function implements the forward pass of the model. It takes the input_ids and attention_mask and returns the start and end logits.
246
+ """
247
+ # make predictions on base model
248
+ distilbert_output = self.distilbert(
249
+ input_ids=input_ids,
250
+ attention_mask=attention_mask,
251
+ head_mask=head_mask,
252
+ inputs_embeds=inputs_embeds,
253
+ output_attentions=output_attentions,
254
+ output_hidden_states=output_hidden_states,
255
+ return_dict=return_dict,
256
+ )
257
+
258
+ # retrieve hidden states
259
+ hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
260
+ hidden_states = self.dropout(hidden_states)
261
+ for te in self.te:
262
+ hidden_states = te(
263
+ x=hidden_states,
264
+ attn_mask=attention_mask,
265
+ head_mask=head_mask,
266
+ output_attentions=output_attentions
267
+ )[0]
268
+ hidden_states = self.dropout(hidden_states)
269
+
270
+ # make predictions on head
271
+ logits = self.classifier(hidden_states)
272
+ start_logits, end_logits = logits.split(1, dim=-1)
273
+ start_logits = start_logits.squeeze(-1).contiguous() # (bs, max_query_len)
274
+ end_logits = end_logits.squeeze(-1).contiguous() # (bs, max_query_len)
275
+
276
+ # calculate loss
277
+ total_loss = None
278
+ if start_positions is not None and end_positions is not None:
279
+ if len(start_positions.size()) > 1:
280
+ start_positions = start_positions.squeeze(-1)
281
+ if len(end_positions.size()) > 1:
282
+ end_positions = end_positions.squeeze(-1)
283
+
284
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
285
+ ignored_index = start_logits.size(1)
286
+ start_positions = start_positions.clamp(0, ignored_index)
287
+ end_positions = end_positions.clamp(0, ignored_index)
288
+
289
+ loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
290
+ start_loss = loss_fct(start_logits, start_positions)
291
+ end_loss = loss_fct(end_logits, end_positions)
292
+ total_loss = (start_loss + end_loss) / 2
293
+
294
+ return {"loss": total_loss,
295
+ "start_logits": start_logits,
296
+ "end_logits": end_logits,
297
+ "hidden_states": distilbert_output.hidden_states,
298
+ "attentions": distilbert_output.attentions}
299
+
300
+ class Dataset(torch.utils.data.Dataset):
301
+ """
302
+ This class creates a dataset for the DistilBERT qa-model.
303
+ """
304
+ def __init__(self, squad_paths, natural_question_paths, hotpotqa_paths, tokenizer):
305
+ """
306
+ creates and initialises dataset object
307
+ """
308
+ self.paths = []
309
+ self.count = 0
310
+ if squad_paths != None:
311
+ self.paths.extend(squad_paths[:len(squad_paths)-1])
312
+ if natural_question_paths != None:
313
+ self.paths.extend(natural_question_paths[:len(natural_question_paths)-1])
314
+ if hotpotqa_paths != None:
315
+ self.paths.extend(hotpotqa_paths[:len(hotpotqa_paths)-1])
316
+ self.data = None
317
+ self.current_file = 0
318
+ self.remaining = 0
319
+ self.encodings = None
320
+ # tokenizer for strings
321
+ self.tokenizer = tokenizer
322
+
323
+
324
+ def __len__(self):
325
+ """
326
+ returns the length of the dataset
327
+ """
328
+ return len(self.paths)*1000
329
+
330
+ def read_file(self, path):
331
+ """
332
+ reads the file stored at path
333
+ """
334
+ with open(path, 'r', encoding='utf-8') as f:
335
+ lines = f.read().split('\n')
336
+ return lines
337
+
338
+ def get_encodings(self):
339
+ """
340
+ returns encoded strings for the model
341
+ """
342
+         # strip whitespace from the questions and lowercase the contexts
343
+ questions = [q.strip() for q in self.data["question"]]
344
+ context = [q.lower() for q in self.data["context"]]
345
+
346
+ # tokenises questions and context. If the context is too long, we truncate it.
347
+ inputs = self.tokenizer(
348
+ questions,
349
+ context,
350
+ max_length=512,
351
+ truncation="only_second",
352
+ return_offsets_mapping=True,
353
+ padding="max_length",
354
+ )
355
+
356
+ # tuples of integers giving us the original positions
357
+ offset_mapping = inputs.pop("offset_mapping")
358
+
359
+ answers = self.data["answer"]
360
+ answer_start = self.data["answer_start"]
361
+
362
+ # store beginning and end positions
363
+ start_positions = []
364
+ end_positions = []
365
+
366
+ # iterate through questions
367
+ for i, offset in enumerate(offset_mapping):
368
+
369
+ answer = answers[i]
370
+ start_char = int(answer_start[i])
371
+ end_char = start_char + len(answer)
372
+
373
+ sequence_ids = inputs.sequence_ids(i)
374
+
375
+ # start and end of context based on tokens
376
+ idx = 0
377
+ while sequence_ids[idx] != 1:
378
+ idx += 1
379
+
380
+ context_start = idx
381
+ while sequence_ids[idx] == 1:
382
+ idx += 1
383
+ context_end = idx - 1
384
+
385
+             # If the answer is not fully inside the context, label it with (0, 0)
386
+ if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
387
+ start_positions.append(0)
388
+ end_positions.append(0)
389
+ self.count += 1
390
+ else:
391
+ # go to first offset position that is smaller than start char
392
+ idx = context_start
393
+ while idx <= context_end and offset[idx][0] <= start_char:
394
+ idx += 1
395
+
396
+ start_positions.append(idx - 1)
397
+ idx = context_end
398
+ while idx >= context_start and offset[idx][1] >= end_char:
399
+ idx -= 1
400
+ end_positions.append(idx + 1)
401
+
402
+ # append start and end position to the embeddings
403
+ inputs["start_positions"] = start_positions
404
+ inputs["end_positions"] = end_positions
405
+ # return input_ids, attention mask, start and end positions (GT)
406
+ return {'input_ids': torch.tensor(inputs['input_ids']),
407
+ 'attention_mask': torch.tensor(inputs['attention_mask']),
408
+ 'start_positions': torch.tensor(inputs['start_positions']),
409
+ 'end_positions': torch.tensor(inputs['end_positions'])}
410
+
411
+ def __getitem__(self, i):
412
+ """
413
+ returns encoding of item i
414
+ """
415
+
416
+ # if we have looked at all items in the file - take next
417
+ if self.remaining == 0:
418
+ self.data = self.read_file(self.paths[self.current_file])
419
+ self.data = pd.DataFrame([line.split("\t") for line in self.data],
420
+ columns=["context", "question", "answer", "answer_start"])
421
+ self.current_file += 1
422
+ self.remaining = len(self.data)
423
+ self.encodings = self.get_encodings()
424
+ # if we are at the end of the dataset, start over again
425
+ if self.current_file == len(self.paths):
426
+ self.current_file = 0
427
+ self.remaining -= 1
428
+ return {key: tensor[i%1000] for key, tensor in self.encodings.items()}
429
+
430
+ def test_model(model, optim, test_ds_loader, device):
431
+ """
432
+     This function is used to sanity-check the model: parameters must be neither NaN nor infinite,
433
+     non-frozen parameters have to change after one optimisation step, and frozen ones must not.
434
+ :param model: pytorch model to evaluate
435
+ :param optim: optimizer
436
+ :param test_ds_loader: dataloader object
437
+ :param device: device, the model is on
438
+ :raises Exception if the model doesn't work as expected
439
+ """
440
+ ## Check if non-frozen parameters changed and frozen ones did not
441
+
442
+ # get parameters used for tuning and store initial weight
443
+ params = [np for np in model.named_parameters() if np[1].requires_grad]
444
+ initial_params = [(name, p.clone()) for (name, p) in params]
445
+
446
+ # get frozen parameters and store initial weight
447
+ params_frozen = [np for np in model.named_parameters() if not np[1].requires_grad]
448
+ initial_params_frozen = [(name, p.clone()) for (name, p) in params_frozen]
449
+
450
+ # perform one iteration
451
+ optim.zero_grad()
452
+ batch = next(iter(test_ds_loader))
453
+
454
+ input_ids = batch['input_ids'].to(device)
455
+ attention_mask = batch['attention_mask'].to(device)
456
+ start_positions = batch['start_positions'].to(device)
457
+ end_positions = batch['end_positions'].to(device)
458
+
459
+ # forward pass and backpropagation
460
+ outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions,
461
+ end_positions=end_positions)
462
+ loss = outputs['loss']
463
+ loss.backward()
464
+ optim.step()
465
+
466
+ # check if variables have changed
467
+ for (_, p0), (name, p1) in zip(initial_params, params):
468
+ # check different than initial
469
+ try:
470
+ assert not torch.equal(p0.to(device), p1.to(device))
471
+ except AssertionError:
472
+ raise Exception(
473
+ "{var_name} {msg}".format(
474
+ var_name=name,
475
+ msg='did not change!'
476
+ )
477
+ )
478
+ # check not NaN
479
+ try:
480
+ assert not torch.isnan(p1).byte().any()
481
+ except AssertionError:
482
+ raise Exception(
483
+ "{var_name} {msg}".format(
484
+ var_name=name,
485
+ msg='is NaN!'
486
+ )
487
+ )
488
+ # check finite
489
+ try:
490
+ assert torch.isfinite(p1).byte().all()
491
+ except AssertionError:
492
+ raise Exception(
493
+ "{var_name} {msg}".format(
494
+ var_name=name,
495
+ msg='is Inf!'
496
+ )
497
+ )
498
+
499
+ # check that frozen weights have not changed
500
+ for (_, p0), (name, p1) in zip(initial_params_frozen, params_frozen):
501
+ # should be the same
502
+ try:
503
+ assert torch.equal(p0.to(device), p1.to(device))
504
+ except AssertionError:
505
+ raise Exception(
506
+ "{var_name} {msg}".format(
507
+ var_name=name,
508
+ msg='changed!'
509
+ )
510
+ )
511
+ # check not NaN
512
+ try:
513
+ assert not torch.isnan(p1).byte().any()
514
+ except AssertionError:
515
+ raise Exception(
516
+ "{var_name} {msg}".format(
517
+ var_name=name,
518
+ msg='is NaN!'
519
+ )
520
+ )
521
+
522
+ # check finite numbers
523
+ try:
524
+ assert torch.isfinite(p1).byte().all()
525
+ except AssertionError:
526
+ raise Exception(
527
+ "{var_name} {msg}".format(
528
+ var_name=name,
529
+ msg='is Inf!'
530
+ )
531
+ )
532
+ print("Passed")
question_answering.ipynb ADDED
@@ -0,0 +1,2403 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "19817716",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Question Answering\n",
9
+ "The following notebook contains different question answering models. We will start by introducing a representation for the dataset and corresponding DataLoader and then evaluate different models."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 50,
15
+ "id": "49bf46c6",
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "from transformers import DistilBertModel, DistilBertForMaskedLM, DistilBertConfig, \\\n",
20
+ " DistilBertTokenizerFast, AutoTokenizer, BertModel, BertForMaskedLM, BertTokenizerFast, BertConfig\n",
21
+ "from torch import nn\n",
22
+ "from pathlib import Path\n",
23
+ "import torch\n",
24
+ "import pandas as pd\n",
25
+ "from typing import Optional \n",
26
+ "from tqdm.auto import tqdm\n",
27
+ "from util import eval_test_set, count_parameters\n",
28
+ "from torch.optim import AdamW, RMSprop\n",
29
+ "\n",
30
+ "\n",
31
+ "from qa_model import QuestionDistilBERT, SimpleQuestionDistilBERT, ReuseQuestionDistilBERT, Dataset, test_model"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "id": "3ea47820",
37
+ "metadata": {},
38
+ "source": [
39
+ "## Data\n",
40
+ "Processing the data correctly is partly based on the Huggingface Tutorial (https://huggingface.co/course/chapter7/7?fw=pt)"
41
+ ]
42
+ },
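As a rough sketch of what this preprocessing does (illustrative only: the authoritative version is `Dataset.get_encodings` in qa_model.py, and the example strings and character offsets below are made up), the question and context are tokenized together and the character-level answer span is mapped to token-level start/end positions via the offset mapping:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    enc = tokenizer(
        "Who wrote Faust?",                      # question
        "Faust was written by Goethe.",          # context
        max_length=512,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
    )

    answer_start, answer_text = 21, "Goethe"     # character span inside the context
    answer_end = answer_start + len(answer_text)

    seq_ids = enc.sequence_ids()
    start_pos = end_pos = 0
    for idx, (s, e) in enumerate(enc["offset_mapping"]):
        if seq_ids[idx] != 1:                    # skip question, special and padding tokens
            continue
        if s <= answer_start < e:
            start_pos = idx                      # first context token covering the answer
        if s < answer_end <= e:
            end_pos = idx                        # last context token covering the answer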
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 51,
46
+ "id": "7b1b2b3e",
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 52,
56
+ "id": "f276eba7",
57
+ "metadata": {
58
+ "scrolled": false
59
+ },
60
+ "outputs": [],
61
+ "source": [
62
+ " \n",
63
+ "# create datasets and loaders for training and test set\n",
64
+ "squad_paths = [str(x) for x in Path('data/training_squad/').glob('**/*.txt')]\n",
65
+ "nat_paths = [str(x) for x in Path('data/natural_questions_train/').glob('**/*.txt')]\n",
66
+ "hotpotqa_paths = [str(x) for x in Path('data/hotpotqa_training/').glob('**/*.txt')]"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "markdown",
71
+ "id": "ad8d532a",
72
+ "metadata": {},
73
+ "source": [
74
+ "## POC Model\n",
75
+ "* Works very well:\n",
76
+ " * Dropout 0.1 is too small (overfitting after first epoch) - changed to 0.15\n",
77
+ " * Difference between AdamW and RMSprop minimal\n",
78
+ " \n",
79
+ "### Results:\n",
80
+ "Dropout = 0.15\n",
81
+ "* Mean EM: 0.5374\n",
82
+ "* Mean F-1: 0.6826317532406944\n",
83
+ "\n",
84
+ "Dropout = 0.2 (overfitting realtively similar to first, but seems to be too high)\n",
85
+ "* Mean EM: 0.5044\n",
86
+ "* Mean F-1: 0.6437359169276439"
87
+ ]
88
+ },
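The "Mean EM" and "Mean F-1" numbers above are produced by `eval_test_set` from util.py; as a reference, a minimal sketch of the standard SQuAD-style definitions (exact string match after lower-casing, and token-overlap F1; the repository's implementation may normalise answers differently) looks like this:

    from collections import Counter

    def exact_match(prediction: str, truth: str) -> float:
        return float(prediction.strip().lower() == truth.strip().lower())

    def f1_score(prediction: str, truth: str) -> float:
        pred_tokens = prediction.lower().split()
        truth_tokens = truth.lower().split()
        common = Counter(pred_tokens) & Counter(truth_tokens)
        overlap = sum(common.values())
        if overlap == 0:
            return 0.0
        precision = overlap / len(pred_tokens)
        recall = overlap / len(truth_tokens)
        return 2 * precision * recall / (precision + recall)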
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 54,
92
+ "id": "703e7f38",
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "dataset = Dataset(squad_paths = squad_paths, natural_question_paths=None, hotpotqa_paths=hotpotqa_paths, tokenizer=tokenizer)\n",
97
+ "loader = torch.utils.data.DataLoader(dataset, batch_size=8)\n",
98
+ "\n",
99
+ "test_dataset = Dataset(squad_paths = [str(x) for x in Path('data/test_squad/').glob('**/*.txt')], \n",
100
+ " natural_question_paths=None, \n",
101
+ " hotpotqa_paths = None, tokenizer=tokenizer)\n",
102
+ "test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4)"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": 55,
108
+ "id": "6672f614",
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "model = DistilBertForMaskedLM.from_pretrained(\"distilbert-base-uncased\")\n",
113
+ "config = DistilBertConfig.from_pretrained(\"distilbert-base-uncased\")\n",
114
+ "mod = model.distilbert"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 56,
120
+ "id": "dec15198",
121
+ "metadata": {},
122
+ "outputs": [
123
+ {
124
+ "data": {
125
+ "text/plain": [
126
+ "SimpleQuestionDistilBERT(\n",
127
+ " (distilbert): DistilBertModel(\n",
128
+ " (embeddings): Embeddings(\n",
129
+ " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
130
+ " (position_embeddings): Embedding(512, 768)\n",
131
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
132
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
133
+ " )\n",
134
+ " (transformer): Transformer(\n",
135
+ " (layer): ModuleList(\n",
136
+ " (0): TransformerBlock(\n",
137
+ " (attention): MultiHeadSelfAttention(\n",
138
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
139
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
140
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
141
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
142
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
143
+ " )\n",
144
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
145
+ " (ffn): FFN(\n",
146
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
147
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
148
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
149
+ " (activation): GELUActivation()\n",
150
+ " )\n",
151
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
152
+ " )\n",
153
+ " (1): TransformerBlock(\n",
154
+ " (attention): MultiHeadSelfAttention(\n",
155
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
156
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
157
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
158
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
159
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
160
+ " )\n",
161
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
162
+ " (ffn): FFN(\n",
163
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
164
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
165
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
166
+ " (activation): GELUActivation()\n",
167
+ " )\n",
168
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
169
+ " )\n",
170
+ " (2): TransformerBlock(\n",
171
+ " (attention): MultiHeadSelfAttention(\n",
172
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
173
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
174
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
175
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
176
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
177
+ " )\n",
178
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
179
+ " (ffn): FFN(\n",
180
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
181
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
182
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
183
+ " (activation): GELUActivation()\n",
184
+ " )\n",
185
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
186
+ " )\n",
187
+ " (3): TransformerBlock(\n",
188
+ " (attention): MultiHeadSelfAttention(\n",
189
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
190
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
191
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
192
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
193
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
194
+ " )\n",
195
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
196
+ " (ffn): FFN(\n",
197
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
198
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
199
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
200
+ " (activation): GELUActivation()\n",
201
+ " )\n",
202
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
203
+ " )\n",
204
+ " (4): TransformerBlock(\n",
205
+ " (attention): MultiHeadSelfAttention(\n",
206
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
207
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
208
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
209
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
210
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
211
+ " )\n",
212
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
213
+ " (ffn): FFN(\n",
214
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
215
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
216
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
217
+ " (activation): GELUActivation()\n",
218
+ " )\n",
219
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
220
+ " )\n",
221
+ " (5): TransformerBlock(\n",
222
+ " (attention): MultiHeadSelfAttention(\n",
223
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
224
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
225
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
226
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
227
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
228
+ " )\n",
229
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
230
+ " (ffn): FFN(\n",
231
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
232
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
233
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
234
+ " (activation): GELUActivation()\n",
235
+ " )\n",
236
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
237
+ " )\n",
238
+ " )\n",
239
+ " )\n",
240
+ " )\n",
241
+ " (dropout): Dropout(p=0.5, inplace=False)\n",
242
+ " (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
243
+ ")"
244
+ ]
245
+ },
246
+ "execution_count": 56,
247
+ "metadata": {},
248
+ "output_type": "execute_result"
249
+ }
250
+ ],
251
+ "source": [
252
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
253
+ "model = SimpleQuestionDistilBERT(mod)\n",
254
+ "model.to(device)"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": 57,
260
+ "id": "9def3c83",
261
+ "metadata": {},
262
+ "outputs": [
263
+ {
264
+ "name": "stdout",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "+---------------------------------------------------------+------------+\n",
268
+ "| Modules | Parameters |\n",
269
+ "+---------------------------------------------------------+------------+\n",
270
+ "| distilbert.embeddings.word_embeddings.weight | 23440896 |\n",
271
+ "| distilbert.embeddings.position_embeddings.weight | 393216 |\n",
272
+ "| distilbert.embeddings.LayerNorm.weight | 768 |\n",
273
+ "| distilbert.embeddings.LayerNorm.bias | 768 |\n",
274
+ "| distilbert.transformer.layer.0.attention.q_lin.weight | 589824 |\n",
275
+ "| distilbert.transformer.layer.0.attention.q_lin.bias | 768 |\n",
276
+ "| distilbert.transformer.layer.0.attention.k_lin.weight | 589824 |\n",
277
+ "| distilbert.transformer.layer.0.attention.k_lin.bias | 768 |\n",
278
+ "| distilbert.transformer.layer.0.attention.v_lin.weight | 589824 |\n",
279
+ "| distilbert.transformer.layer.0.attention.v_lin.bias | 768 |\n",
280
+ "| distilbert.transformer.layer.0.attention.out_lin.weight | 589824 |\n",
281
+ "| distilbert.transformer.layer.0.attention.out_lin.bias | 768 |\n",
282
+ "| distilbert.transformer.layer.0.sa_layer_norm.weight | 768 |\n",
283
+ "| distilbert.transformer.layer.0.sa_layer_norm.bias | 768 |\n",
284
+ "| distilbert.transformer.layer.0.ffn.lin1.weight | 2359296 |\n",
285
+ "| distilbert.transformer.layer.0.ffn.lin1.bias | 3072 |\n",
286
+ "| distilbert.transformer.layer.0.ffn.lin2.weight | 2359296 |\n",
287
+ "| distilbert.transformer.layer.0.ffn.lin2.bias | 768 |\n",
288
+ "| distilbert.transformer.layer.0.output_layer_norm.weight | 768 |\n",
289
+ "| distilbert.transformer.layer.0.output_layer_norm.bias | 768 |\n",
290
+ "| distilbert.transformer.layer.1.attention.q_lin.weight | 589824 |\n",
291
+ "| distilbert.transformer.layer.1.attention.q_lin.bias | 768 |\n",
292
+ "| distilbert.transformer.layer.1.attention.k_lin.weight | 589824 |\n",
293
+ "| distilbert.transformer.layer.1.attention.k_lin.bias | 768 |\n",
294
+ "| distilbert.transformer.layer.1.attention.v_lin.weight | 589824 |\n",
295
+ "| distilbert.transformer.layer.1.attention.v_lin.bias | 768 |\n",
296
+ "| distilbert.transformer.layer.1.attention.out_lin.weight | 589824 |\n",
297
+ "| distilbert.transformer.layer.1.attention.out_lin.bias | 768 |\n",
298
+ "| distilbert.transformer.layer.1.sa_layer_norm.weight | 768 |\n",
299
+ "| distilbert.transformer.layer.1.sa_layer_norm.bias | 768 |\n",
300
+ "| distilbert.transformer.layer.1.ffn.lin1.weight | 2359296 |\n",
301
+ "| distilbert.transformer.layer.1.ffn.lin1.bias | 3072 |\n",
302
+ "| distilbert.transformer.layer.1.ffn.lin2.weight | 2359296 |\n",
303
+ "| distilbert.transformer.layer.1.ffn.lin2.bias | 768 |\n",
304
+ "| distilbert.transformer.layer.1.output_layer_norm.weight | 768 |\n",
305
+ "| distilbert.transformer.layer.1.output_layer_norm.bias | 768 |\n",
306
+ "| distilbert.transformer.layer.2.attention.q_lin.weight | 589824 |\n",
307
+ "| distilbert.transformer.layer.2.attention.q_lin.bias | 768 |\n",
308
+ "| distilbert.transformer.layer.2.attention.k_lin.weight | 589824 |\n",
309
+ "| distilbert.transformer.layer.2.attention.k_lin.bias | 768 |\n",
310
+ "| distilbert.transformer.layer.2.attention.v_lin.weight | 589824 |\n",
311
+ "| distilbert.transformer.layer.2.attention.v_lin.bias | 768 |\n",
312
+ "| distilbert.transformer.layer.2.attention.out_lin.weight | 589824 |\n",
313
+ "| distilbert.transformer.layer.2.attention.out_lin.bias | 768 |\n",
314
+ "| distilbert.transformer.layer.2.sa_layer_norm.weight | 768 |\n",
315
+ "| distilbert.transformer.layer.2.sa_layer_norm.bias | 768 |\n",
316
+ "| distilbert.transformer.layer.2.ffn.lin1.weight | 2359296 |\n",
317
+ "| distilbert.transformer.layer.2.ffn.lin1.bias | 3072 |\n",
318
+ "| distilbert.transformer.layer.2.ffn.lin2.weight | 2359296 |\n",
319
+ "| distilbert.transformer.layer.2.ffn.lin2.bias | 768 |\n",
320
+ "| distilbert.transformer.layer.2.output_layer_norm.weight | 768 |\n",
321
+ "| distilbert.transformer.layer.2.output_layer_norm.bias | 768 |\n",
322
+ "| distilbert.transformer.layer.3.attention.q_lin.weight | 589824 |\n",
323
+ "| distilbert.transformer.layer.3.attention.q_lin.bias | 768 |\n",
324
+ "| distilbert.transformer.layer.3.attention.k_lin.weight | 589824 |\n",
325
+ "| distilbert.transformer.layer.3.attention.k_lin.bias | 768 |\n",
326
+ "| distilbert.transformer.layer.3.attention.v_lin.weight | 589824 |\n",
327
+ "| distilbert.transformer.layer.3.attention.v_lin.bias | 768 |\n",
328
+ "| distilbert.transformer.layer.3.attention.out_lin.weight | 589824 |\n",
329
+ "| distilbert.transformer.layer.3.attention.out_lin.bias | 768 |\n",
330
+ "| distilbert.transformer.layer.3.sa_layer_norm.weight | 768 |\n",
331
+ "| distilbert.transformer.layer.3.sa_layer_norm.bias | 768 |\n",
332
+ "| distilbert.transformer.layer.3.ffn.lin1.weight | 2359296 |\n",
333
+ "| distilbert.transformer.layer.3.ffn.lin1.bias | 3072 |\n",
334
+ "| distilbert.transformer.layer.3.ffn.lin2.weight | 2359296 |\n",
335
+ "| distilbert.transformer.layer.3.ffn.lin2.bias | 768 |\n",
336
+ "| distilbert.transformer.layer.3.output_layer_norm.weight | 768 |\n",
337
+ "| distilbert.transformer.layer.3.output_layer_norm.bias | 768 |\n",
338
+ "| distilbert.transformer.layer.4.attention.q_lin.weight | 589824 |\n",
339
+ "| distilbert.transformer.layer.4.attention.q_lin.bias | 768 |\n",
340
+ "| distilbert.transformer.layer.4.attention.k_lin.weight | 589824 |\n",
341
+ "| distilbert.transformer.layer.4.attention.k_lin.bias | 768 |\n",
342
+ "| distilbert.transformer.layer.4.attention.v_lin.weight | 589824 |\n",
343
+ "| distilbert.transformer.layer.4.attention.v_lin.bias | 768 |\n",
344
+ "| distilbert.transformer.layer.4.attention.out_lin.weight | 589824 |\n",
345
+ "| distilbert.transformer.layer.4.attention.out_lin.bias | 768 |\n",
346
+ "| distilbert.transformer.layer.4.sa_layer_norm.weight | 768 |\n",
347
+ "| distilbert.transformer.layer.4.sa_layer_norm.bias | 768 |\n",
348
+ "| distilbert.transformer.layer.4.ffn.lin1.weight | 2359296 |\n",
349
+ "| distilbert.transformer.layer.4.ffn.lin1.bias | 3072 |\n",
350
+ "| distilbert.transformer.layer.4.ffn.lin2.weight | 2359296 |\n",
351
+ "| distilbert.transformer.layer.4.ffn.lin2.bias | 768 |\n",
352
+ "| distilbert.transformer.layer.4.output_layer_norm.weight | 768 |\n",
353
+ "| distilbert.transformer.layer.4.output_layer_norm.bias | 768 |\n",
354
+ "| distilbert.transformer.layer.5.attention.q_lin.weight | 589824 |\n",
355
+ "| distilbert.transformer.layer.5.attention.q_lin.bias | 768 |\n",
356
+ "| distilbert.transformer.layer.5.attention.k_lin.weight | 589824 |\n",
357
+ "| distilbert.transformer.layer.5.attention.k_lin.bias | 768 |\n",
358
+ "| distilbert.transformer.layer.5.attention.v_lin.weight | 589824 |\n",
359
+ "| distilbert.transformer.layer.5.attention.v_lin.bias | 768 |\n",
360
+ "| distilbert.transformer.layer.5.attention.out_lin.weight | 589824 |\n",
361
+ "| distilbert.transformer.layer.5.attention.out_lin.bias | 768 |\n",
362
+ "| distilbert.transformer.layer.5.sa_layer_norm.weight | 768 |\n",
363
+ "| distilbert.transformer.layer.5.sa_layer_norm.bias | 768 |\n",
364
+ "| distilbert.transformer.layer.5.ffn.lin1.weight | 2359296 |\n",
365
+ "| distilbert.transformer.layer.5.ffn.lin1.bias | 3072 |\n",
366
+ "| distilbert.transformer.layer.5.ffn.lin2.weight | 2359296 |\n",
367
+ "| distilbert.transformer.layer.5.ffn.lin2.bias | 768 |\n",
368
+ "| distilbert.transformer.layer.5.output_layer_norm.weight | 768 |\n",
369
+ "| distilbert.transformer.layer.5.output_layer_norm.bias | 768 |\n",
370
+ "| classifier.weight | 1536 |\n",
371
+ "| classifier.bias | 2 |\n",
372
+ "+---------------------------------------------------------+------------+\n",
373
+ "Total Trainable Params: 66364418\n"
374
+ ]
375
+ },
376
+ {
377
+ "data": {
378
+ "text/plain": [
379
+ "66364418"
380
+ ]
381
+ },
382
+ "execution_count": 57,
383
+ "metadata": {},
384
+ "output_type": "execute_result"
385
+ }
386
+ ],
387
+ "source": [
388
+ "count_parameters(model)"
389
+ ]
390
+ },
391
+ {
392
+ "cell_type": "markdown",
393
+ "id": "426a6311",
394
+ "metadata": {},
395
+ "source": [
396
+ "### Testing the model"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": 58,
402
+ "id": "6151c201",
403
+ "metadata": {},
404
+ "outputs": [],
405
+ "source": [
406
+ "# get smaller dataset\n",
407
+ "batch_size = 8\n",
408
+ "test_ds = Dataset(squad_paths = squad_paths[:2], natural_question_paths=None, hotpotqa_paths=None, tokenizer=tokenizer)\n",
409
+ "test_ds_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)\n",
410
+ "optim = RMSprop(model.parameters(), lr=1e-4)"
411
+ ]
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": 59,
416
+ "id": "aeae0c56",
417
+ "metadata": {},
418
+ "outputs": [
419
+ {
420
+ "name": "stdout",
421
+ "output_type": "stream",
422
+ "text": [
423
+ "Passed\n"
424
+ ]
425
+ }
426
+ ],
427
+ "source": [
428
+ "test_model(model, optim, test_ds_loader, device)"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "markdown",
433
+ "id": "59928d34",
434
+ "metadata": {},
435
+ "source": [
436
+ "### Model Training"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": 60,
442
+ "id": "a8017b8c",
443
+ "metadata": {},
444
+ "outputs": [
445
+ {
446
+ "data": {
447
+ "text/plain": [
448
+ "SimpleQuestionDistilBERT(\n",
449
+ " (distilbert): DistilBertModel(\n",
450
+ " (embeddings): Embeddings(\n",
451
+ " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
452
+ " (position_embeddings): Embedding(512, 768)\n",
453
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
454
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
455
+ " )\n",
456
+ " (transformer): Transformer(\n",
457
+ " (layer): ModuleList(\n",
458
+ " (0): TransformerBlock(\n",
459
+ " (attention): MultiHeadSelfAttention(\n",
460
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
461
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
462
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
463
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
464
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
465
+ " )\n",
466
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
467
+ " (ffn): FFN(\n",
468
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
469
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
470
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
471
+ " (activation): GELUActivation()\n",
472
+ " )\n",
473
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
474
+ " )\n",
475
+ " (1): TransformerBlock(\n",
476
+ " (attention): MultiHeadSelfAttention(\n",
477
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
478
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
479
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
480
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
481
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
482
+ " )\n",
483
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
484
+ " (ffn): FFN(\n",
485
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
486
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
487
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
488
+ " (activation): GELUActivation()\n",
489
+ " )\n",
490
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
491
+ " )\n",
492
+ " (2): TransformerBlock(\n",
493
+ " (attention): MultiHeadSelfAttention(\n",
494
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
495
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
496
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
497
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
498
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
499
+ " )\n",
500
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
501
+ " (ffn): FFN(\n",
502
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
503
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
504
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
505
+ " (activation): GELUActivation()\n",
506
+ " )\n",
507
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
508
+ " )\n",
509
+ " (3): TransformerBlock(\n",
510
+ " (attention): MultiHeadSelfAttention(\n",
511
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
512
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
513
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
514
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
515
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
516
+ " )\n",
517
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
518
+ " (ffn): FFN(\n",
519
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
520
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
521
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
522
+ " (activation): GELUActivation()\n",
523
+ " )\n",
524
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
525
+ " )\n",
526
+ " (4): TransformerBlock(\n",
527
+ " (attention): MultiHeadSelfAttention(\n",
528
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
529
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
530
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
531
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
532
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
533
+ " )\n",
534
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
535
+ " (ffn): FFN(\n",
536
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
537
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
538
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
539
+ " (activation): GELUActivation()\n",
540
+ " )\n",
541
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
542
+ " )\n",
543
+ " (5): TransformerBlock(\n",
544
+ " (attention): MultiHeadSelfAttention(\n",
545
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
546
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
547
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
548
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
549
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
550
+ " )\n",
551
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
552
+ " (ffn): FFN(\n",
553
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
554
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
555
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
556
+ " (activation): GELUActivation()\n",
557
+ " )\n",
558
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
559
+ " )\n",
560
+ " )\n",
561
+ " )\n",
562
+ " )\n",
563
+ " (dropout): Dropout(p=0.5, inplace=False)\n",
564
+ " (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
565
+ ")"
566
+ ]
567
+ },
568
+ "execution_count": 60,
569
+ "metadata": {},
570
+ "output_type": "execute_result"
571
+ }
572
+ ],
573
+ "source": [
574
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
575
+ "model = SimpleQuestionDistilBERT(mod)\n",
576
+ "model.to(device)"
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "code",
581
+ "execution_count": 61,
582
+ "id": "f13c12dc",
583
+ "metadata": {},
584
+ "outputs": [],
585
+ "source": [
586
+ "model.train()\n",
587
+ "optim = RMSprop(model.parameters(), lr=1e-4)"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": 22,
593
+ "id": "e4fa54d9",
594
+ "metadata": {},
595
+ "outputs": [
596
+ {
597
+ "data": {
598
+ "application/vnd.jupyter.widget-view+json": {
599
+ "model_id": "0016d9f5ba764eb98e9df8573995c86c",
600
+ "version_major": 2,
601
+ "version_minor": 0
602
+ },
603
+ "text/plain": [
604
+ " 0%| | 0/10875 [00:00<?, ?it/s]"
605
+ ]
606
+ },
607
+ "metadata": {},
608
+ "output_type": "display_data"
609
+ },
610
+ {
611
+ "name": "stdout",
612
+ "output_type": "stream",
613
+ "text": [
614
+ "Mean Training Error 0.7555404769408292\n"
615
+ ]
616
+ },
617
+ {
618
+ "data": {
619
+ "application/vnd.jupyter.widget-view+json": {
620
+ "model_id": "96af0e22e2ee44fd920795b0e7317839",
621
+ "version_major": 2,
622
+ "version_minor": 0
623
+ },
624
+ "text/plain": [
625
+ " 0%| | 0/2500 [00:00<?, ?it/s]"
626
+ ]
627
+ },
628
+ "metadata": {},
629
+ "output_type": "display_data"
630
+ },
631
+ {
632
+ "name": "stdout",
633
+ "output_type": "stream",
634
+ "text": [
635
+ "Mean Test Error 1.761920437876694\n"
636
+ ]
637
+ },
638
+ {
639
+ "data": {
640
+ "application/vnd.jupyter.widget-view+json": {
641
+ "model_id": "5160ffe5f60e4b72b46746a33b1d60d0",
642
+ "version_major": 2,
643
+ "version_minor": 0
644
+ },
645
+ "text/plain": [
646
+ " 0%| | 0/10875 [00:00<?, ?it/s]"
647
+ ]
648
+ },
649
+ "metadata": {},
650
+ "output_type": "display_data"
651
+ },
652
+ {
653
+ "ename": "KeyboardInterrupt",
654
+ "evalue": "",
655
+ "output_type": "error",
656
+ "traceback": [
657
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
658
+ "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
659
+ "Cell \u001B[0;32mIn [22], line 18\u001B[0m\n\u001B[1;32m 16\u001B[0m \u001B[38;5;66;03m# print(torch.argmax(outputs['start_logits'],axis=1), torch.argmax(outputs['end_logits'], axis=1), start, end)\u001B[39;00m\n\u001B[1;32m 17\u001B[0m loss \u001B[38;5;241m=\u001B[39m outputs[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mloss\u001B[39m\u001B[38;5;124m'\u001B[39m]\n\u001B[0;32m---> 18\u001B[0m loss\u001B[38;5;241m.\u001B[39mbackward()\n\u001B[1;32m 19\u001B[0m \u001B[38;5;66;03m# torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)\u001B[39;00m\n\u001B[1;32m 20\u001B[0m optim\u001B[38;5;241m.\u001B[39mstep()\n",
660
+ "File \u001B[0;32m~/Documents/University/WS2022/applieddl/venv/lib64/python3.10/site-packages/torch/_tensor.py:396\u001B[0m, in \u001B[0;36mTensor.backward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 387\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m has_torch_function_unary(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 388\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m handle_torch_function(\n\u001B[1;32m 389\u001B[0m Tensor\u001B[38;5;241m.\u001B[39mbackward,\n\u001B[1;32m 390\u001B[0m (\u001B[38;5;28mself\u001B[39m,),\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 394\u001B[0m create_graph\u001B[38;5;241m=\u001B[39mcreate_graph,\n\u001B[1;32m 395\u001B[0m inputs\u001B[38;5;241m=\u001B[39minputs)\n\u001B[0;32m--> 396\u001B[0m \u001B[43mtorch\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mautograd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbackward\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgradient\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mretain_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcreate_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43minputs\u001B[49m\u001B[43m)\u001B[49m\n",
661
+ "File \u001B[0;32m~/Documents/University/WS2022/applieddl/venv/lib64/python3.10/site-packages/torch/autograd/__init__.py:173\u001B[0m, in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 168\u001B[0m retain_graph \u001B[38;5;241m=\u001B[39m create_graph\n\u001B[1;32m 170\u001B[0m \u001B[38;5;66;03m# The reason we repeat same the comment below is that\u001B[39;00m\n\u001B[1;32m 171\u001B[0m \u001B[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001B[39;00m\n\u001B[1;32m 172\u001B[0m \u001B[38;5;66;03m# calls in the traceback and some print out the last line\u001B[39;00m\n\u001B[0;32m--> 173\u001B[0m \u001B[43mVariable\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_execution_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun_backward\u001B[49m\u001B[43m(\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001B[39;49;00m\n\u001B[1;32m 174\u001B[0m \u001B[43m \u001B[49m\u001B[43mtensors\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgrad_tensors_\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mretain_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcreate_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 175\u001B[0m \u001B[43m \u001B[49m\u001B[43mallow_unreachable\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43maccumulate_grad\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m)\u001B[49m\n",
662
+ "\u001B[0;31mKeyboardInterrupt\u001B[0m: "
663
+ ]
664
+ }
665
+ ],
666
+ "source": [
667
+ "epochs = 5\n",
668
+ "\n",
669
+ "for epoch in range(epochs):\n",
670
+ " loop = tqdm(loader, leave=True)\n",
671
+ " model.train()\n",
672
+ " mean_training_error = []\n",
673
+ " for batch in loop:\n",
674
+ " optim.zero_grad()\n",
675
+ " \n",
676
+ " input_ids = batch['input_ids'].to(device)\n",
677
+ " attention_mask = batch['attention_mask'].to(device)\n",
678
+ " start = batch['start_positions'].to(device)\n",
679
+ " end = batch['end_positions'].to(device)\n",
680
+ " \n",
681
+ " outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)\n",
682
+ " # print(torch.argmax(outputs['start_logits'],axis=1), torch.argmax(outputs['end_logits'], axis=1), start, end)\n",
683
+ " loss = outputs['loss']\n",
684
+ " loss.backward()\n",
685
+ " # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)\n",
686
+ " optim.step()\n",
687
+ " mean_training_error.append(loss.item())\n",
688
+ " loop.set_description(f'Epoch {epoch}')\n",
689
+ " loop.set_postfix(loss=loss.item())\n",
690
+ " print(\"Mean Training Error\", np.mean(mean_training_error))\n",
691
+ " \n",
692
+ " \n",
693
+ " loop = tqdm(test_loader, leave=True)\n",
694
+ " model.eval()\n",
695
+ " mean_test_error = []\n",
696
+ " for batch in loop:\n",
697
+ " \n",
698
+ " input_ids = batch['input_ids'].to(device)\n",
699
+ " attention_mask = batch['attention_mask'].to(device)\n",
700
+ " start = batch['start_positions'].to(device)\n",
701
+ " end = batch['end_positions'].to(device)\n",
702
+ " \n",
703
+ " outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)\n",
704
+ " # print(torch.argmax(outputs['start_logits'],axis=1), torch.argmax(outputs['end_logits'], axis=1), start, end)\n",
705
+ " loss = outputs['loss']\n",
706
+ " \n",
707
+ " mean_test_error.append(loss.item())\n",
708
+ " loop.set_description(f'Epoch {epoch} Testset')\n",
709
+ " loop.set_postfix(loss=loss.item())\n",
710
+ " print(\"Mean Test Error\", np.mean(mean_test_error))"
711
+ ]
712
+ },
713
+ {
714
+ "cell_type": "code",
715
+ "execution_count": 19,
716
+ "id": "6ff26fb4",
717
+ "metadata": {},
718
+ "outputs": [],
719
+ "source": [
720
+ "torch.save(model.state_dict(), \"simple_distilbert_qa.model\")"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "code",
725
+ "execution_count": 20,
726
+ "id": "a5e7abeb",
727
+ "metadata": {},
728
+ "outputs": [
729
+ {
730
+ "data": {
731
+ "text/plain": [
732
+ "<All keys matched successfully>"
733
+ ]
734
+ },
735
+ "execution_count": 20,
736
+ "metadata": {},
737
+ "output_type": "execute_result"
738
+ }
739
+ ],
740
+ "source": [
741
+ "model = SimpleQuestionDistilBERT(mod)\n",
742
+ "model.load_state_dict(torch.load(\"simple_distilbert_qa.model\"))"
743
+ ]
744
+ },
745
+ {
746
+ "cell_type": "code",
747
+ "execution_count": 18,
748
+ "id": "f5ad7bee",
749
+ "metadata": {},
750
+ "outputs": [
751
+ {
752
+ "name": "stderr",
753
+ "output_type": "stream",
754
+ "text": [
755
+ "100%|鈻堚枅鈻堚枅鈻堚枅鈻堚枅鈻堚枅| 2500/2500 [02:09<00:00, 19.37it/s]"
756
+ ]
757
+ },
758
+ {
759
+ "name": "stdout",
760
+ "output_type": "stream",
761
+ "text": [
762
+ "Mean EM: 0.5374\n",
763
+ "Mean F-1: 0.6826317532406944\n"
764
+ ]
765
+ },
766
+ {
767
+ "name": "stderr",
768
+ "output_type": "stream",
769
+ "text": [
770
+ "\n"
771
+ ]
772
+ }
773
+ ],
774
+ "source": [
775
+ "eval_test_set(model, tokenizer, test_loader, device)"
776
+ ]
777
+ },
778
+ {
779
+ "cell_type": "markdown",
780
+ "id": "fa6017a8",
781
+ "metadata": {},
782
+ "source": [
783
+ "## Freeze baseline and train new head\n",
784
+ "This was my initial idea, to freeze the layers and add a completely new head, which we train from scratch. I tried a lot of different configurations, but nothing really worked, I usually stayed at a CrossEntropyLoss of about 3 the whole time. Below, you can see the different heads I have tried.\n",
785
+ "\n",
786
+ "Furthermore, I experimented with different data, because I though it might not be enough data all in all. I would conclude that this didn't work because (1) Transformers are very data-hungry and I probably still used too little data (one epoch took about 1h though, so it wasn't possible to use even more). (2) We train the layers completely new, which means they contain absolutely no structure about the problem and task beforehand. I do not think that this way of training leads to better results / less energy used all in all, because it would be too resource intense.\n",
787
+ "\n",
788
+ "The following setup is partly based on the HuggingFace implementation of the question answering model (https://github.com/huggingface/transformers/blob/v4.23.1/src/transformers/models/distilbert/modeling_distilbert.py#L805)"
789
+ ]
790
+ },
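The parameter table printed further down shows that only the added encoder and classifier of `QuestionDistilBERT` are trainable, i.e. the DistilBERT backbone is frozen inside the class. A standalone sketch of that freezing step (illustrative only; the class itself takes care of this):

    import torch
    from transformers import DistilBertForMaskedLM
    from qa_model import QuestionDistilBERT

    base = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased").distilbert
    for param in base.parameters():
        param.requires_grad = False          # no gradients for the pretrained backbone

    model = QuestionDistilBERT(base)
    trainable = [p for p in model.parameters() if p.requires_grad]
    optim = torch.optim.RMSprop(trainable, lr=1e-4)   # optimise only the new head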
791
+ {
792
+ "cell_type": "code",
793
+ "execution_count": 62,
794
+ "id": "92b21967",
795
+ "metadata": {},
796
+ "outputs": [],
797
+ "source": [
798
+ "model = DistilBertForMaskedLM.from_pretrained(\"distilbert-base-uncased\")"
799
+ ]
800
+ },
801
+ {
802
+ "cell_type": "code",
803
+ "execution_count": 63,
804
+ "id": "1d7b3a8c",
805
+ "metadata": {},
806
+ "outputs": [],
807
+ "source": [
808
+ "config = DistilBertConfig.from_pretrained(\"distilbert-base-uncased\")"
809
+ ]
810
+ },
811
+ {
812
+ "cell_type": "code",
813
+ "execution_count": 64,
814
+ "id": "91444894",
815
+ "metadata": {},
816
+ "outputs": [],
817
+ "source": [
818
+ "# only take base model, we do not need the classification head\n",
819
+ "mod = model.distilbert"
820
+ ]
821
+ },
822
+ {
823
+ "cell_type": "code",
824
+ "execution_count": 65,
825
+ "id": "74ca6c07",
826
+ "metadata": {},
827
+ "outputs": [
828
+ {
829
+ "data": {
830
+ "text/plain": [
831
+ "QuestionDistilBERT(\n",
832
+ " (distilbert): DistilBertModel(\n",
833
+ " (embeddings): Embeddings(\n",
834
+ " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
835
+ " (position_embeddings): Embedding(512, 768)\n",
836
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
837
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
838
+ " )\n",
839
+ " (transformer): Transformer(\n",
840
+ " (layer): ModuleList(\n",
841
+ " (0): TransformerBlock(\n",
842
+ " (attention): MultiHeadSelfAttention(\n",
843
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
844
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
845
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
846
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
847
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
848
+ " )\n",
849
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
850
+ " (ffn): FFN(\n",
851
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
852
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
853
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
854
+ " (activation): GELUActivation()\n",
855
+ " )\n",
856
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
857
+ " )\n",
858
+ " (1): TransformerBlock(\n",
859
+ " (attention): MultiHeadSelfAttention(\n",
860
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
861
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
862
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
863
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
864
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
865
+ " )\n",
866
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
867
+ " (ffn): FFN(\n",
868
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
869
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
870
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
871
+ " (activation): GELUActivation()\n",
872
+ " )\n",
873
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
874
+ " )\n",
875
+ " (2): TransformerBlock(\n",
876
+ " (attention): MultiHeadSelfAttention(\n",
877
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
878
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
879
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
880
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
881
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
882
+ " )\n",
883
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
884
+ " (ffn): FFN(\n",
885
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
886
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
887
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
888
+ " (activation): GELUActivation()\n",
889
+ " )\n",
890
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
891
+ " )\n",
892
+ " (3): TransformerBlock(\n",
893
+ " (attention): MultiHeadSelfAttention(\n",
894
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
895
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
896
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
897
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
898
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
899
+ " )\n",
900
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
901
+ " (ffn): FFN(\n",
902
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
903
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
904
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
905
+ " (activation): GELUActivation()\n",
906
+ " )\n",
907
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
908
+ " )\n",
909
+ " (4): TransformerBlock(\n",
910
+ " (attention): MultiHeadSelfAttention(\n",
911
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
912
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
913
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
914
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
915
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
916
+ " )\n",
917
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
918
+ " (ffn): FFN(\n",
919
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
920
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
921
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
922
+ " (activation): GELUActivation()\n",
923
+ " )\n",
924
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
925
+ " )\n",
926
+ " (5): TransformerBlock(\n",
927
+ " (attention): MultiHeadSelfAttention(\n",
928
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
929
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
930
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
931
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
932
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
933
+ " )\n",
934
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
935
+ " (ffn): FFN(\n",
936
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
937
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
938
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
939
+ " (activation): GELUActivation()\n",
940
+ " )\n",
941
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
942
+ " )\n",
943
+ " )\n",
944
+ " )\n",
945
+ " )\n",
946
+ " (relu): ReLU()\n",
947
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
948
+ " (te): TransformerEncoder(\n",
949
+ " (layers): ModuleList(\n",
950
+ " (0): TransformerEncoderLayer(\n",
951
+ " (self_attn): MultiheadAttention(\n",
952
+ " (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n",
953
+ " )\n",
954
+ " (linear1): Linear(in_features=768, out_features=2048, bias=True)\n",
955
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
956
+ " (linear2): Linear(in_features=2048, out_features=768, bias=True)\n",
957
+ " (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
958
+ " (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
959
+ " (dropout1): Dropout(p=0.1, inplace=False)\n",
960
+ " (dropout2): Dropout(p=0.1, inplace=False)\n",
961
+ " )\n",
962
+ " (1): TransformerEncoderLayer(\n",
963
+ " (self_attn): MultiheadAttention(\n",
964
+ " (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n",
965
+ " )\n",
966
+ " (linear1): Linear(in_features=768, out_features=2048, bias=True)\n",
967
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
968
+ " (linear2): Linear(in_features=2048, out_features=768, bias=True)\n",
969
+ " (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
970
+ " (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
971
+ " (dropout1): Dropout(p=0.1, inplace=False)\n",
972
+ " (dropout2): Dropout(p=0.1, inplace=False)\n",
973
+ " )\n",
974
+ " (2): TransformerEncoderLayer(\n",
975
+ " (self_attn): MultiheadAttention(\n",
976
+ " (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n",
977
+ " )\n",
978
+ " (linear1): Linear(in_features=768, out_features=2048, bias=True)\n",
979
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
980
+ " (linear2): Linear(in_features=2048, out_features=768, bias=True)\n",
981
+ " (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
982
+ " (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
983
+ " (dropout1): Dropout(p=0.1, inplace=False)\n",
984
+ " (dropout2): Dropout(p=0.1, inplace=False)\n",
985
+ " )\n",
986
+ " )\n",
987
+ " )\n",
988
+ " (classifier): Sequential(\n",
989
+ " (0): Dropout(p=0.1, inplace=False)\n",
990
+ " (1): ReLU()\n",
991
+ " (2): Linear(in_features=768, out_features=512, bias=True)\n",
992
+ " (3): Dropout(p=0.1, inplace=False)\n",
993
+ " (4): ReLU()\n",
994
+ " (5): Linear(in_features=512, out_features=256, bias=True)\n",
995
+ " (6): Dropout(p=0.1, inplace=False)\n",
996
+ " (7): ReLU()\n",
997
+ " (8): Linear(in_features=256, out_features=128, bias=True)\n",
998
+ " (9): Dropout(p=0.1, inplace=False)\n",
999
+ " (10): ReLU()\n",
1000
+ " (11): Linear(in_features=128, out_features=64, bias=True)\n",
1001
+ " (12): Dropout(p=0.1, inplace=False)\n",
1002
+ " (13): ReLU()\n",
1003
+ " (14): Linear(in_features=64, out_features=2, bias=True)\n",
1004
+ " )\n",
1005
+ ")"
1006
+ ]
1007
+ },
1008
+ "execution_count": 65,
1009
+ "metadata": {},
1010
+ "output_type": "execute_result"
1011
+ }
1012
+ ],
1013
+ "source": [
1014
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
1015
+ "model = QuestionDistilBERT(mod)\n",
1016
+ "model.to(device)"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "code",
1021
+ "execution_count": 66,
1022
+ "id": "340857f9",
1023
+ "metadata": {},
1024
+ "outputs": [
1025
+ {
1026
+ "name": "stdout",
1027
+ "output_type": "stream",
1028
+ "text": [
1029
+ "+---------------------------------------+------------+\n",
1030
+ "| Modules | Parameters |\n",
1031
+ "+---------------------------------------+------------+\n",
1032
+ "| te.layers.0.self_attn.in_proj_weight | 1769472 |\n",
1033
+ "| te.layers.0.self_attn.in_proj_bias | 2304 |\n",
1034
+ "| te.layers.0.self_attn.out_proj.weight | 589824 |\n",
1035
+ "| te.layers.0.self_attn.out_proj.bias | 768 |\n",
1036
+ "| te.layers.0.linear1.weight | 1572864 |\n",
1037
+ "| te.layers.0.linear1.bias | 2048 |\n",
1038
+ "| te.layers.0.linear2.weight | 1572864 |\n",
1039
+ "| te.layers.0.linear2.bias | 768 |\n",
1040
+ "| te.layers.0.norm1.weight | 768 |\n",
1041
+ "| te.layers.0.norm1.bias | 768 |\n",
1042
+ "| te.layers.0.norm2.weight | 768 |\n",
1043
+ "| te.layers.0.norm2.bias | 768 |\n",
1044
+ "| te.layers.1.self_attn.in_proj_weight | 1769472 |\n",
1045
+ "| te.layers.1.self_attn.in_proj_bias | 2304 |\n",
1046
+ "| te.layers.1.self_attn.out_proj.weight | 589824 |\n",
1047
+ "| te.layers.1.self_attn.out_proj.bias | 768 |\n",
1048
+ "| te.layers.1.linear1.weight | 1572864 |\n",
1049
+ "| te.layers.1.linear1.bias | 2048 |\n",
1050
+ "| te.layers.1.linear2.weight | 1572864 |\n",
1051
+ "| te.layers.1.linear2.bias | 768 |\n",
1052
+ "| te.layers.1.norm1.weight | 768 |\n",
1053
+ "| te.layers.1.norm1.bias | 768 |\n",
1054
+ "| te.layers.1.norm2.weight | 768 |\n",
1055
+ "| te.layers.1.norm2.bias | 768 |\n",
1056
+ "| te.layers.2.self_attn.in_proj_weight | 1769472 |\n",
1057
+ "| te.layers.2.self_attn.in_proj_bias | 2304 |\n",
1058
+ "| te.layers.2.self_attn.out_proj.weight | 589824 |\n",
1059
+ "| te.layers.2.self_attn.out_proj.bias | 768 |\n",
1060
+ "| te.layers.2.linear1.weight | 1572864 |\n",
1061
+ "| te.layers.2.linear1.bias | 2048 |\n",
1062
+ "| te.layers.2.linear2.weight | 1572864 |\n",
1063
+ "| te.layers.2.linear2.bias | 768 |\n",
1064
+ "| te.layers.2.norm1.weight | 768 |\n",
1065
+ "| te.layers.2.norm1.bias | 768 |\n",
1066
+ "| te.layers.2.norm2.weight | 768 |\n",
1067
+ "| te.layers.2.norm2.bias | 768 |\n",
1068
+ "| classifier.2.weight | 393216 |\n",
1069
+ "| classifier.2.bias | 512 |\n",
1070
+ "| classifier.5.weight | 131072 |\n",
1071
+ "| classifier.5.bias | 256 |\n",
1072
+ "| classifier.8.weight | 32768 |\n",
1073
+ "| classifier.8.bias | 128 |\n",
1074
+ "| classifier.11.weight | 8192 |\n",
1075
+ "| classifier.11.bias | 64 |\n",
1076
+ "| classifier.14.weight | 128 |\n",
1077
+ "| classifier.14.bias | 2 |\n",
1078
+ "+---------------------------------------+------------+\n",
1079
+ "Total Trainable Params: 17108290\n"
1080
+ ]
1081
+ },
1082
+ {
1083
+ "data": {
1084
+ "text/plain": [
1085
+ "17108290"
1086
+ ]
1087
+ },
1088
+ "execution_count": 66,
1089
+ "metadata": {},
1090
+ "output_type": "execute_result"
1091
+ }
1092
+ ],
1093
+ "source": [
1094
+ "count_parameters(model)"
1095
+ ]
1096
+ },
1097
+ {
1098
+ "cell_type": "markdown",
1099
+ "id": "9babd013",
1100
+ "metadata": {},
1101
+ "source": [
1102
+ "### Testing the model\n",
1103
+ "This is the same procedure as in `distilbert.ipynb`. "
1104
+ ]
1105
+ },
1106
+ {
1107
+ "cell_type": "code",
1108
+ "execution_count": 67,
1109
+ "id": "694c828b",
1110
+ "metadata": {},
1111
+ "outputs": [],
1112
+ "source": [
1113
+ "# get smaller dataset\n",
1114
+ "batch_size = 8\n",
1115
+ "test_ds = Dataset(squad_paths = squad_paths[:2], natural_question_paths=None, hotpotqa_paths=None, tokenizer=tokenizer)\n",
1116
+ "test_ds_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)\n",
1117
+ "optim=torch.optim.Adam(model.parameters())"
1118
+ ]
1119
+ },
1120
+ {
1121
+ "cell_type": "code",
1122
+ "execution_count": 68,
1123
+ "id": "a76587df",
1124
+ "metadata": {},
1125
+ "outputs": [
1126
+ {
1127
+ "name": "stdout",
1128
+ "output_type": "stream",
1129
+ "text": [
1130
+ "Passed\n"
1131
+ ]
1132
+ }
1133
+ ],
1134
+ "source": [
1135
+ "test_model(model, optim, test_ds_loader, device)"
1136
+ ]
1137
+ },
1138
+ {
1139
+ "cell_type": "markdown",
1140
+ "id": "7c326e8e",
1141
+ "metadata": {},
1142
+ "source": [
1143
+ "### Training the model\n",
1144
+ "* Parameter Tuning:\n",
1145
+ " * Learning Rate: I experimented with several values, 1e-4 seemed to work best for me. 1e-3 was very unstable and 1e-5 was too small.\n",
1146
+ " * Gradient Clipping: I experimented with this, but the difference was only minimal\n",
1147
+ "\n",
1148
+ "Data:\n",
1149
+ "* I first used only the SQuAD dataset, but generalisation is a problem\n",
1150
+ " * The dataset is realtively small and we often have entries with the same context but different questions\n",
1151
+ " * I believe, the diversity is not big enough to train a fully functional model\n",
1152
+ "* Hence, I included the Natural Questions dataset too\n",
1153
+ " * It is however a lot more messy - I elaborated a bit more on this in `load_data.ipynb`\n",
1154
+ "* Also the hotpotqa data was used\n",
1155
+ "\n",
1156
+ "Tested with: \n",
1157
+ "* 3 Linear Layers\n",
1158
+ " * Training Error high - needed more layers\n",
1159
+ " * Already expected - this was mostly a Proof of Concept\n",
1160
+ "* 1 TransformerEncoder with 4 attention heads + 1 Linear Layer:\n",
1161
+ " * Training Error was high, still too simple\n",
1162
+ "* 1 TransformerEncoder with 8 heads + 1 Linear Layer:\n",
1163
+ " * Training Error gets lower, however stagnates at some point\n",
1164
+ " * Probably still too simple, it doesn't generalise either\n",
1165
+ "* 2 TransformerEncoder with 8 and 4 heads + 1 Linear Layer:\n",
1166
+ " * Loss gets down but doesn't go further after some time\n"
1167
+ ]
1168
+ },
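+ {
+ "cell_type": "markdown",
+ "id": "qa-head-sketch",
+ "metadata": {},
+ "source": [
+ "The cell below is a minimal sketch (an illustration, not the actual `QuestionDistilBERT` code in `qa_model.py`) of the last variant in the list above: the DistilBERT body followed by a small `nn.TransformerEncoder` stack and one linear layer that yields a start and an end logit per token. The class name `TransformerHeadSketch` and its arguments are hypothetical.\n",
+ "\n",
+ "```python\n",
+ "import torch.nn as nn\n",
+ "\n",
+ "class TransformerHeadSketch(nn.Module):\n",
+ "    def __init__(self, backbone, hidden=768, nhead=8, nlayers=2):\n",
+ "        super().__init__()\n",
+ "        self.backbone = backbone  # pre-trained DistilBERT body\n",
+ "        layer = nn.TransformerEncoderLayer(d_model=hidden, nhead=nhead, batch_first=True)\n",
+ "        self.encoder = nn.TransformerEncoder(layer, num_layers=nlayers)\n",
+ "        self.qa_outputs = nn.Linear(hidden, 2)  # one start and one end logit per token\n",
+ "\n",
+ "    def forward(self, input_ids, attention_mask, start_positions=None, end_positions=None):\n",
+ "        h = self.backbone(input_ids, attention_mask=attention_mask).last_hidden_state\n",
+ "        h = self.encoder(h, src_key_padding_mask=~attention_mask.bool())\n",
+ "        start_logits, end_logits = self.qa_outputs(h).split(1, dim=-1)\n",
+ "        start_logits, end_logits = start_logits.squeeze(-1), end_logits.squeeze(-1)\n",
+ "        loss = None\n",
+ "        if start_positions is not None:\n",
+ "            ce = nn.CrossEntropyLoss()\n",
+ "            loss = (ce(start_logits, start_positions) + ce(end_logits, end_positions)) / 2\n",
+ "        return {'loss': loss, 'start_logits': start_logits, 'end_logits': end_logits}\n",
+ "```\n",
+ "\n",
+ "Treating the start and end positions as two token-level classification problems and averaging their cross-entropy losses is the standard extractive-QA setup that the training loop below relies on via `outputs['loss']`."
+ ]
+ },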
1169
+ {
1170
+ "cell_type": "code",
1171
+ "execution_count": null,
1172
+ "id": "2e9f4bd3",
1173
+ "metadata": {},
1174
+ "outputs": [],
1175
+ "source": [
1176
+ "dataset = Dataset(squad_paths = squad_paths, natural_question_paths=nat_paths, hotpotqa_paths=hotpotqa_paths, tokenizer=tokenizer)\n",
1177
+ "loader = torch.utils.data.DataLoader(dataset, batch_size=8)\n",
1178
+ "\n",
1179
+ "test_dataset = Dataset(squad_paths = [str(x) for x in Path('data/test_squad/').glob('**/*.txt')], \n",
1180
+ " natural_question_paths=None, \n",
1181
+ " hotpotqa_paths = None, tokenizer=tokenizer)\n",
1182
+ "test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4)"
1183
+ ]
1184
+ },
1185
+ {
1186
+ "cell_type": "code",
1187
+ "execution_count": 26,
1188
+ "id": "03a6de37",
1189
+ "metadata": {},
1190
+ "outputs": [],
1191
+ "source": [
1192
+ "model = QuestionDistilBERT(mod)"
1193
+ ]
1194
+ },
1195
+ {
1196
+ "cell_type": "code",
1197
+ "execution_count": 41,
1198
+ "id": "ed854b73",
1199
+ "metadata": {},
1200
+ "outputs": [],
1201
+ "source": [
1202
+ "from torch.optim import AdamW, RMSprop\n",
1203
+ "\n",
1204
+ "model.train()\n",
1205
+ "optim = RMSprop(model.parameters(), lr=1e-4)"
1206
+ ]
1207
+ },
1208
+ {
1209
+ "cell_type": "code",
1210
+ "execution_count": 42,
1211
+ "id": "79fdfcc9",
1212
+ "metadata": {},
1213
+ "outputs": [],
1214
+ "source": [
1215
+ "from torch.utils.tensorboard import SummaryWriter\n",
1216
+ "writer = SummaryWriter()"
1217
+ ]
1218
+ },
1219
+ {
1220
+ "cell_type": "code",
1221
+ "execution_count": null,
1222
+ "id": "f7bddb43",
1223
+ "metadata": {},
1224
+ "outputs": [
1225
+ {
1226
+ "data": {
1227
+ "application/vnd.jupyter.widget-view+json": {
1228
+ "model_id": "5e9e74167c4b4b22b3218f4ca3c5abf0",
1229
+ "version_major": 2,
1230
+ "version_minor": 0
1231
+ },
1232
+ "text/plain": [
1233
+ " 0%| | 0/21750 [00:00<?, ?it/s]"
1234
+ ]
1235
+ },
1236
+ "metadata": {},
1237
+ "output_type": "display_data"
1238
+ },
1239
+ {
1240
+ "name": "stdout",
1241
+ "output_type": "stream",
1242
+ "text": [
1243
+ "Mean Training Error 3.8791405910185013\n"
1244
+ ]
1245
+ },
1246
+ {
1247
+ "data": {
1248
+ "application/vnd.jupyter.widget-view+json": {
1249
+ "model_id": "f3ce562fc61d4bfc83a4860eb06bc20c",
1250
+ "version_major": 2,
1251
+ "version_minor": 0
1252
+ },
1253
+ "text/plain": [
1254
+ " 0%| | 0/1250 [00:00<?, ?it/s]"
1255
+ ]
1256
+ },
1257
+ "metadata": {},
1258
+ "output_type": "display_data"
1259
+ },
1260
+ {
1261
+ "name": "stdout",
1262
+ "output_type": "stream",
1263
+ "text": [
1264
+ "Mean Test Error 3.7705092002868654\n"
1265
+ ]
1266
+ },
1267
+ {
1268
+ "data": {
1269
+ "application/vnd.jupyter.widget-view+json": {
1270
+ "model_id": "2e84e21cedd446a0a5f5a40501711d1c",
1271
+ "version_major": 2,
1272
+ "version_minor": 0
1273
+ },
1274
+ "text/plain": [
1275
+ " 0%| | 0/21750 [00:00<?, ?it/s]"
1276
+ ]
1277
+ },
1278
+ "metadata": {},
1279
+ "output_type": "display_data"
1280
+ },
1281
+ {
1282
+ "name": "stdout",
1283
+ "output_type": "stream",
1284
+ "text": [
1285
+ "Mean Training Error 3.7389922174091996\n"
1286
+ ]
1287
+ },
1288
+ {
1289
+ "data": {
1290
+ "application/vnd.jupyter.widget-view+json": {
1291
+ "model_id": "07135c48be0146498cd37d767c1ee6ab",
1292
+ "version_major": 2,
1293
+ "version_minor": 0
1294
+ },
1295
+ "text/plain": [
1296
+ " 0%| | 0/1250 [00:00<?, ?it/s]"
1297
+ ]
1298
+ },
1299
+ "metadata": {},
1300
+ "output_type": "display_data"
1301
+ },
1302
+ {
1303
+ "name": "stdout",
1304
+ "output_type": "stream",
1305
+ "text": [
1306
+ "Mean Test Error 3.7443671816825868\n"
1307
+ ]
1308
+ },
1309
+ {
1310
+ "data": {
1311
+ "application/vnd.jupyter.widget-view+json": {
1312
+ "model_id": "e9a51fbabc7043c2819a68e247e4a3ec",
1313
+ "version_major": 2,
1314
+ "version_minor": 0
1315
+ },
1316
+ "text/plain": [
1317
+ " 0%| | 0/21750 [00:00<?, ?it/s]"
1318
+ ]
1319
+ },
1320
+ "metadata": {},
1321
+ "output_type": "display_data"
1322
+ },
1323
+ {
1324
+ "name": "stdout",
1325
+ "output_type": "stream",
1326
+ "text": [
1327
+ "Mean Training Error 3.7031057048117977\n"
1328
+ ]
1329
+ },
1330
+ {
1331
+ "data": {
1332
+ "application/vnd.jupyter.widget-view+json": {
1333
+ "model_id": "bfdbcc9fe32542a19c47bc1d7704400e",
1334
+ "version_major": 2,
1335
+ "version_minor": 0
1336
+ },
1337
+ "text/plain": [
1338
+ " 0%| | 0/1250 [00:00<?, ?it/s]"
1339
+ ]
1340
+ },
1341
+ "metadata": {},
1342
+ "output_type": "display_data"
1343
+ },
1344
+ {
1345
+ "name": "stdout",
1346
+ "output_type": "stream",
1347
+ "text": [
1348
+ "Mean Test Error 3.743248237323761\n"
1349
+ ]
1350
+ },
1351
+ {
1352
+ "data": {
1353
+ "application/vnd.jupyter.widget-view+json": {
1354
+ "model_id": "81fd1278b22643dc9fb3ac306533a240",
1355
+ "version_major": 2,
1356
+ "version_minor": 0
1357
+ },
1358
+ "text/plain": [
1359
+ " 0%| | 0/21750 [00:00<?, ?it/s]"
1360
+ ]
1361
+ },
1362
+ "metadata": {},
1363
+ "output_type": "display_data"
1364
+ },
1365
+ {
1366
+ "name": "stdout",
1367
+ "output_type": "stream",
1368
+ "text": [
1369
+ "Mean Training Error 3.6711661003430685\n"
1370
+ ]
1371
+ },
1372
+ {
1373
+ "data": {
1374
+ "application/vnd.jupyter.widget-view+json": {
1375
+ "model_id": "8b38d6cd44e048ec8bcd6b5cb86cce16",
1376
+ "version_major": 2,
1377
+ "version_minor": 0
1378
+ },
1379
+ "text/plain": [
1380
+ " 0%| | 0/1250 [00:00<?, ?it/s]"
1381
+ ]
1382
+ },
1383
+ "metadata": {},
1384
+ "output_type": "display_data"
1385
+ },
1386
+ {
1387
+ "name": "stdout",
1388
+ "output_type": "stream",
1389
+ "text": [
1390
+ "Mean Test Error 3.740310479736328\n"
1391
+ ]
1392
+ },
1393
+ {
1394
+ "data": {
1395
+ "application/vnd.jupyter.widget-view+json": {
1396
+ "model_id": "825248aa3f934f4aade9d973e6f3b43e",
1397
+ "version_major": 2,
1398
+ "version_minor": 0
1399
+ },
1400
+ "text/plain": [
1401
+ " 0%| | 0/21750 [00:00<?, ?it/s]"
1402
+ ]
1403
+ },
1404
+ "metadata": {},
1405
+ "output_type": "display_data"
1406
+ },
1407
+ {
1408
+ "name": "stdout",
1409
+ "output_type": "stream",
1410
+ "text": [
1411
+ "Mean Training Error 3.6591619139813827\n"
1412
+ ]
1413
+ },
1414
+ {
1415
+ "data": {
1416
+ "application/vnd.jupyter.widget-view+json": {
1417
+ "model_id": "edceb7af0ec6450997820967638c12db",
1418
+ "version_major": 2,
1419
+ "version_minor": 0
1420
+ },
1421
+ "text/plain": [
1422
+ " 0%| | 0/1250 [00:00<?, ?it/s]"
1423
+ ]
1424
+ },
1425
+ "metadata": {},
1426
+ "output_type": "display_data"
1427
+ },
1428
+ {
1429
+ "name": "stdout",
1430
+ "output_type": "stream",
1431
+ "text": [
1432
+ "Mean Test Error 3.8138498876571654\n"
1433
+ ]
1434
+ },
1435
+ {
1436
+ "data": {
1437
+ "application/vnd.jupyter.widget-view+json": {
1438
+ "model_id": "27e903eb0d0f4f949c234e4faf4277a1",
1439
+ "version_major": 2,
1440
+ "version_minor": 0
1441
+ },
1442
+ "text/plain": [
1443
+ " 0%| | 0/21750 [00:00<?, ?it/s]"
1444
+ ]
1445
+ },
1446
+ "metadata": {},
1447
+ "output_type": "display_data"
1448
+ }
1449
+ ],
1450
+ "source": [
1451
+ "epochs = 20\n",
1452
+ "\n",
1453
+ "for epoch in range(epochs):\n",
1454
+ " loop = tqdm(loader, leave=True)\n",
1455
+ " model.train()\n",
1456
+ " mean_training_error = []\n",
1457
+ " for batch in loop:\n",
1458
+ " optim.zero_grad()\n",
1459
+ " \n",
1460
+ " input_ids = batch['input_ids'].to(device)\n",
1461
+ " attention_mask = batch['attention_mask'].to(device)\n",
1462
+ " start = batch['start_positions'].to(device)\n",
1463
+ " end = batch['end_positions'].to(device)\n",
1464
+ " \n",
1465
+ " outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)\n",
1466
+ " \n",
1467
+ " loss = outputs['loss']\n",
1468
+ " loss.backward()\n",
1469
+ " \n",
1470
+ " optim.step()\n",
1471
+ " mean_training_error.append(loss.item())\n",
1472
+ " loop.set_description(f'Epoch {epoch}')\n",
1473
+ " loop.set_postfix(loss=loss.item())\n",
1474
+ " print(\"Mean Training Error\", np.mean(mean_training_error))\n",
1475
+ " writer.add_scalar(\"Loss/train\", np.mean(mean_training_error), epoch)\n",
1476
+ " \n",
1477
+ " loop = tqdm(test_loader, leave=True)\n",
1478
+ " model.eval()\n",
1479
+ " mean_test_error = []\n",
1480
+ " for batch in loop:\n",
1481
+ " \n",
1482
+ " input_ids = batch['input_ids'].to(device)\n",
1483
+ " attention_mask = batch['attention_mask'].to(device)\n",
1484
+ " start = batch['start_positions'].to(device)\n",
1485
+ " end = batch['end_positions'].to(device)\n",
1486
+ " \n",
1487
+ " outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)\n",
1488
+ " # print(torch.argmax(outputs['start_logits'],axis=1), torch.argmax(outputs['end_logits'], axis=1), start, end)\n",
1489
+ " loss = outputs['loss']\n",
1490
+ " \n",
1491
+ " mean_test_error.append(loss.item())\n",
1492
+ " loop.set_description(f'Epoch {epoch} Testset')\n",
1493
+ " loop.set_postfix(loss=loss.item())\n",
1494
+ " print(\"Mean Test Error\", np.mean(mean_test_error))\n",
1495
+ " writer.add_scalar(\"Loss/test\", np.mean(mean_test_error), epoch)"
1496
+ ]
1497
+ },
1498
+ {
1499
+ "cell_type": "code",
1500
+ "execution_count": 238,
1501
+ "id": "a9d6af2e",
1502
+ "metadata": {},
1503
+ "outputs": [],
1504
+ "source": [
1505
+ "writer.close()"
1506
+ ]
1507
+ },
1508
+ {
1509
+ "cell_type": "code",
1510
+ "execution_count": 33,
1511
+ "id": "ba43447e",
1512
+ "metadata": {},
1513
+ "outputs": [],
1514
+ "source": [
1515
+ "torch.save(model.state_dict(), \"distilbert_qa.model\")"
1516
+ ]
1517
+ },
1518
+ {
1519
+ "cell_type": "code",
1520
+ "execution_count": 34,
1521
+ "id": "ffc49aca",
1522
+ "metadata": {},
1523
+ "outputs": [
1524
+ {
1525
+ "data": {
1526
+ "text/plain": [
1527
+ "<All keys matched successfully>"
1528
+ ]
1529
+ },
1530
+ "execution_count": 34,
1531
+ "metadata": {},
1532
+ "output_type": "execute_result"
1533
+ }
1534
+ ],
1535
+ "source": [
1536
+ "model = QuestionDistilBERT(mod)\n",
1537
+ "model.load_state_dict(torch.load(\"distilbert_qa.model\"))"
1538
+ ]
1539
+ },
1540
+ {
1541
+ "cell_type": "code",
1542
+ "execution_count": 35,
1543
+ "id": "730a86c1",
1544
+ "metadata": {},
1545
+ "outputs": [
1546
+ {
1547
+ "name": "stderr",
1548
+ "output_type": "stream",
1549
+ "text": [
1550
+ "100%|鈻堚枅鈻堚枅鈻堚枅鈻堚枅鈻堚枅| 2500/2500 [02:57<00:00, 14.09it/s]"
1551
+ ]
1552
+ },
1553
+ {
1554
+ "name": "stdout",
1555
+ "output_type": "stream",
1556
+ "text": [
1557
+ "Mean EM: 0.0479\n",
1558
+ "Mean F-1: 0.08989175857485086\n"
1559
+ ]
1560
+ },
1561
+ {
1562
+ "name": "stderr",
1563
+ "output_type": "stream",
1564
+ "text": [
1565
+ "\n"
1566
+ ]
1567
+ }
1568
+ ],
1569
+ "source": [
1570
+ "eval_test_set(model, tokenizer, test_loader, device)"
1571
+ ]
1572
+ },
1573
+ {
1574
+ "cell_type": "markdown",
1575
+ "id": "bd1c7076",
1576
+ "metadata": {},
1577
+ "source": [
1578
+ "## Reuse Layer\n",
1579
+ "This was inspired by how well the original model with just one classification head worked. I felt like the main problem with the previous model was the lack of structure which was already in the layers, combined with the massive amount of resources needed for a Transformer.\n",
1580
+ "\n",
1581
+ "Hence, I tried cloning the last (and then last two) layers of the DistilBERT model, putting a classifier on top and using this as the head. The base DistilBERT model is completely frozen. This worked extremely well, while we only fine-tune about 21% of the parameters (14 Mio as opposed to 66 Mio!) we did before. Below you can see the results.\n",
1582
+ "\n",
1583
+ "### Last DistilBERT layer\n",
1584
+ "\n",
1585
+ "Dropout 0.1 and RMSprop 1e-4:\n",
1586
+ "* Mean EM: 0.3888\n",
1587
+ "* Mean F-1: 0.5122932744694068\n",
1588
+ "\n",
1589
+ "Dropout 0.25: very early stagnating\n",
1590
+ "* Mean EM: 0.3552\n",
1591
+ "* Mean F-1: 0.4711235721312687\n",
1592
+ "\n",
1593
+ "Dropout 0.15: seems to work well - training and test error stagnate around 1.7 and 1.8 but good generalisation (need to add more layers)\n",
1594
+ "* Mean EM: 0.4119\n",
1595
+ "* Mean F-1: 0.5296387232893214\n",
1596
+ "\n",
1597
+ "### Last DitilBERT layer + more Dense layers\n",
1598
+ "Dropout 0.15 + 4 dense layers((786-512)-(512-256)-(256-128)-(128-2)) & ReLU: doesn't work too well - stagnates at around 2.4\n",
1599
+ "\n",
1600
+ "### Last two DistilBERT layers\n",
1601
+ "Dropout 0.1 but last 2 DistilBERT layers: works very well, but early overfitting - maybe use more data\n",
1602
+ "* Mean EM: 0.458\n",
1603
+ "* Mean F-1: 0.6003368353673634\n",
1604
+ "\n",
1605
+ "Dropout 0.1 - last 2 distilbert layers: all data\n",
1606
+ "* Mean EM: 0.484\n",
1607
+ "* Mean F-1: 0.6344960035215299\n",
1608
+ "\n",
1609
+ "Dropout 0.15 - **BEST**\n",
1610
+ "* Mean EM: 0.5178\n",
1611
+ "* Mean F-1: 0.6671140689626448\n",
1612
+ "\n",
1613
+ "Dropout 0.2 - doesn't work too well\n",
1614
+ "* Mean EM: 0.4353\n",
1615
+ "* Mean F-1: 0.5776847879304647\n"
1616
+ ]
1617
+ },
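+ {
+ "cell_type": "markdown",
+ "id": "reuse-head-sketch",
+ "metadata": {},
+ "source": [
+ "Below is a minimal sketch of the reuse idea (illustrative only; the actual implementation is `ReuseQuestionDistilBERT` in `qa_model.py`, and the name `ReuseHeadSketch` is hypothetical): the pre-trained body is frozen, its last blocks are deep-copied as a trainable head, and a single linear layer maps every token to a start and an end logit.\n",
+ "\n",
+ "```python\n",
+ "import copy\n",
+ "import torch.nn as nn\n",
+ "\n",
+ "class ReuseHeadSketch(nn.Module):\n",
+ "    def __init__(self, distilbert, n_reused=2, dropout=0.15):\n",
+ "        super().__init__()\n",
+ "        # clone the last n transformer blocks first, so the clones stay trainable\n",
+ "        self.te = nn.ModuleList(copy.deepcopy(distilbert.transformer.layer[-n_reused:]))\n",
+ "        self.distilbert = distilbert\n",
+ "        for p in self.distilbert.parameters():\n",
+ "            p.requires_grad = False  # freeze the whole pre-trained body\n",
+ "        self.dropout = nn.Dropout(dropout)\n",
+ "        self.classifier = nn.Linear(768, 2)  # start and end logit per token\n",
+ "\n",
+ "    def forward(self, input_ids, attention_mask):\n",
+ "        h = self.distilbert(input_ids, attention_mask=attention_mask).last_hidden_state\n",
+ "        for block in self.te:\n",
+ "            h = block(h, attn_mask=attention_mask)[-1]  # HF TransformerBlock returns a tuple\n",
+ "        h = self.dropout(h)\n",
+ "        start_logits, end_logits = self.classifier(h).split(1, dim=-1)\n",
+ "        return start_logits.squeeze(-1), end_logits.squeeze(-1)\n",
+ "```\n",
+ "\n",
+ "Only the cloned blocks and the classifier receive gradients (the real class also computes the usual cross-entropy loss over the start/end positions), which is where the roughly 14 million trainable parameters reported by `count_parameters` below come from."
+ ]
+ },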
1618
+ {
1619
+ "cell_type": "code",
1620
+ "execution_count": 69,
1621
+ "id": "654e09e8",
1622
+ "metadata": {},
1623
+ "outputs": [],
1624
+ "source": [
1625
+ "dataset = Dataset(squad_paths = squad_paths, natural_question_paths=None, hotpotqa_paths=hotpotqa_paths, tokenizer=tokenizer)\n",
1626
+ "loader = torch.utils.data.DataLoader(dataset, batch_size=8)\n",
1627
+ "\n",
1628
+ "test_dataset = Dataset(squad_paths = [str(x) for x in Path('data/test_squad/').glob('**/*.txt')], \n",
1629
+ " natural_question_paths=None, \n",
1630
+ " hotpotqa_paths = None, tokenizer=tokenizer)\n",
1631
+ "test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4)"
1632
+ ]
1633
+ },
1634
+ {
1635
+ "cell_type": "code",
1636
+ "execution_count": 70,
1637
+ "id": "707c0cb5",
1638
+ "metadata": {},
1639
+ "outputs": [
1640
+ {
1641
+ "data": {
1642
+ "text/plain": [
1643
+ "ReuseQuestionDistilBERT(\n",
1644
+ " (te): ModuleList(\n",
1645
+ " (0): TransformerBlock(\n",
1646
+ " (attention): MultiHeadSelfAttention(\n",
1647
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1648
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1649
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1650
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1651
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1652
+ " )\n",
1653
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1654
+ " (ffn): FFN(\n",
1655
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1656
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
1657
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
1658
+ " (activation): GELUActivation()\n",
1659
+ " )\n",
1660
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1661
+ " )\n",
1662
+ " (1): TransformerBlock(\n",
1663
+ " (attention): MultiHeadSelfAttention(\n",
1664
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1665
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1666
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1667
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1668
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1669
+ " )\n",
1670
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1671
+ " (ffn): FFN(\n",
1672
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1673
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
1674
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
1675
+ " (activation): GELUActivation()\n",
1676
+ " )\n",
1677
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1678
+ " )\n",
1679
+ " )\n",
1680
+ " (distilbert): DistilBertModel(\n",
1681
+ " (embeddings): Embeddings(\n",
1682
+ " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
1683
+ " (position_embeddings): Embedding(512, 768)\n",
1684
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1685
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1686
+ " )\n",
1687
+ " (transformer): Transformer(\n",
1688
+ " (layer): ModuleList(\n",
1689
+ " (0): TransformerBlock(\n",
1690
+ " (attention): MultiHeadSelfAttention(\n",
1691
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1692
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1693
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1694
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1695
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1696
+ " )\n",
1697
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1698
+ " (ffn): FFN(\n",
1699
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1700
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
1701
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
1702
+ " (activation): GELUActivation()\n",
1703
+ " )\n",
1704
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1705
+ " )\n",
1706
+ " (1): TransformerBlock(\n",
1707
+ " (attention): MultiHeadSelfAttention(\n",
1708
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1709
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1710
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1711
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1712
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1713
+ " )\n",
1714
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1715
+ " (ffn): FFN(\n",
1716
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1717
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
1718
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
1719
+ " (activation): GELUActivation()\n",
1720
+ " )\n",
1721
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1722
+ " )\n",
1723
+ " (2): TransformerBlock(\n",
1724
+ " (attention): MultiHeadSelfAttention(\n",
1725
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1726
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1727
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1728
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1729
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1730
+ " )\n",
1731
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1732
+ " (ffn): FFN(\n",
1733
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1734
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
1735
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
1736
+ " (activation): GELUActivation()\n",
1737
+ " )\n",
1738
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1739
+ " )\n",
1740
+ " (3): TransformerBlock(\n",
1741
+ " (attention): MultiHeadSelfAttention(\n",
1742
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1743
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1744
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1745
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1746
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1747
+ " )\n",
1748
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1749
+ " (ffn): FFN(\n",
1750
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1751
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
1752
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
1753
+ " (activation): GELUActivation()\n",
1754
+ " )\n",
1755
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1756
+ " )\n",
1757
+ " (4): TransformerBlock(\n",
1758
+ " (attention): MultiHeadSelfAttention(\n",
1759
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1760
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1761
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1762
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1763
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1764
+ " )\n",
1765
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1766
+ " (ffn): FFN(\n",
1767
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1768
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
1769
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
1770
+ " (activation): GELUActivation()\n",
1771
+ " )\n",
1772
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1773
+ " )\n",
1774
+ " (5): TransformerBlock(\n",
1775
+ " (attention): MultiHeadSelfAttention(\n",
1776
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1777
+ " (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1778
+ " (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1779
+ " (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1780
+ " (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
1781
+ " )\n",
1782
+ " (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1783
+ " (ffn): FFN(\n",
1784
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
1785
+ " (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
1786
+ " (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
1787
+ " (activation): GELUActivation()\n",
1788
+ " )\n",
1789
+ " (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
1790
+ " )\n",
1791
+ " )\n",
1792
+ " )\n",
1793
+ " )\n",
1794
+ " (relu): ReLU()\n",
1795
+ " (dropout): Dropout(p=0.15, inplace=False)\n",
1796
+ " (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
1797
+ ")"
1798
+ ]
1799
+ },
1800
+ "execution_count": 70,
1801
+ "metadata": {},
1802
+ "output_type": "execute_result"
1803
+ }
1804
+ ],
1805
+ "source": [
1806
+ "model = DistilBertForMaskedLM.from_pretrained(\"distilbert-base-uncased\")\n",
1807
+ "config = DistilBertConfig.from_pretrained(\"distilbert-base-uncased\")\n",
1808
+ "mod = model.distilbert\n",
1809
+ "\n",
1810
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
1811
+ "model = ReuseQuestionDistilBERT(mod)\n",
1812
+ "model.to(device)"
1813
+ ]
1814
+ },
1815
+ {
1816
+ "cell_type": "code",
1817
+ "execution_count": 71,
1818
+ "id": "d2c6bff5",
1819
+ "metadata": {},
1820
+ "outputs": [
1821
+ {
1822
+ "name": "stdout",
1823
+ "output_type": "stream",
1824
+ "text": [
1825
+ "+-------------------------------+------------+\n",
1826
+ "| Modules | Parameters |\n",
1827
+ "+-------------------------------+------------+\n",
1828
+ "| te.0.attention.q_lin.weight | 589824 |\n",
1829
+ "| te.0.attention.q_lin.bias | 768 |\n",
1830
+ "| te.0.attention.k_lin.weight | 589824 |\n",
1831
+ "| te.0.attention.k_lin.bias | 768 |\n",
1832
+ "| te.0.attention.v_lin.weight | 589824 |\n",
1833
+ "| te.0.attention.v_lin.bias | 768 |\n",
1834
+ "| te.0.attention.out_lin.weight | 589824 |\n",
1835
+ "| te.0.attention.out_lin.bias | 768 |\n",
1836
+ "| te.0.sa_layer_norm.weight | 768 |\n",
1837
+ "| te.0.sa_layer_norm.bias | 768 |\n",
1838
+ "| te.0.ffn.lin1.weight | 2359296 |\n",
1839
+ "| te.0.ffn.lin1.bias | 3072 |\n",
1840
+ "| te.0.ffn.lin2.weight | 2359296 |\n",
1841
+ "| te.0.ffn.lin2.bias | 768 |\n",
1842
+ "| te.0.output_layer_norm.weight | 768 |\n",
1843
+ "| te.0.output_layer_norm.bias | 768 |\n",
1844
+ "| te.1.attention.q_lin.weight | 589824 |\n",
1845
+ "| te.1.attention.q_lin.bias | 768 |\n",
1846
+ "| te.1.attention.k_lin.weight | 589824 |\n",
1847
+ "| te.1.attention.k_lin.bias | 768 |\n",
1848
+ "| te.1.attention.v_lin.weight | 589824 |\n",
1849
+ "| te.1.attention.v_lin.bias | 768 |\n",
1850
+ "| te.1.attention.out_lin.weight | 589824 |\n",
1851
+ "| te.1.attention.out_lin.bias | 768 |\n",
1852
+ "| te.1.sa_layer_norm.weight | 768 |\n",
1853
+ "| te.1.sa_layer_norm.bias | 768 |\n",
1854
+ "| te.1.ffn.lin1.weight | 2359296 |\n",
1855
+ "| te.1.ffn.lin1.bias | 3072 |\n",
1856
+ "| te.1.ffn.lin2.weight | 2359296 |\n",
1857
+ "| te.1.ffn.lin2.bias | 768 |\n",
1858
+ "| te.1.output_layer_norm.weight | 768 |\n",
1859
+ "| te.1.output_layer_norm.bias | 768 |\n",
1860
+ "| classifier.weight | 1536 |\n",
1861
+ "| classifier.bias | 2 |\n",
1862
+ "+-------------------------------+------------+\n",
1863
+ "Total Trainable Params: 14177282\n"
1864
+ ]
1865
+ },
1866
+ {
1867
+ "data": {
1868
+ "text/plain": [
1869
+ "14177282"
1870
+ ]
1871
+ },
1872
+ "execution_count": 71,
1873
+ "metadata": {},
1874
+ "output_type": "execute_result"
1875
+ }
1876
+ ],
1877
+ "source": [
1878
+ "count_parameters(model)"
1879
+ ]
1880
+ },
1881
+ {
1882
+ "cell_type": "markdown",
1883
+ "id": "c386c2eb",
1884
+ "metadata": {},
1885
+ "source": [
1886
+ "### Testing the Model"
1887
+ ]
1888
+ },
1889
+ {
1890
+ "cell_type": "code",
1891
+ "execution_count": 72,
1892
+ "id": "818deed3",
1893
+ "metadata": {},
1894
+ "outputs": [],
1895
+ "source": [
1896
+ "# get smaller dataset\n",
1897
+ "batch_size = 8\n",
1898
+ "test_ds = Dataset(squad_paths = squad_paths[:2], natural_question_paths=None, hotpotqa_paths=None, tokenizer=tokenizer)\n",
1899
+ "test_ds_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)\n",
1900
+ "optim=torch.optim.Adam(model.parameters())"
1901
+ ]
1902
+ },
1903
+ {
1904
+ "cell_type": "code",
1905
+ "execution_count": 73,
1906
+ "id": "9da40760",
1907
+ "metadata": {},
1908
+ "outputs": [
1909
+ {
1910
+ "name": "stdout",
1911
+ "output_type": "stream",
1912
+ "text": [
1913
+ "Passed\n"
1914
+ ]
1915
+ }
1916
+ ],
1917
+ "source": [
1918
+ "test_model(model, optim, test_ds_loader, device)"
1919
+ ]
1920
+ },
1921
+ {
1922
+ "cell_type": "markdown",
1923
+ "id": "c3f80248",
1924
+ "metadata": {},
1925
+ "source": [
1926
+ "### Model Training"
1927
+ ]
1928
+ },
1929
+ {
1930
+ "cell_type": "code",
1931
+ "execution_count": 24,
1932
+ "id": "e1adabe6",
1933
+ "metadata": {},
1934
+ "outputs": [],
1935
+ "source": [
1936
+ "from torch.optim import AdamW, RMSprop\n",
1937
+ "\n",
1938
+ "model.train()\n",
1939
+ "optim = AdamW(model.parameters(), lr=1e-4)"
1940
+ ]
1941
+ },
1942
+ {
1943
+ "cell_type": "code",
1944
+ "execution_count": 25,
1945
+ "id": "efe1cbd5",
1946
+ "metadata": {},
1947
+ "outputs": [
1948
+ {
1949
+ "data": {
1950
+ "application/vnd.jupyter.widget-view+json": {
1951
+ "model_id": "8785757b04214102830ded36c1392c8d",
1952
+ "version_major": 2,
1953
+ "version_minor": 0
1954
+ },
1955
+ "text/plain": [
1956
+ " 0%| | 0/35000 [00:00<?, ?it/s]"
1957
+ ]
1958
+ },
1959
+ "metadata": {},
1960
+ "output_type": "display_data"
1961
+ },
1962
+ {
1963
+ "name": "stdout",
1964
+ "output_type": "stream",
1965
+ "text": [
1966
+ "Mean Training Error 2.6535016193100383\n"
1967
+ ]
1968
+ },
1969
+ {
1970
+ "data": {
1971
+ "application/vnd.jupyter.widget-view+json": {
1972
+ "model_id": "836f5365498642fa9ae891a86dca5892",
1973
+ "version_major": 2,
1974
+ "version_minor": 0
1975
+ },
1976
+ "text/plain": [
1977
+ " 0%| | 0/2500 [00:00<?, ?it/s]"
1978
+ ]
1979
+ },
1980
+ "metadata": {},
1981
+ "output_type": "display_data"
1982
+ },
1983
+ {
1984
+ "name": "stdout",
1985
+ "output_type": "stream",
1986
+ "text": [
1987
+ "Mean Test Error 2.384517493388057\n"
1988
+ ]
1989
+ },
1990
+ {
1991
+ "data": {
1992
+ "application/vnd.jupyter.widget-view+json": {
1993
+ "model_id": "981e1cef83a1477e920d1cdbffdfcde1",
1994
+ "version_major": 2,
1995
+ "version_minor": 0
1996
+ },
1997
+ "text/plain": [
1998
+ " 0%| | 0/35000 [00:00<?, ?it/s]"
1999
+ ]
2000
+ },
2001
+ "metadata": {},
2002
+ "output_type": "display_data"
2003
+ },
2004
+ {
2005
+ "name": "stdout",
2006
+ "output_type": "stream",
2007
+ "text": [
2008
+ "Mean Training Error 2.172889394424643\n"
2009
+ ]
2010
+ },
2011
+ {
2012
+ "data": {
2013
+ "application/vnd.jupyter.widget-view+json": {
2014
+ "model_id": "20a785e7fefb43239f1120992d2c3416",
2015
+ "version_major": 2,
2016
+ "version_minor": 0
2017
+ },
2018
+ "text/plain": [
2019
+ " 0%| | 0/2500 [00:00<?, ?it/s]"
2020
+ ]
2021
+ },
2022
+ "metadata": {},
2023
+ "output_type": "display_data"
2024
+ },
2025
+ {
2026
+ "name": "stdout",
2027
+ "output_type": "stream",
2028
+ "text": [
2029
+ "Mean Test Error 2.013008696398139\n"
2030
+ ]
2031
+ },
2032
+ {
2033
+ "data": {
2034
+ "application/vnd.jupyter.widget-view+json": {
2035
+ "model_id": "47831e65b1ed4be78e8e7cb24068b0c3",
2036
+ "version_major": 2,
2037
+ "version_minor": 0
2038
+ },
2039
+ "text/plain": [
2040
+ " 0%| | 0/35000 [00:00<?, ?it/s]"
2041
+ ]
2042
+ },
2043
+ "metadata": {},
2044
+ "output_type": "display_data"
2045
+ },
2046
+ {
2047
+ "name": "stdout",
2048
+ "output_type": "stream",
2049
+ "text": [
2050
+ "Mean Training Error 1.9743544759827\n"
2051
+ ]
2052
+ },
2053
+ {
2054
+ "data": {
2055
+ "application/vnd.jupyter.widget-view+json": {
2056
+ "model_id": "15904a3f930249fb944ea87184676e14",
2057
+ "version_major": 2,
2058
+ "version_minor": 0
2059
+ },
2060
+ "text/plain": [
2061
+ " 0%| | 0/2500 [00:00<?, ?it/s]"
2062
+ ]
2063
+ },
2064
+ "metadata": {},
2065
+ "output_type": "display_data"
2066
+ },
2067
+ {
2068
+ "name": "stdout",
2069
+ "output_type": "stream",
2070
+ "text": [
2071
+ "Mean Test Error 1.8922049684919418\n"
2072
+ ]
2073
+ },
2074
+ {
2075
+ "data": {
2076
+ "application/vnd.jupyter.widget-view+json": {
2077
+ "model_id": "108bdbf644d94d78910195992b9e2652",
2078
+ "version_major": 2,
2079
+ "version_minor": 0
2080
+ },
2081
+ "text/plain": [
2082
+ " 0%| | 0/35000 [00:00<?, ?it/s]"
2083
+ ]
2084
+ },
2085
+ "metadata": {},
2086
+ "output_type": "display_data"
2087
+ },
2088
+ {
2089
+ "name": "stdout",
2090
+ "output_type": "stream",
2091
+ "text": [
2092
+ "Mean Training Error 1.857202093189742\n"
2093
+ ]
2094
+ },
2095
+ {
2096
+ "data": {
2097
+ "application/vnd.jupyter.widget-view+json": {
2098
+ "model_id": "d6a75a6ab40d4a2599b7511bfc60bf83",
2099
+ "version_major": 2,
2100
+ "version_minor": 0
2101
+ },
2102
+ "text/plain": [
2103
+ " 0%| | 0/2500 [00:00<?, ?it/s]"
2104
+ ]
2105
+ },
2106
+ "metadata": {},
2107
+ "output_type": "display_data"
2108
+ },
2109
+ {
2110
+ "name": "stdout",
2111
+ "output_type": "stream",
2112
+ "text": [
2113
+ "Mean Test Error 1.793771461571753\n"
2114
+ ]
2115
+ },
2116
+ {
2117
+ "data": {
2118
+ "application/vnd.jupyter.widget-view+json": {
2119
+ "model_id": "d3468a6ba72a4f42b0e7cc77ee0a0011",
2120
+ "version_major": 2,
2121
+ "version_minor": 0
2122
+ },
2123
+ "text/plain": [
2124
+ " 0%| | 0/35000 [00:00<?, ?it/s]"
2125
+ ]
2126
+ },
2127
+ "metadata": {},
2128
+ "output_type": "display_data"
2129
+ },
2130
+ {
2131
+ "name": "stdout",
2132
+ "output_type": "stream",
2133
+ "text": [
2134
+ "Mean Training Error 1.7750537034896867\n"
2135
+ ]
2136
+ },
2137
+ {
2138
+ "data": {
2139
+ "application/vnd.jupyter.widget-view+json": {
2140
+ "model_id": "8aca0aa529d2452e8bd29fe7ada934f2",
2141
+ "version_major": 2,
2142
+ "version_minor": 0
2143
+ },
2144
+ "text/plain": [
2145
+ " 0%| | 0/2500 [00:00<?, ?it/s]"
2146
+ ]
2147
+ },
2148
+ "metadata": {},
2149
+ "output_type": "display_data"
2150
+ },
2151
+ {
2152
+ "name": "stdout",
2153
+ "output_type": "stream",
2154
+ "text": [
2155
+ "Mean Test Error 1.7466133671954274\n"
2156
+ ]
2157
+ },
2158
+ {
2159
+ "data": {
2160
+ "application/vnd.jupyter.widget-view+json": {
2161
+ "model_id": "e09abdfa63c841ce97f445ba9b3eeaa8",
2162
+ "version_major": 2,
2163
+ "version_minor": 0
2164
+ },
2165
+ "text/plain": [
2166
+ " 0%| | 0/35000 [00:00<?, ?it/s]"
2167
+ ]
2168
+ },
2169
+ "metadata": {},
2170
+ "output_type": "display_data"
2171
+ },
2172
+ {
2173
+ "name": "stdout",
2174
+ "output_type": "stream",
2175
+ "text": [
2176
+ "Mean Training Error 1.7097622096568346\n"
2177
+ ]
2178
+ },
2179
+ {
2180
+ "data": {
2181
+ "application/vnd.jupyter.widget-view+json": {
2182
+ "model_id": "0f49dd32d33e4f398be0942a59d735ce",
2183
+ "version_major": 2,
2184
+ "version_minor": 0
2185
+ },
2186
+ "text/plain": [
2187
+ " 0%| | 0/2500 [00:00<?, ?it/s]"
2188
+ ]
2189
+ },
2190
+ "metadata": {},
2191
+ "output_type": "display_data"
2192
+ },
2193
+ {
2194
+ "name": "stdout",
2195
+ "output_type": "stream",
2196
+ "text": [
2197
+ "Mean Test Error 1.7642206047609448\n"
2198
+ ]
2199
+ },
2200
+ {
2201
+ "data": {
2202
+ "application/vnd.jupyter.widget-view+json": {
2203
+ "model_id": "a493dd70ffb64cd19830e5dc98607979",
2204
+ "version_major": 2,
2205
+ "version_minor": 0
2206
+ },
2207
+ "text/plain": [
2208
+ " 0%| | 0/35000 [00:00<?, ?it/s]"
2209
+ ]
2210
+ },
2211
+ "metadata": {},
2212
+ "output_type": "display_data"
2213
+ },
2214
+ {
2215
+ "name": "stderr",
2216
+ "output_type": "stream",
2217
+ "text": [
2218
+ "\n",
2219
+ "KeyboardInterrupt\n",
2220
+ "\n"
2221
+ ]
2222
+ }
2223
+ ],
2224
+ "source": [
2225
+ "epochs = 16\n",
2226
+ "\n",
2227
+ "for epoch in range(epochs):\n",
2228
+ " loop = tqdm(loader, leave=True)\n",
2229
+ " model.train()\n",
2230
+ " mean_training_error = []\n",
2231
+ " for batch in loop:\n",
2232
+ " optim.zero_grad()\n",
2233
+ " \n",
2234
+ " input_ids = batch['input_ids'].to(device)\n",
2235
+ " attention_mask = batch['attention_mask'].to(device)\n",
2236
+ " start = batch['start_positions'].to(device)\n",
2237
+ " end = batch['end_positions'].to(device)\n",
2238
+ " \n",
2239
+ " outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)\n",
2240
+ " # print(torch.argmax(outputs['start_logits'],axis=1), torch.argmax(outputs['end_logits'], axis=1), start, end)\n",
2241
+ " loss = outputs['loss']\n",
2242
+ " loss.backward()\n",
2243
+ " # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)\n",
2244
+ " optim.step()\n",
2245
+ " mean_training_error.append(loss.item())\n",
2246
+ " loop.set_description(f'Epoch {epoch}')\n",
2247
+ " loop.set_postfix(loss=loss.item())\n",
2248
+ " print(\"Mean Training Error\", np.mean(mean_training_error))\n",
2249
+ " \n",
2250
+ " loop = tqdm(test_loader, leave=True)\n",
2251
+ " model.eval()\n",
2252
+ " mean_test_error = []\n",
2253
+ " for batch in loop:\n",
2254
+ " \n",
2255
+ " input_ids = batch['input_ids'].to(device)\n",
2256
+ " attention_mask = batch['attention_mask'].to(device)\n",
2257
+ " start = batch['start_positions'].to(device)\n",
2258
+ " end = batch['end_positions'].to(device)\n",
2259
+ " \n",
2260
+ " outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)\n",
2261
+ " # print(torch.argmax(outputs['start_logits'],axis=1), torch.argmax(outputs['end_logits'], axis=1), start, end)\n",
2262
+ " loss = outputs['loss']\n",
2263
+ " \n",
2264
+ " mean_test_error.append(loss.item())\n",
2265
+ " loop.set_description(f'Epoch {epoch} Testset')\n",
2266
+ " loop.set_postfix(loss=loss.item())\n",
2267
+ " print(\"Mean Test Error\", np.mean(mean_test_error))\n",
2268
+ " torch.save(model.state_dict(), \"distilbert_reuse_{}\".format(epoch))"
2269
+ ]
2270
+ },
2271
+ {
2272
+ "cell_type": "code",
2273
+ "execution_count": 48,
2274
+ "id": "fdf37d18",
2275
+ "metadata": {},
2276
+ "outputs": [],
2277
+ "source": [
2278
+ "torch.save(model.state_dict(), \"distilbert_reuse.model\")"
2279
+ ]
2280
+ },
2281
+ {
2282
+ "cell_type": "code",
2283
+ "execution_count": 49,
2284
+ "id": "d1cfded4",
2285
+ "metadata": {},
2286
+ "outputs": [],
2287
+ "source": [
2288
+ "m = ReuseQuestionDistilBERT(mod)\n",
2289
+ "m.load_state_dict(torch.load(\"distilbert_reuse.model\"))\n",
2290
+ "model = m"
2291
+ ]
2292
+ },
2293
+ {
2294
+ "cell_type": "code",
2295
+ "execution_count": 47,
2296
+ "id": "233bdc18",
2297
+ "metadata": {},
2298
+ "outputs": [
2299
+ {
2300
+ "name": "stderr",
2301
+ "output_type": "stream",
2302
+ "text": [
2303
+ "100%|鈻堚枅鈻堚枅鈻堚枅鈻堚枅鈻堚枅| 2500/2500 [02:51<00:00, 14.59it/s]"
2304
+ ]
2305
+ },
2306
+ {
2307
+ "name": "stdout",
2308
+ "output_type": "stream",
2309
+ "text": [
2310
+ "Mean EM: 0.5178\n",
2311
+ "Mean F-1: 0.6671140689626448\n"
2312
+ ]
2313
+ },
2314
+ {
2315
+ "name": "stderr",
2316
+ "output_type": "stream",
2317
+ "text": [
2318
+ "\n"
2319
+ ]
2320
+ }
2321
+ ],
2322
+ "source": [
2323
+ "eval_test_set(model, tokenizer, test_loader, device)"
2324
+ ]
2325
+ },
2326
+ {
2327
+ "cell_type": "code",
2328
+ "execution_count": null,
2329
+ "id": "0fb1ce9e",
2330
+ "metadata": {},
2331
+ "outputs": [],
2332
+ "source": []
2333
+ }
2334
+ ],
2335
+ "metadata": {
2336
+ "kernelspec": {
2337
+ "display_name": "Python 3.10.8 ('venv': venv)",
2338
+ "language": "python",
2339
+ "name": "python3"
2340
+ },
2341
+ "language_info": {
2342
+ "codemirror_mode": {
2343
+ "name": "ipython",
2344
+ "version": 3
2345
+ },
2346
+ "file_extension": ".py",
2347
+ "mimetype": "text/x-python",
2348
+ "name": "python",
2349
+ "nbconvert_exporter": "python",
2350
+ "pygments_lexer": "ipython3",
2351
+ "version": "3.10.8"
2352
+ },
2353
+ "toc": {
2354
+ "base_numbering": 1,
2355
+ "nav_menu": {},
2356
+ "number_sections": true,
2357
+ "sideBar": true,
2358
+ "skip_h1_title": false,
2359
+ "title_cell": "Table of Contents",
2360
+ "title_sidebar": "Contents",
2361
+ "toc_cell": false,
2362
+ "toc_position": {},
2363
+ "toc_section_display": true,
2364
+ "toc_window_display": false
2365
+ },
2366
+ "varInspector": {
2367
+ "cols": {
2368
+ "lenName": 16,
2369
+ "lenType": 16,
2370
+ "lenVar": 40
2371
+ },
2372
+ "kernels_config": {
2373
+ "python": {
2374
+ "delete_cmd_postfix": "",
2375
+ "delete_cmd_prefix": "del ",
2376
+ "library": "var_list.py",
2377
+ "varRefreshCmd": "print(var_dic_list())"
2378
+ },
2379
+ "r": {
2380
+ "delete_cmd_postfix": ") ",
2381
+ "delete_cmd_prefix": "rm(",
2382
+ "library": "var_list.r",
2383
+ "varRefreshCmd": "cat(var_dic_list()) "
2384
+ }
2385
+ },
2386
+ "types_to_exclude": [
2387
+ "module",
2388
+ "function",
2389
+ "builtin_function_or_method",
2390
+ "instance",
2391
+ "_Feature"
2392
+ ],
2393
+ "window_display": false
2394
+ },
2395
+ "vscode": {
2396
+ "interpreter": {
2397
+ "hash": "85bf9c14e9ba73b783ed1274d522bec79eb0b2b739090180d8ce17bb11aff4aa"
2398
+ }
2399
+ }
2400
+ },
2401
+ "nbformat": 4,
2402
+ "nbformat_minor": 5
2403
+ }
requirements.txt ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==1.3.0
2
+ aiohttp==3.8.3
3
+ aiosignal==1.2.0
4
+ altair==4.2.0
5
+ apache-beam>=2.41.0
6
+ argon2-cffi==21.3.0
7
+ argon2-cffi-bindings==21.2.0
8
+ asttokens==2.0.8
9
+ async-timeout==4.0.2
10
+ attrs==22.1.0
11
+ autopep8==1.7.0
12
+ backcall==0.2.0
13
+ beautifulsoup4==4.11.1
14
+ bleach==5.0.1
15
+ blinker==1.5
16
+ cachetools==5.2.0
17
+ certifi==2022.9.24
18
+ cffi==1.15.1
19
+ charset-normalizer==2.1.1
20
+ click==8.1.3
21
+ cloudpickle==2.2.0
22
+ commonmark==0.9.1
23
+ contourpy==1.0.5
24
+ crcmod==1.7
25
+ cycler==0.11.0
26
+ datasets==2.5.2
27
+ debugpy==1.6.3
28
+ decorator==5.1.1
29
+ defusedxml==0.7.1
30
+ dill==0.3.1.1
31
+ docopt==0.6.2
32
+ entrypoints==0.4
33
+ executing==1.1.0
34
+ fastavro==1.6.1
35
+ fastjsonschema==2.16.2
36
+ filelock==3.8.0
37
+ fonttools==4.37.4
38
+ frozenlist==1.3.1
39
+ fsspec==2022.8.2
40
+ gitdb==4.0.10
41
+ GitPython==3.1.29
42
+ google-auth==2.13.0
43
+ google-auth-oauthlib==0.4.6
44
+ grpcio==1.49.1
45
+ hdfs==2.7.0
46
+ httplib2==0.20.4
47
+ huggingface-hub==0.10.0
48
+ idna==3.4
49
+ importlib-metadata==5.1.0
50
+ ipykernel==6.16.0
51
+ ipython==8.5.0
52
+ ipython-genutils==0.2.0
53
+ ipywidgets==8.0.2
54
+ jedi==0.18.1
55
+ Jinja2==3.1.2
56
+ joblib==1.2.0
57
+ jsonschema==4.16.0
58
+ jupyter==1.0.0
59
+ jupyter-console==6.4.4
60
+ jupyter-contrib-core==0.4.0
61
+ jupyter-contrib-nbextensions==0.5.1
62
+ jupyter-highlight-selected-word==0.2.0
63
+ jupyter-latex-envs==1.4.6
64
+ jupyter-nbextensions-configurator==0.5.0
65
+ jupyter_client==7.3.5
66
+ jupyter_core==4.11.2
67
+ jupyterlab-pygments==0.2.2
68
+ jupyterlab-widgets==3.0.3
69
+ kiwisolver==1.4.4
70
+ lesscpy==0.15.1
71
+ lxml==4.9.1
72
+ Markdown==3.4.1
73
+ MarkupSafe==2.1.1
74
+ matplotlib==3.6.1
75
+ matplotlib-inline==0.1.6
76
+ mistune==2.0.4
77
+ multidict==6.0.2
78
+ multiprocess==0.70.9
79
+ mwparserfromhell==0.6.4
80
+ nbclient==0.7.0
81
+ nbconvert==7.2.1
82
+ nbformat==5.6.1
83
+ nest-asyncio==1.5.6
84
+ notebook==6.4.12
85
+ numpy==1.22.4
86
+ oauthlib==3.2.2
87
+ orjson==3.9.7
88
+ packaging==21.3
89
+ pandas==1.5.0
90
+ pandocfilters==1.5.0
91
+ parso==0.8.3
92
+ pexpect==4.8.0
93
+ pickleshare==0.7.5
94
+ Pillow==9.2.0
95
+ ply==3.11
96
+ prettytable==3.4.1
97
+ prometheus-client==0.14.1
98
+ prompt-toolkit==3.0.31
99
+ proto-plus==1.22.1
100
+ protobuf==3.19.6
101
+ psutil==5.9.2
102
+ ptyprocess==0.7.0
103
+ pure-eval==0.2.2
104
+ pyarrow==7.0.0
105
+ pyasn1==0.4.8
106
+ pyasn1-modules==0.2.8
107
+ pycodestyle==2.9.1
108
+ pycparser==2.21
109
+ pydeck==0.8.0
110
+ pydot==1.4.2
111
+ Pygments==2.13.0
112
+ pymongo==3.12.3
113
+ Pympler==1.0.1
114
+ pyparsing==3.0.9
115
+ pyrsistent==0.18.1
116
+ python-dateutil==2.8.2
117
+ pytz==2022.4
118
+ pytz-deprecation-shim==0.1.0.post0
119
+ PyYAML==6.0
120
+ pyzmq==24.0.1
121
+ qtconsole==5.3.2
122
+ QtPy==2.2.1
123
+ regex==2022.9.13
124
+ requests==2.28.1
125
+ requests-oauthlib==1.3.1
126
+ responses==0.18.0
127
+ rich==12.6.0
128
+ rsa==4.9
129
+ scikit-learn==1.1.2
130
+ scipy==1.9.1
131
+ semver==2.13.0
132
+ Send2Trash==1.8.0
133
+ six==1.16.0
134
+ smmap==5.0.0
135
+ soupsieve==2.3.2.post1
136
+ stack-data==0.5.1
137
+ streamlit==1.15.2
138
+ tensorboard==2.10.1
139
+ tensorboard-data-server==0.6.1
140
+ tensorboard-plugin-wit==1.8.1
141
+ terminado==0.16.0
142
+ threadpoolctl==3.1.0
143
+ tinycss2==1.1.1
144
+ tokenizers==0.12.1
145
+ toml==0.10.2
146
+ toolz==0.12.0
147
+ torch==1.12.1
148
+ torchaudio==0.12.1
149
+ torchsummary==1.5.1
150
+ torchtest==0.5
151
+ torchvision==0.13.1
152
+ tornado==6.2
153
+ tqdm==4.64.1
154
+ traitlets==5.4.0
155
+ transformers==4.22.2
156
+ typing_extensions==4.4.0
157
+ tzdata==2022.7
158
+ tzlocal==4.2
159
+ urllib3==1.26.12
160
+ validators==0.20.0
161
+ watchdog==2.2.0
162
+ wcwidth==0.2.5
163
+ webencodings==0.5.1
164
+ Werkzeug==2.2.2
165
+ widgetsnbextension==4.0.3
166
+ xxhash==3.0.0
167
+ yarl==1.8.1
168
+ zipp==3.11.0
util.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import numpy as np
3
+ from prettytable import PrettyTable
4
+ from tqdm import tqdm
5
+ import torch
6
+
7
+
8
+ def normalize_text(s):
9
+ """
10
+ Removes articles and punctuation and standardizes whitespace; these are all typical text-processing steps.
11
+ Copied from: https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html#Metrics-for-QA
12
+ :param s: string to clean
13
+ :return: cleaned string
14
+ """
15
+ import string, re
16
+
17
+ def remove_articles(text):
18
+ regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
19
+ return re.sub(regex, " ", text)
20
+
21
+ def white_space_fix(text):
22
+ return " ".join(text.split())
23
+
24
+ def remove_punc(text):
25
+ exclude = set(string.punctuation)
26
+ return "".join(ch for ch in text if ch not in exclude)
27
+
28
+ def lower(text):
29
+ return text.lower()
30
+
31
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
32
+
33
+
34
+ def compute_exact_match(prediction, truth):
35
+ """
36
+ Returns 1 if the prediction is an exact match with the ground truth, else 0
37
+ Retrieved from: https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html#Metrics-for-QA
38
+ :param prediction: predicted answer
39
+ :param truth: ground truth
40
+ :return: 1 if exact match, else 0
41
+ """
42
+ return int(normalize_text(prediction) == normalize_text(truth))
43
+
44
+
45
+ def compute_f1(prediction, truth):
46
+ """
47
+ Computes the F-1 score of a prediction, based on the tokens
48
+ Retrieved from: https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html#Metrics-for-QA
49
+ :param prediction: predicted answer
50
+ :param truth: ground truth
51
+ :return: the f-1 score of the prediction
52
+ """
53
+ pred_tokens = normalize_text(prediction).split()
54
+ truth_tokens = normalize_text(truth).split()
55
+
56
+ # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
57
+ if len(pred_tokens) == 0 or len(truth_tokens) == 0:
58
+ return int(pred_tokens == truth_tokens)
59
+
60
+ # get tokens that are in the prediction and gt
61
+ common_tokens = set(pred_tokens) & set(truth_tokens)
62
+
63
+ # if there are no common tokens then f1 = 0
64
+ if len(common_tokens) == 0:
65
+ return 0
66
+
67
+ # calculate precision and recall
68
+ prec = len(common_tokens) / len(pred_tokens)
69
+ rec = len(common_tokens) / len(truth_tokens)
70
+
71
+ return 2 * (prec * rec) / (prec + rec)
72
+
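+ # Example usage (illustrative):
+ #   compute_exact_match('in the park', 'the park')  ->  0
+ #   compute_f1('in the park', 'the park')           ->  0.667 (= 2/3)
+ # normalize_text strips the article 'the', so the token sets are
+ # {'in', 'park'} vs {'park'}: precision 0.5, recall 1.0, F1 = 2/3.
+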
73
+ def eval_test_set(model, tokenizer, test_loader, device):
74
+ """
75
+ Calculates the mean EM and mean F-1 score on the test set
76
+ :param model: pytorch model
77
+ :param tokenizer: tokenizer used to encode the samples
78
+ :param test_loader: dataloader object with test data
79
+ :param device: device the model is on
80
+ """
81
+ mean_em = []
82
+ mean_f1 = []
83
+ model.to(device)
84
+ model.eval()
85
+ for batch in tqdm(test_loader):
86
+ # get test data and transfer to device
87
+ input_ids = batch['input_ids'].to(device)
88
+ attention_mask = batch['attention_mask'].to(device)
89
+ start = batch['start_positions'].to(device)
90
+ end = batch['end_positions'].to(device)
91
+
92
+ # predict
93
+ outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
94
+
95
+ # iterate over samples, calculate EM and F-1 for all
96
+ for input_i, s, e, trues, truee in zip(input_ids, outputs['start_logits'], outputs['end_logits'], start, end):
97
+ # get predicted start and end token positions (argmax over the logits)
98
+ start_logits = torch.argmax(s)
99
+ end_logits = torch.argmax(e)
100
+
101
+ # get predicted answer as string
102
+ ans_tokens = input_i[start_logits: end_logits + 1]
103
+ answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
104
+ predicted = tokenizer.convert_tokens_to_string(answer_tokens)
105
+
106
+ # get ground truth as string
107
+ ans_tokens = input_i[trues: truee + 1]
108
+ answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
109
+ true = tokenizer.convert_tokens_to_string(answer_tokens)
110
+
111
+ # compute score
112
+ em_score = compute_exact_match(predicted, true)
113
+ f1_score = compute_f1(predicted, true)
114
+ mean_em.append(em_score)
115
+ mean_f1.append(f1_score)
116
+ print("Mean EM: ", np.mean(mean_em))
117
+ print("Mean F-1: ", np.mean(mean_f1))
118
+
119
+ def count_parameters(model):
120
+ """
121
+ This function prints statistics regarding the trainable parameters
122
+ :param model: pytorch model
123
+ :return: total number of trainable parameters
124
+ """
125
+ table = PrettyTable(["Modules", "Parameters"])
126
+ total_params = 0
127
+ for name, parameter in model.named_parameters():
128
+ if not parameter.requires_grad: continue
129
+ params = parameter.numel()
130
+ table.add_row([name, params])
131
+ total_params += params
132
+ print(table)
133
+ print(f"Total Trainable Params: {total_params}")
134
+ return total_params