Upload sd_token_similarity_calculator.ipynb

sd_token_similarity_calculator.ipynb  CHANGED  (+539 -505)
@@ -29,143 +29,9 @@
  "cell_type": "code",
  "source": [
  "# @title ✳️ Load/initialize values\n",
- "# Load the tokens into the colab\n",
- "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
- "import torch\n",
- "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
- "from torch import linalg as LA\n",
- "%cd /content/sd_tokens\n",
- "token = torch.load('sd15_tensors.pt', map_location= torch.device('cpu'), weights_only=True)\n",
- "#-----#\n",
- "VOCAB_FILENAME = 'tokens_most_similiar_to_girl'\n",
- "ACTIVE_IMG = ''\n",
- "#-----#\n",
- "\n",
- "# Define functions/constants\n",
- "NUM_TOKENS = 49407\n",
- "NUM_PREFIX = 13662\n",
- "NUM_SUFFIX = 32901\n",
- "\n",
- "PREFIX_ENC_VOCAB = ['encoded_prefix_to_girl',]\n",
- "SUFFIX_ENC_VOCAB = ['a_-_encoded_suffix' ,]\n",
- " #'from_-encoded_suffix',\n",
- " #'by_-encoded_suffix' ,\n",
- " #'encoded_suffix-_like']\n",
- "\n",
- "# Make sure these match above results\n",
- "NUM_PREFIX_LISTS = len(PREFIX_ENC_VOCAB)\n",
- "NUM_SUFFIX_LISTS = len(SUFFIX_ENC_VOCAB)\n",
- "#-----#\n",
- "\n",
- "\n",
- "#Import the vocab.json\n",
- "import json\n",
- "import pandas as pd\n",
- "\n",
- "# Read suffix.json\n",
- "with open('suffix.json', 'r') as f:\n",
- " data = json.load(f)\n",
- "_df = pd.DataFrame({'count': data})['count']\n",
- "suffix = {\n",
- " key : value for key, value in _df.items()\n",
- "}\n",
- "# Read prefix json\n",
- "with open('prefix.json', 'r') as f:\n",
- " data = json.load(f)\n",
- "_df = pd.DataFrame({'count': data})['count']\n",
- "prefix = {\n",
- " key : value for key, value in _df.items()\n",
- "}\n",
- "\n",
- "# Read to_suffix.json\n",
- "with open('to_suffix.json', 'r') as f:\n",
- " data = json.load(f)\n",
- "_df = pd.DataFrame({'count': data})['count']\n",
- "suffix_to_vocab = {\n",
- " key : value for key, value in _df.items()\n",
- "}\n",
- "\n",
- "# Read to_prefix.json\n",
- "with open('to_prefix.json', 'r') as f:\n",
- " data = json.load(f)\n",
- "_df = pd.DataFrame({'count': data})['count']\n",
- "prefix_to_vocab = {\n",
- " key : value for key, value in _df.items()\n",
- "}\n",
- "\n",
- "#-----#\n",
- "\n",
- "\n",
- "# Read to_suffix.json (reversing key and value)\n",
- "with open('to_suffix.json', 'r') as f:\n",
- " data = json.load(f)\n",
- "_df = pd.DataFrame({'count': data})['count']\n",
- "vocab_to_suffix = {\n",
- " value : key for key, value in _df.items()\n",
- "}\n",
- "\n",
- "# Read to_prefix.json (reversing key and value)\n",
- "with open('to_prefix.json', 'r') as f:\n",
- " data = json.load(f)\n",
- "_df = pd.DataFrame({'count': data})['count']\n",
- "vocab_to_prefix = {\n",
- " value : key for key, value in _df.items()\n",
- "}\n",
- "\n",
- "\n",
- "#-----#\n",
- "\n",
- "#get token from id (excluding tokens with special symbols)\n",
- "def vocab(id):\n",
- " _id = f'{id}'\n",
- " if _id in vocab_to_suffix:\n",
- " _id = vocab_to_suffix[_id]\n",
- " return suffix[_id]\n",
- " if _id in vocab_to_prefix:\n",
- " _id = vocab_to_prefix[_id]\n",
- " return prefix[_id]\n",
- " return ' ' #<---- return whitespace if other id like emojis etc.\n",
- "#--------#\n",
- "\n",
- "#get token from id (excluding tokens with special symbols)\n",
- "def get_suffix(id):\n",
- " _id = f'{id}'\n",
- " if int(id) <= NUM_SUFFIX:\n",
- " return suffix[_id]\n",
- " return ' ' #<---- return whitespace if out of bounds\n",
- "#--------#\n",
- "\n",
- "#get token from id (excluding tokens with special symbols)\n",
- "def get_prefix(id):\n",
- " _id = f'{id}'\n",
- " if int(id) <= NUM_PREFIX:\n",
- " return prefix[_id]\n",
- " return ' ' #<---- return whitespace if out of bounds\n",
- "#--------#\n",
- "\n",
- "\n",
- "def _modulus(_id,id_max):\n",
- " id = _id\n",
- " while(id>id_max):\n",
- " id = id-id_max\n",
- " return id\n",
- "\n",
- "#print(get_token(35894))\n"
- ],
- "metadata": {
- "id": "w8O0TX7PBh5m"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# @title Load/initialize values (new version - ignore this cell)\n",
  "#Imports\n",
  "!pip install safetensors\n",
  "from safetensors.torch import load_file\n",
- "\n",
  "import json , os , shelve , torch\n",
  "import pandas as pd\n",
  "#----#\n",
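Note: the removed cell above resolves CLIP vocab ids to token text through the prefix/suffix JSON maps from the cloned sd_tokens dataset. A minimal sketch of that lookup, shown for the suffix maps only (the prefix maps mirror it); the pandas round-trip in the cell reduces to plain dicts, and the file layout is assumed to match the dataset:

import json

# suffix.json : suffix_key -> token text ; to_suffix.json : suffix_key -> vocab id (assumed)
with open('suffix.json', 'r') as f:
    suffix = json.load(f)
with open('to_suffix.json', 'r') as f:
    suffix_to_vocab = json.load(f)

# Reverse the map so a CLIP vocab id can be traced back to its suffix key.
vocab_to_suffix = {value: key for key, value in suffix_to_vocab.items()}

def vocab(token_id):
    _id = f'{token_id}'
    if _id in vocab_to_suffix:
        return suffix[vocab_to_suffix[_id]]
    return ' '  # ids with special symbols (emojis etc.) fall back to whitespace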
@@ -229,7 +95,7 @@
  " continue\n",
  " #-------#\n",
  " #--------#\n",
- " _text_encodings.close() #close the text_encodings file\n",
+ " #_text_encodings.close() #close the text_encodings file\n",
  " file_index = file_index + 1\n",
  " #----------#\n",
  " NUM_ITEMS = index\n",
@@ -245,175 +111,25 @@
  "metadata": {
  "id": "rUXQ73IbonHY"
  },
- "execution_count":
+ "execution_count": null,
  "outputs": []
  },
  {
  "cell_type": "code",
  "source": [
- "# @title
+ "# @title 📝 Choose which vocab to load\n",
+ "use_vocab = '🦜 fusion-t2i-prompt-features' # @param ['🦜 fusion-t2i-prompt-features']\n",
+ "\n",
  "%cd /content/\n",
  "!git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n",
  "#------#\n",
  "path = '/content/text-to-image-prompts/civitai-prompts/green'\n",
- "prompts , text_encodings,
- ],
- "metadata": {
- "id": "ZMG4CThUAmwW"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# @title ⚡ Get similiar tokens\n",
- "import torch\n",
- "from transformers import AutoTokenizer\n",
- "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
- "\n",
- "# @markdown Write name of token to match against\n",
- "token_name = \"banana \" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
- "\n",
- "prompt = token_name\n",
- "# @markdown (optional) Mix the token with something else\n",
- "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for random value token\"}\n",
- "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
- "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
- "# @markdown Limit char size of included token\n",
- "\n",
- "min_char_size = 0 # param {type:\"slider\", min:0, max: 50, step:1}\n",
- "char_range = 50 # param {type:\"slider\", min:0, max: 50, step:1}\n",
- "\n",
- "tokenizer_output = tokenizer(text = prompt)\n",
- "input_ids = tokenizer_output['input_ids']\n",
- "id_A = input_ids[1]\n",
- "A = torch.tensor(token[id_A])\n",
- "A = A/A.norm(p=2, dim=-1, keepdim=True)\n",
- "#-----#\n",
- "tokenizer_output = tokenizer(text = mix_with)\n",
- "input_ids = tokenizer_output['input_ids']\n",
- "id_C = input_ids[1]\n",
- "C = torch.tensor(token[id_C])\n",
- "C = C/C.norm(p=2, dim=-1, keepdim=True)\n",
- "#-----#\n",
- "sim_AC = torch.dot(A,C)\n",
- "#-----#\n",
- "print(input_ids)\n",
- "#-----#\n",
- "\n",
- "#if no imput exists we just randomize the entire thing\n",
- "if (prompt == \"\"):\n",
- " id_A = -1\n",
- " print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
- " R = torch.rand(A.shape)\n",
- " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
- " A = R\n",
- " name_A = 'random_A'\n",
- "\n",
- "#if no imput exists we just randomize the entire thing\n",
- "if (mix_with == \"\"):\n",
- " id_C = -1\n",
- " print(\"Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID\")\n",
- " R = torch.rand(A.shape)\n",
- " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
- " C = R\n",
- " name_C = 'random_C'\n",
- "\n",
- "name_A = \"A of random type\"\n",
- "if (id_A>-1):\n",
- " name_A = vocab(id_A)\n",
- "\n",
- "name_C = \"token C of random type\"\n",
- "if (id_C>-1):\n",
- " name_C = vocab(id_C)\n",
- "\n",
- "print(f\"The similarity between A '{name_A}' and C '{name_C}' is {round(sim_AC.item()*100,2)} %\")\n",
- "\n",
- "if (mix_method == \"None\"):\n",
- " print(\"No operation\")\n",
- "\n",
- "if (mix_method == \"Average\"):\n",
- " A = w*A + (1-w)*C\n",
- " _A = LA.vector_norm(A, ord=2)\n",
- " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = w*A + (1-w)*C , where C is '{name_C}' token , for w = {w} \")\n",
- "\n",
- "if (mix_method == \"Subtract\"):\n",
- " tmp = w*A - (1-w)*C\n",
- " tmp = tmp/tmp.norm(p=2, dim=-1, keepdim=True)\n",
- " A = tmp\n",
- " #//---//\n",
- " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = _A*norm(w*A - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
- "\n",
- "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
- "\n",
- "dots = torch.zeros(NUM_TOKENS)\n",
- "for index in range(NUM_TOKENS):\n",
- " id_B = index\n",
- " B = torch.tensor(token[id_B])\n",
- " B = B/B.norm(p=2, dim=-1, keepdim=True)\n",
- " sim_AB = torch.dot(A,B)\n",
- " dots[index] = sim_AB\n",
- "\n",
- "\n",
- "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
- "#----#\n",
- "if (mix_method == \"Average\"):\n",
- " print(f'Calculated all cosine-similarities between the average of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
- "if (mix_method == \"Subtract\"):\n",
- " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
- "if (mix_method == \"None\"):\n",
- " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
- "\n",
- "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result\n",
- "\n",
- "# @markdown Set print options\n",
- "list_size = 100 # @param {type:'number'}\n",
- "print_ID = False # @param {type:\"boolean\"}\n",
- "print_Similarity = True # @param {type:\"boolean\"}\n",
- "print_Name = True # @param {type:\"boolean\"}\n",
- "print_Divider = True # @param {type:\"boolean\"}\n",
- "\n",
- "\n",
- "if (print_Divider):\n",
- " print('//---//')\n",
- "\n",
- "print('')\n",
- "print('Here is the result : ')\n",
- "print('')\n",
- "\n",
- "for index in range(list_size):\n",
- " id = indices[index].item()\n",
- " if (print_Name):\n",
- " print(f'{vocab(id)}') # vocab item\n",
- " if (print_ID):\n",
- " print(f'ID = {id}') # IDs\n",
- " if (print_Similarity):\n",
- " print(f'similiarity = {round(sorted[index].item()*100,2)} %')\n",
- " if (print_Divider):\n",
- " print('--------')\n",
- "\n",
- "#Print the sorted list from above result\n",
- "\n",
- "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
+ "prompts , text_encodings, NUM_VOCAB_ITEMS = getPrompts(path)\n",
  "\n",
- "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID.\n",
- "\n",
- "# Save results as .db file\n",
- "import shelve\n",
- "VOCAB_FILENAME = 'tokens_most_similiar_to_' + name_A.replace('</w>','').strip()\n",
- "d = shelve.open(VOCAB_FILENAME)\n",
- "#NUM TOKENS == 49407\n",
- "for index in range(NUM_TOKENS):\n",
- " #print(d[f'{index}']) #<-----Use this to read values from the .db file\n",
- " d[f'{index}']= vocab(indices[index].item()) #<---- write values to .db file\n",
- "#----#\n",
- "d.close() #close the file\n",
- "# See this link for additional stuff to do with shelve: https://docs.python.org/3/library/shelve.html"
+ "append_Whitespace = True\n"
  ],
  "metadata": {
- "id": "
- "cellView": "form"
+ "id": "ZMG4CThUAmwW"
  },
  "execution_count": null,
  "outputs": []
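Note: the added cell calls getPrompts(path), which is defined in an earlier cell that this hunk does not show. A hypothetical sketch of the interface the later cells rely on (the function body and key format are assumptions inferred from how the returned values are indexed):

def getPrompts(path):
    # Hypothetical loader: walk `path`, collect prompt strings and their
    # pre-calculated text encodings, keyed by f'{index}'.
    prompts = {}         # f'{index}' -> prompt string
    text_encodings = {}  # f'{index}' -> [1, dim] tensor
    # ... populate both dicts from the files under `path` ...
    return prompts, text_encodings, len(prompts)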
@@ -424,7 +140,6 @@
  "# @title 📝 Get Prompt text_encoding similarity to the pre-calc. text_encodings\n",
  "prompt = \" a fast car on the road \" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
  "\n",
- "\n",
  "from transformers import AutoTokenizer\n",
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
  "from transformers import CLIPProcessor, CLIPModel\n",
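Note: text_features_A used in the next hunk comes from standard transformers CLIP usage, as the "Deprecated" cell at the end of this diff shows:

inputs = tokenizer(text = prompt, padding=True, return_tensors="pt")
text_features_A = model.get_text_features(**inputs)
text_features_A = text_features_A/text_features_A.norm(p=2, dim=-1, keepdim=True)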
@@ -438,47 +153,13 @@
  "name_A = prompt\n",
  "#------#\n",
  "\n",
- "# Load the .db file for prefix encodings\n",
- "import shelve\n",
- "_iters = -1\n",
- "RANGE = NUM_PREFIX\n",
- "NUM_PREFIX_LISTS = 1\n",
- "dots = results_sim = torch.zeros(RANGE*NUM_PREFIX_LISTS)\n",
- "for _PREFIX_ENC_VOCAB in PREFIX_ENC_VOCAB:\n",
- " _iters = _iters + 1\n",
- " d = shelve.open(_PREFIX_ENC_VOCAB)\n",
- " for _index in range(RANGE):\n",
- " index = _iters*RANGE + _index\n",
- " text_features = d[f'{_index}']\n",
- " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
- " sim = torch.nn.functional.cosine_similarity(text_features, text_features_A)\n",
- " dots[index] = sim\n",
- " #----#\n",
- " d.close() #close the file\n",
- "#------#\n",
- "prefix_sorted, prefix_indices = torch.sort(dots,dim=0 , descending=True)\n",
+ "sims = torch.zeros(NUM_VOCAB_ITEMS)\n",
+ "for index in range(NUM_VOCAB_ITEMS):\n",
+ " text_features = text_encodings[f'{index}']\n",
+ " sims[index] = torch.nn.functional.cosine_similarity(text_features, text_features_A)\n",
  "#------#\n",
  "\n",
- "# Load the .db file for prefix encodings\n",
- "import shelve\n",
- "_iters = -1\n",
- "RANGE = NUM_SUFFIX\n",
- "dots = results_sim = torch.zeros(RANGE*NUM_SUFFIX_LISTS)\n",
- "for _SUFFIX_ENC_VOCAB in SUFFIX_ENC_VOCAB:\n",
- " _iters = _iters + 1\n",
- " d = shelve.open(_SUFFIX_ENC_VOCAB)\n",
- " for _index in range(RANGE):\n",
- " index = _iters*RANGE + _index\n",
- " text_features = d[f'{_index}']\n",
- " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
- " sim = torch.nn.functional.cosine_similarity(text_features, text_features_A)\n",
- " dots[index] = sim\n",
- " #----#\n",
- " d.close() #close the file\n",
- "#------#\n",
- "suffix_sorted, suffix_indices = torch.sort(dots,dim=0 , descending=True)\n",
- "#------#\n",
- "\n"
+ "sorted , indices = torch.sort(sims,dim=0 , descending=True)"
  ],
  "metadata": {
  "id": "xc-PbIYF428y"
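Note: the new loop scores one stored encoding per iteration. A sketch of an equivalent vectorized ranking, assuming each text_encodings[f'{index}'] is an already-normalized [1, dim] tensor (so a dot product equals the cosine similarity):

import torch

# Stack all pre-calculated encodings into one [NUM_VOCAB_ITEMS, dim] matrix.
E = torch.cat([text_encodings[f'{i}'] for i in range(NUM_VOCAB_ITEMS)], dim=0)
sims = (E @ text_features_A.T).squeeze(-1)  # one matmul instead of N cosine calls
sorted_sims, indices = torch.sort(sims, dim=0, descending=True)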
@@ -493,75 +174,43 @@
  "list_size = 100 # @param {type:'number'}\n",
  "start_at_index = 0 # @param {type:'number'}\n",
  "print_Similarity = True # @param {type:\"boolean\"}\n",
- "print_Suffix = True # @param {type:\"boolean\"}\n",
+ "print_Prompts = True # @param {type:\"boolean\"}\n",
  "print_Prefix = True # @param {type:\"boolean\"}\n",
  "print_Descriptions = True # @param {type:\"boolean\"}\n",
- "compact_Output = False # @param {type:\"boolean\"}\n",
+ "compact_Output = True # @param {type:\"boolean\"}\n",
+ "newline_Separator = True # @param {type:\"boolean\"}\n",
  "\n",
  "# title Show the 100 most similiar suffix and prefix text-encodings to the text encoding\n",
  "RANGE = list_size\n",
- "_suffixes = '{'\n",
+ "separator = '|'\n",
+ "if append_Whitespace : separator = ' ' + separator\n",
+ "if newline_Separator : separator = separator + '\\n'\n",
+ "\n",
+ "_prompts = '{'\n",
  "_sims = '{'\n",
- "for index in range(start_at_index + RANGE):\n",
- " if index < start_at_index : continue\n",
- " id = int(suffix_indices[index])\n",
- " ahead = \"from \"\n",
- " behind = \"\"\n",
- " if(id>NUM_SUFFIX*1):\n",
- " ahead = \"a \"\n",
- " if(id>NUM_SUFFIX*2):\n",
- " ahead = \"by \"\n",
- " if(id>NUM_SUFFIX*3):\n",
- " ahead = \"\"\n",
- " behind = \"like\"\n",
- " id = _modulus(id,NUM_SUFFIX)\n",
- " #------#\n",
- " sim = suffix_sorted[index].item()\n",
- " name = ahead + get_suffix(id) + behind\n",
- " if(get_suffix(id) == ' '): name = ahead + f'{id}' + behind\n",
- " _suffixes = _suffixes + name + '|'\n",
- " _sims = _sims + f'{round(sim*100,2)} %' + '|'\n",
+ "for _index in range(start_at_index + RANGE):\n",
+ " if _index < start_at_index : continue\n",
+ " index = indices[_index]\n",
+ " _prompts = _prompts + prompts[f'{index}'] + separator\n",
+ " _sims = _sims + f'{round(100*sims[index].item(), 2)} %' + separator\n",
  "#------#\n",
- "_suffixes = (_suffixes + '}').replace('|}', '}')\n",
- "_sims = (_sims + '}').replace('|}', '}')\n",
+ "__prompts = (_prompts + '}').replace(separator + '}', '}')\n",
+ "__sims = (_sims + '}').replace(separator + '}', '}')\n",
  "#------#\n",
  "\n",
- "\n",
- "suffixes = _suffixes\n",
- "sims = _sims\n",
- "if(not print_Suffix): suffixes = ''\n",
- "if(not print_Similarity): sims = ''\n",
+ "if(not print_Prompts): __prompts = ''\n",
+ "if(not print_Similarity): __sims = ''\n",
  "\n",
  "if(not compact_Output):\n",
  " if(print_Descriptions):\n",
- " print(f'The {start_at_index}-{start_at_index + RANGE} most similiar
- " print(f'The {start_at_index}-{start_at_index + RANGE} similarity % for
+ " print(f'The {start_at_index}-{start_at_index + RANGE} most similiar items to prompt : \\n\\n ' + __prompts)\n",
+ " print(f'The {start_at_index}-{start_at_index + RANGE} similarity % for items : \\n\\n' + __sims)\n",
  " print('')\n",
  " else:\n",
- " print(
- "#-------#\n",
- "\n",
- "_prefixes = '{'\n",
- "for index in range(start_at_index + RANGE):\n",
- " if index < start_at_index : continue\n",
- " id = f'{prefix_indices[index]}'\n",
- " #sim = prefix_sorted[index]\n",
- " name = get_prefix(id)\n",
- " _prefixes = _prefixes + name + '|'\n",
- "#------#\n",
- "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
- "\n",
- "\n",
- "prefixes = _prefixes\n",
- "if(not print_Prefix): prefixes = ''\n",
- "\n",
- "if(print_Descriptions):\n",
- " print(f'The {start_at_index}-{start_at_index + RANGE} most similiar prefixes to prompt : ' + prefixes)\n",
+ " print(__prompts)\n",
  "else:\n",
- " if(compact_Output):\n",
- " print((prefixes + _suffixes).replace('}{', '|'))\n",
- " else:\n",
- " print(prefixes)"
+ " print(__prompts)\n",
+ "#-------#"
  ],
  "metadata": {
  "id": "_vnVbxcFf7WV"
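Note: the brace-and-separator assembly above (append item + separator per entry, then strip the trailing separator with replace) is equivalent to a join. A sketch, assuming `items` holds the ranked prompt strings:

separator = ' |\n'  # append_Whitespace and newline_Separator both True
__prompts = '{' + separator.join(items) + '}'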
@@ -589,6 +238,9 @@
  " for k, v in uploaded.items():\n",
  " open(k, 'wb').write(v)\n",
  " return list(uploaded.keys())\n",
+ "\n",
+ "\n",
+ "colab_image_folder = '/content/text-to-image-prompts/images/'\n",
  "#Get image\n",
  "# You can use \"http://images.cocodataset.org/val2017/000000039769.jpg\" for testing\n",
  "image_url = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for local upload (scroll down to see it)\"}\n",
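Note: upload_files above appears to wrap Colab's files.upload; its opening lines fall outside this hunk, so the first two lines of this sketch are an assumption:

from google.colab import files  # assumed import, standard Colab upload API

def upload_files():
    uploaded = files.upload()           # assumed; prompts a browser file picker
    for k, v in uploaded.items():
        open(k, 'wb').write(v)          # persist each upload to the Colab disk
    return list(uploaded.keys())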
@@ -608,11 +260,11 @@
  " if colab_image_path == \"\":\n",
  " keys = upload_files()\n",
  " for key in keys:\n",
- " image_A = cv2.imread(
- " colab_image_path =
- " image_path =
+ " image_A = cv2.imread(colab_image_folder + key)\n",
+ " colab_image_path = colab_image_folder + key\n",
+ " image_path = colab_image_folder + key\n",
  " else:\n",
- " image_A = cv2.imread(
+ " image_A = cv2.imread(colab_image_folder + colab_image_path)\n",
  "else:\n",
  " image_A = Image.open(requests.get(image_url, stream=True).raw)\n",
  "#------#\n",
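Note: after this change the cell has two load paths, matching the lines above: a URL goes through requests/PIL, while a Colab-local file goes through cv2 under colab_image_folder. In outline:

import cv2, requests
from PIL import Image

if image_url != "":
    image_A = Image.open(requests.get(image_url, stream=True).raw)  # remote image
else:
    image_A = cv2.imread(colab_image_folder + colab_image_path)     # local upload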
@@ -622,13 +274,13 @@
  ],
  "metadata": {
  "id": "ke6mZ1RZDOeB",
- "outputId": "
+ "outputId": "8ced884a-bf07-4fcb-c108-0f873d71a73c",
  "colab": {
  "base_uri": "https://localhost:8080/",
  "height": 1000
  }
  },
- "execution_count":
+ "execution_count": 4,
  "outputs": [
  {
  "output_type": "display_data",
@@ -647,14 +299,6 @@
  "source": [
  "# @title 🖼️ Get image_encoding similarity to the pre-calc. text_encodings\n",
  "\n",
- "list_size = 100 # @param {type:'number'}\n",
- "start_at_index = 0 # @param {type:'number'}\n",
- "print_Similarity = True # @param {type:\"boolean\"}\n",
- "print_Suffix = True # @param {type:\"boolean\"}\n",
- "print_Prefix = True # @param {type:\"boolean\"}\n",
- "print_Descriptions = True # @param {type:\"boolean\"}\n",
- "compact_Output = False # @param {type:\"boolean\"}\n",
- "\n",
  "from transformers import AutoTokenizer\n",
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
  "from transformers import CLIPProcessor, CLIPModel\n",
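Note: the similarity loop in the next hunk compares against image_features, whose computation is outside the shown lines. With the processor and model loaded above, it is presumably the standard call; a sketch under that assumption:

inputs = processor(images=image_A, return_tensors="pt")
image_features = model.get_image_features(**inputs)
image_features = image_features/image_features.norm(p=2, dim=-1, keepdim=True)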
@@ -668,48 +312,14 @@
  "name_A = \"the image\"\n",
  "#-----#\n",
  "\n",
- "# Load the .db file for prefix encodings\n",
- "import shelve\n",
- "_iters = -1\n",
- "RANGE = NUM_PREFIX\n",
- "NUM_PREFIX_LISTS = 1\n",
- "dots = results_sim = torch.zeros(RANGE*NUM_PREFIX_LISTS)\n",
- "for _PREFIX_ENC_VOCAB in PREFIX_ENC_VOCAB:\n",
- " _iters = _iters + 1\n",
- " d = shelve.open(_PREFIX_ENC_VOCAB)\n",
- " for _index in range(RANGE):\n",
- " index = _iters*RANGE + _index\n",
- " text_features = d[f'{_index}']\n",
- " logit_scale = model.logit_scale.exp()\n",
- " torch.matmul(text_features, image_features.t()) * logit_scale\n",
- " sim = torch.nn.functional.cosine_similarity(text_features, image_features) * logit_scale\n",
- " dots[index] = sim\n",
- " #----#\n",
- " d.close() #close the file\n",
- "#------#\n",
- "prefix_sorted, prefix_indices = torch.sort(dots,dim=0 , descending=True)\n",
- "#------#\n",
- "\n",
- "# Load the .db file for prefix encodings\n",
- "import shelve\n",
- "_iters = -1\n",
- "RANGE = NUM_SUFFIX\n",
- "dots = results_sim = torch.zeros(RANGE*NUM_SUFFIX_LISTS)\n",
- "for _SUFFIX_ENC_VOCAB in SUFFIX_ENC_VOCAB:\n",
- " _iters = _iters + 1\n",
- " d = shelve.open(_SUFFIX_ENC_VOCAB)\n",
- " for _index in range(RANGE):\n",
- " index = _iters*RANGE + _index\n",
- " text_features = d[f'{_index}']\n",
- " logit_scale = model.logit_scale.exp()\n",
- " torch.matmul(text_features, image_features.t()) * logit_scale\n",
- " sim = torch.nn.functional.cosine_similarity(text_features, image_features) * logit_scale\n",
- " dots[index] = sim\n",
- " #----#\n",
- " d.close() #close the file\n",
- "#------#\n",
- "suffix_sorted, suffix_indices = torch.sort(dots,dim=0 , descending=True)\n",
- "#------#"
+ "sims = torch.zeros(NUM_VOCAB_ITEMS)\n",
+ "for index in range(NUM_VOCAB_ITEMS):\n",
+ " text_features = text_encodings[f'{index}']\n",
+ " logit_scale = model.logit_scale.exp()\n",
+ " torch.matmul(text_features, image_features.t()) * logit_scale\n",
+ " sims[index] = torch.nn.functional.cosine_similarity(text_features, image_features)\n",
+ "#-------#\n",
+ "sorted , indices = torch.sort(sims,dim=0 , descending=True)"
  ],
  "metadata": {
  "id": "rebogpoyOG8k"
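Note: in the added loop the line torch.matmul(text_features, image_features.t()) * logit_scale computes CLIP's scaled logits but discards the result; only the unscaled cosine similarity is stored in sims. If the scaled logits were actually wanted, the product would need to be assigned:

logit_scale = model.logit_scale.exp()
logits_per_text = torch.matmul(text_features, image_features.t()) * logit_scale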
@@ -722,80 +332,312 @@
  "source": [
  "# @title 🖼️ Print the results\n",
  "list_size = 100 # @param {type:'number'}\n",
- "start_at_index = 0 # @param {type:'number'}\n",
+ "start_at_index = 0 # @param {type:'number'}\n",
+ "print_Similarity = True # @param {type:\"boolean\"}\n",
+ "print_Prompts = True # @param {type:\"boolean\"}\n",
+ "print_Prefix = True # @param {type:\"boolean\"}\n",
+ "print_Descriptions = True # @param {type:\"boolean\"}\n",
+ "compact_Output = True # @param {type:\"boolean\"}\n",
+ "newline_Separator = True # @param {type:\"boolean\"}\n",
+ "\n",
+ "# title Show the 100 most similiar suffix and prefix text-encodings to the text encoding\n",
+ "RANGE = list_size\n",
+ "separator = '|'\n",
+ "if append_Whitespace : separator = ' ' + separator\n",
+ "if newline_Separator : separator = separator + '\\n'\n",
+ "\n",
+ "_prompts = '{'\n",
+ "_sims = '{'\n",
+ "for _index in range(start_at_index + RANGE):\n",
+ " if _index < start_at_index : continue\n",
+ " index = indices[_index]\n",
+ " _prompts = _prompts + prompts[f'{index}'] + separator\n",
+ " _sims = _sims + f'{round(100*sims[index].item(), 2)} %' + separator\n",
+ "#------#\n",
+ "__prompts = (_prompts + '}').replace(separator + '}', '}')\n",
+ "__sims = (_sims + '}').replace(separator + '}', '}')\n",
+ "#------#\n",
+ "\n",
+ "if(not print_Prompts): __prompts = ''\n",
+ "if(not print_Similarity): __sims = ''\n",
+ "\n",
+ "if(not compact_Output):\n",
+ " if(print_Descriptions):\n",
+ " print(f'The {start_at_index}-{start_at_index + RANGE} most similiar items to prompt : \\n\\n ' + __prompts)\n",
+ " print(f'The {start_at_index}-{start_at_index + RANGE} similarity % for items : \\n\\n' + __sims)\n",
+ " print('')\n",
+ " else:\n",
+ " print(__prompts)\n",
+ "else:\n",
+ " print(__prompts)\n",
+ "#-------#"
+ ],
+ "metadata": {
+ "id": "JkzncP8SgKtS",
+ "outputId": "37351bed-c5e2-4554-c5e0-a9dc84da700b",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "{beautiful avatar pictures |\n",
+ "purple hair crowned standing in storm background |\n",
+ "beautiful celebrity futuristic sci-fi |\n",
+ "by magali villeneuve |\n",
+ "visually striking spectacle inspired by the works |\n",
+ "visually striking spectacle inspired by the works |\n",
+ "a beautiful female warrior |\n",
+ "a sexy scifi warrior |\n",
+ "a sexy scifi warrior |\n",
+ "film still from halo live action adaptation |\n",
+ "cinematic film still from captain marvel |\n",
+ "beautiful female warrior |\n",
+ "beautiful female warrior |\n",
+ "film still from halo live-action movie adaptation |\n",
+ "outlandish costume design |\n",
+ "beautiful indian warrior queen |\n",
+ "of female space soldier |\n",
+ "blue light on her face she appears calm |\n",
+ "a female scifi warrior |\n",
+ "a female scifi warrior |\n",
+ "nebula in her streak hair |\n",
+ "of brown skinned indian warrior queen |\n",
+ "played by young dove cameron |\n",
+ "has runes on her body |\n",
+ "has runes on her body |\n",
+ "beautiful light makeup female sorceress |\n",
+ "a gorgeous female void thrall |\n",
+ "a gorgeous female void thrall |\n",
+ "beautiful female elf queen |\n",
+ "captivating mystique |\n",
+ "captivating mystique |\n",
+ "symbolizing her role as the goddess |\n",
+ "character integrated into the background |\n",
+ "swirling black light around the character |\n",
+ "very beautiful jean grey wearing |\n",
+ "lightly blued metal armor |\n",
+ "multiple different characters in the background |\n",
+ "cinematic still from conan |\n",
+ "yo person as dark elf queen |\n",
+ "trending at cgsociety |\n",
+ "femaleastronaut exalted human futuristic warrior |\n",
+ "femaleastronaut exalted human futuristic warrior |\n",
+ "epic fantasy greek priestess |\n",
+ "epic fantasy greek priestess |\n",
+ "female draenei world |\n",
+ "genetically engineered soldiers |\n",
+ "genetically engineered soldiers |\n",
+ "visually striking scene the lighting |\n",
+ "the female soldier marches in formation |\n",
+ "the female soldier marches in formation |\n",
+ "revealing costume design |\n",
+ "pandora_smith_magister |\n",
+ "pandora_smith_magister |\n",
+ "his sorceress in the back ground |\n",
+ "gorgeous muscular elven ukrainian |\n",
+ "gorgeous muscular elven ukrainian |\n",
+ "water elemental officer jenny |\n",
+ "norse female goddess |\n",
+ "matte fantasy painting |\n",
+ "periwinkle purple skin |\n",
+ "of norse female goddess |\n",
+ "mujer de ojos rojos y pelo azulado |\n",
+ "strength the battle scene around her |\n",
+ "a beautiful young redhead warrior |\n",
+ "a beautiful young redhead warrior |\n",
+ "moody cinematic epic concept art |\n",
+ "hypnotically beautiful wood elf in |\n",
+ "hypnotically beautiful wood elf in |\n",
+ "loraemmawatsonlora_v |\n",
+ "alphonse mucha cinematic epic + rule |\n",
+ "alphonse mucha cinematic epic + rule |\n",
+ "an actress standing behind |\n",
+ "fking_scifi_v amazing |\n",
+ "shine like sapphires |\n",
+ "female elemental water wizard |\n",
+ "female elemental water wizard |\n",
+ "beautiful character design |\n",
+ "female warriors protecting an underwater temple |\n",
+ "indian mary_winstead |\n",
+ "sarah kerrigan queen |\n",
+ "sarah kerrigan queen |\n",
+ "the female barbarian stands tall |\n",
+ "widowmaker from overwatchscarlett johannson |\n",
+ "intricate costume design |\n",
+ "award winning character concept art of |\n",
+ "movie still from braveheart |\n",
+ "movie still from braveheart |\n",
+ "an extremely beautiful young female elf |\n",
+ "an extremely beautiful young female elf |\n",
+ "cinematic character design d |\n",
+ "dark purple bodypaint |\n",
+ "dark purple bodypaint |\n",
+ "a daring reimagining |\n",
+ "captivating film still |\n",
+ "glowing dark blue skin |\n",
+ "of sexy cyberpunk female mage |\n",
+ "military men gaze at her longingly |\n",
+ "military men gaze at her longingly |\n",
+ "female space soldier |\n",
+ "norse goddess fighting in}\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# @title ⚡ Get similiar tokens (not updated yet)\n",
+ "import torch\n",
+ "from transformers import AutoTokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+ "\n",
+ "# @markdown Write name of token to match against\n",
+ "token_name = \"banana \" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
+ "\n",
+ "prompt = token_name\n",
+ "# @markdown (optional) Mix the token with something else\n",
+ "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for random value token\"}\n",
+ "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
+ "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
+ "# @markdown Limit char size of included token\n",
+ "\n",
+ "min_char_size = 0 # param {type:\"slider\", min:0, max: 50, step:1}\n",
+ "char_range = 50 # param {type:\"slider\", min:0, max: 50, step:1}\n",
+ "\n",
+ "tokenizer_output = tokenizer(text = prompt)\n",
+ "input_ids = tokenizer_output['input_ids']\n",
+ "id_A = input_ids[1]\n",
+ "A = torch.tensor(token[id_A])\n",
+ "A = A/A.norm(p=2, dim=-1, keepdim=True)\n",
+ "#-----#\n",
+ "tokenizer_output = tokenizer(text = mix_with)\n",
+ "input_ids = tokenizer_output['input_ids']\n",
+ "id_C = input_ids[1]\n",
+ "C = torch.tensor(token[id_C])\n",
+ "C = C/C.norm(p=2, dim=-1, keepdim=True)\n",
+ "#-----#\n",
+ "sim_AC = torch.dot(A,C)\n",
+ "#-----#\n",
+ "print(input_ids)\n",
+ "#-----#\n",
+ "\n",
+ "#if no imput exists we just randomize the entire thing\n",
+ "if (prompt == \"\"):\n",
+ " id_A = -1\n",
+ " print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
+ " R = torch.rand(A.shape)\n",
+ " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
+ " A = R\n",
+ " name_A = 'random_A'\n",
+ "\n",
+ "#if no imput exists we just randomize the entire thing\n",
+ "if (mix_with == \"\"):\n",
+ " id_C = -1\n",
+ " print(\"Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID\")\n",
+ " R = torch.rand(A.shape)\n",
+ " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
+ " C = R\n",
+ " name_C = 'random_C'\n",
+ "\n",
+ "name_A = \"A of random type\"\n",
+ "if (id_A>-1):\n",
+ " name_A = vocab(id_A)\n",
+ "\n",
+ "name_C = \"token C of random type\"\n",
+ "if (id_C>-1):\n",
+ " name_C = vocab(id_C)\n",
+ "\n",
+ "print(f\"The similarity between A '{name_A}' and C '{name_C}' is {round(sim_AC.item()*100,2)} %\")\n",
+ "\n",
+ "if (mix_method == \"None\"):\n",
+ " print(\"No operation\")\n",
+ "\n",
+ "if (mix_method == \"Average\"):\n",
+ " A = w*A + (1-w)*C\n",
+ " _A = LA.vector_norm(A, ord=2)\n",
+ " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = w*A + (1-w)*C , where C is '{name_C}' token , for w = {w} \")\n",
+ "\n",
+ "if (mix_method == \"Subtract\"):\n",
+ " tmp = w*A - (1-w)*C\n",
+ " tmp = tmp/tmp.norm(p=2, dim=-1, keepdim=True)\n",
+ " A = tmp\n",
+ " #//---//\n",
+ " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = _A*norm(w*A - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
+ "\n",
+ "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
+ "\n",
+ "dots = torch.zeros(NUM_TOKENS)\n",
+ "for index in range(NUM_TOKENS):\n",
+ " id_B = index\n",
+ " B = torch.tensor(token[id_B])\n",
+ " B = B/B.norm(p=2, dim=-1, keepdim=True)\n",
+ " sim_AB = torch.dot(A,B)\n",
+ " dots[index] = sim_AB\n",
+ "\n",
+ "\n",
+ "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
+ "#----#\n",
+ "if (mix_method == \"Average\"):\n",
+ " print(f'Calculated all cosine-similarities between the average of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
+ "if (mix_method == \"Subtract\"):\n",
+ " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
+ "if (mix_method == \"None\"):\n",
+ " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
+ "\n",
+ "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result\n",
+ "\n",
+ "# @markdown Set print options\n",
+ "list_size = 100 # @param {type:'number'}\n",
+ "print_ID = False # @param {type:\"boolean\"}\n",
  "print_Similarity = True # @param {type:\"boolean\"}\n",
- "print_Suffix = True # @param {type:\"boolean\"}\n",
- "print_Prefix = True # @param {type:\"boolean\"}\n",
- "print_Descriptions = True # @param {type:\"boolean\"}\n",
- "compact_Output = False # @param {type:\"boolean\"}\n",
+ "print_Name = True # @param {type:\"boolean\"}\n",
+ "print_Divider = True # @param {type:\"boolean\"}\n",
  "\n",
- "# title Show the 100 most similiar suffix and prefix text-encodings to the text encoding\n",
- "RANGE = list_size\n",
- "_suffixes = '{'\n",
- "_sims = '{'\n",
- "for index in range(start_at_index + RANGE):\n",
- " if index < start_at_index : continue\n",
- " id = int(suffix_indices[index])\n",
- " ahead = \"from \"\n",
- " behind = \"\"\n",
- " if(id>NUM_SUFFIX*1):\n",
- " ahead = \"a \"\n",
- " if(id>NUM_SUFFIX*2):\n",
- " ahead = \"by \"\n",
- " if(id>NUM_SUFFIX*3):\n",
- " ahead = \"\"\n",
- " behind = \"like\"\n",
- " id = _modulus(id,NUM_SUFFIX)\n",
- " #------#\n",
- " sim = suffix_sorted[index].item()\n",
- " name = ahead + get_suffix(id) + behind\n",
- " if(get_suffix(id) == ' '): name = ahead + f'{id}' + behind\n",
- " _suffixes = _suffixes + name + '|'\n",
- " _sims = _sims + f'{round(sim*100,2)} %' + '|'\n",
- "#------#\n",
- "_suffixes = (_suffixes + '}').replace('|}', '}')\n",
- "_sims = (_sims + '}').replace('|}', '}')\n",
- "#------#\n",
  "\n",
+ "if (print_Divider):\n",
+ " print('//---//')\n",
  "\n",
- "suffixes = _suffixes\n",
- "sims = _sims\n",
- "if(not print_Suffix): suffixes = ''\n",
- "if(not print_Similarity): sims = ''\n",
+ "print('')\n",
+ "print('Here is the result : ')\n",
+ "print('')\n",
  "\n",
- "if(not compact_Output):\n",
- " if(print_Descriptions):\n",
- " print(f'The {start_at_index}-{start_at_index + RANGE} most similiar
- " print(f'The {start_at_index}-{start_at_index + RANGE} similarity % for
- " print('')\n",
- " else:\n",
- " print(
- "#-------#\n",
+ "for index in range(list_size):\n",
+ " id = indices[index].item()\n",
+ " if (print_Name):\n",
+ " print(f'{vocab(id)}') # vocab item\n",
+ " if (print_ID):\n",
+ " print(f'ID = {id}') # IDs\n",
+ " if (print_Similarity):\n",
+ " print(f'similiarity = {round(sorted[index].item()*100,2)} %')\n",
+ " if (print_Divider):\n",
+ " print('--------')\n",
  "\n",
- "_prefixes = '{'\n",
- "for index in range(start_at_index + RANGE):\n",
- " if index < start_at_index : continue\n",
- " id = f'{prefix_indices[index]}'\n",
- " #sim = prefix_sorted[index]\n",
- " name = get_prefix(id)\n",
- " _prefixes = _prefixes + name + '|'\n",
- "#------#\n",
- "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
+ "#Print the sorted list from above result\n",
  "\n",
+ "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
  "\n",
- "prefixes = _prefixes\n",
- "if(not print_Prefix): prefixes = ''\n",
+ "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID.\n",
  "\n",
- "if(print_Descriptions):\n",
- " print(f'The {start_at_index}-{start_at_index + RANGE} most similiar prefixes to prompt : ' + prefixes)\n",
- "else:\n",
- " if(compact_Output):\n",
- " print((prefixes + _suffixes).replace('}{', '|'))\n",
- " else:\n",
- " print(prefixes)"
+ "# Save results as .db file\n",
+ "import shelve\n",
+ "VOCAB_FILENAME = 'tokens_most_similiar_to_' + name_A.replace('</w>','').strip()\n",
+ "d = shelve.open(VOCAB_FILENAME)\n",
+ "#NUM TOKENS == 49407\n",
+ "for index in range(NUM_TOKENS):\n",
+ " #print(d[f'{index}']) #<-----Use this to read values from the .db file\n",
+ " d[f'{index}']= vocab(indices[index].item()) #<---- write values to .db file\n",
+ "#----#\n",
+ "d.close() #close the file\n",
+ "# See this link for additional stuff to do with shelve: https://docs.python.org/3/library/shelve.html"
  ],
  "metadata": {
- "id": "
+ "id": "iWeFnT1gAx6A"
  },
  "execution_count": null,
  "outputs": []
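Note: the re-added cell persists the ranked vocab with shelve under 'tokens_most_similiar_to_' + name. Reading it back is symmetric; a sketch, with a filename that is illustrative for the default "banana " token:

import shelve

with shelve.open('tokens_most_similiar_to_banana') as d:
    for index in range(10):
        print(d[f'{index}'])  # tokens ranked by similarity, most similar first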
@@ -1490,6 +1332,198 @@
  "metadata": {
  "id": "njeJx_nSSA8H"
  }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# @title Deprecated\n",
+ "prompt = \" a fast car on the road \" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
+ "\n",
+ "from transformers import AutoTokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+ "from transformers import CLIPProcessor, CLIPModel\n",
+ "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
+ "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+ "\n",
+ "# Get text features for user input\n",
+ "inputs = tokenizer(text = prompt, padding=True, return_tensors=\"pt\")\n",
+ "text_features_A = model.get_text_features(**inputs)\n",
+ "text_features_A = text_features_A/text_features_A.norm(p=2, dim=-1, keepdim=True)\n",
+ "name_A = prompt\n",
+ "#------#\n",
+ "\n",
+ "# Load the .db file for prefix encodings\n",
+ "import shelve\n",
+ "_iters = -1\n",
+ "RANGE = NUM_PREFIX\n",
+ "NUM_PREFIX_LISTS = 1\n",
+ "dots = results_sim = torch.zeros(RANGE*NUM_PREFIX_LISTS)\n",
+ "for _PREFIX_ENC_VOCAB in PREFIX_ENC_VOCAB:\n",
+ " _iters = _iters + 1\n",
+ " d = shelve.open(_PREFIX_ENC_VOCAB)\n",
+ " for _index in range(RANGE):\n",
+ " index = _iters*RANGE + _index\n",
+ " text_features = text_encodings[f'{_index}']\n",
+ " sim = torch.nn.functional.cosine_similarity(text_features, text_features_A)\n",
+ " dots[index] = sim\n",
+ " #----#\n",
+ " d.close() #close the file\n",
+ "#------#\n",
+ "prefix_sorted, prefix_indices = torch.sort(dots,dim=0 , descending=True)\n",
+ "#------#\n",
+ "\n",
+ "\n",
+ "_prefixes = '{'\n",
+ "for index in range(start_at_index + RANGE):\n",
+ " if index < start_at_index : continue\n",
+ " id = f'{prefix_indices[index]}'\n",
+ " #sim = prefix_sorted[index]\n",
+ " name = get_prefix(id)\n",
+ " _prefixes = _prefixes + name + '|'\n",
+ "#------#\n",
+ "_prefixes = (_prefixes + '}').replace('|}', '}')\n",
+ "\n",
+ "\n",
+ "prefixes = _prefixes\n",
+ "if(not print_Prefix): prefixes = ''\n",
+ "\n",
+ "if(print_Descriptions):\n",
+ " print(f'The {start_at_index}-{start_at_index + RANGE} most similiar prefixes to prompt : ' + prefixes)\n",
+ "else:\n",
+ " if(compact_Output):\n",
+ " print((prefixes + _suffixes).replace('}{', '|'))\n",
+ " else:\n",
+ " print(prefixes)\n",
+ "\n",
+ "# @title ✳️ Load/initialize values\n",
+ "# Load the tokens into the colab\n",
+ "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
+ "import torch\n",
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+ "from torch import linalg as LA\n",
+ "%cd /content/sd_tokens\n",
+ "token = torch.load('sd15_tensors.pt', map_location= torch.device('cpu'), weights_only=True)\n",
+ "#-----#\n",
+ "VOCAB_FILENAME = 'tokens_most_similiar_to_girl'\n",
+ "ACTIVE_IMG = ''\n",
+ "#-----#\n",
+ "\n",
+ "# Define functions/constants\n",
+ "NUM_TOKENS = 49407\n",
+ "NUM_PREFIX = 13662\n",
+ "NUM_SUFFIX = 32901\n",
+ "\n",
+ "PREFIX_ENC_VOCAB = ['encoded_prefix_to_girl',]\n",
+ "SUFFIX_ENC_VOCAB = ['a_-_encoded_suffix' ,]\n",
+ " #'from_-encoded_suffix',\n",
+ " #'by_-encoded_suffix' ,\n",
+ " #'encoded_suffix-_like']\n",
+ "\n",
+ "# Make sure these match above results\n",
+ "NUM_PREFIX_LISTS = len(PREFIX_ENC_VOCAB)\n",
+ "NUM_SUFFIX_LISTS = len(SUFFIX_ENC_VOCAB)\n",
+ "#-----#\n",
+ "\n",
+ "\n",
+ "#Import the vocab.json\n",
+ "import json\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Read suffix.json\n",
+ "with open('suffix.json', 'r') as f:\n",
+ " data = json.load(f)\n",
+ "_df = pd.DataFrame({'count': data})['count']\n",
+ "suffix = {\n",
+ " key : value for key, value in _df.items()\n",
+ "}\n",
+ "# Read prefix json\n",
+ "with open('prefix.json', 'r') as f:\n",
+ " data = json.load(f)\n",
+ "_df = pd.DataFrame({'count': data})['count']\n",
+ "prefix = {\n",
+ " key : value for key, value in _df.items()\n",
+ "}\n",
+ "\n",
+ "# Read to_suffix.json\n",
+ "with open('to_suffix.json', 'r') as f:\n",
+ " data = json.load(f)\n",
+ "_df = pd.DataFrame({'count': data})['count']\n",
+ "suffix_to_vocab = {\n",
+ " key : value for key, value in _df.items()\n",
+ "}\n",
+ "\n",
+ "# Read to_prefix.json\n",
+ "with open('to_prefix.json', 'r') as f:\n",
+ " data = json.load(f)\n",
+ "_df = pd.DataFrame({'count': data})['count']\n",
+ "prefix_to_vocab = {\n",
+ " key : value for key, value in _df.items()\n",
+ "}\n",
+ "\n",
+ "#-----#\n",
+ "\n",
+ "\n",
+ "# Read to_suffix.json (reversing key and value)\n",
+ "with open('to_suffix.json', 'r') as f:\n",
+ " data = json.load(f)\n",
+ "_df = pd.DataFrame({'count': data})['count']\n",
+ "vocab_to_suffix = {\n",
+ " value : key for key, value in _df.items()\n",
+ "}\n",
+ "\n",
+ "# Read to_prefix.json (reversing key and value)\n",
+ "with open('to_prefix.json', 'r') as f:\n",
+ " data = json.load(f)\n",
+ "_df = pd.DataFrame({'count': data})['count']\n",
+ "vocab_to_prefix = {\n",
+ " value : key for key, value in _df.items()\n",
+ "}\n",
+ "\n",
+ "\n",
+ "#-----#\n",
+ "\n",
+ "#get token from id (excluding tokens with special symbols)\n",
+ "def vocab(id):\n",
+ " _id = f'{id}'\n",
+ " if _id in vocab_to_suffix:\n",
+ " _id = vocab_to_suffix[_id]\n",
+ " return suffix[_id]\n",
+ " if _id in vocab_to_prefix:\n",
+ " _id = vocab_to_prefix[_id]\n",
+ " return prefix[_id]\n",
+ " return ' ' #<---- return whitespace if other id like emojis etc.\n",
+ "#--------#\n",
+ "\n",
+ "#get token from id (excluding tokens with special symbols)\n",
+ "def get_suffix(id):\n",
+ " _id = f'{id}'\n",
+ " if int(id) <= NUM_SUFFIX:\n",
+ " return suffix[_id]\n",
+ " return ' ' #<---- return whitespace if out of bounds\n",
+ "#--------#\n",
+ "\n",
+ "#get token from id (excluding tokens with special symbols)\n",
+ "def get_prefix(id):\n",
+ " _id = f'{id}'\n",
+ " if int(id) <= NUM_PREFIX:\n",
+ " return prefix[_id]\n",
+ " return ' ' #<---- return whitespace if out of bounds\n",
+ "#--------#\n",
+ "\n",
+ "\n",
+ "def _modulus(_id,id_max):\n",
+ " id = _id\n",
+ " while(id>id_max):\n",
+ " id = id-id_max\n",
+ " return id\n",
+ "\n",
+ "#print(get_token(35894))\n"
+ ],
+ "metadata": {
+ "id": "8BWq7SY8mzKD"
+ },
+ "execution_count": null,
+ "outputs": []
  }
  ]
  }
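Note: the _modulus helper in the Deprecated cell reduces an id by repeated subtraction, i.e. a 1-based modulus. A closed-form equivalent for _id >= 1 (a sketch; for _id < 1 the original loop simply returns its input):

def _modulus(_id, id_max):
    return (_id - 1) % id_max + 1  # matches the subtraction loop for _id >= 1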