Snehil Shah committed
Commit eda15d0 · 1 parent: 1c0f2c7

Set up the dataset, transformer, and the vector db

images.ipynb +236 -33

images.ipynb CHANGED
@@ -1,35 +1,238 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "…"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# hello world\n",
-    "print(\"hello world\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# EON"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "view-in-github",
+    "colab_type": "text"
+   },
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/Snehil-Shah/MultiModal-Vector-Semantic-Search-Engine/blob/main/images.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "aH0U6JkEbAcg"
+   },
+   "source": [
+    "# Image to Semantic Embeddings\n",
+    "\n",
+    "**Aim**: Encode around 50k jpg/jpeg images into vector embeddings using a vision transformer model and upsert them into a vector database for clustering and querying."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "CFLaAyqCbAch"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install jupyter pandas qdrant_client pyarrow datasets sentence_transformers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "j5o4d0jbbAci"
+   },
+   "source": [
+    "# Load Dataset\n",
+    "This is the Open Images Dataset by CVDFoundation, which hosts over 9 million images. We will be working with a smaller subset.\n",
+    "\n",
+    "The dataset is currently a TSV file whose first column is a URL to a hosted jpg/jpeg image."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import pandas as pd\n",
+    "data = pd.read_csv('open-images-dataset-validation.tsv', sep='\\t', header=None).reset_index()\n",
+    "print(data.shape, data.head(), sep=\"\\n\")"
+   ],
+   "metadata": {
+    "id": "j97T0MIBeEDe",
+    "outputId": "df823427-2859-40f6-c171-f92b5a84361b",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    }
+   },
+   "execution_count": 98,
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "(41620, 4)\n",
+      " index 0 1 \\\n",
+      "0 0 https://c2.staticflickr.com/6/5606/15611395595... 2038323 \n",
+      "1 1 https://c6.staticflickr.com/3/2808/10351094034... 1762125 \n",
+      "2 2 https://c2.staticflickr.com/9/8089/8416776003_... 9059623 \n",
+      "3 3 https://farm3.staticflickr.com/568/21452126474... 2306438 \n",
+      "4 4 https://farm4.staticflickr.com/1244/677743874_... 6571968 \n",
+      "\n",
+      " 2 \n",
+      "0 I4V4qq54NBEFDwBqPYCkDA== \n",
+      "1 38x6O2LAS75H1vUGVzIilg== \n",
+      "2 4ksF8TuGWGcKul6Z/6pq8g== \n",
+      "3 R+6Cs525mCUT6RovHPWREg== \n",
+      "4 JnkYas7iDJu+pb81tfqVow== \n"
+     ]
+    }
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Download the images\n",
+    "We need the image data locally to feed it to the model."
+   ],
+   "metadata": {
+    "id": "M-Esbnhy6KTU"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import urllib.request\n",
+    "import urllib.error\n",
+    "import os\n",
+    "\n",
+    "# Ensure the local target directory exists\n",
+    "os.makedirs(\"images\", exist_ok=True)\n",
+    "\n",
+    "def download_file(url):\n",
+    "    basename = os.path.basename(url)\n",
+    "    target_path = f\"./images/{basename}\"\n",
+    "    if not os.path.exists(target_path):\n",
+    "        try:\n",
+    "            urllib.request.urlretrieve(url, target_path)\n",
+    "        except urllib.error.HTTPError:\n",
+    "            return None\n",
+    "    return target_path"
+   ],
+   "metadata": {
+    "id": "cK_63ubnieI6"
+   },
+   "execution_count": 99,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# The Model\n",
+    "We will be using a pre-trained model. OpenAI's Contrastive Language-Image Pre-training (CLIP) model is a multi-modal Vision Transformer that can encode the visual features of an image into vector embeddings.\n",
+    "\n",
+    "We will store these vector embeddings in a vector database, where images will be clustered by their semantic information, ready for querying."
+   ],
+   "metadata": {
+    "id": "0WrAbzxP6khy"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "from sentence_transformers import SentenceTransformer\n",
+    "model = SentenceTransformer(\"clip-ViT-B-32\")"
+   ],
+   "metadata": {
+    "id": "pHYk-KdmlJxz"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# The Vector Database\n",
+    "\n",
+    "Qdrant is an open-source vector database where we can store vector embeddings and query the nearest neighbours of a given embedding, which is the basis of a recommendation/semantic search engine.\n",
+    "\n",
+    "We start by initializing the Qdrant client and connecting to the cluster hosted on Qdrant Cloud."
+   ],
+   "metadata": {
+    "id": "2h7jMch58ADV"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "from qdrant_client import QdrantClient\n",
+    "from qdrant_client.http import models as rest\n",
+    "from google.colab import userdata\n",
+    "\n",
+    "qdrant_client = QdrantClient(\n",
+    "    url=userdata.get('QDRANT_CLUSTER_URL'),\n",
+    "    api_key=userdata.get('QDRANT_CLUSTER_API_KEY'),\n",
+    ")\n",
+    "# 512-dim vectors to match clip-ViT-B-32, compared by cosine similarity\n",
+    "qdrant_client.recreate_collection(\n",
+    "    collection_name=\"images\",\n",
+    "    vectors_config=rest.VectorParams(size=512, distance=rest.Distance.COSINE),\n",
+    ")"
+   ],
+   "metadata": {
+    "id": "nAObCg-yrzpC"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "A helper function to upsert an embedding into the collection."
+   ],
+   "metadata": {
+    "id": "zGbMrsDL_HH-"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "def upsert_to_db(id, vector, payload):\n",
+    "    qdrant_client.upsert(\n",
+    "        collection_name=\"images\",\n",
+    "        points=[\n",
+    "            rest.PointStruct(\n",
+    "                id=id,\n",
+    "                vector=vector.tolist(),\n",
+    "                payload=payload\n",
+    "            )\n",
+    "        ]\n",
+    "    )"
+   ],
+   "metadata": {
+    "id": "mjTRm85dr13p"
+   },
+   "execution_count": 76,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "from PIL import Image\n",
+    "\n",
+    "for i, link in data.iloc[:, :2].iterrows():\n",
+    "    img = download_file(link[0])\n",
+    "    if img:\n",
+    "        # Encode the image itself (not its path string) with CLIP\n",
+    "        embedding = model.encode(Image.open(img))\n",
+    "        upsert_to_db(i, embedding, {\"link\": link[0]})\n",
+    "        print(f\"upserted {i}\")"
+   ],
+   "metadata": {
+    "id": "MvFEc4MgwSLW"
+   },
+   "execution_count": null,
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "colab": {
+   "provenance": [],
+   "include_colab_link": true
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  }
 },
- "nbformat": 4,
- "nbformat_minor": 2
-}
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
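
The markdown cells above describe the collection as ready for querying, but the commit itself stops at ingestion. Below is a minimal sketch of what a query could look like, reusing the model and qdrant_client objects defined in the notebook; the prompt string and result limit are illustrative assumptions, not part of the commit.

# Sketch (not in this commit): text-to-image search over the "images" collection.
# CLIP encodes text and images into the same 512-dim space, so a text embedding
# can be matched directly against the stored image embeddings.
hits = qdrant_client.search(
    collection_name="images",
    query_vector=model.encode("a dog playing in the snow").tolist(),
    limit=5,
)
for hit in hits:
    print(hit.score, hit.payload["link"])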
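
The ingestion loop upserts one point per image across all ~41k rows, paying one network round-trip per image. Below is a sketch of a batched alternative, assuming the same data, download_file, model, rest, and qdrant_client as in the notebook; the batch size of 64 is an arbitrary choice.

# Sketch (not in this commit): batch upserts to reduce round-trips to the cluster.
from PIL import Image

BATCH_SIZE = 64  # arbitrary; tune for throughput vs. memory
points = []
for i, link in data.iloc[:, :2].iterrows():
    img = download_file(link[0])
    if img:
        points.append(rest.PointStruct(
            id=i,
            vector=model.encode(Image.open(img)).tolist(),
            payload={"link": link[0]},
        ))
    if len(points) >= BATCH_SIZE:
        qdrant_client.upsert(collection_name="images", points=points)
        points = []
if points:  # flush the final partial batch
    qdrant_client.upsert(collection_name="images", points=points)

After ingestion, qdrant_client.count(collection_name="images") can confirm how many points actually landed in the collection.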