liubangwei committed · Commit 3ee7efa · 1 parent: d37eb96
fix retrieval

Files changed:
- app.py +23 -16
- image_embeddings.pkl +2 -2
- src/__pycache__/arguments.cpython-310.pyc +0 -0
- src/__pycache__/model.cpython-310.pyc +0 -0
- src/vlm_backbone/intern_vl/__pycache__/modeling_internvl_chat.cpython-310.pyc +0 -0
- src/vlm_backbone/intern_vl/__pycache__/processing_internvl.cpython-310.pyc +0 -0
- src/vlm_backbone/llava_next/__pycache__/__init__.cpython-310.pyc +0 -0
- src/vlm_backbone/llava_next/__pycache__/modeling_llava_next.cpython-310.pyc +0 -0
app.py
CHANGED
@@ -22,8 +22,8 @@ def load_model():
     global IMAGE_TOKEN
 
     model_args = ModelArguments(
-
-        model_name="lbw18601752667/IDMR-2B",
+        model_name="/fs-computility/ai-shen/kilab-shared/liubangwei/ckpt/my_hf/IDMR-2B",
+        # model_name="lbw18601752667/IDMR-2B",
         model_backbone="internvl_2_5",
     )
 
@@ -81,15 +81,16 @@ def get_inputs(processor, text, image_path=None, image=None):
     return inputs
 
 def encode_image_library(image_paths):
-
+    embeddings_dict = {}
     for img_path in image_paths:
         text = f"{IMAGE_TOKEN}\n Represent the given image."
         print(f"text: {text}")
         inputs = get_inputs(processor, text, image_path=img_path)
         with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
             output = model(tgt=inputs)
-
-
+        img_name = os.path.basename(img_path)
+        embeddings_dict[img_name] = output["tgt_reps"].float().cpu().numpy()
+    return embeddings_dict
 
 def save_embeddings(embeddings, file_path="image_embeddings.pkl"):
     with open(file_path, "wb") as f:
@@ -115,22 +116,26 @@ def retrieve_images(query_text, query_image, top_n=TOP_N):
         image = None
     inputs = get_inputs(processor, query_text, image=image)
     print(f"inputs: {inputs}")
-    # with torch.no_grad():
     with torch.no_grad(), torch.autocast(device_type=device, dtype=torch.bfloat16):
         query_embedding = model(qry=inputs)["qry_reps"].float().cpu().numpy()
 
-
+    embeddings_dict = load_embeddings()
+
+    img_names = []
+    embeddings = []
+    for img_name in os.listdir(IMAGE_DIR):
+        if img_name in embeddings_dict:
+            img_names.append(img_name)
+            embeddings.append(embeddings_dict[img_name])
+    embeddings = np.stack(embeddings)
 
     similarity = cosine_similarity(query_embedding, embeddings)
     similarity = similarity.T
     print(f"cosine_similarity: {similarity}")
     top_indices = np.argsort(-similarity).squeeze(0)[:top_n]
     print(f"top_indices: {top_indices}")
-
-    # similarity = model.compute_similarity(np.expand_dims(query_embedding.squeeze(0), axis=1), embeddings.squeeze(1))
-    # print(f"model.compute_similarity: {similarity}")
 
-    return [
+    return [os.path.join(IMAGE_DIR, img_names[i]) for i in top_indices]
 
 def demo(query_text, query_image):
     # print(f"query_text: {query_text}, query_image: {query_image}, type(query_image): {type(query_image)}, image shape: {query_image.shape if query_image is not None else 'None'}")
@@ -157,13 +162,15 @@ def load_examples():
 
 iface = gr.Interface(
     fn=demo,
-    inputs=[
-
+    inputs=[
+        gr.Textbox(placeholder="Enter your query text here...", label="Query Text"),
+        gr.Image(label="Query Image", type="numpy")
+    ],
+    outputs=gr.Gallery(label=f"Retrieved Images (Top {TOP_N})", columns=3),
     examples=load_examples(),
-    title="
-    description="Enter a query
+    title="Instance-Driven Multi-modal Retrieval (IDMR) Demo",
+    description="Enter a query text or upload an image to retrieve relevant images from the library. You can click on the examples below to try them out."
 )
-
 if not os.path.exists("image_embeddings.pkl"):
     embeddings = encode_image_library(image_paths)
     save_embeddings(embeddings)
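The substance of the retrieval fix is alignment: the library embeddings are now keyed by image file name, and retrieve_images rebuilds the name list and the embedding matrix in the same order before ranking, so the top-N indices can be mapped back to file paths. Below is a minimal, self-contained sketch of that ranking step with dummy data in place of the model outputs and the pickled library; it is not a verbatim copy of app.py.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Dummy stand-ins for what app.py produces: a query embedding from the model
# and an {image_name: embedding} dict loaded from image_embeddings.pkl.
rng = np.random.default_rng(0)
embeddings_dict = {f"img_{i}.jpg": rng.normal(size=(1, 8)) for i in range(5)}
query_embedding = rng.normal(size=(1, 8))

# Keep names and vectors in one order so ranking indices stay aligned with files.
img_names = list(embeddings_dict)
embeddings = np.concatenate([embeddings_dict[n] for n in img_names], axis=0)

# Rank the library by cosine similarity to the query and keep the top-N names.
similarity = cosine_similarity(query_embedding, embeddings)  # shape (1, num_images)
top_indices = np.argsort(-similarity.squeeze(0))[:3]
print([img_names[i] for i in top_indices])

The point the commit addresses is that a score at position i must refer to the same image as img_names[i]; building both from the same ordered pass over the library guarantees that.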
image_embeddings.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:05be7a8cbfdc77b64e473f3d2c30c47c6e522623abb11d288fb54fad07a5f8da
+size 412525
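image_embeddings.pkl is tracked with Git LFS, so the diff only swaps the pointer (oid and size) for the re-encoded library, which is now a dict keyed by image file name. A small sketch of the save/load round-trip follows; it assumes pickle is used for serialization (the dump call itself is not visible in this diff) and that load_embeddings, referenced in retrieve_images but not shown here, simply mirrors save_embeddings.

import pickle
import numpy as np

def save_embeddings(embeddings, file_path="image_embeddings.pkl"):
    # Serialize the {image_name: embedding} dict produced by encode_image_library.
    with open(file_path, "wb") as f:
        pickle.dump(embeddings, f)

def load_embeddings(file_path="image_embeddings.pkl"):
    # Assumed counterpart of save_embeddings: read the dict back for retrieval.
    with open(file_path, "rb") as f:
        return pickle.load(f)

if __name__ == "__main__":
    demo_dict = {"cat.jpg": np.zeros((1, 8), dtype=np.float32)}
    save_embeddings(demo_dict, "demo_embeddings.pkl")
    print(list(load_embeddings("demo_embeddings.pkl")))  # ['cat.jpg']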
src/__pycache__/arguments.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/arguments.cpython-310.pyc and b/src/__pycache__/arguments.cpython-310.pyc differ

src/__pycache__/model.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/model.cpython-310.pyc and b/src/__pycache__/model.cpython-310.pyc differ

src/vlm_backbone/intern_vl/__pycache__/modeling_internvl_chat.cpython-310.pyc
CHANGED
Binary files a/src/vlm_backbone/intern_vl/__pycache__/modeling_internvl_chat.cpython-310.pyc and b/src/vlm_backbone/intern_vl/__pycache__/modeling_internvl_chat.cpython-310.pyc differ

src/vlm_backbone/intern_vl/__pycache__/processing_internvl.cpython-310.pyc
CHANGED
Binary files a/src/vlm_backbone/intern_vl/__pycache__/processing_internvl.cpython-310.pyc and b/src/vlm_backbone/intern_vl/__pycache__/processing_internvl.cpython-310.pyc differ

src/vlm_backbone/llava_next/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/src/vlm_backbone/llava_next/__pycache__/__init__.cpython-310.pyc and b/src/vlm_backbone/llava_next/__pycache__/__init__.cpython-310.pyc differ

src/vlm_backbone/llava_next/__pycache__/modeling_llava_next.cpython-310.pyc
CHANGED
Binary files a/src/vlm_backbone/llava_next/__pycache__/modeling_llava_next.cpython-310.pyc and b/src/vlm_backbone/llava_next/__pycache__/modeling_llava_next.cpython-310.pyc differ