CallmeKaito committed on
Commit 27c4528 · verified · 1 Parent(s): ed002eb

Delete CLIP.py

Files changed (1)
  1. CLIP.py +0 -141
CLIP.py DELETED
@@ -1,141 +0,0 @@
- #!/usr/bin/env python
- # coding: utf-8
-
- # In[1]:
-
-
- get_ipython().system('pip install ftfy regex tqdm')
- get_ipython().system('pip install git+https://github.com/openai/CLIP.git')
- get_ipython().system('pip install sentencepiece-0.1.98-cp311-cp311-win_amd64.whl')
-
-
-
- # In[5]:
-
-
- # prompt: install transformers
-
- get_ipython().system('pip install transformers')
-
-
- # In[6]:
-
-
- from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
-
-
- # Load the ViT-GPT2 captioning model that proposes candidate captions.
- feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
- tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
- model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-
-
- # ## Import the necessary libraries and load the CLIP model:
-
- # In[7]:
-
-
- from PIL import Image
- import clip
- import torch
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
- clip_model, preprocess = clip.load("ViT-B/32", device=device)
-
-
- # ## Define a function to generate product descriptions:
-
- # In[8]:
-
-
- image = Image.open("data/download.jpeg")
- pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
- # Return every beam candidate so CLIP has several captions to re-rank.
- output_ids = model.generate(pixel_values, max_length=50, num_beams=4, num_return_sequences=4, early_stopping=True)
- captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
-
- # In[9]:
-
-
- image = preprocess(image).unsqueeze(0).to(device)
- with torch.no_grad():
-     image_features = clip_model.encode_image(image)
-
- text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
- with torch.no_grad():
-     text_features = clip_model.encode_text(text_inputs)
-
- # Normalize the embeddings so the dot product behaves as a cosine similarity,
- # then keep the caption CLIP scores highest (a function wrapping these steps is sketched after this cell).
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
- text_features = text_features / text_features.norm(dim=-1, keepdim=True)
- similarity_scores = image_features @ text_features.T
- best_caption_idx = similarity_scores.argmax().item()
- product_description = captions[best_caption_idx]
- print(product_description)
-
-
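The markdown heading above promises a reusable function, but the notebook runs the steps inline. A minimal sketch of how the captioning and CLIP re-ranking could be wrapped up is shown below, reusing the models already loaded in the cells above; the name generate_product_description and the num_candidates parameter are illustrative, not part of the original file.

def generate_product_description(image_path, num_candidates=4):
    """Caption an image with ViT-GPT2, then keep the caption CLIP matches best to the image."""
    pil_image = Image.open(image_path)

    # Propose several candidate captions with the captioning model.
    pixel_values = feature_extractor(images=pil_image, return_tensors="pt").pixel_values
    output_ids = model.generate(pixel_values, max_length=50, num_beams=num_candidates,
                                num_return_sequences=num_candidates, early_stopping=True)
    candidates = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    # Score every candidate against the image with CLIP and return the best one.
    clip_image = preprocess(pil_image).unsqueeze(0).to(device)
    text_inputs = torch.cat([clip.tokenize(c) for c in candidates]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(clip_image)
        text_features = clip_model.encode_text(text_inputs)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    best_idx = (image_features @ text_features.T).argmax().item()
    return candidates[best_idx]

print(generate_product_description("data/download.jpeg"))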
- # # Using SigLIP
-
- # In[11]:
-
-
- get_ipython().system('pip install sentencepiece')
- get_ipython().system('pip install protobuf')
-
-
- # In[12]:
-
-
- from transformers import AutoProcessor, AutoModel, VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
- import torch
- from PIL import Image
-
-
- # Use distinct names for the SigLIP model so the captioning model below does not overwrite it.
- siglip_model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
- siglip_processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
-
-
- image = Image.open("data/avito4.jpeg")
- inputs = siglip_processor(images=image, return_tensors="pt")
-
-
- feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
- tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
- model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-
- pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
- output_ids = model.generate(pixel_values, max_length=100, num_beams=5, num_return_sequences=5, early_stopping=True)
- captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-
- # Note: the ranking below still uses the CLIP model loaded earlier; a SigLIP-based variant is sketched after this cell.
- image = preprocess(image).unsqueeze(0).to(device)
- with torch.no_grad():
-     image_features = clip_model.encode_image(image)
-
- text_inputs = torch.cat([clip.tokenize(caption) for caption in captions]).to(device)
- with torch.no_grad():
-     text_features = clip_model.encode_text(text_inputs)
-
- image_features = image_features / image_features.norm(dim=-1, keepdim=True)
- text_features = text_features / text_features.norm(dim=-1, keepdim=True)
- similarity_scores = image_features @ text_features.T
- best_caption_idx = similarity_scores.argmax().item()
- product_description = captions[best_caption_idx]
- print(product_description)
-
- # Captions observed for each test image:
- # a vase sitting on a shelf in a store => thuya
- # a wooden bench sitting on top of a wooden floor => avito
- # two old fashioned vases sitting next to each other => avito2
- # three wooden vases sitting on top of a wooden floor => avito3
- # an old fashioned clock sitting on top of a table => avito4
-
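As written, the cell above loads SigLIP but still ranks the candidate captions with the CLIP model from the earlier section. Below is a minimal sketch of what scoring with SigLIP itself could look like, assuming the google/siglip-base-patch16-224 checkpoint loaded above and a transformers version whose SiglipModel exposes get_image_features / get_text_features; it is an illustration, not code from the original file.

# Illustrative only: rank the ViT-GPT2 captions with SigLIP instead of CLIP.
pil_image = Image.open("data/avito4.jpeg")
siglip_inputs = siglip_processor(
    text=captions,
    images=pil_image,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
with torch.no_grad():
    img_emb = siglip_model.get_image_features(pixel_values=siglip_inputs["pixel_values"])
    txt_emb = siglip_model.get_text_features(input_ids=siglip_inputs["input_ids"])

# Cosine similarity between the image and each caption; keep the best match.
img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)
print(captions[(img_emb @ txt_emb.T).argmax().item()])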
-
-
-
- # In[ ]:
-
-
-
-
- # # Implementing LLaVA
-
- # https://colab.research.google.com/drive/1veefV17NcD1S4ou4nF8ABkfm8-TgU0Dr#scrollTo=XN2vJCPZk1UY
-
- # In[ ]:
-
-
-
-
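The LLaVA section above is only a stub (a heading and a Colab link). A minimal sketch of how a description could be generated with the LLaVA integration in transformers is given below; the llava-hf/llava-1.5-7b-hf checkpoint, the prompt wording, and the generation settings are assumptions for illustration, not taken from this repository.

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Illustrative checkpoint; fp16 + device_map="auto" assumes a GPU and the accelerate package.
model_id = "llava-hf/llava-1.5-7b-hf"
llava_processor = AutoProcessor.from_pretrained(model_id)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

image = Image.open("data/avito4.jpeg")
prompt = "USER: <image>\nWrite a short product description for this item. ASSISTANT:"
inputs = llava_processor(text=prompt, images=image, return_tensors="pt").to(llava_model.device, torch.float16)

with torch.no_grad():
    output_ids = llava_model.generate(**inputs, max_new_tokens=80)
print(llava_processor.decode(output_ids[0], skip_special_tokens=True))

Note that decode returns the prompt together with the generated answer, so the description is the text after "ASSISTANT:".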