rynmurdock committed on
Commit
fac61f6
·
1 Parent(s): aeeead2

other tiger
.gitattributes CHANGED
@@ -28,3 +28,5 @@ first.png filter=lfs diff=lfs merge=lfs -text
  fourth.png filter=lfs diff=lfs merge=lfs -text
  *.mp4 filter=lfs diff=lfs merge=lfs -text
  *.png filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ last_epoch_ckpt/diffusion_pytorch_model.safetensors
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 rynmurdock
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.MD ADDED
@@ -0,0 +1,15 @@
+ # The Other Tiger
+
+ ## tl;dr
+ Train on embeddings of media preferred by a specific user -> produce embeddings of media they may enjoy.
+
+ In our case, we take the ECLIPSE `text embedding -> image embedding` prior (https://arxiv.org/abs/2312.04655) and finetune it into a `preferred image embeddings -> held-out image embedding` prior.
+
+ ### Related work:
+
+ Patron et al. model preference using a diffusion prior conditioned on user IDs and ratings: https://arxiv.org/abs/2502.18477
+
+ Wang et al. model preference using a generator conditioned on averaged CLIP embeddings of users: https://arxiv.org/abs/2304.03516
+
+ My previous work, based on collaborative filtering with CLIP embeddings: https://github.com/rynmurdock/generative_recommender
+
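The training loop itself is not part of this commit, so the following is only a minimal sketch of the objective described in the tl;dr, stitched together from the helpers this commit does add (model.py, data.py, config.py). The MSE loss, the `requires_grad` filter, and the exact embedding shapes are assumptions rather than code from the repo.

```python
import torch
import config
from model import get_model_and_tokenizer, get_optimizer
from data import get_dataloader

# Pass None to start from the pretrained ECLIPSE prior instead of ./last_epoch_ckpt/
model, image_encoder = get_model_and_tokenizer(None, config.device, config.dtype)
optimizer = get_optimizer([p for p in model.prior.parameters() if p.requires_grad], config.lr)
loader = get_dataloader(config.data_path, config.batch_size, config.num_workers,
                        processor=model.prior_pipe.image_processor)

for batch in loader:
    if batch is None:          # my_collate returns None on a bad batch
        continue
    samples, targets = batch   # samples: k=8 preferred images per row; targets: one held-out image
    b, k = samples.shape[:2]
    with torch.no_grad():
        cond = image_encoder(samples.flatten(0, 1).to(config.device, config.dtype))["image_embeds"]
        cond = cond.view(b, k, -1).float()   # (b, 8, d) preferred-image embeddings
        target = image_encoder(targets.to(config.device, config.dtype))["image_embeds"].float()
    latent = torch.randn(b, 1, cond.shape[-1], device=config.device)
    pred = model(latent, cond).predicted_image_embedding
    loss = torch.nn.functional.mse_loss(pred.view(b, -1), target)  # assumed objective
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```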
README.md DELETED
@@ -1,12 +0,0 @@
- ---
- license: mit
- title: Blue Tigers
- sdk: gradio
- emoji: 👍
- colorFrom: blue
- colorTo: purple
- pinned: true
- ---
- # Blue Tigers
-
- Zahir with movement.
app.py CHANGED
@@ -1,32 +1,29 @@
1
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  # TODO unify/merge origin and this
4
  # TODO save & restart from (if it exists) dataframe parquet
5
- import torch
6
 
7
- # lol
8
- DEVICE = 'cuda'
9
- STEPS = 8
10
- output_hidden_state = False
11
  device = "cuda"
12
- dtype = torch.bfloat16
13
 
14
 
15
  import spaces
16
-
17
  import matplotlib.pyplot as plt
18
- import matplotlib
19
- import logging
20
 
21
  import os
22
- import imageio
23
  import gradio as gr
24
- import numpy as np
25
- from sklearn.svm import LinearSVC
26
  import pandas as pd
27
  from apscheduler.schedulers.background import BackgroundScheduler
28
- import sched
29
- import threading
30
 
31
  import random
32
  import time
@@ -43,107 +40,38 @@ prevs_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'l
43
  import spaces
44
  start_time = time.time()
45
 
46
- prompt_list = [p for p in list(set(
47
- pd.read_csv('./twitter_prompts.csv').iloc[:, 1].tolist())) if type(p) == str]
48
-
49
-
50
  ####################### Setup Model
51
- from diffusers import EulerDiscreteScheduler, LCMScheduler, AutoencoderTiny, UNet2DConditionModel, AutoencoderKL, AutoPipelineForText2Image
52
- from transformers import CLIPTextModel
53
- from huggingface_hub import hf_hub_download
54
- from safetensors.torch import load_file
55
  from PIL import Image
56
- from transformers import CLIPVisionModelWithProjection
57
  import uuid
58
- import av
59
-
60
- def write_video(file_name, images, fps=16):
61
- container = av.open(file_name, mode="w")
62
-
63
- stream = container.add_stream("h264", rate=fps)
64
- # stream.options = {'preset': 'faster'}
65
- stream.thread_count = 1
66
- stream.width = 512
67
- stream.height = 512
68
- stream.pix_fmt = "yuv420p"
69
-
70
- for img in images:
71
- img = np.array(img)
72
- img = np.round(img).astype(np.uint8)
73
- frame = av.VideoFrame.from_ndarray(img, format="rgb24")
74
- for packet in stream.encode(frame):
75
- container.mux(packet)
76
- # Flush stream
77
- for packet in stream.encode():
78
- container.mux(packet)
79
- # Close the file
80
- container.close()
81
-
82
- def imio_write_video(file_name, images, fps=15):
83
- writer = imageio.get_writer(file_name, fps=fps)
84
-
85
- for im in images:
86
- writer.append_data(np.array(im))
87
- writer.close()
88
-
89
-
90
- image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="sdxl_models/image_encoder", torch_dtype=dtype,
91
- device_map='cuda')
92
- #vae = AutoencoderTiny.from_pretrained("madebyollin/taesd", torch_dtype=dtype)
93
-
94
- # vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=dtype)
95
- # vae = compile_unet(vae, config=config)
96
-
97
- #finetune_path = '''/home/ryn_mote/Misc/finetune-sd1.5/dreambooth-model best'''''
98
- #unet = UNet2DConditionModel.from_pretrained(finetune_path+'/unet/').to(dtype)
99
- #text_encoder = CLIPTextModel.from_pretrained(finetune_path+'/text_encoder/').to(dtype)
100
-
101
- #rynmurdock/Sea_Claws
102
- model_id = "stabilityai/stable-diffusion-xl-base-1.0"
103
- sdxl_lightening = "ByteDance/SDXL-Lightning"
104
- ckpt = "sdxl_lightning_8step_unet.safetensors"
105
- unet = UNet2DConditionModel.from_config(model_id, subfolder="unet", low_cpu_mem_usage=True, device_map=DEVICE).to(torch.float16)
106
- unet.load_state_dict(load_file(hf_hub_download(sdxl_lightening, ckpt)))
107
-
108
- image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map=DEVICE)
109
- pipe = AutoPipelineForText2Image.from_pretrained(model_id, unet=unet, torch_dtype=torch.float16, variant="fp16", image_encoder=image_encoder, low_cpu_mem_usage=True)
110
- pipe.unet._load_ip_adapter_weights(torch.load(hf_hub_download('h94/IP-Adapter', 'sdxl_models/ip-adapter_sdxl_vit-h.bin')))
111
- pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl_vit-h.bin")
112
- pipe.register_modules(image_encoder = image_encoder)
113
- pipe.set_ip_adapter_scale(0.8)
114
-
115
- #pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16, low_cpu_mem_usage=True)
116
- pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
117
-
118
- pipe.to(device=DEVICE).to(dtype=dtype)
119
- output_hidden_state = False
120
-
121
-
122
-
123
-
124
- # pipe.unet.fuse_qkv_projections()
125
- #pipe.enable_free_init(method="gaussian", use_fast_sampling=True)
126
-
127
- #pipe.unet = torch.compile(pipe.unet)
128
- #pipe.vae = torch.compile(pipe.vae)
129
-
130
 
131
 
132
  @spaces.GPU()
133
  def generate_gpu(in_im_embs, prompt='the scene'):
134
  with torch.no_grad():
135
- print(prompt)
136
- in_im_embs = in_im_embs.to('cuda').unsqueeze(0)
137
- output = pipe(prompt=prompt, guidance_scale=1, added_cond_kwargs={}, ip_adapter_image_embeds=[in_im_embs], num_inference_steps=STEPS)
138
- im_emb, _ = pipe.encode_image(
139
- output.images[0], 'cuda', 1, output_hidden_state
140
- )
141
- im_emb = im_emb.detach().to('cpu').to(torch.float32)
142
- return output, im_emb
143
-
144
-
145
- def generate(in_im_embs, prompt='the scene'):
146
- output, im_emb = generate_gpu(in_im_embs, prompt)
 
 
 
 
 
 
 
 
 
 
 
147
  nsfw = False#maybe_nsfw(output.images[0])
148
 
149
  name = str(uuid.uuid4()).replace("-", "")
@@ -154,87 +82,35 @@ def generate(in_im_embs, prompt='the scene'):
154
  # TODO could return an automatic dislike of auto dislike on the backend for neither as well; just would need refactoring.
155
  return None, im_emb
156
 
157
- output.images[0].save(path)
158
  return path, im_emb
159
 
160
 
161
  #######################
162
 
163
-
164
-
165
-
166
-
167
-
168
  @spaces.GPU()
169
- def solver(embs, ys):
170
- print('ys:', ys,'EMBS:', embs.shape, embs)
171
- ys = torch.tensor(ys).to('cpu', dtype=torch.float32).squeeze().unsqueeze(1)
172
-
173
- sol = LinearSVC(class_weight='balanced').fit(np.array(embs), np.array(torch.tensor(ys).float() * 2 - 1)).coef_
174
- return torch.tensor(sol).to('cpu', dtype=torch.float32)
175
-
176
-
177
 
 
178
 
 
179
  def get_user_emb(embs, ys):
180
- # sample only as many negatives as there are positives
181
- indices = range(len(ys))
182
- pos_indices = [i for i in indices if ys[i] > .5]
183
- neg_indices = [i for i in indices if ys[i] <= .5]
184
-
185
- mini = min(len(pos_indices), len(neg_indices))
186
 
187
- if len(ys) > 20: # drop earliest of whichever of neg or pos is most abundant
188
- if len(pos_indices) > len(neg_indices):
189
- ind = pos_indices[0]
190
- else:
191
- ind = neg_indices[0]
192
- ys.pop(ind)
193
- embs.pop(ind)
194
- print('Dropping at 20')
195
-
196
- if mini < 1:
197
- feature_embs = torch.stack([torch.randn(1024), torch.randn(1024)])
198
- ys_t = [0, 1]
199
- print('Not enough ratings.')
200
- else:
201
- indices = range(len(ys))
202
- ys_t = [ys[i] for i in indices]
203
- feature_embs = torch.stack([embs[e].detach().cpu() for e in indices]).squeeze()
204
-
205
- # scaler = preprocessing.StandardScaler().fit(feature_embs)
206
- # feature_embs = scaler.transform(feature_embs)
207
- # ys_t = ys
208
-
209
- print(np.array(feature_embs).shape, np.array(ys_t).shape)
210
-
211
- sol = solver(feature_embs.squeeze(), ys_t)
212
- dif = torch.tensor(sol, dtype=dtype).to(device)
213
-
214
- # could j have a base vector of a black image
215
- latest_pos = (random.sample([feature_embs[i] for i in range(len(ys_t)) if ys_t[i] > .5], 1)[0]).to(device, dtype)
216
-
217
- dif = ((dif / dif.std()) * latest_pos.std())
218
 
219
- sol = (1*latest_pos + 3*dif)/4
220
- return sol
221
 
222
-
223
- def pluck_img(user_id, user_emb):
224
- not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, 'gone') == 'gone' for i in prevs_df.iterrows()]]
225
- while len(not_rated_rows) == 0:
226
- not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, 'gone') == 'gone' for i in prevs_df.iterrows()]]
227
- time.sleep(.1)
228
- # TODO optimize this lol
229
- best_sim = -100000
230
- for i in not_rated_rows.iterrows():
231
- # TODO sloppy .to but it is 3am.
232
- sim = torch.cosine_similarity(i[1]['embeddings'].detach().to('cpu'), user_emb.detach().to('cpu'))
233
- if sim > best_sim:
234
- best_sim = sim
235
- best_row = i[1]
236
- img = best_row['paths']
237
- return img
238
 
239
 
240
  def background_next_image():
@@ -256,43 +132,30 @@ def background_next_image():
256
  # media.
257
 
258
  unrated_from_user = not_rated_rows[[i[1]['from_user_id'] == uid for i in not_rated_rows.iterrows()]]
259
- rated_from_user = rated_rows[[i[1]['from_user_id'] == uid for i in rated_rows.iterrows()]]
260
 
261
- # we pop previous ratings if there are > n
262
- if len(rated_from_user) >= 15:
263
- oldest = rated_from_user.iloc[0]['paths']
264
- prevs_df = prevs_df[prevs_df['paths'] != oldest]
265
  # we don't compute more after n are in the queue for them
266
  if len(unrated_from_user) >= 10:
267
  continue
268
 
269
  if len(rated_rows) < 5:
270
- continue
271
-
272
- embs, ys = pluck_embs_ys(uid)
273
-
274
- user_emb = get_user_emb(embs, [y[1] for y in ys])
275
-
276
 
277
  global glob_idx
278
  glob_idx += 1
279
- if glob_idx >= (len(prompt_list)-1):
280
- glob_idx = 0
281
-
282
 
283
- if glob_idx % 7 == 0:
284
- text = prompt_list[glob_idx]
285
- else:
286
- text = 'an image'
287
- img, embs = generate(user_emb, text)
288
 
289
  if img:
290
  tmp_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'latest_user_to_rate', 'text', 'gemb'])
291
  tmp_df['paths'] = [img]
292
- tmp_df['embeddings'] = [embs]
293
  tmp_df['user:rating'] = [{' ': ' '}]
294
  tmp_df['from_user_id'] = [uid]
295
- tmp_df['text'] = [text]
296
  prevs_df = pd.concat((prevs_df, tmp_df))
297
  # we can free up storage by deleting the image
298
  if len(prevs_df) > 500:
@@ -305,19 +168,27 @@ def background_next_image():
305
  # only keep 50 images & embeddings & ips, then remove oldest besides calibrating
306
  prevs_df = pd.concat((prevs_df.iloc[:6], prevs_df.iloc[7:]))
307
 
 
 
 
 
 
308
 
309
- def pluck_embs_ys(user_id):
310
- rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) != None for i in prevs_df.iterrows()]]
311
- #not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) == None for i in prevs_df.iterrows()]]
312
- #while len(not_rated_rows) == 0:
313
- # not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) == None for i in prevs_df.iterrows()]]
314
- # rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) != None for i in prevs_df.iterrows()]]
315
- # time.sleep(.01)
316
- # print('current user has 0 not_rated_rows')
317
-
318
- embs = rated_rows['embeddings'].to_list()
319
- ys = [i[user_id] for i in rated_rows['user:rating'].to_list()]
320
- return embs, ys
 
 
 
321
 
322
  def next_image(calibrate_prompts, user_id):
323
  with torch.no_grad():
@@ -326,11 +197,8 @@ def next_image(calibrate_prompts, user_id):
326
  image = prevs_df[prevs_df['paths'] == cal_video]['paths'].to_list()[0]
327
  return image, calibrate_prompts,
328
  else:
329
- embs, ys = pluck_embs_ys(user_id)
330
- ys_here = [y[1] for y in ys]
331
- user_emb = get_user_emb(embs, ys_here)
332
- image = pluck_img(user_id, user_emb)
333
- return image, calibrate_prompts,
334
 
335
 
336
 
@@ -451,7 +319,7 @@ Explore the latent space without text prompts based on your preferences. Learn m
451
  ''', elem_id="description")
452
  user_id = gr.State()
453
  # calibration videos -- this is a misnomer now :D
454
- calibrate_prompts = gr.State([
455
  './5o.png',
456
  './2o.png',
457
  './6o.png',
@@ -462,22 +330,18 @@ Explore the latent space without text prompts based on your preferences. Learn m
462
  './4o.png',
463
  './10o.png',
464
  './9o.png',
465
- ])
 
466
  def l():
467
  return None
468
 
469
  with gr.Row(elem_id='output-image'):
470
  img = gr.Image(
471
- label='Lightning',
472
- # autoplay=True,
473
- interactive=False,
474
- # height=512,
475
- # width=512,
476
- #include_audio=False,
477
- elem_id="video_output",
478
- type='filepath',
479
- )
480
- #img.play(l, js='''document.querySelector('[data-testid="Lightning-player"]').loop = true''')
481
 
482
 
483
 
@@ -531,24 +395,24 @@ Explore the latent space without text prompts based on your preferences. Learn m
531
  </ div>''')
532
 
533
  # TODO quiet logging
534
-
535
  scheduler = BackgroundScheduler()
536
  scheduler.add_job(func=background_next_image, trigger="interval", seconds=.2)
537
  scheduler.start()
538
 
539
- #thread = threading.Thread(target=background_next_image,)
540
- #thread.start()
541
-
542
  # TODO shouldn't call this before gradio launch, yeah?
543
  @spaces.GPU()
544
  def encode_space(x):
545
- im_emb, _ = pipe.encode_image(
546
- image, DEVICE, 1, output_hidden_state
 
 
 
547
  )
 
548
  return im_emb.detach().to('cpu').to(torch.float32)
549
 
550
  # prep our calibration videos
551
- for im, txt in [ # DO NOT NAME THESE PNGs JUST NUMBERS! apparently we assign images by number
552
  ('./1o.png', 'describe the scene: omens in the suburbs'),
553
  ('./2o.png', 'describe the scene: geometric abstract art of a windmill'),
554
  ('./3o.png', 'describe the scene: memento mori'),
@@ -559,7 +423,9 @@ for im, txt in [ # DO NOT NAME THESE PNGs JUST NUMBERS! apparently we assign ima
559
  ('./8o.png', '8 '),
560
  ('./9o.png', '9 '),
561
  ('./10o.png', '10 '),
562
- ]:
 
 
563
  tmp_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'text', 'gemb'])
564
  tmp_df['paths'] = [im]
565
  image = Image.open(im).convert('RGB')
 
1
 
2
+ import gradio as gr
3
+ import random
4
+ import time
5
+ import torch
6
+
7
+
8
+ import config
9
+ from model import get_model_and_tokenizer
10
+
11
+ model, model.prior_pipe.image_encoder = get_model_and_tokenizer(config.model_path,
12
+ 'cuda', torch.bfloat16)
13
 
14
  # TODO unify/merge origin and this
15
  # TODO save & restart from (if it exists) dataframe parquet
 
16
 
 
 
 
 
17
  device = "cuda"
 
18
 
19
 
20
  import spaces
 
21
  import matplotlib.pyplot as plt
 
 
22
 
23
  import os
 
24
  import gradio as gr
 
 
25
  import pandas as pd
26
  from apscheduler.schedulers.background import BackgroundScheduler
 
 
27
 
28
  import random
29
  import time
 
40
  import spaces
41
  start_time = time.time()
42
 
 
 
 
 
43
  ####################### Setup Model
44
+ from diffusers import EulerDiscreteScheduler
 
 
 
45
  from PIL import Image
 
46
  import uuid
47
 
48
 
49
  @spaces.GPU()
50
  def generate_gpu(in_im_embs, prompt='the scene'):
51
  with torch.no_grad():
52
+ in_im_embs = in_im_embs.to('cuda')
53
+
54
+ negative_image_embeds = in_im_embs[0] # model.prior_pipe.get_zero_embed()
55
+ positive_image_embeds = in_im_embs[1]
56
+
57
+ images = model.kandinsky_pipe(
58
+ num_inference_steps=50,
59
+ image_embeds=positive_image_embeds,
60
+ negative_image_embeds=negative_image_embeds,
61
+ guidance_scale=11,
62
+ ).images[0]
63
+ cond = (
64
+ model.prior_pipe.image_processor(images, return_tensors="pt")
65
+ .pixel_values[0]
66
+ .unsqueeze(0)
67
+ .to(dtype=model.prior_pipe.image_encoder.dtype, device=device)
68
+ )
69
+ im_emb = model.prior_pipe.image_encoder(cond)["image_embeds"]
70
+ return images, im_emb
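For readers skimming the diff: `generate_gpu` now takes a single stacked tensor rather than one IP-Adapter embedding. Row 0 is used as the negative image condition and row 1 as the positive one, matching what `get_user_emb` returns further down (`torch.stack([sample_embs(negatives), sample_embs(positives)])`). A small illustration of that layout, with hypothetical values and the embedding width of 1280 taken from the checkpoint config:

```python
import torch

# hypothetical prior outputs; in the app these come from sample_embs()
neg = torch.randn(1, 1280)   # conditioned on the user's disliked images
pos = torch.randn(1, 1280)   # conditioned on the user's liked images

in_im_embs = torch.stack([neg, pos])      # shape (2, 1, 1280), what get_user_emb() returns
negative_image_embeds = in_im_embs[0]     # steers the Kandinsky decoder away from dislikes
image_embeds = in_im_embs[1]              # the positive condition passed as image_embeds
```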
71
+
72
+
73
+ def generate(in_im_embs, ):
74
+ output, im_emb = generate_gpu(in_im_embs)
75
  nsfw = False#maybe_nsfw(output.images[0])
76
 
77
  name = str(uuid.uuid4()).replace("-", "")
 
82
  # TODO could return an automatic dislike of auto dislike on the backend for neither as well; just would need refactoring.
83
  return None, im_emb
84
 
85
+ output.save(path)
86
  return path, im_emb
87
 
88
 
89
  #######################
90
 
 
 
 
 
 
91
  @spaces.GPU()
92
+ def sample_embs(prompt_embeds):
93
+ latent = torch.randn(prompt_embeds.shape[0], 1, prompt_embeds.shape[-1])
94
+ if prompt_embeds.shape[1] < 8: # TODO grab as `k` arg from config
95
+ prompt_embeds = torch.nn.functional.pad(prompt_embeds, [0, 0, 0, 8-prompt_embeds.shape[1]])
96
+ assert prompt_embeds.shape[1] == 8, f"The model is set to take `k` cond image embeds but got shape {prompt_embeds.shape}"
97
+ image_embeds = model(latent.to('cuda'), prompt_embeds.to('cuda')).predicted_image_embedding
 
 
98
 
99
+ return image_embeds
100
 
101
+ @spaces.GPU()
102
  def get_user_emb(embs, ys):
103
+ # sample up to 8 positives and up to 8 negatives from the user's rating history
+ positives = [e for e, y in zip(embs, ys) if y == 1]
+ pos_sample = random.sample(positives, min(8, len(positives)))
+ positives = torch.stack(pos_sample, 1)
+
+ negs = [e for e, y in zip(embs, ys) if y == 0]
+ negative_embs = random.sample(negs, min(8, len(negs)))
+ negatives = torch.stack(negative_embs, 1)
110
 
111
+ image_embeds = torch.stack([sample_embs(negatives), sample_embs(positives)])
 
112
 
113
+ return image_embeds
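`sample_embs` zero-pads the conditioning set up to `k = 8` embeddings before calling the prior, which mirrors the zeroed-out conditioning dropout applied at training time in data.py. A standalone illustration of that `F.pad` call (the shapes are hypothetical):

```python
import torch
import torch.nn.functional as F

prompt_embeds = torch.randn(1, 3, 1280)   # only 3 rated images so far (hypothetical)
if prompt_embeds.shape[1] < 8:
    # pad spec reads (last dim left, last dim right, 2nd-to-last left, 2nd-to-last right),
    # so this appends 8 - 3 = 5 all-zero embeddings along the sequence dimension
    prompt_embeds = F.pad(prompt_embeds, [0, 0, 0, 8 - prompt_embeds.shape[1]])
print(prompt_embeds.shape)                # torch.Size([1, 8, 1280])
```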
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
 
116
  def background_next_image():
 
132
  # media.
133
 
134
  unrated_from_user = not_rated_rows[[i[1]['from_user_id'] == uid for i in not_rated_rows.iterrows()]]
 
135
 
 
 
 
 
136
  # we don't compute more after n are in the queue for them
137
  if len(unrated_from_user) >= 10:
138
  continue
139
 
140
  if len(rated_rows) < 5:
141
+ continue
 
 
 
 
 
142
 
143
  global glob_idx
144
  glob_idx += 1
145
+
146
+ ems = rated_rows['embeddings'].to_list()
147
+ ys = [i[uid][0] for i in rated_rows['user:rating'].to_list()]
148
 
149
+ emz = get_user_emb(ems, ys)
150
+ img, embs = generate(emz)
 
 
 
151
 
152
  if img:
153
  tmp_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'latest_user_to_rate', 'text', 'gemb'])
154
  tmp_df['paths'] = [img]
155
+ tmp_df['embeddings'] = [embs.to(torch.float32).to('cpu')]
156
  tmp_df['user:rating'] = [{' ': ' '}]
157
  tmp_df['from_user_id'] = [uid]
158
+ tmp_df['text'] = ['']
159
  prevs_df = pd.concat((prevs_df, tmp_df))
160
  # we can free up storage by deleting the image
161
  if len(prevs_df) > 500:
 
168
  # only keep 50 images & embeddings & ips, then remove oldest besides calibrating
169
  prevs_df = pd.concat((prevs_df.iloc[:6], prevs_df.iloc[7:]))
170
 
171
+ def pluck_img(user_id):
172
+ rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) is not None for i in prevs_df.iterrows()]]
173
+ ems = rated_rows['embeddings'].to_list()
174
+ ys = [i[user_id][0] for i in rated_rows['user:rating'].to_list()]
175
+ user_emb = get_user_emb(ems, ys)
176
 
177
+ not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, 'gone') == 'gone' for i in prevs_df.iterrows()]]
178
+ while len(not_rated_rows) == 0:
179
+ not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, 'gone') == 'gone' for i in prevs_df.iterrows()]]
180
+ time.sleep(.1)
181
+ # TODO optimize this lol
182
+ best_sim = -10000000
183
+ for i in not_rated_rows.iterrows():
184
+ # TODO sloppy .to but it is 3am.
185
+ sim = torch.cosine_similarity(i[1]['embeddings'].detach().to('cpu'), user_emb.detach().to('cpu'), -1)
186
+ if len(sim) > 1: sim = sim[1]
187
+ if sim.squeeze() > best_sim:
188
+ best_sim = sim
189
+ best_row = i[1]
190
+ img = best_row['paths']
191
+ return img
192
 
193
  def next_image(calibrate_prompts, user_id):
194
  with torch.no_grad():
 
197
  image = prevs_df[prevs_df['paths'] == cal_video]['paths'].to_list()[0]
198
  return image, calibrate_prompts,
199
  else:
200
+ image = pluck_img(user_id)
201
+ return image, calibrate_prompts
 
 
 
202
 
203
 
204
 
 
319
  ''', elem_id="description")
320
  user_id = gr.State()
321
  # calibration videos -- this is a misnomer now :D
322
+ calibrate_prompts = [
323
  './5o.png',
324
  './2o.png',
325
  './6o.png',
 
330
  './4o.png',
331
  './10o.png',
332
  './9o.png',
333
+ ]
334
+ calibrate_prompts = gr.State(['image_init/'+c for c in calibrate_prompts])
335
  def l():
336
  return None
337
 
338
  with gr.Row(elem_id='output-image'):
339
  img = gr.Image(
340
+ label='Lightning',
341
+ interactive=False,
342
+ elem_id="output_im",
343
+ type='filepath',
344
+ )
 
 
 
 
 
345
 
346
 
347
 
 
395
  </ div>''')
396
 
397
  # TODO quiet logging
 
398
  scheduler = BackgroundScheduler()
399
  scheduler.add_job(func=background_next_image, trigger="interval", seconds=.2)
400
  scheduler.start()
401
 
 
 
 
402
  # TODO shouldn't call this before gradio launch, yeah?
403
  @spaces.GPU()
404
  def encode_space(x):
405
+ im = (
406
+ model.prior_pipe.image_processor(x, return_tensors="pt")
407
+ .pixel_values[0]
408
+ .unsqueeze(0)
409
+ .to(dtype=model.prior_pipe.image_encoder.dtype, device=device)
410
  )
411
+ im_emb = model.prior_pipe.image_encoder(im)["image_embeds"]
412
  return im_emb.detach().to('cpu').to(torch.float32)
413
 
414
  # prep our calibration videos
415
+ m_calibrate = [ # DO NOT NAME THESE PNGs JUST NUMBERS! apparently we assign images by number
416
  ('./1o.png', 'describe the scene: omens in the suburbs'),
417
  ('./2o.png', 'describe the scene: geometric abstract art of a windmill'),
418
  ('./3o.png', 'describe the scene: memento mori'),
 
423
  ('./8o.png', '8 '),
424
  ('./9o.png', '9 '),
425
  ('./10o.png', '10 '),
426
+ ]
427
+ m_calibrate = [('image_init/'+c[0], c[1]) for c in m_calibrate]
428
+ for im, txt in m_calibrate:
429
  tmp_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'text', 'gemb'])
430
  tmp_df['paths'] = [im]
431
  image = Image.open(im).convert('RGB')
config.py ADDED
@@ -0,0 +1,16 @@
+ import torch
+
+ # NOTE model path name changed
+ model_path = './last_epoch_ckpt/'
+ lr = 1e-5
+ device = 'cuda'
+ dtype = torch.bfloat16
+ data_path = '../data/lke_2017'
+ save_path = './'
+ epochs = 4
+ batch_size = 16
+ number_k_clip_embed = 16 # divide by this to determine bundling together of sequences -> CLIP
+ num_workers = 32
+ seed = 107
+
+ # TODO config option to swap to diffusion?
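These values are read as plain module attributes; app.py in this commit, for example, loads the finetuned checkpoint with them (shown here with `config.device`/`config.dtype` in place of app.py's hard-coded 'cuda' and bfloat16):

```python
import config
from model import get_model_and_tokenizer

# returns the Zoo wrapper and its CLIP image encoder, loading the prior from model_path
model, image_encoder = get_model_and_tokenizer(config.model_path, config.device, config.dtype)
```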
data.py ADDED
@@ -0,0 +1,147 @@
1
+ import torch
2
+ from PIL import Image
3
+ import random
4
+ import logging
5
+ import torchvision
6
+
7
+ import torchvision.transforms as T
8
+ from torchvision.transforms.functional import InterpolationMode
9
+
10
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
11
+ IMAGENET_STD = (0.229, 0.224, 0.225)
12
+
13
+ def build_transform(input_size):
14
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
15
+ transform = T.Compose([
16
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
17
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
18
+ T.ToTensor(),
19
+ T.Normalize(mean=MEAN, std=STD)
20
+ ])
21
+ return transform
22
+
23
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
24
+ best_ratio_diff = float('inf')
25
+ best_ratio = (1, 1)
26
+ area = width * height
27
+ for ratio in target_ratios:
28
+ target_aspect_ratio = ratio[0] / ratio[1]
29
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
30
+ if ratio_diff < best_ratio_diff:
31
+ best_ratio_diff = ratio_diff
32
+ best_ratio = ratio
33
+ elif ratio_diff == best_ratio_diff:
34
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
35
+ best_ratio = ratio
36
+ return best_ratio
37
+
38
+ def dynamic_preprocess(image, min_num=1, max_num=8, image_size=448, use_thumbnail=False):
39
+ orig_width, orig_height = image.size
40
+ aspect_ratio = orig_width / orig_height
41
+
42
+ # calculate the existing image aspect ratio
43
+ target_ratios = set(
44
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
45
+ i * j <= max_num and i * j >= min_num)
46
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
47
+
48
+ # find the closest aspect ratio to the target
49
+ target_aspect_ratio = find_closest_aspect_ratio(
50
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size)
51
+
52
+ # calculate the target width and height
53
+ target_width = image_size * target_aspect_ratio[0]
54
+ target_height = image_size * target_aspect_ratio[1]
55
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
56
+
57
+ # resize the image
58
+ resized_img = image.resize((target_width, target_height))
59
+ processed_images = []
60
+ for i in range(blocks):
61
+ box = (
62
+ (i % (target_width // image_size)) * image_size,
63
+ (i // (target_width // image_size)) * image_size,
64
+ ((i % (target_width // image_size)) + 1) * image_size,
65
+ ((i // (target_width // image_size)) + 1) * image_size
66
+ )
67
+ # split the image
68
+ split_img = resized_img.crop(box)
69
+ processed_images.append(split_img)
70
+ assert len(processed_images) == blocks
71
+ if use_thumbnail and len(processed_images) != 1:
72
+ thumbnail_img = image.resize((image_size, image_size))
73
+ processed_images.append(thumbnail_img)
74
+ return processed_images
75
+
76
+
77
+ def load_image(image_file, pil_image=None, input_size=224,):
78
+ if not pil_image:
79
+ pil_image = Image.open(image_file)
80
+ image = pil_image.convert('RGB')
81
+ transform = build_transform(input_size=input_size)
82
+ # images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
83
+ pixel_values = [transform(image) for image in [image]]
84
+ pixel_values = torch.stack(pixel_values)
85
+ return pixel_values
86
+
87
+ def my_collate(batch):
88
+ try:
89
+ targets = torch.stack([s['target'] for s in batch])
90
+ samples = torch.stack([s['samples'] for s in batch])
91
+
92
+ # targets = torch.stack([s['target'] for s in batch if s is not None])
93
+ # samples = torch.stack([s['samples'] for s in batch if s is not None])
94
+ except Exception as e:
95
+ logging.warning('my_collate issue %s', e)
96
+ return None
97
+ return samples, targets
98
+
99
+
100
+ class ImageFolderSample(torchvision.datasets.ImageFolder):
101
+ def __init__(self, data_path, k, processor):
102
+ super().__init__(data_path)
103
+ self.k = k
104
+ self.processor = processor
105
+
106
+ def safe_getitem(self, index):
107
+ try:
108
+ target_path, class_type = self.samples[index]
109
+ target = torch.from_numpy(self.processor(self.loader(target_path)).data['pixel_values'][0])
110
+
111
+ input_paths = random.choices([p[0] for p in self.samples if p != target_path and class_type in p], k=self.k)
112
+ assert len(input_paths) == self.k # I think it may do this by default...
113
+ samples = torch.stack([torch.from_numpy(self.processor(self.loader(i)).data['pixel_values'][0]) for i in input_paths])
114
+ except Exception as e:
115
+ logging.warning('getitem issue %s', e)
116
+ return None  # my_collate will drop this sample
117
+
118
+ drop_mask = torch.rand(samples.shape[0],) < .2
119
+ samples[drop_mask] = 0
120
+
121
+ drop_whole_set_mask = torch.rand(1,) < .1
122
+ if drop_whole_set_mask:
123
+ samples = torch.zeros_like(samples)
124
+ return {'samples': samples[:, :3], 'target': target[:3]}
125
+
126
+ def __getitem__(self, index: int):
127
+ return self.safe_getitem(index)
128
+
129
+
130
+ # https://data.mendeley.com/datasets/fs4k2zc5j5/3
131
+ # Gomez, J. C., Ibarra-Manzano, M. A., & Almanza-Ojeda, D. L. (2017). User Identification in Pinterest Through the Refinement of Cascade Fusion of Text and Images. Research in Computing Science, 144, 41-52.
132
+ def get_dataset(data_path, processor):
133
+ return ImageFolderSample(data_path, 8, processor)
134
+
135
+
136
+ def get_dataloader(data_path, batch_size, num_workers, processor):
137
+ dataloader = torch.utils.data.DataLoader(
138
+ get_dataset(data_path, processor=processor),
139
+ num_workers=num_workers,
140
+ collate_fn=my_collate,
141
+ batch_size=batch_size,
142
+ shuffle=True,
143
+ drop_last=True
144
+ )
145
+ return dataloader
146
+
147
+
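A short note on what the dataset expects on disk: `ImageFolderSample` subclasses torchvision's `ImageFolder`, so `data_path` should hold one subfolder per user/board, and the 8 conditioning images for each target are drawn from the same folder. The sketch below assumes that layout (the folder names are hypothetical) and reuses the `CLIPImageProcessor` exposed by the prior pipeline in model.py.

```python
# Expected on-disk layout (an assumption based on torchvision's ImageFolder):
#
#   ../data/lke_2017/
#       user_0001/ img_a.jpg img_b.jpg ...
#       user_0002/ ...
#
import config
from data import get_dataset
from model import get_model_and_tokenizer

model, _ = get_model_and_tokenizer(None, config.device, config.dtype)
ds = get_dataset(config.data_path, processor=model.prior_pipe.image_processor)
sample = ds[0]   # {'samples': (8, 3, H, W), 'target': (3, H, W)}, or None if loading failed
```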
10o.png → image_init/10o.png RENAMED
File without changes
1o.png → image_init/1o.png RENAMED
File without changes
2o.png → image_init/2o.png RENAMED
File without changes
3o.png → image_init/3o.png RENAMED
File without changes
4o.png → image_init/4o.png RENAMED
File without changes
5o.png → image_init/5o.png RENAMED
File without changes
6o.png → image_init/6o.png RENAMED
File without changes
7o.png → image_init/7o.png RENAMED
File without changes
8o.png → image_init/8o.png RENAMED
File without changes
9o.png → image_init/9o.png RENAMED
File without changes
last_epoch_ckpt/config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "_class_name": "PriorTransformer",
+ "_diffusers_version": "0.34.0.dev0",
+ "_name_or_path": "./last_epoch_ckpt/",
+ "added_emb_type": "prd",
+ "additional_embeddings": 3,
+ "attention_head_dim": 32,
+ "clip_embed_dim": null,
+ "dropout": 0.0,
+ "embedding_dim": 1280,
+ "embedding_proj_dim": null,
+ "embedding_proj_norm_type": null,
+ "encoder_hid_proj_type": "linear",
+ "norm_in_type": null,
+ "num_attention_heads": 16,
+ "num_embeddings": 77,
+ "num_layers": 10
+ }
last_epoch_ckpt/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4caacf8d2ee0d5be682f6d8af30205c6c18092d15edf9f912467e0f2736ef6ae
+ size 136790920
nsfweffnetv2-b02-3epochs.h5 → latest_val.png RENAMED
File without changes
model.py ADDED
@@ -0,0 +1,52 @@
1
+
2
+ import torch
3
+ import logging
4
+ from diffusers import DiffusionPipeline
5
+
6
+ from prior.pipeline_kandinsky_prior import KandinskyPriorPipeline
7
+ from prior.prior_transformer import PriorTransformer
8
+
9
+
10
+ class Zoo(torch.nn.Module):
11
+ def __init__(self, prior, prior_pipe, kandinsky_pipe, ) -> None:
12
+ super().__init__()
13
+ self.prior = prior
14
+ self.prior_pipe = prior_pipe
15
+ self.kandinsky_pipe = kandinsky_pipe
16
+ self.pre_prior_transformer = None
17
+ # NOTE we may get better perf from freezing our prior
18
+ # and only training a transformer adapter?
19
+
20
+ def forward(self, latents, preferred_embeds):
21
+ pred = self.prior(latents, preferred_embeds)
22
+ return pred
23
+
24
+ def do_validation(self, images): # TODO constant val seed
25
+ assert all([len(i) == 8 for i in images]), f'We must have `k` images, not {len(images)}.'
26
+ image_embeds, negative_image_embeds = self.prior_pipe(images).to_tuple()
27
+ images = self.kandinsky_pipe(
28
+ num_inference_steps=50,
29
+ image_embeds=image_embeds,
30
+ negative_image_embeds=negative_image_embeds,
31
+ ).images
32
+ images[0].save('latest_val.png')
33
+ return images
34
+
35
+ def get_model_and_tokenizer(path, device, dtype):
36
+ prior = PriorTransformer.from_pretrained("ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior"
37
+ if path is None else path).to(device)
38
+
39
+ pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", prior=prior).to(device)
40
+ pipe_prior.image_encoder = pipe_prior.image_encoder.to(device, dtype)
41
+ # Note: don't set the prior to `dtype` as it may be half precision,
42
+ # and we're training with mixed precision
43
+ # so we need to keep our full-precision weight for trained params
44
+ kandinsky_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder").to(device, dtype)
45
+ model = Zoo(prior, pipe_prior, kandinsky_pipe).to(device)
46
+
47
+ return model, model.prior_pipe.image_encoder
48
+
49
+ def get_optimizer(params, lr):
50
+ logging.info(f'Training: {params}')
51
+ optimizer = torch.optim.AdamW(params, lr=lr)
52
+ return optimizer
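A hedged usage sketch (not part of the commit): load the finetuned prior and render one validation image from eight preference images. The specific file names are just the calibration images shipped under image_init/; any 8 RGB images would do.

```python
from PIL import Image
import config
from model import get_model_and_tokenizer

model, image_encoder = get_model_and_tokenizer(config.model_path, config.device, config.dtype)

# do_validation expects a batch of samples, each a list of exactly 8 PIL images
liked = [Image.open(f'image_init/{i}o.png').convert('RGB') for i in range(1, 9)]
rendered = model.do_validation([liked])     # also writes latest_val.png as a side effect
rendered[0].save('validation_sample.png')
```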
prior/__init__.py ADDED
File without changes
prior/pipeline_kandinsky_prior.py ADDED
@@ -0,0 +1,528 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional, Union
3
+
4
+ import numpy as np
5
+ import PIL
6
+ import torch
7
+ from transformers import (
8
+ CLIPImageProcessor,
9
+ CLIPTextModelWithProjection,
10
+ CLIPTokenizer,
11
+ CLIPVisionModelWithProjection,
12
+ )
13
+
14
+ from diffusers.models import PriorTransformer
15
+ from diffusers.schedulers import UnCLIPScheduler
16
+ from diffusers.utils import (
17
+ BaseOutput,
18
+ is_accelerate_available,
19
+ is_accelerate_version,
20
+ logging,
21
+ replace_example_docstring,
22
+ )
23
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
24
+
25
+
26
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
27
+
28
+ EXAMPLE_DOC_STRING = """
29
+ Examples:
30
+ ```py
31
+ >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
32
+ >>> import torch
33
+
34
+ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
35
+ >>> pipe_prior.to("cuda")
36
+
37
+ >>> prompt = "red cat, 4k photo"
38
+ >>> out = pipe_prior(prompt)
39
+ >>> image_emb = out.image_embeds
40
+ >>> negative_image_emb = out.negative_image_embeds
41
+
42
+ >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
43
+ >>> pipe.to("cuda")
44
+
45
+ >>> image = pipe(
46
+ ... prompt,
47
+ ... image_embeds=image_emb,
48
+ ... negative_image_embeds=negative_image_emb,
49
+ ... height=768,
50
+ ... width=768,
51
+ ... num_inference_steps=100,
52
+ ... ).images
53
+
54
+ >>> image[0].save("cat.png")
55
+ ```
56
+ """
57
+
58
+ EXAMPLE_INTERPOLATE_DOC_STRING = """
59
+ Examples:
60
+ ```py
61
+ >>> from diffusers import KandinskyPriorPipeline, KandinskyPipeline
62
+ >>> from diffusers.utils import load_image
63
+ >>> import PIL
64
+
65
+ >>> import torch
66
+ >>> from torchvision import transforms
67
+
68
+ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
69
+ ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
70
+ ... )
71
+ >>> pipe_prior.to("cuda")
72
+
73
+ >>> img1 = load_image(
74
+ ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
75
+ ... "/kandinsky/cat.png"
76
+ ... )
77
+
78
+ >>> img2 = load_image(
79
+ ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
80
+ ... "/kandinsky/starry_night.jpeg"
81
+ ... )
82
+
83
+ >>> images_texts = ["a cat", img1, img2]
84
+ >>> weights = [0.3, 0.3, 0.4]
85
+ >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights)
86
+
87
+ >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
88
+ >>> pipe.to("cuda")
89
+
90
+ >>> image = pipe(
91
+ ... "",
92
+ ... image_embeds=image_emb,
93
+ ... negative_image_embeds=zero_image_emb,
94
+ ... height=768,
95
+ ... width=768,
96
+ ... num_inference_steps=150,
97
+ ... ).images[0]
98
+
99
+ >>> image.save("starry_cat.png")
100
+ ```
101
+ """
102
+
103
+
104
+ @dataclass
105
+ class KandinskyPriorPipelineOutput(BaseOutput):
106
+ """
107
+ Output class for KandinskyPriorPipeline.
108
+
109
+ Args:
110
+ image_embeds (`torch.FloatTensor`)
111
+ clip image embeddings for text prompt
112
+ negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`)
113
+ clip image embeddings for unconditional tokens
114
+ """
115
+
116
+ image_embeds: Union[torch.FloatTensor, np.ndarray]
117
+ negative_image_embeds: Union[torch.FloatTensor, np.ndarray]
118
+
119
+
120
+ class KandinskyPriorPipeline(DiffusionPipeline):
121
+ """
122
+ Pipeline for generating image prior for Kandinsky
123
+
124
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
125
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
126
+
127
+ Args:
128
+ prior ([`PriorTransformer`]):
129
+ The canonical unCLIP prior to approximate the image embedding from the text embedding.
130
+ image_encoder ([`CLIPVisionModelWithProjection`]):
131
+ Frozen image-encoder.
132
+ text_encoder ([`CLIPTextModelWithProjection`]):
133
+ Frozen text-encoder.
134
+ tokenizer (`CLIPTokenizer`):
135
+ Tokenizer of class
136
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
137
+ scheduler ([`UnCLIPScheduler`]):
138
+ A scheduler to be used in combination with `prior` to generate image embedding.
139
+ """
140
+
141
+ _exclude_from_cpu_offload = ["prior"]
142
+
143
+ def __init__(
144
+ self,
145
+ prior: PriorTransformer,
146
+ image_encoder: CLIPVisionModelWithProjection,
147
+ text_encoder: CLIPTextModelWithProjection,
148
+ tokenizer: CLIPTokenizer,
149
+ scheduler: UnCLIPScheduler,
150
+ image_processor: CLIPImageProcessor,
151
+ ):
152
+ super().__init__()
153
+
154
+ self.register_modules(
155
+ prior=prior,
156
+ text_encoder=text_encoder,
157
+ tokenizer=tokenizer,
158
+ scheduler=scheduler,
159
+ image_encoder=image_encoder,
160
+ image_processor=image_processor,
161
+ )
162
+
163
+ @torch.no_grad()
164
+ @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
165
+ def interpolate(
166
+ self,
167
+ images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
168
+ weights: List[float],
169
+ num_images_per_prompt: int = 1,
170
+ num_inference_steps: int = 25,
171
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
172
+ latents: Optional[torch.FloatTensor] = None,
173
+ negative_prior_prompt: Optional[str] = None,
174
+ negative_prompt: str = "",
175
+ guidance_scale: float = 4.0,
176
+ device=None,
177
+ ):
178
+ """
179
+ Function invoked when using the prior pipeline for interpolation.
180
+
181
+ Args:
182
+ images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
183
+ list of prompts and images to guide the image generation.
184
+ weights: (`List[float]`):
185
+ list of weights for each condition in `images_and_prompts`
186
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
187
+ The number of images to generate per prompt.
188
+ num_inference_steps (`int`, *optional*, defaults to 25):
189
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
190
+ expense of slower inference.
191
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
192
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
193
+ to make generation deterministic.
194
+ latents (`torch.FloatTensor`, *optional*):
195
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
196
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
197
+ tensor will be generated by sampling using the supplied random `generator`.
198
+ negative_prior_prompt (`str`, *optional*):
199
+ The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if
200
+ `guidance_scale` is less than `1`).
201
+ negative_prompt (`str` or `List[str]`, *optional*):
202
+ The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
203
+ `guidance_scale` is less than `1`).
204
+ guidance_scale (`float`, *optional*, defaults to 4.0):
205
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
206
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
207
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
208
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
209
+ usually at the expense of lower image quality.
210
+
211
+ Examples:
212
+
213
+ Returns:
214
+ [`KandinskyPriorPipelineOutput`] or `tuple`
215
+ """
216
+
217
+ device = device or self.device
218
+
219
+ if len(images_and_prompts) != len(weights):
220
+ raise ValueError(
221
+ f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length"
222
+ )
223
+
224
+ image_embeddings = []
225
+ for cond, weight in zip(images_and_prompts, weights):
226
+ if isinstance(cond, str):
227
+ image_emb = self(
228
+ cond,
229
+ num_inference_steps=num_inference_steps,
230
+ num_images_per_prompt=num_images_per_prompt,
231
+ generator=generator,
232
+ latents=latents,
233
+ negative_prompt=negative_prior_prompt,
234
+ guidance_scale=guidance_scale,
235
+ ).image_embeds
236
+
237
+ elif isinstance(cond, (PIL.Image.Image, torch.Tensor)):
238
+ if isinstance(cond, PIL.Image.Image):
239
+ cond = (
240
+ self.image_processor(cond, return_tensors="pt")
241
+ .pixel_values[0]
242
+ .unsqueeze(0)
243
+ .to(dtype=self.image_encoder.dtype, device=device)
244
+ )
245
+
246
+ image_emb = self.image_encoder(cond)["image_embeds"]
247
+
248
+ else:
249
+ raise ValueError(
250
+ f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}"
251
+ )
252
+
253
+ image_embeddings.append(image_emb * weight)
254
+
255
+ image_emb = torch.cat(image_embeddings).sum(dim=0, keepdim=True)
256
+
257
+ out_zero = self(
258
+ negative_prompt,
259
+ num_inference_steps=num_inference_steps,
260
+ num_images_per_prompt=num_images_per_prompt,
261
+ generator=generator,
262
+ latents=latents,
263
+ negative_prompt=negative_prior_prompt,
264
+ guidance_scale=guidance_scale,
265
+ )
266
+ zero_image_emb = (
267
+ out_zero.negative_image_embeds
268
+ if negative_prompt == ""
269
+ else out_zero.image_embeds
270
+ )
271
+
272
+ return KandinskyPriorPipelineOutput(
273
+ image_embeds=image_emb, negative_image_embeds=zero_image_emb
274
+ )
275
+
276
+ # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
277
+ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
278
+ if latents is None:
279
+ latents = torch.randn(
280
+ shape, generator=generator, device=device, dtype=dtype
281
+ )
282
+ else:
283
+ if latents.shape != shape:
284
+ raise ValueError(
285
+ f"Unexpected latents shape, got {latents.shape}, expected {shape}"
286
+ )
287
+ latents = latents.to(device)
288
+
289
+ latents = latents * scheduler.init_noise_sigma
290
+ return latents
291
+
292
+ def get_zero_embed(self, batch_size=1, device=None):
293
+ device = device or self.device
294
+ zero_img = torch.zeros(
295
+ 1,
296
+ 3,
297
+ self.image_encoder.config.image_size,
298
+ self.image_encoder.config.image_size,
299
+ ).to(device=device, dtype=self.image_encoder.dtype)
300
+ zero_image_emb = self.image_encoder(zero_img)["image_embeds"]
301
+ zero_image_emb = zero_image_emb.repeat(batch_size, 1)
302
+ return zero_image_emb
303
+
304
+ def _encode_prompt(
305
+ self,
306
+ prompt,
307
+ device,
308
+ num_images_per_prompt,
309
+ do_classifier_free_guidance,
310
+ negative_prompt=None,
311
+ ):
312
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
313
+ # get prompt text embeddings
314
+ cond = (
315
+ self.image_processor(prompt, return_tensors="pt")
316
+ .pixel_values[0]
317
+ .unsqueeze(0)
318
+ .to(dtype=self.image_encoder.dtype, device=device)
319
+ )
320
+ prompt_embeds = self.image_encoder(cond)["image_embeds"]
321
+
322
+ prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
323
+
324
+ if do_classifier_free_guidance:
325
+ if negative_prompt is None:
326
+ uncond_tokens = self.get_zero_embed(batch_size=prompt_embeds.shape[0])
327
+ elif type(prompt) is not type(negative_prompt):
328
+ raise TypeError(
329
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
330
+ f" {type(prompt)}."
331
+ )
332
+ elif batch_size != len(negative_prompt):
333
+ raise ValueError(
334
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
335
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
336
+ " the batch size of `prompt`."
337
+ )
338
+ else:
339
+ uncond_tokens = negative_prompt
340
+
341
+ cond = (
342
+ self.image_processor(uncond_tokens, return_tensors="pt")
343
+ .pixel_values[0]
344
+ .unsqueeze(0)
345
+ .to(dtype=self.image_encoder.dtype, device=device)
346
+ )
347
+
348
+ negative_prompt_embeds = self.image_encoder(cond)["image_embeds"]
349
+
350
+ seq_len = negative_prompt_embeds.shape[1]
351
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
352
+ 1, num_images_per_prompt
353
+ )
354
+ negative_prompt_embeds = negative_prompt_embeds.view(
355
+ batch_size * num_images_per_prompt, seq_len
356
+ )
357
+
358
+ # For classifier free guidance, we need to do two forward passes.
359
+ # Here we concatenate the unconditional and text embeddings into a single batch
360
+ # to avoid doing two forward passes
361
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
362
+ return prompt_embeds, None
363
+
364
+ def enable_model_cpu_offload(self, gpu_id=0):
365
+ r"""
366
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
367
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
368
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
369
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
370
+ """
371
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
372
+ from accelerate import cpu_offload_with_hook
373
+ else:
374
+ raise ImportError(
375
+ "`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher."
376
+ )
377
+
378
+ device = torch.device(f"cuda:{gpu_id}")
379
+
380
+ if self.device.type != "cpu":
381
+ self.to("cpu", silence_dtype_warnings=True)
382
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
383
+
384
+ hook = None
385
+ for cpu_offloaded_model in [self.text_encoder, self.prior]:
386
+ _, hook = cpu_offload_with_hook(
387
+ cpu_offloaded_model, device, prev_module_hook=hook
388
+ )
389
+
390
+ # We'll offload the last model manually.
391
+ self.prior_hook = hook
392
+
393
+ _, hook = cpu_offload_with_hook(
394
+ self.image_encoder, device, prev_module_hook=self.prior_hook
395
+ )
396
+
397
+ self.final_offload_hook = hook
398
+
399
+ @torch.no_grad()
400
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
401
+ def __call__(
402
+ self,
403
+ prompt: Union[str, List[str]],
404
+ negative_prompt: Optional[Union[str, List[str]]] = None,
405
+ num_images_per_prompt: int = 1,
406
+ num_inference_steps: int = 25,
407
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
408
+ latents: Optional[torch.FloatTensor] = None,
409
+ guidance_scale: float = 4.0,
410
+ output_type: Optional[str] = "pt",
411
+ return_dict: bool = True,
412
+ ):
413
+ """
414
+ Function invoked when calling the pipeline for generation.
415
+
416
+ Args:
417
+ prompt (`str` or `List[str]`):
418
+ The prompt or prompts to guide the image generation.
419
+ negative_prompt (`str` or `List[str]`, *optional*):
420
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
421
+ if `guidance_scale` is less than `1`).
422
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
423
+ The number of images to generate per prompt.
424
+ num_inference_steps (`int`, *optional*, defaults to 25):
425
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
426
+ expense of slower inference.
427
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
428
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
429
+ to make generation deterministic.
430
+ latents (`torch.FloatTensor`, *optional*):
431
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
432
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
433
+ tensor will be generated by sampling using the supplied random `generator`.
434
+ guidance_scale (`float`, *optional*, defaults to 4.0):
435
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
436
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
437
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
438
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
439
+ usually at the expense of lower image quality.
440
+ output_type (`str`, *optional*, defaults to `"pt"`):
441
+ The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"`
442
+ (`torch.Tensor`).
443
+ return_dict (`bool`, *optional*, defaults to `True`):
444
+ Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
445
+
446
+ Examples:
447
+
448
+ Returns:
449
+ [`KandinskyPriorPipelineOutput`] or `tuple`
450
+ """
451
+
452
+ # if the negative prompt is defined we double the batch size to
453
+ # directly retrieve the negative prompt embedding
454
+ if negative_prompt is not None:
455
+ prompt = prompt + negative_prompt
456
+ negative_prompt = 2 * negative_prompt
457
+
458
+ device = self._execution_device
459
+
460
+ batch_size = len(prompt)
461
+ batch_size = batch_size * num_images_per_prompt
462
+
463
+ full_prompt = []
464
+ for b in prompt: # TODO of course vectorize this lol
465
+ full_seq = []
466
+ for p in b:
467
+ prompt_embeds, text_mask = self._encode_prompt(
468
+ p, device, num_images_per_prompt, False, negative_prompt
469
+ )
470
+ full_seq.append(prompt_embeds)
471
+ prompt_embeds = torch.cat(full_seq, 0)
472
+ full_prompt.append(prompt_embeds)
473
+ prompt_embeds = torch.stack(full_prompt)
474
+ if prompt_embeds.shape[1] < 8: # TODO grab as `k` arg from config
475
+ prompt_embeds = torch.nn.functional.pad(prompt_embeds, [0, 0, 0, 8-prompt_embeds.shape[1]])
476
+ assert prompt_embeds.shape[1] == 8, f"The model is set to take `k` cond image embeds but got shape {prompt_embeds.shape}"
477
+
478
+ prompt_embeds = prompt_embeds.to('cuda') # TODO set with `k` arg from config
479
+
480
+ hidden_states = torch.randn(
481
+ (batch_size, prompt_embeds.shape[-1]),
482
+ device=prompt_embeds.device,
483
+ dtype=prompt_embeds.dtype,
484
+ generator=generator,
485
+ )
486
+
487
+ latents = self.prior(
488
+ hidden_states,
489
+ proj_embedding=prompt_embeds,
490
+ encoder_hidden_states=prompt_embeds,
491
+ attention_mask=text_mask,
492
+ ).predicted_image_embedding
493
+
494
+ image_embeddings = latents
495
+
496
+ # if a negative prompt has been defined, we split the image embedding into two
497
+ if negative_prompt is None:
498
+ zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
499
+
500
+ if (
501
+ hasattr(self, "final_offload_hook")
502
+ and self.final_offload_hook is not None
503
+ ):
504
+ self.final_offload_hook.offload()
505
+ else:
506
+ image_embeddings, zero_embeds = image_embeddings.chunk(2)
507
+
508
+ if (
509
+ hasattr(self, "final_offload_hook")
510
+ and self.final_offload_hook is not None
511
+ ):
512
+ self.prior_hook.offload()
513
+
514
+ if output_type not in ["pt", "np"]:
515
+ raise ValueError(
516
+ f"Only the output types `pt` and `np` are supported not output_type={output_type}"
517
+ )
518
+
519
+ if output_type == "np":
520
+ image_embeddings = image_embeddings.cpu().numpy()
521
+ zero_embeds = zero_embeds.cpu().numpy()
522
+
523
+ if not return_dict:
524
+ return (image_embeddings, zero_embeds)
525
+
526
+ return KandinskyPriorPipelineOutput(
527
+ image_embeds=image_embeddings, negative_image_embeds=zero_embeds
528
+ )
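Compared to the upstream Kandinsky prior pipeline, `__call__` here treats `prompt` as a batch of image lists rather than text: each image is CLIP-encoded, the conditioning set is zero-padded to `k = 8` embeddings, and the prior runs in a single forward pass instead of a denoising loop. A hedged end-to-end sketch, mirroring `model.py`'s `do_validation` (dtypes and devices are simplified; the image paths are just the repo's calibration images):

```python
from PIL import Image
from diffusers import DiffusionPipeline
from prior.prior_transformer import PriorTransformer
from prior.pipeline_kandinsky_prior import KandinskyPriorPipeline

prior = PriorTransformer.from_pretrained("./last_epoch_ckpt/").to("cuda")
pipe_prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", prior=prior).to("cuda")
decoder = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder").to("cuda")

preferred = [Image.open(f"image_init/{i}o.png").convert("RGB") for i in range(1, 9)]
image_embeds, negative_image_embeds = pipe_prior([preferred]).to_tuple()
image = decoder(image_embeds=image_embeds,
                negative_image_embeds=negative_image_embeds,
                num_inference_steps=50).images[0]
image.save("preview.png")
```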
prior/prior_transformer.py ADDED
@@ -0,0 +1,369 @@
1
+ import sys
2
+ sys.path.append("..")
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Dict, Optional, Union
6
+
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch import nn
11
+
12
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
13
+ from diffusers.utils import BaseOutput
14
+ from diffusers.models.attention import BasicTransformerBlock
15
+ from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
16
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
17
+ from diffusers.models.modeling_utils import ModelMixin
18
+
19
+
20
+ @dataclass
21
+ class PriorTransformerOutput(BaseOutput):
22
+ """
23
+ The output of [`PriorTransformer`].
24
+
25
+ Args:
26
+ predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
27
+ The predicted CLIP image embedding conditioned on the CLIP text embedding input.
28
+ """
29
+
30
+ predicted_image_embedding: torch.FloatTensor
31
+
32
+
33
+ class PriorTransformer(ModelMixin, ConfigMixin):
34
+ """
35
+ A Prior Transformer model.
36
+
37
+ Parameters:
38
+ num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention.
39
+ attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
40
+ num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use.
41
+ embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states`
42
+ num_embeddings (`int`, *optional*, defaults to 77):
43
+ The number of embeddings of the model input `hidden_states`
44
+ additional_embeddings (`int`, *optional*, defaults to 3): The number of additional tokens appended to the
45
+ projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings +
46
+ additional_embeddings`.
47
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
48
+ time_embed_act_fn (`str`, *optional*, defaults to 'silu'):
49
+ The activation function to use to create timestep embeddings.
50
+ norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before
51
+ passing to Transformer blocks. Set it to `None` if normalization is not needed.
52
+ embedding_proj_norm_type (`str`, *optional*, defaults to None):
53
+ The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not
54
+ needed.
55
+ encoder_hid_proj_type (`str`, *optional*, defaults to `linear`):
56
+ The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if
57
+ `encoder_hidden_states` is `None`.
58
+ added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model.
59
+ Choose from `prd` or `None`. If `prd` is chosen, it will prepend a token indicating the (quantized) dot
60
+ product between the text embedding and image embedding as proposed in the unclip paper
61
+ https://arxiv.org/abs/2204.06125 If it is `None`, no additional embeddings will be prepended.
62
+ time_embed_dim (`int`, *optional*, defaults to None): The dimension of timestep embeddings.
63
+ If None, will be set to `num_attention_heads * attention_head_dim`
64
+ embedding_proj_dim (`int`, *optional*, defaults to None):
65
+ The dimension of `proj_embedding`. If None, will be set to `embedding_dim`.
66
+ clip_embed_dim (`int`, *optional*, defaults to None):
67
+ The dimension of the output. If None, will be set to `embedding_dim`.
68
+ """
69
+
70
+ @register_to_config
71
+ def __init__(
72
+ self,
73
+ num_attention_heads: int = 32,
74
+ attention_head_dim: int = 64,
75
+ num_layers: int = 20,
76
+ embedding_dim: int = 768,
77
+ num_embeddings=77,
78
+ additional_embeddings=3, # as we have removed the time embedding
79
+ dropout: float = 0.0,
80
+ # time_embed_act_fn: str = "silu",
81
+ norm_in_type: Optional[str] = None, # layer
82
+ embedding_proj_norm_type: Optional[str] = None, # layer
83
+ encoder_hid_proj_type: Optional[str] = "linear", # linear
84
+ added_emb_type: Optional[str] = "prd", # prd
85
+ # time_embed_dim: Optional[int] = None,
86
+ embedding_proj_dim: Optional[int] = None,
87
+ clip_embed_dim: Optional[int] = None,
88
+ ):
89
+ super().__init__()
90
+ self.num_attention_heads = num_attention_heads
91
+ self.attention_head_dim = attention_head_dim
92
+ inner_dim = num_attention_heads * attention_head_dim
93
+ self.additional_embeddings = additional_embeddings
94
+
95
+ # time_embed_dim = time_embed_dim or inner_dim
96
+ embedding_proj_dim = embedding_proj_dim or embedding_dim
97
+ clip_embed_dim = clip_embed_dim or embedding_dim
98
+
99
+ # self.time_proj = Timesteps(inner_dim, True, 0)
100
+ # self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn)
101
+
102
+ self.proj_in = nn.Linear(embedding_dim, inner_dim)
103
+
104
+ if embedding_proj_norm_type is None:
105
+ self.embedding_proj_norm = None
106
+ elif embedding_proj_norm_type == "layer":
107
+ self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim)
108
+ else:
109
+ raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}")
110
+
111
+ self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim)
112
+
113
+ if encoder_hid_proj_type is None:
114
+ self.encoder_hidden_states_proj = None
115
+ elif encoder_hid_proj_type == "linear":
116
+ self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim)
117
+ else:
118
+ raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}")
119
+
120
+ self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim))
121
+
122
+ if added_emb_type == "prd":
123
+ self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim))
124
+ elif added_emb_type is None:
125
+ self.prd_embedding = None
126
+ else:
127
+ raise ValueError(
128
+ f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`."
129
+ )
130
+
131
+ self.transformer_blocks = nn.ModuleList(
132
+ [
133
+ BasicTransformerBlock(
134
+ inner_dim,
135
+ num_attention_heads,
136
+ attention_head_dim,
137
+ dropout=dropout,
138
+ activation_fn="gelu",
139
+ attention_bias=True,
140
+ )
141
+ for d in range(num_layers)
142
+ ]
143
+ )
144
+
145
+ if norm_in_type == "layer":
146
+ self.norm_in = nn.LayerNorm(inner_dim)
147
+ elif norm_in_type is None:
148
+ self.norm_in = None
149
+ else:
150
+ raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.")
151
+
152
+ self.norm_out = nn.LayerNorm(inner_dim)
153
+
154
+ self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim)
155
+
156
+ causal_attention_mask = torch.full(
157
+ [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0
158
+ )
159
+ causal_attention_mask.triu_(1)
160
+ causal_attention_mask = causal_attention_mask[None, ...]
161
+ self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False)
162
+
163
+ self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim))
164
+ self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim))
165
+
166
+ @property
167
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
168
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
169
+ r"""
170
+ Returns:
171
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
172
+ indexed by its weight name.
173
+ """
174
+ # set recursively
175
+ processors = {}
176
+
177
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
178
+ if hasattr(module, "set_processor"):
179
+ processors[f"{name}.processor"] = module.processor
180
+
181
+ for sub_name, child in module.named_children():
182
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
183
+
184
+ return processors
185
+
186
+ for name, module in self.named_children():
187
+ fn_recursive_add_processors(name, module, processors)
188
+
189
+ return processors
190
+
191
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
192
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
193
+ r"""
194
+ Sets the attention processor to use to compute attention.
195
+
196
+ Parameters:
197
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
198
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
199
+ for **all** `Attention` layers.
200
+
201
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
202
+ processor. This is strongly recommended when setting trainable attention processors.
203
+
204
+ """
205
+ count = len(self.attn_processors.keys())
206
+
207
+ if isinstance(processor, dict) and len(processor) != count:
208
+ raise ValueError(
209
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
210
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
211
+ )
212
+
213
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
214
+ if hasattr(module, "set_processor"):
215
+ if not isinstance(processor, dict):
216
+ module.set_processor(processor)
217
+ else:
218
+ module.set_processor(processor.pop(f"{name}.processor"))
219
+
220
+ for sub_name, child in module.named_children():
221
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
222
+
223
+ for name, module in self.named_children():
224
+ fn_recursive_attn_processor(name, module, processor)
225
+
226
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
227
+ def set_default_attn_processor(self):
228
+ """
229
+ Disables custom attention processors and sets the default attention implementation.
230
+ """
231
+ self.set_attn_processor(AttnProcessor())
232
+
233
+ def forward(
234
+ self,
235
+ hidden_states,
236
+ # timestep: Union[torch.Tensor, float, int],
237
+ proj_embedding: torch.FloatTensor,
238
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
239
+ attention_mask: Optional[torch.BoolTensor] = None,
240
+ return_dict: bool = True,
241
+ ):
242
+ """
243
+ The [`PriorTransformer`] forward method.
244
+
245
+ Args:
246
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
247
+ The currently predicted image embeddings.
248
+ timestep (`torch.LongTensor`):
249
+ Current denoising step.
250
+ proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
251
+ Projected embedding vector the denoising process is conditioned on.
252
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_embeddings, embedding_dim)`):
253
+ Hidden states of the text embeddings the denoising process is conditioned on.
254
+ attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`):
255
+ Text mask for the text embeddings.
256
+ return_dict (`bool`, *optional*, defaults to `True`):
257
+ Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain
258
+ tuple.
259
+
260
+ Returns:
261
+ [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`:
262
+ If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a
263
+ tuple is returned where the first element is the sample tensor.
264
+ """
265
+ batch_size = hidden_states.shape[0]
266
+
267
+ # timesteps = timestep
268
+ # if not torch.is_tensor(timesteps):
269
+ # timesteps = torch.tensor([timesteps], dtype=torch.long, device=hidden_states.device)
270
+ # elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
271
+ # timesteps = timesteps[None].to(hidden_states.device)
272
+
273
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
274
+ # timesteps = timesteps * torch.ones(batch_size, dtype=timesteps.dtype, device=timesteps.device)
275
+
276
+ # timesteps_projected = self.time_proj(timesteps)
277
+
278
+ # timesteps does not contain any weights and will always return f32 tensors
279
+ # but time_embedding might be fp16, so we need to cast here.
280
+ # timesteps_projected = timesteps_projected.to(dtype=self.dtype)
281
+ # time_embeddings = self.time_embedding(timesteps_projected)
282
+
283
+ if self.embedding_proj_norm is not None:
284
+ proj_embedding = self.embedding_proj_norm(proj_embedding)
285
+
286
+ proj_embeddings = self.embedding_proj(proj_embedding)
287
+ if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None:
288
+ encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
289
+ # elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None:
290
+ # raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set")
291
+
292
+ hidden_states = self.proj_in(hidden_states)
293
+
294
+ # TODO this really also ought to derive from config's `k`
295
+ positional_embeddings = self.positional_embedding.to(hidden_states.dtype)
296
+
297
+ additional_embeds = []
298
+ additional_embeddings_len = 0
299
+
300
+ if encoder_hidden_states is not None:
301
+ additional_embeds.append(encoder_hidden_states)
302
+ additional_embeddings_len += encoder_hidden_states.shape[1]
303
+
304
+ if len(proj_embeddings.shape) == 2:
305
+ proj_embeddings = proj_embeddings[:, None, :]
306
+
307
+ if len(hidden_states.shape) == 2:
308
+ hidden_states = hidden_states[:, None, :]
309
+
310
+ additional_embeds = additional_embeds + [
311
+ proj_embeddings,
312
+ # time_embeddings[:, None, :],
313
+ hidden_states,
314
+ ]
315
+
316
+ if self.prd_embedding is not None:
317
+ prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1)
318
+ additional_embeds.append(prd_embedding)
319
+
320
+ hidden_states = torch.cat(
321
+ additional_embeds,
322
+ dim=1,
323
+ )
324
+
325
+ # Allow positional_embedding to not include the `additional_embeddings` and instead pad it with zeros for these additional tokens
326
+ additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1
327
+ if positional_embeddings.shape[1] < hidden_states.shape[1]:
328
+ positional_embeddings = F.pad(
329
+ positional_embeddings,
330
+ (
331
+ 0,
332
+ 0,
333
+ additional_embeddings_len,
334
+ self.prd_embedding.shape[1] if self.prd_embedding is not None else 0,
335
+ ),
336
+ value=0.0,
337
+ )
338
+
339
+ hidden_states = hidden_states + positional_embeddings[:, :hidden_states.shape[1]]
340
+
341
+ if attention_mask is not None:
342
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
343
+ attention_mask = F.pad(attention_mask, (0, self.additional_embeddings), value=0.0)
344
+ attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype)
345
+ attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0)
346
+
347
+ if self.norm_in is not None:
348
+ hidden_states = self.norm_in(hidden_states)
349
+
350
+ for block in self.transformer_blocks:
351
+ hidden_states = block(hidden_states, attention_mask=attention_mask)
352
+
353
+ hidden_states = self.norm_out(hidden_states)
354
+
355
+ if self.prd_embedding is not None:
356
+ hidden_states = hidden_states[:, -1]
357
+ else:
358
+ hidden_states = hidden_states[:, additional_embeddings_len:]
359
+
360
+ predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states)
361
+
362
+ if not return_dict:
363
+ return (predicted_image_embedding,)
364
+
365
+ return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding)
366
+
367
+ def post_process_latents(self, prior_latents):
368
+ prior_latents = (prior_latents * self.clip_std) + self.clip_mean
369
+ return prior_latents
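
A minimal smoke test of the timestep-free prior above, assuming `prior/` is importable as a package; the tiny hyper-parameters below are arbitrary and only meant for a shape check, not the real ECLIPSE-sized config:

```python
import torch
from prior.prior_transformer import PriorTransformer

# deliberately tiny configuration: inner_dim = 2 * 16 = 32
model = PriorTransformer(num_attention_heads=2, attention_head_dim=16,
                         num_layers=2, embedding_dim=64, num_embeddings=8)

batch = 2
latent = torch.randn(batch, 64)   # hidden_states: the random seed embedding
cond = torch.randn(batch, 8, 64)  # k preferred-image embeddings per user

out = model(latent, proj_embedding=cond, encoder_hidden_states=cond)
print(out.predicted_image_embedding.shape)  # torch.Size([2, 64])
```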
safety_checker_improved.py DELETED
@@ -1,46 +0,0 @@
1
-
2
- # TODO required tensorflow==2.14 for me
3
- # weights from https://github.com/LAION-AI/safety-pipeline/tree/main
4
- from PIL import Image
5
- import tensorflow_hub as hub
6
- import tensorflow
7
- import numpy as np
8
- import sys
9
- sys.path.append('/home/ryn_mote/Misc/generative_recommender/gradio_video/automl/efficientnetv2/')
10
- import tensorflow as tf
11
- from tensorflow.keras import mixed_precision
12
-
13
- physical_devices = tf.config.list_physical_devices('GPU')
14
- if len(physical_devices) > 0:
15
- tf.config.experimental.set_memory_growth(
16
- physical_devices[0], True
17
- )
18
-
19
- model = tf.keras.models.load_model('nsfweffnetv2-b02-3epochs.h5',custom_objects={"KerasLayer":hub.KerasLayer})
20
- # "The image classifier had been trained on 682550 images from the 5 classes "Drawing" (39026), "Hentai" (28134), "Neutral" (369507), "Porn" (207969) & "Sexy" (37914).
21
- # ... we created a manually inspected test set that consists of 4900 samples, that contains images & their captions."
22
-
23
- # Run prediction
24
- def maybe_nsfw(pil_image):
25
- # Run prediction
26
- imm = tensorflow.image.resize(np.array(pil_image)[:, :, :3], (260, 260))
27
- imm = (imm / 255)
28
- pred = model(tensorflow.expand_dims(imm, 0)).numpy()
29
- probs = tensorflow.math.softmax(pred[0]).numpy()
30
- print(probs)
31
- if all([i < .3 for i in probs[[1, 3, 4]]]):
32
- return False
33
- return True
34
-
35
- # pre-initializing prediction
36
- maybe_nsfw(Image. new("RGB", (260, 260), 255))
37
- model.load_weights('nsfweffnetv2-b02-3epochs.h5', by_name=True, )
38
-
39
-
40
-
41
-
42
-
43
-
44
-
45
-
46
- 
train.py ADDED
@@ -0,0 +1,94 @@
1
+
2
+
3
+ ########################################
4
+ # python -m train
5
+ ###########################################
6
+
7
+
8
+ import torch
9
+ import logging
10
+ import numpy as np
11
+ from tqdm import tqdm
12
+ from PIL import Image
13
+
14
+ from data import get_dataloader
15
+ from model import get_model_and_tokenizer, get_optimizer
16
+ import config
17
+
18
+ logging.basicConfig(level=logging.INFO)
19
+
20
+ def get_loss(model, input, target, tokenizer):
21
+ with torch.no_grad():
22
+ assert len(input.shape) == 5 # [batch, s, c, w, h]
23
+ cuts = config.number_k_clip_embed
24
+ assert input.shape[0] * input.shape[1] % cuts == 0, 'batch size * `k` preferred embeds must be divisible by cuts'
25
+ input = input.view(cuts//8, -1, 3, target.shape[-2], target.shape[-1])
26
+ full_seq = []
27
+ for b in input:
28
+ input = tokenizer(b)['image_embeds'] # in our case, tokenizer is a clip embedding model
29
+ full_seq.append(input)
30
+ input = torch.stack(full_seq)
31
+
32
+ target = tokenizer(target)['image_embeds']
33
+
34
+ input = input.view(target.shape[0], -1, target.shape[-1])
35
+ assert len(input.shape) == 3 # [batch, sequence, inner]
36
+
37
+ with torch.cuda.amp.autocast(enabled=False):
38
+ input = input.to(torch.float32)
39
+ latent = torch.randn(input.shape[0], input.shape[-1], device=input.device)
40
+ output = model(latent, input).predicted_image_embedding
41
+
42
+ target = target.to(torch.float32)
43
+ mse_loss = torch.nn.functional.mse_loss(target, output).mean()
44
+
45
+ assert len(target.shape) == 2 and len(output.shape) == 2
46
+ cosine_loss = 1 - torch.nn.functional.cosine_similarity(output, target).mean()
47
+ loss = mse_loss + .2 * cosine_loss
48
+
49
+ logging.info(f'MSE: {mse_loss.item()}, Cosine: {cosine_loss.item()}, Weighted Total: {loss.item()}')
50
+ # TODO wandb
51
+
52
+ return loss
53
+
54
+ def main():
55
+ np.random.seed(config.seed)
56
+ torch.manual_seed(config.seed)
57
+
58
+ model, tokenizer = get_model_and_tokenizer(config.model_path, config.device, config.dtype)
59
+ optimizer = get_optimizer(list(model.prior.parameters()), config.lr)
60
+ dataloader = get_dataloader(config.data_path, config.batch_size, config.num_workers,
61
+ model.prior_pipe.image_processor)
62
+
63
+ for epoch in range(config.epochs):
64
+ for ind, batch in tqdm(enumerate(iter(dataloader))):
65
+ if batch is None:
66
+ continue
67
+
68
+ input, target = batch
69
+ input = input.to(config.device)
70
+ target = target.to(config.device)
71
+
72
+ if ind % 50 == 0:
73
+ with torch.cuda.amp.autocast(enabled=True, dtype=config.dtype): # NOTE using autocast because the training model doubles as the validation model, so we don't want to cast it fully to half precision.
74
+ examples = ['../generative_recommender/Blue_Tigers_space/1o.png',
75
+ '../generative_recommender/Blue_Tigers_space/2o.png',
76
+ '../generative_recommender/Blue_Tigers_space/3o.png',
77
+ '../generative_recommender/Blue_Tigers_space/4o.png',
78
+ '../generative_recommender/Blue_Tigers_space/5o.png',
79
+ '../generative_recommender/Blue_Tigers_space/6o.png',
80
+ '../generative_recommender/Blue_Tigers_space/7o.png',
81
+ '../generative_recommender/Blue_Tigers_space/8o.png',]
82
+ model.do_validation([[Image.open('../'+j) for j in examples]])
83
+
84
+ loss = get_loss(model, input, target, tokenizer)
85
+ loss.backward()
86
+ optimizer.step()
87
+ optimizer.zero_grad()
88
+
89
+ if ind % 100 == 0:
90
+ # TODO add loading from path
91
+ model.prior.save_pretrained(f'{config.save_path}/last_epoch_ckpt', from_pt=True)
92
+
93
+ if __name__ == '__main__':
94
+ main()
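
The objective in `get_loss` boils down to MSE plus a 0.2-weighted cosine distance between the predicted embedding and the held-out CLIP image embedding; a standalone sketch of just that arithmetic on dummy tensors:

```python
import torch
import torch.nn.functional as F

pred = torch.randn(4, 1280)    # predicted image embeddings (width is illustrative)
target = torch.randn(4, 1280)  # held-out preferred-image embeddings

mse = F.mse_loss(target, pred)
cosine = 1 - F.cosine_similarity(pred, target).mean()
loss = mse + 0.2 * cosine      # same weighting as in train.py above
```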
train_requirements.txt ADDED
@@ -0,0 +1,642 @@
1
+ absl-py==1.4.0
2
+ accelerate==0.26.1
3
+ addict==2.4.0
4
+ aeiou==0.0.20
5
+ aenum==3.1.15
6
+ aiobotocore==2.13.0
7
+ aiofiles==23.1.0
8
+ aiohttp==3.9.5
9
+ aioitertools==0.11.0
10
+ aiosignal==1.3.1
11
+ alias-free-torch==0.0.6
12
+ aliyun-python-sdk-core==2.15.1
13
+ aliyun-python-sdk-kms==2.16.3
14
+ altair==4.2.2
15
+ anaconda-anon-usage @ file:///croot/anaconda-anon-usage_1710965072196/work
16
+ anaconda-client==1.11.2
17
+ anaconda-cloud-auth @ file:///croot/anaconda-cloud-auth_1712794769769/work
18
+ anaconda-navigator @ file:///croot/anaconda-navigator_1712087978399/work
19
+ anaconda-project @ file:///opt/conda/conda-bld/anaconda-project_1660339890420/work
20
+ annotated-types @ file:///croot/annotated-types_1709542908624/work
21
+ antlr4-python3-runtime==4.9.3
22
+ anyio==4.3.0
23
+ appdirs==1.4.4
24
+ apptools==5.2.1
25
+ APScheduler==3.10.4
26
+ argbind==0.3.9
27
+ argcomplete==3.1.1
28
+ asgiref==3.7.2
29
+ asttokens==2.2.1
30
+ astunparse==1.6.3
31
+ async-timeout==4.0.2
32
+ atproto==0.0.10
33
+ attrs==25.1.0
34
+ audioread==3.0.1
35
+ auraloss==0.4.0
36
+ av==10.0.0
37
+ awscli==1.33.2
38
+ backcall==0.2.0
39
+ backports.functools-lru-cache @ file:///tmp/build/80754af9/backports.functools_lru_cache_1618170165463/work
40
+ backports.tempfile @ file:///home/linux1/recipes/ci/backports.tempfile_1610991236607/work
41
+ backports.weakref==1.0.post1
42
+ bases==0.2.1
43
+ basicsr==1.4.2
44
+ beautifulsoup4==4.12.2
45
+ bitsandbytes==0.43.1
46
+ black==24.10.0
47
+ bleach==6.1.0
48
+ blendmodes==2022
49
+ blinker==1.6.2
50
+ blis==0.7.9
51
+ blobfile==2.1.1
52
+ blosc2==2.5.1
53
+ bokeh==3.4.1
54
+ boltons==23.0.0
55
+ boto==2.49.0
56
+ boto3==1.34.120
57
+ botocore==1.34.120
58
+ Bottleneck @ file:///croot/bottleneck_1707864210935/work
59
+ braceexpand==0.1.7
60
+ Brotli @ file:///tmp/abs_ecyw11_7ze/croots/recipe/brotli-split_1659616059936/work
61
+ brotlipy==0.7.0
62
+ cached-property==1.5.2
63
+ cachetools==5.3.3
64
+ Cartopy==0.21.1
65
+ catalogue==2.0.8
66
+ certifi==2025.1.31
67
+ cffi==1.15.1
68
+ cfgv==3.3.1
69
+ chardet @ file:///home/builder/ci_310/chardet_1640804867535/work
70
+ charset-normalizer==3.1.0
71
+ chex==0.1.81
72
+ clean-fid==0.1.35
73
+ click==8.1.3
74
+ clip @ git+https://github.com/openai/CLIP.git@a9b1bf5920416aaeaec965c25dd9e8f98c864f16
75
+ clip-anytorch==2.6.0
76
+ cloudpickle==2.2.1
77
+ clyent==1.2.2
78
+ cmake==3.26.4
79
+ colorama==0.4.6
80
+ colorcet==3.1.0
81
+ colored==2.2.4
82
+ coloredlogs==15.0.1
83
+ comm==0.1.4
84
+ commonmark==0.9.1
85
+ comtypes==1.2.0
86
+ conda @ file:///croot/conda_1696257509808/work
87
+ conda-build @ file:///croot/conda-build_1701720841368/work
88
+ conda-content-trust @ file:///tmp/abs_5952f1c8-355c-4855-ad2e-538535021ba5h26t22e5/croots/recipe/conda-content-trust_1658126371814/work
89
+ conda-libmamba-solver @ file:///croot/conda-libmamba-solver_1698163451663/work/src
90
+ conda-pack @ file:///tmp/build/80754af9/conda-pack_1611163042455/work
91
+ conda-package-handling @ file:///croot/conda-package-handling_1690999929514/work
92
+ conda-repo-cli @ file:///croot/conda-repo-cli_1709246574569/work
93
+ conda-token @ file:///Users/paulyim/miniconda3/envs/c3i/conda-bld/conda-token_1662660369760/work
94
+ conda-verify==3.4.2
95
+ conda_index @ file:///croot/conda-index_1706633791028/work
96
+ conda_package_streaming @ file:///croot/conda-package-streaming_1690987966409/work
97
+ confection==0.0.4
98
+ configobj==5.0.8
99
+ configparser==7.0.0
100
+ contextlib2==21.6.0
101
+ contexttimer==0.3.3
102
+ contourpy==1.2.1
103
+ cramjam==2.8.3
104
+ crcmod==1.7
105
+ cryptography @ file:///croot/cryptography_1677533068310/work
106
+ cuda-python==12.4.0
107
+ curl_cffi==0.6.4
108
+ cycler==0.11.0
109
+ cymem==2.0.7
110
+ Cython==0.29.35
111
+ dacite==1.8.1
112
+ dag-cbor==0.3.2
113
+ datasets==2.21.0
114
+ dctorch==0.1.2
115
+ -e git+https://github.com/jannerm/ddpo.git@b217eef955a94bf58e4de68caa5ec0a6558c221d#egg=ddpo
116
+ debugpy==1.6.7
117
+ decorator==4.4.2
118
+ decord==0.6.0
119
+ DeepCache==0.1.1
120
+ deepspeed==0.14.2
121
+ defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work
122
+ Deprecated==1.2.14
123
+ deprecation==2.1.0
124
+ descript-audio-codec==1.0.0
125
+ descript-audiotools==0.7.2
126
+ diffusers @ git+https://github.com/huggingface/diffusers.git@06beecafc55cfddeb1b0b8660188de249f74b899
127
+ dill==0.3.6
128
+ disnake==2.9.0
129
+ Django==4.2.2
130
+ django-memcache-status==2.3
131
+ django-pylibmc==0.6.1
132
+ dm-tree==0.1.8
133
+ dnspython==2.6.1
134
+ docker-pycreds==0.4.0
135
+ docstring-parser==0.15
136
+ docutils==0.16
137
+ EasyProcess==1.1
138
+ einops==0.7.0
139
+ einops-exts==0.0.4
140
+ ema-pytorch==0.2.3
141
+ email_validator==2.1.1
142
+ emoji==2.4.0
143
+ encodec==0.1.1
144
+ entrypoints==0.4
145
+ envisage==7.0.3
146
+ etils==1.3.0
147
+ eva-decord==0.6.1
148
+ exceptiongroup==1.1.1
149
+ executing==1.2.0
150
+ facexlib==0.3.0
151
+ fairscale==0.4.4
152
+ fastapi==0.111.0
153
+ fastapi-cli==0.0.4
154
+ fastcore==1.5.44
155
+ fastjsonschema @ file:///opt/conda/conda-bld/python-fastjsonschema_1661371079312/work
156
+ fastparquet==2024.5.0
157
+ ffmpeg==1.4
158
+ ffmpeg-python==0.2.0
159
+ ffmpegio==0.8.3
160
+ ffmpegio-core==0.8.3
161
+ ffmpy==0.3.0
162
+ filelock @ file:///croot/filelock_1700591183607/work
163
+ filterpy==1.4.5
164
+ fire==0.6.0
165
+ flash-attn==2.5.9.post1
166
+ Flask==2.3.2
167
+ flatbuffers==23.5.26
168
+ flatten-dict==0.4.2
169
+ flax==0.6.9
170
+ flow-vis==0.1
171
+ fonttools==4.42.1
172
+ frozenlist==1.3.3
173
+ fsspec==2024.6.0
174
+ ftfy==6.1.1
175
+ future @ file:///croot/future_1677599870788/work
176
+ fvcore==0.1.5.post20221221
177
+ gast==0.4.0
178
+ gcs-oauth2-boto-plugin==3.0
179
+ gcsfs==2023.6.0
180
+ gdcm==1.1
181
+ gdown==4.7.1
182
+ gfpgan==1.3.8
183
+ gguf==0.16.2
184
+ gin-config==0.5.0
185
+ gitdb==4.0.10
186
+ GitPython==3.1.30
187
+ gmpy2 @ file:///tmp/build/80754af9/gmpy2_1645455533097/work
188
+ google-api-core==2.11.1
189
+ google-apitools==0.5.32
190
+ google-auth==2.29.0
191
+ google-auth-oauthlib==1.0.0
192
+ google-cloud-core==2.3.2
193
+ google-cloud-storage==2.10.0
194
+ google-crc32c==1.5.0
195
+ google-pasta==0.2.0
196
+ google-reauth==0.1.1
197
+ google-resumable-media==2.5.0
198
+ googleapis-common-protos==1.59.1
199
+ gradio==4.31.5
200
+ gradio_client==0.16.4
201
+ grpcio==1.54.2
202
+ gsutil==5.25
203
+ h11==0.14.0
204
+ h5py==3.11.0
205
+ hjson==3.1.0
206
+ holoviews==1.18.3
207
+ httpcore==1.0.5
208
+ httplib2==0.20.4
209
+ httptools==0.6.1
210
+ httpx==0.27.0
211
+ httpx-ws==0.3.1
212
+ huggingface-hub==0.30.2
213
+ humanfriendly==10.0
214
+ humanize==4.7.0
215
+ hydra-core==1.1.2
216
+ hyper-tile @ git+https://github.com/tfernd/HyperTile@2ef64b2800d007d305755c33550537410310d7df
217
+ icecream==2.1.3
218
+ identify==2.5.24
219
+ idna @ file:///croot/idna_1666125576474/work
220
+ imagebind @ git+https://github.com/facebookresearch/ImageBind.git@95d27c7fd5a8362f3527e176c3a80ae5a4d880c0
221
+ imageio==2.34.2
222
+ imageio-ffmpeg==0.4.8
223
+ importlib-metadata==6.8.0
224
+ importlib-resources==5.12.0
225
+ inflect==6.0.4
226
+ inflection==0.5.1
227
+ install==1.3.5
228
+ iopath==0.1.9
229
+ ipykernel==6.25.0
230
+ ipython==8.14.0
231
+ ipywidgets==8.0.6
232
+ itsdangerous==2.1.2
233
+ jaraco.classes @ file:///tmp/build/80754af9/jaraco.classes_1620983179379/work
234
+ jax==0.4.6
235
+ jaxlib==0.4.6
236
+ jedi==0.19.0
237
+ jeepney @ file:///tmp/build/80754af9/jeepney_1627537048313/work
238
+ Jinja2==3.1.2
239
+ jmespath==0.10.0
240
+ joblib==1.3.2
241
+ jsonmerge==1.8.0
242
+ jsonpatch @ file:///croot/jsonpatch_1710807507480/work
243
+ jsonpointer==2.1
244
+ jsonschema @ file:///croot/jsonschema_1699041609003/work
245
+ jsonschema-specifications @ file:///croot/jsonschema-specifications_1699032386549/work
246
+ julius==0.2.7
247
+ jupyter-js-widgets-nbextension==0.0.2.dev0
248
+ jupyter_client==8.3.0
249
+ jupyter_core @ file:///croot/jupyter_core_1698937308754/work
250
+ jupyterlab-widgets==3.0.7
251
+ k-diffusion==0.1.1
252
+ kaggle==1.5.13
253
+ kagglehub==0.3.12
254
+ kandinsky2 @ git+https://github.com/ai-forever/Kandinsky-2.git@aeefc1ce3a989eefe7c99d6a02cce44318c4d210
255
+ kecam==1.4.1
256
+ keras==2.14.0
257
+ keras-efficientnet-v2==1.2.2
258
+ Keras-Preprocessing==1.1.2
259
+ keyring @ file:///croot/keyring_1709632513808/work
260
+ kiwisolver==1.4.5
261
+ kornia==0.6.7
262
+ laion-clap==1.1.4
263
+ langcodes==3.3.0
264
+ lark==1.1.2
265
+ lazy_loader==0.2
266
+ libarchive-c @ file:///tmp/build/80754af9/python-libarchive-c_1617780486945/work
267
+ libclang==16.0.0
268
+ libmambapy @ file:///croot/mamba-split_1694187754698/work/libmambapy
269
+ librosa==0.9.2
270
+ lightning-utilities==0.8.0
271
+ linkify-it-py==2.0.2
272
+ lit==16.0.6
273
+ llvmlite==0.42.0
274
+ lmdb==1.4.1
275
+ local-attention==1.8.6
276
+ loguru==0.7.2
277
+ lpips==0.1.4
278
+ lvis==0.5.3
279
+ lxml==4.9.4
280
+ Markdown==3.6
281
+ markdown-it-py==2.2.0
282
+ markdown2==2.4.8
283
+ MarkupSafe==2.1.2
284
+ matplotlib==3.7.3
285
+ matplotlib-inline==0.1.6
286
+ mayavi==4.8.1
287
+ mc-bin-client==1.0.1
288
+ mdit-py-plugins==0.3.3
289
+ mdurl==0.1.2
290
+ mediapipe==0.10.15
291
+ menuinst @ file:///croot/menuinst_1706732933928/work
292
+ mkl-fft @ file:///croot/mkl_fft_1695058164594/work
293
+ mkl-random @ file:///croot/mkl_random_1695059800811/work
294
+ mkl-service==2.4.0
295
+ ml-collections==0.1.1
296
+ ml-dtypes==0.2.0
297
+ mmcv==1.7.2
298
+ mmengine==0.10.4
299
+ model-index==0.1.11
300
+ more-itertools @ file:///croot/more-itertools_1700662129964/work
301
+ MouseInfo==0.1.3
302
+ moviepy==1.0.3
303
+ mpmath @ file:///croot/mpmath_1690848262763/work
304
+ msgpack==1.0.5
305
+ multidict==6.0.4
306
+ multiformats==0.2.1
307
+ multiformats-config==0.2.0.post4
308
+ multiprocess==0.70.14
309
+ murmurhash==1.0.9
310
+ mypy-extensions==1.0.0
311
+ namex==0.0.8
312
+ natsort==8.4.0
313
+ navigator-updater @ file:///croot/navigator-updater_1713453362034/work
314
+ nbformat @ file:///croot/nbformat_1694616755618/work
315
+ ndindex==1.8
316
+ nest-asyncio==1.5.7
317
+ networkx==3.1
318
+ nh3==0.2.13
319
+ nibabel==5.1.0
320
+ ninja==1.11.1
321
+ nlpaug==1.1.11
322
+ nltk==3.8.1
323
+ nodeenv==1.8.0
324
+ numba==0.59.1
325
+ numexpr @ file:///croot/numexpr_1696515281613/work
326
+ numpy==1.26.4
327
+ nvidia-cublas-cu11==11.11.3.6
328
+ nvidia-cublas-cu117==11.10.1.25
329
+ nvidia-cublas-cu12==12.3.4.1
330
+ nvidia-cuda-cupti-cu11==11.8.87
331
+ nvidia-cuda-cupti-cu117==11.7.50
332
+ nvidia-cuda-cupti-cu12==12.3.101
333
+ nvidia-cuda-nvcc-cu11==11.8.89
334
+ nvidia-cuda-nvcc-cu12==12.3.107
335
+ nvidia-cuda-nvrtc-cu11==11.8.89
336
+ nvidia-cuda-nvrtc-cu12==12.3.107
337
+ nvidia-cuda-runtime-cu11==11.8.89
338
+ nvidia-cuda-runtime-cu117==11.7.60
339
+ nvidia-cuda-runtime-cu12==12.3.101
340
+ nvidia-cudnn-cu11==8.7.0.84
341
+ nvidia-cudnn-cu116==8.4.0.27
342
+ nvidia-cudnn-cu12==9.0.0.312
343
+ nvidia-cufft-cu11==10.9.0.58
344
+ nvidia-cufft-cu12==11.0.12.1
345
+ nvidia-curand-cu11==10.3.0.86
346
+ nvidia-curand-cu12==10.3.4.107
347
+ nvidia-cusolver-cu11==11.4.1.48
348
+ nvidia-cusolver-cu12==11.5.4.101
349
+ nvidia-cusparse-cu11==11.7.5.86
350
+ nvidia-cusparse-cu12==12.2.0.103
351
+ nvidia-nccl-cu11==2.19.3
352
+ nvidia-nccl-cu12==2.19.3
353
+ nvidia-nvjitlink-cu12==12.3.101
354
+ nvidia-nvtx-cu11==11.8.86
355
+ nvidia-pyindex==1.0.9
356
+ oauth2client==4.1.3
357
+ oauthlib==3.2.2
358
+ omegaconf==2.3.0
359
+ onnx==1.15.0
360
+ onnx-graphsurgeon==0.5.2
361
+ onnx2torch==1.5.6
362
+ onnxruntime==1.16.3
363
+ open_clip_torch==2.26.1
364
+ openai==0.27.8
365
+ opencv-contrib-python==4.6.0.66
366
+ opencv-python==4.6.0
367
+ opendatalab==0.0.10
368
+ opendatasets==0.1.22
369
+ openmim==0.3.9
370
+ openxlab==0.1.1
371
+ opt-einsum==3.3.0
372
+ optax==0.1.5
373
+ optree==0.11.0
374
+ orbax-checkpoint==0.1.6
375
+ ordered-set==4.1.0
376
+ orjson==3.9.0
377
+ oss2==2.17.0
378
+ outcome==1.3.0.post0
379
+ packaging @ file:///croot/packaging_1710807400464/work
380
+ pandas==2.0.2
381
+ panel==1.4.4
382
+ param==2.1.0
383
+ parameterized==0.9.0
384
+ parso==0.8.3
385
+ pathspec==0.11.1
386
+ pathtools==0.1.2
387
+ pathy==0.10.1
388
+ pedalboard==0.7.4
389
+ peewee==3.16.2
390
+ peft==0.10.0
391
+ pexpect==4.8.0
392
+ pickleshare==0.7.5
393
+ piexif==1.1.3
394
+ Pillow==9.4.0
395
+ pkce @ file:///croot/pkce_1690384816590/work
396
+ pkginfo @ file:///croot/pkginfo_1679431160147/work
397
+ platformdirs==3.8.0
398
+ plotly==5.14.1
399
+ pluggy @ file:///tmp/build/80754af9/pluggy_1648024709248/work
400
+ ply==3.11
401
+ polygraphy==0.49.9
402
+ pooch==1.8.1
403
+ portalocker==2.7.0
404
+ pre-commit==3.3.1
405
+ prefigure==0.0.9
406
+ preshed==3.0.8
407
+ proglog==0.1.10
408
+ progressbar==2.5
409
+ prompt-toolkit==3.0.39
410
+ protobuf==4.25.3
411
+ psutil==5.9.5
412
+ ptyprocess==0.7.0
413
+ pure-eval==0.2.2
414
+ py-cpuinfo==9.0.0
415
+ pyarrow==17.0.0
416
+ pyasn1==0.6.0
417
+ pyasn1-modules==0.3.0
418
+ PyAutoGUI==0.9.54
419
+ pyav==12.0.5
420
+ pycocoevalcap==1.2
421
+ pycocotools==2.0.6
422
+ pycosat @ file:///croot/pycosat_1696536503704/work
423
+ pycparser==2.21
424
+ pycryptodome==3.20.0
425
+ pycryptodomex==3.19.0
426
+ pydantic==2.7.3
427
+ pydantic_core==2.18.4
428
+ pydeck==0.8.1b0
429
+ pyDeprecate==0.3.2
430
+ pydicom==2.3.1
431
+ pydot==1.4.2
432
+ pydub==0.25.1
433
+ pyface==8.0.0
434
+ PyGetWindow==0.0.9
435
+ Pygments==2.15.1
436
+ PyJWT==2.7.0
437
+ pylibmc==1.6.3
438
+ pyloudnorm==0.1.1
439
+ pymemcache==4.0.0
440
+ Pympler==1.0.1
441
+ PyMsgBox==1.0.9
442
+ pynndescent==0.5.12
443
+ pynvml==11.5.0
444
+ pyOpenSSL @ file:///croot/pyopenssl_1690223430423/work
445
+ pyparsing==3.1.1
446
+ pyperclip==1.9.0
447
+ pyproj==3.6.0
448
+ PyQt5==5.15.10
449
+ PyQt5-sip @ file:///croot/pyqt-split_1698769088074/work/pyqt_sip
450
+ pyre-extensions==0.0.29
451
+ PyRect==0.2.0
452
+ PyScreeze==1.0.1
453
+ pyshp==2.3.1
454
+ PySocks==1.7.1
455
+ pystoi==0.4.1
456
+ python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
457
+ python-docx==0.8.11
458
+ python-dotenv==1.0.0
459
+ python-magic==0.4.27
460
+ python-memcached==1.59
461
+ python-multipart==0.0.9
462
+ python-slugify==8.0.1
463
+ python3-xlib==0.15
464
+ pytorch-lantern==0.12.7
465
+ pytorch-lightning==2.1.0
466
+ pytorch-pretrained-biggan==0.1.1
467
+ pytorch-warmup==0.1.1
468
+ pytorchvideo==0.1.5
469
+ pytweening==1.2.0
470
+ pytz @ file:///croot/pytz_1695131579487/work
471
+ pyu2f==0.1.5
472
+ PyVirtualDisplay==3.0
473
+ pyviz_comms==3.0.2
474
+ PyWavelets==1.4.1
475
+ PyYAML==6.0
476
+ pyzmq==25.1.0
477
+ QtPy @ file:///croot/qtpy_1700144840038/work
478
+ randomname==0.2.1
479
+ realesrgan==0.3.0
480
+ referencing @ file:///croot/referencing_1699012038513/work
481
+ regex==2023.6.3
482
+ repeng @ git+https://github.com/vgel/repeng.git@c9093abddd87f865e7e2bcf4b3e556ec8813b5b2
483
+ replicate==0.25.1
484
+ requests==2.32.3
485
+ requests-oauthlib==1.3.1
486
+ requests-toolbelt @ file:///croot/requests-toolbelt_1690874004362/work
487
+ resampy==0.4.3
488
+ resize-right==0.0.2
489
+ responses==0.18.0
490
+ retry-decorator==1.1.1
491
+ rfc3986==1.5.0
492
+ rich==12.6.0
493
+ rotary-embedding-torch==0.3.0
494
+ rpds-py @ file:///croot/rpds-py_1698945930462/work
495
+ rsa==4.7.2
496
+ ruamel-yaml-conda @ file:///croot/ruamel_yaml_1667489728852/work
497
+ ruamel.yaml @ file:///croot/ruamel.yaml_1666304550667/work
498
+ ruamel.yaml.clib @ file:///croot/ruamel.yaml.clib_1666302247304/work
499
+ ruff==0.4.1
500
+ s2wrapper @ git+https://github.com/bfshi/scaling_on_scales@f08aec91337ae1ed6d7cc7a55441a96d51c14dd1
501
+ s3fs==2024.6.0
502
+ s3transfer==0.10.1
503
+ sacremoses==0.0.53
504
+ safetensors==0.4.1
505
+ salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@4a85b17846ee62f09c40f37cc955dd33c2abec68
506
+ scikit-image==0.20.0
507
+ scikit-learn==1.5.1
508
+ scikit-surprise==1.1.3
509
+ scipy==1.11.1
510
+ SecretStorage @ file:///croot/secretstorage_1678709481048/work
511
+ selenium==4.29.0
512
+ semantic-version==2.10.0
513
+ semver @ file:///croot/semver_1709243621175/work
514
+ sentencepiece==0.1.99
515
+ sentry-sdk==1.25.1
516
+ setproctitle==1.3.2
517
+ sgm @ file:///home/ryn_mote/Misc/generative-models
518
+ shapely==2.0.1
519
+ shellingham==1.5.0.post1
520
+ shortuuid==1.0.11
521
+ SimpleITK==2.2.1
522
+ sip @ file:///croot/sip_1698675935381/work
523
+ six @ file:///tmp/build/80754af9/six_1644875935023/work
524
+ sk-video==1.1.10
525
+ smart-open==6.3.0
526
+ smmap==5.0.0
527
+ sniffio==1.3.0
528
+ sortedcontainers==2.4.0
529
+ sounddevice==0.5.0
530
+ SoundFile==0.10.2
531
+ soupsieve==2.4.1
532
+ spaces==0.27.0
533
+ spacy==3.5.3
534
+ spacy-legacy==3.0.12
535
+ spacy-loggers==1.0.4
536
+ sqlparse==0.4.4
537
+ srsly==2.4.6
538
+ stable-audio-tools==0.0.16
539
+ stable-fast @ https://github.com/chengzeyi/stable-fast/releases/download/v1.0.4/stable_fast-1.0.4+torch220cu118-cp310-cp310-manylinux2014_x86_64.whl#sha256=11716f733237f557bee452eee63db415b4daeff29a28d939f73fff8003f0d415
540
+ stack-data==0.6.2
541
+ stanza==1.5.0
542
+ starlette==0.37.2
543
+ streamlit==1.22.0
544
+ svgwrite==1.4.3
545
+ sympy @ file:///croot/sympy_1701397643339/work
546
+ tables==3.9.2
547
+ tabulate==0.9.0
548
+ tenacity==8.2.2
549
+ tensorboard==2.14.1
550
+ tensorboard-data-server==0.7.2
551
+ tensorboard-plugin-wit==1.8.1
552
+ tensorflow==2.14.0
553
+ tensorflow-addons==0.16.1
554
+ tensorflow-estimator==2.14.0
555
+ tensorflow-hub==0.16.1
556
+ tensorflow-io-gcs-filesystem==0.32.0
557
+ tensorrt==8.6.1.post1
558
+ tensorrt-bindings==8.6.1
559
+ tensorrt-libs==8.6.1
560
+ tensorstore==0.1.39
561
+ termcolor==2.3.0
562
+ text-unidecode==1.3
563
+ tf-estimator-nightly==2.8.0.dev2021122109
564
+ tf_keras==2.16.0
565
+ tgate==0.1.1
566
+ thinc==8.1.10
567
+ threadpoolctl==3.2.0
568
+ tifffile==2023.4.12
569
+ tiktoken==0.4.0
570
+ timm==0.9.8
571
+ tokenizers==0.20.3
572
+ tomesd==0.1.3
573
+ tomli==2.0.1
574
+ tomlkit==0.12.0
575
+ toolz==0.12.0
576
+ torch==2.2.2+cu118
577
+ torch-ema==0.3
578
+ torch-stoi==0.2.1
579
+ torchaudio==2.0.2+cu118
580
+ torchdiffeq==0.2.3
581
+ torchio==0.19.0
582
+ torchlibrosa==0.1.0
583
+ torchmetrics==0.11.4
584
+ torchsde==0.2.6
585
+ torchvision==0.15.2+cu118
586
+ tornado @ file:///croot/tornado_1696936946304/work
587
+ tqdm==4.66.5
588
+ traitlets @ file:///croot/traitlets_1671143879854/work
589
+ traits==6.4.1
590
+ traitsui==8.0.0
591
+ trampoline==0.1.2
592
+ transformers==4.46.3
593
+ trio==0.29.0
594
+ trio-websocket==0.12.2
595
+ triton==2.2.0
596
+ truststore @ file:///croot/truststore_1695244293384/work
597
+ typed-argument-parser==1.8.1
598
+ typeguard==4.2.1
599
+ typer==0.12.3
600
+ types-regex==2023.6.3.1
601
+ typing-inspect==0.8.0
602
+ typing-validation==1.0.0.post2
603
+ typing_extensions==4.12.2
604
+ tzdata @ file:///croot/python-tzdata_1690578112552/work
605
+ tzlocal==5.0.1
606
+ uc-micro-py==1.0.2
607
+ ujson @ file:///opt/conda/conda-bld/ujson_1657544923770/work
608
+ umap-learn==0.5.6
609
+ undetected-chromedriver==3.5.5
610
+ urllib3==1.26.18
611
+ uvicorn==0.29.0
612
+ uvloop==0.19.0
613
+ v-diffusion-pytorch==0.0.2
614
+ validators==0.20.0
615
+ vector-quantize-pytorch==1.9.14
616
+ vtk==9.2.6
617
+ wandb==0.15.4
618
+ wasabi==1.1.1
619
+ watchdog==3.0.0
620
+ watchfiles==0.22.0
621
+ wavedrom==2.0.3.post3
622
+ wcwidth==0.2.6
623
+ webdataset==0.2.48
624
+ webencodings==0.5.1
625
+ websocket-client==1.8.0
626
+ websockets==11.0.3
627
+ Werkzeug==2.3.4
628
+ wget==3.2
629
+ widgetsnbextension==4.0.7
630
+ wikipedia==1.4.0
631
+ wrapt==1.14.1
632
+ wsproto==1.2.0
633
+ x-transformers==1.26.6
634
+ xformers==0.0.20
635
+ xxhash==3.2.0
636
+ xyzservices==2024.4.0
637
+ yacs==0.1.8
638
+ yapf==0.40.1
639
+ yarl==1.9.2
640
+ yattag==1.15.1
641
+ zipp==3.16.0
642
+ zstandard @ file:///croot/zstandard_1677013143055/work
twitter_prompts.csv DELETED
@@ -1,47 +0,0 @@
1
- ,0
2
- 0,a sunset
3
- 1,a still life in blue
4
- 2,last day on earth
5
- 3,the conch shell
6
- 4,the winds of change
7
- 5,a surrealist eye
8
- 6,a surrealist polaroid photo of an apple
9
- 7,metaphysics
10
- 8,the sun is setting into my glass of tea
11
- 9,the moon at 3am
12
- 10,a memento mori
13
- 11,quaking aspen tree
14
- 12,violets and daffodils
15
- 13,espresso
16
- 14,sisyphus
17
- 15,high windows of stained glass
18
- 16,a green dog
19
- 17,an adorable companion; it is a pig
20
- 18,bird of paradise
21
- 19,a complex intricate machine
22
- 20,a white clock
23
- 21,a film featuring the landscape Salt Lake City Utah
24
- 22,a creature
25
- 23,a house set aflame.
26
- 24,a gorgeous landscape by Cy Twombly
27
- 25,smoke rises from the caterpillar's hookah
28
- 26,corvid in red
29
- 27,Monet's pond
30
- 28,Genesis
31
- 29,Death is a black camel that kneels down so we can ride
32
- 30,a cherry tree made of fractals
33
- 29,the end of the sidewalk
34
- 30,a polaroid photo of a bustling city of lights and sky scrapers
35
- 31,The Fig Tree metaphor
36
- 32,God killed Van Gogh.
37
- 33,a cosmic entity alien with four eyes.
38
- 34,a horse with 128 eyes.
39
- 35,a being with an infinite set of eyes (it is omniscient)
40
- 36,A sticky-note magnum opus featuring birds
41
- 37,Moka Pot
42
- 38,the moon is a sickle cell
43
- 39,The Penultimate Supper
44
- 40,Art
45
- 41,surrealism
46
- 42,a god made of wires & dust
47
- 43,a dandelion blown into the universe