rynmurdock committed on
Commit
fac61f6
·
1 Parent(s): aeeead2

other tiger
.gitattributes CHANGED
@@ -28,3 +28,5 @@ first.png filter=lfs diff=lfs merge=lfs -text
  fourth.png filter=lfs diff=lfs merge=lfs -text
  *.mp4 filter=lfs diff=lfs merge=lfs -text
  *.png filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ last_epoch_ckpt/diffusion_pytorch_model.safetensors
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 rynmurdock
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.MD ADDED
@@ -0,0 +1,15 @@
+ # The Other Tiger
+
+ ## tl;dr
+ Train on embeddings of media preferred by a specific user -> produce embeddings of media they may enjoy.
+
+ In our case, we take the ECLIPSE `text embedding -> image embedding` prior (https://arxiv.org/abs/2312.04655) and finetune it into a `preferred image embeddings -> held-out image embedding` prior.
+
+ ### Related work:
+
+ Patron et al. model preference using a diffusion prior conditioned on user IDs and ratings: https://arxiv.org/abs/2502.18477
+
+ Wang et al. model preference using a generator conditioned on averaged CLIP embeddings of users: https://arxiv.org/abs/2304.03516
+
+ My previous work, based on collaborative filtering with CLIP embeddings: https://github.com/rynmurdock/generative_recommender
+
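The training loop itself is not part of this commit, so the following is only a minimal sketch of the objective described in the tl;dr, stitched together from the helpers this commit does add (model.py, data.py, config.py). The MSE loss, the `requires_grad` filter, and the exact embedding shapes are assumptions rather than code from the repo.

```python
import torch
import config
from model import get_model_and_tokenizer, get_optimizer
from data import get_dataloader

# Pass None to start from the pretrained ECLIPSE prior instead of ./last_epoch_ckpt/
model, image_encoder = get_model_and_tokenizer(None, config.device, config.dtype)
optimizer = get_optimizer([p for p in model.prior.parameters() if p.requires_grad], config.lr)
loader = get_dataloader(config.data_path, config.batch_size, config.num_workers,
                        processor=model.prior_pipe.image_processor)

for batch in loader:
    if batch is None:          # my_collate returns None on a bad batch
        continue
    samples, targets = batch   # samples: k=8 preferred images per row; targets: one held-out image
    b, k = samples.shape[:2]
    with torch.no_grad():
        cond = image_encoder(samples.flatten(0, 1).to(config.device, config.dtype))["image_embeds"]
        cond = cond.view(b, k, -1).float()   # (b, 8, d) preferred-image embeddings
        target = image_encoder(targets.to(config.device, config.dtype))["image_embeds"].float()
    latent = torch.randn(b, 1, cond.shape[-1], device=config.device)
    pred = model(latent, cond).predicted_image_embedding
    loss = torch.nn.functional.mse_loss(pred.view(b, -1), target)  # assumed objective
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```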
README.md DELETED
@@ -1,12 +0,0 @@
- ---
- license: mit
- title: Blue Tigers
- sdk: gradio
- emoji: 👍
- colorFrom: blue
- colorTo: purple
- pinned: true
- ---
- # Blue Tigers
-
- Zahir with movement.
app.py CHANGED
@@ -1,32 +1,29 @@
1
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  # TODO unify/merge origin and this
4
  # TODO save & restart from (if it exists) dataframe parquet
5
- import torch
6
 
7
- # lol
8
- DEVICE = 'cuda'
9
- STEPS = 8
10
- output_hidden_state = False
11
  device = "cuda"
12
- dtype = torch.bfloat16
13
 
14
 
15
  import spaces
16
-
17
  import matplotlib.pyplot as plt
18
- import matplotlib
19
- import logging
20
 
21
  import os
22
- import imageio
23
  import gradio as gr
24
- import numpy as np
25
- from sklearn.svm import LinearSVC
26
  import pandas as pd
27
  from apscheduler.schedulers.background import BackgroundScheduler
28
- import sched
29
- import threading
30
 
31
  import random
32
  import time
@@ -43,107 +40,38 @@ prevs_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'l
43
  import spaces
44
  start_time = time.time()
45
 
46
- prompt_list = [p for p in list(set(
47
- pd.read_csv('./twitter_prompts.csv').iloc[:, 1].tolist())) if type(p) == str]
48
-
49
-
50
  ####################### Setup Model
51
- from diffusers import EulerDiscreteScheduler, LCMScheduler, AutoencoderTiny, UNet2DConditionModel, AutoencoderKL, AutoPipelineForText2Image
52
- from transformers import CLIPTextModel
53
- from huggingface_hub import hf_hub_download
54
- from safetensors.torch import load_file
55
  from PIL import Image
56
- from transformers import CLIPVisionModelWithProjection
57
  import uuid
58
- import av
59
-
60
- def write_video(file_name, images, fps=16):
61
- container = av.open(file_name, mode="w")
62
-
63
- stream = container.add_stream("h264", rate=fps)
64
- # stream.options = {'preset': 'faster'}
65
- stream.thread_count = 1
66
- stream.width = 512
67
- stream.height = 512
68
- stream.pix_fmt = "yuv420p"
69
-
70
- for img in images:
71
- img = np.array(img)
72
- img = np.round(img).astype(np.uint8)
73
- frame = av.VideoFrame.from_ndarray(img, format="rgb24")
74
- for packet in stream.encode(frame):
75
- container.mux(packet)
76
- # Flush stream
77
- for packet in stream.encode():
78
- container.mux(packet)
79
- # Close the file
80
- container.close()
81
-
82
- def imio_write_video(file_name, images, fps=15):
83
- writer = imageio.get_writer(file_name, fps=fps)
84
-
85
- for im in images:
86
- writer.append_data(np.array(im))
87
- writer.close()
88
-
89
-
90
- image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="sdxl_models/image_encoder", torch_dtype=dtype,
91
- device_map='cuda')
92
- #vae = AutoencoderTiny.from_pretrained("madebyollin/taesd", torch_dtype=dtype)
93
-
94
- # vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=dtype)
95
- # vae = compile_unet(vae, config=config)
96
-
97
- #finetune_path = '''/home/ryn_mote/Misc/finetune-sd1.5/dreambooth-model best'''''
98
- #unet = UNet2DConditionModel.from_pretrained(finetune_path+'/unet/').to(dtype)
99
- #text_encoder = CLIPTextModel.from_pretrained(finetune_path+'/text_encoder/').to(dtype)
100
-
101
- #rynmurdock/Sea_Claws
102
- model_id = "stabilityai/stable-diffusion-xl-base-1.0"
103
- sdxl_lightening = "ByteDance/SDXL-Lightning"
104
- ckpt = "sdxl_lightning_8step_unet.safetensors"
105
- unet = UNet2DConditionModel.from_config(model_id, subfolder="unet", low_cpu_mem_usage=True, device_map=DEVICE).to(torch.float16)
106
- unet.load_state_dict(load_file(hf_hub_download(sdxl_lightening, ckpt)))
107
-
108
- image_encoder = CLIPVisionModelWithProjection.from_pretrained("h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map=DEVICE)
109
- pipe = AutoPipelineForText2Image.from_pretrained(model_id, unet=unet, torch_dtype=torch.float16, variant="fp16", image_encoder=image_encoder, low_cpu_mem_usage=True)
110
- pipe.unet._load_ip_adapter_weights(torch.load(hf_hub_download('h94/IP-Adapter', 'sdxl_models/ip-adapter_sdxl_vit-h.bin')))
111
- pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl_vit-h.bin")
112
- pipe.register_modules(image_encoder = image_encoder)
113
- pipe.set_ip_adapter_scale(0.8)
114
-
115
- #pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16, low_cpu_mem_usage=True)
116
- pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
117
-
118
- pipe.to(device=DEVICE).to(dtype=dtype)
119
- output_hidden_state = False
120
-
121
-
122
-
123
-
124
- # pipe.unet.fuse_qkv_projections()
125
- #pipe.enable_free_init(method="gaussian", use_fast_sampling=True)
126
-
127
- #pipe.unet = torch.compile(pipe.unet)
128
- #pipe.vae = torch.compile(pipe.vae)
129
-
130
 
131
 
132
  @spaces.GPU()
133
  def generate_gpu(in_im_embs, prompt='the scene'):
134
  with torch.no_grad():
135
- print(prompt)
136
- in_im_embs = in_im_embs.to('cuda').unsqueeze(0)
137
- output = pipe(prompt=prompt, guidance_scale=1, added_cond_kwargs={}, ip_adapter_image_embeds=[in_im_embs], num_inference_steps=STEPS)
138
- im_emb, _ = pipe.encode_image(
139
- output.images[0], 'cuda', 1, output_hidden_state
140
- )
141
- im_emb = im_emb.detach().to('cpu').to(torch.float32)
142
- return output, im_emb
143
-
144
-
145
- def generate(in_im_embs, prompt='the scene'):
146
- output, im_emb = generate_gpu(in_im_embs, prompt)
 
 
 
 
 
 
 
 
 
 
 
147
  nsfw = False#maybe_nsfw(output.images[0])
148
 
149
  name = str(uuid.uuid4()).replace("-", "")
@@ -154,87 +82,35 @@ def generate(in_im_embs, prompt='the scene'):
154
  # TODO could return an automatic dislike of auto dislike on the backend for neither as well; just would need refactoring.
155
  return None, im_emb
156
 
157
- output.images[0].save(path)
158
  return path, im_emb
159
 
160
 
161
  #######################
162
 
163
-
164
-
165
-
166
-
167
-
168
  @spaces.GPU()
169
- def solver(embs, ys):
170
- print('ys:', ys,'EMBS:', embs.shape, embs)
171
- ys = torch.tensor(ys).to('cpu', dtype=torch.float32).squeeze().unsqueeze(1)
172
-
173
- sol = LinearSVC(class_weight='balanced').fit(np.array(embs), np.array(torch.tensor(ys).float() * 2 - 1)).coef_
174
- return torch.tensor(sol).to('cpu', dtype=torch.float32)
175
-
176
-
177
 
 
178
 
 
179
  def get_user_emb(embs, ys):
180
- # sample only as many negatives as there are positives
181
- indices = range(len(ys))
182
- pos_indices = [i for i in indices if ys[i] > .5]
183
- neg_indices = [i for i in indices if ys[i] <= .5]
184
-
185
- mini = min(len(pos_indices), len(neg_indices))
186
 
187
- if len(ys) > 20: # drop earliest of whichever of neg or pos is most abundant
188
- if len(pos_indices) > len(neg_indices):
189
- ind = pos_indices[0]
190
- else:
191
- ind = neg_indices[0]
192
- ys.pop(ind)
193
- embs.pop(ind)
194
- print('Dropping at 20')
195
-
196
- if mini < 1:
197
- feature_embs = torch.stack([torch.randn(1024), torch.randn(1024)])
198
- ys_t = [0, 1]
199
- print('Not enough ratings.')
200
- else:
201
- indices = range(len(ys))
202
- ys_t = [ys[i] for i in indices]
203
- feature_embs = torch.stack([embs[e].detach().cpu() for e in indices]).squeeze()
204
-
205
- # scaler = preprocessing.StandardScaler().fit(feature_embs)
206
- # feature_embs = scaler.transform(feature_embs)
207
- # ys_t = ys
208
-
209
- print(np.array(feature_embs).shape, np.array(ys_t).shape)
210
-
211
- sol = solver(feature_embs.squeeze(), ys_t)
212
- dif = torch.tensor(sol, dtype=dtype).to(device)
213
-
214
- # could j have a base vector of a black image
215
- latest_pos = (random.sample([feature_embs[i] for i in range(len(ys_t)) if ys_t[i] > .5], 1)[0]).to(device, dtype)
216
-
217
- dif = ((dif / dif.std()) * latest_pos.std())
218
 
219
- sol = (1*latest_pos + 3*dif)/4
220
- return sol
221
 
222
-
223
- def pluck_img(user_id, user_emb):
224
- not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, 'gone') == 'gone' for i in prevs_df.iterrows()]]
225
- while len(not_rated_rows) == 0:
226
- not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, 'gone') == 'gone' for i in prevs_df.iterrows()]]
227
- time.sleep(.1)
228
- # TODO optimize this lol
229
- best_sim = -100000
230
- for i in not_rated_rows.iterrows():
231
- # TODO sloppy .to but it is 3am.
232
- sim = torch.cosine_similarity(i[1]['embeddings'].detach().to('cpu'), user_emb.detach().to('cpu'))
233
- if sim > best_sim:
234
- best_sim = sim
235
- best_row = i[1]
236
- img = best_row['paths']
237
- return img
238
 
239
 
240
  def background_next_image():
@@ -256,43 +132,30 @@ def background_next_image():
256
  # media.
257
 
258
  unrated_from_user = not_rated_rows[[i[1]['from_user_id'] == uid for i in not_rated_rows.iterrows()]]
259
- rated_from_user = rated_rows[[i[1]['from_user_id'] == uid for i in rated_rows.iterrows()]]
260
 
261
- # we pop previous ratings if there are > n
262
- if len(rated_from_user) >= 15:
263
- oldest = rated_from_user.iloc[0]['paths']
264
- prevs_df = prevs_df[prevs_df['paths'] != oldest]
265
  # we don't compute more after n are in the queue for them
266
  if len(unrated_from_user) >= 10:
267
  continue
268
 
269
  if len(rated_rows) < 5:
270
- continue
271
-
272
- embs, ys = pluck_embs_ys(uid)
273
-
274
- user_emb = get_user_emb(embs, [y[1] for y in ys])
275
-
276
 
277
  global glob_idx
278
  glob_idx += 1
279
- if glob_idx >= (len(prompt_list)-1):
280
- glob_idx = 0
281
-
282
 
283
- if glob_idx % 7 == 0:
284
- text = prompt_list[glob_idx]
285
- else:
286
- text = 'an image'
287
- img, embs = generate(user_emb, text)
288
 
289
  if img:
290
  tmp_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'latest_user_to_rate', 'text', 'gemb'])
291
  tmp_df['paths'] = [img]
292
- tmp_df['embeddings'] = [embs]
293
  tmp_df['user:rating'] = [{' ': ' '}]
294
  tmp_df['from_user_id'] = [uid]
295
- tmp_df['text'] = [text]
296
  prevs_df = pd.concat((prevs_df, tmp_df))
297
  # we can free up storage by deleting the image
298
  if len(prevs_df) > 500:
@@ -305,19 +168,27 @@ def background_next_image():
305
  # only keep 50 images & embeddings & ips, then remove oldest besides calibrating
306
  prevs_df = pd.concat((prevs_df.iloc[:6], prevs_df.iloc[7:]))
307
 
 
 
 
 
 
308
 
309
- def pluck_embs_ys(user_id):
310
- rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) != None for i in prevs_df.iterrows()]]
311
- #not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) == None for i in prevs_df.iterrows()]]
312
- #while len(not_rated_rows) == 0:
313
- # not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) == None for i in prevs_df.iterrows()]]
314
- # rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) != None for i in prevs_df.iterrows()]]
315
- # time.sleep(.01)
316
- # print('current user has 0 not_rated_rows')
317
-
318
- embs = rated_rows['embeddings'].to_list()
319
- ys = [i[user_id] for i in rated_rows['user:rating'].to_list()]
320
- return embs, ys
 
 
 
321
 
322
  def next_image(calibrate_prompts, user_id):
323
  with torch.no_grad():
@@ -326,11 +197,8 @@ def next_image(calibrate_prompts, user_id):
326
  image = prevs_df[prevs_df['paths'] == cal_video]['paths'].to_list()[0]
327
  return image, calibrate_prompts,
328
  else:
329
- embs, ys = pluck_embs_ys(user_id)
330
- ys_here = [y[1] for y in ys]
331
- user_emb = get_user_emb(embs, ys_here)
332
- image = pluck_img(user_id, user_emb)
333
- return image, calibrate_prompts,
334
 
335
 
336
 
@@ -451,7 +319,7 @@ Explore the latent space without text prompts based on your preferences. Learn m
451
  ''', elem_id="description")
452
  user_id = gr.State()
453
  # calibration videos -- this is a misnomer now :D
454
- calibrate_prompts = gr.State([
455
  './5o.png',
456
  './2o.png',
457
  './6o.png',
@@ -462,22 +330,18 @@ Explore the latent space without text prompts based on your preferences. Learn m
462
  './4o.png',
463
  './10o.png',
464
  './9o.png',
465
- ])
 
466
  def l():
467
  return None
468
 
469
  with gr.Row(elem_id='output-image'):
470
  img = gr.Image(
471
- label='Lightning',
472
- # autoplay=True,
473
- interactive=False,
474
- # height=512,
475
- # width=512,
476
- #include_audio=False,
477
- elem_id="video_output",
478
- type='filepath',
479
- )
480
- #img.play(l, js='''document.querySelector('[data-testid="Lightning-player"]').loop = true''')
481
 
482
 
483
 
@@ -531,24 +395,24 @@ Explore the latent space without text prompts based on your preferences. Learn m
531
  </ div>''')
532
 
533
  # TODO quiet logging
534
-
535
  scheduler = BackgroundScheduler()
536
  scheduler.add_job(func=background_next_image, trigger="interval", seconds=.2)
537
  scheduler.start()
538
 
539
- #thread = threading.Thread(target=background_next_image,)
540
- #thread.start()
541
-
542
  # TODO shouldn't call this before gradio launch, yeah?
543
  @spaces.GPU()
544
  def encode_space(x):
545
- im_emb, _ = pipe.encode_image(
546
- image, DEVICE, 1, output_hidden_state
 
 
 
547
  )
 
548
  return im_emb.detach().to('cpu').to(torch.float32)
549
 
550
  # prep our calibration videos
551
- for im, txt in [ # DO NOT NAME THESE PNGs JUST NUMBERS! apparently we assign images by number
552
  ('./1o.png', 'describe the scene: omens in the suburbs'),
553
  ('./2o.png', 'describe the scene: geometric abstract art of a windmill'),
554
  ('./3o.png', 'describe the scene: memento mori'),
@@ -559,7 +423,9 @@ for im, txt in [ # DO NOT NAME THESE PNGs JUST NUMBERS! apparently we assign ima
559
  ('./8o.png', '8 '),
560
  ('./9o.png', '9 '),
561
  ('./10o.png', '10 '),
562
- ]:
 
 
563
  tmp_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'text', 'gemb'])
564
  tmp_df['paths'] = [im]
565
  image = Image.open(im).convert('RGB')
 
1
 
2
+ import gradio as gr
3
+ import random
4
+ import time
5
+ import torch
6
+
7
+
8
+ import config
9
+ from model import get_model_and_tokenizer
10
+
11
+ model, model.prior_pipe.image_encoder = get_model_and_tokenizer(config.model_path,
12
+ 'cuda', torch.bfloat16)
13
 
14
  # TODO unify/merge origin and this
15
  # TODO save & restart from (if it exists) dataframe parquet
 
16
 
 
 
 
 
17
  device = "cuda"
 
18
 
19
 
20
  import spaces
 
21
  import matplotlib.pyplot as plt
 
 
22
 
23
  import os
 
24
  import gradio as gr
 
 
25
  import pandas as pd
26
  from apscheduler.schedulers.background import BackgroundScheduler
 
 
27
 
28
  import random
29
  import time
 
40
  import spaces
41
  start_time = time.time()
42
 
 
 
 
 
43
  ####################### Setup Model
44
+ from diffusers import EulerDiscreteScheduler
 
 
 
45
  from PIL import Image
 
46
  import uuid
47
 
48
 
49
  @spaces.GPU()
50
  def generate_gpu(in_im_embs, prompt='the scene'):
51
  with torch.no_grad():
52
+ in_im_embs = in_im_embs.to('cuda')
53
+
54
+ negative_image_embeds = in_im_embs[0] # model.prior_pipe.get_zero_embed()
55
+ positive_image_embeds = in_im_embs[1]
56
+
57
+ images = model.kandinsky_pipe(
58
+ num_inference_steps=50,
59
+ image_embeds=positive_image_embeds,
60
+ negative_image_embeds=negative_image_embeds,
61
+ guidance_scale=11,
62
+ ).images[0]
63
+ cond = (
64
+ model.prior_pipe.image_processor(images, return_tensors="pt")
65
+ .pixel_values[0]
66
+ .unsqueeze(0)
67
+ .to(dtype=model.prior_pipe.image_encoder.dtype, device=device)
68
+ )
69
+ im_emb = model.prior_pipe.image_encoder(cond)["image_embeds"]
70
+ return images, im_emb
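For readers skimming the diff: `generate_gpu` now takes a single stacked tensor rather than one IP-Adapter embedding. Row 0 is used as the negative image condition and row 1 as the positive one, matching what `get_user_emb` returns further down (`torch.stack([sample_embs(negatives), sample_embs(positives)])`). A small illustration of that layout, with hypothetical values and the embedding width of 1280 taken from the checkpoint config:

```python
import torch

# hypothetical prior outputs; in the app these come from sample_embs()
neg = torch.randn(1, 1280)   # conditioned on the user's disliked images
pos = torch.randn(1, 1280)   # conditioned on the user's liked images

in_im_embs = torch.stack([neg, pos])      # shape (2, 1, 1280), what get_user_emb() returns
negative_image_embeds = in_im_embs[0]     # steers the Kandinsky decoder away from dislikes
image_embeds = in_im_embs[1]              # the positive condition passed as image_embeds
```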
71
+
72
+
73
+ def generate(in_im_embs, ):
74
+ output, im_emb = generate_gpu(in_im_embs)
75
  nsfw = False#maybe_nsfw(output.images[0])
76
 
77
  name = str(uuid.uuid4()).replace("-", "")
 
82
  # TODO could return an automatic dislike of auto dislike on the backend for neither as well; just would need refactoring.
83
  return None, im_emb
84
 
85
+ output.save(path)
86
  return path, im_emb
87
 
88
 
89
  #######################
90
 
 
 
 
 
 
91
  @spaces.GPU()
92
+ def sample_embs(prompt_embeds):
93
+ latent = torch.randn(prompt_embeds.shape[0], 1, prompt_embeds.shape[-1])
94
+ if prompt_embeds.shape[1] < 8: # TODO grab as `k` arg from config
95
+ prompt_embeds = torch.nn.functional.pad(prompt_embeds, [0, 0, 0, 8-prompt_embeds.shape[1]])
96
+ assert prompt_embeds.shape[1] == 8, f"The model is set to take `k` cond image embeds but got shape {prompt_embeds.shape}"
97
+ image_embeds = model(latent.to('cuda'), prompt_embeds.to('cuda')).predicted_image_embedding
 
 
98
 
99
+ return image_embeds
100
 
101
+ @spaces.GPU()
102
  def get_user_emb(embs, ys):
103
+ # sample up to 8 positives and up to 8 negatives from the user's rating history
+ positives = [e for e, y in zip(embs, ys) if y == 1]
+ pos_sample = random.sample(positives, min(8, len(positives)))
+ positives = torch.stack(pos_sample, 1)
+
+ negs = [e for e, y in zip(embs, ys) if y == 0]
+ negative_embs = random.sample(negs, min(8, len(negs)))
+ negatives = torch.stack(negative_embs, 1)
110
 
111
+ image_embeds = torch.stack([sample_embs(negatives), sample_embs(positives)])
 
112
 
113
+ return image_embeds
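`sample_embs` zero-pads the conditioning set up to `k = 8` embeddings before calling the prior, which mirrors the zeroed-out conditioning dropout applied at training time in data.py. A standalone illustration of that `F.pad` call (the shapes are hypothetical):

```python
import torch
import torch.nn.functional as F

prompt_embeds = torch.randn(1, 3, 1280)   # only 3 rated images so far (hypothetical)
if prompt_embeds.shape[1] < 8:
    # pad spec reads (last dim left, last dim right, 2nd-to-last left, 2nd-to-last right),
    # so this appends 8 - 3 = 5 all-zero embeddings along the sequence dimension
    prompt_embeds = F.pad(prompt_embeds, [0, 0, 0, 8 - prompt_embeds.shape[1]])
print(prompt_embeds.shape)                # torch.Size([1, 8, 1280])
```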
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
 
116
  def background_next_image():
 
132
  # media.
133
 
134
  unrated_from_user = not_rated_rows[[i[1]['from_user_id'] == uid for i in not_rated_rows.iterrows()]]
 
135
 
 
 
 
 
136
  # we don't compute more after n are in the queue for them
137
  if len(unrated_from_user) >= 10:
138
  continue
139
 
140
  if len(rated_rows) < 5:
141
+ continue
 
 
 
 
 
142
 
143
  global glob_idx
144
  glob_idx += 1
145
+
146
+ ems = rated_rows['embeddings'].to_list()
147
+ ys = [i[uid][0] for i in rated_rows['user:rating'].to_list()]
148
 
149
+ emz = get_user_emb(ems, ys)
150
+ img, embs = generate(emz)
 
 
 
151
 
152
  if img:
153
  tmp_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'latest_user_to_rate', 'text', 'gemb'])
154
  tmp_df['paths'] = [img]
155
+ tmp_df['embeddings'] = [embs.to(torch.float32).to('cpu')]
156
  tmp_df['user:rating'] = [{' ': ' '}]
157
  tmp_df['from_user_id'] = [uid]
158
+ tmp_df['text'] = ['']
159
  prevs_df = pd.concat((prevs_df, tmp_df))
160
  # we can free up storage by deleting the image
161
  if len(prevs_df) > 500:
 
168
  # only keep 50 images & embeddings & ips, then remove oldest besides calibrating
169
  prevs_df = pd.concat((prevs_df.iloc[:6], prevs_df.iloc[7:]))
170
 
171
+ def pluck_img(user_id):
172
+ rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, None) is not None for i in prevs_df.iterrows()]]
173
+ ems = rated_rows['embeddings'].to_list()
174
+ ys = [i[user_id][0] for i in rated_rows['user:rating'].to_list()]
175
+ user_emb = get_user_emb(ems, ys)
176
 
177
+ not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, 'gone') == 'gone' for i in prevs_df.iterrows()]]
178
+ while len(not_rated_rows) == 0:
179
+ not_rated_rows = prevs_df[[i[1]['user:rating'].get(user_id, 'gone') == 'gone' for i in prevs_df.iterrows()]]
180
+ time.sleep(.1)
181
+ # TODO optimize this lol
182
+ best_sim = -10000000
183
+ for i in not_rated_rows.iterrows():
184
+ # TODO sloppy .to but it is 3am.
185
+ sim = torch.cosine_similarity(i[1]['embeddings'].detach().to('cpu'), user_emb.detach().to('cpu'), -1)
186
+ if len(sim) > 1: sim = sim[1]
187
+ if sim.squeeze() > best_sim:
188
+ best_sim = sim
189
+ best_row = i[1]
190
+ img = best_row['paths']
191
+ return img
192
 
193
  def next_image(calibrate_prompts, user_id):
194
  with torch.no_grad():
 
197
  image = prevs_df[prevs_df['paths'] == cal_video]['paths'].to_list()[0]
198
  return image, calibrate_prompts,
199
  else:
200
+ image = pluck_img(user_id)
201
+ return image, calibrate_prompts
 
 
 
202
 
203
 
204
 
 
319
  ''', elem_id="description")
320
  user_id = gr.State()
321
  # calibration videos -- this is a misnomer now :D
322
+ calibrate_prompts = [
323
  './5o.png',
324
  './2o.png',
325
  './6o.png',
 
330
  './4o.png',
331
  './10o.png',
332
  './9o.png',
333
+ ]
334
+ calibrate_prompts = gr.State(['image_init/'+c for c in calibrate_prompts])
335
  def l():
336
  return None
337
 
338
  with gr.Row(elem_id='output-image'):
339
  img = gr.Image(
340
+ label='Lightning',
341
+ interactive=False,
342
+ elem_id="output_im",
343
+ type='filepath',
344
+ )
 
 
 
 
 
345
 
346
 
347
 
 
395
  </ div>''')
396
 
397
  # TODO quiet logging
 
398
  scheduler = BackgroundScheduler()
399
  scheduler.add_job(func=background_next_image, trigger="interval", seconds=.2)
400
  scheduler.start()
401
 
 
 
 
402
  # TODO shouldn't call this before gradio launch, yeah?
403
  @spaces.GPU()
404
  def encode_space(x):
405
+ im = (
406
+ model.prior_pipe.image_processor(x, return_tensors="pt")
407
+ .pixel_values[0]
408
+ .unsqueeze(0)
409
+ .to(dtype=model.prior_pipe.image_encoder.dtype, device=device)
410
  )
411
+ im_emb = model.prior_pipe.image_encoder(im)["image_embeds"]
412
  return im_emb.detach().to('cpu').to(torch.float32)
413
 
414
  # prep our calibration videos
415
+ m_calibrate = [ # DO NOT NAME THESE PNGs JUST NUMBERS! apparently we assign images by number
416
  ('./1o.png', 'describe the scene: omens in the suburbs'),
417
  ('./2o.png', 'describe the scene: geometric abstract art of a windmill'),
418
  ('./3o.png', 'describe the scene: memento mori'),
 
423
  ('./8o.png', '8 '),
424
  ('./9o.png', '9 '),
425
  ('./10o.png', '10 '),
426
+ ]
427
+ m_calibrate = [('image_init/'+c[0], c[1]) for c in m_calibrate]
428
+ for im, txt in m_calibrate:
429
  tmp_df = pd.DataFrame(columns=['paths', 'embeddings', 'ips', 'user:rating', 'text', 'gemb'])
430
  tmp_df['paths'] = [im]
431
  image = Image.open(im).convert('RGB')
config.py ADDED
@@ -0,0 +1,16 @@
+ import torch
+
+ # NOTE model path name changed
+ model_path = './last_epoch_ckpt/'
+ lr = 1e-5
+ device = 'cuda'
+ dtype = torch.bfloat16
+ data_path = '../data/lke_2017'
+ save_path = './'
+ epochs = 4
+ batch_size = 16
+ number_k_clip_embed = 16 # divide by this to determine bundling together of sequences -> CLIP
+ num_workers = 32
+ seed = 107
+
+ # TODO config option to swap to diffusion?
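These values are read as plain module attributes; app.py in this commit, for example, loads the finetuned checkpoint with them (shown here with `config.device`/`config.dtype` in place of app.py's hard-coded 'cuda' and bfloat16):

```python
import config
from model import get_model_and_tokenizer

# returns the Zoo wrapper and its CLIP image encoder, loading the prior from model_path
model, image_encoder = get_model_and_tokenizer(config.model_path, config.device, config.dtype)
```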
data.py ADDED
@@ -0,0 +1,147 @@
1
+ import torch
2
+ from PIL import Image
3
+ import random
4
+ import logging
5
+ import torchvision
6
+
7
+ import torchvision.transforms as T
8
+ from torchvision.transforms.functional import InterpolationMode
9
+
10
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
11
+ IMAGENET_STD = (0.229, 0.224, 0.225)
12
+
13
+ def build_transform(input_size):
14
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
15
+ transform = T.Compose([
16
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
17
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
18
+ T.ToTensor(),
19
+ T.Normalize(mean=MEAN, std=STD)
20
+ ])
21
+ return transform
22
+
23
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
24
+ best_ratio_diff = float('inf')
25
+ best_ratio = (1, 1)
26
+ area = width * height
27
+ for ratio in target_ratios:
28
+ target_aspect_ratio = ratio[0] / ratio[1]
29
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
30
+ if ratio_diff < best_ratio_diff:
31
+ best_ratio_diff = ratio_diff
32
+ best_ratio = ratio
33
+ elif ratio_diff == best_ratio_diff:
34
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
35
+ best_ratio = ratio
36
+ return best_ratio
37
+
38
+ def dynamic_preprocess(image, min_num=1, max_num=8, image_size=448, use_thumbnail=False):
39
+ orig_width, orig_height = image.size
40
+ aspect_ratio = orig_width / orig_height
41
+
42
+ # calculate the existing image aspect ratio
43
+ target_ratios = set(
44
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
45
+ i * j <= max_num and i * j >= min_num)
46
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
47
+
48
+ # find the closest aspect ratio to the target
49
+ target_aspect_ratio = find_closest_aspect_ratio(
50
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size)
51
+
52
+ # calculate the target width and height
53
+ target_width = image_size * target_aspect_ratio[0]
54
+ target_height = image_size * target_aspect_ratio[1]
55
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
56
+
57
+ # resize the image
58
+ resized_img = image.resize((target_width, target_height))
59
+ processed_images = []
60
+ for i in range(blocks):
61
+ box = (
62
+ (i % (target_width // image_size)) * image_size,
63
+ (i // (target_width // image_size)) * image_size,
64
+ ((i % (target_width // image_size)) + 1) * image_size,
65
+ ((i // (target_width // image_size)) + 1) * image_size
66
+ )
67
+ # split the image
68
+ split_img = resized_img.crop(box)
69
+ processed_images.append(split_img)
70
+ assert len(processed_images) == blocks
71
+ if use_thumbnail and len(processed_images) != 1:
72
+ thumbnail_img = image.resize((image_size, image_size))
73
+ processed_images.append(thumbnail_img)
74
+ return processed_images
75
+
76
+
77
+ def load_image(image_file, pil_image=None, input_size=224,):
78
+ if not pil_image:
79
+ pil_image = Image.open(image_file)
80
+ image = pil_image.convert('RGB')
81
+ transform = build_transform(input_size=input_size)
82
+ # images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
83
+ pixel_values = [transform(image) for image in [image]]
84
+ pixel_values = torch.stack(pixel_values)
85
+ return pixel_values
86
+
87
+ def my_collate(batch):
88
+ try:
89
+ targets = torch.stack([s['target'] for s in batch])
90
+ samples = torch.stack([s['samples'] for s in batch])
91
+
92
+ # targets = torch.stack([s['target'] for s in batch if s is not None])
93
+ # samples = torch.stack([s['samples'] for s in batch if s is not None])
94
+ except Exception as e:
95
+ logging.warning('my_collate issue %s', e)
96
+ return None
97
+ return samples, targets
98
+
99
+
100
+ class ImageFolderSample(torchvision.datasets.ImageFolder):
101
+ def __init__(self, data_path, k, processor):
102
+ super().__init__(data_path)
103
+ self.k = k
104
+ self.processor = processor
105
+
106
+ def safe_getitem(self, index):
107
+ try:
108
+ target_path, class_type = self.samples[index]
109
+ target = torch.from_numpy(self.processor(self.loader(target_path)).data['pixel_values'][0])
110
+
111
+ input_paths = random.choices([p[0] for p in self.samples if p != target_path and class_type in p], k=self.k)
112
+ assert len(input_paths) == self.k # I think it may do this by default...
113
+ samples = torch.stack([torch.from_numpy(self.processor(self.loader(i)).data['pixel_values'][0]) for i in input_paths])
114
+ except Exception as e:
115
+ logging.warning('getitem issue %s', e)
116
+ return None  # my_collate will drop this sample
117
+
118
+ drop_mask = torch.rand(samples.shape[0],) < .2
119
+ samples[drop_mask] = 0
120
+
121
+ drop_whole_set_mask = torch.rand(1,) < .1
122
+ if drop_whole_set_mask:
123
+ samples = torch.zeros_like(samples)
124
+ return {'samples': samples[:, :3], 'target': target[:3]}
125
+
126
+ def __getitem__(self, index: int):
127
+ return self.safe_getitem(index)
128
+
129
+
130
+ # https://data.mendeley.com/datasets/fs4k2zc5j5/3
131
+ # Gomez, J. C., Ibarra-Manzano, M. A., & Almanza-Ojeda, D. L. (2017). User Identification in Pinterest Through the Refinement of Cascade Fusion of Text and Images. Research in Computing Science, 144, 41-52.
132
+ def get_dataset(data_path, processor):
133
+ return ImageFolderSample(data_path, 8, processor)
134
+
135
+
136
+ def get_dataloader(data_path, batch_size, num_workers, processor):
137
+ dataloader = torch.utils.data.DataLoader(
138
+ get_dataset(data_path, processor=processor),
139
+ num_workers=num_workers,
140
+ collate_fn=my_collate,
141
+ batch_size=batch_size,
142
+ shuffle=True,
143
+ drop_last=True
144
+ )
145
+ return dataloader
146
+
147
+
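A short note on what the dataset expects on disk: `ImageFolderSample` subclasses torchvision's `ImageFolder`, so `data_path` should hold one subfolder per user/board, and the 8 conditioning images for each target are drawn from the same folder. The sketch below assumes that layout (the folder names are hypothetical) and reuses the `CLIPImageProcessor` exposed by the prior pipeline in model.py.

```python
# Expected on-disk layout (an assumption based on torchvision's ImageFolder):
#
#   ../data/lke_2017/
#       user_0001/ img_a.jpg img_b.jpg ...
#       user_0002/ ...
#
import config
from data import get_dataset
from model import get_model_and_tokenizer

model, _ = get_model_and_tokenizer(None, config.device, config.dtype)
ds = get_dataset(config.data_path, processor=model.prior_pipe.image_processor)
sample = ds[0]   # {'samples': (8, 3, H, W), 'target': (3, H, W)}, or None if loading failed
```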
10o.png → image_init/10o.png RENAMED
File without changes
1o.png → image_init/1o.png RENAMED
File without changes
2o.png → image_init/2o.png RENAMED
File without changes
3o.png → image_init/3o.png RENAMED
File without changes
4o.png → image_init/4o.png RENAMED
File without changes
5o.png → image_init/5o.png RENAMED
File without changes
6o.png → image_init/6o.png RENAMED
File without changes
7o.png → image_init/7o.png RENAMED
File without changes
8o.png → image_init/8o.png RENAMED
File without changes
9o.png → image_init/9o.png RENAMED
File without changes
last_epoch_ckpt/config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "_class_name": "PriorTransformer",
+ "_diffusers_version": "0.34.0.dev0",
+ "_name_or_path": "./last_epoch_ckpt/",
+ "added_emb_type": "prd",
+ "additional_embeddings": 3,
+ "attention_head_dim": 32,
+ "clip_embed_dim": null,
+ "dropout": 0.0,
+ "embedding_dim": 1280,
+ "embedding_proj_dim": null,
+ "embedding_proj_norm_type": null,
+ "encoder_hid_proj_type": "linear",
+ "norm_in_type": null,
+ "num_attention_heads": 16,
+ "num_embeddings": 77,
+ "num_layers": 10
+ }
last_epoch_ckpt/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4caacf8d2ee0d5be682f6d8af30205c6c18092d15edf9f912467e0f2736ef6ae
+ size 136790920
nsfweffnetv2-b02-3epochs.h5 → latest_val.png RENAMED
File without changes
model.py ADDED
@@ -0,0 +1,52 @@
1
+
2
+ import torch
3
+ import logging
4
+ from diffusers import DiffusionPipeline
5
+
6
+ from prior.pipeline_kandinsky_prior import KandinskyPriorPipeline
7
+ from prior.prior_transformer import PriorTransformer
8
+
9
+
10
+ class Zoo(torch.nn.Module):
11
+ def __init__(self, prior, prior_pipe, kandinsky_pipe, ) -> None:
12
+ super().__init__()
13
+ self.prior = prior
14
+ self.prior_pipe = prior_pipe
15
+ self.kandinsky_pipe = kandinsky_pipe
16
+ self.pre_prior_transformer = None
17
+ # NOTE we may get better perf from freezing our prior
18
+ # and only training a transformer adapter?
19
+
20
+ def forward(self, latents, preferred_embeds):
21
+ pred = self.prior(latents, preferred_embeds)
22
+ return pred
23
+
24
+ def do_validation(self, images): # TODO constant val seed
25
+ assert all([len(i) == 8 for i in images]), f'We must have `k` images, not {len(images)}.'
26
+ image_embeds, negative_image_embeds = self.prior_pipe(images).to_tuple()
27
+ images = self.kandinsky_pipe(
28
+ num_inference_steps=50,
29
+ image_embeds=image_embeds,
30
+ negative_image_embeds=negative_image_embeds,
31
+ ).images
32
+ images[0].save('latest_val.png')
33
+ return images
34
+
35
+ def get_model_and_tokenizer(path, device, dtype):
36
+ prior = PriorTransformer.from_pretrained("ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior"
37
+ if path is None else path).to(device)
38
+
39
+ pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", prior=prior).to(device)
40
+ pipe_prior.image_encoder = pipe_prior.image_encoder.to(device, dtype)
41
+ # Note: don't set the prior to `dtype` as it may be half precision,
42
+ # and we're training with mixed precision
43
+ # so we need to keep our full-precision weight for trained params
44
+ kandinsky_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder").to(device, dtype)
45
+ model = Zoo(prior, pipe_prior, kandinsky_pipe).to(device)
46
+
47
+ return model, model.prior_pipe.image_encoder
48
+
49
+ def get_optimizer(params, lr):
50
+ logging.info(f'Training: {params}')
51
+ optimizer = torch.optim.AdamW(params, lr=lr)
52
+ return optimizer
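A hedged usage sketch (not part of the commit): load the finetuned prior and render one validation image from eight preference images. The specific file names are just the calibration images shipped under image_init/; any 8 RGB images would do.

```python
from PIL import Image
import config
from model import get_model_and_tokenizer

model, image_encoder = get_model_and_tokenizer(config.model_path, config.device, config.dtype)

# do_validation expects a batch of samples, each a list of exactly 8 PIL images
liked = [Image.open(f'image_init/{i}o.png').convert('RGB') for i in range(1, 9)]
rendered = model.do_validation([liked])     # also writes latest_val.png as a side effect
rendered[0].save('validation_sample.png')
```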
prior/__init__.py ADDED
File without changes
prior/pipeline_kandinsky_prior.py ADDED
@@ -0,0 +1,528 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional, Union
3
+
4
+ import numpy as np
5
+ import PIL
6
+ import torch
7
+ from transformers import (
8
+ CLIPImageProcessor,
9
+ CLIPTextModelWithProjection,
10
+ CLIPTokenizer,
11
+ CLIPVisionModelWithProjection,
12
+ )
13
+
14
+ from diffusers.models import PriorTransformer
15
+ from diffusers.schedulers import UnCLIPScheduler
16
+ from diffusers.utils import (
17
+ BaseOutput,
18
+ is_accelerate_available,
19
+ is_accelerate_version,
20
+ logging,
21
+ replace_example_docstring,
22
+ )
23
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
24
+
25
+
26
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
27
+
28
+ EXAMPLE_DOC_STRING = """
29
+ Examples:
30
+ ```py
31
+ >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
32
+ >>> import torch
33
+
34
+ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
35
+ >>> pipe_prior.to("cuda")
36
+
37
+ >>> prompt = "red cat, 4k photo"
38
+ >>> out = pipe_prior(prompt)
39
+ >>> image_emb = out.image_embeds
40
+ >>> negative_image_emb = out.negative_image_embeds
41
+
42
+ >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
43
+ >>> pipe.to("cuda")
44
+
45
+ >>> image = pipe(
46
+ ... prompt,
47
+ ... image_embeds=image_emb,
48
+ ... negative_image_embeds=negative_image_emb,
49
+ ... height=768,
50
+ ... width=768,
51
+ ... num_inference_steps=100,
52
+ ... ).images
53
+
54
+ >>> image[0].save("cat.png")
55
+ ```
56
+ """
57
+
58
+ EXAMPLE_INTERPOLATE_DOC_STRING = """
59
+ Examples:
60
+ ```py
61
+ >>> from diffusers import KandinskyPriorPipeline, KandinskyPipeline
62
+ >>> from diffusers.utils import load_image
63
+ >>> import PIL
64
+
65
+ >>> import torch
66
+ >>> from torchvision import transforms
67
+
68
+ >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
69
+ ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
70
+ ... )
71
+ >>> pipe_prior.to("cuda")
72
+
73
+ >>> img1 = load_image(
74
+ ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
75
+ ... "/kandinsky/cat.png"
76
+ ... )
77
+
78
+ >>> img2 = load_image(
79
+ ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
80
+ ... "/kandinsky/starry_night.jpeg"
81
+ ... )
82
+
83
+ >>> images_texts = ["a cat", img1, img2]
84
+ >>> weights = [0.3, 0.3, 0.4]
85
+ >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights)
86
+
87
+ >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
88
+ >>> pipe.to("cuda")
89
+
90
+ >>> image = pipe(
91
+ ... "",
92
+ ... image_embeds=image_emb,
93
+ ... negative_image_embeds=zero_image_emb,
94
+ ... height=768,
95
+ ... width=768,
96
+ ... num_inference_steps=150,
97
+ ... ).images[0]
98
+
99
+ >>> image.save("starry_cat.png")
100
+ ```
101
+ """
102
+
103
+
104
+ @dataclass
105
+ class KandinskyPriorPipelineOutput(BaseOutput):
106
+ """
107
+ Output class for KandinskyPriorPipeline.
108
+
109
+ Args:
110
+ image_embeds (`torch.FloatTensor`)
111
+ clip image embeddings for text prompt
112
+ negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`)
113
+ clip image embeddings for unconditional tokens
114
+ """
115
+
116
+ image_embeds: Union[torch.FloatTensor, np.ndarray]
117
+ negative_image_embeds: Union[torch.FloatTensor, np.ndarray]
118
+
119
+
120
+ class KandinskyPriorPipeline(DiffusionPipeline):
121
+ """
122
+ Pipeline for generating image prior for Kandinsky
123
+
124
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
125
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
126
+
127
+ Args:
128
+ prior ([`PriorTransformer`]):
129
+ The canonical unCLIP prior to approximate the image embedding from the text embedding.
130
+ image_encoder ([`CLIPVisionModelWithProjection`]):
131
+ Frozen image-encoder.
132
+ text_encoder ([`CLIPTextModelWithProjection`]):
133
+ Frozen text-encoder.
134
+ tokenizer (`CLIPTokenizer`):
135
+ Tokenizer of class
136
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
137
+ scheduler ([`UnCLIPScheduler`]):
138
+ A scheduler to be used in combination with `prior` to generate image embedding.
139
+ """
140
+
141
+ _exclude_from_cpu_offload = ["prior"]
142
+
143
+ def __init__(
144
+ self,
145
+ prior: PriorTransformer,
146
+ image_encoder: CLIPVisionModelWithProjection,
147
+ text_encoder: CLIPTextModelWithProjection,
148
+ tokenizer: CLIPTokenizer,
149
+ scheduler: UnCLIPScheduler,
150
+ image_processor: CLIPImageProcessor,
151
+ ):
152
+ super().__init__()
153
+
154
+ self.register_modules(
155
+ prior=prior,
156
+ text_encoder=text_encoder,
157
+ tokenizer=tokenizer,
158
+ scheduler=scheduler,
159
+ image_encoder=image_encoder,
160
+ image_processor=image_processor,
161
+ )
162
+
163
+ @torch.no_grad()
164
+ @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING)
165
+ def interpolate(
166
+ self,
167
+ images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]],
168
+ weights: List[float],
169
+ num_images_per_prompt: int = 1,
170
+ num_inference_steps: int = 25,
171
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
172
+ latents: Optional[torch.FloatTensor] = None,
173
+ negative_prior_prompt: Optional[str] = None,
174
+ negative_prompt: str = "",
175
+ guidance_scale: float = 4.0,
176
+ device=None,
177
+ ):
178
+ """
179
+ Function invoked when using the prior pipeline for interpolation.
180
+
181
+ Args:
182
+ images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`):
183
+ list of prompts and images to guide the image generation.
184
+ weights: (`List[float]`):
185
+ list of weights for each condition in `images_and_prompts`
186
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
187
+ The number of images to generate per prompt.
188
+ num_inference_steps (`int`, *optional*, defaults to 25):
189
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
190
+ expense of slower inference.
191
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
192
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
193
+ to make generation deterministic.
194
+ latents (`torch.FloatTensor`, *optional*):
195
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
196
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
197
+ tensor will be generated by sampling using the supplied random `generator`.
198
+ negative_prior_prompt (`str`, *optional*):
199
+ The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if
200
+ `guidance_scale` is less than `1`).
201
+ negative_prompt (`str` or `List[str]`, *optional*):
202
+ The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
203
+ `guidance_scale` is less than `1`).
204
+ guidance_scale (`float`, *optional*, defaults to 4.0):
205
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
206
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
207
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
208
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
209
+ usually at the expense of lower image quality.
210
+
211
+ Examples:
212
+
213
+ Returns:
214
+ [`KandinskyPriorPipelineOutput`] or `tuple`
215
+ """
216
+
217
+ device = device or self.device
218
+
219
+ if len(images_and_prompts) != len(weights):
220
+ raise ValueError(
221
+ f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length"
222
+ )
223
+
224
+ image_embeddings = []
225
+ for cond, weight in zip(images_and_prompts, weights):
226
+ if isinstance(cond, str):
227
+ image_emb = self(
228
+ cond,
229
+ num_inference_steps=num_inference_steps,
230
+ num_images_per_prompt=num_images_per_prompt,
231
+ generator=generator,
232
+ latents=latents,
233
+ negative_prompt=negative_prior_prompt,
234
+ guidance_scale=guidance_scale,
235
+ ).image_embeds
236
+
237
+ elif isinstance(cond, (PIL.Image.Image, torch.Tensor)):
238
+ if isinstance(cond, PIL.Image.Image):
239
+ cond = (
240
+ self.image_processor(cond, return_tensors="pt")
241
+ .pixel_values[0]
242
+ .unsqueeze(0)
243
+ .to(dtype=self.image_encoder.dtype, device=device)
244
+ )
245
+
246
+ image_emb = self.image_encoder(cond)["image_embeds"]
247
+
248
+ else:
249
+ raise ValueError(
250
+ f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}"
251
+ )
252
+
253
+ image_embeddings.append(image_emb * weight)
254
+
255
+ image_emb = torch.cat(image_embeddings).sum(dim=0, keepdim=True)
256
+
257
+ out_zero = self(
258
+ negative_prompt,
259
+ num_inference_steps=num_inference_steps,
260
+ num_images_per_prompt=num_images_per_prompt,
261
+ generator=generator,
262
+ latents=latents,
263
+ negative_prompt=negative_prior_prompt,
264
+ guidance_scale=guidance_scale,
265
+ )
266
+ zero_image_emb = (
267
+ out_zero.negative_image_embeds
268
+ if negative_prompt == ""
269
+ else out_zero.image_embeds
270
+ )
271
+
272
+ return KandinskyPriorPipelineOutput(
273
+ image_embeds=image_emb, negative_image_embeds=zero_image_emb
274
+ )
275
+
276
+ # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
277
+ def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
278
+ if latents is None:
279
+ latents = torch.randn(
280
+ shape, generator=generator, device=device, dtype=dtype
281
+ )
282
+ else:
283
+ if latents.shape != shape:
284
+ raise ValueError(
285
+ f"Unexpected latents shape, got {latents.shape}, expected {shape}"
286
+ )
287
+ latents = latents.to(device)
288
+
289
+ latents = latents * scheduler.init_noise_sigma
290
+ return latents
291
+
292
+ def get_zero_embed(self, batch_size=1, device=None):
293
+ device = device or self.device
294
+ zero_img = torch.zeros(
295
+ 1,
296
+ 3,
297
+ self.image_encoder.config.image_size,
298
+ self.image_encoder.config.image_size,
299
+ ).to(device=device, dtype=self.image_encoder.dtype)
300
+ zero_image_emb = self.image_encoder(zero_img)["image_embeds"]
301
+ zero_image_emb = zero_image_emb.repeat(batch_size, 1)
302
+ return zero_image_emb
303
+
304
+ def _encode_prompt(
305
+ self,
306
+ prompt,
307
+ device,
308
+ num_images_per_prompt,
309
+ do_classifier_free_guidance,
310
+ negative_prompt=None,
311
+ ):
312
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
313
+ # get prompt text embeddings
314
+ cond = (
315
+ self.image_processor(prompt, return_tensors="pt")
316
+ .pixel_values[0]
317
+ .unsqueeze(0)
318
+ .to(dtype=self.image_encoder.dtype, device=device)
319
+ )
320
+ prompt_embeds = self.image_encoder(cond)["image_embeds"]
321
+
322
+ prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
323
+
324
+ if do_classifier_free_guidance:
325
+ if negative_prompt is None:
326
+ uncond_tokens = self.get_zero_embed(batch_size=prompt_embeds.shape[0])
327
+ elif type(prompt) is not type(negative_prompt):
328
+ raise TypeError(
329
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
330
+ f" {type(prompt)}."
331
+ )
332
+ elif batch_size != len(negative_prompt):
333
+ raise ValueError(
334
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
335
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
336
+ " the batch size of `prompt`."
337
+ )
338
+ else:
339
+ uncond_tokens = negative_prompt
340
+
341
+ cond = (
342
+ self.image_processor(uncond_tokens, return_tensors="pt")
343
+ .pixel_values[0]
344
+ .unsqueeze(0)
345
+ .to(dtype=self.image_encoder.dtype, device=device)
346
+ )
347
+
348
+ negative_prompt_embeds = self.image_encoder(cond)["image_embeds"]
349
+
350
+ seq_len = negative_prompt_embeds.shape[1]
351
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
352
+ 1, num_images_per_prompt
353
+ )
354
+ negative_prompt_embeds = negative_prompt_embeds.view(
355
+ batch_size * num_images_per_prompt, seq_len
356
+ )
357
+
358
+ # For classifier free guidance, we need to do two forward passes.
359
+ # Here we concatenate the unconditional and text embeddings into a single batch
360
+ # to avoid doing two forward passes
361
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
362
+ return prompt_embeds, None
363
+
364
+ def enable_model_cpu_offload(self, gpu_id=0):
365
+ r"""
366
+ Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
367
+ to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
368
+ method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
369
+ `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
370
+ """
371
+ if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
372
+ from accelerate import cpu_offload_with_hook
373
+ else:
374
+ raise ImportError(
375
+ "`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher."
376
+ )
377
+
378
+ device = torch.device(f"cuda:{gpu_id}")
379
+
380
+ if self.device.type != "cpu":
381
+ self.to("cpu", silence_dtype_warnings=True)
382
+ torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
383
+
384
+ hook = None
385
+ for cpu_offloaded_model in [self.text_encoder, self.prior]:
386
+ _, hook = cpu_offload_with_hook(
387
+ cpu_offloaded_model, device, prev_module_hook=hook
388
+ )
389
+
390
+ # We'll offload the last model manually.
391
+ self.prior_hook = hook
392
+
393
+ _, hook = cpu_offload_with_hook(
394
+ self.image_encoder, device, prev_module_hook=self.prior_hook
395
+ )
396
+
397
+ self.final_offload_hook = hook
398
+
399
+ @torch.no_grad()
400
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
401
+ def __call__(
402
+ self,
403
+ prompt: Union[str, List[str]],
404
+ negative_prompt: Optional[Union[str, List[str]]] = None,
405
+ num_images_per_prompt: int = 1,
406
+ num_inference_steps: int = 25,
407
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
408
+ latents: Optional[torch.FloatTensor] = None,
409
+ guidance_scale: float = 4.0,
410
+ output_type: Optional[str] = "pt",
411
+ return_dict: bool = True,
412
+ ):
413
+ """
414
+ Function invoked when calling the pipeline for generation.
415
+
416
+ Args:
417
+ prompt (`str` or `List[str]`):
418
+ The prompt or prompts to guide the image generation.
419
+ negative_prompt (`str` or `List[str]`, *optional*):
420
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
421
+ if `guidance_scale` is less than `1`).
422
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
423
+ The number of images to generate per prompt.
424
+ num_inference_steps (`int`, *optional*, defaults to 25):
425
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
426
+ expense of slower inference.
427
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
428
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
429
+ to make generation deterministic.
430
+ latents (`torch.FloatTensor`, *optional*):
431
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
432
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
433
+ tensor will be generated by sampling using the supplied random `generator`.
434
+ guidance_scale (`float`, *optional*, defaults to 4.0):
435
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
436
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
437
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
438
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
439
+ usually at the expense of lower image quality.
440
+ output_type (`str`, *optional*, defaults to `"pt"`):
441
+ The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"`
442
+ (`torch.Tensor`).
443
+ return_dict (`bool`, *optional*, defaults to `True`):
444
+ Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
445
+
446
+ Examples:
447
+
448
+ Returns:
449
+ [`KandinskyPriorPipelineOutput`] or `tuple`
450
+ """
451
+
452
+ # if the negative prompt is defined we double the batch size to
453
+ # directly retrieve the negative prompt embedding
454
+ if negative_prompt is not None:
455
+ prompt = prompt + negative_prompt
456
+ negative_prompt = 2 * negative_prompt
457
+
458
+ device = self._execution_device
459
+
460
+ batch_size = len(prompt)
461
+ batch_size = batch_size * num_images_per_prompt
462
+
463
+ full_prompt = []
464
+ for b in prompt: # TODO of course vectorize this lol
465
+ full_seq = []
466
+ for p in b:
467
+ prompt_embeds, text_mask = self._encode_prompt(
468
+ p, device, num_images_per_prompt, False, negative_prompt
469
+ )
470
+ full_seq.append(prompt_embeds)
471
+ prompt_embeds = torch.cat(full_seq, 0)
472
+ full_prompt.append(prompt_embeds)
473
+ prompt_embeds = torch.stack(full_prompt)
474
+ if prompt_embeds.shape[1] < 8: # TODO grab as `k` arg from config
475
+ prompt_embeds = torch.nn.functional.pad(prompt_embeds, [0, 0, 0, 8-prompt_embeds.shape[1]])
476
+ assert prompt_embeds.shape[1] == 8, f"The model is set to take `k` cond image embeds but got shape {prompt_embeds.shape}"
477
+
478
+ prompt_embeds = prompt_embeds.to('cuda') # TODO set with `k` arg from config
479
+
480
+ hidden_states = torch.randn(
481
+ (batch_size, prompt_embeds.shape[-1]),
482
+ device=prompt_embeds.device,
483
+ dtype=prompt_embeds.dtype,
484
+ generator=generator,
485
+ )
486
+
487
+ latents = self.prior(
488
+ hidden_states,
489
+ proj_embedding=prompt_embeds,
490
+ encoder_hidden_states=prompt_embeds,
491
+ attention_mask=text_mask,
492
+ ).predicted_image_embedding
493
+
494
+ image_embeddings = latents
495
+
496
+ # if a negative prompt has been defined, we split the image embedding into two
497
+ if negative_prompt is None:
498
+ zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
499
+
500
+ if (
501
+ hasattr(self, "final_offload_hook")
502
+ and self.final_offload_hook is not None
503
+ ):
504
+ self.final_offload_hook.offload()
505
+ else:
506
+ image_embeddings, zero_embeds = image_embeddings.chunk(2)
507
+
508
+ if (
509
+ hasattr(self, "final_offload_hook")
510
+ and self.final_offload_hook is not None
511
+ ):
512
+ self.prior_hook.offload()
513
+
514
+ if output_type not in ["pt", "np"]:
515
+ raise ValueError(
516
+ f"Only the output types `pt` and `np` are supported not output_type={output_type}"
517
+ )
518
+
519
+ if output_type == "np":
520
+ image_embeddings = image_embeddings.cpu().numpy()
521
+ zero_embeds = zero_embeds.cpu().numpy()
522
+
523
+ if not return_dict:
524
+ return (image_embeddings, zero_embeds)
525
+
526
+ return KandinskyPriorPipelineOutput(
527
+ image_embeds=image_embeddings, negative_image_embeds=zero_embeds
528
+ )
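Compared to the upstream Kandinsky prior pipeline, `__call__` here treats `prompt` as a batch of image lists rather than text: each image is CLIP-encoded, the conditioning set is zero-padded to `k = 8` embeddings, and the prior runs in a single forward pass instead of a denoising loop. A hedged end-to-end sketch, mirroring `model.py`'s `do_validation` (dtypes and devices are simplified; the image paths are just the repo's calibration images):

```python
from PIL import Image
from diffusers import DiffusionPipeline
from prior.prior_transformer import PriorTransformer
from prior.pipeline_kandinsky_prior import KandinskyPriorPipeline

prior = PriorTransformer.from_pretrained("./last_epoch_ckpt/").to("cuda")
pipe_prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", prior=prior).to("cuda")
decoder = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder").to("cuda")

preferred = [Image.open(f"image_init/{i}o.png").convert("RGB") for i in range(1, 9)]
image_embeds, negative_image_embeds = pipe_prior([preferred]).to_tuple()
image = decoder(image_embeds=image_embeds,
                negative_image_embeds=negative_image_embeds,
                num_inference_steps=50).images[0]
image.save("preview.png")
```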
prior/prior_transformer.py ADDED
@@ -0,0 +1,369 @@
1
+ import sys
2
+ sys.path.append("..")
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Dict, Optional, Union
6
+
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch import nn
11
+
12
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
13
+ from diffusers.utils import BaseOutput
14
+ from diffusers.models.attention import BasicTransformerBlock
15
+ from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor
16
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
17
+ from diffusers.models.modeling_utils import ModelMixin
18
+
19
+
20
+ @dataclass
21
+ class PriorTransformerOutput(BaseOutput):
22
+ """
23
+ The output of [`PriorTransformer`].
24
+
25
+ Args:
26
+ predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
27
+ The predicted CLIP image embedding conditioned on the CLIP text embedding input.
28
+ """
29
+
30
+ predicted_image_embedding: torch.FloatTensor
31
+
32
+
33
+ class PriorTransformer(ModelMixin, ConfigMixin):
34
+ """
35
+ A Prior Transformer model.
36
+
37
+ Parameters:
38
+ num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention.
39
+ attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
40
+ num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use.
41
+ embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states`
42
+ num_embeddings (`int`, *optional*, defaults to 77):
43
+ The number of embeddings of the model input `hidden_states`
44
+ additional_embeddings (`int`, *optional*, defaults to 3): The number of additional tokens appended to the
45
+ projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings +
46
+ additional_embeddings`.
47
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
48
+ time_embed_act_fn (`str`, *optional*, defaults to 'silu'):
49
+ The activation function to use to create timestep embeddings.
50
+ norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before
51
+ passing to Transformer blocks. Set it to `None` if normalization is not needed.
52
+ embedding_proj_norm_type (`str`, *optional*, defaults to None):
53
+ The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not
54
+ needed.
55
+ encoder_hid_proj_type (`str`, *optional*, defaults to `linear`):
56
+ The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if
57
+ `encoder_hidden_states` is `None`.
58
+ added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model.
59
+ Choose from `prd` or `None`. If `prd` is chosen, it will prepend a token indicating the (quantized) dot
60
+ product between the text embedding and image embedding as proposed in the unclip paper
61
+ https://arxiv.org/abs/2204.06125 If it is `None`, no additional embeddings will be prepended.
62
+ time_embed_dim (`int`, *optional*, defaults to None): The dimension of timestep embeddings.
63
+ If None, will be set to `num_attention_heads * attention_head_dim`
64
+ embedding_proj_dim (`int`, *optional*, defaults to None):
65
+ The dimension of `proj_embedding`. If None, will be set to `embedding_dim`.
66
+ clip_embed_dim (`int`, *optional*, defaults to None):
67
+ The dimension of the output. If None, will be set to `embedding_dim`.
68
+ """
69
+
70
+ @register_to_config
71
+ def __init__(
72
+ self,
73
+ num_attention_heads: int = 32,
74
+ attention_head_dim: int = 64,
75
+ num_layers: int = 20,
76
+ embedding_dim: int = 768,
77
+ num_embeddings=77,
78
+ additional_embeddings=3, # as we have removed the time embedding
79
+ dropout: float = 0.0,
80
+ # time_embed_act_fn: str = "silu",
81
+ norm_in_type: Optional[str] = None, # layer
82
+ embedding_proj_norm_type: Optional[str] = None, # layer
83
+ encoder_hid_proj_type: Optional[str] = "linear", # linear
84
+ added_emb_type: Optional[str] = "prd", # prd
85
+ # time_embed_dim: Optional[int] = None,
86
+ embedding_proj_dim: Optional[int] = None,
87
+ clip_embed_dim: Optional[int] = None,
88
+ ):
89
+ super().__init__()
90
+ self.num_attention_heads = num_attention_heads
91
+ self.attention_head_dim = attention_head_dim
92
+ inner_dim = num_attention_heads * attention_head_dim
93
+ self.additional_embeddings = additional_embeddings
94
+
95
+ # time_embed_dim = time_embed_dim or inner_dim
96
+ embedding_proj_dim = embedding_proj_dim or embedding_dim
97
+ clip_embed_dim = clip_embed_dim or embedding_dim
98
+
99
+ # self.time_proj = Timesteps(inner_dim, True, 0)
100
+ # self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn)
101
+
102
+ self.proj_in = nn.Linear(embedding_dim, inner_dim)
103
+
104
+ if embedding_proj_norm_type is None:
105
+ self.embedding_proj_norm = None
106
+ elif embedding_proj_norm_type == "layer":
107
+ self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim)
108
+ else:
109
+ raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}")
110
+
111
+ self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim)
112
+
113
+ if encoder_hid_proj_type is None:
114
+ self.encoder_hidden_states_proj = None
115
+ elif encoder_hid_proj_type == "linear":
116
+ self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim)
117
+ else:
118
+ raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}")
119
+
120
+ self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim))
121
+
122
+ if added_emb_type == "prd":
123
+ self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim))
124
+ elif added_emb_type is None:
125
+ self.prd_embedding = None
126
+ else:
127
+ raise ValueError(
128
+ f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`."
129
+ )
130
+
131
+ self.transformer_blocks = nn.ModuleList(
132
+ [
133
+ BasicTransformerBlock(
134
+ inner_dim,
135
+ num_attention_heads,
136
+ attention_head_dim,
137
+ dropout=dropout,
138
+ activation_fn="gelu",
139
+ attention_bias=True,
140
+ )
141
+ for d in range(num_layers)
142
+ ]
143
+ )
144
+
145
+ if norm_in_type == "layer":
146
+ self.norm_in = nn.LayerNorm(inner_dim)
147
+ elif norm_in_type is None:
148
+ self.norm_in = None
149
+ else:
150
+ raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.")
151
+
152
+ self.norm_out = nn.LayerNorm(inner_dim)
153
+
154
+ self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim)
155
+
156
+ causal_attention_mask = torch.full(
157
+ [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0
158
+ )
159
+ causal_attention_mask.triu_(1)
160
+ causal_attention_mask = causal_attention_mask[None, ...]
161
+ self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False)
162
+
163
+ self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim))
164
+ self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim))
165
+
166
+ @property
167
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
168
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
169
+ r"""
170
+ Returns:
171
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
172
+ indexed by its weight name.
173
+ """
174
+ # set recursively
175
+ processors = {}
176
+
177
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
178
+ if hasattr(module, "set_processor"):
179
+ processors[f"{name}.processor"] = module.processor
180
+
181
+ for sub_name, child in module.named_children():
182
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
183
+
184
+ return processors
185
+
186
+ for name, module in self.named_children():
187
+ fn_recursive_add_processors(name, module, processors)
188
+
189
+ return processors
190
+
191
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
192
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
193
+ r"""
194
+ Sets the attention processor to use to compute attention.
195
+
196
+ Parameters:
197
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
198
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
199
+ for **all** `Attention` layers.
200
+
201
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
202
+ processor. This is strongly recommended when setting trainable attention processors.
203
+
204
+ """
205
+ count = len(self.attn_processors.keys())
206
+
207
+ if isinstance(processor, dict) and len(processor) != count:
208
+ raise ValueError(
209
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
210
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
211
+ )
212
+
213
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
214
+ if hasattr(module, "set_processor"):
215
+ if not isinstance(processor, dict):
216
+ module.set_processor(processor)
217
+ else:
218
+ module.set_processor(processor.pop(f"{name}.processor"))
219
+
220
+ for sub_name, child in module.named_children():
221
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
222
+
223
+ for name, module in self.named_children():
224
+ fn_recursive_attn_processor(name, module, processor)
225
+
226
+ # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
227
+ def set_default_attn_processor(self):
228
+ """
229
+ Disables custom attention processors and sets the default attention implementation.
230
+ """
231
+ self.set_attn_processor(AttnProcessor())
232
+
233
+ def forward(
234
+ self,
235
+ hidden_states,
236
+ # timestep: Union[torch.Tensor, float, int],
237
+ proj_embedding: torch.FloatTensor,
238
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
239
+ attention_mask: Optional[torch.BoolTensor] = None,
240
+ return_dict: bool = True,
241
+ ):
242
+ """
243
+ The [`PriorTransformer`] forward method.
244
+
245
+ Args:
246
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
247
+ The currently predicted image embeddings.
248
+ timestep (`torch.LongTensor`):
249
+ Current denoising step.
250
+ proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`):
251
+ Projected embedding vector the denoising process is conditioned on.
252
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_embeddings, embedding_dim)`):
253
+ Hidden states of the text embeddings the denoising process is conditioned on.
254
+ attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`):
255
+ Text mask for the text embeddings.
256
+ return_dict (`bool`, *optional*, defaults to `True`):
257
+ Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain
258
+ tuple.
259
+
260
+ Returns:
261
+ [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`:
262
+ If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a
263
+ tuple is returned where the first element is the sample tensor.
264
+ """
265
+ batch_size = hidden_states.shape[0]
266
+
267
+ # timesteps = timestep
268
+ # if not torch.is_tensor(timesteps):
269
+ # timesteps = torch.tensor([timesteps], dtype=torch.long, device=hidden_states.device)
270
+ # elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
271
+ # timesteps = timesteps[None].to(hidden_states.device)
272
+
273
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
274
+ # timesteps = timesteps * torch.ones(batch_size, dtype=timesteps.dtype, device=timesteps.device)
275
+
276
+ # timesteps_projected = self.time_proj(timesteps)
277
+
278
+ # timesteps does not contain any weights and will always return f32 tensors
279
+ # but time_embedding might be fp16, so we need to cast here.
280
+ # timesteps_projected = timesteps_projected.to(dtype=self.dtype)
281
+ # time_embeddings = self.time_embedding(timesteps_projected)
282
+
283
+ if self.embedding_proj_norm is not None:
284
+ proj_embedding = self.embedding_proj_norm(proj_embedding)
285
+
286
+ proj_embeddings = self.embedding_proj(proj_embedding)
287
+ if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None:
288
+ encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
289
+ # elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None:
290
+ # raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set")
291
+
292
+ hidden_states = self.proj_in(hidden_states)
293
+
294
+ # TODO this really also ought to derive from config's `k`
295
+ positional_embeddings = self.positional_embedding.to(hidden_states.dtype)
296
+
297
+ additional_embeds = []
298
+ additional_embeddings_len = 0
299
+
300
+ if encoder_hidden_states is not None:
301
+ additional_embeds.append(encoder_hidden_states)
302
+ additional_embeddings_len += encoder_hidden_states.shape[1]
303
+
304
+ if len(proj_embeddings.shape) == 2:
305
+ proj_embeddings = proj_embeddings[:, None, :]
306
+
307
+ if len(hidden_states.shape) == 2:
308
+ hidden_states = hidden_states[:, None, :]
309
+
310
+ additional_embeds = additional_embeds + [
311
+ proj_embeddings,
312
+ # time_embeddings[:, None, :],
313
+ hidden_states,
314
+ ]
315
+
316
+ if self.prd_embedding is not None:
317
+ prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1)
318
+ additional_embeds.append(prd_embedding)
319
+
320
+ hidden_states = torch.cat(
321
+ additional_embeds,
322
+ dim=1,
323
+ )
324
+
325
+ # Allow positional_embedding to not include the `additional_embeddings` and instead pad it with zeros for these additional tokens
326
+ additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1
327
+ if positional_embeddings.shape[1] < hidden_states.shape[1]:
328
+ positional_embeddings = F.pad(
329
+ positional_embeddings,
330
+ (
331
+ 0,
332
+ 0,
333
+ additional_embeddings_len,
334
+ self.prd_embedding.shape[1] if self.prd_embedding is not None else 0,
335
+ ),
336
+ value=0.0,
337
+ )
338
+
339
+ hidden_states = hidden_states + positional_embeddings[:, :hidden_states.shape[1]]
340
+
341
+ if attention_mask is not None:
342
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
343
+ attention_mask = F.pad(attention_mask, (0, self.additional_embeddings), value=0.0)
344
+ attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype)
345
+ attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0)
346
+
347
+ if self.norm_in is not None:
348
+ hidden_states = self.norm_in(hidden_states)
349
+
350
+ for block in self.transformer_blocks:
351
+ hidden_states = block(hidden_states, attention_mask=attention_mask)
352
+
353
+ hidden_states = self.norm_out(hidden_states)
354
+
355
+ if self.prd_embedding is not None:
356
+ hidden_states = hidden_states[:, -1]
357
+ else:
358
+ hidden_states = hidden_states[:, additional_embeddings_len:]
359
+
360
+ predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states)
361
+
362
+ if not return_dict:
363
+ return (predicted_image_embedding,)
364
+
365
+ return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding)
366
+
367
+ def post_process_latents(self, prior_latents):
368
+ prior_latents = (prior_latents * self.clip_std) + self.clip_mean
369
+ return prior_latents
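
A minimal smoke test of the timestep-free prior above, assuming `prior/` is importable as a package; the tiny hyper-parameters below are arbitrary and only meant for a shape check, not the real ECLIPSE-sized config:

```python
import torch
from prior.prior_transformer import PriorTransformer

# deliberately tiny configuration: inner_dim = 2 * 16 = 32
model = PriorTransformer(num_attention_heads=2, attention_head_dim=16,
                         num_layers=2, embedding_dim=64, num_embeddings=8)

batch = 2
latent = torch.randn(batch, 64)   # hidden_states: the random seed embedding
cond = torch.randn(batch, 8, 64)  # k preferred-image embeddings per user

out = model(latent, proj_embedding=cond, encoder_hidden_states=cond)
print(out.predicted_image_embedding.shape)  # torch.Size([2, 64])
```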
safety_checker_improved.py DELETED
@@ -1,46 +0,0 @@
1
-
2
- # TODO required tensorflow==2.14 for me
3
- # weights from https://github.com/LAION-AI/safety-pipeline/tree/main
4
- from PIL import Image
5
- import tensorflow_hub as hub
6
- import tensorflow
7
- import numpy as np
8
- import sys
9
- sys.path.append('/home/ryn_mote/Misc/generative_recommender/gradio_video/automl/efficientnetv2/')
10
- import tensorflow as tf
11
- from tensorflow.keras import mixed_precision
12
-
13
- physical_devices = tf.config.list_physical_devices('GPU')
14
- if len(physical_devices) > 0:
15
- tf.config.experimental.set_memory_growth(
16
- physical_devices[0], True
17
- )
18
-
19
- model = tf.keras.models.load_model('nsfweffnetv2-b02-3epochs.h5',custom_objects={"KerasLayer":hub.KerasLayer})
20
- # "The image classifier had been trained on 682550 images from the 5 classes "Drawing" (39026), "Hentai" (28134), "Neutral" (369507), "Porn" (207969) & "Sexy" (37914).
21
- # ... we created a manually inspected test set that consists of 4900 samples, that contains images & their captions."
22
-
23
- # Run prediction
24
- def maybe_nsfw(pil_image):
25
- # Run prediction
26
- imm = tensorflow.image.resize(np.array(pil_image)[:, :, :3], (260, 260))
27
- imm = (imm / 255)
28
- pred = model(tensorflow.expand_dims(imm, 0)).numpy()
29
- probs = tensorflow.math.softmax(pred[0]).numpy()
30
- print(probs)
31
- if all([i < .3 for i in probs[[1, 3, 4]]]):
32
- return False
33
- return True
34
-
35
- # pre-initializing prediction
36
- maybe_nsfw(Image. new("RGB", (260, 260), 255))
37
- model.load_weights('nsfweffnetv2-b02-3epochs.h5', by_name=True, )
38
-
39
-
40
-
41
-
42
-
43
-
44
-
45
-
46
- 
train.py ADDED
@@ -0,0 +1,94 @@
1
+
2
+
3
+ ########################################
4
+ # python -m train
5
+ ###########################################
6
+
7
+
8
+ import torch
9
+ import logging
10
+ import numpy as np
11
+ from tqdm import tqdm
12
+ from PIL import Image
13
+
14
+ from data import get_dataloader
15
+ from model import get_model_and_tokenizer, get_optimizer
16
+ import config
17
+
18
+ logging.basicConfig(level=logging.INFO)
19
+
20
+ def get_loss(model, input, target, tokenizer):
21
+ with torch.no_grad():
22
+ assert len(input.shape) == 5 # [batch, s, c, w, h]
23
+ cuts = config.number_k_clip_embed
24
+ assert input.shape[0] * input.shape[1] % cuts == 0, 'batch size * `k` preferred embeds must be divisible by cuts'
25
+ input = input.view(cuts//8, -1, 3, target.shape[-2], target.shape[-1])
26
+ full_seq = []
27
+ for b in input:
28
+ input = tokenizer(b)['image_embeds'] # in our case, tokenizer is a clip embedding model
29
+ full_seq.append(input)
30
+ input = torch.stack(full_seq)
31
+
32
+ target = tokenizer(target)['image_embeds']
33
+
34
+ input = input.view(target.shape[0], -1, target.shape[-1])
35
+ assert len(input.shape) == 3 # [batch, sequence, inner]
36
+
37
+ with torch.cuda.amp.autocast(enabled=False):
38
+ input = input.to(torch.float32)
39
+ latent = torch.randn(input.shape[0], input.shape[-1], device=input.device)
40
+ output = model(latent, input).predicted_image_embedding
41
+
42
+ target = target.to(torch.float32)
43
+ mse_loss = torch.nn.functional.mse_loss(target, output).mean()
44
+
45
+ assert len(target.shape) == 2 and len(output.shape) == 2
46
+ cosine_loss = 1 - torch.nn.functional.cosine_similarity(output, target).mean()
47
+ loss = mse_loss + .2 * cosine_loss
48
+
49
+ logging.info(f'MSE: {mse_loss.item()}, Cosine: {cosine_loss.item()}, Weighted Total: {loss.item()}')
50
+ # TODO wandb
51
+
52
+ return loss
53
+
54
+ def main():
55
+ np.random.seed(config.seed)
56
+ torch.manual_seed(config.seed)
57
+
58
+ model, tokenizer = get_model_and_tokenizer(config.model_path, config.device, config.dtype)
59
+ optimizer = get_optimizer(list(model.prior.parameters()), config.lr)
60
+ dataloader = get_dataloader(config.data_path, config.batch_size, config.num_workers,
61
+ model.prior_pipe.image_processor)
62
+
63
+ for epoch in range(config.epochs):
64
+ for ind, batch in tqdm(enumerate(iter(dataloader))):
65
+ if batch is None:
66
+ continue
67
+
68
+ input, target = batch
69
+ input = input.to(config.device)
70
+ target = target.to(config.device)
71
+
72
+ if ind % 50 == 0:
73
+ with torch.cuda.amp.autocast(enabled=True, dtype=config.dtype): # NOTE using autocast because the training model doubles as the validation model, so we don't want to cast it fully to half precision.
74
+ examples = ['../generative_recommender/Blue_Tigers_space/1o.png',
75
+ '../generative_recommender/Blue_Tigers_space/2o.png',
76
+ '../generative_recommender/Blue_Tigers_space/3o.png',
77
+ '../generative_recommender/Blue_Tigers_space/4o.png',
78
+ '../generative_recommender/Blue_Tigers_space/5o.png',
79
+ '../generative_recommender/Blue_Tigers_space/6o.png',
80
+ '../generative_recommender/Blue_Tigers_space/7o.png',
81
+ '../generative_recommender/Blue_Tigers_space/8o.png',]
82
+ model.do_validation([[Image.open('../'+j) for j in examples]])
83
+
84
+ loss = get_loss(model, input, target, tokenizer)
85
+ loss.backward()
86
+ optimizer.step()
87
+ optimizer.zero_grad()
88
+
89
+ if ind % 100 == 0:
90
+ # TODO add loading from path
91
+ model.prior.save_pretrained(f'{config.save_path}/last_epoch_ckpt', from_pt=True)
92
+
93
+ if __name__ == '__main__':
94
+ main()
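
The objective in `get_loss` boils down to MSE plus a 0.2-weighted cosine distance between the predicted embedding and the held-out CLIP image embedding; a standalone sketch of just that arithmetic on dummy tensors:

```python
import torch
import torch.nn.functional as F

pred = torch.randn(4, 1280)    # predicted image embeddings (width is illustrative)
target = torch.randn(4, 1280)  # held-out preferred-image embeddings

mse = F.mse_loss(target, pred)
cosine = 1 - F.cosine_similarity(pred, target).mean()
loss = mse + 0.2 * cosine      # same weighting as in train.py above
```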
train_requirements.txt ADDED
@@ -0,0 +1,642 @@
1
+ absl-py==1.4.0
2
+ accelerate==0.26.1
3
+ addict==2.4.0
4
+ aeiou==0.0.20
5
+ aenum==3.1.15
6
+ aiobotocore==2.13.0
7
+ aiofiles==23.1.0
8
+ aiohttp==3.9.5
9
+ aioitertools==0.11.0
10
+ aiosignal==1.3.1
11
+ alias-free-torch==0.0.6
12
+ aliyun-python-sdk-core==2.15.1
13
+ aliyun-python-sdk-kms==2.16.3
14
+ altair==4.2.2
15
+ anaconda-anon-usage @ file:///croot/anaconda-anon-usage_1710965072196/work
16
+ anaconda-client==1.11.2
17
+ anaconda-cloud-auth @ file:///croot/anaconda-cloud-auth_1712794769769/work
18
+ anaconda-navigator @ file:///croot/anaconda-navigator_1712087978399/work
19
+ anaconda-project @ file:///opt/conda/conda-bld/anaconda-project_1660339890420/work
20
+ annotated-types @ file:///croot/annotated-types_1709542908624/work
21
+ antlr4-python3-runtime==4.9.3
22
+ anyio==4.3.0
23
+ appdirs==1.4.4
24
+ apptools==5.2.1
25
+ APScheduler==3.10.4
26
+ argbind==0.3.9
27
+ argcomplete==3.1.1
28
+ asgiref==3.7.2
29
+ asttokens==2.2.1
30
+ astunparse==1.6.3
31
+ async-timeout==4.0.2
32
+ atproto==0.0.10
33
+ attrs==25.1.0
34
+ audioread==3.0.1
35
+ auraloss==0.4.0
36
+ av==10.0.0
37
+ awscli==1.33.2
38
+ backcall==0.2.0
39
+ backports.functools-lru-cache @ file:///tmp/build/80754af9/backports.functools_lru_cache_1618170165463/work
40
+ backports.tempfile @ file:///home/linux1/recipes/ci/backports.tempfile_1610991236607/work
41
+ backports.weakref==1.0.post1
42
+ bases==0.2.1
43
+ basicsr==1.4.2
44
+ beautifulsoup4==4.12.2
45
+ bitsandbytes==0.43.1
46
+ black==24.10.0
47
+ bleach==6.1.0
48
+ blendmodes==2022
49
+ blinker==1.6.2
50
+ blis==0.7.9
51
+ blobfile==2.1.1
52
+ blosc2==2.5.1
53
+ bokeh==3.4.1
54
+ boltons==23.0.0
55
+ boto==2.49.0
56
+ boto3==1.34.120
57
+ botocore==1.34.120
58
+ Bottleneck @ file:///croot/bottleneck_1707864210935/work
59
+ braceexpand==0.1.7
60
+ Brotli @ file:///tmp/abs_ecyw11_7ze/croots/recipe/brotli-split_1659616059936/work
61
+ brotlipy==0.7.0
62
+ cached-property==1.5.2
63
+ cachetools==5.3.3
64
+ Cartopy==0.21.1
65
+ catalogue==2.0.8
66
+ certifi==2025.1.31
67
+ cffi==1.15.1
68
+ cfgv==3.3.1
69
+ chardet @ file:///home/builder/ci_310/chardet_1640804867535/work
70
+ charset-normalizer==3.1.0
71
+ chex==0.1.81
72
+ clean-fid==0.1.35
73
+ click==8.1.3
74
+ clip @ git+https://github.com/openai/CLIP.git@a9b1bf5920416aaeaec965c25dd9e8f98c864f16
75
+ clip-anytorch==2.6.0
76
+ cloudpickle==2.2.1
77
+ clyent==1.2.2
78
+ cmake==3.26.4
79
+ colorama==0.4.6
80
+ colorcet==3.1.0
81
+ colored==2.2.4
82
+ coloredlogs==15.0.1
83
+ comm==0.1.4
84
+ commonmark==0.9.1
85
+ comtypes==1.2.0
86
+ conda @ file:///croot/conda_1696257509808/work
87
+ conda-build @ file:///croot/conda-build_1701720841368/work
88
+ conda-content-trust @ file:///tmp/abs_5952f1c8-355c-4855-ad2e-538535021ba5h26t22e5/croots/recipe/conda-content-trust_1658126371814/work
89
+ conda-libmamba-solver @ file:///croot/conda-libmamba-solver_1698163451663/work/src
90
+ conda-pack @ file:///tmp/build/80754af9/conda-pack_1611163042455/work
91
+ conda-package-handling @ file:///croot/conda-package-handling_1690999929514/work
92
+ conda-repo-cli @ file:///croot/conda-repo-cli_1709246574569/work
93
+ conda-token @ file:///Users/paulyim/miniconda3/envs/c3i/conda-bld/conda-token_1662660369760/work
94
+ conda-verify==3.4.2
95
+ conda_index @ file:///croot/conda-index_1706633791028/work
96
+ conda_package_streaming @ file:///croot/conda-package-streaming_1690987966409/work
97
+ confection==0.0.4
98
+ configobj==5.0.8
99
+ configparser==7.0.0
100
+ contextlib2==21.6.0
101
+ contexttimer==0.3.3
102
+ contourpy==1.2.1
103
+ cramjam==2.8.3
104
+ crcmod==1.7
105
+ cryptography @ file:///croot/cryptography_1677533068310/work
106
+ cuda-python==12.4.0
107
+ curl_cffi==0.6.4
108
+ cycler==0.11.0
109
+ cymem==2.0.7
110
+ Cython==0.29.35
111
+ dacite==1.8.1
112
+ dag-cbor==0.3.2
113
+ datasets==2.21.0
114
+ dctorch==0.1.2
115
+ -e git+https://github.com/jannerm/ddpo.git@b217eef955a94bf58e4de68caa5ec0a6558c221d#egg=ddpo
116
+ debugpy==1.6.7
117
+ decorator==4.4.2
118
+ decord==0.6.0
119
+ DeepCache==0.1.1
120
+ deepspeed==0.14.2
121
+ defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work
122
+ Deprecated==1.2.14
123
+ deprecation==2.1.0
124
+ descript-audio-codec==1.0.0
125
+ descript-audiotools==0.7.2
126
+ diffusers @ git+https://github.com/huggingface/diffusers.git@06beecafc55cfddeb1b0b8660188de249f74b899
127
+ dill==0.3.6
128
+ disnake==2.9.0
129
+ Django==4.2.2
130
+ django-memcache-status==2.3
131
+ django-pylibmc==0.6.1
132
+ dm-tree==0.1.8
133
+ dnspython==2.6.1
134
+ docker-pycreds==0.4.0
135
+ docstring-parser==0.15
136
+ docutils==0.16
137
+ EasyProcess==1.1
138
+ einops==0.7.0
139
+ einops-exts==0.0.4
140
+ ema-pytorch==0.2.3
141
+ email_validator==2.1.1
142
+ emoji==2.4.0
143
+ encodec==0.1.1
144
+ entrypoints==0.4
145
+ envisage==7.0.3
146
+ etils==1.3.0
147
+ eva-decord==0.6.1
148
+ exceptiongroup==1.1.1
149
+ executing==1.2.0
150
+ facexlib==0.3.0
151
+ fairscale==0.4.4
152
+ fastapi==0.111.0
153
+ fastapi-cli==0.0.4
154
+ fastcore==1.5.44
155
+ fastjsonschema @ file:///opt/conda/conda-bld/python-fastjsonschema_1661371079312/work
156
+ fastparquet==2024.5.0
157
+ ffmpeg==1.4
158
+ ffmpeg-python==0.2.0
159
+ ffmpegio==0.8.3
160
+ ffmpegio-core==0.8.3
161
+ ffmpy==0.3.0
162
+ filelock @ file:///croot/filelock_1700591183607/work
163
+ filterpy==1.4.5
164
+ fire==0.6.0
165
+ flash-attn==2.5.9.post1
166
+ Flask==2.3.2
167
+ flatbuffers==23.5.26
168
+ flatten-dict==0.4.2
169
+ flax==0.6.9
170
+ flow-vis==0.1
171
+ fonttools==4.42.1
172
+ frozenlist==1.3.3
173
+ fsspec==2024.6.0
174
+ ftfy==6.1.1
175
+ future @ file:///croot/future_1677599870788/work
176
+ fvcore==0.1.5.post20221221
177
+ gast==0.4.0
178
+ gcs-oauth2-boto-plugin==3.0
179
+ gcsfs==2023.6.0
180
+ gdcm==1.1
181
+ gdown==4.7.1
182
+ gfpgan==1.3.8
183
+ gguf==0.16.2
184
+ gin-config==0.5.0
185
+ gitdb==4.0.10
186
+ GitPython==3.1.30
187
+ gmpy2 @ file:///tmp/build/80754af9/gmpy2_1645455533097/work
188
+ google-api-core==2.11.1
189
+ google-apitools==0.5.32
190
+ google-auth==2.29.0
191
+ google-auth-oauthlib==1.0.0
192
+ google-cloud-core==2.3.2
193
+ google-cloud-storage==2.10.0
194
+ google-crc32c==1.5.0
195
+ google-pasta==0.2.0
196
+ google-reauth==0.1.1
197
+ google-resumable-media==2.5.0
198
+ googleapis-common-protos==1.59.1
199
+ gradio==4.31.5
200
+ gradio_client==0.16.4
201
+ grpcio==1.54.2
202
+ gsutil==5.25
203
+ h11==0.14.0
204
+ h5py==3.11.0
205
+ hjson==3.1.0
206
+ holoviews==1.18.3
207
+ httpcore==1.0.5
208
+ httplib2==0.20.4
209
+ httptools==0.6.1
210
+ httpx==0.27.0
211
+ httpx-ws==0.3.1
212
+ huggingface-hub==0.30.2
213
+ humanfriendly==10.0
214
+ humanize==4.7.0
215
+ hydra-core==1.1.2
216
+ hyper-tile @ git+https://github.com/tfernd/HyperTile@2ef64b2800d007d305755c33550537410310d7df
217
+ icecream==2.1.3
218
+ identify==2.5.24
219
+ idna @ file:///croot/idna_1666125576474/work
220
+ imagebind @ git+https://github.com/facebookresearch/ImageBind.git@95d27c7fd5a8362f3527e176c3a80ae5a4d880c0
221
+ imageio==2.34.2
222
+ imageio-ffmpeg==0.4.8
223
+ importlib-metadata==6.8.0
224
+ importlib-resources==5.12.0
225
+ inflect==6.0.4
226
+ inflection==0.5.1
227
+ install==1.3.5
228
+ iopath==0.1.9
229
+ ipykernel==6.25.0
230
+ ipython==8.14.0
231
+ ipywidgets==8.0.6
232
+ itsdangerous==2.1.2
233
+ jaraco.classes @ file:///tmp/build/80754af9/jaraco.classes_1620983179379/work
234
+ jax==0.4.6
235
+ jaxlib==0.4.6
236
+ jedi==0.19.0
237
+ jeepney @ file:///tmp/build/80754af9/jeepney_1627537048313/work
238
+ Jinja2==3.1.2
239
+ jmespath==0.10.0
240
+ joblib==1.3.2
241
+ jsonmerge==1.8.0
242
+ jsonpatch @ file:///croot/jsonpatch_1710807507480/work
243
+ jsonpointer==2.1
244
+ jsonschema @ file:///croot/jsonschema_1699041609003/work
245
+ jsonschema-specifications @ file:///croot/jsonschema-specifications_1699032386549/work
246
+ julius==0.2.7
247
+ jupyter-js-widgets-nbextension==0.0.2.dev0
248
+ jupyter_client==8.3.0
249
+ jupyter_core @ file:///croot/jupyter_core_1698937308754/work
250
+ jupyterlab-widgets==3.0.7
251
+ k-diffusion==0.1.1
252
+ kaggle==1.5.13
253
+ kagglehub==0.3.12
254
+ kandinsky2 @ git+https://github.com/ai-forever/Kandinsky-2.git@aeefc1ce3a989eefe7c99d6a02cce44318c4d210
255
+ kecam==1.4.1
256
+ keras==2.14.0
257
+ keras-efficientnet-v2==1.2.2
258
+ Keras-Preprocessing==1.1.2
259
+ keyring @ file:///croot/keyring_1709632513808/work
260
+ kiwisolver==1.4.5
261
+ kornia==0.6.7
262
+ laion-clap==1.1.4
263
+ langcodes==3.3.0
264
+ lark==1.1.2
265
+ lazy_loader==0.2
266
+ libarchive-c @ file:///tmp/build/80754af9/python-libarchive-c_1617780486945/work
267
+ libclang==16.0.0
268
+ libmambapy @ file:///croot/mamba-split_1694187754698/work/libmambapy
269
+ librosa==0.9.2
270
+ lightning-utilities==0.8.0
271
+ linkify-it-py==2.0.2
272
+ lit==16.0.6
273
+ llvmlite==0.42.0
274
+ lmdb==1.4.1
275
+ local-attention==1.8.6
276
+ loguru==0.7.2
277
+ lpips==0.1.4
278
+ lvis==0.5.3
279
+ lxml==4.9.4
280
+ Markdown==3.6
281
+ markdown-it-py==2.2.0
282
+ markdown2==2.4.8
283
+ MarkupSafe==2.1.2
284
+ matplotlib==3.7.3
285
+ matplotlib-inline==0.1.6
286
+ mayavi==4.8.1
287
+ mc-bin-client==1.0.1
288
+ mdit-py-plugins==0.3.3
289
+ mdurl==0.1.2
290
+ mediapipe==0.10.15
291
+ menuinst @ file:///croot/menuinst_1706732933928/work
292
+ mkl-fft @ file:///croot/mkl_fft_1695058164594/work
293
+ mkl-random @ file:///croot/mkl_random_1695059800811/work
294
+ mkl-service==2.4.0
295
+ ml-collections==0.1.1
296
+ ml-dtypes==0.2.0
297
+ mmcv==1.7.2
298
+ mmengine==0.10.4
299
+ model-index==0.1.11
300
+ more-itertools @ file:///croot/more-itertools_1700662129964/work
301
+ MouseInfo==0.1.3
302
+ moviepy==1.0.3
303
+ mpmath @ file:///croot/mpmath_1690848262763/work
304
+ msgpack==1.0.5
305
+ multidict==6.0.4
306
+ multiformats==0.2.1
307
+ multiformats-config==0.2.0.post4
308
+ multiprocess==0.70.14
309
+ murmurhash==1.0.9
310
+ mypy-extensions==1.0.0
311
+ namex==0.0.8
312
+ natsort==8.4.0
313
+ navigator-updater @ file:///croot/navigator-updater_1713453362034/work
314
+ nbformat @ file:///croot/nbformat_1694616755618/work
315
+ ndindex==1.8
316
+ nest-asyncio==1.5.7
317
+ networkx==3.1
318
+ nh3==0.2.13
319
+ nibabel==5.1.0
320
+ ninja==1.11.1
321
+ nlpaug==1.1.11
322
+ nltk==3.8.1
323
+ nodeenv==1.8.0
324
+ numba==0.59.1
325
+ numexpr @ file:///croot/numexpr_1696515281613/work
326
+ numpy==1.26.4
327
+ nvidia-cublas-cu11==11.11.3.6
328
+ nvidia-cublas-cu117==11.10.1.25
329
+ nvidia-cublas-cu12==12.3.4.1
330
+ nvidia-cuda-cupti-cu11==11.8.87
331
+ nvidia-cuda-cupti-cu117==11.7.50
332
+ nvidia-cuda-cupti-cu12==12.3.101
333
+ nvidia-cuda-nvcc-cu11==11.8.89
334
+ nvidia-cuda-nvcc-cu12==12.3.107
335
+ nvidia-cuda-nvrtc-cu11==11.8.89
336
+ nvidia-cuda-nvrtc-cu12==12.3.107
337
+ nvidia-cuda-runtime-cu11==11.8.89
338
+ nvidia-cuda-runtime-cu117==11.7.60
339
+ nvidia-cuda-runtime-cu12==12.3.101
340
+ nvidia-cudnn-cu11==8.7.0.84
341
+ nvidia-cudnn-cu116==8.4.0.27
342
+ nvidia-cudnn-cu12==9.0.0.312
343
+ nvidia-cufft-cu11==10.9.0.58
344
+ nvidia-cufft-cu12==11.0.12.1
345
+ nvidia-curand-cu11==10.3.0.86
346
+ nvidia-curand-cu12==10.3.4.107
347
+ nvidia-cusolver-cu11==11.4.1.48
348
+ nvidia-cusolver-cu12==11.5.4.101
349
+ nvidia-cusparse-cu11==11.7.5.86
350
+ nvidia-cusparse-cu12==12.2.0.103
351
+ nvidia-nccl-cu11==2.19.3
352
+ nvidia-nccl-cu12==2.19.3
353
+ nvidia-nvjitlink-cu12==12.3.101
354
+ nvidia-nvtx-cu11==11.8.86
355
+ nvidia-pyindex==1.0.9
356
+ oauth2client==4.1.3
357
+ oauthlib==3.2.2
358
+ omegaconf==2.3.0
359
+ onnx==1.15.0
360
+ onnx-graphsurgeon==0.5.2
361
+ onnx2torch==1.5.6
362
+ onnxruntime==1.16.3
363
+ open_clip_torch==2.26.1
364
+ openai==0.27.8
365
+ opencv-contrib-python==4.6.0.66
366
+ opencv-python==4.6.0
367
+ opendatalab==0.0.10
368
+ opendatasets==0.1.22
369
+ openmim==0.3.9
370
+ openxlab==0.1.1
371
+ opt-einsum==3.3.0
372
+ optax==0.1.5
373
+ optree==0.11.0
374
+ orbax-checkpoint==0.1.6
375
+ ordered-set==4.1.0
376
+ orjson==3.9.0
377
+ oss2==2.17.0
378
+ outcome==1.3.0.post0
379
+ packaging @ file:///croot/packaging_1710807400464/work
380
+ pandas==2.0.2
381
+ panel==1.4.4
382
+ param==2.1.0
383
+ parameterized==0.9.0
384
+ parso==0.8.3
385
+ pathspec==0.11.1
386
+ pathtools==0.1.2
387
+ pathy==0.10.1
388
+ pedalboard==0.7.4
389
+ peewee==3.16.2
390
+ peft==0.10.0
391
+ pexpect==4.8.0
392
+ pickleshare==0.7.5
393
+ piexif==1.1.3
394
+ Pillow==9.4.0
395
+ pkce @ file:///croot/pkce_1690384816590/work
396
+ pkginfo @ file:///croot/pkginfo_1679431160147/work
397
+ platformdirs==3.8.0
398
+ plotly==5.14.1
399
+ pluggy @ file:///tmp/build/80754af9/pluggy_1648024709248/work
400
+ ply==3.11
401
+ polygraphy==0.49.9
402
+ pooch==1.8.1
403
+ portalocker==2.7.0
404
+ pre-commit==3.3.1
405
+ prefigure==0.0.9
406
+ preshed==3.0.8
407
+ proglog==0.1.10
408
+ progressbar==2.5
409
+ prompt-toolkit==3.0.39
410
+ protobuf==4.25.3
411
+ psutil==5.9.5
412
+ ptyprocess==0.7.0
413
+ pure-eval==0.2.2
414
+ py-cpuinfo==9.0.0
415
+ pyarrow==17.0.0
416
+ pyasn1==0.6.0
417
+ pyasn1-modules==0.3.0
418
+ PyAutoGUI==0.9.54
419
+ pyav==12.0.5
420
+ pycocoevalcap==1.2
421
+ pycocotools==2.0.6
422
+ pycosat @ file:///croot/pycosat_1696536503704/work
423
+ pycparser==2.21
424
+ pycryptodome==3.20.0
425
+ pycryptodomex==3.19.0
426
+ pydantic==2.7.3
427
+ pydantic_core==2.18.4
428
+ pydeck==0.8.1b0
429
+ pyDeprecate==0.3.2
430
+ pydicom==2.3.1
431
+ pydot==1.4.2
432
+ pydub==0.25.1
433
+ pyface==8.0.0
434
+ PyGetWindow==0.0.9
435
+ Pygments==2.15.1
436
+ PyJWT==2.7.0
437
+ pylibmc==1.6.3
438
+ pyloudnorm==0.1.1
439
+ pymemcache==4.0.0
440
+ Pympler==1.0.1
441
+ PyMsgBox==1.0.9
442
+ pynndescent==0.5.12
443
+ pynvml==11.5.0
444
+ pyOpenSSL @ file:///croot/pyopenssl_1690223430423/work
445
+ pyparsing==3.1.1
446
+ pyperclip==1.9.0
447
+ pyproj==3.6.0
448
+ PyQt5==5.15.10
449
+ PyQt5-sip @ file:///croot/pyqt-split_1698769088074/work/pyqt_sip
450
+ pyre-extensions==0.0.29
451
+ PyRect==0.2.0
452
+ PyScreeze==1.0.1
453
+ pyshp==2.3.1
454
+ PySocks==1.7.1
455
+ pystoi==0.4.1
456
+ python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
457
+ python-docx==0.8.11
458
+ python-dotenv==1.0.0
459
+ python-magic==0.4.27
460
+ python-memcached==1.59
461
+ python-multipart==0.0.9
462
+ python-slugify==8.0.1
463
+ python3-xlib==0.15
464
+ pytorch-lantern==0.12.7
465
+ pytorch-lightning==2.1.0
466
+ pytorch-pretrained-biggan==0.1.1
467
+ pytorch-warmup==0.1.1
468
+ pytorchvideo==0.1.5
469
+ pytweening==1.2.0
470
+ pytz @ file:///croot/pytz_1695131579487/work
471
+ pyu2f==0.1.5
472
+ PyVirtualDisplay==3.0
473
+ pyviz_comms==3.0.2
474
+ PyWavelets==1.4.1
475
+ PyYAML==6.0
476
+ pyzmq==25.1.0
477
+ QtPy @ file:///croot/qtpy_1700144840038/work
478
+ randomname==0.2.1
479
+ realesrgan==0.3.0
480
+ referencing @ file:///croot/referencing_1699012038513/work
481
+ regex==2023.6.3
482
+ repeng @ git+https://github.com/vgel/repeng.git@c9093abddd87f865e7e2bcf4b3e556ec8813b5b2
483
+ replicate==0.25.1
484
+ requests==2.32.3
485
+ requests-oauthlib==1.3.1
486
+ requests-toolbelt @ file:///croot/requests-toolbelt_1690874004362/work
487
+ resampy==0.4.3
488
+ resize-right==0.0.2
489
+ responses==0.18.0
490
+ retry-decorator==1.1.1
491
+ rfc3986==1.5.0
492
+ rich==12.6.0
493
+ rotary-embedding-torch==0.3.0
494
+ rpds-py @ file:///croot/rpds-py_1698945930462/work
495
+ rsa==4.7.2
496
+ ruamel-yaml-conda @ file:///croot/ruamel_yaml_1667489728852/work
497
+ ruamel.yaml @ file:///croot/ruamel.yaml_1666304550667/work
498
+ ruamel.yaml.clib @ file:///croot/ruamel.yaml.clib_1666302247304/work
499
+ ruff==0.4.1
500
+ s2wrapper @ git+https://github.com/bfshi/scaling_on_scales@f08aec91337ae1ed6d7cc7a55441a96d51c14dd1
501
+ s3fs==2024.6.0
502
+ s3transfer==0.10.1
503
+ sacremoses==0.0.53
504
+ safetensors==0.4.1
505
+ salesforce-lavis @ git+https://github.com/salesforce/LAVIS.git@4a85b17846ee62f09c40f37cc955dd33c2abec68
506
+ scikit-image==0.20.0
507
+ scikit-learn==1.5.1
508
+ scikit-surprise==1.1.3
509
+ scipy==1.11.1
510
+ SecretStorage @ file:///croot/secretstorage_1678709481048/work
511
+ selenium==4.29.0
512
+ semantic-version==2.10.0
513
+ semver @ file:///croot/semver_1709243621175/work
514
+ sentencepiece==0.1.99
515
+ sentry-sdk==1.25.1
516
+ setproctitle==1.3.2
517
+ sgm @ file:///home/ryn_mote/Misc/generative-models
518
+ shapely==2.0.1
519
+ shellingham==1.5.0.post1
520
+ shortuuid==1.0.11
521
+ SimpleITK==2.2.1
522
+ sip @ file:///croot/sip_1698675935381/work
523
+ six @ file:///tmp/build/80754af9/six_1644875935023/work
524
+ sk-video==1.1.10
525
+ smart-open==6.3.0
526
+ smmap==5.0.0
527
+ sniffio==1.3.0
528
+ sortedcontainers==2.4.0
529
+ sounddevice==0.5.0
530
+ SoundFile==0.10.2
531
+ soupsieve==2.4.1
532
+ spaces==0.27.0
533
+ spacy==3.5.3
534
+ spacy-legacy==3.0.12
535
+ spacy-loggers==1.0.4
536
+ sqlparse==0.4.4
537
+ srsly==2.4.6
538
+ stable-audio-tools==0.0.16
539
+ stable-fast @ https://github.com/chengzeyi/stable-fast/releases/download/v1.0.4/stable_fast-1.0.4+torch220cu118-cp310-cp310-manylinux2014_x86_64.whl#sha256=11716f733237f557bee452eee63db415b4daeff29a28d939f73fff8003f0d415
540
+ stack-data==0.6.2
541
+ stanza==1.5.0
542
+ starlette==0.37.2
543
+ streamlit==1.22.0
544
+ svgwrite==1.4.3
545
+ sympy @ file:///croot/sympy_1701397643339/work
546
+ tables==3.9.2
547
+ tabulate==0.9.0
548
+ tenacity==8.2.2
549
+ tensorboard==2.14.1
550
+ tensorboard-data-server==0.7.2
551
+ tensorboard-plugin-wit==1.8.1
552
+ tensorflow==2.14.0
553
+ tensorflow-addons==0.16.1
554
+ tensorflow-estimator==2.14.0
555
+ tensorflow-hub==0.16.1
556
+ tensorflow-io-gcs-filesystem==0.32.0
557
+ tensorrt==8.6.1.post1
558
+ tensorrt-bindings==8.6.1
559
+ tensorrt-libs==8.6.1
560
+ tensorstore==0.1.39
561
+ termcolor==2.3.0
562
+ text-unidecode==1.3
563
+ tf-estimator-nightly==2.8.0.dev2021122109
564
+ tf_keras==2.16.0
565
+ tgate==0.1.1
566
+ thinc==8.1.10
567
+ threadpoolctl==3.2.0
568
+ tifffile==2023.4.12
569
+ tiktoken==0.4.0
570
+ timm==0.9.8
571
+ tokenizers==0.20.3
572
+ tomesd==0.1.3
573
+ tomli==2.0.1
574
+ tomlkit==0.12.0
575
+ toolz==0.12.0
576
+ torch==2.2.2+cu118
577
+ torch-ema==0.3
578
+ torch-stoi==0.2.1
579
+ torchaudio==2.0.2+cu118
580
+ torchdiffeq==0.2.3
581
+ torchio==0.19.0
582
+ torchlibrosa==0.1.0
583
+ torchmetrics==0.11.4
584
+ torchsde==0.2.6
585
+ torchvision==0.15.2+cu118
586
+ tornado @ file:///croot/tornado_1696936946304/work
587
+ tqdm==4.66.5
588
+ traitlets @ file:///croot/traitlets_1671143879854/work
589
+ traits==6.4.1
590
+ traitsui==8.0.0
591
+ trampoline==0.1.2
592
+ transformers==4.46.3
593
+ trio==0.29.0
594
+ trio-websocket==0.12.2
595
+ triton==2.2.0
596
+ truststore @ file:///croot/truststore_1695244293384/work
597
+ typed-argument-parser==1.8.1
598
+ typeguard==4.2.1
599
+ typer==0.12.3
600
+ types-regex==2023.6.3.1
601
+ typing-inspect==0.8.0
602
+ typing-validation==1.0.0.post2
603
+ typing_extensions==4.12.2
604
+ tzdata @ file:///croot/python-tzdata_1690578112552/work
605
+ tzlocal==5.0.1
606
+ uc-micro-py==1.0.2
607
+ ujson @ file:///opt/conda/conda-bld/ujson_1657544923770/work
608
+ umap-learn==0.5.6
609
+ undetected-chromedriver==3.5.5
610
+ urllib3==1.26.18
611
+ uvicorn==0.29.0
612
+ uvloop==0.19.0
613
+ v-diffusion-pytorch==0.0.2
614
+ validators==0.20.0
615
+ vector-quantize-pytorch==1.9.14
616
+ vtk==9.2.6
617
+ wandb==0.15.4
618
+ wasabi==1.1.1
619
+ watchdog==3.0.0
620
+ watchfiles==0.22.0
621
+ wavedrom==2.0.3.post3
622
+ wcwidth==0.2.6
623
+ webdataset==0.2.48
624
+ webencodings==0.5.1
625
+ websocket-client==1.8.0
626
+ websockets==11.0.3
627
+ Werkzeug==2.3.4
628
+ wget==3.2
629
+ widgetsnbextension==4.0.7
630
+ wikipedia==1.4.0
631
+ wrapt==1.14.1
632
+ wsproto==1.2.0
633
+ x-transformers==1.26.6
634
+ xformers==0.0.20
635
+ xxhash==3.2.0
636
+ xyzservices==2024.4.0
637
+ yacs==0.1.8
638
+ yapf==0.40.1
639
+ yarl==1.9.2
640
+ yattag==1.15.1
641
+ zipp==3.16.0
642
+ zstandard @ file:///croot/zstandard_1677013143055/work
twitter_prompts.csv DELETED
@@ -1,47 +0,0 @@
1
- ,0
2
- 0,a sunset
3
- 1,a still life in blue
4
- 2,last day on earth
5
- 3,the conch shell
6
- 4,the winds of change
7
- 5,a surrealist eye
8
- 6,a surrealist polaroid photo of an apple
9
- 7,metaphysics
10
- 8,the sun is setting into my glass of tea
11
- 9,the moon at 3am
12
- 10,a memento mori
13
- 11,quaking aspen tree
14
- 12,violets and daffodils
15
- 13,espresso
16
- 14,sisyphus
17
- 15,high windows of stained glass
18
- 16,a green dog
19
- 17,an adorable companion; it is a pig
20
- 18,bird of paradise
21
- 19,a complex intricate machine
22
- 20,a white clock
23
- 21,a film featuring the landscape Salt Lake City Utah
24
- 22,a creature
25
- 23,a house set aflame.
26
- 24,a gorgeous landscape by Cy Twombly
27
- 25,smoke rises from the caterpillar's hookah
28
- 26,corvid in red
29
- 27,Monet's pond
30
- 28,Genesis
31
- 29,Death is a black camel that kneels down so we can ride
32
- 30,a cherry tree made of fractals
33
- 29,the end of the sidewalk
34
- 30,a polaroid photo of a bustling city of lights and sky scrapers
35
- 31,The Fig Tree metaphor
36
- 32,God killed Van Gogh.
37
- 33,a cosmic entity alien with four eyes.
38
- 34,a horse with 128 eyes.
39
- 35,a being with an infinite set of eyes (it is omniscient)
40
- 36,A sticky-note magnum opus featuring birds
41
- 37,Moka Pot
42
- 38,the moon is a sickle cell
43
- 39,The Penultimate Supper
44
- 40,Art
45
- 41,surrealism
46
- 42,a god made of wires & dust
47
- 43,a dandelion blown into the universe