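"""Caption video frames with three Hugging Face image-captioning models.

Every 20th frame of an input video is passed to:
  1. nlpconnect/vit-gpt2-image-captioning (ViT encoder + GPT-2 decoder)
  2. noamrot/FuseCap (BLIP-based captioner)
  3. Salesforce/blip-image-captioning-large (BLIP large)
and the resulting captions are printed for comparison.
"""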
import cv2
import requests
import torch
from PIL import Image
from transformers import (
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
    ViTImageProcessor,
    VisionEncoderDecoderModel,
)

# Model 1: ViT-GPT2 captioning (nlpconnect/vit-gpt2-image-captioning).
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device1 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device1)



max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
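# Note: max_length=16 keeps generated captions short; larger values allow
# longer captions at the cost of slower beam-search decoding.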

def image_to_text_model_1(image_url):
    """Download an image from a URL and return a list of stripped captions."""
    raw_image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')

    pixel_values = feature_extractor1(images=[raw_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)

    output_ids = model1.generate(pixel_values, **gen_kwargs)

    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds
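
# Example usage (the URL below is a stand-in -- substitute any publicly
# reachable image):
#   caption = image_to_text_model_1("http://images.cocodataset.org/val2017/000000039769.jpg")[0]
#   print(caption)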

def bytes_to_text_model_1(frame):
    """Caption an in-memory image (e.g. an RGB numpy array) and print it."""
    pixel_values = feature_extractor1(images=[frame], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)

    output_ids = model1.generate(pixel_values, **gen_kwargs)

    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    print(preds[0])


# Model 2: FuseCap (noamrot/FuseCap), a BLIP-based captioner.
device2 = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device2)


def image_to_text_model_2(img_url):
    """Download an image from a URL, caption it with FuseCap, and print the result."""
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    text = "a picture of "  # prompt prefix for conditional captioning
    inputs = processor2(raw_image, text, return_tensors="pt").to(device2)

    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))

def bytes_to_text_model_2(frame):
    """Caption an in-memory image (e.g. an RGB numpy array) with FuseCap and print it."""
    text = "a picture of "
    inputs = processor2(frame, text, return_tensors="pt").to(device2)

    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))



# Model 3: BLIP large (Salesforce/blip-image-captioning-large).
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device2)

def image_to_text_model_3(img_url):
    """Download an image from a URL, caption it with BLIP large, and print the result."""
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    text = "a picture of"
    inputs = processor3(raw_image, text, return_tensors="pt").to(device2)

    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))

def bytes_to_text_model_3(frame):
    """Caption an in-memory image (e.g. an RGB numpy array) with BLIP large and print it."""
    text = "a picture of"
    inputs = processor3(frame, text, return_tensors="pt").to(device2)

    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))
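
# Note: the original script also built inputs without the text prompt (the
# second processor3(...) call, which silently overwrote the first). BLIP also
# supports unconditional captioning by omitting the prompt, e.g. (sketch):
#   inputs = processor3(raw_image, return_tensors="pt").to(device2)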


# Frame sampling: caption every 20th frame of a video with all three models.
def FrameCapture(path):
    """Read a video and print captions from all three models for every 20th frame."""
    vidObj = cv2.VideoCapture(path)
    count = 0

    while True:
        success, image = vidObj.read()
        if not success:  # end of video (or read error)
            break

        if count % 20 == 0:
            # OpenCV returns frames in BGR order; the models expect RGB.
            frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            print("NEW FRAME")
            print("MODEL 1")
            bytes_to_text_model_1(frame)
            print("MODEL 2")
            bytes_to_text_model_2(frame)
            print("MODEL 3")
            bytes_to_text_model_3(frame)

            print("\n\n")

        count += 1

    vidObj.release()


if __name__ == "__main__":
    FrameCapture("animation.mp4")