prithivMLmods committed on
Commit
2f91bcb
·
verified ·
1 Parent(s): f34ba90

upload files

Browse files
Files changed (2) hide show
  1. app.py +57 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import torch
4
+ from PIL import Image
5
+ from transformers import AutoProcessor, AutoModelForCausalLM
6
+
7
def _install_flash_attn():
    """Best-effort install of flash-attn at startup.

    Runs ``pip install flash-attn --no-build-isolation`` with
    ``FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE`` set for the child process.
    A failed install is reported on stdout and otherwise ignored, so the
    application keeps starting without flash-attn.
    """
    # Imported locally so this block does not rely on the file's import list.
    import os
    import sys

    # Extend the current environment instead of replacing it: a bare dict
    # passed as ``env`` becomes the child's *entire* environment, which drops
    # PATH, HOME, proxy settings, etc.
    install_env = {**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}

    # Invoke pip through the running interpreter so the package lands in the
    # same environment as this script, and pass an argument list (no shell).
    install_cmd = [
        sys.executable, "-m", "pip", "install",
        "flash-attn", "--no-build-isolation",
    ]

    try:
        subprocess.run(install_cmd, env=install_env, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Error installing flash-attn: {e}")
        print("Continuing without flash-attn.")


_install_flash_attn()
12
+
13
# Checkpoint identifier shared by the model and its processor.
_FLORENCE_CHECKPOINT = 'microsoft/Florence-2-base'

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the model, move it to the selected device and switch it to eval mode.
vision_language_model = AutoModelForCausalLM.from_pretrained(
    _FLORENCE_CHECKPOINT,
    trust_remote_code=True,
)
vision_language_model = vision_language_model.to(device).eval()

# Processor matching the checkpoint above; it prepares the model inputs and
# decodes / post-processes the generated ids in describe_image.
vision_language_processor = AutoProcessor.from_pretrained(
    _FLORENCE_CHECKPOINT,
    trust_remote_code=True,
)
16
+
17
def describe_image(uploaded_image):
    """
    Generate a detailed description of the input image.

    Args:
        uploaded_image (PIL.Image.Image | numpy.ndarray | None): The image to
            describe. Anything that is not already a PIL image is converted
            with ``Image.fromarray``. ``None`` (no image supplied) returns
            early without calling the model.

    Returns:
        str: A detailed textual description of the image, or a short message
        asking for an image when ``uploaded_image`` is ``None``.
    """
    # The callback may be invoked with no image; without this guard
    # ``Image.fromarray(None)`` raises an unhelpful error.
    if uploaded_image is None:
        return "Please upload an image first."

    # Task token used for the prompt, the post-processing step and the key of
    # the post-processed result.
    task_prompt = "<MORE_DETAILED_CAPTION>"

    if not isinstance(uploaded_image, Image.Image):
        uploaded_image = Image.fromarray(uploaded_image)
    # Normalise grayscale / RGBA / palette images to three-channel RGB before
    # handing them to the processor.
    if uploaded_image.mode != "RGB":
        uploaded_image = uploaded_image.convert("RGB")

    inputs = vision_language_processor(
        text=task_prompt,
        images=uploaded_image,
        return_tensors="pt",
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = vision_language_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )

    # Special tokens are kept in the decoded text, which is then handed to
    # post_process_generation together with the task token.
    generated_text = vision_language_processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    processed_description = vision_language_processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(uploaded_image.width, uploaded_image.height),
    )

    image_description = processed_description[task_prompt]
    print("\nImage description generated!:", image_description)
    return image_description
49
+
50
# Widgets for the demo: an image upload on the input side and a four-line
# textbox with a copy button for the generated caption.
_image_input = gr.Image(label="Upload Image")
_caption_output = gr.Textbox(
    label="Generated Caption",
    lines=4,
    show_copy_button=True,
)

# Wire describe_image to the widgets; live=False is passed explicitly.
image_description_interface = gr.Interface(
    fn=describe_image,
    inputs=_image_input,
    outputs=_caption_output,
    live=False,
)

# Start the app when the script runs.
image_description_interface.launch(debug=True, ssr_mode=False)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ timm
4
+ pillow