ikraamkb committed · Commit a078426 · verified · 1 Parent(s): 6e8ae10

Update app.py

Files changed (1): app.py (+11 -3)
app.py CHANGED
@@ -5,8 +5,9 @@ from tika import parser # Apache Tika for document parsing
 import openpyxl
 from pptx import Presentation
 from PIL import Image
-from transformers import pipeline
+from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
 import gradio as gr
+import torch
 import numpy as np
 
 # Initialize FastAPI
@@ -15,8 +16,13 @@ app = FastAPI()
 print(f"🔄 Loading models")
 
 # Load Hugging Face Models
+
 doc_qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=-1)
-image_captioning_pipeline = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+
+# Load Image Captioning Model
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+model = model.to(dtype=torch.float16) # Quantizing to FP16
 
 print("✅ Models loaded")
 
@@ -101,7 +107,9 @@ def answer_question_from_image(image, question):
     image = Image.fromarray(image) # Convert to PIL Image
 
     print("🖼️ Generating caption for image...")
-    caption = image_captioning_pipeline(image)[0]['generated_text']
+    inputs = processor(images=image, return_tensors="pt", use_fast=True).to(dtype=torch.float16)
+    output = model.generate(**inputs)
+    caption = processor.decode(output[0], skip_special_tokens=True)
 
     print("🤖 Answering question based on caption...")
     response = doc_qa_pipeline(f"Question: {question}\nContext: {caption}")
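For reference, a minimal standalone sketch of the flow the new code implements: BLIP generates a caption for the image, and that caption is passed as context to the TinyLlama text-generation pipeline. Model IDs and the prompt format are taken from the diff; the max_new_tokens limits and the usage example are illustrative assumptions, and the FP16 cast from the commit is left out here since half-precision generation is not supported for every op on CPU.

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline

# Model IDs match the diff above; the commit additionally does
# model.to(dtype=torch.float16), which is omitted in this sketch.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

doc_qa_pipeline = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device=-1)

def answer_question_from_image(image: Image.Image, question: str) -> str:
    # 1) BLIP turns the image into a caption.
    inputs = processor(images=image, return_tensors="pt")
    output_ids = caption_model.generate(**inputs, max_new_tokens=30)  # length cap is illustrative
    caption = processor.decode(output_ids[0], skip_special_tokens=True)
    # 2) The caption becomes the context for the text-generation pipeline.
    prompt = f"Question: {question}\nContext: {caption}"
    return doc_qa_pipeline(prompt, max_new_tokens=100)[0]["generated_text"]

# Example usage with any RGB image:
# print(answer_question_from_image(Image.open("photo.jpg"), "What is in the picture?"))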