William Mattingly committed
Commit 2fdca7c · 1 Parent(s): 35dd58a

updated the app

Files changed (2):
  1. app.py +116 -91
  2. call.py +1 -1
app.py CHANGED
@@ -73,106 +73,131 @@ def run_example(image, model_id="Qwen/Qwen2.5-VL-7B-Instruct", run_ner=False, ne
     # First get the OCR text
     text_input = "Convert the image to text."
 
-    # Handle various image input formats
-    if image is None:
-        raise ValueError("Image path is None.")
 
-    # Case 1: Image is a dictionary with base64 data (from API calls)
-    if isinstance(image, dict) and 'data' in image and isinstance(image['data'], str):
-        if image['data'].startswith('data:image'):
-            # Extract the base64 part after the comma
-            base64_data = image['data'].split(',', 1)[1]
-            # Convert base64 to bytes and then to PIL Image
-            image_bytes = base64.b64decode(base64_data)
-            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-            # Convert to numpy array for further processing
             image = np.array(pil_image)
-
-    # Convert numpy array to image path
-    image_path = array_to_image_path(image)
-
-    model = models[model_id]
-    processor = processors[model_id]
-
-    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
-    image = Image.fromarray(image).convert("RGB")
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image_path,
-                },
-                {"type": "text", "text": text_input},
-            ],
-        }
-    ]
-
-    # Preparation for inference
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = inputs.to("cuda")
-
-    # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, max_new_tokens=1024)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-
-    ocr_text = output_text[0]
-
-    # If NER is enabled, process the OCR text
-    if run_ner:
-        ner_results = gliner_model.predict_entities(
-            ocr_text,
-            ner_labels.split(","),
-            threshold=0.3
         )
 
-        # Create a list of tuples (text, label) for highlighting
-        highlighted_text = []
-        last_end = 0
 
-        # Sort entities by start position
-        sorted_entities = sorted(ner_results, key=lambda x: x["start"])
 
-        # Process each entity and add non-entity text segments
-        for entity in sorted_entities:
-            # Add non-entity text before the current entity
-            if last_end < entity["start"]:
-                highlighted_text.append((ocr_text[last_end:entity["start"]], None))
 
-            # Add the entity text with its label
-            highlighted_text.append((
-                ocr_text[entity["start"]:entity["end"]],
-                entity["label"]
-            ))
-            last_end = entity["end"]
-
-        # Add any remaining text after the last entity
-        if last_end < len(ocr_text):
-            highlighted_text.append((ocr_text[last_end:], None))
 
-        # Create TextWithMetadata instance with the highlighted text and metadata
-        result = TextWithMetadata(highlighted_text, original_text=ocr_text, entities=ner_results)
         return result, result # Return twice: once for display, once for state
-
-    # If NER is disabled, return the text without highlighting
-    result = TextWithMetadata([(ocr_text, None)], original_text=ocr_text, entities=[])
-    return result, result # Return twice: once for display, once for state
 
 
 with gr.Blocks() as demo:
 
     # First get the OCR text
     text_input = "Convert the image to text."
 
+    # Print debug info about the image type
+    print(f"Image type: {type(image)}")
+    print(f"Image value: {image}")
 
+    # Robust handling of image input
+    try:
+        # Handle None or empty input
+        if image is None:
+            raise ValueError("Image input is None")
+
+        # Handle dictionary input (from API)
+        if isinstance(image, dict):
+            if 'data' in image and isinstance(image['data'], str) and image['data'].startswith('data:image'):
+                # Extract the base64 part
+                base64_data = image['data'].split(',', 1)[1]
+                # Convert base64 to bytes, then to PIL Image
+                image_bytes = base64.b64decode(base64_data)
+                pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+                # Convert to numpy array
+                image = np.array(pil_image)
+            else:
+                raise ValueError(f"Invalid image dictionary format: {image}")
+
+        # Convert string path to image if needed
+        if isinstance(image, str):
+            pil_image = Image.open(image).convert("RGB")
             image = np.array(pil_image)
+
+        # Ensure image is a numpy array
+        if not isinstance(image, np.ndarray):
+            raise ValueError(f"Unsupported image type: {type(image)}")
+
+        # Convert numpy array to image path
+        image_path = array_to_image_path(image)
+
+        model = models[model_id]
+        processor = processors[model_id]
+
+        prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
+        pil_image = Image.fromarray(image).convert("RGB")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": image_path,
+                    },
+                    {"type": "text", "text": text_input},
+                ],
+            }
+        ]
+
+        # Preparation for inference
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
         )
+        inputs = inputs.to("cuda")
 
+        # Inference: Generation of the output
+        generated_ids = model.generate(**inputs, max_new_tokens=1024)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
 
+        ocr_text = output_text[0]
 
+        # If NER is enabled, process the OCR text
+        if run_ner:
+            ner_results = gliner_model.predict_entities(
+                ocr_text,
+                ner_labels.split(","),
+                threshold=0.3
+            )
 
+            # Create a list of tuples (text, label) for highlighting
+            highlighted_text = []
+            last_end = 0
+
+            # Sort entities by start position
+            sorted_entities = sorted(ner_results, key=lambda x: x["start"])
+
+            # Process each entity and add non-entity text segments
+            for entity in sorted_entities:
+                # Add non-entity text before the current entity
+                if last_end < entity["start"]:
+                    highlighted_text.append((ocr_text[last_end:entity["start"]], None))
+
+                # Add the entity text with its label
+                highlighted_text.append((
+                    ocr_text[entity["start"]:entity["end"]],
+                    entity["label"]
+                ))
+                last_end = entity["end"]
+
+            # Add any remaining text after the last entity
+            if last_end < len(ocr_text):
+                highlighted_text.append((ocr_text[last_end:], None))
+
+            # Create TextWithMetadata instance with the highlighted text and metadata
+            result = TextWithMetadata(highlighted_text, original_text=ocr_text, entities=ner_results)
+            return result, result # Return twice: once for display, once for state
 
+        # If NER is disabled, return the text without highlighting
+        result = TextWithMetadata([(ocr_text, None)], original_text=ocr_text, entities=[])
         return result, result # Return twice: once for display, once for state
+
+    except Exception as e:
+        import traceback
+        print(f"Error processing image: {e}")
+        print(traceback.format_exc())
+        # Return empty result on error
+        result = TextWithMetadata([("Error processing image: " + str(e), None)], original_text="Error: " + str(e), entities=[])
+        return result, result
 
 
 with gr.Blocks() as demo:
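
The dictionary branch added here expects a data-URI payload of the form {"data": "data:image/...;base64,..."}. A minimal sketch of building such a payload from a local file and decoding it the same way the handler does follows; the helper names and the "page.png" filename are illustrative, not part of this commit.

import base64
import io

import numpy as np
from PIL import Image


def encode_image_as_data_uri(path: str) -> dict:
    # Build the {"data": "data:image/png;base64,..."} dict the handler accepts.
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    return {"data": f"data:image/png;base64,{b64}"}


def decode_data_uri(payload: dict) -> np.ndarray:
    # Mirror the handler's decoding path: data URI -> bytes -> PIL -> numpy array.
    base64_data = payload["data"].split(",", 1)[1]
    image_bytes = base64.b64decode(base64_data)
    pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    return np.array(pil_image)


# Example round trip (assumes a local "page.png" exists):
# payload = encode_image_as_data_uri("page.png")
# array = decode_data_uri(payload)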
call.py CHANGED
@@ -15,7 +15,7 @@ dotenv.load_dotenv()
 hf_token = os.getenv("HF_TOKEN")
 
 # Create client for the Hugging Face Space with authentication
-client = Client("wjbmattingly/caracal-api", hf_token=hf_token)
 
 # Example usage
 if __name__ == "__main__":
 
 hf_token = os.getenv("HF_TOKEN")
 
 # Create client for the Hugging Face Space with authentication
+client = Client("wjbmattingly/carcal-api", hf_token=hf_token)
 
 # Example usage
 if __name__ == "__main__":
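
A call against the client that call.py constructs might look like the sketch below. The endpoint name "/predict", the argument list, and the "page.png" file are assumptions for illustration, not taken from this commit.

import os

from gradio_client import Client, handle_file

hf_token = os.getenv("HF_TOKEN")
client = Client("wjbmattingly/carcal-api", hf_token=hf_token)

# Hypothetical call: endpoint name and parameters are assumed, not confirmed here.
result = client.predict(
    handle_file("page.png"),  # local image to OCR
    api_name="/predict",
)
print(result)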