improve gradio blocks interface
app.py CHANGED
@@ -55,31 +55,32 @@ You can reset the interface anytime by clicking the **Reset** button.
 ### Join us :
 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
+
 model_presentation = f"""
-## PLeIAs/📸📈✍🏻Florence-PDF Model Overview
+## 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF Model Overview
 
-The **PLeIAs/📸📈✍🏻Florence-PDF** model is a state-of-the-art model for conditional generation tasks, designed to be highly effective for both **text** and **vision** tasks. It is built as an **encoder-decoder** architecture, which allows for enhanced flexibility and performance in generating outputs based on diverse inputs.
+The **🙏🏻PLeIAs/📸📈✍🏻Florence-PDF** model is a state-of-the-art model for conditional generation tasks, designed to be highly effective for both **text** and **vision** tasks. It is built as an **encoder-decoder** architecture, which allows for enhanced flexibility and performance in generating outputs based on diverse inputs.
 
 ### Key Features
 
-- **Model Architecture**: PLeIAs/📸📈✍🏻Florence-PDF uses an encoder-decoder structure, which makes it effective in tasks like **text generation**, **summarization**, and **translation**. It has **{num_layers} layers** for both the encoder and decoder, with a model dimension (`d_model`) of **{d_model}**.
+- **Model Architecture**: 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF uses an encoder-decoder structure, which makes it effective in tasks like **text generation**, **summarization**, and **translation**. It has **{num_layers} layers** for both the encoder and decoder, with a model dimension (`d_model`) of **{d_model}**.
 - **Conditional Generation**: The model can generate text conditionally, with a maximum length of **{max_length} tokens** for each generated sequence, making it ideal for tasks that require concise output.
-- **Beam Search**: PLeIAs/📸📈✍🏻Florence-PDFsupports **beam search** with up to **{beam_size} beams**, enabling more diverse and accurate text generation by exploring multiple potential outputs before selecting the best one.
+- **Beam Search**: 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF supports **beam search** with up to **{beam_size} beams**, enabling more diverse and accurate text generation by exploring multiple potential outputs before selecting the best one.
 - **Tokenization**: It includes a tokenizer with a vocabulary size of **{vocab_size} tokens**. Special tokens such as the **bos_token_id (0)** and **eos_token_id (2)** help control the generation process by marking the beginning and end of a sequence.
 - **Attention Mechanism**: Both the encoder and decoder utilize **{attention_heads} attention heads** per layer, ensuring that the model can focus on relevant parts of the input when generating text.
-- **Dropout and Activation**: PLeIAs/📸📈✍🏻Florence-PDF employs a **{activation_function} activation function** and a **dropout rate of {dropout}**, which enhances model performance by preventing overfitting and improving generalization.
+- **Dropout and Activation**: 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF employs a **{activation_function} activation function** and a **dropout rate of {dropout}**, which enhances model performance by preventing overfitting and improving generalization.
 - **Training Configuration**: The model uses **float32** precision for training, and it supports fine-tuning for specific tasks by setting `finetuning_task` appropriately.
 
 ### Vision Integration
 
-In addition to text tasks, PLeIAs/📸📈✍🏻Florence-PDF also incorporates **vision capabilities**:
+In addition to text tasks, 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF also incorporates **vision capabilities**:
 - **Patch-based Image Processing**: The vision component operates on image patches with a patch size of **{patch_size}x{patch_size}**.
 - **Temporal Embedding**: Visual tasks benefit from temporal embeddings with up to **{temporal_embeddings} steps**, making Florence-2 well-suited for video analysis.
 
 ### Model Usage and Flexibility
 
 - **No Repeat N-Grams**: To reduce repetition in text generation, the model is configured with a **no_repeat_ngram_size** of **{no_repeat_ngram_size}**, ensuring more diverse and meaningful outputs.
-- **Sampling Strategies**: PLeIAs/📸📈✍🏻Florence-PDF offers flexible sampling strategies, including **top-k** and **top-p (nucleus) sampling**, allowing for both creative and constrained generation based on user needs.
+- **Sampling Strategies**: 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF offers flexible sampling strategies, including **top-k** and **top-p (nucleus) sampling**, allowing for both creative and constrained generation based on user needs.
 
 📸📈✍🏻Florence-PDF is a robust model capable of handling various **text and image** tasks with high precision and flexibility, making it a valuable tool for both academic research and practical applications.
 """
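The placeholders in the `model_presentation` f-string above (`{num_layers}`, `{d_model}`, `{beam_size}`, and so on) are filled from the model configuration that app.py loads outside this diff. A minimal sketch of how such values could be read, assuming a Florence-2-style checkpoint with a Bart-like nested `text_config`; the repo id and every field name below are assumptions to verify against the checkpoint's actual config.json:

```python
# Hypothetical sketch: pull the presentation values from the model config.
# The repo id and field names are assumptions, not taken from this diff.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("PLeIAs/Florence-PDF", trust_remote_code=True)  # hypothetical id

text_cfg = config.text_config                        # assumed nested config, as in Florence-2
num_layers = text_cfg.encoder_layers                 # assumed Bart-style field names
d_model = text_cfg.d_model
attention_heads = text_cfg.encoder_attention_heads
vocab_size = text_cfg.vocab_size
max_length = text_cfg.max_length
beam_size = text_cfg.num_beams
dropout = text_cfg.dropout
activation_function = text_cfg.activation_function
no_repeat_ngram_size = text_cfg.no_repeat_ngram_size
```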
@@ -165,12 +166,12 @@ def draw_bounding_boxes(image, quad_boxes, labels, color=(0, 255, 0), thickness=
 
 def process_image(image, task):
     prompt = TASK_PROMPTS[task]
-    # Print the inputs for debugging
-    print(f"\n--- Processing Task: {task} ---")
-    print(f"Prompt: {prompt}")
+    # # Print the inputs for debugging
+    # print(f"\n--- Processing Task: {task} ---")
+    # print(f"Prompt: {prompt}")
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
-    # Print the input tensors for debugging
-    print(f"Model Input: {inputs}")
+    # # Print the input tensors for debugging
+    # print(f"Model Input: {inputs}")
     generated_ids = model.generate(
         **inputs,
         max_new_tokens=1024,
@@ -179,11 +180,11 @@ def process_image(image, task):
     )
 
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-    # Print the raw generated output for debugging
-    print(f"Raw Model Output: {generated_text}")
+    # # Print the raw generated output for debugging
+    # print(f"Raw Model Output: {generated_text}")
     parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))
-    # Print the parsed answer for debugging
-    print(f"Parsed Answer: {parsed_answer}")
+    # # Print the parsed answer for debugging
+    # print(f"Parsed Answer: {parsed_answer}")
     return parsed_answer
 
 
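For context, `process_image()` in the two hunks above follows the standard Florence-2 inference path: build joint text-image inputs with the processor, generate, decode with special tokens kept, then let the processor parse the task-tagged output into a structured answer. A condensed, self-contained sketch; the checkpoint id is a stand-in and `num_beams`/`do_sample` are assumptions, since the diff truncates the `generate()` call:

```python
# Sketch of the inference path, with the model/processor loading that
# app.py performs elsewhere filled in under stated assumptions.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32

model_id = "microsoft/Florence-2-large"  # stand-in; the Space loads its own checkpoint
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch_dtype, trust_remote_code=True
).to(device)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

def run_task(image: Image.Image, prompt: str) -> dict:
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        num_beams=3,      # assumed; hidden by the diff
        do_sample=False,  # assumed; hidden by the diff
    )
    # keep special tokens: post-processing needs the task tags
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        generated_text, task=prompt, image_size=(image.width, image.height)
    )
```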
@@ -195,8 +196,8 @@ def main_process(image, task):
         fig = plot_bbox(image, result.get('<OCR_WITH_REGION>', {}), use_quad_boxes=True)
         output_image = fig_to_pil(fig)
         text_output = result.get('<OCR_WITH_REGION>', {}).get('recognized_text', 'No text found')
-        # Debugging: Print the recognized text
-        print(f"Recognized Text: {text_output}")
+        # # Debugging: Print the recognized text
+        # print(f"Recognized Text: {text_output}")
         return output_image, gr.update(visible=True), text_output, gr.update(visible=False)
     else:
         fig = plot_bbox(image, result.get(TASK_PROMPTS[task], {}))
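`main_process()` above branches on `<OCR_WITH_REGION>` because that task returns quadrilateral text regions rather than axis-aligned boxes, hence the `use_quad_boxes=True` path. A minimal sketch in the spirit of the `draw_bounding_boxes()` named in the hunk header; the `quad_boxes`/`labels` keys and the flat `[x1, y1, ..., x4, y4]` layout are assumptions based on Florence-2-style output:

```python
# Hypothetical quad-box renderer; key names and coordinate layout are assumptions.
from PIL import Image, ImageDraw

def draw_quad_boxes(image: Image.Image, result: dict, color=(0, 255, 0)) -> Image.Image:
    img = image.copy()
    draw = ImageDraw.Draw(img)
    for quad, label in zip(result.get("quad_boxes", []), result.get("labels", [])):
        # quad is assumed flat: [x1, y1, x2, y2, x3, y3, x4, y4]
        points = list(zip(quad[0::2], quad[1::2]))
        draw.polygon(points, outline=color)
        draw.text(points[0], label, fill=color)
    return img
```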
@@ -209,22 +210,27 @@ def main_process(image, task):
 def reset_outputs():
     return None, gr.update(visible=False), None, gr.update(visible=True)
 
-with gr.Blocks(title="PLeIAs/📸📈✍🏻Florence-PDF") as iface:
-    gr.Markdown(title)
-    gr.Markdown(description)
-
+with gr.Blocks(title="Tonic's 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF") as iface:
     with gr.Column():
-
-
-
-
-
-
-
-
-
-
-
+        with gr.Row():
+            gr.Markdown(title)
+        with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Group():
+                    gr.Markdown(model_presentation)
+            with gr.Column(scale=1):
+                with gr.Group():
+                    gr.Markdown(description)
+        with gr.Row():
+            with gr.Column(scale=1):
+                image_input = gr.Image(type="pil", label="Input Image")
+                task_dropdown = gr.Dropdown(list(TASK_PROMPTS.keys()), label="Task", value="✍🏻Caption")
+                with gr.Row():
+                    submit_button = gr.Button("Process")
+                    reset_button = gr.Button("Reset")
+            with gr.Column(scale=1):
+                output_image = gr.Image(label="🙏🏻PLeIAs/📸📈✍🏻Florence-PDF", visible=False)
+                output_text = gr.Textbox(label="🙏🏻PLeIAs/📸📈✍🏻Florence-PDF", visible=True)
 
 def process_and_update(image, task):
     if image is None:
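The rebuilt layout above leans on two Gradio patterns: nested `gr.Row()`/`gr.Column()`/`gr.Group()` containers, and `gr.update(visible=...)` return values (as in `reset_outputs()`) to swap the image pane and the textbox. A self-contained toy demonstrating both; names are illustrative only, and the app's actual `.click()` wiring sits beyond the end of this diff:

```python
# Toy layout demo: nested containers plus visibility toggling via gr.update.
import gradio as gr

def toggle(show_image: bool):
    # one gr.update per output component, in declaration order
    return gr.update(visible=show_image), gr.update(visible=not show_image)

with gr.Blocks(title="layout demo") as demo:
    with gr.Column():
        with gr.Row():
            show_box = gr.Checkbox(label="Show image pane", value=False)
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Group():
                    img = gr.Image(label="image pane", visible=False)
            with gr.Column(scale=1):
                with gr.Group():
                    txt = gr.Textbox(label="text pane", visible=True)
    show_box.change(toggle, inputs=show_box, outputs=[img, txt])

if __name__ == "__main__":
    demo.launch()
```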