obichimav commited on
Commit
e5939e0
·
verified ·
1 Parent(s): 2541bac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -145
app.py CHANGED
@@ -329,178 +329,106 @@
329
  # demo.launch(share=True)
330
 
331
  import os
332
- import re
333
- import io
334
- import uuid
335
- import contextlib
336
  import gradio as gr
337
- from PIL import Image
338
- import shutil
339
 
340
- # Required packages:
341
- # pip install vision-agent gradio openai anthropic
342
 
343
- from vision_agent.agent import VisionAgentCoderV2
344
- from vision_agent.models import AgentMessage
345
-
346
#############################################
# GLOBAL INITIALIZATION
#############################################

# Directory where images uploaded through the UI are written before being
# handed to the agent as media paths.
TEMP_DIR = "temp_images"
# exist_ok avoids the check-then-create race of `if not os.path.exists(...)`.
os.makedirs(TEMP_DIR, exist_ok=True)

# Initialize VisionAgentCoderV2 with verbose logging so the generated code
# emits detailed per-step print output that the app captures and displays.
agent = VisionAgentCoderV2(verbose=True)
357
-
358
- #############################################
359
- # UTILITY: SAVE UPLOADED IMAGE TO A TEMP FILE
360
- #############################################
361
-
362
def save_uploaded_image(image):
    """Persist an uploaded image (numpy array) to a unique temp file.

    Returns the path of the written JPEG so it can be passed as media
    to VisionAgent.
    """
    # uuid4 hex keeps concurrent uploads from clobbering each other.
    out_path = os.path.join(TEMP_DIR, f"{uuid.uuid4().hex}.jpg")
    Image.fromarray(image).save(out_path)
    return out_path
372
-
373
- #############################################
374
- # UTILITY: PARSE FILENAMES FROM save_image(...)
375
- #############################################
 
 
 
 
 
 
 
 
 
 
 
 
376
 
377
def parse_saved_image_filenames(code_str):
    """Extract filenames passed to save_image(...) calls in generated code.

    Matches lines like:
        save_image(result, 'out.jpg')
        save_image(result, "out.jpg")

    Parameters
    ----------
    code_str : str
        Source code produced by the agent.

    Returns
    -------
    list[str]
        The filename argument of every save_image call found, in order.
    """
    # Accept either quote style; the original pattern only handled
    # single quotes, silently missing double-quoted filenames.
    pattern = r"save_image\s*\(\s*[^,]+,\s*['\"]([^'\"]+)['\"]\s*\)"
    return re.findall(pattern, code_str)
385
-
386
- #############################################
387
- # UTILITY: EXECUTE CODE, CAPTURE STDOUT, IDENTIFY IMAGES
388
- #############################################
389
 
390
def run_and_capture_with_images(code_str):
    """Execute generated code, capturing stdout and any saved images.

    Parameters
    ----------
    code_str : str
        The code produced by VisionAgent (code + test snippet).

    Returns
    -------
    tuple[str, list[str]]
        - Everything printed while the code ran (the step logs).
        - Paths of files named in save_image(...) calls that exist on disk.
    """
    # Filenames the generated code claims to save via save_image(...).
    filenames = parse_saved_image_filenames(code_str)

    buf = io.StringIO()
    with contextlib.redirect_stdout(buf):
        # SECURITY NOTE: exec of model-generated code is inherently unsafe;
        # this app accepts that risk by design.
        # Using a single fresh namespace dict fixes the exec pitfall of
        # exec(code, globals(), locals()): with separate dicts, top-level
        # names of the executed code land in the locals mapping and are
        # invisible to functions the code defines; it also keeps this
        # module's globals unpolluted.
        namespace = {"__name__": "__generated__"}
        exec(code_str, namespace)

    output = buf.getvalue()

    # Keep only files that actually exist; relative names are assumed to
    # live in TEMP_DIR.
    existing_images = []
    for fn in filenames:
        if not os.path.isabs(fn):
            fn = os.path.join(TEMP_DIR, fn)
        if os.path.exists(fn):
            existing_images.append(fn)
    return output, existing_images
417
-
418
- #############################################
419
- # CHAT FUNCTION: PROCESS USER PROMPT & IMAGE
420
- #############################################
421
-
422
def chat(prompt, image, history):
    """Handle one user turn: generate code for the task and run it.

    Steps:
      1. Save the uploaded image to a temp file.
      2. Ask VisionAgentCoderV2 to generate code for the prompt + image.
      3. Execute the generated code, capturing its logs and saved images.
      4. Append the logs/images summary to the conversation history.

    Returns the updated history and the list of saved image paths
    (None when no image was provided).
    """
    # Guard: an image is required for every request.
    if image is None:
        history.append(("System", "Please upload an image."))
        return history, None

    # Persist the upload so the generated code can reference it by path.
    image_path = save_uploaded_image(image)

    # Ask the agent for code solving the user's task on this image.
    message = AgentMessage(role="user", content=prompt, media=[image_path])
    code_context = agent.generate_code([message])

    # The runnable program is the generated code plus its test snippet.
    generated_code = "\n".join([code_context.code, code_context.test])

    stdout_text, image_files = run_and_capture_with_images(generated_code)

    # Build the chat response from the captured logs and image list.
    parts = [f"**Execution Logs:**\n{stdout_text}\n"]
    if image_files:
        parts.append("\n**Saved Images:** " + ", ".join(image_files))
    else:
        parts.append("\nNo images were saved by the generated code.")
    response_text = "".join(parts)

    history.append((prompt, response_text))
    return history, image_files
467
-
468
- #############################################
469
- # GRADIO CHAT INTERFACE
470
- #############################################
 
 
471
 
 
472
#############################################
# GRADIO CHAT INTERFACE
#############################################

with gr.Blocks() as demo:
    gr.Markdown("# VisionAgent Chat App")
    gr.Markdown(
        """
        This chat app lets you enter a prompt (e.g., "Count the number of cacao oranges in the image")
        along with an image. The app then uses VisionAgentCoderV2 to generate multi-step code, executes it,
        and returns the detailed logs and any saved images.
        """
    )

    with gr.Row():
        with gr.Column(scale=7):
            chatbot = gr.Chatbot(label="Chat History")
            prompt_input = gr.Textbox(label="Enter Prompt", placeholder="e.g., Count the number of cacao oranges in the image")
            submit_btn = gr.Button("Send")
        with gr.Column(scale=5):
            image_input = gr.Image(label="Upload Image", type="numpy")
            gallery = gr.Gallery(label="Generated Images").style(grid=[2], height="auto")

    # Button that resets the conversation and the gallery.
    clear_btn = gr.Button("Clear Chat")

    def user_chat_wrapper(prompt, image, history):
        # Gradio passes None for an empty chatbot; normalize to a list
        # before delegating to chat().
        return chat(prompt, image, history or [])

    submit_btn.click(
        fn=user_chat_wrapper,
        inputs=[prompt_input, image_input, chatbot],
        outputs=[chatbot, gallery],
    )

    clear_btn.click(lambda: ([], None), None, [chatbot, gallery])

demo.launch()
506
 
 
 
329
  # demo.launch(share=True)
330
 
331
  import os
332
+ import openai
 
 
 
333
  import gradio as gr
334
+ import vision_agent.tools as T
 
335
 
336
# Set your OpenAI API key (ensure the environment variable is set or
# replace with your key).
openai.api_key = os.environ.get("OPENAI_API_KEY", "your-openai-api-key-here")
338
 
339
def get_single_prompt(user_input):
    """Rephrase free-form user chatter into one concise detection prompt.

    Uses OpenAI to generate a single, concise prompt for object detection;
    the returned prompt is guaranteed to contain no question marks.
    """
    # Fall back to a generic task when the user typed nothing useful.
    if not user_input.strip():
        user_input = "Detect objects in the image"

    prompt_instruction = (
        f"Based on the following user input, generate a single, concise prompt for object detection. "
        f"Do not include any question marks in the output. "
        f"User input: \"{user_input}\""
    )

    # NOTE(review): openai.Completion / "text-davinci-003" is a legacy
    # endpoint removed in openai>=1.0 — confirm the pinned openai version,
    # or migrate to the chat completions API.
    completion = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt_instruction,
        max_tokens=50,
        n=1,
        stop=None,
        temperature=0.3,
    )

    # Strip whitespace, then enforce the no-question-mark contract.
    return completion.choices[0].text.strip().replace("?", "")
365
 
366
def is_count_query(user_input):
    """Return True when the user's input reads as a counting request.

    Detection is a simple case-insensitive keyword scan ("count",
    "how many", "number of", etc.).
    """
    keywords = ["count", "how many", "number of", "total", "get me a count"]
    lowered = user_input.lower()
    return any(kw in lowered for kw in keywords)
 
376
 
377
def process_question_and_detect(user_input, image):
    """Run the full detect-and-count pipeline for one request.

    1. Uses OpenAI to turn the user's input into a single, concise
       detection prompt (no question marks).
    2. Feeds that prompt to the VisionAgent detection function.
    3. Overlays the detection bounding boxes on the image.
    4. If the input reads as a counting query, also reports how many
       objects were detected.

    Returns (visualized image, details text); (None, message) when no
    image was uploaded.
    """
    # Guard: nothing to detect without an image.
    if image is None:
        return None, "Please upload an image."

    # Concise prompt derived from the user's free-form input.
    generated_prompt = get_single_prompt(user_input)

    # Detect objects, then draw their boxes onto the image.
    dets = T.agentic_object_detection(generated_prompt, image)
    viz = T.overlay_bounding_boxes(image, dets)

    # Only mention a count when the user actually asked for one.
    count_text = (
        f"Detected {len(dets)} objects." if is_count_query(user_input) else ""
    )

    output_text = f"Generated prompt: {generated_prompt}\n{count_text}"
    return viz, output_text
404
 
405
# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# VisionAgent Object Detection and Counting App")
    gr.Markdown(
        """
        Enter your input (for example:
        - "What is the number of fruit in my image?"
        - "How many bicycles can you see?"
        - "Get me a count of my bottles")
        and upload an image.

        The app uses OpenAI to generate a single, concise prompt for object detection (without question marks),
        then runs the detection. If your input implies a counting request, it will also display the count of detected objects.
        """
    )

    # Inputs: free-form text plus the image to analyze.
    with gr.Row():
        user_input = gr.Textbox(label="Enter your input", placeholder="Type your input here...")
        image_input = gr.Image(label="Upload Image", type="numpy")

    submit_btn = gr.Button("Detect and Count")

    # Outputs: annotated image and the generated prompt / count details.
    output_image = gr.Image(label="Detection Result")
    output_text = gr.Textbox(label="Output Details")

    submit_btn.click(
        fn=process_question_and_detect,
        inputs=[user_input, image_input],
        outputs=[output_image, output_text],
    )

demo.launch()
433
 
434
+