Daemontatox committed on
Commit 86c6ea5 · verified · 1 Parent(s): 1cf7f91

Update app.py

Files changed (1):
  app.py +105 -144
app.py CHANGED
@@ -96,7 +96,7 @@ def process_uploaded_file(file):
             doc_state.doc_type = 'pdf'
             try:
                 doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
-                return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
             except Exception as e:
                 return f"Error processing PDF: {str(e)}. Please try a different PDF file."
         elif file_ext in image_extensions:
@@ -109,7 +109,7 @@ def process_uploaded_file(file):
                     new_size = tuple(int(dim * ratio) for dim in img.size)
                     img = img.resize(new_size, Image.Resampling.LANCZOS)
                 doc_state.current_doc_images = [img]
-                return "Image loaded successfully. You can now ask questions about the content."
             except Exception as e:
                 return f"Error processing image: {str(e)}. Please try a different image file."
         else:
@@ -118,161 +118,106 @@ def process_uploaded_file(file):
         logger.error(f"Error in process_uploaded_file: {str(e)}")
         return "An error occurred while processing the file. Please try again."

 # -------------------------------
-# Bot Streaming Function Using the Multimodal API
 # -------------------------------
-def bot_streaming(prompt_option, max_new_tokens=500):
-    """
-    Build a multimodal message payload and call the inference API.
-    The payload includes:
-      - A text segment (the selected prompt and any document context).
-      - If available, an image as a data URI (using a base64-encoded PNG).
-    """
-    try:
-        # Predetermined prompts (you can adjust these as needed)
-        prompts = {
-            "NOC Timesheet": (
-                """Extract structured information from the provided timesheet. The extracted details should include:
-
-Name
-
-Position Title
-
-Work Location
-
-Contractor
-
-NOC ID
-
-Month and Year
-
-Regular Service Days (ONSHORE)
-
-Standby Days (ONSHORE in Doha)
-
-Offshore Days
-
-Standby & Extended Hitch Days (OFFSHORE)
-
-Extended Hitch Days (ONSHORE Rotational)
-
-Service during Weekends & Public Holidays
-
-ONSHORE Overtime Hours (Over 8 hours)
-
-OFFSHORE Overtime Hours (Over 12 hours)
-
-Per Diem Days (ONSHORE/OFFSHORE Rotational Personnel)
-
-Training Days
-
-Travel Days
-
-Noc representative appoval's name as approved_by
-
-Noc representative's date approval_date
-
-Noc representative status as approval_status
-
-Format the output as valid JSON.
-"""
-            ),
-            "NOC Basic": (
-                "Based on the provided timesheet details, extract the following information:\n"
-                " - Full name\n"
-                " - Position title\n"
-                " - Work location\n"
-                " - Contractor's name\n"
-                " - NOC ID\n"
-                " - Month and year (MM/YYYY)"
-            ),
-            "Aramco Full structured": (
-                """You are a document parsing assistant designed to extract structured data from various documents such as invoices, timesheets, purchase orders, and travel bookings. Return only valid JSON with no extra text.
-"""
-            ),
-            "Aramco Timesheet only": (
-                """Extract time tracking, work details, and approvals.
-Return a JSON object following the specified structure.
-"""
-            ),
-            "NOC Invoice": (
-                """You are a highly accurate data extraction system. Analyze the provided invoice image and extract all data into the following JSON format:
-{
-  "invoiceDetails": { ... },
-  "from": { ... },
-  "to": { ... },
-  "services": [ ... ],
-  "totals": { ... },
-  "bankDetails": { ... }
 }
-"""
-            )
-        }
-
-        # Select the appropriate prompt
-        selected_prompt = prompts.get(prompt_option, "Invalid prompt selected.")
-        context = ""
-        if doc_state.current_doc_images and doc_state.current_doc_text:
-            context = "\nDocument context:\n" + doc_state.current_doc_text
-        full_prompt = selected_prompt + context

-        # Build the message payload in the expected format.
-        # The content field is a list of objects—one for text, and (if an image is available) one for the image.
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": full_prompt
-                    }
-                ]
-            }
-        ]
-
-        # If an image is available, encode it as a data URI and append it as an image_url message.
-        if doc_state.current_doc_images:
             buffered = io.BytesIO()
             doc_state.current_doc_images[0].save(buffered, format="PNG")
             img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
-            # Create a data URI (many APIs accept this format in place of a public URL)
             data_uri = f"data:image/png;base64,{img_b64}"
-            messages[0]["content"].append({
                 "type": "image_url",
                 "image_url": {"url": data_uri}
             })
-
-        # Call the inference API with streaming enabled.
         stream = client.chat.completions.create(
-            model="qwen/qwen-vl-plus:free",
             messages=messages,
-            max_tokens=max_new_tokens,
             stream=True
         )
-
-        buffer = ""
-        for chunk in stream:
-            # The response structure is similar to the reference: each chunk contains a delta.
-            delta = chunk.choices[0].delta.content
-            buffer += delta
-            time.sleep(0.01)
-            yield buffer
-
     except Exception as e:
-        logger.error(f"Error in bot_streaming: {str(e)}")
-        yield "An error occurred while processing your request. Please try again."
-
-def clear_context():
-    """Clear the current document context."""
-    doc_state.clear()
-    return "Document context cleared. You can upload a new document."

 # -------------------------------
 # Create the Gradio Interface
 # -------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Document Analyzer with Predetermined Prompts")
-    gr.Markdown("Upload a PDF or image (PNG, JPG, JPEG, GIF, BMP, WEBP) and select a prompt to analyze its contents.")

     with gr.Row():
         file_upload = gr.File(
@@ -284,16 +229,32 @@ with gr.Blocks() as demo:
     with gr.Row():
         prompt_dropdown = gr.Dropdown(
             label="Select Prompt",
-            choices=["NOC Timesheet", "Aramco Full Timesheet and Invoice structured", "Aramco Timesheet only", "NOC Invoice"],
-            value="NOC Timesheet"
         )
-        generate_btn = gr.Button("Generate")

-    clear_btn = gr.Button("Clear Document Context")
-    output_text = gr.Textbox(label="Output", interactive=False)

-    file_upload.change(fn=process_uploaded_file, inputs=[file_upload], outputs=[upload_status])
-    generate_btn.click(fn=bot_streaming, inputs=[prompt_dropdown], outputs=[output_text])
-    clear_btn.click(fn=clear_context, outputs=[upload_status])

     demo.launch(debug=True)
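The removed bot_streaming function above and the new chat_respond function shown below share the same payload convention: each user turn is a content list with one text part (the selected prompt plus any extracted document text) and, when a page image is available, one image_url part whose URL is a base64-encoded PNG data URI. Below is a minimal sketch of that construction, assuming an OpenAI-compatible client such as the OpenRouter-backed one this app appears to use; the helper name build_multimodal_message, the placeholder credentials, and the blank test image are illustrative only, not part of the commit.

import base64
import io
from typing import Optional

from PIL import Image
from openai import OpenAI  # assumption: an OpenAI-compatible client, e.g. pointed at OpenRouter


def build_multimodal_message(prompt: str, image: Optional[Image.Image], doc_text: str = "") -> dict:
    """Build one user message: a text part plus an optional base64 data-URI image part."""
    if doc_text:
        prompt = prompt + "\nDocument context:\n" + doc_text
    content = [{"type": "text", "text": prompt}]
    if image is not None:
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")  # PNG-encode the PIL image in memory
        img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{img_b64}"},
        })
    return {"role": "user", "content": content}


if __name__ == "__main__":
    # Hypothetical usage; the base URL, key, and model id are placeholders, not the app's config.
    client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key="YOUR_KEY")
    page = Image.new("RGB", (512, 512), "white")  # stand-in for a rendered PDF page
    messages = [build_multimodal_message("Describe this page.", page)]
    stream = client.chat.completions.create(model="qwen/qwen-vl-plus:free",
                                            messages=messages, max_tokens=500, stream=True)
    reply = ""
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:  # streamed chunks may carry empty deltas
            reply += delta
    print(reply)

Guarding the streamed delta against None is a small defensive choice in this sketch; providers can emit chunks with empty deltas.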
 
@@ -96,7 +96,7 @@ def process_uploaded_file(file):
             doc_state.doc_type = 'pdf'
             try:
                 doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
+                return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now chat with the bot."
             except Exception as e:
                 return f"Error processing PDF: {str(e)}. Please try a different PDF file."
         elif file_ext in image_extensions:
@@ -109,7 +109,7 @@ def process_uploaded_file(file):
                     new_size = tuple(int(dim * ratio) for dim in img.size)
                     img = img.resize(new_size, Image.Resampling.LANCZOS)
                 doc_state.current_doc_images = [img]
+                return "Image loaded successfully. You can now chat with the bot."
             except Exception as e:
                 return f"Error processing image: {str(e)}. Please try a different image file."
         else:
@@ -118,161 +118,106 @@ def process_uploaded_file(file):
         logger.error(f"Error in process_uploaded_file: {str(e)}")
         return "An error occurred while processing the file. Please try again."

+def clear_context():
+    """Clear the current document context and chat history."""
+    doc_state.clear()
+    return "Document context cleared. You can upload a new document.", []
+
 # -------------------------------
+# Predetermined Prompts
 # -------------------------------
+predetermined_prompts = {
+
+    "Software Tester": (
+        "Act as a software tester. Analyze the uploaded image of a software interface and generate comprehensive "
+        "test cases for its features. For each feature, provide test steps, expected results, and any necessary "
+        "preconditions. Be as detailed as possible."
+    )
 }

+# -------------------------------
+# Chat Function with Streaming and Conversation History
+# -------------------------------
+def chat_respond(user_message, history, prompt_option):
+    """
+    Append the user message (or, if starting a new conversation and no message is provided,
+    use the predetermined prompt) to the conversation history; build the API call using
+    the full conversation history (and the image if available); stream back the assistant response
+    while updating the history.
+
+    The history is a list of [user_text, assistant_text] pairs.
+    """
+    # If this is the first message, add the predetermined prompt text.
+    if history == []:
+        # If user_message is empty, use the predetermined prompt.
+        if not user_message.strip():
+            user_message = predetermined_prompts.get(prompt_option, "Hello")
+        else:
+            # Optionally, prepend the predetermined prompt.
+            user_message = predetermined_prompts.get(prompt_option, "") + "\n" + user_message
+
+    # Append the new user message with an empty assistant response.
+    history = history + [[user_message, ""]]
+
+    # Build the messages list (for the multimodal API) from the conversation history.
+    messages = []
+    for i, (user_msg, assistant_msg) in enumerate(history):
+        # For the user message:
+        user_content = [{"type": "text", "text": user_msg}]
+        # For the very first user message, if an image was uploaded, append the image.
+        if i == 0 and doc_state.current_doc_images:
             buffered = io.BytesIO()
             doc_state.current_doc_images[0].save(buffered, format="PNG")
             img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
             data_uri = f"data:image/png;base64,{img_b64}"
+            user_content.append({
                 "type": "image_url",
                 "image_url": {"url": data_uri}
             })
+        messages.append({"role": "user", "content": user_content})
+        # For the assistant response, if available.
+        if assistant_msg:
+            messages.append({
+                "role": "assistant",
+                "content": [{"type": "text", "text": assistant_msg}]
+            })
+
+    # Call the inference API with streaming enabled.
+    try:
         stream = client.chat.completions.create(
+            model="google/gemini-2.0-pro-exp-02-05:free",
             messages=messages,
+            max_tokens=8192,
             stream=True
         )
     except Exception as e:
+        logger.error(f"Error calling the API: {str(e)}")
+        history[-1][1] = "An error occurred while processing your request. Please try again."
+        yield history, history
+
+    # Stream and update the assistant's reply token by token.
+    buffer = ""
+    for chunk in stream:
+        delta = chunk.choices[0].delta.content
+        buffer += delta
+        # Update the assistant part of the latest message in the history.
+        history[-1][1] = buffer
+        # Yield the updated chat history (for the Chatbot component) and the state.
+        yield history, history
+        time.sleep(0.01)
+
+    return history, history

 # -------------------------------
 # Create the Gradio Interface
 # -------------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# Document Analyzer & Software Testing Chatbot")
+    gr.Markdown(
+        "Upload a PDF or an image (PNG, JPG, JPEG, GIF, BMP, WEBP). Then choose a prompt from the dropdown. "
+        "For example, select **Software Tester** to have the bot analyze an image of a software interface "
+        "and generate test cases. Chat with the bot in the conversation below."
+    )

     with gr.Row():
         file_upload = gr.File(
@@ -284,16 +229,32 @@ with gr.Blocks() as demo:
     with gr.Row():
         prompt_dropdown = gr.Dropdown(
             label="Select Prompt",
+            choices=[
+                "Software Tester"
+            ],
+            value="Software Tester"
         )
+        clear_btn = gr.Button("Clear Document Context & Chat History")
+
+    chatbot = gr.Chatbot(label="Chat History", elem_id="chatbot")

+    with gr.Row():
+        user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...", show_label=False)
+        send_btn = gr.Button("Send")

+    # State to hold the conversation history
+    chat_state = gr.State([])

+    # When a file is uploaded, process it.
+    file_upload.change(fn=process_uploaded_file, inputs=file_upload, outputs=upload_status)
+
+    # Clear both the document context and chat history.
+    clear_btn.click(fn=clear_context, outputs=[upload_status, chat_state])
+
+    # When the user clicks Send, process the message and update the chat.
+    send_btn.click(fn=chat_respond,
+                   inputs=[user_input, chat_state, prompt_dropdown],
+                   outputs=[chatbot, chat_state],
+                   stream=True)
+
     demo.launch(debug=True)
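For readers unfamiliar with how the new chat wiring streams, the sketch below shows the same pattern in isolation: a generator handler appends the user turn to the history kept in gr.State, then yields partial (chatbot, state) updates as text arrives, which Gradio pushes to the gr.Chatbot component. The fake_stream generator is a stand-in for the client.chat.completions.create(..., stream=True) call; the component names mirror the diff, but the example is otherwise hypothetical and not the committed code.

import time

import gradio as gr


def fake_stream(text: str):
    """Stand-in for a streamed model response: yields the reply a few characters at a time."""
    reply = f"Echo: {text}"
    for i in range(0, len(reply), 4):
        time.sleep(0.01)
        yield reply[: i + 4]


def respond(user_message, history):
    """Generator handler: each yielded (chatbot, state) pair is streamed to the UI."""
    history = history + [[user_message, ""]]
    for partial in fake_stream(user_message):
        history[-1][1] = partial   # update the assistant half of the latest turn
        yield history, history     # first output feeds the Chatbot, second the State


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chat History")
    chat_state = gr.State([])
    with gr.Row():
        user_input = gr.Textbox(placeholder="Type your message here...", show_label=False)
        send_btn = gr.Button("Send")
    # Gradio streams automatically because `respond` is a generator function.
    send_btn.click(fn=respond, inputs=[user_input, chat_state], outputs=[chatbot, chat_state])

if __name__ == "__main__":
    demo.launch()

Because the handler is a generator, Gradio streams each yield to the browser on its own; no extra flag is required on the .click() binding in this sketch.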