Daemontatox committed on
Commit 86c6ea5 · verified · 1 Parent(s): 1cf7f91

Update app.py

Files changed (1):
  app.py +105 -144
app.py CHANGED
@@ -96,7 +96,7 @@ def process_uploaded_file(file):
             doc_state.doc_type = 'pdf'
             try:
                 doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
-                return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
             except Exception as e:
                 return f"Error processing PDF: {str(e)}. Please try a different PDF file."
         elif file_ext in image_extensions:
@@ -109,7 +109,7 @@ def process_uploaded_file(file):
                     new_size = tuple(int(dim * ratio) for dim in img.size)
                     img = img.resize(new_size, Image.Resampling.LANCZOS)
                 doc_state.current_doc_images = [img]
-                return "Image loaded successfully. You can now ask questions about the content."
             except Exception as e:
                 return f"Error processing image: {str(e)}. Please try a different image file."
         else:
@@ -118,161 +118,106 @@ def process_uploaded_file(file):
         logger.error(f"Error in process_uploaded_file: {str(e)}")
         return "An error occurred while processing the file. Please try again."

 # -------------------------------
-# Bot Streaming Function Using the Multimodal API
 # -------------------------------
-def bot_streaming(prompt_option, max_new_tokens=500):
-    """
-    Build a multimodal message payload and call the inference API.
-    The payload includes:
-      - A text segment (the selected prompt and any document context).
-      - If available, an image as a data URI (using a base64-encoded PNG).
-    """
-    try:
-        # Predetermined prompts (you can adjust these as needed)
-        prompts = {
-            "NOC Timesheet": (
-                """Extract structured information from the provided timesheet. The extracted details should include:
-
-Name
-
-Position Title
-
-Work Location
-
-Contractor
-
-NOC ID
-
-Month and Year
-
-Regular Service Days (ONSHORE)
-
-Standby Days (ONSHORE in Doha)
-
-Offshore Days
-
-Standby & Extended Hitch Days (OFFSHORE)
-
-Extended Hitch Days (ONSHORE Rotational)
-
-Service during Weekends & Public Holidays
-
-ONSHORE Overtime Hours (Over 8 hours)
-
-OFFSHORE Overtime Hours (Over 12 hours)
-
-Per Diem Days (ONSHORE/OFFSHORE Rotational Personnel)
-
-Training Days
-
-Travel Days
-
-Noc representative appoval's name as approved_by
-
-Noc representative's date approval_date
-
-Noc representative status as approval_status
-
-Format the output as valid JSON.
-"""
-            ),
-            "NOC Basic": (
-                "Based on the provided timesheet details, extract the following information:\n"
-                " - Full name\n"
-                " - Position title\n"
-                " - Work location\n"
-                " - Contractor's name\n"
-                " - NOC ID\n"
-                " - Month and year (MM/YYYY)"
-            ),
-            "Aramco Full structured": (
-                """You are a document parsing assistant designed to extract structured data from various documents such as invoices, timesheets, purchase orders, and travel bookings. Return only valid JSON with no extra text.
-"""
-            ),
-            "Aramco Timesheet only": (
-                """Extract time tracking, work details, and approvals.
-Return a JSON object following the specified structure.
-"""
-            ),
-            "NOC Invoice": (
-                """You are a highly accurate data extraction system. Analyze the provided invoice image and extract all data into the following JSON format:
-{
-  "invoiceDetails": { ... },
-  "from": { ... },
-  "to": { ... },
-  "services": [ ... ],
-  "totals": { ... },
-  "bankDetails": { ... }
 }
-"""
-            )
-        }
-
-        # Select the appropriate prompt
-        selected_prompt = prompts.get(prompt_option, "Invalid prompt selected.")
-        context = ""
-        if doc_state.current_doc_images and doc_state.current_doc_text:
-            context = "\nDocument context:\n" + doc_state.current_doc_text
-        full_prompt = selected_prompt + context

-        # Build the message payload in the expected format.
-        # The content field is a list of objects—one for text, and (if an image is available) one for the image.
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": full_prompt
-                    }
-                ]
-            }
-        ]
-
-        # If an image is available, encode it as a data URI and append it as an image_url message.
-        if doc_state.current_doc_images:
             buffered = io.BytesIO()
             doc_state.current_doc_images[0].save(buffered, format="PNG")
             img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
-            # Create a data URI (many APIs accept this format in place of a public URL)
             data_uri = f"data:image/png;base64,{img_b64}"
-            messages[0]["content"].append({
                 "type": "image_url",
                 "image_url": {"url": data_uri}
             })
-
-        # Call the inference API with streaming enabled.
         stream = client.chat.completions.create(
-            model="qwen/qwen-vl-plus:free",
             messages=messages,
-            max_tokens=max_new_tokens,
             stream=True
         )
-
-        buffer = ""
-        for chunk in stream:
-            # The response structure is similar to the reference: each chunk contains a delta.
-            delta = chunk.choices[0].delta.content
-            buffer += delta
-            time.sleep(0.01)
-            yield buffer
-
     except Exception as e:
-        logger.error(f"Error in bot_streaming: {str(e)}")
-        yield "An error occurred while processing your request. Please try again."
-
-def clear_context():
-    """Clear the current document context."""
-    doc_state.clear()
-    return "Document context cleared. You can upload a new document."

 # -------------------------------
 # Create the Gradio Interface
 # -------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Document Analyzer with Predetermined Prompts")
-    gr.Markdown("Upload a PDF or image (PNG, JPG, JPEG, GIF, BMP, WEBP) and select a prompt to analyze its contents.")

     with gr.Row():
         file_upload = gr.File(
@@ -284,16 +229,32 @@ with gr.Blocks() as demo:
     with gr.Row():
         prompt_dropdown = gr.Dropdown(
             label="Select Prompt",
-            choices=["NOC Timesheet", "Aramco Full Timesheet and Invoice structured", "Aramco Timesheet only", "NOC Invoice"],
-            value="NOC Timesheet"
         )
-        generate_btn = gr.Button("Generate")

-    clear_btn = gr.Button("Clear Document Context")
-    output_text = gr.Textbox(label="Output", interactive=False)

-    file_upload.change(fn=process_uploaded_file, inputs=[file_upload], outputs=[upload_status])
-    generate_btn.click(fn=bot_streaming, inputs=[prompt_dropdown], outputs=[output_text])
-    clear_btn.click(fn=clear_context, outputs=[upload_status])

     demo.launch(debug=True)
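The removed bot_streaming function above and the new chat_respond function shown below share the same payload convention: each user turn is a content list with one text part (the selected prompt plus any extracted document text) and, when a page image is available, one image_url part whose URL is a base64-encoded PNG data URI. Below is a minimal sketch of that construction, assuming an OpenAI-compatible client such as the OpenRouter-backed one this app appears to use; the helper name build_multimodal_message, the placeholder credentials, and the blank test image are illustrative only, not part of the commit.

import base64
import io
from typing import Optional

from PIL import Image
from openai import OpenAI  # assumption: an OpenAI-compatible client, e.g. pointed at OpenRouter


def build_multimodal_message(prompt: str, image: Optional[Image.Image], doc_text: str = "") -> dict:
    """Build one user message: a text part plus an optional base64 data-URI image part."""
    if doc_text:
        prompt = prompt + "\nDocument context:\n" + doc_text
    content = [{"type": "text", "text": prompt}]
    if image is not None:
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")  # PNG-encode the PIL image in memory
        img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{img_b64}"},
        })
    return {"role": "user", "content": content}


if __name__ == "__main__":
    # Hypothetical usage; the base URL, key, and model id are placeholders, not the app's config.
    client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key="YOUR_KEY")
    page = Image.new("RGB", (512, 512), "white")  # stand-in for a rendered PDF page
    messages = [build_multimodal_message("Describe this page.", page)]
    stream = client.chat.completions.create(model="qwen/qwen-vl-plus:free",
                                            messages=messages, max_tokens=500, stream=True)
    reply = ""
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:  # streamed chunks may carry empty deltas
            reply += delta
    print(reply)

Guarding the streamed delta against None is a small defensive choice in this sketch; providers can emit chunks with empty deltas.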
 
@@ -96,7 +96,7 @@ def process_uploaded_file(file):
             doc_state.doc_type = 'pdf'
             try:
                 doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
+                return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now chat with the bot."
             except Exception as e:
                 return f"Error processing PDF: {str(e)}. Please try a different PDF file."
         elif file_ext in image_extensions:
@@ -109,7 +109,7 @@ def process_uploaded_file(file):
                     new_size = tuple(int(dim * ratio) for dim in img.size)
                     img = img.resize(new_size, Image.Resampling.LANCZOS)
                 doc_state.current_doc_images = [img]
+                return "Image loaded successfully. You can now chat with the bot."
             except Exception as e:
                 return f"Error processing image: {str(e)}. Please try a different image file."
         else:
@@ -118,161 +118,106 @@ def process_uploaded_file(file):
         logger.error(f"Error in process_uploaded_file: {str(e)}")
         return "An error occurred while processing the file. Please try again."

+def clear_context():
+    """Clear the current document context and chat history."""
+    doc_state.clear()
+    return "Document context cleared. You can upload a new document.", []
+
 # -------------------------------
+# Predetermined Prompts
 # -------------------------------
+predetermined_prompts = {
+
+    "Software Tester": (
+        "Act as a software tester. Analyze the uploaded image of a software interface and generate comprehensive "
+        "test cases for its features. For each feature, provide test steps, expected results, and any necessary "
+        "preconditions. Be as detailed as possible."
+    )
 }

+# -------------------------------
+# Chat Function with Streaming and Conversation History
+# -------------------------------
+def chat_respond(user_message, history, prompt_option):
+    """
+    Append the user message (or, if starting a new conversation and no message is provided,
+    use the predetermined prompt) to the conversation history; build the API call using
+    the full conversation history (and the image if available); stream back the assistant response
+    while updating the history.
+
+    The history is a list of [user_text, assistant_text] pairs.
+    """
+    # If this is the first message, add the predetermined prompt text.
+    if history == []:
+        # If user_message is empty, use the predetermined prompt.
+        if not user_message.strip():
+            user_message = predetermined_prompts.get(prompt_option, "Hello")
+        else:
+            # Optionally, prepend the predetermined prompt.
+            user_message = predetermined_prompts.get(prompt_option, "") + "\n" + user_message
+
+    # Append the new user message with an empty assistant response.
+    history = history + [[user_message, ""]]
+
+    # Build the messages list (for the multimodal API) from the conversation history.
+    messages = []
+    for i, (user_msg, assistant_msg) in enumerate(history):
+        # For the user message:
+        user_content = [{"type": "text", "text": user_msg}]
+        # For the very first user message, if an image was uploaded, append the image.
+        if i == 0 and doc_state.current_doc_images:
             buffered = io.BytesIO()
             doc_state.current_doc_images[0].save(buffered, format="PNG")
             img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
             data_uri = f"data:image/png;base64,{img_b64}"
+            user_content.append({
                 "type": "image_url",
                 "image_url": {"url": data_uri}
             })
+        messages.append({"role": "user", "content": user_content})
+        # For the assistant response, if available.
+        if assistant_msg:
+            messages.append({
+                "role": "assistant",
+                "content": [{"type": "text", "text": assistant_msg}]
+            })
+
+    # Call the inference API with streaming enabled.
+    try:
         stream = client.chat.completions.create(
+            model="google/gemini-2.0-pro-exp-02-05:free",
             messages=messages,
+            max_tokens=8192,
             stream=True
         )
     except Exception as e:
+        logger.error(f"Error calling the API: {str(e)}")
+        history[-1][1] = "An error occurred while processing your request. Please try again."
+        yield history, history
+
+    # Stream and update the assistant's reply token by token.
+    buffer = ""
+    for chunk in stream:
+        delta = chunk.choices[0].delta.content
+        buffer += delta
+        # Update the assistant part of the latest message in the history.
+        history[-1][1] = buffer
+        # Yield the updated chat history (for the Chatbot component) and the state.
+        yield history, history
+        time.sleep(0.01)
+
+    return history, history

 # -------------------------------
 # Create the Gradio Interface
 # -------------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# Document Analyzer & Software Testing Chatbot")
+    gr.Markdown(
+        "Upload a PDF or an image (PNG, JPG, JPEG, GIF, BMP, WEBP). Then choose a prompt from the dropdown. "
+        "For example, select **Software Tester** to have the bot analyze an image of a software interface "
+        "and generate test cases. Chat with the bot in the conversation below."
+    )

     with gr.Row():
         file_upload = gr.File(
@@ -284,16 +229,32 @@ with gr.Blocks() as demo:
     with gr.Row():
         prompt_dropdown = gr.Dropdown(
             label="Select Prompt",
+            choices=[
+                "Software Tester"
+            ],
+            value="Software Tester"
         )
+        clear_btn = gr.Button("Clear Document Context & Chat History")
+
+    chatbot = gr.Chatbot(label="Chat History", elem_id="chatbot")

+    with gr.Row():
+        user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...", show_label=False)
+        send_btn = gr.Button("Send")

+    # State to hold the conversation history
+    chat_state = gr.State([])

+    # When a file is uploaded, process it.
+    file_upload.change(fn=process_uploaded_file, inputs=file_upload, outputs=upload_status)
+
+    # Clear both the document context and chat history.
+    clear_btn.click(fn=clear_context, outputs=[upload_status, chat_state])
+
+    # When the user clicks Send, process the message and update the chat.
+    send_btn.click(fn=chat_respond,
+                   inputs=[user_input, chat_state, prompt_dropdown],
+                   outputs=[chatbot, chat_state],
+                   stream=True)
+
     demo.launch(debug=True)
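For readers unfamiliar with how the new chat wiring streams, the sketch below shows the same pattern in isolation: a generator handler appends the user turn to the history kept in gr.State, then yields partial (chatbot, state) updates as text arrives, which Gradio pushes to the gr.Chatbot component. The fake_stream generator is a stand-in for the client.chat.completions.create(..., stream=True) call; the component names mirror the diff, but the example is otherwise hypothetical and not the committed code.

import time

import gradio as gr


def fake_stream(text: str):
    """Stand-in for a streamed model response: yields the reply a few characters at a time."""
    reply = f"Echo: {text}"
    for i in range(0, len(reply), 4):
        time.sleep(0.01)
        yield reply[: i + 4]


def respond(user_message, history):
    """Generator handler: each yielded (chatbot, state) pair is streamed to the UI."""
    history = history + [[user_message, ""]]
    for partial in fake_stream(user_message):
        history[-1][1] = partial   # update the assistant half of the latest turn
        yield history, history     # first output feeds the Chatbot, second the State


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Chat History")
    chat_state = gr.State([])
    with gr.Row():
        user_input = gr.Textbox(placeholder="Type your message here...", show_label=False)
        send_btn = gr.Button("Send")
    # Gradio streams automatically because `respond` is a generator function.
    send_btn.click(fn=respond, inputs=[user_input, chat_state], outputs=[chatbot, chat_state])

if __name__ == "__main__":
    demo.launch()

Because the handler is a generator, Gradio streams each yield to the browser on its own; no extra flag is required on the .click() binding in this sketch.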