Update helper_functions/table_helper_functions.py
helper_functions/table_helper_functions.py
CHANGED
@@ -1,4 +1,3 @@
-
 def process_with_llm(fields_to_process, prompt_template, inf_model, params, batch_size=10):
     """
     Process documents with LLM using a prompt template with dynamic field mapping.
@@ -14,6 +13,7 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
     Returns:
         list: Processed results from the LLM
     """
+    import marimo as mo
     import time
     import re
 
@@ -97,28 +97,166 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
     results = []
 
     # Process each batch
-    for i, batch in enumerate(batches):
-        start_time = time.time()
-
-        try:
-            # Use the provided inference model to generate responses
-            print(f"Sending batch {i+1} of {len(batches)} to model")
-
-            # Call the inference model with the batch of prompts and params
-            batch_results = inf_model.generate_text(prompt=batch, params=params)
-
-            results.extend(batch_results)
-
-        except Exception as e:
-            print(f"Error in batch {i+1}: {str(e)}")
-            continue
-
-        end_time = time.time()
-        inference_time = end_time - start_time
-        print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+    with mo.status.progress_bar(
+        total=len(batches),
+        title="Processing Batches",
+        subtitle=f"Processing {len(formatted_prompts)} prompts in {len(batches)} batches",
+        completion_title="Processing Complete",
+        completion_subtitle=f"Processed {len(formatted_prompts)} prompts successfully",
+        show_rate=True,
+        show_eta=True,
+        remove_on_exit=True
+    ) as progress:
+        for i, batch in enumerate(batches):
+            start_time = time.time()
+
+            try:
+                # Use the provided inference model to generate responses
+                print(f"Sending batch {i+1} of {len(batches)} to model")
+
+                # Call the inference model with the batch of prompts and params
+                batch_results = inf_model.generate_text(prompt=batch, params=params)
+
+                results.extend(batch_results)
+
+            except Exception as e:
+                print(f"Error in batch {i+1}: {str(e)}")
+                continue
+
+            end_time = time.time()
+            inference_time = end_time - start_time
+            print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+
+            # Update progress bar
+            progress.update(increment=1)
+
+        # Add 1 second delay on completion before removing
+        time.sleep(1)
 
     return results
 
+
+
+# def process_with_llm_no_progress_bar(fields_to_process, prompt_template, inf_model, params, batch_size=10):
+#     """
+#     Process documents with LLM using a prompt template with dynamic field mapping.
+#     Uses template fields to extract values from pre-standardized document fields.
+
+#     Args:
+#         fields_to_process (list): List of document dictionaries to process
+#         prompt_template (str): Template with {field_name} placeholders matching keys in documents
+#         inf_model: The inference model instance to use for generation
+#         params: Parameters to pass to the inference model
+#         batch_size (int): Number of documents to process per batch
+
+#     Returns:
+#         list: Processed results from the LLM
+#     """
+#     import time
+#     import re
+
+#     # Safety check for inputs
+#     if not fields_to_process or not inf_model:
+#         print("Missing required inputs")
+#         return []
+
+#     # Handle case where prompt_template is a dictionary (from UI components)
+#     if isinstance(prompt_template, dict) and 'value' in prompt_template:
+#         prompt_template = prompt_template['value']
+#     elif not isinstance(prompt_template, str):
+#         print(f"Invalid prompt template type: {type(prompt_template)}, expected string")
+#         return []
+
+#     # Extract field names from the prompt template using regex
+#     # This finds all strings between curly braces
+#     field_pattern = r'\{([^{}]+)\}'
+#     template_fields = re.findall(field_pattern, prompt_template)
+
+#     if not template_fields:
+#         print("No field placeholders found in template")
+#         return []
+
+#     # Create formatted prompts from the documents
+#     formatted_prompts = []
+#     for doc in fields_to_process:
+#         try:
+#             # Create a dictionary of field values to substitute
+#             field_values = {}
+
+#             for field in template_fields:
+#                 # Try direct match first
+#                 if field in doc:
+#                     field_values[field] = doc[field] if doc[field] is not None else ""
+#                 # If field contains periods (e.g., "data.title"), evaluate it
+#                 elif '.' in field:
+#                     try:
+#                         # Build a safe evaluation string
+#                         parts = field.split('.')
+#                         value = doc
+#                         for part in parts:
+#                             if isinstance(value, dict) and part in value:
+#                                 value = value[part]
+#                             else:
+#                                 value = None
+#                                 break
+#                         field_values[field] = value if value is not None else ""
+#                     except:
+#                         field_values[field] = ""
+#                 else:
+#                     # Default to empty string if field not found
+#                     field_values[field] = ""
+
+#             # Handle None values at the top level to ensure formatting works
+#             for key in field_values:
+#                 if field_values[key] is None:
+#                     field_values[key] = ""
+
+#             # Format the prompt with all available fields
+#             prompt = prompt_template.format(**field_values)
+#             formatted_prompts.append(prompt)
+
+#         except Exception as e:
+#             print(f"Error formatting prompt: {str(e)}")
+#             print(f"Field values: {field_values}")
+#             continue
+
+#     # Return empty list if no valid prompts
+#     if not formatted_prompts:
+#         print("No valid prompts generated")
+#         return []
+
+#     # Print a sample of the formatted prompts for debugging
+#     if formatted_prompts:
+#         print(f"Sample formatted prompt: {formatted_prompts[0][:200]}...")
+
+#     # Split into batches
+#     batches = [formatted_prompts[i:i + batch_size] for i in range(0, len(formatted_prompts), batch_size)]
+
+#     results = []
+
+#     # Process each batch
+#     for i, batch in enumerate(batches):
+#         start_time = time.time()
+
+#         try:
+#             # Use the provided inference model to generate responses
+#             print(f"Sending batch {i+1} of {len(batches)} to model")
+
+#             # Call the inference model with the batch of prompts and params
+#             batch_results = inf_model.generate_text(prompt=batch, params=params)
+
+#             results.extend(batch_results)
+
+#         except Exception as e:
+#             print(f"Error in batch {i+1}: {str(e)}")
+#             continue
+
+#         end_time = time.time()
+#         inference_time = end_time - start_time
+#         print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+
+#     return results
+
 def append_llm_results_to_dataframe(target_dataframe, fields_to_process, llm_results, selection_table, column_name=None):
     """
     Add LLM processing results directly to the target DataFrame using selection indices
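Usage note: the committed change only wraps the existing batch loop in a marimo progress bar (mo.status.progress_bar used as a context manager, progress.update(increment=1) after each batch, and remove_on_exit=True plus a final time.sleep(1) so the completed bar stays visible for a second before it is cleared); the inference call itself is unchanged. The sketch below shows one way the updated helper might be exercised from a marimo notebook cell. It is only a sketch: EchoModel, the sample documents, and the params dict are hypothetical, the import assumes the helper_functions package is importable from the working directory, and the only behavior taken from the diff is that generate_text(prompt=batch, params=params) receives a list of formatted prompts per batch and returns a list of completions.

from helper_functions.table_helper_functions import process_with_llm  # assumed importable


class EchoModel:
    """Hypothetical stand-in for the real inference model."""

    def generate_text(self, prompt, params=None):
        # The helper sends a list of formatted prompts per batch and extends its
        # results with the return value, so return one string per prompt.
        return [f"echo: {p[:60]}" for p in prompt]


docs = [
    {"title": "Quarterly report", "body": "Revenue grew 4% quarter over quarter."},
    {"title": "Incident summary", "body": "A config rollback resolved the outage."},
]

results = process_with_llm(
    docs,
    "Summarize the document titled '{title}': {body}",
    inf_model=EchoModel(),
    params={"max_new_tokens": 64},  # hypothetical; passed through to generate_text unchanged
    batch_size=1,
)
print(results)  # one echoed completion per document

Because the progress bar comes from mo.status.progress_bar, this is intended to run inside a marimo notebook; behavior outside a running notebook depends on how marimo renders status elements there.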