MilanM committed
Commit 2b49576 · verified · 1 parent: 65dde49

Update helper_functions/table_helper_functions.py

helper_functions/table_helper_functions.py CHANGED
@@ -1,4 +1,3 @@
-
 def process_with_llm(fields_to_process, prompt_template, inf_model, params, batch_size=10):
     """
     Process documents with LLM using a prompt template with dynamic field mapping.
@@ -14,6 +13,7 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batch_size=10):
     Returns:
         list: Processed results from the LLM
     """
+    import marimo as mo
     import time
     import re
 
@@ -97,28 +97,166 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batch_size=10):
     results = []
 
     # Process each batch
-    for i, batch in enumerate(batches):
-        start_time = time.time()
-
-        try:
-            # Use the provided inference model to generate responses
-            print(f"Sending batch {i+1} of {len(batches)} to model")
-
-            # Call the inference model with the batch of prompts and params
-            batch_results = inf_model.generate_text(prompt=batch, params=params)
-
-            results.extend(batch_results)
-
-        except Exception as e:
-            print(f"Error in batch {i+1}: {str(e)}")
-            continue
-
-        end_time = time.time()
-        inference_time = end_time - start_time
-        print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+    with mo.status.progress_bar(
+        total=len(batches),
+        title="Processing Batches",
+        subtitle=f"Processing {len(formatted_prompts)} prompts in {len(batches)} batches",
+        completion_title="Processing Complete",
+        completion_subtitle=f"Processed {len(formatted_prompts)} prompts successfully",
+        show_rate=True,
+        show_eta=True,
+        remove_on_exit=True
+    ) as progress:
+        for i, batch in enumerate(batches):
+            start_time = time.time()
+
+            try:
+                # Use the provided inference model to generate responses
+                print(f"Sending batch {i+1} of {len(batches)} to model")
+
+                # Call the inference model with the batch of prompts and params
+                batch_results = inf_model.generate_text(prompt=batch, params=params)
+
+                results.extend(batch_results)
+
+            except Exception as e:
+                print(f"Error in batch {i+1}: {str(e)}")
+                continue
+
+            end_time = time.time()
+            inference_time = end_time - start_time
+            print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+
+            # Update progress bar
+            progress.update(increment=1)
+
+        # Add 1 second delay on completion before removing
+        time.sleep(1)
 
     return results
 
+
+
+# def process_with_llm_no_progress_bar(fields_to_process, prompt_template, inf_model, params, batch_size=10):
+#     """
+#     Process documents with LLM using a prompt template with dynamic field mapping.
+#     Uses template fields to extract values from pre-standardized document fields.
+
+#     Args:
+#         fields_to_process (list): List of document dictionaries to process
+#         prompt_template (str): Template with {field_name} placeholders matching keys in documents
+#         inf_model: The inference model instance to use for generation
+#         params: Parameters to pass to the inference model
+#         batch_size (int): Number of documents to process per batch
+
+#     Returns:
+#         list: Processed results from the LLM
+#     """
+#     import time
+#     import re
+
+#     # Safety check for inputs
+#     if not fields_to_process or not inf_model:
+#         print("Missing required inputs")
+#         return []
+
+#     # Handle case where prompt_template is a dictionary (from UI components)
+#     if isinstance(prompt_template, dict) and 'value' in prompt_template:
+#         prompt_template = prompt_template['value']
+#     elif not isinstance(prompt_template, str):
+#         print(f"Invalid prompt template type: {type(prompt_template)}, expected string")
+#         return []
+
+#     # Extract field names from the prompt template using regex
+#     # This finds all strings between curly braces
+#     field_pattern = r'\{([^{}]+)\}'
+#     template_fields = re.findall(field_pattern, prompt_template)
+
+#     if not template_fields:
+#         print("No field placeholders found in template")
+#         return []
+
+#     # Create formatted prompts from the documents
+#     formatted_prompts = []
+#     for doc in fields_to_process:
+#         try:
+#             # Create a dictionary of field values to substitute
+#             field_values = {}
+
+#             for field in template_fields:
+#                 # Try direct match first
+#                 if field in doc:
+#                     field_values[field] = doc[field] if doc[field] is not None else ""
+#                 # If field contains periods (e.g., "data.title"), evaluate it
+#                 elif '.' in field:
+#                     try:
+#                         # Build a safe evaluation string
+#                         parts = field.split('.')
+#                         value = doc
+#                         for part in parts:
+#                             if isinstance(value, dict) and part in value:
+#                                 value = value[part]
+#                             else:
+#                                 value = None
+#                                 break
+#                         field_values[field] = value if value is not None else ""
+#                     except:
+#                         field_values[field] = ""
+#                 else:
+#                     # Default to empty string if field not found
+#                     field_values[field] = ""
+
+#             # Handle None values at the top level to ensure formatting works
+#             for key in field_values:
+#                 if field_values[key] is None:
+#                     field_values[key] = ""
+
+#             # Format the prompt with all available fields
+#             prompt = prompt_template.format(**field_values)
+#             formatted_prompts.append(prompt)
+
+#         except Exception as e:
+#             print(f"Error formatting prompt: {str(e)}")
+#             print(f"Field values: {field_values}")
+#             continue
+
+#     # Return empty list if no valid prompts
+#     if not formatted_prompts:
+#         print("No valid prompts generated")
+#         return []
+
+#     # Print a sample of the formatted prompts for debugging
+#     if formatted_prompts:
+#         print(f"Sample formatted prompt: {formatted_prompts[0][:200]}...")
+
+#     # Split into batches
+#     batches = [formatted_prompts[i:i + batch_size] for i in range(0, len(formatted_prompts), batch_size)]
+
+#     results = []
+
+#     # Process each batch
+#     for i, batch in enumerate(batches):
+#         start_time = time.time()
+
+#         try:
+#             # Use the provided inference model to generate responses
+#             print(f"Sending batch {i+1} of {len(batches)} to model")
+
+#             # Call the inference model with the batch of prompts and params
+#             batch_results = inf_model.generate_text(prompt=batch, params=params)
+
+#             results.extend(batch_results)
+
+#         except Exception as e:
+#             print(f"Error in batch {i+1}: {str(e)}")
+#             continue
+
+#         end_time = time.time()
+#         inference_time = end_time - start_time
+#         print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+
+#     return results
+
 def append_llm_results_to_dataframe(target_dataframe, fields_to_process, llm_results, selection_table, column_name=None):
     """
     Add LLM processing results directly to the target DataFrame using selection indices
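
For reference, a minimal sketch of how the updated helper might be exercised. The EchoModel stub and the example_ names below are invented for illustration; only process_with_llm comes from this file, and inf_model is assumed to be any client whose generate_text(prompt=<list of prompts>, params=...) returns one completion per prompt, which is the contract the code above relies on. The progress bar only renders when the call runs inside a marimo notebook cell.

    from helper_functions.table_helper_functions import process_with_llm

    class EchoModel:
        # Stand-in for the real inference client: returns one canned
        # completion per prompt, matching the call shape used above.
        def generate_text(self, prompt, params=None):
            return [f"[output for: {p[:40]}...]" for p in prompt]

    # Document keys must match the {placeholders} in the template.
    example_docs = [
        {"title": "Q1 report", "body": "Revenue rose 4 percent."},
        {"title": "Q2 report", "body": "Margins held steady."},
    ]
    example_template = "Summarize the document titled {title}:\n\n{body}"

    results = process_with_llm(
        fields_to_process=example_docs,
        prompt_template=example_template,
        inf_model=EchoModel(),
        params={"max_new_tokens": 200},  # passed through to the model untouched
        batch_size=10,
    )
    print(results)  # one generated string per input document

Two details of the committed change worth noting when reusing this pattern: remove_on_exit=True clears the bar as soon as the with block exits, which is why a time.sleep(1) sits after the loop so the completion state stays visible for a moment; and because continue on a failed batch skips progress.update(increment=1), the bar finishes short of its total whenever a batch errors.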