Update helper_functions/table_helper_functions.py
helper_functions/table_helper_functions.py
CHANGED
@@ -1,4 +1,3 @@
-
 def process_with_llm(fields_to_process, prompt_template, inf_model, params, batch_size=10):
     """
     Process documents with LLM using a prompt template with dynamic field mapping.
@@ -14,6 +13,7 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
     Returns:
         list: Processed results from the LLM
     """
+    import marimo as mo
     import time
     import re
 
@@ -97,28 +97,166 @@ def process_with_llm(fields_to_process, prompt_template, inf_model, params, batc
     results = []
 
     # Process each batch
-    for i, batch in enumerate(batches):
-        start_time = time.time()
-
-        try:
-            # Use the provided inference model to generate responses
-            print(f"Sending batch {i+1} of {len(batches)} to model")
-
-            # Call the inference model with the batch of prompts and params
-            batch_results = inf_model.generate_text(prompt=batch, params=params)
-
-            results.extend(batch_results)
-
-        except Exception as e:
-            print(f"Error in batch {i+1}: {str(e)}")
-            continue
-
-        end_time = time.time()
-        inference_time = end_time - start_time
-        print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+    with mo.status.progress_bar(
+        total=len(batches),
+        title="Processing Batches",
+        subtitle=f"Processing {len(formatted_prompts)} prompts in {len(batches)} batches",
+        completion_title="Processing Complete",
+        completion_subtitle=f"Processed {len(formatted_prompts)} prompts successfully",
+        show_rate=True,
+        show_eta=True,
+        remove_on_exit=True
+    ) as progress:
+        for i, batch in enumerate(batches):
+            start_time = time.time()
+
+            try:
+                # Use the provided inference model to generate responses
+                print(f"Sending batch {i+1} of {len(batches)} to model")
+
+                # Call the inference model with the batch of prompts and params
+                batch_results = inf_model.generate_text(prompt=batch, params=params)
+
+                results.extend(batch_results)
+
+            except Exception as e:
+                print(f"Error in batch {i+1}: {str(e)}")
+                continue
+
+            end_time = time.time()
+            inference_time = end_time - start_time
+            print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+
+            # Update progress bar
+            progress.update(increment=1)
+
+        # Add 1 second delay on completion before removing
+        time.sleep(1)
 
     return results
 
+
+
+# def process_with_llm_no_progress_bar(fields_to_process, prompt_template, inf_model, params, batch_size=10):
+#     """
+#     Process documents with LLM using a prompt template with dynamic field mapping.
+#     Uses template fields to extract values from pre-standardized document fields.
+
+#     Args:
+#         fields_to_process (list): List of document dictionaries to process
+#         prompt_template (str): Template with {field_name} placeholders matching keys in documents
+#         inf_model: The inference model instance to use for generation
+#         params: Parameters to pass to the inference model
+#         batch_size (int): Number of documents to process per batch
+
+#     Returns:
+#         list: Processed results from the LLM
+#     """
+#     import time
+#     import re
+
+#     # Safety check for inputs
+#     if not fields_to_process or not inf_model:
+#         print("Missing required inputs")
+#         return []
+
+#     # Handle case where prompt_template is a dictionary (from UI components)
+#     if isinstance(prompt_template, dict) and 'value' in prompt_template:
+#         prompt_template = prompt_template['value']
+#     elif not isinstance(prompt_template, str):
+#         print(f"Invalid prompt template type: {type(prompt_template)}, expected string")
+#         return []
+
+#     # Extract field names from the prompt template using regex
+#     # This finds all strings between curly braces
+#     field_pattern = r'\{([^{}]+)\}'
+#     template_fields = re.findall(field_pattern, prompt_template)
+
+#     if not template_fields:
+#         print("No field placeholders found in template")
+#         return []
+
+#     # Create formatted prompts from the documents
+#     formatted_prompts = []
+#     for doc in fields_to_process:
+#         try:
+#             # Create a dictionary of field values to substitute
+#             field_values = {}
+
+#             for field in template_fields:
+#                 # Try direct match first
+#                 if field in doc:
+#                     field_values[field] = doc[field] if doc[field] is not None else ""
+#                 # If field contains periods (e.g., "data.title"), evaluate it
+#                 elif '.' in field:
+#                     try:
+#                         # Build a safe evaluation string
+#                         parts = field.split('.')
+#                         value = doc
+#                         for part in parts:
+#                             if isinstance(value, dict) and part in value:
+#                                 value = value[part]
+#                             else:
+#                                 value = None
+#                                 break
+#                         field_values[field] = value if value is not None else ""
+#                     except:
+#                         field_values[field] = ""
+#                 else:
+#                     # Default to empty string if field not found
+#                     field_values[field] = ""
+
+#             # Handle None values at the top level to ensure formatting works
+#             for key in field_values:
+#                 if field_values[key] is None:
+#                     field_values[key] = ""
+
+#             # Format the prompt with all available fields
+#             prompt = prompt_template.format(**field_values)
+#             formatted_prompts.append(prompt)
+
+#         except Exception as e:
+#             print(f"Error formatting prompt: {str(e)}")
+#             print(f"Field values: {field_values}")
+#             continue
+
+#     # Return empty list if no valid prompts
+#     if not formatted_prompts:
+#         print("No valid prompts generated")
+#         return []
+
+#     # Print a sample of the formatted prompts for debugging
+#     if formatted_prompts:
+#         print(f"Sample formatted prompt: {formatted_prompts[0][:200]}...")
+
+#     # Split into batches
+#     batches = [formatted_prompts[i:i + batch_size] for i in range(0, len(formatted_prompts), batch_size)]
+
+#     results = []
+
+#     # Process each batch
+#     for i, batch in enumerate(batches):
+#         start_time = time.time()
+
+#         try:
+#             # Use the provided inference model to generate responses
+#             print(f"Sending batch {i+1} of {len(batches)} to model")
+
+#             # Call the inference model with the batch of prompts and params
+#             batch_results = inf_model.generate_text(prompt=batch, params=params)
+
+#             results.extend(batch_results)
+
+#         except Exception as e:
+#             print(f"Error in batch {i+1}: {str(e)}")
+#             continue
+
+#         end_time = time.time()
+#         inference_time = end_time - start_time
+#         print(f"Inference time for Batch {i+1}: {inference_time:.2f} seconds")
+
+#     return results
+
 def append_llm_results_to_dataframe(target_dataframe, fields_to_process, llm_results, selection_table, column_name=None):
     """
     Add LLM processing results directly to the target DataFrame using selection indices
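Usage note: the committed change only wraps the existing batch loop in a marimo progress bar (mo.status.progress_bar used as a context manager, progress.update(increment=1) after each batch, and remove_on_exit=True plus a final time.sleep(1) so the completed bar stays visible for a second before it is cleared); the inference call itself is unchanged. The sketch below shows one way the updated helper might be exercised from a marimo notebook cell. It is only a sketch: EchoModel, the sample documents, and the params dict are hypothetical, the import assumes the helper_functions package is importable from the working directory, and the only behavior taken from the diff is that generate_text(prompt=batch, params=params) receives a list of formatted prompts per batch and returns a list of completions.

from helper_functions.table_helper_functions import process_with_llm  # assumed importable


class EchoModel:
    """Hypothetical stand-in for the real inference model."""

    def generate_text(self, prompt, params=None):
        # The helper sends a list of formatted prompts per batch and extends its
        # results with the return value, so return one string per prompt.
        return [f"echo: {p[:60]}" for p in prompt]


docs = [
    {"title": "Quarterly report", "body": "Revenue grew 4% quarter over quarter."},
    {"title": "Incident summary", "body": "A config rollback resolved the outage."},
]

results = process_with_llm(
    docs,
    "Summarize the document titled '{title}': {body}",
    inf_model=EchoModel(),
    params={"max_new_tokens": 64},  # hypothetical; passed through to generate_text unchanged
    batch_size=1,
)
print(results)  # one echoed completion per document

Because the progress bar comes from mo.status.progress_bar, this is intended to run inside a marimo notebook; behavior outside a running notebook depends on how marimo renders status elements there.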