Ali2206 committed on
Commit
f260d4a
·
verified ·
1 Parent(s): 9a0b74b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -49
app.py CHANGED
@@ -32,8 +32,10 @@ sys.path.insert(0, src_path)
32
  from txagent.txagent import TxAgent
33
 
34
  # Constants
35
- MAX_TOKENS = 32768
36
- MAX_NEW_TOKENS = 2048
 
 
37
 
38
  def clean_response(text: str) -> str:
39
  try:
@@ -46,40 +48,56 @@ def clean_response(text: str) -> str:
46
  return text.strip()
47
 
48
  def estimate_tokens(text: str) -> int:
49
- return len(text) // 3.5
 
50
 
51
  def extract_text_from_excel(file_path: str) -> str:
 
52
  all_text = []
53
- xls = pd.ExcelFile(file_path)
54
- for sheet_name in xls.sheet_names:
55
- df = xls.parse(sheet_name)
56
- df = df.astype(str).fillna("")
57
- rows = df.apply(lambda row: " | ".join(row), axis=1)
58
- sheet_text = [f"[{sheet_name}] {line}" for line in rows]
59
- all_text.extend(sheet_text)
 
 
 
60
  return "\n".join(all_text)
61
 
62
- def split_text_into_chunks(text: str, max_tokens: int = MAX_TOKENS) -> List[str]:
 
 
 
 
 
 
 
 
63
  lines = text.split("\n")
64
  chunks = []
65
  current_chunk = []
66
  current_tokens = 0
67
 
68
  for line in lines:
69
- tokens = estimate_tokens(line)
70
- if current_tokens + tokens > max_tokens:
71
- chunks.append("\n".join(current_chunk))
 
72
  current_chunk = [line]
73
- current_tokens = tokens
74
  else:
75
  current_chunk.append(line)
76
- current_tokens += tokens
77
 
78
  if current_chunk:
79
  chunks.append("\n".join(current_chunk))
 
80
  return chunks
81
 
82
  def build_prompt_from_text(chunk: str) -> str:
 
83
  return f"""
84
  ### Unstructured Clinical Records
85
 
@@ -100,6 +118,7 @@ Please analyze the above and provide:
100
  """
101
 
102
  def init_agent():
 
103
  default_tool_path = os.path.abspath("data/new_tool.json")
104
  target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
105
 
@@ -120,6 +139,7 @@ def init_agent():
120
  return agent
121
 
122
  def process_final_report(agent, file, chatbot_state: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], Union[str, None]]:
 
123
  messages = chatbot_state if chatbot_state else []
124
  report_path = None
125
 
@@ -131,61 +151,118 @@ def process_final_report(agent, file, chatbot_state: List[Dict[str, str]]) -> Tu
131
  messages.append({"role": "user", "content": f"Processing Excel file: {os.path.basename(file.name)}"})
132
  messages.append({"role": "assistant", "content": "⏳ Extracting and analyzing data..."})
133
 
 
134
  extracted_text = extract_text_from_excel(file.name)
135
- chunks = split_text_into_chunks(extracted_text)
136
  chunk_responses = []
137
 
 
138
  for i, chunk in enumerate(chunks):
139
  messages.append({"role": "assistant", "content": f"🔍 Analyzing chunk {i+1}/{len(chunks)}..."})
140
 
141
  prompt = build_prompt_from_text(chunk)
 
 
 
 
 
142
  response = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  for result in agent.run_gradio_chat(
144
- message=prompt,
145
  history=[],
146
  temperature=0.2,
147
  max_new_tokens=MAX_NEW_TOKENS,
148
- max_token=MAX_TOKENS,
149
  call_agent=False,
150
  conversation=[],
151
  ):
152
  if isinstance(result, str):
153
- response += result
154
  elif hasattr(result, "content"):
155
- response += result.content
156
  elif isinstance(result, list):
157
  for r in result:
158
  if hasattr(r, "content"):
159
- response += r.content
160
-
161
- chunk_responses.append(clean_response(response))
162
- messages.append({"role": "assistant", "content": f"✅ Chunk {i+1} analysis complete"})
163
-
164
- final_prompt = "\n\n".join(chunk_responses) + "\n\nSummarize the key findings above."
165
- messages.append({"role": "assistant", "content": "📊 Generating final report..."})
166
 
167
- stream_text = ""
168
- for result in agent.run_gradio_chat(
169
- message=final_prompt,
170
- history=[],
171
- temperature=0.2,
172
- max_new_tokens=MAX_NEW_TOKENS,
173
- max_token=MAX_TOKENS,
174
- call_agent=False,
175
- conversation=[],
176
- ):
177
- if isinstance(result, str):
178
- stream_text += result
179
- elif hasattr(result, "content"):
180
- stream_text += result.content
181
- elif isinstance(result, list):
182
- for r in result:
183
- if hasattr(r, "content"):
184
- stream_text += r.content
185
-
186
- final_report = f"# \U0001f9e0 Final Patient Report\n\n{clean_response(stream_text)}"
187
- messages[-1]["content"] = f"📊 Final Report:\n\n{clean_response(stream_text)}"
188
 
 
189
  timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
190
  report_path = os.path.join(report_dir, f"report_{timestamp}.md")
191
 
@@ -200,6 +277,7 @@ def process_final_report(agent, file, chatbot_state: List[Dict[str, str]]) -> Tu
200
  return messages, report_path
201
 
202
  def create_ui(agent):
 
203
  with gr.Blocks(title="Patient History Chat", css=".gradio-container {max-width: 900px !important}") as demo:
204
  gr.Markdown("## 🏥 Patient History Analysis Tool")
205
 
 
32
  from txagent.txagent import TxAgent
33
 
34
  # Constants
35
+ MAX_MODEL_TOKENS = 32768 # Model's maximum sequence length
36
+ MAX_CHUNK_TOKENS = 8192 # Chunk size aligned with max_num_batched_tokens
37
+ MAX_NEW_TOKENS = 2048 # Maximum tokens for generation
38
+ PROMPT_OVERHEAD = 500 # Estimated tokens for prompt template overhead
39
 
40
  def clean_response(text: str) -> str:
41
  try:
 
48
  return text.strip()
49
 
50
  def estimate_tokens(text: str) -> int:
51
+ """Estimate the number of tokens based on character length."""
52
+ return len(text) // 3.5 + 1 # Add 1 to avoid zero estimates
53
 
54
  def extract_text_from_excel(file_path: str) -> str:
55
+ """Extract text from all sheets in an Excel file."""
56
  all_text = []
57
+ try:
58
+ xls = pd.ExcelFile(file_path)
59
+ for sheet_name in xls.sheet_names:
60
+ df = xls.parse(sheet_name)
61
+ df = df.astype(str).fillna("")
62
+ rows = df.apply(lambda row: " | ".join(row), axis=1)
63
+ sheet_text = [f"[{sheet_name}] {line}" for line in rows]
64
+ all_text.extend(sheet_text)
65
+ except Exception as e:
66
+ raise ValueError(f"Failed to extract text from Excel file: {str(e)}")
67
  return "\n".join(all_text)
68
 
69
+ def split_text_into_chunks(text: str, max_tokens: int = MAX_CHUNK_TOKENS) -> List[str]:
70
+ """
71
+ Split text into chunks, ensuring each chunk is within token limits,
72
+ accounting for prompt overhead.
73
+ """
74
+ effective_max_tokens = max_tokens - PROMPT_OVERHEAD
75
+ if effective_max_tokens <= 0:
76
+ raise ValueError(f"Effective max tokens ({effective_max_tokens}) must be positive.")
77
+
78
  lines = text.split("\n")
79
  chunks = []
80
  current_chunk = []
81
  current_tokens = 0
82
 
83
  for line in lines:
84
+ line_tokens = estimate_tokens(line)
85
+ if current_tokens + line_tokens > effective_max_tokens:
86
+ if current_chunk: # Save the current chunk if it's not empty
87
+ chunks.append("\n".join(current_chunk))
88
  current_chunk = [line]
89
+ current_tokens = line_tokens
90
  else:
91
  current_chunk.append(line)
92
+ current_tokens += line_tokens
93
 
94
  if current_chunk:
95
  chunks.append("\n".join(current_chunk))
96
+
97
  return chunks
98
 
99
  def build_prompt_from_text(chunk: str) -> str:
100
+ """Build a prompt for analyzing a chunk of clinical data."""
101
  return f"""
102
  ### Unstructured Clinical Records
103
 
 
118
  """
119
 
120
  def init_agent():
121
+ """Initialize the TxAgent with model and tool configurations."""
122
  default_tool_path = os.path.abspath("data/new_tool.json")
123
  target_tool_path = os.path.join(tool_cache_dir, "new_tool.json")
124
 
 
139
  return agent
140
 
141
  def process_final_report(agent, file, chatbot_state: List[Dict[str, str]]) -> Tuple[List[Dict[str, str]], Union[str, None]]:
142
+ """Process the Excel file and generate a final report."""
143
  messages = chatbot_state if chatbot_state else []
144
  report_path = None
145
 
 
151
  messages.append({"role": "user", "content": f"Processing Excel file: {os.path.basename(file.name)}"})
152
  messages.append({"role": "assistant", "content": "⏳ Extracting and analyzing data..."})
153
 
154
+ # Extract text and split into chunks
155
  extracted_text = extract_text_from_excel(file.name)
156
+ chunks = split_text_into_chunks(extracted_text, max_tokens=MAX_CHUNK_TOKENS)
157
  chunk_responses = []
158
 
159
+ # Process each chunk
160
  for i, chunk in enumerate(chunks):
161
  messages.append({"role": "assistant", "content": f"🔍 Analyzing chunk {i+1}/{len(chunks)}..."})
162
 
163
  prompt = build_prompt_from_text(chunk)
164
+ prompt_tokens = estimate_tokens(prompt)
165
+ if prompt_tokens > MAX_MODEL_TOKENS:
166
+ messages.append({"role": "assistant", "content": f"❌ Chunk {i+1} prompt too long ({prompt_tokens} tokens). Skipping..."})
167
+ continue
168
+
169
  response = ""
170
+ try:
171
+ for result in agent.run_gradio_chat(
172
+ message=prompt,
173
+ history=[],
174
+ temperature=0.2,
175
+ max_new_tokens=MAX_NEW_TOKENS,
176
+ max_token=MAX_MODEL_TOKENS,
177
+ call_agent=False,
178
+ conversation=[],
179
+ ):
180
+ if isinstance(result, str):
181
+ response += result
182
+ elif hasattr(result, "content"):
183
+ response += result.content
184
+ elif isinstance(result, list):
185
+ for r in result:
186
+ if hasattr(r, "content"):
187
+ response += r.content
188
+ except Exception as e:
189
+ messages.append({"role": "assistant", "content": f"❌ Error analyzing chunk {i+1}: {str(e)}"})
190
+ continue
191
+
192
+ chunk_responses.append(clean_response(response))
193
+ messages.append({"role": "assistant", "content": f"✅ Chunk {i+1} analysis complete"})
194
+
195
+ if not chunk_responses:
196
+ messages.append({"role": "assistant", "content": "❌ No valid chunk responses to summarize."})
197
+ return messages, report_path
198
+
199
+ # Summarize chunk responses incrementally to avoid token limit
200
+ summary = ""
201
+ current_summary_tokens = 0
202
+ for i, response in enumerate(chunk_responses):
203
+ response_tokens = estimate_tokens(response)
204
+ if current_summary_tokens + response_tokens > MAX_MODEL_TOKENS - PROMPT_OVERHEAD - MAX_NEW_TOKENS:
205
+ # Summarize current summary
206
+ summary_prompt = f"Summarize the following analysis:\n\n{summary}\n\nProvide a concise summary."
207
+ summary_response = ""
208
+ try:
209
+ for result in agent.run_gradio_chat(
210
+ message=summary_prompt,
211
+ history=[],
212
+ temperature=0.2,
213
+ max_new_tokens=MAX_NEW_TOKENS,
214
+ max_token=MAX_MODEL_TOKENS,
215
+ call_agent=False,
216
+ conversation=[],
217
+ ):
218
+ if isinstance(result, str):
219
+ summary_response += result
220
+ elif hasattr(result, "content"):
221
+ summary_response += result.content
222
+ elif isinstance(result, list):
223
+ for r in result:
224
+ if hasattr(r, "content"):
225
+ summary_response += r.content
226
+ summary = clean_response(summary_response)
227
+ current_summary_tokens = estimate_tokens(summary)
228
+ except Exception as e:
229
+ messages.append({"role": "assistant", "content": f"❌ Error summarizing intermediate results: {str(e)}"})
230
+ return messages, report_path
231
+
232
+ summary += f"\n\n### Chunk {i+1} Analysis\n{response}"
233
+ current_summary_tokens += response_tokens
234
+
235
+ # Final summarization
236
+ final_prompt = f"Summarize the key findings from the following analyses:\n\n{summary}"
237
+ messages.append({"role": "assistant", "content": "📊 Generating final report..."})
238
+
239
+ final_report_text = ""
240
+ try:
241
  for result in agent.run_gradio_chat(
242
+ message=final_prompt,
243
  history=[],
244
  temperature=0.2,
245
  max_new_tokens=MAX_NEW_TOKENS,
246
+ max_token=MAX_MODEL_TOKENS,
247
  call_agent=False,
248
  conversation=[],
249
  ):
250
  if isinstance(result, str):
251
+ final_report_text += result
252
  elif hasattr(result, "content"):
253
+ final_report_text += result.content
254
  elif isinstance(result, list):
255
  for r in result:
256
  if hasattr(r, "content"):
257
+ final_report_text += r.content
258
+ except Exception as e:
259
+ messages.append({"role": "assistant", "content": f"❌ Error generating final report: {str(e)}"})
260
+ return messages, report_path
 
 
 
261
 
262
+ final_report = f"# \U0001f9e0 Final Patient Report\n\n{clean_response(final_report_text)}"
263
+ messages[-1]["content"] = f"📊 Final Report:\n\n{clean_response(final_report_text)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
+ # Save the report
266
  timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
267
  report_path = os.path.join(report_dir, f"report_{timestamp}.md")
268
 
 
277
  return messages, report_path
278
 
279
  def create_ui(agent):
280
+ """Create the Gradio UI for the patient history analysis tool."""
281
  with gr.Blocks(title="Patient History Chat", css=".gradio-container {max-width: 900px !important}") as demo:
282
  gr.Markdown("## 🏥 Patient History Analysis Tool")
283