Ali2206 committed on
Commit
b20bb52
·
verified ·
1 Parent(s): 67af08d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -4
app.py CHANGED
@@ -9,7 +9,7 @@ MAX_MODEL_TOKENS = 131072
9
  MAX_NEW_TOKENS = 4096
10
  MAX_CHUNK_TOKENS = 8192
11
  PROMPT_OVERHEAD = 300
12
- BATCH_SIZE = 3 # group 3 chunks together
13
 
14
  # Paths
15
  persistent_dir = "/data/hf_cache"
@@ -43,9 +43,10 @@ def extract_text_from_excel(path: str) -> str:
43
  all_text = []
44
  xls = pd.ExcelFile(path)
45
  for sheet in xls.sheet_names:
46
- df = xls.parse(sheet).astype(str).fillna("")
 
47
  rows = df.apply(lambda row: " | ".join(row), axis=1)
48
- all_text += [f"[{sheet}] {line}" for line in rows]
49
  return "\n".join(all_text)
50
 
51
  def split_text(text: str, max_tokens=MAX_CHUNK_TOKENS) -> List[str]:
@@ -83,7 +84,7 @@ def init_agent() -> TxAgent:
83
  agent.init_model()
84
  return agent
85
 
86
- # Serial processing (safe for vLLM)
87
  def analyze_serial(agent, batch_chunks: List[List[str]]) -> List[str]:
88
  results = []
89
  for idx, batch in enumerate(batch_chunks):
 
9
  MAX_NEW_TOKENS = 4096
10
  MAX_CHUNK_TOKENS = 8192
11
  PROMPT_OVERHEAD = 300
12
+ BATCH_SIZE = 10 # Bigger batch for faster processing
13
 
14
  # Paths
15
  persistent_dir = "/data/hf_cache"
 
43
  all_text = []
44
  xls = pd.ExcelFile(path)
45
  for sheet in xls.sheet_names:
46
+ df = xls.parse(sheet).astype(str).fillna("").drop_duplicates()
47
+ df = df[~df.apply(lambda x: x.str.len().le(5)).any(axis=1)] # remove very short rows
48
  rows = df.apply(lambda row: " | ".join(row), axis=1)
49
+ all_text += [f"[{sheet}] {line}" for line in rows if line.strip()]
50
  return "\n".join(all_text)
51
 
52
  def split_text(text: str, max_tokens=MAX_CHUNK_TOKENS) -> List[str]:
 
84
  agent.init_model()
85
  return agent
86
 
87
+ # Serial analyze (safe for vLLM)
88
  def analyze_serial(agent, batch_chunks: List[List[str]]) -> List[str]:
89
  results = []
90
  for idx, batch in enumerate(batch_chunks):