Update app.py
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ MAX_MODEL_TOKENS = 131072
|
|
9 |
MAX_NEW_TOKENS = 4096
|
10 |
MAX_CHUNK_TOKENS = 8192
|
11 |
PROMPT_OVERHEAD = 300
|
12 |
-
BATCH_SIZE =
|
13 |
|
14 |
# Paths
|
15 |
persistent_dir = "/data/hf_cache"
|
@@ -43,9 +43,10 @@ def extract_text_from_excel(path: str) -> str:
|
|
43 |
all_text = []
|
44 |
xls = pd.ExcelFile(path)
|
45 |
for sheet in xls.sheet_names:
|
46 |
-
df = xls.parse(sheet).astype(str).fillna("")
|
|
|
47 |
rows = df.apply(lambda row: " | ".join(row), axis=1)
|
48 |
-
all_text += [f"[{sheet}] {line}" for line in rows]
|
49 |
return "\n".join(all_text)
|
50 |
|
51 |
def split_text(text: str, max_tokens=MAX_CHUNK_TOKENS) -> List[str]:
|
@@ -83,7 +84,7 @@ def init_agent() -> TxAgent:
|
|
83 |
agent.init_model()
|
84 |
return agent
|
85 |
|
86 |
-
# Serial
|
87 |
def analyze_serial(agent, batch_chunks: List[List[str]]) -> List[str]:
|
88 |
results = []
|
89 |
for idx, batch in enumerate(batch_chunks):
|
|
|
# Generation / chunking limits.
MAX_NEW_TOKENS = 4096        # cap on tokens generated per request
MAX_CHUNK_TOKENS = 8192      # max tokens per input chunk fed to the model
PROMPT_OVERHEAD = 300        # tokens reserved for the prompt template itself
BATCH_SIZE = 10              # chunks processed per batch; larger = faster throughput

# Paths
persistent_dir = "/data/hf_cache"
|
|
|
def extract_text_from_excel(path: str) -> str:
    """Flatten every sheet of an Excel workbook into one newline-joined string.

    Each kept row is rendered as ``[sheet_name] cell1 | cell2 | ...``.
    Duplicate rows, rows containing any very short cell (<= 5 characters),
    and rows that render as blank are dropped.

    Args:
        path: Filesystem path to the Excel workbook.

    Returns:
        All surviving rows from all sheets, one per line.
    """
    all_text = []
    xls = pd.ExcelFile(path)
    for sheet in xls.sheet_names:
        # fillna("") must run BEFORE astype(str): astype(str) converts NaN
        # into the literal string "nan", which would make fillna a no-op.
        df = xls.parse(sheet).fillna("").astype(str).drop_duplicates()
        # Drop any row in which ANY cell is 5 characters or shorter.
        # NOTE(review): after fillna("") an empty cell has length 0, so rows
        # with any empty cell are dropped too — confirm this is intended.
        df = df[~df.apply(lambda col: col.str.len().le(5)).any(axis=1)]
        rows = df.apply(lambda row: " | ".join(row), axis=1)
        all_text += [f"[{sheet}] {line}" for line in rows if line.strip()]
    return "\n".join(all_text)
51 |
|
52 |
def split_text(text: str, max_tokens=MAX_CHUNK_TOKENS) -> List[str]:
|
|
|
84 |
agent.init_model()
|
85 |
return agent
|
86 |
|
87 |
+
# Serial analyze (safe for vLLM)
|
88 |
def analyze_serial(agent, batch_chunks: List[List[str]]) -> List[str]:
|
89 |
results = []
|
90 |
for idx, batch in enumerate(batch_chunks):
|