Update app.py
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ MAX_MODEL_TOKENS = 131072
|
|
9 |
MAX_NEW_TOKENS = 4096
|
10 |
MAX_CHUNK_TOKENS = 8192
|
11 |
PROMPT_OVERHEAD = 300
|
12 |
-
BATCH_SIZE =
|
13 |
|
14 |
# Paths
|
15 |
persistent_dir = "/data/hf_cache"
|
@@ -43,9 +43,10 @@ def extract_text_from_excel(path: str) -> str:
|
|
43 |
all_text = []
|
44 |
xls = pd.ExcelFile(path)
|
45 |
for sheet in xls.sheet_names:
|
46 |
-
df = xls.parse(sheet).astype(str).fillna("")
|
|
|
47 |
rows = df.apply(lambda row: " | ".join(row), axis=1)
|
48 |
-
all_text += [f"[{sheet}] {line}" for line in rows]
|
49 |
return "\n".join(all_text)
|
50 |
|
51 |
def split_text(text: str, max_tokens=MAX_CHUNK_TOKENS) -> List[str]:
|
@@ -83,7 +84,7 @@ def init_agent() -> TxAgent:
|
|
83 |
agent.init_model()
|
84 |
return agent
|
85 |
|
86 |
-
# Serial
|
87 |
def analyze_serial(agent, batch_chunks: List[List[str]]) -> List[str]:
|
88 |
results = []
|
89 |
for idx, batch in enumerate(batch_chunks):
|
|
|
# Generation / chunking limits.
MAX_NEW_TOKENS = 4096        # cap on tokens generated per request
MAX_CHUNK_TOKENS = 8192      # max tokens per input chunk fed to the model
PROMPT_OVERHEAD = 300        # tokens reserved for the prompt template itself
BATCH_SIZE = 10              # chunks processed per batch; larger = faster throughput

# Paths
persistent_dir = "/data/hf_cache"
|
|
|
def extract_text_from_excel(path: str) -> str:
    """Flatten every sheet of an Excel workbook into one newline-joined string.

    Each kept row is rendered as ``[sheet_name] cell1 | cell2 | ...``.
    Duplicate rows, rows containing any very short cell (<= 5 characters),
    and rows that render as blank are dropped.

    Args:
        path: Filesystem path to the Excel workbook.

    Returns:
        All surviving rows from all sheets, one per line.
    """
    all_text = []
    xls = pd.ExcelFile(path)
    for sheet in xls.sheet_names:
        # fillna("") must run BEFORE astype(str): astype(str) converts NaN
        # into the literal string "nan", which would make fillna a no-op.
        df = xls.parse(sheet).fillna("").astype(str).drop_duplicates()
        # Drop any row in which ANY cell is 5 characters or shorter.
        # NOTE(review): after fillna("") an empty cell has length 0, so rows
        # with any empty cell are dropped too — confirm this is intended.
        df = df[~df.apply(lambda col: col.str.len().le(5)).any(axis=1)]
        rows = df.apply(lambda row: " | ".join(row), axis=1)
        all_text += [f"[{sheet}] {line}" for line in rows if line.strip()]
    return "\n".join(all_text)
51 |
|
52 |
def split_text(text: str, max_tokens=MAX_CHUNK_TOKENS) -> List[str]:
|
|
|
84 |
agent.init_model()
|
85 |
return agent
|
86 |
|
87 |
+
# Serial analyze (safe for vLLM)
|
88 |
def analyze_serial(agent, batch_chunks: List[List[str]]) -> List[str]:
|
89 |
results = []
|
90 |
for idx, batch in enumerate(batch_chunks):
|