Damien Benveniste committed on
Commit
b9fb207
·
1 Parent(s): ae23345
Files changed (1) hide show
  1. app.py +0 -1
app.py CHANGED
@@ -17,7 +17,6 @@ engine = AsyncLLMEngine.from_engine_args(
17
  max_num_seqs=16, # Reduced for T4
18
  gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
19
  max_model_len=512, # Phi-3-mini-4k context length
20
- quantization='awq', # Enable quantization if supported by the model
21
  enforce_eager=True, # Disable CUDA graph
22
  dtype='half', # Use half precision
23
  )
 
17
  max_num_seqs=16, # Reduced for T4
18
  gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
19
  max_model_len=512, # Phi-3-mini-4k context length
 
20
  enforce_eager=True, # Disable CUDA graph
21
  dtype='half', # Use half precision
22
  )