hmrizal committed
Commit 0cca13a · verified · 1 Parent(s): 93d9ba0

modify create_fallback_pipeline and initialize_model_once to work without CUDA
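In short, the commit replaces the unconditional 4-bit BitsAndBytesConfig setup with a branch on torch.cuda.is_available(), so CPU-only environments never touch bitsandbytes and load plain float32 weights instead. A minimal sketch of that pattern follows; the helper name load_causal_lm and the model "sshleifer/tiny-gpt2" are illustrative stand-ins, not taken from app.py:

# Sketch of the CUDA-aware loading pattern this commit applies.
# "sshleifer/tiny-gpt2" is only an example model, not the app's configured model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_causal_lm(model_name="sshleifer/tiny-gpt2"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if torch.cuda.is_available():
        # GPU path: 4-bit NF4 quantization via bitsandbytes
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
            ),
            device_map="auto",
            low_cpu_mem_usage=True,
        )
    else:
        # CPU path: plain float32 weights, bitsandbytes is never imported or configured
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
    return tokenizer, model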

Files changed (1)
  1. app.py +62 -31
app.py CHANGED
@@ -160,21 +160,35 @@ def initialize_model_once(model_key):
 
         # Handle standard HF models
         else:
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_use_double_quant=True
-            )
             MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                quantization_config=quantization_config,
-                torch_dtype=model_info["dtype"],
-                device_map="auto" if torch.cuda.is_available() else None,
-                low_cpu_mem_usage=True,
-                trust_remote_code=True
-            )
+
+            # Only use quantization if CUDA is available
+            if torch.cuda.is_available():
+                quantization_config = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_compute_dtype=torch.float16,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_use_double_quant=True
+                )
+
+                MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    quantization_config=quantization_config,
+                    torch_dtype=model_info["dtype"],
+                    device_map="auto",
+                    low_cpu_mem_usage=True,
+                    trust_remote_code=True
+                )
+            else:
+                # For CPU-only environments, load without quantization
+                MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    torch_dtype=torch.float32,  # Use float32 for CPU
+                    device_map=None,
+                    low_cpu_mem_usage=True,
+                    trust_remote_code=True
+                )
+
             MODEL_CACHE["is_gguf"] = False
 
         print(f"Model {model_name} loaded successfully")

@@ -258,24 +272,41 @@ def create_llm_pipeline(model_key):
 def create_fallback_pipeline():
     """Create a fallback pipeline with a very small model"""
     model_key = "Fallback Model"
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG[model_key]["name"])
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_CONFIG[model_key]["name"],
-        torch_dtype=MODEL_CONFIG[model_key]["dtype"],
-        device_map="auto" if torch.cuda.is_available() else None,
-        low_cpu_mem_usage=True
-    )
-
-    pipe = pipeline(
-        "text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        max_new_tokens=128,
-        temperature=0.3,
-        return_full_text=False,
-    )
-
-    return HuggingFacePipeline(pipeline=pipe)
+    print(f"Creating minimal fallback pipeline with {MODEL_CONFIG[model_key]['name']}")
+
+    # Avoid using bitsandbytes for quantization when CUDA is not available
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG[model_key]["name"])
+
+        # Load model in 8-bit or without quantization for CPU
+        if torch.cuda.is_available():
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_CONFIG[model_key]["name"],
+                torch_dtype=MODEL_CONFIG[model_key]["dtype"],
+                device_map="auto",
+                low_cpu_mem_usage=True
+            )
+        else:
+            # For CPU-only environments, avoid quantization
+            model = AutoModelForCausalLM.from_pretrained(
+                MODEL_CONFIG[model_key]["name"],
+                torch_dtype=torch.float32,  # Use float32 for CPU
+                low_cpu_mem_usage=True
+            )
+
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=64,  # Reduced for CPU performance
+            temperature=0.3,
+            return_full_text=False,
+        )
+
+        return HuggingFacePipeline(pipeline=pipe)
+    except Exception as e:
+        print(f"Error creating minimal fallback pipeline: {str(e)}")
+        raise
 
 def handle_model_loading_error(model_key, session_id):
     """Handle model loading errors with fallback options"""