Prepare to move up to Gemma3
custom_llm.py  (+11 -2)
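This commit keeps 'google/gemma-2-9b-it' as the active model but stages commented-out Gemma 3 checkpoints and splits the dtype/quantization setup by device, so switching models later is a one-line flip of a comment.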
@@ -32,8 +32,16 @@ async def models_lifespan(app: FastAPI):
     #model_name = 'google/gemma-1.1-7b-it'
     #model_name = 'google/gemma-1.1-2b-it'
     model_name = 'google/gemma-2-9b-it'
+    #model_name = 'google/gemma-3-12b-it'
+    #model_name = 'google/gemma-3-4b-it'
 
-
+    if USE_GPU:
+        dtype = torch.bfloat16
+        from transformers import TorchAoConfig
+        quantization_config = None#TorchAoConfig("int4_weight_only", group_size=128)
+    else:
+        dtype = torch.float16
+        quantization_config = None
 
     ml_models["llm"] = llm = {
         'tokenizer': AutoTokenizer.from_pretrained(model_name),
@@ -41,7 +49,8 @@ async def models_lifespan(app: FastAPI):
             model_name,
             device_map="auto" if USE_GPU else "cpu",
             torch_dtype=dtype,
-            attn_implementation='eager'
+            attn_implementation='eager',
+            quantization_config=quantization_config,
         )
     }
     print("Loaded llm with device map:")
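For context, here is a minimal, self-contained sketch of the loading pattern the diff converges on, with the int4 weight-only quantization switched on rather than left commented out. This is a sketch under assumptions, not the file itself: USE_GPU and ml_models are defined elsewhere in custom_llm.py, the 'model' dict key is assumed (its line falls outside the hunks), and the TorchAoConfig path additionally requires the torchao package.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

USE_GPU = torch.cuda.is_available()  # assumption: the real file sets this elsewhere
model_name = 'google/gemma-2-9b-it'  # swap for a gemma-3 checkpoint when ready

if USE_GPU:
    dtype = torch.bfloat16
    # int4 weight-only quantization via torchao (pip install torchao)
    quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
else:
    dtype = torch.float16
    quantization_config = None

llm = {
    'tokenizer': AutoTokenizer.from_pretrained(model_name),
    'model': AutoModelForCausalLM.from_pretrained(  # 'model' key assumed
        model_name,
        device_map="auto" if USE_GPU else "cpu",
        torch_dtype=dtype,
        attn_implementation='eager',  # carried over from the original call
        quantization_config=quantization_config,
    ),
}
print("Loaded llm with device map:")
print(llm['model'].hf_device_map if USE_GPU else "cpu")

Note that passing quantization_config=None is a no-op in from_pretrained, which is presumably why the commit lands the keyword argument now: enabling TorchAoConfig later means uncommenting one expression without touching the call site again.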