Tonic committed
Commit 2749bcc · unverified · 1 parent: 5b6abbe

model loading fix

Replace the module-level load_model() with load_model_with_fallback(), which picks CUDA when available, retries on CPU if GPU loading raises, and returns a (model, device) pair; the per-call device setup and the cuda_error_handler decorator are dropped from generate_text() and calculate_similarity().

Files changed (1)
app.py +62 -45

app.py CHANGED
@@ -174,16 +174,39 @@ class PixtralModel(nn.Module):
         else:
             return vision_output
 
-def load_model(params, model_path):
-    model = PixtralModel(params)
-    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
-        for name, param in model.named_parameters():
-            if name in f.keys():
-                param.data = f.get_tensor(name)
-    model.eval()
-    return model
-
-model = load_model(params, model_path)
+
+@contextmanager
+def gpu_memory_manager():
+    try:
+        torch.cuda.empty_cache()
+        yield
+    finally:
+        torch.cuda.empty_cache()
+        gc.collect()
+
+def load_model_with_fallback(params, model_path):
+    try:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = PixtralModel(params)
+        with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
+            for name, param in model.named_parameters():
+                if name in f.keys():
+                    param.data = f.get_tensor(name)
+        model.eval()
+        model.to(device)
+        return model, device
+    except RuntimeError as e:
+        print(f"Error loading model on GPU: {str(e)}")
+        print("Falling back to CPU...")
+        model = PixtralModel(params)
+        with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
+            for name, param in model.named_parameters():
+                if name in f.keys():
+                    param.data = f.get_tensor(name)
+        model.eval()
+        return model, torch.device("cpu")
+
+model, device = load_model_with_fallback(params, model_path)
 tokenizer = MistralTokenizer.from_model("pixtral")
 
 def preprocess_image(image):
@@ -206,39 +229,12 @@ def gpu_memory_manager():
         torch.cuda.empty_cache()
         gc.collect()
 
-def cuda_error_handler(func):
-    def wrapper(*args, **kwargs):
-        try:
-            return func(*args, **kwargs)
-        except RuntimeError as e:
-            if "CUDA" in str(e):
-                print(f"CUDA error occurred: {str(e)}")
-                print("Attempting to recover...")
-                torch.cuda.empty_cache()
-                gc.collect()
-                try:
-                    return func(*args, **kwargs)
-                except Exception as e2:
-                    print(f"Recovery failed. Error: {str(e2)}")
-                    return f"An error occurred: {str(e2)}", 0, 0
-            else:
-                raise
-        except Exception as e:
-            print(f"An unexpected error occurred: {str(e)}")
-            traceback.print_exc()
-            return f"An unexpected error occurred: {str(e)}", 0, 0
-    return wrapper
-
 @spaces.GPU()
-@cuda_error_handler
 def generate_text(image, prompt, max_tokens):
     try:
         with gpu_memory_manager():
-            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            # Use load_img here
             image_pil = load_img(image, output_type="pil", input_type="auto")
             image_tensor = preprocess_image(image_pil).to(device)
-            model.to(device)
 
             tokenized = tokenizer.encode_chat_completion(
                 ChatCompletionRequest(
@@ -260,8 +256,6 @@ def generate_text(image, prompt, max_tokens):
 
             generated_text = tokenizer.decode(generated_ids[0].tolist())
 
-            # # Move model back to CPU and clear CUDA memory
-            # model.to("cpu")
             torch.cuda.empty_cache()
 
             return generated_text, len(generated_ids[0]), 1
@@ -271,17 +265,13 @@ def generate_text(image, prompt, max_tokens):
         return f"Error: {str(e)}", 0, 0
 
 @spaces.GPU()
-@cuda_error_handler
 def calculate_similarity(image1, image2):
     try:
         with gpu_memory_manager():
-            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            # Use load_img for both images
             pil_image1 = load_img(image1, output_type="pil", input_type="auto")
             pil_image2 = load_img(image2, output_type="pil", input_type="auto")
             tensor1 = preprocess_image(pil_image1).to(device)
             tensor2 = preprocess_image(pil_image2).to(device)
-            model.to(device)
 
             with torch.no_grad():
                 embedding1 = model(tensor1).mean(dim=1)
@@ -289,8 +279,6 @@ def calculate_similarity(image1, image2):
 
             similarity = F.cosine_similarity(embedding1, embedding2).item()
 
-            # # Move model back to CPU and clear CUDA memory
-            # model.to("cpu")
            torch.cuda.empty_cache()
 
             return similarity
@@ -298,6 +286,35 @@ def calculate_similarity(image1, image2):
         print(f"Error in calculate_similarity: {str(e)}")
         traceback.print_exc()
         return f"Error: {str(e)}"
+
+# @spaces.GPU()
+# @cuda_error_handler
+# def calculate_similarity(image1, image2):
+#     try:
+#         with gpu_memory_manager():
+#             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#             # Use load_img for both images
+#             pil_image1 = load_img(image1, output_type="pil", input_type="auto")
+#             pil_image2 = load_img(image2, output_type="pil", input_type="auto")
+#             tensor1 = preprocess_image(pil_image1).to(device)
+#             tensor2 = preprocess_image(pil_image2).to(device)
+#             model.to(device)
+
+#             with torch.no_grad():
+#                 embedding1 = model(tensor1).mean(dim=1)
+#                 embedding2 = model(tensor2).mean(dim=1)
+
+#             similarity = F.cosine_similarity(embedding1, embedding2).item()
+
+#             # Move model back to CPU and clear CUDA memory
+#             # model.to("cpu")
+#             torch.cuda.empty_cache()
+
+#             return similarity
+#     except Exception as e:
+#         print(f"Error in calculate_similarity: {str(e)}")
+#         traceback.print_exc()
+#         return f"Error: {str(e)}"
 
 with gr.Blocks() as demo:
     gr.Markdown(title)
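
Review note: on the fallback path, load_model_with_fallback() rebuilds the model and re-reads the safetensors file, so the loading block appears twice. A minimal sketch of an equivalent single-pass variant, assuming the same PixtralModel, params, and model_path names from app.py and that only the device move can raise:

import torch
from safetensors import safe_open

def load_model_with_fallback(params, model_path):
    # Build the model and read the weights once, on CPU; only the
    # device move differs between the GPU and CPU outcomes.
    model = PixtralModel(params)  # assumed: the PixtralModel class defined above in app.py
    with safe_open(f'{model_path}/consolidated.safetensors', framework="pt", device="cpu") as f:
        for name, param in model.named_parameters():
            if name in f.keys():
                param.data = f.get_tensor(name)
    model.eval()
    if torch.cuda.is_available():
        try:
            return model.to("cuda"), torch.device("cuda")
        except RuntimeError as e:  # e.g. CUDA OOM while copying weights
            print(f"Error loading model on GPU: {str(e)}")
            print("Falling back to CPU...")
            model.to("cpu")  # undo any partial move
            torch.cuda.empty_cache()
    return model, torch.device("cpu")

Callers still receive the (model, device) pair consumed by generate_text() and calculate_similarity(), but the weight-loading pass runs once instead of twice when the GPU is unavailable.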
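
The relocated gpu_memory_manager() relies on @contextmanager semantics: the finally block after yield runs whether the wrapped block returns or raises, so cached CUDA memory is released even when a handler fails mid-block. A self-contained sketch of the pattern (standalone toy usage, not app code):

import gc
from contextlib import contextmanager

import torch

@contextmanager
def gpu_memory_manager():
    try:
        torch.cuda.empty_cache()  # no-op on CPU-only hosts
        yield
    finally:
        # Reached on success and on exceptions alike, so the allocator
        # cache is drained before any error propagates to the caller.
        torch.cuda.empty_cache()
        gc.collect()

# Cleanup still runs even though the block raises.
try:
    with gpu_memory_manager():
        raise RuntimeError("CUDA out of memory")  # simulated failure
except RuntimeError:
    pass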