adilkh26 committed
Commit bb1f5be · verified · 1 Parent(s): c54660f

Update app.py

Files changed (1)
  1. app.py +15 -7
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import torch
-from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
 import deepspeed
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # Model name
 model_name = "OpenGVLab/InternVideo2_5_Chat_8B"
@@ -9,19 +9,27 @@ model_name = "OpenGVLab/InternVideo2_5_Chat_8B"
 # Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-# Load model efficiently
+# Enable DeepSpeed Inference (ZeRO-3)
+ds_engine = deepspeed.init_inference(
+    dtype=torch.float16,             # Use float16 for efficiency
+    replace_method="auto",           # Automatically replace ops for inference
+    replace_with_kernel_inject=True
+)
+
+# Load model with DeepSpeed
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     trust_remote_code=True,
-    torch_dtype=torch.float16,       # Use float16 for lower memory usage
-    device_map="auto",               # Automatically place model on available GPU
-    deepspeed={"stage": 3}           # Enable DeepSpeed ZeRO-3
-
+    torch_dtype=torch.float16,
+    device_map="auto"                # Auto place on GPU
 )
 
+# Apply DeepSpeed to model
+model = ds_engine.module(model)
+
 # Define inference function
 def chat_with_model(prompt):
-    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")   # Move inputs to GPU
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
     output = model.generate(**inputs, max_length=200)
     return tokenizer.decode(output[0], skip_special_tokens=True)
 
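
Note on the change: as committed, the new code calls deepspeed.init_inference() without passing the model, and later calls ds_engine.module(model). That does not match the DeepSpeed inference API, where init_inference takes the already-loaded model as its first argument and returns an engine whose .module attribute holds the kernel-injected model. A minimal sketch of the usual wiring is below; it assumes single-GPU inference and drops device_map="auto" on the assumption that DeepSpeed handles placement after injection. It is not the code from this commit.

import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "OpenGVLab/InternVideo2_5_Chat_8B"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the model first, then hand it to DeepSpeed
# (device_map="auto" omitted here; assumption: DeepSpeed manages placement)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# init_inference wraps the loaded model and returns an engine;
# the optimized model is exposed as engine.module
ds_engine = deepspeed.init_inference(
    model,
    dtype=torch.float16,
    replace_with_kernel_inject=True,
)
model = ds_engine.module

# Inference function, unchanged apart from using the wrapped model
def chat_with_model(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_length=200)
    return tokenizer.decode(output[0], skip_special_tokens=True)

Using engine.module keeps the downstream model.generate call unchanged, so the Gradio-facing chat_with_model function works the same as before.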