Avinash109 committed on
Commit ed64278 · verified · 1 Parent(s): 6f080ab

Update app.py

Files changed (1):
  app.py +73 -76
app.py CHANGED
@@ -2,130 +2,129 @@ import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime
+import gc
+import os
+
+# Enable memory efficient options
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Set page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="💬",
-    layout="wide"
+    layout="wide",
)

# Initialize session state
if 'messages' not in st.session_state:
    st.session_state.messages = []
+if 'model_loaded' not in st.session_state:
+    st.session_state.model_loaded = False

-@st.cache_resource
+@st.cache_resource(show_spinner=False)
def load_model_and_tokenizer():
    try:
-        # Display loading message
-        with st.spinner("🔄 Loading model and tokenizer... This might take a few minutes..."):
-            model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
-
+        model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
+
+        with st.spinner("🔄 Loading tokenizer..."):
            # Load tokenizer first
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                trust_remote_code=True
            )
+
+        with st.spinner("🔄 Loading model... (this may take a few minutes on CPU)"):
+            # Load model with 8-bit quantization for CPU
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                device_map={"": "cpu"},
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+                torch_dtype=torch.float32,
+                load_in_8bit=True  # Enable 8-bit quantization
+            )

-            # Determine device and display info
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            st.info(f"💻 Using device: {device}")
+        # Force CPU mode and eval mode
+        model = model.to("cpu").eval()

-            # Load model with appropriate settings
-            if device == "cuda":
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    torch_dtype=torch.float16,  # Use float16 for GPU
-                    device_map="auto",
-                    trust_remote_code=True
-                ).eval()  # Set to evaluation mode
-            else:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    device_map={"": device},
-                    trust_remote_code=True,
-                    low_cpu_mem_usage=True
-                ).eval()  # Set to evaluation mode
+        # Clear memory after loading
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None

+        st.session_state.model_loaded = True
        return tokenizer, model
+
    except Exception as e:
        st.error(f"❌ Error loading model: {str(e)}")
-        raise e
+        return None, None

-def generate_response(prompt, model, tokenizer, max_new_tokens=512, temperature=0.7, top_p=0.9):
-    """Generate response from the model with better error handling"""
+def generate_response(prompt, model, tokenizer, max_length=256):
    try:
-        # Tokenize input
-        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+        # Clear memory before generation
+        gc.collect()

-        # Generate response with progress bar
-        with torch.no_grad(), st.spinner("🤔 Thinking..."):
+        # Tokenize with shorter maximum length
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+            max_length=512,
+            truncation=True
+        ).to("cpu")
+
+        # Generate with minimal parameters for CPU
+        with torch.no_grad(), st.spinner("🤔 Thinking... (please be patient)"):
            outputs = model.generate(
                **inputs,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                top_p=top_p,
+                max_new_tokens=max_length,
+                temperature=0.7,
+                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
-                repetition_penalty=1.1,
-                no_repeat_ngram_size=3
+                num_beams=1,  # Disable beam search
+                early_stopping=True
            )

-        # Decode and return response
+        # Clear memory after generation
+        gc.collect()
+
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response[len(prompt):].strip()
-
+
    except torch.cuda.OutOfMemoryError:
-        st.error("💾 GPU memory exceeded. Try reducing the maximum length or clearing the conversation.")
+        st.error("💾 Memory exceeded. Try reducing the maximum length.")
        return None
    except Exception as e:
-        st.error(f"❌ Error generating response: {str(e)}")
+        st.error(f"❌ Error: {str(e)}")
        return None

# Main UI
st.title("💬 Qwen2.5-Coder Chat")

-# Sidebar settings
+# Sidebar with minimal settings
with st.sidebar:
    st.header("⚙️ Settings")

-    # Model settings
    max_length = st.slider(
-        "Maximum Length 📝",
+        "Response Length 📝",
        min_value=64,
-        max_value=2048,
-        value=512,
-        step=64
-    )
-
-    temperature = st.slider(
-        "Temperature 🌡️",
-        min_value=0.1,
-        max_value=2.0,
-        value=0.7,
-        step=0.1
+        max_value=512,
+        value=256,
+        step=64,
+        help="Shorter lengths are recommended for CPU"
    )

-    top_p = st.slider(
-        "Top P 📊",
-        min_value=0.1,
-        max_value=1.0,
-        value=0.9,
-        step=0.1
-    )
-
-    # Clear conversation button
    if st.button("🗑️ Clear Conversation"):
        st.session_state.messages = []
        st.rerun()

# Load model
-try:
+if not st.session_state.model_loaded:
+    tokenizer, model = load_model_and_tokenizer()
+    if model is None:
+        st.stop()
+else:
    tokenizer, model = load_model_and_tokenizer()
-except Exception as e:
-    st.error("❌ Failed to load model. Please check the logs and refresh the page.")
-    st.stop()

# Display conversation history
for message in st.session_state.messages:
@@ -148,19 +147,14 @@ if prompt := st.chat_input("💭 Ask me anything about coding..."):

    # Generate and display response
    with st.chat_message("assistant"):
-        # Prepare conversation context (limit to last 3 messages to prevent context overflow)
-        conversation = "\n".join(
-            f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
-            for msg in st.session_state.messages[-3:]
-        ) + "\nAssistant:"
+        # Keep only last message for context to reduce memory usage
+        conversation = f"Human: {prompt}\nAssistant:"

        response = generate_response(
            conversation,
            model,
            tokenizer,
-            max_new_tokens=max_length,
-            temperature=temperature,
-            top_p=top_p
+            max_length=max_length
        )

        if response:
@@ -174,4 +168,7 @@ if prompt := st.chat_input("💭 Ask me anything about coding..."):
            "timestamp": timestamp
        })
    else:
-        st.error("❌ Failed to generate response. Please try again with different settings.")
+        st.error("❌ Failed to generate response. Please try again with a shorter length.")
+
+    # Clear memory after response
+    gc.collect()
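
Note on the 8-bit load added above: in transformers, load_in_8bit is backed by the optional bitsandbytes package (which generally expects a CUDA-capable backend) and, in recent releases, is normally expressed through a BitsAndBytesConfig rather than a bare keyword argument. A minimal sketch of the equivalent call, assuming bitsandbytes is installed in the Space (not part of this commit):

# Sketch only: equivalent 8-bit load via BitsAndBytesConfig (assumes bitsandbytes is available).
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
quant_config = BitsAndBytesConfig(load_in_8bit=True)  # same intent as the bare load_in_8bit=True kwarg

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,  # preferred over passing load_in_8bit directly
    device_map="auto",                 # let accelerate decide placement
    trust_remote_code=True,
).eval()

If bitsandbytes is unavailable, dropping the quantization_config and relying on low_cpu_mem_usage=True (as the CPU branch of the previous revision did) is the plain CPU path.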