Avinash109 committed on
Commit 59fb13e · verified · 1 Parent(s): 17e9de5

Update app.py

Files changed (1):
  1. app.py  +26 -31
app.py CHANGED
@@ -4,24 +4,24 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import datetime
 
-# Set up page configuration
+# Page configuration
 st.set_page_config(
-    page_title="Qwen2.5-Coder Chat",
+    page_title="💬 Qwen2.5-Coder Chat",
     page_icon="💬",
     layout="wide"
 )
 
-# Set cache directory explicitly
+# Set cache directory explicitly for Hugging Face Spaces
 os.environ["TRANSFORMERS_CACHE"] = "/root/.cache/huggingface"
 
 # Initialize session state for conversation history
 if 'messages' not in st.session_state:
     st.session_state.messages = []
 
-# Cache model loading
+# Cache model loading to prevent re-loading each session
 @st.cache_resource
 def load_model_and_tokenizer():
-    model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"  # Model identifier
+    model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"  # Smaller 3B model for efficiency
 
     # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
@@ -33,22 +33,14 @@ def load_model_and_tokenizer():
     device = "cuda" if torch.cuda.is_available() else "cpu"
     st.info(f"Using device: {device}")
 
-    # Load model
-    if device == "cuda":
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            trust_remote_code=True
-        )
-    else:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float32,
-            device_map={"": device},
-            trust_remote_code=True,
-            low_cpu_mem_usage=True
-        )
+    # Load model with optimizations for CPU
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float32 if device == "cpu" else torch.float16,
+        device_map="auto" if device == "cuda" else {"": device},
+        trust_remote_code=True,
+        low_cpu_mem_usage=True  # Reduce memory usage for CPU
+    )
 
     return tokenizer, model
 
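For reference, the new device-aware loading can be smoke-tested outside Streamlit. The sketch below is illustrative only and not part of this commit: the script name and test prompt are invented, and the tokenizer kwargs are assumed (the app's exact tokenizer call sits outside this diff). It loads the same checkpoint with the same dtype/device_map/low_cpu_mem_usage logic and runs a short generation:

    # sanity_check_load.py -- illustrative sketch, not part of this commit
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    model_name = "Qwen/Qwen2.5-Coder-3B-Instruct"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Tokenizer kwargs assumed; the app's exact call is outside this diff
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32 if device == "cpu" else torch.float16,
        device_map="auto" if device == "cuda" else {"": device},
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    # Tokenize a small coding prompt and generate a few tokens to confirm the load works
    inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(out[0], skip_special_tokens=True))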
@@ -62,8 +54,8 @@ with st.sidebar:
     max_length = st.slider(
         "Maximum Length",
         min_value=64,
-        max_value=2048,
-        value=512,
+        max_value=1024,  # Lowered for CPU
+        value=256,  # Default setting for CPU
         step=64,
         help="Maximum number of tokens to generate"
     )
@@ -71,8 +63,8 @@ with st.sidebar:
     temperature = st.slider(
         "Temperature",
         min_value=0.1,
-        max_value=2.0,
-        value=0.7,
+        max_value=1.5,  # Lower range to make output more deterministic
+        value=0.5,
         step=0.1,
         help="Higher values make output more random, lower values more deterministic"
     )
@@ -81,7 +73,7 @@ with st.sidebar:
         "Top P",
         min_value=0.1,
         max_value=1.0,
-        value=0.9,
+        value=0.8,
         step=0.1,
         help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
     )
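These three sidebar values are passed into model.generate() inside generate_response(); the middle of that call is unchanged and therefore not shown in this diff, so whether the app sets do_sample is not visible here. A minimal sketch of how such slider values typically map onto the generation kwargs, reusing model, tokenizer, and inputs from the loading sketch above:

    # Illustrative sketch: wiring the sidebar values into generate()
    gen_kwargs = dict(
        max_new_tokens=256,  # "Maximum Length" slider
        temperature=0.5,     # "Temperature" slider
        top_p=0.8,           # "Top P" slider
        do_sample=True,      # temperature/top_p only take effect when sampling is enabled
    )
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)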
@@ -99,11 +91,13 @@ except Exception as e:
     st.stop()
 
 # Response generation function
-def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
+def generate_response(prompt, max_new_tokens=256, temperature=0.5, top_p=0.8):
     """Generate response from the model"""
     try:
+        # Tokenize the input
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
+        # Generate response
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
@@ -115,15 +109,16 @@ def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
                 eos_token_id=tokenizer.eos_token_id,
             )
 
+        # Decode and return response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response[len(prompt):].strip()  # Extract only the response
+        return response[len(prompt):].strip()  # Extract only the model's response
 
     except Exception as e:
         st.error(f"Error generating response: {str(e)}")
         return None
 
 # Display conversation history
-for message in st.session_state.messages:
+for message in st.session_state.messages[-5:]:  # Limit to last 5 messages for efficiency
     with st.chat_message(message["role"]):
         st.write(f"{message['content']}\n\n_{message['timestamp']}_")
 
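One note on the return path above: slicing the decoded string by len(prompt) assumes decode() reproduces the prompt character-for-character, which some tokenizers do not guarantee. A hedged alternative, not part of this commit, is to slice at the token level inside generate_response (inputs and outputs are the names already used there):

    # Alternative sketch (not in this commit): strip the prompt token-wise
    prompt_len = inputs["input_ids"].shape[1]   # number of prompt tokens
    new_tokens = outputs[0][prompt_len:]        # keep only the generated tokens
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()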
@@ -144,10 +139,10 @@ if prompt := st.chat_input("Ask me anything about coding..."):
     # Generate and display response
     with st.chat_message("assistant"):
         with st.spinner("Thinking..."):
-            # Prepare conversation context
+            # Prepare conversation context, limited to recent exchanges
            conversation = "\n".join(
                 f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}"
-                for msg in st.session_state.messages
+                for msg in st.session_state.messages[-3:]  # Send only the last 3 messages
             ) + "\nAssistant:"
 
             response = generate_response(
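The prompt here is a hand-rolled "Human:/Assistant:" transcript. Since the checkpoint is an instruct-tuned Qwen model, another option, not used in this commit, would be the tokenizer's built-in chat template; a sketch, assuming the stored roles are the "user"/"assistant" strings already used by st.chat_message:

    # Alternative sketch (not in this commit): build the prompt with the chat template
    chat = [
        {"role": m["role"], "content": m["content"]}
        for m in st.session_state.messages[-3:]
    ]
    prompt_text = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    # prompt_text would then be passed to generate_response() in place of `conversation`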
 