import os
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datetime

# Set up page configuration
st.set_page_config(
    page_title="Qwen2.5-Coder Chat",
    page_icon="πŸ’¬",
    layout="wide"
)

# Set the Hugging Face cache directory explicitly
# (HF_HOME is preferred in recent transformers releases; TRANSFORMERS_CACHE is kept for older versions)
os.environ["HF_HOME"] = "/root/.cache/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/root/.cache/huggingface"

# Initialize session state for conversation history
if 'messages' not in st.session_state:
    st.session_state.messages = []

# Cache model loading
@st.cache_resource
def load_model_and_tokenizer():
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"  # Model identifier
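    # Rough sizing note (added, not from the original code): a 7B-parameter model needs
    # about 15 GB of GPU memory for its float16 weights alone, so the CPU fallback below
    # is workable only with ample RAM and will be slow.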

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, 
        trust_remote_code=True
    )

    # Device configuration
    device = "cuda" if torch.cuda.is_available() else "cpu"
    st.info(f"Using device: {device}")

    # Load model
    if device == "cuda":
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map={"": device},
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
    
    return tokenizer, model

# Title
st.title("πŸ’¬ Qwen2.5-Coder Chat")

# Sidebar settings
with st.sidebar:
    st.header("Settings")
    
    max_length = st.slider(
        "Maximum Length",
        min_value=64,
        max_value=2048,
        value=512,
        step=64,
        help="Maximum number of tokens to generate"
    )
    
    temperature = st.slider(
        "Temperature",
        min_value=0.1,
        max_value=2.0,
        value=0.7,
        step=0.1,
        help="Higher values make output more random, lower values more deterministic"
    )
    
    top_p = st.slider(
        "Top P",
        min_value=0.1,
        max_value=1.0,
        value=0.9,
        step=0.1,
        help="Nucleus sampling: higher values consider more tokens, lower values are more focused"
    )
    
    if st.button("Clear Conversation"):
        st.session_state.messages = []
        st.rerun()

# Load model with caching
try:
    with st.spinner("Loading model... Please wait..."):
        tokenizer, model = load_model_and_tokenizer()
except Exception as e:
    st.error(f"Error loading model: {str(e)}")
    st.stop()

# Response generation function
def generate_response(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    """Generate response from the model"""
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                # Fall back to the EOS token if the tokenizer defines no pad token
                pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        
        # Decode only the newly generated tokens so the prompt is not echoed back;
        # slicing the decoded string by len(prompt) can misalign after detokenization
        input_length = inputs["input_ids"].shape[1]
        response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
        return response.strip()
    
    except Exception as e:
        st.error(f"Error generating response: {str(e)}")
        return None

# Display conversation history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.write(f"{message['content']}\n\n_{message['timestamp']}_")

# Chat input
if prompt := st.chat_input("Ask me anything about coding..."):
    # Add user message
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    st.session_state.messages.append({
        "role": "user",
        "content": prompt,
        "timestamp": timestamp
    })
    
    # Display user message
    with st.chat_message("user"):
        st.write(f"{prompt}\n\n_{timestamp}_")
    
    # Generate and display response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            # Prepare conversation context
            conversation = "\n".join(
                f"{'Human' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}" 
                for msg in st.session_state.messages
            ) + "\nAssistant:"
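            # Sketch (an alternative, not the original flow): Qwen2.5 Instruct models can
            # also be prompted via the tokenizer's built-in chat template instead of the
            # plain "Human:/Assistant:" transcript built above, e.g.:
            #   chat = [{"role": m["role"], "content": m["content"]}
            #           for m in st.session_state.messages]
            #   conversation = tokenizer.apply_chat_template(
            #       chat, tokenize=False, add_generation_prompt=True
            #   )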
            
            response = generate_response(
                conversation,
                max_new_tokens=max_length,
                temperature=temperature,
                top_p=top_p
            )
            
            if response:
                timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                st.write(f"{response}\n\n_{timestamp}_")
                
                # Add response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "timestamp": timestamp
                })