Spaces:

Locon213
/

ThinkLite

Running

App Files Files Community

Locon213 committed 11 days ago

Commit

f8e3c6a

verified ·

1 Parent(s): bf7602e

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -71

app.py CHANGED Viewed

@@ -1,43 +1,21 @@
 from peft import PeftModel
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    GenerationConfig,
-    TextIteratorStreamer
-)
-import torch
 import gradio as gr
-from threading import Thread
-# Загрузка и объединение модели с адаптерами
 base_model = AutoModelForCausalLM.from_pretrained(
     "Qwen/Qwen2.5-0.5B-Instruct",
-    device_map="auto",
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True
 )
-# Объединение основной модели с адаптерами
 model = PeftModel.from_pretrained(base_model, "Locon213/ThinkLite")
-model = model.merge_and_unload()
-# Применяем оптимизации для CPU
-model = torch.quantization.quantize_dynamic(
-    model,
-    {torch.nn.Linear},
-    dtype=torch.qint8
-)
-model.config.use_cache = True
-# Загрузка токенизатора
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
-# Конфигурация генерации с оптимизированными параметрами
 generation_config = GenerationConfig(
     temperature=0.7,
     top_p=0.9,
     top_k=50,
-    max_new_tokens=256,  # Уменьшено для экономии памяти
     repetition_penalty=1.1,
     do_sample=True
 )
@@ -49,57 +27,35 @@ def format_prompt(message, history):
     prompt += f"<<<USER>>> {message}\n<<<ASSISTANT>>>"
     return prompt
-def generate_stream(message, history):
     formatted_prompt = format_prompt(message, history)
-    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
-    streamer = TextIteratorStreamer(
-        tokenizer,
-        skip_prompt=True,
-        skip_special_tokens=True,
-        timeout=30
-    )
-    generation_kwargs = dict(
         **inputs,
         generation_config=generation_config,
-        streamer=streamer,
         pad_token_id=tokenizer.eos_token_id
     )
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    partial_message = ""
-    for new_token in streamer:
-        partial_message += new_token
-        yield partial_message
-# Создание интерфейса с оптимизированным дизайном
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# ThinkLite Chat (Optimized)")
-    gr.Markdown("🚀 Версия с потоковым выводом и оптимизацией для CPU")
-    chatbot = gr.Chatbot(height=400)
-    msg = gr.Textbox(label="Ваше сообщение")
-    clear_btn = gr.Button("Очистить историю")
-    def user(message, chat_history):
-        return "", chat_history + [[message, None]]
-    def bot(chat_history):
-        message = chat_history[-1][0]
-        history = chat_history[:-1]
-        chat_history[-1][1] = ""
-        for response in generate_stream(message, history):
-            chat_history[-1][1] = response
-            yield chat_history
-    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
-        bot, chatbot, chatbot
-    )
-    clear_btn.click(lambda: [], None, chatbot, queue=False)
 if __name__ == "__main__":
-    demo.queue(max_size=10).launch(debug=False)

 from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
 import gradio as gr
+# Загрузка модели и токенизатора
 base_model = AutoModelForCausalLM.from_pretrained(
     "Qwen/Qwen2.5-0.5B-Instruct",
+    device_map="auto"
 )
 model = PeftModel.from_pretrained(base_model, "Locon213/ThinkLite")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
+# Конфигурация генерации
 generation_config = GenerationConfig(
     temperature=0.7,
     top_p=0.9,
     top_k=50,
+    max_new_tokens=512,
     repetition_penalty=1.1,
     do_sample=True
 )
     prompt += f"<<<USER>>> {message}\n<<<ASSISTANT>>>"
     return prompt
+def generate_response(message, history):
+    # Форматируем промпт с историей чата
     formatted_prompt = format_prompt(message, history)
+    # Токенизация и генерация
+    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(
         **inputs,
         generation_config=generation_config,
         pad_token_id=tokenizer.eos_token_id
     )
+    # Декодирование и извлечение ответа
+    response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
+    return response.strip()
+# Создание чат-интерфейса
+chat_interface = gr.ChatInterface(
+    fn=generate_response,
+    examples=[
+        "Объясни квантовую запутанность простыми словами",
+        "Как научиться программировать?",
+        "Напиши стихотворение про ИИ"
+    ],
+    title="ThinkLite Chat",
+    description="Общайтесь с ThinkLite - адаптированной версией Qwen2.5-0.5B-Instruct",
+    theme="soft"
+)
 if __name__ == "__main__":
+    chat_interface.launch()