import functools
import inspect
from typing import Any, Callable, get_type_hints

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Load Model and Tokenizer ---
model_id = "unsloth/SmolLM2-135M-Instruct-GGUF"
filename = "SmolLM2-135M-Instruct-Q8_0.gguf"

tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)

# --- System Prompt Template ---
SYSTEM_PROMPT = (
    "You are a helpful AI assistant. Your job is to provide clear and concise "
    "responses based on the user's input. Keep your answers straightforward "
    "and avoid unnecessary information."
)


def parse_docstring(func):
    """Extract the `Title:` and `Description:` lines from a function's docstring."""
    doc = inspect.getdoc(func)
    if not doc:
        return {"title": "Untitled", "description": ""}
    lines = doc.splitlines()
    title = next(
        (line.replace("Title:", "").strip() for line in lines if line.startswith("Title:")),
        "Untitled",
    )
    description = "\n".join(line.strip() for line in lines if line.startswith("Description:"))
    description = description.replace("Description:", "").strip()
    return {"title": title, "description": description}


def gradio_app_with_docs(func: Callable) -> Callable:
    """
    A decorator that automatically builds a Gradio interface from a function's
    type hints and docstring metadata.

    Args:
        func: A callable with type-hinted parameters and return type.

    Returns:
        The wrapped function, with a `.launch()` method to start the app.
    """
    sig = inspect.signature(func)
    type_hints = get_type_hints(func)
    metadata = parse_docstring(func)

    def _map_type(t: type) -> gr.Component:
        """Map a Python type hint to a Gradio component."""
        if t is str:
            return gr.Textbox()
        elif t is int:
            return gr.Number(precision=0)
        elif t is float:
            return gr.Number()
        elif t is bool:
            return gr.Checkbox()
        elif hasattr(t, "__origin__") and t.__origin__ is list:
            elem_type = t.__args__[0]
            if elem_type is str:
                # Placeholder choices; a real app would supply these itself.
                return gr.Dropdown(choices=["Option1", "Option2"])
            raise ValueError(f"Unsupported list element type: {elem_type}")
        raise ValueError(f"Unsupported type: {t}")

    # Build input components from the function's parameters.
    inputs = []
    for name, param in sig.parameters.items():
        if name == "self":
            continue
        param_type = type_hints.get(name, Any)
        component = _map_type(param_type)
        component.label = name.replace("_", " ").title()
        inputs.append(component)

    # Build the output component from the return annotation.
    return_type = type_hints.get("return", Any)
    outputs = _map_type(return_type)
    outputs.label = "Output"

    # Render the interface inside a Blocks layout with a title/description header.
    with gr.Blocks() as demo:
        gr.Markdown(f"## {metadata['title']}\n{metadata['description']}")
        gr.Interface(fn=func, inputs=inputs, outputs=outputs)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    wrapper.launch = lambda: demo.launch()
    return wrapper
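# A minimal sketch of how the decorator consumes hints and docstring metadata.
# `echo_length` is illustrative only, not part of this app: the `str` parameter
# maps to a gr.Textbox input and the `int` return maps to a
# gr.Number(precision=0) output.
#
#     @gradio_app_with_docs
#     def echo_length(text: str) -> int:
#         """
#         Title: Echo Length
#         Description: Counts the characters in the input.
#         """
#         return len(text)
#
#     echo_length.launch()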
@gradio_app_with_docs
def generate_response(prompt: str) -> str:
    """
    Title: Super Tiny GGUF Model on CPU
    Description: A simple app to test out the potential of a small GGUF LLM model.

    Args:
        prompt (str): A simple prompt.

    Returns:
        str: Simplified response.
    """
    # Apply the system prompt and user input via the model's chat template.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        # temperature=0.7,
        # top_p=0.9,
    )
    # Decode only the newly generated tokens, so the system prompt and chat
    # template markup are not echoed back in the UI.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


if __name__ == "__main__":
    generate_response.launch()
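# Optional: the commented-out temperature/top_p in generate_response above only
# take effect when sampling is enabled. A sketch, with illustrative (untuned)
# values:
#
#     outputs = model.generate(
#         **inputs,
#         max_new_tokens=100,
#         do_sample=True,   # required for temperature/top_p to apply
#         temperature=0.7,
#         top_p=0.9,
#     )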