Athspi commited on
Commit
63faa06
·
verified ·
1 Parent(s): cbe3363

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer
3
+ import onnxruntime as ort
4
+ import numpy as np
5
+
6
# Local directory holding the int4-quantized ONNX model export
# (RTN quantization, block size 32, accuracy level 4 — per the directory name).
model_dir = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"

# Load tokenizer and ONNX model once at import time so every request reuses them.
# CPUExecutionProvider pins inference to the CPU; no GPU is required.
# NOTE(review): both loads assume model_dir exists relative to the working
# directory and contains tokenizer files plus model.onnx — confirm at deploy time.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
session = ort.InferenceSession(f"{model_dir}/model.onnx", providers=["CPUExecutionProvider"])
12
+
13
# Inference function
def generate_response(prompt, max_new_tokens=256):
    """Generate a chat reply for *prompt* via greedy autoregressive decoding.

    The original code ran the ONNX session once and decoded ``outputs[0]``
    directly, but a causal-LM export returns next-token *logits*
    (batch, seq, vocab), not generated ids — so no new text was ever
    produced. This version repeatedly feeds the growing sequence back in,
    taking the argmax token each step until EOS or ``max_new_tokens``.

    Args:
        prompt: The user's message (plain text).
        max_new_tokens: Upper bound on generated tokens (default 256);
            keeps a runaway model from looping forever.

    Returns:
        The decoded assistant reply (prompt text excluded), stripped.
    """
    full_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
    enc = tokenizer(full_prompt, return_tensors="np")

    # ONNX model expects int64 input_ids and attention_mask.
    # NOTE(review): assumes the export needs only these two inputs (no
    # past_key_values / position_ids) — same contract the original call used.
    input_ids = enc["input_ids"].astype(np.int64)
    attention_mask = enc["attention_mask"].astype(np.int64)

    prompt_len = input_ids.shape[1]  # so we can decode only the new tokens
    eos_id = getattr(tokenizer, "eos_token_id", None)

    # Greedy decoding: one forward pass per new token.
    for _ in range(max_new_tokens):
        logits = session.run(
            None,
            {"input_ids": input_ids, "attention_mask": attention_mask},
        )[0]
        # Next token = argmax over the vocab at the last position.
        next_id = int(np.argmax(logits[0, -1]))
        if eos_id is not None and next_id == eos_id:
            break
        input_ids = np.concatenate(
            [input_ids, np.array([[next_id]], dtype=np.int64)], axis=1
        )
        attention_mask = np.concatenate(
            [attention_mask, np.ones((1, 1), dtype=np.int64)], axis=1
        )

    # Decode only the newly generated portion; special tokens are dropped,
    # so no further "<|assistant|>" cleanup is needed.
    response = tokenizer.decode(input_ids[0, prompt_len:], skip_special_tokens=True)
    return response.strip()
35
+
36
# Gradio interface: one prompt textbox in, one response textbox out,
# wired to generate_response.
prompt_box = gr.Textbox(label="Your Prompt", placeholder="Type your question here...", lines=4)
reply_box = gr.Textbox(label="AI Response")

demo = gr.Interface(
    fn=generate_response,
    inputs=prompt_box,
    outputs=reply_box,
    title="Phi-4-Mini ONNX Chatbot",
    description="Runs locally with ONNX for fast inference (int4 optimized).",
)

# Launch the app
demo.launch()