abdullahalioo committed
Commit 3ada3ad · verified · 1 Parent(s): e576e65

Update main.py

Files changed (1):
  1. main.py +43 -34
main.py CHANGED

@@ -2,14 +2,11 @@ from fastapi import FastAPI
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
-import httpx
-import asyncio
-import json
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 
-# FastAPI app
 app = FastAPI()
 
-# CORS Middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -18,42 +15,54 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Request body model
+# Load model and tokenizer (do this once at startup)
+model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
 class Question(BaseModel):
     question: str
 
-# Your OWN Hosted HuggingFace Space URL
-YOUR_SPACE_URL = "https://abdullahalioo-aiapp.hf.space"  # 🔥 change this!
-
-async def generate_response_chunks(prompt: str):
-    payload = {
-        "messages": [
-            {"role": "system", "content": "You are an Orion AI assistant created by Abdullah Ali who is very intelligent, 13 years old, and lives in Lahore."},
+def generate_response_chunks(prompt: str):
+    try:
+        # Prepare input
+        messages = [
+            {"role": "system", "content": "You are Orion AI assistant..."},
             {"role": "user", "content": prompt}
-        ],
-        "temperature": 0.7,
-        "max_tokens": 512,
-        "stream": True  # Tell your server to stream output
-    }
-
-    async with httpx.AsyncClient(timeout=None) as client:
-        async with client.stream("POST", f"{YOUR_SPACE_URL}/v1/chat/completions", json=payload) as response:
-            async for line in response.aiter_lines():
-                if line.strip():
-                    try:
-                        # The server sends stream chunks, decode them
-                        data = json.loads(line)
-                        content = data['choices'][0]['delta']['content']
-                        if content:
-                            for letter in content:
-                                yield letter
-                                await asyncio.sleep(0.01)  # simulate typing
-                    except Exception as e:
-                        yield f"Error decoding stream: {e}"
+        ]
+        inputs = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        ).to(model.device)
+
+        # Generate streamingly
+        with torch.no_grad():
+            for outputs in model.generate(
+                inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                streamer=None,  # We'll implement manual streaming
+                stopping_criteria=None
+            ):
+                chunk = outputs[0, inputs.shape[1]:]
+                text = tokenizer.decode(chunk, skip_special_tokens=True)
+                if text:
+                    yield text
+
+    except Exception as e:
+        yield f"Error occurred: {e}"
 
 @app.post("/ask")
 async def ask(question: Question):
     return StreamingResponse(
         generate_response_chunks(question.question),
         media_type="text/plain"
-    )
+    )
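
A note on the new model choice: "Qwen/Qwen2.5-VL-7B-Instruct" is a vision-language checkpoint, and depending on the installed transformers version, AutoModelForCausalLM may refuse to load its config. The snippet below is a minimal sketch, not part of the commit, assuming a transformers release that ships the dedicated Qwen2.5-VL class; for a text-only assistant, any text-only instruct checkpoint would also slot into the committed code unchanged.

# Sketch (assumption, not from the commit): load the VL checkpoint with its
# dedicated class if AutoModelForCausalLM rejects it. Requires a transformers
# version that includes Qwen2_5_VLForConditionalGeneration.
import torch
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration

model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)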
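
On the streaming side, the committed generate_response_chunks iterates over the return value of model.generate(), but generate() returns the finished output tensor rather than yielding tokens as they are produced, so the loop runs at most once per batch row (and the 2-D indexing of that 1-D row then raises). A common way to get real token-by-token streaming with transformers is a TextIteratorStreamer with generation running on a background thread. The following is a minimal sketch along those lines, reusing the tokenizer, model, and sampling parameters from the commit; it is an assumption-laden rework, not the author's implementation.

# Sketch, not part of the commit: token-by-token streaming with
# TextIteratorStreamer. Assumes the `tokenizer` and `model` objects loaded at
# startup in main.py are available and usable for text-only generation.
from threading import Thread
from transformers import TextIteratorStreamer

def generate_response_chunks(prompt: str):
    messages = [
        {"role": "system", "content": "You are Orion AI assistant..."},
        {"role": "user", "content": prompt},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # The streamer is fed tokens by generate() as they are sampled and exposes
    # them as an iterator of decoded text fragments.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        streamer=streamer,
    )
    # generate() blocks until the sequence is finished, so run it on a
    # background thread and yield fragments from the streamer as they arrive.
    Thread(target=model.generate, kwargs=generation_kwargs, daemon=True).start()
    for text in streamer:
        if text:
            yield text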
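
Finally, a small client-side sketch of consuming the /ask endpoint as a stream. The base URL and port are placeholder assumptions (point them at wherever the Space or local server runs); httpx is just one client that can iterate a chunked text/plain response.

# Sketch, not part of the commit: reading the streamed reply from /ask.
# The URL below is a placeholder assumption; adjust it to your deployment.
import httpx

with httpx.stream(
    "POST",
    "http://localhost:7860/ask",
    json={"question": "Hello, who are you?"},
    timeout=None,
) as response:
    for chunk in response.iter_text():
        print(chunk, end="", flush=True)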