abdullahalioo committed
Commit 3ada3ad · verified · 1 Parent(s): e576e65

Update main.py

Files changed (1):
  1. main.py +43 -34
main.py CHANGED

@@ -2,14 +2,11 @@ from fastapi import FastAPI
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
-import httpx
-import asyncio
-import json
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
 
-# FastAPI app
 app = FastAPI()
 
-# CORS Middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -18,42 +15,54 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# Request body model
+# Load model and tokenizer (do this once at startup)
+model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
 class Question(BaseModel):
     question: str
 
-# Your OWN Hosted HuggingFace Space URL
-YOUR_SPACE_URL = "https://abdullahalioo-aiapp.hf.space"  # 🔥 change this!
-
-async def generate_response_chunks(prompt: str):
-    payload = {
-        "messages": [
-            {"role": "system", "content": "You are an Orion AI assistant created by Abdullah Ali who is very intelligent, 13 years old, and lives in Lahore."},
+def generate_response_chunks(prompt: str):
+    try:
+        # Prepare input
+        messages = [
+            {"role": "system", "content": "You are Orion AI assistant..."},
             {"role": "user", "content": prompt}
-        ],
-        "temperature": 0.7,
-        "max_tokens": 512,
-        "stream": True  # Tell your server to stream output
-    }
-
-    async with httpx.AsyncClient(timeout=None) as client:
-        async with client.stream("POST", f"{YOUR_SPACE_URL}/v1/chat/completions", json=payload) as response:
-            async for line in response.aiter_lines():
-                if line.strip():
-                    try:
-                        # The server sends stream chunks, decode them
-                        data = json.loads(line)
-                        content = data['choices'][0]['delta']['content']
-                        if content:
-                            for letter in content:
-                                yield letter
-                                await asyncio.sleep(0.01)  # simulate typing
-                    except Exception as e:
-                        yield f"Error decoding stream: {e}"
+        ]
+        inputs = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        ).to(model.device)
+
+        # Generate streamingly
+        with torch.no_grad():
+            for outputs in model.generate(
+                inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.9,
+                streamer=None,  # We'll implement manual streaming
+                stopping_criteria=None
+            ):
+                chunk = outputs[0, inputs.shape[1]:]
+                text = tokenizer.decode(chunk, skip_special_tokens=True)
+                if text:
+                    yield text
+
+    except Exception as e:
+        yield f"Error occurred: {e}"
 
 @app.post("/ask")
 async def ask(question: Question):
     return StreamingResponse(
         generate_response_chunks(question.question),
         media_type="text/plain"
-    )
+    )
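
A note on the new model choice: "Qwen/Qwen2.5-VL-7B-Instruct" is a vision-language checkpoint, and depending on the installed transformers version, AutoModelForCausalLM may refuse to load its config. The snippet below is a minimal sketch, not part of the commit, assuming a transformers release that ships the dedicated Qwen2.5-VL class; for a text-only assistant, any text-only instruct checkpoint would also slot into the committed code unchanged.

# Sketch (assumption, not from the commit): load the VL checkpoint with its
# dedicated class if AutoModelForCausalLM rejects it. Requires a transformers
# version that includes Qwen2_5_VLForConditionalGeneration.
import torch
from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration

model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)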
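
On the streaming side, the committed generate_response_chunks iterates over the return value of model.generate(), but generate() returns the finished output tensor rather than yielding tokens as they are produced, so the loop runs at most once per batch row (and the 2-D indexing of that 1-D row then raises). A common way to get real token-by-token streaming with transformers is a TextIteratorStreamer with generation running on a background thread. The following is a minimal sketch along those lines, reusing the tokenizer, model, and sampling parameters from the commit; it is an assumption-laden rework, not the author's implementation.

# Sketch, not part of the commit: token-by-token streaming with
# TextIteratorStreamer. Assumes the `tokenizer` and `model` objects loaded at
# startup in main.py are available and usable for text-only generation.
from threading import Thread
from transformers import TextIteratorStreamer

def generate_response_chunks(prompt: str):
    messages = [
        {"role": "system", "content": "You are Orion AI assistant..."},
        {"role": "user", "content": prompt},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # The streamer is fed tokens by generate() as they are sampled and exposes
    # them as an iterator of decoded text fragments.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        streamer=streamer,
    )
    # generate() blocks until the sequence is finished, so run it on a
    # background thread and yield fragments from the streamer as they arrive.
    Thread(target=model.generate, kwargs=generation_kwargs, daemon=True).start()
    for text in streamer:
        if text:
            yield text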
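
Finally, a small client-side sketch of consuming the /ask endpoint as a stream. The base URL and port are placeholder assumptions (point them at wherever the Space or local server runs); httpx is just one client that can iterate a chunked text/plain response.

# Sketch, not part of the commit: reading the streamed reply from /ask.
# The URL below is a placeholder assumption; adjust it to your deployment.
import httpx

with httpx.stream(
    "POST",
    "http://localhost:7860/ask",
    json={"question": "Hello, who are you?"},
    timeout=None,
) as response:
    for chunk in response.iter_text():
        print(chunk, end="", flush=True)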