abdullahalioo committed
Commit 2c97dd8 · verified · 1 Parent(s): 09dee7d

Update main.py

Files changed (1)
  1. main.py +30 -65
main.py CHANGED
@@ -1,24 +1,18 @@
-
-from fastapi import FastAPI, Request
+from fastapi import FastAPI
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
+import httpx
 import asyncio
-import logging
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+import json
 
 # FastAPI app
 app = FastAPI()
 
-# CORS Middleware (for frontend access)
+# CORS Middleware
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Update to specific frontend URL in production
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
@@ -28,67 +22,38 @@ app.add_middleware(
 class Question(BaseModel):
     question: str
 
-# Load the model and tokenizer
-model_name = "Qwen/Qwen2.5-7B-Instruct"
-try:
-    logger.info(f"Loading model {model_name}...")
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.float16,
-        device_map="auto",
-        trust_remote_code=True
-    )
-    logger.info("Model loaded successfully.")
-except Exception as e:
-    logger.error(f"Failed to load model: {e}")
-    raise
+# Your OWN Hosted HuggingFace Space URL
+YOUR_SPACE_URL = "https://your-space-name-username.hf.space"  # 🔥 change this!
 
 async def generate_response_chunks(prompt: str):
-    try:
-        # Prepare the input prompt
-        messages = [
-            {"role": "system", "content": "You are Orion AI assistant created by Abdullah Ali, who is very intelligent, 13 years old, and lives in Lahore."},
+    payload = {
+        "messages": [
+            {"role": "system", "content": "You are a Orion AI assistant created by abdullah ali who is very intelligent and he is 13 years old and lives in Lahore."},
             {"role": "user", "content": prompt}
-        ]
-        inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
-
-        # Asynchronous generator to yield tokens
-        async def stream_tokens():
-            for output in model.generate(
-                inputs,
-                max_new_tokens=512,
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True,
-                pad_token_id=tokenizer.eos_token_id,
-                return_dict_in_generate=True,
-                output_scores=False,
-                streaming=True
-            ):
-                token_id = output.sequences[0][-1]
-                token_text = tokenizer.decode([token_id], skip_special_tokens=True)
-                if token_text:
-                    yield token_text
-                    await asyncio.sleep(0.01)  # Control streaming speed
-            logger.info("Streaming completed.")
-
-        # Yield tokens from stream_tokens
-        async for token in stream_tokens():
-            yield token
-
-    except Exception as e:
-        logger.error(f"Error during generation: {e}")
-        yield f"Error occurred: {e}"
+        ],
+        "temperature": 0.7,
+        "max_tokens": 512,
+        "stream": True  # Tell your server to stream output
+    }
+
+    async with httpx.AsyncClient(timeout=None) as client:
+        async with client.stream("POST", f"{YOUR_SPACE_URL}/v1/chat/completions", json=payload) as response:
+            async for line in response.aiter_lines():
+                if line.strip():
+                    try:
+                        # The server sends stream chunks, decode them
+                        data = json.loads(line)
+                        content = data['choices'][0]['delta']['content']
+                        if content:
+                            for letter in content:
+                                yield letter
+                                await asyncio.sleep(0.01)  # simulate typing
+                    except Exception as e:
+                        yield f"Error decoding stream: {e}"
 
 @app.post("/ask")
 async def ask(question: Question):
-    logger.info(f"Received question: {question.question}")
     return StreamingResponse(
         generate_response_chunks(question.question),
         media_type="text/plain"
     )
-
-@app.get("/")
-async def root():
-    return {"message": "Orion AI Chat API is running!"}
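
For reference, a minimal sketch of how a frontend or script could consume the new streaming /ask endpoint. This client is not part of the commit; the base URL and port, the example question, and the use of httpx on the client side are assumptions for illustration only.

import asyncio
import httpx

API_URL = "http://localhost:7860/ask"  # assumed host/port; point this at wherever main.py is served

async def main() -> None:
    async with httpx.AsyncClient(timeout=None) as client:
        # POST a question and print the plain-text stream as chunks arrive
        async with client.stream("POST", API_URL, json={"question": "Who created you?"}) as response:
            async for chunk in response.aiter_text():
                print(chunk, end="", flush=True)
    print()

if __name__ == "__main__":
    asyncio.run(main())

Since the endpoint streams with media_type="text/plain", the client can print each chunk directly without any JSON decoding.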