import os

from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import StreamingResponse
from openai import AsyncOpenAI

app = FastAPI()

# System prompt
system = '''You are DeepSeek R1, an advanced reasoning assistant. Your responses consist of two parts:

1. A <think> block — This is your internal reasoning. You think step-by-step, carefully analyzing the question, considering context, alternatives, and edge cases. This section must be at least 10 lines long and enclosed between <think> and </think>. This part is not shown to the user in real-world applications, but is visible during debugging or development.
2. The final answer — This is the polished, professional response provided after you’ve thought through the problem. It is clear, structured, and concise.
3. Always provide code inside fenced code blocks (``` ```).

Your behavior guidelines:
- Maintain a calm, analytical, and formal tone.
- Use bullet points or numbered lists when appropriate.
- Avoid casual language, emojis, or redundant filler.
- If context is missing, mention assumptions.
- Never refer to yourself as an AI or language model.
- Do not repeat the <think> part in your final answer.

Format every response exactly as follows:

<think>
[Begin detailed, line-by-line reasoning here — minimum 10 lines. Think aloud.]
</think>

[Final answer starts here — no label, just a clean professional response.]
'''

# In-memory chat history, keyed by chat session ID
chat_history = {}

# Supported models
AVAILABLE_MODELS = {
    "openai/gpt-4.1": "OpenAI GPT-4.1",
    "openai/gpt-4.1-mini": "OpenAI GPT-4.1-mini",
    "deepseek/DeepSeek-R1": "DeepSeek-R1",
    "microsoft/Phi-3.5-mini-instruct": "Phi-3.5-mini instruct",
    "meta/Meta-Llama-3.1-8B-Instruct": "Meta-Llama-3.1-8B-Instruct",
    # Add more as needed...
}


async def generate_ai_response(chat_id: str, prompt: str, model: str):
    token = os.getenv("GITHUB_TOKEN")
    if not token:
        raise HTTPException(status_code=500, detail="GitHub token not configured")

    if model not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid model. Choose from: {', '.join(AVAILABLE_MODELS)}",
        )

    endpoint = "https://models.github.ai/inference"
    client = AsyncOpenAI(base_url=endpoint, api_key=token)

    # Retrieve or initialize message history
    messages = chat_history.get(chat_id, [])
    if not messages:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    try:
        stream = await client.chat.completions.create(
            messages=messages,
            model=model,
            temperature=1.0,
            top_p=1.0,
            stream=True,
        )

        # Update history only if generation starts
        chat_history[chat_id] = messages

        # Stream tokens to the client while collecting the full reply
        assistant_reply = ""
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                content = chunk.choices[0].delta.content
                assistant_reply += content
                yield content

        # Store the assistant's reply so follow-up turns keep their context
        messages.append({"role": "assistant", "content": assistant_reply})
    except Exception as err:
        # The streaming response has already started, so raising an HTTPException
        # here would have no effect; report the error in-band instead.
        yield f"Error: {str(err)}"


@app.post("/generate")
async def generate_response(
    chat_id: str = Query(..., description="Chat session ID"),
    prompt: str = Query(..., description="User prompt"),
    model: str = Query("openai/gpt-4.1-mini", description="Model name"),
):
    if not prompt:
        raise HTTPException(status_code=400, detail="Prompt cannot be empty")

    return StreamingResponse(
        generate_ai_response(chat_id, prompt, model),
        media_type="text/event-stream",
    )


@app.post("/reset")
async def reset_chat(chat_id: str = Query(..., description="Chat session ID to reset")):
    chat_history.pop(chat_id, None)
    return {"message": f"Chat history for {chat_id} has been cleared."}


def get_app():
    return app
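

# A minimal sketch of how this app could be run and exercised locally.
# The use of uvicorn and the host/port values are assumptions, not part of the
# original application; adjust them for your deployment.
#
# Example requests (hypothetical chat_id "demo"):
#   curl -N -X POST "http://localhost:8000/generate?chat_id=demo&prompt=Hello&model=openai/gpt-4.1-mini"
#   curl -X POST "http://localhost:8000/reset?chat_id=demo"
if __name__ == "__main__":
    import uvicorn

    # GITHUB_TOKEN must be set in the environment before starting the server.
    uvicorn.run(get_app(), host="0.0.0.0", port=8000)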