import os
from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import StreamingResponse
from openai import AsyncOpenAI

app = FastAPI()

# System prompt prepended to every new chat session
system = '''You are DeepSeek R1, an advanced reasoning assistant.
Your responses consist of two parts:
1. A <think> block — This is your internal reasoning. You think step-by-step, carefully analyzing the question, considering context, alternatives, and edge cases. This section must be at least 10 lines long and enclosed between <think> and </think>. This part is not shown to the user in real-world applications, but it is visible during debugging or development.
2. The final answer — This is the polished, professional response provided after you have thought through the problem. It is clear, structured, and concise.
Always provide code fenced as ```<code>```.

Your behavior guidelines:
- Maintain a calm, analytical, and formal tone.
- Use bullet points or numbered lists when appropriate.
- Avoid casual language, emojis, or redundant filler.
- If context is missing, mention assumptions.
- Never refer to yourself as an AI or language model.
- Do not repeat the <thinking> part in your final answer.

Format every response exactly as follows:
<think>  
[Begin detailed, line-by-line reasoning here — minimum 10 lines. Think aloud.]  
</think>  
[Final answer starts here — no label, just a clean professional response.]  
'''

# In-memory chat history, keyed by chat_id (lost on restart)
chat_history = {}

# Supported models
AVAILABLE_MODELS = {
    "openai/gpt-4.1": "OpenAI GPT-4.1",
    "openai/gpt-4.1-mini": "OpenAI GPT-4.1-mini",
    "deepseek/DeepSeek-R1": "DeepSeek-R1",
    "microsoft/Phi-3.5-mini-instruct": "Phi-3.5-mini instruct",
    "meta/Meta-Llama-3.1-8B-Instruct": "Meta-Llama-3.1-8B-Instruct",
    # Add more as needed...
}


async def generate_ai_response(chat_id: str, prompt: str, model: str):
    token = os.getenv("GITHUB_TOKEN")
    if not token:
        raise HTTPException(status_code=500, detail="GitHub token not configured")

    if model not in AVAILABLE_MODELS:
        raise HTTPException(status_code=400, detail=f"Invalid model. Choose from: {', '.join(AVAILABLE_MODELS)}")

    endpoint = "https://models.github.ai/inference"
    client = AsyncOpenAI(base_url=endpoint, api_key=token)

    # Retrieve or initialize message history
    messages = chat_history.get(chat_id, [])
    if not messages:
        messages.append({"role": "system", "content": system})

    messages.append({"role": "user", "content": prompt})

    try:
        stream = await client.chat.completions.create(
            messages=messages,
            model=model,
            temperature=1.0,
            top_p=1.0,
            stream=True
        )

        # Persist the history only once generation has started
        chat_history[chat_id] = messages

        # Accumulate the reply so the assistant turn can be stored afterwards
        assistant_reply = ""
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                assistant_reply += chunk.choices[0].delta.content
                yield chunk.choices[0].delta.content

        # Store the assistant turn so follow-up prompts retain full context
        messages.append({"role": "assistant", "content": assistant_reply})

    except Exception as err:
        # The response may already be streaming, so raising an HTTPException
        # here cannot change the status code; report the failure in-stream.
        yield f"Error: {str(err)}"


@app.post("/generate")
async def generate_response(
    chat_id: str = Query(..., description="Chat session ID"),
    prompt: str = Query(..., description="User prompt"),
    model: str = Query("openai/gpt-4.1-mini", description="Model name")
):
    if not prompt:
        raise HTTPException(status_code=400, detail="Prompt cannot be empty")

    return StreamingResponse(
        generate_ai_response(chat_id, prompt, model),
        media_type="text/event-stream"
    )
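
# Example request (a sketch; assumes the app is served locally with uvicorn on
# port 8000, GITHUB_TOKEN is set, and a hypothetical session id "demo"):
#
#   curl -N -X POST "http://localhost:8000/generate?chat_id=demo&prompt=Hello&model=openai/gpt-4.1-mini"
#
# The -N flag disables curl's output buffering so streamed chunks appear as
# they arrive.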


@app.post("/reset")
async def reset_chat(chat_id: str = Query(..., description="Chat session ID to reset")):
    chat_history.pop(chat_id, None)
    return {"message": f"Chat history for {chat_id} has been cleared."}


def get_app():
    return app
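

# A minimal way to run the service directly (a sketch; assumes uvicorn is
# installed alongside fastapi and openai, and that GITHUB_TOKEN is exported
# in the environment):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)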