Spaces:

Dragneel
/

TinyLlama

Sleeping

App Files Community

Drag2121 committed on Sep 17, 2024

Commit

0ba5adf

1 Parent(s): 15ec193

stream code again

Browse files

Files changed (1) hide show

app.py +29 -15

app.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import os
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from langchain_ollama import ChatOllama
-from langchain.schema import StrOutputParser
-from langchain.prompts import ChatPromptTemplate
 import logging
 from functools import lru_cache
@@ -13,19 +12,12 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app = FastAPI()
 MODEL_NAME = 'phi3:mini'
 @lru_cache()
 def get_llm():
     return ChatOllama(model=MODEL_NAME)
-@lru_cache()
-def get_chain():
-    llm = get_llm()
-    prompt = ChatPromptTemplate.from_template("Question: {question}\n\nAnswer:")
-    return prompt | llm | StrOutputParser()
 class Question(BaseModel):
     text: str
@@ -37,20 +29,42 @@ def read_root():
 async def ask_question(question: Question):
     try:
         logger.info(f"Received question: {question.text}")
-        chain = get_chain()
-        response = chain.invoke({"question": question.text})
         logger.info("Response generated successfully")
-        return {"answer": response}
     except Exception as e:
         logger.error(f"Error in /ask endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 @app.on_event("startup")
 async def startup_event():
     logger.info(f"Starting up with model: {MODEL_NAME}")
     # Warm up the cache
-    get_chain()
 @app.on_event("shutdown")
 async def shutdown_event():

 import os
 from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from langchain_ollama import ChatOllama
+from langchain.schema import HumanMessage
 import logging
 from functools import lru_cache
 logger = logging.getLogger(__name__)
 app = FastAPI()
 MODEL_NAME = 'phi3:mini'
 @lru_cache()
 def get_llm():
     return ChatOllama(model=MODEL_NAME)
 class Question(BaseModel):
     text: str
 async def ask_question(question: Question):
     try:
         logger.info(f"Received question: {question.text}")
+        llm = get_llm()
+        messages = [HumanMessage(content=question.text)]
+        response = llm(messages)
         logger.info("Response generated successfully")
+        return {"answer": response.content}
     except Exception as e:
         logger.error(f"Error in /ask endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
+@app.post("/ask_stream")
+async def ask_question_stream(question: Question):
+    try:
+        logger.info(f"Received question for streaming: {question.text}")
+        llm = get_llm()
+        messages = [HumanMessage(content=question.text)]
+        async def generate():
+            full_response = ""
+            async for chunk in llm.astream(messages):
+                if chunk.content:
+                    full_response += chunk.content
+                    yield chunk.content
+            # Log the full response after streaming is complete
+            logger.info(f"Full streamed response: {full_response}")
+        return StreamingResponse(generate(), media_type="text/plain")
+    except Exception as e:
+        logger.error(f"Error in /ask_stream endpoint: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
 @app.on_event("startup")
 async def startup_event():
     logger.info(f"Starting up with model: {MODEL_NAME}")
     # Warm up the cache
+    get_llm()
 @app.on_event("shutdown")
 async def shutdown_event():