Bahodir Nematjonov committed
Commit a4ac1ab · 1 Parent(s): efb3b66

debuging docker

Files changed (2):
  1. Dockerfile +4 -20
  2. utils.py +23 -12
Dockerfile CHANGED
@@ -8,27 +8,11 @@ WORKDIR /code
 COPY ./requirements.txt /code/requirements.txt
 RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
 
-# Install Ollama (needed for LLM response generation)
+# Install Ollama
 RUN curl -fsSL https://ollama.com/install.sh | sh
 
-# Create a new user named "user" with user ID 1000 (non-root user for security)
-RUN useradd -m -u 1000 user
-
-# Switch to the "user" user
-USER user
-
-# Set environment variables
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH
-
-# Set the working directory to the user's home directory
-WORKDIR $HOME/app
-
-# Copy project files and set ownership to the user
-COPY --chown=user . $HOME/app
-
-# Expose the port FastAPI will run on
+# Expose FastAPI's port
 EXPOSE 7860
 
-# Start FastAPI server with Uvicorn
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+# Start Ollama in the background and then run FastAPI
+CMD ollama serve & uvicorn main:app --host 0.0.0.0 --port 7860
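
Note that "ollama serve &" and Uvicorn start concurrently, so the API can begin handling requests before the Ollama server is listening; the retry loop added to utils.py below works around that race. An alternative is to block once at startup until Ollama answers. A minimal Python sketch, assuming a hypothetical wait_for_ollama() helper that is not part of this commit:

import time
import ollama

def wait_for_ollama(timeout: float = 60.0, interval: float = 2.0) -> None:
    """Hypothetical helper (not in this commit): block until the local
    Ollama server answers, or raise once `timeout` seconds have passed."""
    deadline = time.monotonic() + timeout
    while True:
        try:
            ollama.list()  # Any cheap call that requires the server to be up
            return
        except Exception:
            if time.monotonic() >= deadline:
                raise RuntimeError("Ollama server did not become ready in time")
            time.sleep(interval)

Calling a helper like this from the app's startup keeps the wait-for-Ollama policy in one place instead of inside every generator.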
utils.py CHANGED
@@ -1,7 +1,7 @@
 import asyncio
 import ollama
 from typing import List
-
+import time
 def cosine_similarity(embedding_0, embedding_1):
     pass
 
@@ -9,15 +9,26 @@ def generate_embedding(model, text: str, model_type: str) -> List[float]:
     pass
 
 async def generate_stream(query: str):
-    """Stream responses from Ollama in real-time."""
-    stream = ollama.chat(
-        model="llama3.2",  # Choose your model (mistral, llama2, gemma)
-        messages=[{"role": "user", "content": query}],
-        stream=True  # Enable streaming
-    )
-
-    for chunk in stream:
-        if "message" in chunk and "content" in chunk["message"]:
-            yield chunk["message"]["content"]
-        await asyncio.sleep(0)  # Allow async execution
+    """Stream responses from Ollama with automatic retries."""
+    max_retries = 5  # Retry 5 times
+    delay = 3  # Wait 3 seconds before retrying
+
+    for attempt in range(max_retries):
+        try:
+            stream = ollama.chat(
+                model="mistral",  # Use your preferred model
+                messages=[{"role": "user", "content": query}],
+                stream=True
+            )
+            for chunk in stream:
+                if "message" in chunk and "content" in chunk["message"]:
+                    yield chunk["message"]["content"]
+                await asyncio.sleep(0)
+            return
+        except Exception as e:
+            print(f"❌ Ollama connection failed (Attempt {attempt+1}/{max_retries}): {str(e)}")
+            if attempt < max_retries - 1:
+                time.sleep(delay)  # Wait before retrying
+            else:
+                yield "⚠️ Error: Could not connect to Ollama after multiple attempts."
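
Since generate_stream is an async generator, the FastAPI app referenced by the Dockerfile's CMD (main:app) can stream its output directly with StreamingResponse. A minimal sketch, assuming a hypothetical /generate endpoint and request schema; main.py itself is not part of this commit:

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from utils import generate_stream

app = FastAPI()

class Query(BaseModel):
    query: str  # Hypothetical request body; the real schema is not shown here

@app.post("/generate")
async def generate(body: Query):
    # StreamingResponse consumes the async generator chunk by chunk,
    # so tokens reach the client as Ollama produces them.
    return StreamingResponse(generate_stream(body.query), media_type="text/plain")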