Bahodir Nematjonov committed on
Commit 4fb1c18 · 1 Parent(s): 2f136fb
Files changed (3)
  1. Dockerfile +19 -18
  2. requirements.txt +2 -3
  3. utils.py +17 -42
Dockerfile CHANGED
@@ -1,26 +1,27 @@
- # Use a lightweight PyTorch image with CUDA support
- FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime

- # Set working directory
  WORKDIR /code

- # Install system dependencies
- RUN apt-get update && apt-get install -y \
-     libpq-dev \
-     python3-dev \
-     gcc \
-     && rm -rf /var/lib/apt/lists/*

- # Copy files
- COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt

- # Reduce memory usage and set cache directory
- ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
- ENV HF_HOME="/code/hf_cache"

- # Expose API port
- EXPOSE 7860

- # Run FastAPI
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

+ # Use the official Python 3.9 image
+ FROM python:3.9

+ # Set the working directory to /code
  WORKDIR /code

+ # Copy the current directory contents into the container at /code
+ COPY ./requirements.txt /code/requirements.txt

+ # Install requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH

+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app

+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
+ COPY --chown=user . $HOME/app

  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt CHANGED
@@ -32,9 +32,8 @@ python-multipart
  sqlalchemy
  psycopg2-binary
  python-dotenv
- torch
- transformers
- accelerate

  # Docker and deployment

  sqlalchemy
  psycopg2-binary
  python-dotenv
+ ollama
+

  # Docker and deployment
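
Note (not part of this commit): the ollama package that replaces torch, transformers, and accelerate is only a thin client. It does not load model weights itself; it expects an Ollama server to be running and serving the model. A minimal sketch of pointing the client at a server, assuming one is reachable at the default endpoint:

    import ollama

    # Explicit client; http://localhost:11434 is Ollama's default endpoint and
    # can also be overridden via the OLLAMA_HOST environment variable.
    client = ollama.Client(host="http://localhost:11434")
    reply = client.chat(model="llama3", messages=[{"role": "user", "content": "ping"}])
    print(reply["message"]["content"])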
utils.py CHANGED
@@ -1,45 +1,20 @@
  import asyncio
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-
- # Model name (Ensure it's available on Hugging Face)
- MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # Use smaller if needed
-
- # Detect device
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Load tokenizer
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-
- # Enable disk offloading if using CPU (to prevent memory overload)
- with init_empty_weights():
-     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-
- # Offload model to disk if no GPU available
- model = load_checkpoint_and_dispatch(
-     model,
-     MODEL_NAME,
-     device_map="auto",
-     offload_folder="/code/model_cache",  # Ensure a valid folder for offloading
-     offload_state_dict=True
- ).to(device)
-
- # Hugging Face pipeline for text generation
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

  async def generate_stream(query: str):
-     """Stream responses using Hugging Face Transformers (LLaMA 2)."""
-     input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
-
-     # Generate text with controlled memory usage
-     output = generator(query, max_length=512, do_sample=True, temperature=0.7)
-
-     response_text = output[0]["generated_text"]
-
-     # Simulate streaming output
-     for word in response_text.split():
-         yield word + " "
-         await asyncio.sleep(0.05)
-
-     yield "\n"

  import asyncio
+ import ollama

  async def generate_stream(query: str):
+     """Generates streamed responses from Ollama using LLaMA 3 or Mistral."""
+     try:
+         stream = ollama.chat(
+             model="llama3",  # Change to 'mistral' if needed
+             messages=[{"role": "user", "content": query}],
+             stream=True
+         )
+
+         # Stream the response in real-time
+         for chunk in stream:
+             if "message" in chunk and "content" in chunk["message"]:
+                 yield chunk["message"]["content"]
+                 await asyncio.sleep(0)
+
+     except Exception as e:
+         yield f"⚠️ Error: {str(e)}"
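
How this generator is consumed is not shown in the commit, but since the Dockerfile CMD runs uvicorn main:app, a minimal sketch of wiring generate_stream into a FastAPI streaming endpoint could look like the following (the /generate route and its query parameter are assumptions, not taken from this repo):

    # Hypothetical main.py excerpt: stream generate_stream() chunks to the client.
    from fastapi import FastAPI
    from fastapi.responses import StreamingResponse

    from utils import generate_stream

    app = FastAPI()

    @app.get("/generate")
    async def generate(query: str):
        # StreamingResponse consumes the async generator chunk by chunk
        return StreamingResponse(generate_stream(query), media_type="text/plain")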