Bahodir Nematjonov committed on
Commit 2f136fb · 1 Parent(s): e5cbb51
Files changed (2):
  1. Dockerfile +10 -6
  2. utils.py +22 -14
Dockerfile CHANGED
@@ -1,22 +1,26 @@
-# Use a lightweight Python image
-FROM python:3.9
+# Use a lightweight PyTorch image with CUDA support
+FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
 
 # Set working directory
 WORKDIR /code
 
-# Install system dependencies required for PostgreSQL and transformers
+# Install system dependencies
 RUN apt-get update && apt-get install -y \
     libpq-dev \
     python3-dev \
     gcc \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements.txt and install dependencies
+# Copy files
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Expose FastAPI port
+# Reduce memory usage and set cache directory
+ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
+ENV HF_HOME="/code/hf_cache"
+
+# Expose API port
 EXPOSE 7860
 
-# Start FastAPI
+# Run FastAPI
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
utils.py CHANGED
@@ -1,37 +1,45 @@
 import asyncio
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
-# Load latest available LLaMA model (Change this if LLaMA 3 becomes available)
-MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
+# Model name (ensure it is available on Hugging Face)
+MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # Use a smaller model if needed
 
-# Detect device (Use GPU if available)
+# Detect device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load tokenizer and model
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(
+
+# Enable disk offloading if using CPU (to prevent memory overload)
+with init_empty_weights():
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+# Offload model weights to disk if no GPU is available
+model = load_checkpoint_and_dispatch(
+    model,
     MODEL_NAME,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto"
+    device_map="auto",
+    offload_folder="/code/model_cache",  # Ensure a valid folder for offloading
+    offload_state_dict=True
 ).to(device)
 
-# Text generation pipeline for efficient inference
+# Hugging Face pipeline for text generation
 generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
 
 async def generate_stream(query: str):
-    """Stream responses using LLaMA."""
-
+    """Stream responses generated with Hugging Face Transformers (Mistral-7B-Instruct)."""
     input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
 
-    # Generate text
+    # Generate text with controlled memory usage
     output = generator(query, max_length=512, do_sample=True, temperature=0.7)
 
     response_text = output[0]["generated_text"]
-
-    # Simulate streaming
+
+    # Simulate streaming output
     for word in response_text.split():
         yield word + " "
         await asyncio.sleep(0.05)
-
+
     yield "\n"