Bahodir Nematjonov committed
Commit 7c59172 · 1 Parent(s): 04f2bac

updated model

Files changed (3)
  1. Dockerfile +11 -16
  2. requirements.txt +4 -1
  3. utils.py +32 -29
Dockerfile CHANGED
@@ -1,24 +1,19 @@
- # Use the official Python 3.9 image
- FROM python:3.9

  # Set working directory
  WORKDIR /code

- # Copy requirements.txt and install dependencies
- COPY ./requirements.txt /code/requirements.txt
- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

- # Install Ollama
- RUN curl -fsSL https://ollama.com/install.sh | sh

- # Create a writable directory for Ollama
- RUN mkdir -p /home/user/.ollama && chmod -R 777 /home/user/.ollama
-
- # Set Ollama to use this directory
- ENV OLLAMA_HOME=/home/user/.ollama
-
- # Expose the FastAPI port
  EXPOSE 7860

- # Start Ollama in the background and then FastAPI
- CMD OLLAMA_HOME=/home/user/.ollama ollama serve & uvicorn main:app --host 0.0.0.0 --port 7860

+ # Use a lightweight PyTorch image with CUDA support
+ FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime

  # Set working directory
  WORKDIR /code

+ # Install dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt

+ # Reduce memory usage (optional)
+ ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
+ ENV HF_HOME="/code/hf_cache"

+ # Expose API port
  EXPOSE 7860

+ # Run FastAPI
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
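Because the new image caches Hugging Face downloads under HF_HOME=/code/hf_cache, a common follow-up (not part of this commit) is to pre-download the weights at build time so the Space does not fetch them on every cold start. A minimal sketch, assuming a hypothetical download_model.py and access to the gated meta-llama/Llama-2-7b-chat-hf repo:

# download_model.py - hypothetical pre-download step, not part of this commit.
# Assumes HF_HOME is already set (the Dockerfile points it at /code/hf_cache)
# and that the gated meta-llama/Llama-2-7b-chat-hf weights are accessible.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Both calls download into the HF_HOME cache, so the uvicorn start in CMD
# only has to load weights from the local cache instead of the Hub.
AutoTokenizer.from_pretrained(MODEL_NAME)
AutoModelForCausalLM.from_pretrained(MODEL_NAME)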
requirements.txt CHANGED
@@ -32,7 +32,10 @@ python-multipart
  sqlalchemy
  psycopg2
  python-dotenv
- ollama

  # Docker and deployment
  gunicorn # Optional: for production deployment

  sqlalchemy
  psycopg2
  python-dotenv
+ torch
+ transformers
+ accelerate
+

  # Docker and deployment
  gunicorn # Optional: for production deployment
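The new dependencies are unpinned, so a quick import check inside the built image confirms that pip resolved them and that accelerate (needed for device_map="auto" in utils.py) is available. A minimal sketch, assuming a hypothetical check_deps.py:

# check_deps.py - hypothetical sanity check, not part of this commit.
import accelerate  # required for device_map="auto" in utils.py
import torch
import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("CUDA available:", torch.cuda.is_available())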
utils.py CHANGED
@@ -1,34 +1,37 @@
  import asyncio
- import ollama
- from typing import List
- import time
- def cosine_similarity(embedding_0, embedding_1):
-     pass

- def generate_embedding(model, text: str, model_type: str) -> List[float]:
-     pass

  async def generate_stream(query: str):
-     """Stream responses from Ollama with automatic retries."""
-     max_retries = 5  # Retry 5 times
-     delay = 3  # Wait 3 seconds before retrying
-
-     for attempt in range(max_retries):
-         try:
-             stream = ollama.chat(
-                 model="mistral",  # Use your preferred model
-                 messages=[{"role": "user", "content": query}],
-                 stream=True
-             )
-             for chunk in stream:
-                 if "message" in chunk and "content" in chunk["message"]:
-                     yield chunk["message"]["content"]
-                     await asyncio.sleep(0)
-             return
-         except Exception as e:
-             print(f"❌ Ollama connection failed (Attempt {attempt+1}/{max_retries}): {str(e)}")
-             if attempt < max_retries - 1:
-                 time.sleep(delay)  # Wait before retrying
-             else:
-                 yield "⚠️ Error: Could not connect to Ollama after multiple attempts."

  import asyncio
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

+ # Load latest available LLaMA model (Change this if LLaMA 3 becomes available)
+ MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
+
+ # Detect device (Use GPU if available)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+     device_map="auto"
+ ).to(device)
+
+ # Text generation pipeline for efficient inference
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

  async def generate_stream(query: str):
+     """Stream responses using LLaMA."""
+
+     input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
+
+     # Generate text
+     output = generator(query, max_length=512, do_sample=True, temperature=0.7)
+
+     response_text = output[0]["generated_text"]
+
+     # Simulate streaming
+     for word in response_text.split():
+         yield word + " "
+         await asyncio.sleep(0.05)

+     yield "\n"
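The Dockerfile CMD still launches main:app, which is not part of this commit. A minimal sketch of how such an app could expose the new async generator over a hypothetical /generate endpoint, assuming main.py sits next to utils.py:

# main.py - hypothetical wiring, not part of this commit.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

from utils import generate_stream

app = FastAPI()

@app.get("/generate")
async def generate(query: str):
    # Forward each chunk yielded by generate_stream to the client
    # as it is produced (word by word in the updated utils.py).
    return StreamingResponse(generate_stream(query), media_type="text/plain")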