Bahodir Nematjonov committed on
Commit 2f136fb · 1 Parent(s): e5cbb51
Files changed (2):
  1. Dockerfile +10 -6
  2. utils.py +22 -14
Dockerfile CHANGED
@@ -1,22 +1,26 @@
-# Use a lightweight Python image
-FROM python:3.9
+# Use a lightweight PyTorch image with CUDA support
+FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
 
 # Set working directory
 WORKDIR /code
 
-# Install system dependencies required for PostgreSQL and transformers
+# Install system dependencies
 RUN apt-get update && apt-get install -y \
     libpq-dev \
     python3-dev \
     gcc \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements.txt and install dependencies
+# Copy files
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Expose FastAPI port
+# Reduce memory usage and set cache directory
+ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
+ENV HF_HOME="/code/hf_cache"
+
+# Expose API port
 EXPOSE 7860
 
-# Start FastAPI
+# Run FastAPI
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
utils.py CHANGED
@@ -1,37 +1,45 @@
 import asyncio
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
-# Load latest available LLaMA model (Change this if LLaMA 3 becomes available)
-MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
+# Model name (ensure it is available on Hugging Face)
+MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # Use a smaller model if needed
 
-# Detect device (Use GPU if available)
+# Detect device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load tokenizer and model
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForCausalLM.from_pretrained(
+
+# Enable disk offloading if using CPU (to prevent memory overload)
+with init_empty_weights():
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+# Offload model weights to disk if no GPU is available
+model = load_checkpoint_and_dispatch(
+    model,
     MODEL_NAME,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto"
+    device_map="auto",
+    offload_folder="/code/model_cache",  # Ensure a valid folder for offloading
+    offload_state_dict=True
 ).to(device)
 
-# Text generation pipeline for efficient inference
+# Hugging Face pipeline for text generation
 generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
 
 async def generate_stream(query: str):
-    """Stream responses using LLaMA."""
-
+    """Stream responses generated with Hugging Face Transformers (Mistral-7B-Instruct)."""
     input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
 
-    # Generate text
+    # Generate text with controlled memory usage
     output = generator(query, max_length=512, do_sample=True, temperature=0.7)
 
     response_text = output[0]["generated_text"]
-
-    # Simulate streaming
+
+    # Simulate streaming output
     for word in response_text.split():
         yield word + " "
         await asyncio.sleep(0.05)
-
+
     yield "\n"