Bahodir Nematjonov committed on
Commit 4fb1c18 · 1 Parent(s): 2f136fb
Files changed (3)
  1. Dockerfile +19 -18
  2. requirements.txt +2 -3
  3. utils.py +17 -42
Dockerfile CHANGED
@@ -1,26 +1,27 @@
- # Use a lightweight PyTorch image with CUDA support
- FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime

- # Set working directory
  WORKDIR /code

- # Install system dependencies
- RUN apt-get update && apt-get install -y \
-     libpq-dev \
-     python3-dev \
-     gcc \
-     && rm -rf /var/lib/apt/lists/*

- # Copy files
- COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt

- # Reduce memory usage and set cache directory
- ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
- ENV HF_HOME="/code/hf_cache"

- # Expose API port
- EXPOSE 7860

- # Run FastAPI
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

+ # Use the official Python 3.9 image
+ FROM python:3.9

+ # Set the working directory to /code
  WORKDIR /code

+ # Copy the current directory contents into the container at /code
+ COPY ./requirements.txt /code/requirements.txt

+ # Install requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH

+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app

+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
+ COPY --chown=user . $HOME/app

  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt CHANGED
@@ -32,9 +32,8 @@ python-multipart
  sqlalchemy
  psycopg2-binary
  python-dotenv
- torch
- transformers
- accelerate

  # Docker and deployment

  sqlalchemy
  psycopg2-binary
  python-dotenv
+ ollama
+

  # Docker and deployment
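
Note (not part of this commit): the ollama package that replaces torch, transformers, and accelerate is only a thin client. It does not load model weights itself; it expects an Ollama server to be running and serving the model. A minimal sketch of pointing the client at a server, assuming one is reachable at the default endpoint:

    import ollama

    # Explicit client; http://localhost:11434 is Ollama's default endpoint and
    # can also be overridden via the OLLAMA_HOST environment variable.
    client = ollama.Client(host="http://localhost:11434")
    reply = client.chat(model="llama3", messages=[{"role": "user", "content": "ping"}])
    print(reply["message"]["content"])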
utils.py CHANGED
@@ -1,45 +1,20 @@
  import asyncio
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-
- # Model name (Ensure it's available on Hugging Face)
- MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # Use smaller if needed
-
- # Detect device
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Load tokenizer
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-
- # Enable disk offloading if using CPU (to prevent memory overload)
- with init_empty_weights():
-     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
-
- # Offload model to disk if no GPU available
- model = load_checkpoint_and_dispatch(
-     model,
-     MODEL_NAME,
-     device_map="auto",
-     offload_folder="/code/model_cache",  # Ensure a valid folder for offloading
-     offload_state_dict=True
- ).to(device)
-
- # Hugging Face pipeline for text generation
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

  async def generate_stream(query: str):
-     """Stream responses using Hugging Face Transformers (LLaMA 2)."""
-     input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
-
-     # Generate text with controlled memory usage
-     output = generator(query, max_length=512, do_sample=True, temperature=0.7)
-
-     response_text = output[0]["generated_text"]
-
-     # Simulate streaming output
-     for word in response_text.split():
-         yield word + " "
-         await asyncio.sleep(0.05)
-
-     yield "\n"

  import asyncio
+ import ollama

  async def generate_stream(query: str):
+     """Generates streamed responses from Ollama using LLaMA 3 or Mistral."""
+     try:
+         stream = ollama.chat(
+             model="llama3",  # Change to 'mistral' if needed
+             messages=[{"role": "user", "content": query}],
+             stream=True
+         )
+
+         # Stream the response in real-time
+         for chunk in stream:
+             if "message" in chunk and "content" in chunk["message"]:
+                 yield chunk["message"]["content"]
+                 await asyncio.sleep(0)
+
+     except Exception as e:
+         yield f"⚠️ Error: {str(e)}"
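
How this generator is consumed is not shown in the commit, but since the Dockerfile CMD runs uvicorn main:app, a minimal sketch of wiring generate_stream into a FastAPI streaming endpoint could look like the following (the /generate route and its query parameter are assumptions, not taken from this repo):

    # Hypothetical main.py excerpt: stream generate_stream() chunks to the client.
    from fastapi import FastAPI
    from fastapi.responses import StreamingResponse

    from utils import generate_stream

    app = FastAPI()

    @app.get("/generate")
    async def generate(query: str):
        # StreamingResponse consumes the async generator chunk by chunk
        return StreamingResponse(generate_stream(query), media_type="text/plain")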