Bahodir Nematjonov committed
Commit 7c59172 · 1 Parent(s): 04f2bac

updated model

Files changed (3)
  1. Dockerfile +11 -16
  2. requirements.txt +4 -1
  3. utils.py +32 -29
Dockerfile CHANGED
@@ -1,24 +1,19 @@
- # Use the official Python 3.9 image
- FROM python:3.9

  # Set working directory
  WORKDIR /code

- # Copy requirements.txt and install dependencies
- COPY ./requirements.txt /code/requirements.txt
- RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

- # Install Ollama
- RUN curl -fsSL https://ollama.com/install.sh | sh

- # Create a writable directory for Ollama
- RUN mkdir -p /home/user/.ollama && chmod -R 777 /home/user/.ollama
-
- # Set Ollama to use this directory
- ENV OLLAMA_HOME=/home/user/.ollama
-
- # Expose the FastAPI port
  EXPOSE 7860

- # Start Ollama in the background and then FastAPI
- CMD OLLAMA_HOME=/home/user/.ollama ollama serve & uvicorn main:app --host 0.0.0.0 --port 7860

+ # Use a lightweight PyTorch image with CUDA support
+ FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime

  # Set working directory
  WORKDIR /code

+ # Install dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt

+ # Reduce memory usage (optional)
+ ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
+ ENV HF_HOME="/code/hf_cache"

+ # Expose API port
  EXPOSE 7860

+ # Run FastAPI
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
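Because the new image caches Hugging Face downloads under HF_HOME=/code/hf_cache, a common follow-up (not part of this commit) is to pre-download the weights at build time so the Space does not fetch them on every cold start. A minimal sketch, assuming a hypothetical download_model.py and access to the gated meta-llama/Llama-2-7b-chat-hf repo:

# download_model.py - hypothetical pre-download step, not part of this commit.
# Assumes HF_HOME is already set (the Dockerfile points it at /code/hf_cache)
# and that the gated meta-llama/Llama-2-7b-chat-hf weights are accessible.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Both calls download into the HF_HOME cache, so the uvicorn start in CMD
# only has to load weights from the local cache instead of the Hub.
AutoTokenizer.from_pretrained(MODEL_NAME)
AutoModelForCausalLM.from_pretrained(MODEL_NAME)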
requirements.txt CHANGED
@@ -32,7 +32,10 @@ python-multipart
  sqlalchemy
  psycopg2
  python-dotenv
- ollama

  # Docker and deployment
  gunicorn # Optional: for production deployment

  sqlalchemy
  psycopg2
  python-dotenv
+ torch
+ transformers
+ accelerate
+

  # Docker and deployment
  gunicorn # Optional: for production deployment
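The new dependencies are unpinned, so a quick import check inside the built image confirms that pip resolved them and that accelerate (needed for device_map="auto" in utils.py) is available. A minimal sketch, assuming a hypothetical check_deps.py:

# check_deps.py - hypothetical sanity check, not part of this commit.
import accelerate  # required for device_map="auto" in utils.py
import torch
import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("CUDA available:", torch.cuda.is_available())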
utils.py CHANGED
@@ -1,34 +1,37 @@
  import asyncio
- import ollama
- from typing import List
- import time
- def cosine_similarity(embedding_0, embedding_1):
-     pass

- def generate_embedding(model, text: str, model_type: str) -> List[float]:
-     pass

  async def generate_stream(query: str):
-     """Stream responses from Ollama with automatic retries."""
-     max_retries = 5  # Retry 5 times
-     delay = 3  # Wait 3 seconds before retrying
-
-     for attempt in range(max_retries):
-         try:
-             stream = ollama.chat(
-                 model="mistral",  # Use your preferred model
-                 messages=[{"role": "user", "content": query}],
-                 stream=True
-             )
-             for chunk in stream:
-                 if "message" in chunk and "content" in chunk["message"]:
-                     yield chunk["message"]["content"]
-                     await asyncio.sleep(0)
-             return
-         except Exception as e:
-             print(f"❌ Ollama connection failed (Attempt {attempt+1}/{max_retries}): {str(e)}")
-             if attempt < max_retries - 1:
-                 time.sleep(delay)  # Wait before retrying
-             else:
-                 yield "⚠️ Error: Could not connect to Ollama after multiple attempts."

  import asyncio
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

+ # Load latest available LLaMA model (Change this if LLaMA 3 becomes available)
+ MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
+
+ # Detect device (Use GPU if available)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Load tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_NAME,
+     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+     device_map="auto"
+ ).to(device)
+
+ # Text generation pipeline for efficient inference
+ generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

  async def generate_stream(query: str):
+     """Stream responses using LLaMA."""
+
+     input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
+
+     # Generate text
+     output = generator(query, max_length=512, do_sample=True, temperature=0.7)
+
+     response_text = output[0]["generated_text"]
+
+     # Simulate streaming
+     for word in response_text.split():
+         yield word + " "
+         await asyncio.sleep(0.05)

+     yield "\n"
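The Dockerfile CMD still launches main:app, which is not part of this commit. A minimal sketch of how such an app could expose the new async generator over a hypothetical /generate endpoint, assuming main.py sits next to utils.py:

# main.py - hypothetical wiring, not part of this commit.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

from utils import generate_stream

app = FastAPI()

@app.get("/generate")
async def generate(query: str):
    # Forward each chunk yielded by generate_stream to the client
    # as it is produced (word by word in the updated utils.py).
    return StreamingResponse(generate_stream(query), media_type="text/plain")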