Spaces: Running

Bahodir Nematjonov committed · 7c59172
1 Parent(s): 04f2bac

updated model

Browse files
- Dockerfile +11 -16
- requirements.txt +4 -1
- utils.py +32 -29
Dockerfile
CHANGED
@@ -1,24 +1,19 @@
-# Use …
-FROM …
+# Use a lightweight PyTorch image with CUDA support
+FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime

 # Set working directory
 WORKDIR /code

-# …
-COPY …
-RUN pip install --no-cache-dir …
+# Install dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt

-# …
-…
+# Reduce memory usage (optional)
+ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
+ENV HF_HOME="/code/hf_cache"

-# …
-RUN mkdir -p /home/user/.ollama && chmod -R 777 /home/user/.ollama
-
-# Set Ollama to use this directory
-ENV OLLAMA_HOME=/home/user/.ollama
-
-# Expose the FastAPI port
+# Expose API port
 EXPOSE 7860

-# …
-CMD …
+# Run FastAPI
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

(Removed lines shown as "…" were truncated in the page extraction and are not recoverable.)
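The new CMD boots uvicorn against main:app, but main.py itself is not touched by this commit. A minimal sketch of what that entry point could look like, wiring generate_stream from utils.py into a streaming FastAPI route (the /generate path and the request model are assumptions for illustration, not code from the repo):

# main.py — hypothetical sketch, not part of this commit
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from utils import generate_stream

app = FastAPI()

class GenerateRequest(BaseModel):
    # assumed request schema; the real one may differ
    query: str

@app.post("/generate")
async def generate(req: GenerateRequest):
    # generate_stream is an async generator, so it can back StreamingResponse directly
    return StreamingResponse(generate_stream(req.query), media_type="text/plain")

With an entry point of this shape, CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"] serves the stream on the port exposed above.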
requirements.txt
CHANGED
@@ -32,7 +32,10 @@ python-multipart
 sqlalchemy
 psycopg2
 python-dotenv
-…
+torch
+transformers
+accelerate
+

 # Docker and deployment
 gunicorn # Optional: for production deployment
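torch, transformers, and accelerate are added unpinned, so the versions resolved at build time will drift. A small sanity-check sketch (a hypothetical helper, not part of this commit) that can be run inside the built image to confirm the stack matches the CUDA 11.7 base image:

# check_env.py — hypothetical sanity check, not part of this commit
import torch
import transformers
import accelerate

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))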
utils.py
CHANGED
@@ -1,34 +1,37 @@
 import asyncio
-import …
-from …
-import time
-def cosine_similarity(embedding_0, embedding_1):
-    pass
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

-…
-…
+# Load latest available LLaMA model (Change this if LLaMA 3 becomes available)
+MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
+
+# Detect device (Use GPU if available)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto"
+).to(device)
+
+# Text generation pipeline for efficient inference
+generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

 async def generate_stream(query: str):
-    """Stream responses …
-    … (old lines 13–24: Ollama call with retries; content not recoverable from the extraction)
-        yield chunk["message"]["content"]
-        await asyncio.sleep(0)
-    return
-    except Exception as e:
-        print(f"❌ Ollama connection failed (Attempt {attempt+1}/{max_retries}): {str(e)}")
-        if attempt < max_retries - 1:
-            time.sleep(delay)  # Wait before retrying
-        else:
-            yield "⚠️ Error: Could not connect to Ollama after multiple attempts."
+    """Stream responses using LLaMA."""
+
+    input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
+
+    # Generate text
+    output = generator(query, max_length=512, do_sample=True, temperature=0.7)
+
+    response_text = output[0]["generated_text"]
+
+    # Simulate streaming
+    for word in response_text.split():
+        yield word + " "
+        await asyncio.sleep(0.05)

+    yield "\n"
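Two caveats on the new generate_stream, both dependent on the installed transformers/accelerate versions: the text-generation pipeline returns the prompt plus the completion in generated_text by default (return_full_text=False yields only the completion), and generation runs to completion before the word-by-word loop starts, so the streaming is simulated. If token-by-token streaming is wanted, transformers ships TextIteratorStreamer; a sketch reusing the model and tokenizer globals above (generate_stream_tokens is a hypothetical name, not code from this commit):

# Alternative sketch: real token streaming via TextIteratorStreamer.
# Assumes the `model` and `tokenizer` globals defined in utils.py; not part of this commit.
import asyncio
from threading import Thread

from transformers import TextIteratorStreamer

async def generate_stream_tokens(query: str):
    """Yield text as the model produces it instead of after generation finishes."""
    inputs = tokenizer(query, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # model.generate blocks, so run it in a worker thread and consume the streamer here
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, streamer=streamer, max_new_tokens=512,
                    do_sample=True, temperature=0.7),
    )
    thread.start()

    for chunk in streamer:
        yield chunk
        await asyncio.sleep(0)  # hand control back to the event loop between chunks

    thread.join()

Separately, loading with device_map="auto" hands placement to accelerate, so the extra .to(device) call (and the device= argument to pipeline) may warn or error on some versions; the usual pattern is to drop both once device_map is used.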