Update app.py
app.py CHANGED
@@ -17,7 +17,7 @@ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_docling import DoclingLoader
 from langchain_docling.loader import ExportType
 from langchain_text_splitters import RecursiveCharacterTextSplitter
-from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, TextIteratorStreamer, BitsAndBytesConfig
 from transformers.models.llama.modeling_llama import rotate_half
 import threading
 import shutil
@@ -30,9 +30,12 @@ from utils import (
 
 # Initialize the model and tokenizer.
 api_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
-model_name = "meta-llama/Llama-3.1-8B-Instruct"
+# model_name = "meta-llama/Llama-3.1-8B-Instruct"
+model_name = "google/gemma-3-27b-it"
 tokenizer = AutoTokenizer.from_pretrained(model_name, token=api_token)
-model = AutoModelForCausalLM.from_pretrained(model_name, token=api_token, torch_dtype=torch.float16)
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+# model = AutoModelForCausalLM.from_pretrained(model_name, token=api_token, torch_dtype=torch.float16)
+model = AutoModelForCausalLM.from_pretrained(model_name, token=api_token, quantization_config=quantization_config, torch_dtype="auto")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = model.eval()
 model.to(device)
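
For context, below is a minimal standalone sketch of the 8-bit loading pattern this commit adopts, assuming a CUDA machine with the bitsandbytes package installed. The device_map="auto" argument is an assumption for the sketch, not part of the commit: transformers places 8-bit quantized weights on the GPU at load time, which is why the sketch omits the explicit model.to(device) call that app.py keeps.

import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

api_token = os.getenv("HUGGING_FACE_HUB_TOKEN")
model_name = "google/gemma-3-27b-it"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=api_token)

# load_in_8bit=True quantizes the model's linear layers to int8 at load
# time, roughly halving memory use versus float16 for a 27B-parameter model.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# device_map="auto" is an assumption for this sketch: it lets accelerate
# place the quantized weights on the available GPU(s), so no explicit
# model.to(device) call follows.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=api_token,
    quantization_config=quantization_config,
    device_map="auto",
)
model.eval()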
|