import sys
sys.path.append("../")
from transformers import AutoModel, AutoTokenizer
from transformers.utils import hub
from functools import partial
from bw_utils import get_child_folders
import torch
import os


class EmbeddingModel:
    """Callable wrapper around a Hugging Face tokenizer/model pair.

    Calling an instance tokenizes the input and returns, for every input
    sequence, the hidden state at token position 0 as a plain Python list.
    """

    def __init__(self, model_name, language='en'):
        """Load the tokenizer and model for ``model_name``.

        A snapshot found in the local Hugging Face cache directory is tried
        first; if none is found, or loading from it fails, the model is
        loaded by name via ``from_pretrained(model_name)``.

        Args:
            model_name: Hub repository id, e.g. ``"BAAI/bge-m3"``.
            language: Language tag; only stored on the instance.
        """
        self.model_name = model_name
        self.language = language

        snapshot_path = self._find_cached_snapshot(model_name)
        if snapshot_path is None:
            self._load(model_name)
        else:
            try:
                self._load(snapshot_path)
            except Exception as e:
                # Best-effort: a broken or partial local snapshot must not
                # prevent loading by repository id.
                print(e)
                self._load(model_name)

    @staticmethod
    def _find_cached_snapshot(model_name):
        """Return the path of the first cached snapshot folder, or None."""
        # The cache folder name is the repo id with "/" replaced by "--".
        # Using replace() instead of split("/")[1] also works for ids
        # without a namespace, which previously raised IndexError.
        repo_folder = "models--" + model_name.replace("/", "--")
        snapshots_dir = os.path.join(
            hub.default_cache_path, f"{repo_folder}/snapshots/"
        )
        if not os.path.exists(snapshots_dir):
            return None
        # NOTE(review): get_child_folders is assumed to return folder names
        # relative to snapshots_dir -- confirm against bw_utils.
        children = get_child_folders(snapshots_dir)
        if not children:
            return None
        return os.path.join(snapshots_dir, children[0])

    def _load(self, source):
        """Load tokenizer and model from a local path or a repository id."""
        self.tokenizer = AutoTokenizer.from_pretrained(source)
        self.model = AutoModel.from_pretrained(source)

    def __call__(self, input):
        """Embed ``input`` and return a list of embedding vectors.

        Args:
            input: Text (or batch of texts) accepted by the tokenizer.

        Returns:
            A nested list with one vector per input sequence, taken from
            ``last_hidden_state`` at token position 0.
        """
        # Inputs are padded to a common length and truncated to 256 tokens.
        inputs = self.tokenizer(
            input,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Position 0 is typically the [CLS] token for BERT-style tokenizers.
        embeddings = outputs.last_hidden_state[:, 0, :].tolist()
        return embeddings


class OpenAIEmbedding:
    """Callable wrapper around the OpenAI embeddings endpoint."""

    def __init__(self, model_name="text-embedding-ada-002"):
        """Create an OpenAI client and remember which model to request."""
        # Imported lazily so the module can be used without the openai
        # package when only local models are needed.
        from openai import OpenAI
        self.client = OpenAI()
        self.model_name = model_name

    def _embed_one(self, text):
        """Return the embedding of a single string (newlines become spaces)."""
        cleaned = text.replace("\n", " ")
        response = self.client.embeddings.create(
            input=[cleaned], model=self.model_name
        )
        return response.data[0].embedding

    def __call__(self, input):
        """Embed a string or a sequence of strings.

        Args:
            input: A single string, or a list/tuple of strings.

        Returns:
            One embedding for a string input; a list of embeddings (one
            request per element, in order) for a list/tuple input.

        Raises:
            TypeError: If ``input`` is neither a string nor a list/tuple.
        """
        if isinstance(input, str):
            return self._embed_one(input)
        if isinstance(input, (list, tuple)):
            return [self._embed_one(sentence) for sentence in input]
        # Previously this fell through and silently returned None.
        raise TypeError(
            "input must be a str or a list/tuple of str, "
            f"got {type(input).__name__}"
        )


def get_embedding_model(embed_name, language='en'):
    """Build the embedding callable registered under ``embed_name``.

    Args:
        embed_name: Short name of a local model ("bge-m3", "bge", "luotuo",
            "bert"). Any other value selects the default OpenAI embedding.
        language: Language suffix appended to the "bge" model id and stored
            on the returned ``EmbeddingModel``.

    Returns:
        An ``EmbeddingModel`` for known names, otherwise ``OpenAIEmbedding()``.
    """
    model_name_dict = {
        "bge-m3": "BAAI/bge-m3",
        "bge": "BAAI/bge-large-",
        "luotuo": "silk-road/luotuo-bert-medium",
        "bert": "google-bert/bert-base-multilingual-cased",
    }
    model_name = model_name_dict.get(embed_name)
    if model_name is None:
        return OpenAIEmbedding()
    if embed_name == 'bge':
        # The "bge" entry is a prefix; the language completes the repo id.
        model_name += language
    # Pass the language through so the instance records the real value
    # instead of always falling back to the 'en' default.
    return EmbeddingModel(model_name, language=language)