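"""Streamlit semantic movie search.

Encodes a free-text query with a multilingual MiniLM sentence transformer,
ranks movies by cosine similarity against precomputed embeddings of their
title/genre/director/actors/description text, then re-orders results so that
rows matching person entities (spaCy NER + pymorphy3 lemmas) or genre
keywords from the query come first.
"""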
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import pickle
import streamlit as st
import spacy
import pymorphy3
if torch.cuda.is_available():
    print("CUDA is available!")
    device = torch.device("cuda")
else:
    print("CUDA is unavailable. Computations will run on the CPU.")
    device = torch.device("cpu")
max_length = 512
@st.cache_resource
def load_models():
    # Morphological analyzer for Russian lemmatization
    morph = pymorphy3.MorphAnalyzer()
    # spaCy pipeline for Russian NER
    nlp = spacy.load("ru_core_news_lg")
    return nlp, morph
@st.cache_data
def get_df():
    df = pd.read_csv('/home/marena/Elbrus_phase_2/Semantic-Search/data/movie_data.csv')
    # Concatenate the searchable fields into a single text column
    df['all_text'] = df.apply(lambda row: f"{row['title']} {row['genre']} {row['director']} {row['actors']} {row['description']}", axis=1)
    return df
@st.cache_resource
def autobot():
    model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2').to(device)
    if device.type == 'cuda':
        # fp16 inference only helps on GPU
        model = model.half()
    return model
@st.cache_resource
def token():
    # truncation/max_length are per-call options, so only model_max_length is set here
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', model_max_length=max_length)
    return tokenizer
def mean_pooling(model_output, attention_mask):
    # Average token embeddings, ignoring padding positions
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
@st.cache_data
def get_sentence_embedding(text):
    tokenizer = token()
    model = autobot()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length, padding=True).to(device)
    # Autocast on whichever device is active, not just 'cuda'
    with torch.amp.autocast(device.type):
        with torch.no_grad():
            outputs = model(**inputs)
    embeddings = mean_pooling(outputs, inputs['attention_mask']).cpu().numpy()[0]  # [0] drops the batch dimension
    return embeddings
@st.cache_data
def get_embs():
    # Load precomputed embeddings for df['all_text']
    with open('/home/marena/Elbrus_phase_2/Semantic-Search/data/embeddings', 'rb') as file:
        embeddings = pickle.load(file)
    return embeddings
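# The pickle above is presumably produced offline with the same encoder; a
# minimal sketch (build_embeddings and the numpy import are hypothetical, not
# part of this app) could look like:
#
#     import numpy as np
#     def build_embeddings(df, path='/home/marena/Elbrus_phase_2/Semantic-Search/data/embeddings'):
#         embs = np.stack([get_sentence_embedding(t) for t in df['all_text']])
#         with open(path, 'wb') as f:
#             pickle.dump(embs, f)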
def search_movie(query, top_k=8, year=None):
    # Rank the catalogue by cosine similarity between query and corpus embeddings
    query_embedding = get_sentence_embedding(query)
    embeddings = get_embs()
    df = get_df()
    cos_scores = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(embeddings))
    df['similarity'] = cos_scores.tolist()
    res = df.sort_values(by='similarity', ascending=False)
    if year:
        res = res[res['year'] == year]
    return res.head(top_k)
def sort_by_entities(df: pd.DataFrame, text: str, morph: pymorphy3.analyzer.MorphAnalyzer, nlp):
    genres = {'аниме', 'биография', 'боевик', 'вестерн', 'военный', 'детектив',
              'детский', 'документальный', 'драма', 'исторический', 'комедия',
              'короткометражный', 'криминал', 'мелодрама', 'музыкальный',
              'мультфильмы', 'мюзикл', 'приключения', 'семейный', 'спорт',
              'триллер', 'ужасы', 'фантастика', 'фэнтези', 'эротика'}
    # Run NER over the query text
    doc = nlp(text)
    # Keep only person entities (actors, directors)
    entities = [entity.text for entity in doc.ents if entity.label_ == "PER"]
    persons = []
    for entity in entities:
        # Lemmatize each name token so inflected forms match the catalogue
        persons.append(" ".join([morph.parse(person)[0].normal_form for person in entity.split()]))
    conditions = []
    for person in persons:
        for word in person.split(" "):
            if len(word) > 3:
                # Drop the last letter to tolerate differing case endings
                word = word[:-1]
            conditions.append(df["actors"].str.contains(word, na=False, case=False))
            conditions.append(df["director"].str.contains(word, na=False, case=False))
    combined_condition = pd.Series([False] * len(df), index=df.index)
    for condition in conditions:
        combined_condition |= condition
    # Match genre keywords from the query against one-hot genre columns
    search_genre = []
    for genre in genres:
        if genre in text.lower():
            search_genre.append(df[genre] == 1)
    for condition in search_genre:
        combined_condition |= condition
    if len(search_genre) + len(persons) > 0:
        # Put matching rows first, keep the rest below them
        filtered = pd.concat([df[combined_condition], df[~combined_condition]])
    else:
        filtered = df
    return filtered
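# Minimal end-to-end sketch (assumes the CSV has one-hot genre columns, as
# df[genre] == 1 above implies; the query stays in Russian because the NER
# and morphology models are Russian):
if __name__ == "__main__":
    nlp, morph = load_models()
    query = "комедия с Джимом Керри"
    results = search_movie(query, top_k=8)
    results = sort_by_entities(results, query, morph, nlp)
    print(results[['title', 'similarity']])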