File size: 5,167 Bytes
fafbbd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import re
import pandas as pd
import numpy as np
from sentence_transformers import util
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
tqdm.pandas()
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import streamlit as st
import spacy
import pymorphy3

if torch.cuda.is_available():
    print("CUDA доступна!")
    device = torch.device("cuda")
else:
    print("CUDA недоступна. Вычисления будут выполняться на CPU.")
    device = torch.device("cpu")

max_length = 512
@st.cache_data
def load_models():
    # Создание объекта для морфологического анализа
    morph = pymorphy3.MorphAnalyzer()
    # Загрузка модели spaCy для русского языка
    nlp = spacy.load("ru_core_news_lg")
    return nlp, morph

def get_df():
    df = pd.read_csv('/home/marena/Elbrus_phase_2/Semantic-Search/data/movie_data.csv')
    df['all_text'] = df.apply(lambda row: f"{row['title']} {row['genre']} {row['director']} {row['actors']} {row['description']}", axis=1)
    df['all_text'][0]
    return df

@st.cache_data
def autobot():
    model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2').to(device).half()
    return model
@st.cache_data
def token():
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', truncation=True, max_length=max_length)
    return tokenizer

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

@st.cache_data
def get_sentence_embedding(text):
    tokenizer = token()
    model = autobot()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.amp.autocast('cuda'):
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = mean_pooling(outputs, inputs['attention_mask']).cpu().numpy()[0] # Удаление лишней размерности [0]
    return embeddings

@st.cache_data
def get_embs():
    with open('/home/marena/Elbrus_phase_2/Semantic-Search/data/embeddings', 'rb') as file:
        embeddings = pickle.load(file)
    return embeddings

def search_movie(query, top_k=8, year=None):
    query_embedding = get_sentence_embedding(query)
    embeddings = get_embs()
    df = get_df()
    cos_scores = torch.nn.functional.cosine_similarity(torch.tensor(query_embedding), torch.tensor(embeddings))
    df['similarity'] = cos_scores.tolist()
    res = df.sort_values(by='similarity', ascending=False)
    if year:
        res = res[res['year'] == year]
    return res.head(top_k)




def sort_by_entities(df: pd.DataFrame, text: str, morph: pymorphy3.analyzer.MorphAnalyzer, nlp):
    genres = {'аниме',
              'биография',
              'боевик',
              'вестерн',
              'военный',
              'детектив',
              'детский',
              'документальный',
              'драма',
              'исторический',
              'комедия',
              'короткометражный',
              'криминал',
              'мелодрама',
              'музыкальный',
              'мультфильмы',
              'мюзикл',
              'приключения',
              'семейный',
              'спорт',
              'триллер',
              'ужасы',
              'фантастика',
              'фэнтези',
              'эротика'}
    # Обработка текста
    doc = nlp(text)

    # Извлечение сущностей
    entities = [entity.text for entity in doc.ents if entity.label_ == "PER"]

    persons = []
    for entity in entities:
        persons.append(" ".join([morph.parse(person)[0].normal_form for person in entity.split()]))

    conditions = []
    for person in persons:
        for word in person.split(" "):
            if len(word) > 3:
                word = word[:-1]
            conditions.append(df["actors"].str.contains(word, na=False, case=False))
            conditions.append(df["director"].str.contains(word, na=False, case=False))

    combined_condition = pd.Series([False] * len(df), index=df.index)
    if len(conditions) > 1:
        combined_condition = conditions[0]
        for condition in conditions[1:]:
            combined_condition |= condition

    search_genre = []
    for genre in genres:
        if genre in text.lower():
            search_genre.append(df[genre] == 1)

    if len(search_genre) > 0:
        for condition in search_genre:
            combined_condition |= condition

    if len(search_genre) + len(persons) > 0:
        filtered = pd.concat([df[combined_condition], df[~combined_condition]])
    else:
        filtered = df
    return filtered