"""Similar-text search over a Russian-language corpus.

On import this script reads ``dataset/text_data.csv``, normalises the
``text`` column, builds a TF-IDF matrix and sentence embeddings for every
row, and prints a small self-retrieval score. ``find_similar`` then ranks
the corpus rows against an arbitrary query string.
"""
import re
import string

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

nltk.download("stopwords")

# Data locations
DATA_PATH = "dataset/"
TEXT_FILE = DATA_PATH + "text_data.csv"
RESULT_FILE = DATA_PATH + "similar_texts.csv"

# Multilingual sentence-embedding model used for the embedding-based search
embedding_model = SentenceTransformer(
    "sentence-transformers/distiluse-base-multilingual-cased-v1"
)

# Morphological analyser (lemmatisation) and the Russian stop-word list
morph = MorphAnalyzer()
stop_words = set(stopwords.words("russian"))

# Translation table that deletes every character in string.punctuation.
# A table is used instead of interpolating the characters into a regex
# character class: unescaped, "\]" is read as an escaped bracket and ",-."
# as a range, so the class only works by accident and never strips "\".
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
_DIGITS_RE = re.compile(r"\d+")


def preprocess_text(text: str) -> str:
    """Normalise *text* for vectorisation.

    Lower-cases the input, removes ASCII punctuation and digit runs, splits
    on whitespace, drops tokens found in ``stop_words`` and replaces each
    remaining token with the normal form of its first ``morph.parse``
    result.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces (possibly empty).
    """
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)  # strip punctuation
    text = _DIGITS_RE.sub("", text)  # strip digits
    lemmas = [
        morph.parse(word)[0].normal_form
        for word in text.split()
        if word not in stop_words
    ]
    return " ".join(lemmas)


# Load and clean the corpus
df = pd.read_csv(TEXT_FILE)
df.dropna(subset=["text"], inplace=True)  # drop rows without text
df.drop_duplicates(subset=["text"], inplace=True)  # drop repeated texts
df["clean_text"] = df["text"].apply(preprocess_text)

# Train/test split of the cleaned texts
train_texts, test_texts = train_test_split(
    df["clean_text"], test_size=0.2, random_state=42
)

# TF-IDF vectors for the whole corpus (row order matches ``df``)
vectorizer = TfidfVectorizer()
text_vectors = vectorizer.fit_transform(df["clean_text"])

# Sentence embeddings for the whole corpus (row order matches ``df``)
embeddings = embedding_model.encode(
    df["clean_text"].tolist(), convert_to_tensor=True
)
# Converted to NumPy once so that each query does not repeat the copy.
_corpus_embeddings = embeddings.cpu().numpy()


def find_similar(
    text: str,
    top_n: int = 5,
    save_to_csv: bool = False,
    use_embeddings: bool = True,
) -> pd.DataFrame:
    """Return the corpus rows most similar to *text*.

    The query is passed through ``preprocess_text`` and compared with every
    corpus row by cosine similarity, using either the sentence embeddings or
    the TF-IDF vectors.

    Args:
        text: Query string (raw; it is preprocessed here).
        top_n: Maximum number of rows to return. Values ``<= 0`` yield an
            empty frame; values larger than the corpus return every row.
        save_to_csv: When true, also write the result to ``RESULT_FILE``.
        use_embeddings: Use sentence embeddings when true, TF-IDF otherwise.

    Returns:
        A DataFrame with columns ``text``, ``clean_text`` and ``similarity``,
        ordered from most to least similar.
    """
    text = preprocess_text(text)

    if use_embeddings:
        query_vector = embedding_model.encode([text], convert_to_tensor=True)
        similarities = cosine_similarity(
            query_vector.cpu().numpy(), _corpus_embeddings
        ).flatten()
    else:
        query_vector = vectorizer.transform([text])
        similarities = cosine_similarity(query_vector, text_vectors).flatten()

    # Reverse first, then truncate: a plain ``[-top_n:]`` turns into
    # ``[-0:]`` (the whole corpus) when top_n is 0.
    top_indices = np.argsort(similarities)[::-1][: max(top_n, 0)]

    # Positional lookup: vectors were built in ``df`` row order.
    result_df = df.iloc[top_indices][["text", "clean_text"]].copy()
    result_df["similarity"] = similarities[top_indices]

    if save_to_csv:
        result_df.to_csv(RESULT_FILE, index=False)
        print(f"Results saved to {RESULT_FILE}")

    return result_df


# Model "accuracy" check.
# NOTE: the sampled texts are themselves rows of the indexed corpus (both the
# TF-IDF matrix and the embeddings are built from the whole frame), so this
# measures whether each text retrieves itself, not generalisation to unseen
# text. The samples are already cleaned and ``find_similar`` preprocesses
# them a second time.
sample_test_texts = test_texts[:10].tolist()
predictions = [
    find_similar(text, top_n=1, use_embeddings=True).iloc[0]["clean_text"]
    for text in sample_test_texts
]
accuracy = accuracy_score(sample_test_texts, predictions)
print(f"Model Accuracy: {accuracy:.4f}")

# Example query
if __name__ == "__main__":
    sample_text = "Мне нужен ноутбук с хорошей батареей и легким корпусом."
    similar_texts = find_similar(
        sample_text, save_to_csv=True, use_embeddings=True
    )
    print("Top similar texts:")
    print(similar_texts)