{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "38957f6a", "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 6, "id": "9ec92866", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\ukhal\\anaconda3\\envs\\datascience\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From c:\\Users\\ukhal\\anaconda3\\envs\\datascience\\Lib\\site-packages\\tf_keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n", "\n" ] } ], "source": [ "from sentence_transformers import SentenceTransformer\n", "\n", "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", "model.save('my_local_models/miniLM-v2')" ] }, { "cell_type": "code", "execution_count": null, "id": "a9fc3745", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Batches: 100%|██████████| 1720/1720 [05:45<00:00, 4.98it/s]\n" ] } ], "source": [ "vectors = model.encode(df['text'].tolist(), batch_size=32, show_progress_bar=True)\n", "\n", "# Add the vectors as a new column\n", "df['vector'] = list(vectors)" ] }, { "cell_type": "code", "execution_count": 8, "id": "616a89d5", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "country_encoder = LabelEncoder()\n", "df['country_id'] = country_encoder.fit_transform(df['country'])" ] }, { "cell_type": "code", "execution_count": null, "id": "1a5d9807", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1 — Loss: 559.1745\n", "Epoch 2 — Loss: 511.0904\n", "Epoch 3 — Loss: 487.1494\n", "Epoch 4 — Loss: 476.0557\n", "Epoch 5 — Loss: 463.6449\n", "Epoch 6 — Loss: 458.0139\n", "Epoch 7 — Loss: 454.9403\n", "Epoch 8 — Loss: 445.9739\n", "Epoch 9 — Loss: 443.4053\n", "Epoch 10 — Loss: 441.2702\n", "Epoch 11 — Loss: 435.5733\n", "Epoch 12 — Loss: 432.5762\n", "Epoch 13 — Loss: 428.4215\n", "Epoch 14 — Loss: 424.5392\n", "Epoch 15 — Loss: 427.4328\n", "Epoch 16 — Loss: 419.4463\n", "Epoch 17 — Loss: 420.8522\n", "Epoch 18 — Loss: 418.8724\n", "Epoch 19 — Loss: 410.7244\n", "Epoch 20 — Loss: 408.1810\n", "Epoch 21 — Loss: 404.8192\n", "Epoch 22 — Loss: 402.0590\n", "Epoch 23 — Loss: 400.0788\n", "Epoch 24 — Loss: 395.5753\n", "Epoch 25 — Loss: 391.3283\n", "Epoch 26 — Loss: 390.9558\n", "Epoch 27 — Loss: 386.5741\n", " precision recall f1-score support\n", "\n", " Not Yes 0.78 0.88 0.83 27643\n", " Yes 0.86 0.75 0.80 27377\n", "\n", " accuracy 0.81 55020\n", " macro avg 0.82 0.81 0.81 55020\n", "weighted avg 0.82 0.81 0.81 55020\n", "\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import numpy as np\n", "from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.metrics import classification_report\n", "\n", "# ----------------------------\n", "# Модель\n", "# ----------------------------\n", "\n", "class VotePredictor(nn.Module):\n", " def __init__(self, text_dim=384, country_count=193, country_emb_dim=32, hidden_dim=256):\n", " super(VotePredictor, self).__init__()\n", " self.country_embedding = nn.Embedding(country_count, country_emb_dim)\n", "\n", " self.model = nn.Sequential(\n", " nn.Linear(text_dim + country_emb_dim, hidden_dim),\n", " nn.ReLU(),\n", " nn.Dropout(0.3),\n", " nn.Linear(hidden_dim, 1)\n", " )\n", "\n", " def forward(self, text_vecs, country_ids):\n", " country_vecs = self.country_embedding(country_ids)\n", " x = torch.cat([text_vecs, country_vecs], dim=1)\n", " return self.model(x)\n", "\n", "# ----------------------------\n", "# Подготовка данных\n", "# ----------------------------\n", "\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "model = VotePredictor().to(device)\n", "criterion = nn.BCEWithLogitsLoss()\n", "optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)\n", "\n", "# Подготовка тензоров\n", "X_vectors = np.stack(df['vector'].values)\n", "y_labels = df['vote'].values\n", "country_ids = country_encoder.fit_transform(df['country'].values)\n", "\n", "X_tensor = torch.tensor(X_vectors, dtype=torch.float32)\n", "y_tensor = torch.tensor(y_labels, dtype=torch.float32)\n", "c_tensor = torch.tensor(country_ids, dtype=torch.long)\n", "\n", "# Тензорный датасет\n", "dataset = TensorDataset(X_tensor, c_tensor, y_tensor)\n", "\n", "# ----------------------------\n", "# Логика весов\n", "# ----------------------------\n", "\n", "# Веса\n", "class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])\n", "weights = 1. / class_sample_count\n", "sample_weights = weights[y_tensor.long().numpy()]\n", "\n", "sampler = WeightedRandomSampler(\n", " weights=sample_weights,\n", " num_samples=len(sample_weights),\n", " replacement=True\n", ")\n", "\n", "# Загружаем данные\n", "train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)\n", "\n", "# ----------------------------\n", "# Эпохи обучения\n", "# ----------------------------\n", "\n", "for epoch in range(27):\n", " model.train()\n", " total_loss = 0\n", "\n", " for batch_x, batch_c, batch_y in train_loader:\n", " batch_x, batch_c, batch_y = batch_x.to(device), batch_c.to(device), batch_y.to(device)\n", "\n", " optimizer.zero_grad()\n", " logits = model(batch_x, batch_c).squeeze()\n", " loss = criterion(logits, batch_y)\n", " loss.backward()\n", " optimizer.step()\n", "\n", " total_loss += loss.item()\n", "\n", " print(f\"Epoch {epoch+1} — Loss: {total_loss:.4f}\")\n", "\n", "# ----------------------------\n", "# Оценка\n", "# ----------------------------\n", "\n", "model.eval()\n", "all_preds, all_true, all_country_ids = [], [], []\n", "\n", "with torch.no_grad():\n", " for batch_x, batch_c, batch_y in train_loader: # or use test_loader if you split\n", " logits = model(batch_x.to(device), batch_c.to(device)).squeeze()\n", " probs = torch.sigmoid(logits).cpu().numpy()\n", " preds = (probs > 0.5445639).astype(int)\n", "\n", " all_preds.extend(preds)\n", " all_true.extend(batch_y.numpy())\n", " all_country_ids.extend(batch_c.numpy()) # <— Here's the missing link\n", "\n", "print(classification_report(all_true, all_preds, target_names=['Not Yes', 'Yes']))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "7ff81e59", "metadata": {}, "outputs": [], "source": [ "problem_countries = df_metrics[df_metrics['f1'] < 0.7]['country'].tolist()\n", "print(f\"{len(problem_countries)} countries with F1 < 0.7.\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "9d345404", "metadata": {}, "outputs": [], "source": [ "df_problem = df[df['country'].isin(problem_countries)].copy()" ] }, { "cell_type": "code", "execution_count": 16, "id": "dac22a07", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "problem_country_encoder = LabelEncoder()\n", "df_problem['country_id'] = problem_country_encoder.fit_transform(df_problem['country'])" ] }, { "cell_type": "code", "execution_count": null, "id": "ebf3b626", "metadata": {}, "outputs": [], "source": [ "X_problem = np.stack(df_problem['vector'].values)\n", "y_problem = df_problem['vote'].values\n", "c_problem = df_problem['country_id'].values\n", "\n", "X_tensor = torch.tensor(X_problem, dtype=torch.float32)\n", "y_tensor = torch.tensor(y_problem, dtype=torch.float32)\n", "c_tensor = torch.tensor(c_problem, dtype=torch.long)\n", "\n", "from torch.utils.data import TensorDataset, DataLoader\n", "\n", "dataset = TensorDataset(X_tensor, c_tensor, y_tensor)\n", "\n", "class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])\n", "weights = 1. / class_sample_count\n", "sample_weights = weights[y_tensor.long().numpy()]\n", "sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)\n", "\n", "train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)\n", "\n", "problem_model = VotePredictor(country_count=len(problem_country_encoder.classes_)).to(device)\n", "criterion = nn.BCEWithLogitsLoss()\n", "optimizer = torch.optim.Adam(problem_model.parameters(), lr=1e-4)" ] }, { "cell_type": "code", "execution_count": null, "id": "facb3c23", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1 — Loss: 176.5783\n", "Epoch 2 — Loss: 172.1360\n", "Epoch 3 — Loss: 169.1655\n", "Epoch 4 — Loss: 167.5052\n", "Epoch 5 — Loss: 167.0431\n", "Epoch 6 — Loss: 164.9137\n", "Epoch 7 — Loss: 165.0920\n", "Epoch 8 — Loss: 164.1620\n", "\n", "🧾 SPECIAL MODEL EVALUATION (Bad-F1 Countries Only):\n", "\n", " precision recall f1-score support\n", "\n", " Not Yes 0.64 0.64 0.64 8252\n", " Yes 0.64 0.64 0.64 8254\n", "\n", " accuracy 0.64 16506\n", " macro avg 0.64 0.64 0.64 16506\n", "weighted avg 0.64 0.64 0.64 16506\n", "\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.metrics import classification_report\n", "from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler\n", "\n", "# ----------------------\n", "# Модель\n", "# ----------------------\n", "\n", "class VotePredictor(nn.Module):\n", " def __init__(self, text_dim=384, country_count=50, country_emb_dim=32, hidden_dim=256):\n", " super(VotePredictor, self).__init__()\n", " self.country_embedding = nn.Embedding(country_count, country_emb_dim)\n", " self.model = nn.Sequential(\n", " nn.Linear(text_dim + country_emb_dim, hidden_dim),\n", " nn.ReLU(),\n", " nn.Dropout(0.3),\n", " nn.Linear(hidden_dim, 1)\n", " )\n", "\n", " def forward(self, text_vecs, country_ids):\n", " country_vecs = self.country_embedding(country_ids)\n", " x = torch.cat([text_vecs, country_vecs], dim=1)\n", " return self.model(x)\n", "\n", "# ----------------------\n", "# STEP 1: Фильтруем проблемные страны\n", "# ----------------------\n", "\n", "problem_countries = df_metrics[df_metrics['f1'] < 0.7]['country'].tolist()\n", "df_problem = df[df['country'].isin(problem_countries)].copy()\n", "\n", "# ----------------------\n", "# STEP 2: Энкодинг стран\n", "# ----------------------\n", "\n", "problem_country_encoder = LabelEncoder()\n", "df_problem['country_id'] = problem_country_encoder.fit_transform(df_problem['country'])\n", "\n", "X_problem = np.stack(df_problem['vector'].values)\n", "y_problem = df_problem['vote'].values\n", "c_problem = df_problem['country_id'].values\n", "\n", "# ----------------------\n", "# STEP 3: Подготовка тензоров\n", "# ----------------------\n", "\n", "X_tensor = torch.tensor(X_problem, dtype=torch.float32)\n", "y_tensor = torch.tensor(y_problem, dtype=torch.float32)\n", "c_tensor = torch.tensor(c_problem, dtype=torch.long)\n", "\n", "dataset = TensorDataset(X_tensor, c_tensor, y_tensor)\n", "\n", "# Веса\n", "class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])\n", "weights = 1. / class_sample_count\n", "sample_weights = weights[y_tensor.long().numpy()]\n", "sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)\n", "\n", "train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)\n", "\n", "# ----------------------\n", "# STEP 4: Тренировка модели\n", "# ----------------------\n", "\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "model = VotePredictor(country_count=len(problem_country_encoder.classes_)).to(device)\n", "criterion = nn.BCEWithLogitsLoss()\n", "optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)\n", "\n", "# Эпохи обучения\n", "for epoch in range(8):\n", " model.train()\n", " total_loss = 0\n", "\n", " for batch_x, batch_c, batch_y in train_loader:\n", " batch_x, batch_c, batch_y = batch_x.to(device), batch_c.to(device), batch_y.to(device)\n", "\n", " optimizer.zero_grad()\n", " logits = model(batch_x, batch_c).squeeze()\n", " loss = criterion(logits, batch_y)\n", " loss.backward()\n", " optimizer.step()\n", "\n", " total_loss += loss.item()\n", "\n", " print(f\"Epoch {epoch+1} — Loss: {total_loss:.4f}\")\n", "\n", "# ----------------------\n", "# STEP 5: Оценка\n", "# ----------------------\n", "\n", "model.eval()\n", "all_preds, all_true = [], []\n", "\n", "with torch.no_grad():\n", " for batch_x, batch_c, batch_y in train_loader:\n", " logits = model(batch_x.to(device), batch_c.to(device)).squeeze()\n", " probs = torch.sigmoid(logits).cpu().numpy()\n", " preds = (probs > 0.5).astype(int)\n", "\n", " all_preds.extend(preds)\n", " all_true.extend(batch_y.numpy())\n", "\n", "print(\"\\n🧾 SPECIAL MODEL EVALUATION (Bad-F1 Countries Only):\\n\")\n", "print(classification_report(all_true, all_preds, target_names=['Not Yes', 'Yes']))\n" ] }, { "cell_type": "code", "execution_count": 54, "id": "39995c95", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['SURINAME',\n", " 'TURKMENISTAN',\n", " 'MARSHALL ISLANDS',\n", " 'MYANMAR',\n", " 'GABON',\n", " 'CENTRAL AFRICAN REPUBLIC',\n", " 'ISRAEL',\n", " 'REPUBLIC OF THE CONGO',\n", " 'LIBERIA',\n", " 'SOMALIA',\n", " 'CANADA',\n", " \"LAO PEOPLE'S DEMOCRATIC REPUBLIC\",\n", " 'TUVALU',\n", " 'DEMOCRATIC REPUBLIC OF THE CONGO',\n", " 'MONTENEGRO',\n", " 'VANUATU',\n", " 'UNITED STATES',\n", " 'TÜRKİYE',\n", " 'SEYCHELLES',\n", " 'SERBIA',\n", " 'CABO VERDE',\n", " 'VENEZUELA (BOLIVARIAN REPUBLIC OF)',\n", " 'KIRIBATI',\n", " 'IRAN (ISLAMIC REPUBLIC OF)',\n", " 'SOUTH SUDAN',\n", " 'ALBANIA',\n", " 'CZECHIA',\n", " 'DOMINICA',\n", " 'SAO TOME AND PRINCIPE',\n", " 'ESWATINI',\n", " 'CHAD',\n", " 'EQUATORIAL GUINEA',\n", " 'GAMBIA',\n", " 'LIBYA',\n", " \"CÔTE D'IVOIRE\",\n", " 'SAINT CHRISTOPHER AND NEVIS',\n", " 'RWANDA',\n", " 'TONGA',\n", " 'NIGER',\n", " 'MICRONESIA (FEDERATED STATES OF)',\n", " 'SYRIAN ARAB REPUBLIC',\n", " 'NAURU',\n", " 'PALAU',\n", " 'NORTH MACEDONIA',\n", " 'NETHERLANDS',\n", " 'BOLIVIA (PLURINATIONAL STATE OF)']" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(set(problem_countries))" ] } ], "metadata": { "kernelspec": { "display_name": "datascience", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }