{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "38957f6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9ec92866",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\ukhal\\anaconda3\\envs\\datascience\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From c:\\Users\\ukhal\\anaconda3\\envs\\datascience\\Lib\\site-packages\\tf_keras\\src\\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "\n",
    "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
    "model.save('my_local_models/miniLM-v2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9fc3745",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Batches: 100%|██████████| 1720/1720 [05:45<00:00,  4.98it/s]\n"
     ]
    }
   ],
   "source": [
    "vectors = model.encode(df['text'].tolist(), batch_size=32, show_progress_bar=True)\n",
    "\n",
    "# Add the vectors as a new column\n",
    "df['vector'] = list(vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "616a89d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "country_encoder = LabelEncoder()\n",
    "df['country_id'] = country_encoder.fit_transform(df['country'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a5d9807",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1 — Loss: 559.1745\n",
      "Epoch 2 — Loss: 511.0904\n",
      "Epoch 3 — Loss: 487.1494\n",
      "Epoch 4 — Loss: 476.0557\n",
      "Epoch 5 — Loss: 463.6449\n",
      "Epoch 6 — Loss: 458.0139\n",
      "Epoch 7 — Loss: 454.9403\n",
      "Epoch 8 — Loss: 445.9739\n",
      "Epoch 9 — Loss: 443.4053\n",
      "Epoch 10 — Loss: 441.2702\n",
      "Epoch 11 — Loss: 435.5733\n",
      "Epoch 12 — Loss: 432.5762\n",
      "Epoch 13 — Loss: 428.4215\n",
      "Epoch 14 — Loss: 424.5392\n",
      "Epoch 15 — Loss: 427.4328\n",
      "Epoch 16 — Loss: 419.4463\n",
      "Epoch 17 — Loss: 420.8522\n",
      "Epoch 18 — Loss: 418.8724\n",
      "Epoch 19 — Loss: 410.7244\n",
      "Epoch 20 — Loss: 408.1810\n",
      "Epoch 21 — Loss: 404.8192\n",
      "Epoch 22 — Loss: 402.0590\n",
      "Epoch 23 — Loss: 400.0788\n",
      "Epoch 24 — Loss: 395.5753\n",
      "Epoch 25 — Loss: 391.3283\n",
      "Epoch 26 — Loss: 390.9558\n",
      "Epoch 27 — Loss: 386.5741\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "     Not Yes       0.78      0.88      0.83     27643\n",
      "         Yes       0.86      0.75      0.80     27377\n",
      "\n",
      "    accuracy                           0.81     55020\n",
      "   macro avg       0.82      0.81      0.81     55020\n",
      "weighted avg       0.82      0.81      0.81     55020\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import numpy as np\n",
    "from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "# ----------------------------\n",
    "# Модель\n",
    "# ----------------------------\n",
    "\n",
    "class VotePredictor(nn.Module):\n",
    "    def __init__(self, text_dim=384, country_count=193, country_emb_dim=32, hidden_dim=256):\n",
    "        super(VotePredictor, self).__init__()\n",
    "        self.country_embedding = nn.Embedding(country_count, country_emb_dim)\n",
    "\n",
    "        self.model = nn.Sequential(\n",
    "            nn.Linear(text_dim + country_emb_dim, hidden_dim),\n",
    "            nn.ReLU(),\n",
    "            nn.Dropout(0.3),\n",
    "            nn.Linear(hidden_dim, 1)\n",
    "        )\n",
    "\n",
    "    def forward(self, text_vecs, country_ids):\n",
    "        country_vecs = self.country_embedding(country_ids)\n",
    "        x = torch.cat([text_vecs, country_vecs], dim=1)\n",
    "        return self.model(x)\n",
    "\n",
    "# ----------------------------\n",
    "# Подготовка данных\n",
    "# ----------------------------\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "model = VotePredictor().to(device)\n",
    "criterion = nn.BCEWithLogitsLoss()\n",
    "optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)\n",
    "\n",
    "# Подготовка тензоров\n",
    "X_vectors = np.stack(df['vector'].values)\n",
    "y_labels = df['vote'].values\n",
    "country_ids = country_encoder.fit_transform(df['country'].values)\n",
    "\n",
    "X_tensor = torch.tensor(X_vectors, dtype=torch.float32)\n",
    "y_tensor = torch.tensor(y_labels, dtype=torch.float32)\n",
    "c_tensor = torch.tensor(country_ids, dtype=torch.long)\n",
    "\n",
    "# Тензорный датасет\n",
    "dataset = TensorDataset(X_tensor, c_tensor, y_tensor)\n",
    "\n",
    "# ----------------------------\n",
    "# Логика весов\n",
    "# ----------------------------\n",
    "\n",
    "# Веса\n",
    "class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])\n",
    "weights = 1. / class_sample_count\n",
    "sample_weights = weights[y_tensor.long().numpy()]\n",
    "\n",
    "sampler = WeightedRandomSampler(\n",
    "    weights=sample_weights,\n",
    "    num_samples=len(sample_weights),\n",
    "    replacement=True\n",
    ")\n",
    "\n",
    "# Загружаем данные\n",
    "train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)\n",
    "\n",
    "# ----------------------------\n",
    "# Эпохи обучения\n",
    "# ----------------------------\n",
    "\n",
    "for epoch in range(27):\n",
    "    model.train()\n",
    "    total_loss = 0\n",
    "\n",
    "    for batch_x, batch_c, batch_y in train_loader:\n",
    "        batch_x, batch_c, batch_y = batch_x.to(device), batch_c.to(device), batch_y.to(device)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        logits = model(batch_x, batch_c).squeeze()\n",
    "        loss = criterion(logits, batch_y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        total_loss += loss.item()\n",
    "\n",
    "    print(f\"Epoch {epoch+1} — Loss: {total_loss:.4f}\")\n",
    "\n",
    "# ----------------------------\n",
    "# Оценка\n",
    "# ----------------------------\n",
    "\n",
    "model.eval()\n",
    "all_preds, all_true, all_country_ids = [], [], []\n",
    "\n",
    "with torch.no_grad():\n",
    "    for batch_x, batch_c, batch_y in train_loader:  # or use test_loader if you split\n",
    "        logits = model(batch_x.to(device), batch_c.to(device)).squeeze()\n",
    "        probs = torch.sigmoid(logits).cpu().numpy()\n",
    "        preds = (probs > 0.5445639).astype(int)\n",
    "\n",
    "        all_preds.extend(preds)\n",
    "        all_true.extend(batch_y.numpy())\n",
    "        all_country_ids.extend(batch_c.numpy())  # <— Here's the missing link\n",
    "\n",
    "print(classification_report(all_true, all_preds, target_names=['Not Yes', 'Yes']))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ff81e59",
   "metadata": {},
   "outputs": [],
   "source": [
    "problem_countries = df_metrics[df_metrics['f1'] < 0.7]['country'].tolist()\n",
    "print(f\"{len(problem_countries)} countries with F1 < 0.7.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9d345404",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_problem = df[df['country'].isin(problem_countries)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "dac22a07",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "problem_country_encoder = LabelEncoder()\n",
    "df_problem['country_id'] = problem_country_encoder.fit_transform(df_problem['country'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebf3b626",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_problem = np.stack(df_problem['vector'].values)\n",
    "y_problem = df_problem['vote'].values\n",
    "c_problem = df_problem['country_id'].values\n",
    "\n",
    "X_tensor = torch.tensor(X_problem, dtype=torch.float32)\n",
    "y_tensor = torch.tensor(y_problem, dtype=torch.float32)\n",
    "c_tensor = torch.tensor(c_problem, dtype=torch.long)\n",
    "\n",
    "from torch.utils.data import TensorDataset, DataLoader\n",
    "\n",
    "dataset = TensorDataset(X_tensor, c_tensor, y_tensor)\n",
    "\n",
    "class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])\n",
    "weights = 1. / class_sample_count\n",
    "sample_weights = weights[y_tensor.long().numpy()]\n",
    "sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)\n",
    "\n",
    "train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)\n",
    "\n",
    "problem_model = VotePredictor(country_count=len(problem_country_encoder.classes_)).to(device)\n",
    "criterion = nn.BCEWithLogitsLoss()\n",
    "optimizer = torch.optim.Adam(problem_model.parameters(), lr=1e-4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "facb3c23",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1 — Loss: 176.5783\n",
      "Epoch 2 — Loss: 172.1360\n",
      "Epoch 3 — Loss: 169.1655\n",
      "Epoch 4 — Loss: 167.5052\n",
      "Epoch 5 — Loss: 167.0431\n",
      "Epoch 6 — Loss: 164.9137\n",
      "Epoch 7 — Loss: 165.0920\n",
      "Epoch 8 — Loss: 164.1620\n",
      "\n",
      "🧾 SPECIAL MODEL EVALUATION (Bad-F1 Countries Only):\n",
      "\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "     Not Yes       0.64      0.64      0.64      8252\n",
      "         Yes       0.64      0.64      0.64      8254\n",
      "\n",
      "    accuracy                           0.64     16506\n",
      "   macro avg       0.64      0.64      0.64     16506\n",
      "weighted avg       0.64      0.64      0.64     16506\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.metrics import classification_report\n",
    "from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler\n",
    "\n",
    "# ----------------------\n",
    "# Модель\n",
    "# ----------------------\n",
    "\n",
    "class VotePredictor(nn.Module):\n",
    "    def __init__(self, text_dim=384, country_count=50, country_emb_dim=32, hidden_dim=256):\n",
    "        super(VotePredictor, self).__init__()\n",
    "        self.country_embedding = nn.Embedding(country_count, country_emb_dim)\n",
    "        self.model = nn.Sequential(\n",
    "            nn.Linear(text_dim + country_emb_dim, hidden_dim),\n",
    "            nn.ReLU(),\n",
    "            nn.Dropout(0.3),\n",
    "            nn.Linear(hidden_dim, 1)\n",
    "        )\n",
    "\n",
    "    def forward(self, text_vecs, country_ids):\n",
    "        country_vecs = self.country_embedding(country_ids)\n",
    "        x = torch.cat([text_vecs, country_vecs], dim=1)\n",
    "        return self.model(x)\n",
    "\n",
    "# ----------------------\n",
    "# STEP 1: Фильтруем проблемные страны\n",
    "# ----------------------\n",
    "\n",
    "problem_countries = df_metrics[df_metrics['f1'] < 0.7]['country'].tolist()\n",
    "df_problem = df[df['country'].isin(problem_countries)].copy()\n",
    "\n",
    "# ----------------------\n",
    "# STEP 2: Энкодинг стран\n",
    "# ----------------------\n",
    "\n",
    "problem_country_encoder = LabelEncoder()\n",
    "df_problem['country_id'] = problem_country_encoder.fit_transform(df_problem['country'])\n",
    "\n",
    "X_problem = np.stack(df_problem['vector'].values)\n",
    "y_problem = df_problem['vote'].values\n",
    "c_problem = df_problem['country_id'].values\n",
    "\n",
    "# ----------------------\n",
    "# STEP 3: Подготовка тензоров\n",
    "# ----------------------\n",
    "\n",
    "X_tensor = torch.tensor(X_problem, dtype=torch.float32)\n",
    "y_tensor = torch.tensor(y_problem, dtype=torch.float32)\n",
    "c_tensor = torch.tensor(c_problem, dtype=torch.long)\n",
    "\n",
    "dataset = TensorDataset(X_tensor, c_tensor, y_tensor)\n",
    "\n",
    "# Веса\n",
    "class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])\n",
    "weights = 1. / class_sample_count\n",
    "sample_weights = weights[y_tensor.long().numpy()]\n",
    "sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)\n",
    "\n",
    "train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)\n",
    "\n",
    "# ----------------------\n",
    "# STEP 4: Тренировка модели\n",
    "# ----------------------\n",
    "\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "model = VotePredictor(country_count=len(problem_country_encoder.classes_)).to(device)\n",
    "criterion = nn.BCEWithLogitsLoss()\n",
    "optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)\n",
    "\n",
    "# Эпохи обучения\n",
    "for epoch in range(8):\n",
    "    model.train()\n",
    "    total_loss = 0\n",
    "\n",
    "    for batch_x, batch_c, batch_y in train_loader:\n",
    "        batch_x, batch_c, batch_y = batch_x.to(device), batch_c.to(device), batch_y.to(device)\n",
    "\n",
    "        optimizer.zero_grad()\n",
    "        logits = model(batch_x, batch_c).squeeze()\n",
    "        loss = criterion(logits, batch_y)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        total_loss += loss.item()\n",
    "\n",
    "    print(f\"Epoch {epoch+1} — Loss: {total_loss:.4f}\")\n",
    "\n",
    "# ----------------------\n",
    "# STEP 5: Оценка\n",
    "# ----------------------\n",
    "\n",
    "model.eval()\n",
    "all_preds, all_true = [], []\n",
    "\n",
    "with torch.no_grad():\n",
    "    for batch_x, batch_c, batch_y in train_loader:\n",
    "        logits = model(batch_x.to(device), batch_c.to(device)).squeeze()\n",
    "        probs = torch.sigmoid(logits).cpu().numpy()\n",
    "        preds = (probs > 0.5).astype(int)\n",
    "\n",
    "        all_preds.extend(preds)\n",
    "        all_true.extend(batch_y.numpy())\n",
    "\n",
    "print(\"\\n🧾 SPECIAL MODEL EVALUATION (Bad-F1 Countries Only):\\n\")\n",
    "print(classification_report(all_true, all_preds, target_names=['Not Yes', 'Yes']))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "39995c95",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['SURINAME',\n",
       " 'TURKMENISTAN',\n",
       " 'MARSHALL ISLANDS',\n",
       " 'MYANMAR',\n",
       " 'GABON',\n",
       " 'CENTRAL AFRICAN REPUBLIC',\n",
       " 'ISRAEL',\n",
       " 'REPUBLIC OF THE CONGO',\n",
       " 'LIBERIA',\n",
       " 'SOMALIA',\n",
       " 'CANADA',\n",
       " \"LAO PEOPLE'S DEMOCRATIC REPUBLIC\",\n",
       " 'TUVALU',\n",
       " 'DEMOCRATIC REPUBLIC OF THE CONGO',\n",
       " 'MONTENEGRO',\n",
       " 'VANUATU',\n",
       " 'UNITED STATES',\n",
       " 'TÜRKİYE',\n",
       " 'SEYCHELLES',\n",
       " 'SERBIA',\n",
       " 'CABO VERDE',\n",
       " 'VENEZUELA (BOLIVARIAN REPUBLIC OF)',\n",
       " 'KIRIBATI',\n",
       " 'IRAN (ISLAMIC REPUBLIC OF)',\n",
       " 'SOUTH SUDAN',\n",
       " 'ALBANIA',\n",
       " 'CZECHIA',\n",
       " 'DOMINICA',\n",
       " 'SAO TOME AND PRINCIPE',\n",
       " 'ESWATINI',\n",
       " 'CHAD',\n",
       " 'EQUATORIAL GUINEA',\n",
       " 'GAMBIA',\n",
       " 'LIBYA',\n",
       " \"CÔTE D'IVOIRE\",\n",
       " 'SAINT CHRISTOPHER AND NEVIS',\n",
       " 'RWANDA',\n",
       " 'TONGA',\n",
       " 'NIGER',\n",
       " 'MICRONESIA (FEDERATED STATES OF)',\n",
       " 'SYRIAN ARAB REPUBLIC',\n",
       " 'NAURU',\n",
       " 'PALAU',\n",
       " 'NORTH MACEDONIA',\n",
       " 'NETHERLANDS',\n",
       " 'BOLIVIA (PLURINATIONAL STATE OF)']"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(set(problem_countries))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "datascience",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}