In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model.save('my_local_models/miniLM-v2')

 from .autonotebook import tqdm as notebook_tqdm





In [None]:
vectors = model.encode(df['text'].tolist(), batch_size=32, show_progress_bar=True)

# Add the vectors as a new column
df['vector'] = list(vectors)

Batches: 100%|██████████| 1720/1720 [05:45<00:00, 4.98it/s]


In [8]:
from sklearn.preprocessing import LabelEncoder

country_encoder = LabelEncoder()
df['country_id'] = country_encoder.fit_transform(df['country'])

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# ----------------------------
# Модель
# ----------------------------

class VotePredictor(nn.Module):
 def __init__(self, text_dim=384, country_count=193, country_emb_dim=32, hidden_dim=256):
 super(VotePredictor, self).__init__()
 self.country_embedding = nn.Embedding(country_count, country_emb_dim)

 self.model = nn.Sequential(
 nn.Linear(text_dim + country_emb_dim, hidden_dim),
 nn.ReLU(),
 nn.Dropout(0.3),
 nn.Linear(hidden_dim, 1)
 )

 def forward(self, text_vecs, country_ids):
 country_vecs = self.country_embedding(country_ids)
 x = torch.cat([text_vecs, country_vecs], dim=1)
 return self.model(x)

# ----------------------------
# Подготовка данных
# ----------------------------

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VotePredictor().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Подготовка тензоров
X_vectors = np.stack(df['vector'].values)
y_labels = df['vote'].values
country_ids = country_encoder.fit_transform(df['country'].values)

X_tensor = torch.tensor(X_vectors, dtype=torch.float32)
y_tensor = torch.tensor(y_labels, dtype=torch.float32)
c_tensor = torch.tensor(country_ids, dtype=torch.long)

# Тензорный датасет
dataset = TensorDataset(X_tensor, c_tensor, y_tensor)

# ----------------------------
# Логика весов
# ----------------------------

# Веса
class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])
weights = 1. / class_sample_count
sample_weights = weights[y_tensor.long().numpy()]

sampler = WeightedRandomSampler(
 weights=sample_weights,
 num_samples=len(sample_weights),
 replacement=True
)

# Загружаем данные
train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)

# ----------------------------
# Эпохи обучения
# ----------------------------

for epoch in range(27):
 model.train()
 total_loss = 0

 for batch_x, batch_c, batch_y in train_loader:
 batch_x, batch_c, batch_y = batch_x.to(device), batch_c.to(device), batch_y.to(device)

 optimizer.zero_grad()
 logits = model(batch_x, batch_c).squeeze()
 loss = criterion(logits, batch_y)
 loss.backward()
 optimizer.step()

 total_loss += loss.item()

 print(f"Epoch {epoch+1} — Loss: {total_loss:.4f}")

# ----------------------------
# Оценка
# ----------------------------

model.eval()
all_preds, all_true, all_country_ids = [], [], []

with torch.no_grad():
 for batch_x, batch_c, batch_y in train_loader: # or use test_loader if you split
 logits = model(batch_x.to(device), batch_c.to(device)).squeeze()
 probs = torch.sigmoid(logits).cpu().numpy()
 preds = (probs > 0.5445639).astype(int)

 all_preds.extend(preds)
 all_true.extend(batch_y.numpy())
 all_country_ids.extend(batch_c.numpy()) # <— Here's the missing link

print(classification_report(all_true, all_preds, target_names=['Not Yes', 'Yes']))


Epoch 1 — Loss: 559.1745
Epoch 2 — Loss: 511.0904
Epoch 3 — Loss: 487.1494
Epoch 4 — Loss: 476.0557
Epoch 5 — Loss: 463.6449
Epoch 6 — Loss: 458.0139
Epoch 7 — Loss: 454.9403
Epoch 8 — Loss: 445.9739
Epoch 9 — Loss: 443.4053
Epoch 10 — Loss: 441.2702
Epoch 11 — Loss: 435.5733
Epoch 12 — Loss: 432.5762
Epoch 13 — Loss: 428.4215
Epoch 14 — Loss: 424.5392
Epoch 15 — Loss: 427.4328
Epoch 16 — Loss: 419.4463
Epoch 17 — Loss: 420.8522
Epoch 18 — Loss: 418.8724
Epoch 19 — Loss: 410.7244
Epoch 20 — Loss: 408.1810
Epoch 21 — Loss: 404.8192
Epoch 22 — Loss: 402.0590
Epoch 23 — Loss: 400.0788
Epoch 24 — Loss: 395.5753
Epoch 25 — Loss: 391.3283
Epoch 26 — Loss: 390.9558
Epoch 27 — Loss: 386.5741
 precision recall f1-score support

 Not Yes 0.78 0.88 0.83 27643
 Yes 0.86 0.75 0.80 27377

 accuracy 0.81 55020
 macro avg 0.82 0.81 0.81 55020
weighted avg 0.82 0.81 0.81 55020



In [None]:
problem_countries = df_metrics[df_metrics['f1'] < 0.7]['country'].tolist()
print(f"{len(problem_countries)} countries with F1 < 0.7.")

In [15]:
df_problem = df[df['country'].isin(problem_countries)].copy()

In [16]:
from sklearn.preprocessing import LabelEncoder

problem_country_encoder = LabelEncoder()
df_problem['country_id'] = problem_country_encoder.fit_transform(df_problem['country'])

In [None]:
X_problem = np.stack(df_problem['vector'].values)
y_problem = df_problem['vote'].values
c_problem = df_problem['country_id'].values

X_tensor = torch.tensor(X_problem, dtype=torch.float32)
y_tensor = torch.tensor(y_problem, dtype=torch.float32)
c_tensor = torch.tensor(c_problem, dtype=torch.long)

from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X_tensor, c_tensor, y_tensor)

class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])
weights = 1. / class_sample_count
sample_weights = weights[y_tensor.long().numpy()]
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)

problem_model = VotePredictor(country_count=len(problem_country_encoder.classes_)).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(problem_model.parameters(), lr=1e-4)

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler

# ----------------------
# Модель
# ----------------------

class VotePredictor(nn.Module):
 def __init__(self, text_dim=384, country_count=50, country_emb_dim=32, hidden_dim=256):
 super(VotePredictor, self).__init__()
 self.country_embedding = nn.Embedding(country_count, country_emb_dim)
 self.model = nn.Sequential(
 nn.Linear(text_dim + country_emb_dim, hidden_dim),
 nn.ReLU(),
 nn.Dropout(0.3),
 nn.Linear(hidden_dim, 1)
 )

 def forward(self, text_vecs, country_ids):
 country_vecs = self.country_embedding(country_ids)
 x = torch.cat([text_vecs, country_vecs], dim=1)
 return self.model(x)

# ----------------------
# STEP 1: Фильтруем проблемные страны
# ----------------------

problem_countries = df_metrics[df_metrics['f1'] < 0.7]['country'].tolist()
df_problem = df[df['country'].isin(problem_countries)].copy()

# ----------------------
# STEP 2: Энкодинг стран
# ----------------------

problem_country_encoder = LabelEncoder()
df_problem['country_id'] = problem_country_encoder.fit_transform(df_problem['country'])

X_problem = np.stack(df_problem['vector'].values)
y_problem = df_problem['vote'].values
c_problem = df_problem['country_id'].values

# ----------------------
# STEP 3: Подготовка тензоров
# ----------------------

X_tensor = torch.tensor(X_problem, dtype=torch.float32)
y_tensor = torch.tensor(y_problem, dtype=torch.float32)
c_tensor = torch.tensor(c_problem, dtype=torch.long)

dataset = TensorDataset(X_tensor, c_tensor, y_tensor)

# Веса
class_sample_count = np.array([(y_tensor == 0).sum(), (y_tensor == 1).sum()])
weights = 1. / class_sample_count
sample_weights = weights[y_tensor.long().numpy()]
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

train_loader = DataLoader(dataset, batch_size=64, sampler=sampler)

# ----------------------
# STEP 4: Тренировка модели
# ----------------------

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VotePredictor(country_count=len(problem_country_encoder.classes_)).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Эпохи обучения
for epoch in range(8):
 model.train()
 total_loss = 0

 for batch_x, batch_c, batch_y in train_loader:
 batch_x, batch_c, batch_y = batch_x.to(device), batch_c.to(device), batch_y.to(device)

 optimizer.zero_grad()
 logits = model(batch_x, batch_c).squeeze()
 loss = criterion(logits, batch_y)
 loss.backward()
 optimizer.step()

 total_loss += loss.item()

 print(f"Epoch {epoch+1} — Loss: {total_loss:.4f}")

# ----------------------
# STEP 5: Оценка
# ----------------------

model.eval()
all_preds, all_true = [], []

with torch.no_grad():
 for batch_x, batch_c, batch_y in train_loader:
 logits = model(batch_x.to(device), batch_c.to(device)).squeeze()
 probs = torch.sigmoid(logits).cpu().numpy()
 preds = (probs > 0.5).astype(int)

 all_preds.extend(preds)
 all_true.extend(batch_y.numpy())

print("\n🧾 SPECIAL MODEL EVALUATION (Bad-F1 Countries Only):\n")
print(classification_report(all_true, all_preds, target_names=['Not Yes', 'Yes']))


Epoch 1 — Loss: 176.5783
Epoch 2 — Loss: 172.1360
Epoch 3 — Loss: 169.1655
Epoch 4 — Loss: 167.5052
Epoch 5 — Loss: 167.0431
Epoch 6 — Loss: 164.9137
Epoch 7 — Loss: 165.0920
Epoch 8 — Loss: 164.1620

🧾 SPECIAL MODEL EVALUATION (Bad-F1 Countries Only):

 precision recall f1-score support

 Not Yes 0.64 0.64 0.64 8252
 Yes 0.64 0.64 0.64 8254

 accuracy 0.64 16506
 macro avg 0.64 0.64 0.64 16506
weighted avg 0.64 0.64 0.64 16506



In [54]:
list(set(problem_countries))

['SURINAME',
 'TURKMENISTAN',
 'MARSHALL ISLANDS',
 'MYANMAR',
 'GABON',
 'CENTRAL AFRICAN REPUBLIC',
 'ISRAEL',
 'REPUBLIC OF THE CONGO',
 'LIBERIA',
 'SOMALIA',
 'CANADA',
 "LAO PEOPLE'S DEMOCRATIC REPUBLIC",
 'TUVALU',
 'DEMOCRATIC REPUBLIC OF THE CONGO',
 'MONTENEGRO',
 'VANUATU',
 'UNITED STATES',
 'TÜRKİYE',
 'SEYCHELLES',
 'SERBIA',
 'CABO VERDE',
 'VENEZUELA (BOLIVARIAN REPUBLIC OF)',
 'KIRIBATI',
 'IRAN (ISLAMIC REPUBLIC OF)',
 'SOUTH SUDAN',
 'ALBANIA',
 'CZECHIA',
 'DOMINICA',
 'SAO TOME AND PRINCIPE',
 'ESWATINI',
 'CHAD',
 'EQUATORIAL GUINEA',
 'GAMBIA',
 'LIBYA',
 "CÔTE D'IVOIRE",
 'SAINT CHRISTOPHER AND NEVIS',
 'RWANDA',
 'TONGA',
 'NIGER',
 'MICRONESIA (FEDERATED STATES OF)',
 'SYRIAN ARAB REPUBLIC',
 'NAURU',
 'PALAU',
 'NORTH MACEDONIA',
 'NETHERLANDS',
 'BOLIVIA (PLURINATIONAL STATE OF)']