|
import pandas as pd |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.metrics import accuracy_score |
|
import re |
|
|
|
|
|
# Load the labelled training rows and the rows to predict for.
train_df = pd.read_csv("./input/training.csv")
test_df = pd.read_csv("./input/test.csv")

# Hold out 10% of the training rows for validation; the fixed seed keeps
# the split reproducible between runs.
train, val = train_test_split(train_df, test_size=0.1, random_state=42)

# train_test_split hands back row selections of train_df. Take an explicit
# copy so that adding columns to `val` below neither raises
# SettingWithCopyWarning nor depends on view-vs-copy semantics.
val = val.copy()

# The validation "ciphertext" is just the plain text, copied verbatim.
val["ciphertext"] = val["text"]
|
|
|
|
|
|
|
def frequency_analysis(text):
    """Return ``text`` reduced to its ASCII letters, converted to upper case.

    Every character outside ``A-Z`` / ``a-z`` (digits, whitespace,
    punctuation, non-ASCII letters) is dropped. No letter counts are
    computed here; the result is the cleaned string itself.
    """
    non_letter = re.compile("[^A-Za-z]")
    letters_only = non_letter.sub("", text)
    return letters_only.upper()
|
|
|
|
|
|
|
def decrypt_substitution_cipher(ciphertext, frequency_map):
    """Return ``ciphertext`` unchanged.

    ``frequency_map`` is accepted but never consulted: no substitution is
    applied, so the "decrypted" value is the input itself.
    """
    plaintext = ciphertext
    return plaintext
|
|
|
|
|
|
|
# Concatenate every validation text into one string and run it through
# frequency_analysis; the result is handed to the decrypt step below.
validation_corpus = "".join(val["text"])
frequency_map = frequency_analysis(validation_corpus)

# Decrypt each validation ciphertext; frequency_map is forwarded as the
# second positional argument of decrypt_substitution_cipher.
val["predicted_text"] = val["ciphertext"].apply(
    decrypt_substitution_cipher, args=(frequency_map,)
)
|
|
|
|
|
|
|
def find_index(predicted_text, train_df):
    """Return the ``index`` value of the first row whose ``text`` matches.

    Args:
        predicted_text: The text to look for in ``train_df["text"]``.
        train_df: DataFrame with at least a ``text`` and an ``index`` column.

    Returns:
        The ``index`` column value of the first row (in frame order) whose
        ``text`` equals ``predicted_text``, or ``None`` when no row matches.
    """
    # One vectorised comparison instead of materialising a Series per row
    # with iterrows(); row order is preserved, so iloc[0] is still the
    # first match.
    matches = train_df.loc[train_df["text"] == predicted_text, "index"]
    if matches.empty:
        return None
    return matches.iloc[0]
|
|
|
|
|
# Map every predicted validation text to an index value via find_index;
# train_df is forwarded as find_index's second positional argument.
val["predicted_index"] = val["predicted_text"].apply(find_index, args=(train_df,))

# Compare the looked-up indices against the true ones and report the score.
true_index = val["index"]
predicted_index = val["predicted_index"]
accuracy = accuracy_score(true_index, predicted_index)
print(f"Validation Accuracy: {accuracy}")
|
|
|
|
|
# Decrypt the test ciphertexts, then map each decrypted text to an index
# value with find_index -- the same two steps used for the validation rows.
# Previously the decrypted text itself was stored in "predicted_index" and
# the index lookup was skipped entirely.
decrypted_test_text = test_df["ciphertext"].apply(
    decrypt_substitution_cipher, args=(frequency_map,)
)
test_df["predicted_index"] = decrypted_test_text.apply(find_index, args=(train_df,))

# Rows whose decrypted text is not found in train_df get a missing value
# (find_index returns None) and are written as empty cells.
submission = test_df[["ciphertext_id", "predicted_index"]]
submission.to_csv("./working/submission.csv", index=False)
|
|