# Source: aideml / sample_results / ciphertext-challenge-ii.py
# Committed by dominikschmidt ("add open-source AIDE", commit 39c930a)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
# Load the labelled training data and the test set to be decrypted.
train_df = pd.read_csv("./input/training.csv")
test_df = pd.read_csv("./input/test.csv")

# Hold out 10% of the training rows for validation (fixed seed for repeatability).
train, val = train_test_split(train_df, test_size=0.1, random_state=42)

# Work on an explicit copy: the split result is derived from train_df, and
# adding columns to it directly can trigger pandas' SettingWithCopyWarning.
val = val.copy()

# Placeholder for actual encryption: the validation "ciphertext" is currently
# just the plaintext itself.
val["ciphertext"] = val["text"]
# Function to perform frequency analysis on a given text
def frequency_analysis(text: str) -> str:
    """Return *text* reduced to its ASCII letters, upper-cased.

    Despite the name, no per-letter counts are computed: the result is the
    normalised letter sequence itself, which is what callers currently
    receive as the "frequency map".

    Args:
        text: Arbitrary input text.

    Returns:
        The input with every character outside ``A-Z``/``a-z`` removed and
        the remaining letters converted to upper case. An input without any
        ASCII letters yields the empty string.
    """
    # Drop everything that is not an ASCII letter (digits, spaces,
    # punctuation, accented characters, ...).
    letters_only = re.sub(r"[^A-Za-z]", "", text)
    return letters_only.upper()
# Function to decrypt a simple substitution cipher using frequency analysis
def decrypt_substitution_cipher(ciphertext, frequency_map):
    """Return the decryption of *ciphertext* (currently a pass-through stub).

    Args:
        ciphertext: The encrypted text to decode.
        frequency_map: Output of the frequency analysis step. It is accepted
            for interface compatibility but not used yet.

    Returns:
        ``ciphertext`` unchanged; no decryption is implemented so far.
    """
    # Placeholder for actual decryption.
    return ciphertext
# Build the frequency map from all validation plaintexts concatenated together.
validation_plaintext = "".join(val["text"])
frequency_map = frequency_analysis(validation_plaintext)

# Run every validation ciphertext through the decryption routine; the result
# is later compared against the known plaintext.
val["predicted_text"] = val["ciphertext"].apply(
    decrypt_substitution_cipher, args=(frequency_map,)
)
# Find the corresponding 'index' from the training set where the decrypted text matches the plaintext
def find_index(predicted_text, train_df: pd.DataFrame):
    """Return the ``index`` value of the first row whose text matches.

    Args:
        predicted_text: Decrypted text to look up.
        train_df: Frame providing a ``text`` column to search and an
            ``index`` column to return from.

    Returns:
        The ``index`` column value of the first row (in frame order) whose
        ``text`` equals ``predicted_text``, or ``None`` if no row matches.
    """
    # Walk the two columns in lockstep rather than using iterrows(), which
    # builds a full Series object for every row just to read two fields.
    return next(
        (
            row_index
            for row_text, row_index in zip(train_df["text"], train_df["index"])
            if row_text == predicted_text
        ),
        None,
    )
# Map each training plaintext to its 'index' once, so every lookup below is a
# dictionary hit instead of a full scan of the training set. setdefault keeps
# the first occurrence, i.e. the same row find_index would return.
text_to_index = {}
for known_text, known_index in zip(train_df["text"], train_df["index"]):
    text_to_index.setdefault(known_text, known_index)

# Unmatched texts map to None, exactly like find_index.
val["predicted_index"] = val["predicted_text"].apply(text_to_index.get)

# Calculate the accuracy of the predicted index
accuracy = accuracy_score(val["index"], val["predicted_index"])
print(f"Validation Accuracy: {accuracy}")

# Decrypt the test set, then translate the decrypted text into a training
# 'index' the same way as for validation. Storing the decrypted text itself
# in 'predicted_index' would put text, not indices, into the submission.
test_plaintext = test_df["ciphertext"].apply(
    lambda x: decrypt_substitution_cipher(x, frequency_map)
)
test_df["predicted_index"] = test_plaintext.apply(text_to_index.get)

# NOTE(review): the output column is named 'predicted_index'; confirm this
# matches the header the competition's sample submission expects.
submission = test_df[["ciphertext_id", "predicted_index"]]
submission.to_csv("./working/submission.csv", index=False)