import json
import zipfile

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

from .config import TAG_NAMES

# Archive opened when the caller supplies no path; this is the location the
# code previously hard-coded.
_DEFAULT_DATASET_ZIP = 'code_classification_dataset.zip'

# Text fields copied from every JSON record into the loaded frame.
_FEATURE_COLUMNS = [
    "prob_desc_description",
    "prob_desc_input_spec",
    "prob_desc_output_spec",
]


def load_data(test_data_path):
    """Load every JSON record stored in a zip archive into a DataFrame.

    Args:
        test_data_path: Path of the zip archive to read. When it is ``None``
            or empty, ``'code_classification_dataset.zip'`` is used instead.

    Returns:
        A ``pandas.DataFrame`` with one row per archive member and the
        columns ``prob_desc_description``, ``prob_desc_input_spec``,
        ``prob_desc_output_spec`` and ``tags``.

    Raises:
        KeyError: If a record lacks one of the expected keys.
        json.JSONDecodeError: If an archive member is not valid JSON.
    """
    archive_path = test_data_path or _DEFAULT_DATASET_ZIP
    cols = _FEATURE_COLUMNS + ["tags"]
    rows = []

    # Context managers guarantee the archive and each member are closed even
    # when a record fails to parse.
    with zipfile.ZipFile(archive_path) as archive:
        for info in archive.infolist():
            # Directory entries carry no payload. The previous code skipped
            # the first member unconditionally (presumably the top-level
            # folder entry); testing each entry also copes with archives that
            # have no such entry or that contain nested folders.
            if info.is_dir():
                continue
            with archive.open(info) as member:
                record = json.loads(member.read())
            rows.append([record[c] for c in cols])

    return pd.DataFrame(rows, columns=cols)


def _multi_hot(tags):
    """Return a 0/1 list aligned with ``TAG_NAMES`` for one record's tags."""
    # Assumes a `tags` cell is a collection of tag-name strings (or a single
    # string) -- TODO confirm against the dataset schema. Anything else
    # (e.g. None/NaN) is treated as "no tags".
    if isinstance(tags, str):
        present = {tags}
    elif isinstance(tags, (list, tuple, set, frozenset)):
        present = set(tags)
    else:
        present = set()
    return [int(name in present) for name in TAG_NAMES]


def preprocessing(df):
    """Reduce a loaded frame to the ``text`` / ``labels`` pair used downstream.

    Args:
        df: Frame containing ``prob_desc_description`` and either one column
            per entry of ``TAG_NAMES`` or a ``tags`` column (as produced by
            ``load_data``).

    Returns:
        A ``pandas.DataFrame`` with two columns:
        ``text`` (the problem descriptions) and ``labels`` (one list per row
        with one value per entry of ``TAG_NAMES``).

    Raises:
        KeyError: If the description column is missing, or if neither the
            per-tag columns nor a ``tags`` column are present.
    """
    texts = df["prob_desc_description"].values.tolist()

    missing = [name for name in TAG_NAMES if name not in df.columns]
    if not missing:
        # Labels are already available as one column per tag.
        labels = df[TAG_NAMES].values.tolist()
    elif "tags" in df.columns:
        # Frames coming straight from load_data only carry the raw `tags`
        # column, so derive the multi-hot label vectors from it.
        labels = [_multi_hot(tags) for tags in df["tags"]]
    else:
        raise KeyError(
            f"label columns {missing} not found and no 'tags' column to derive them from"
        )

    # texts  = ["text1", "text2", ...]
    # labels = [[0, 1, 0, ...], [0, 1, 1, ...], ...]
    return pd.DataFrame({'text': texts, 'labels': labels})


def evaluate_model(test_data_path, *, predict_fn=None):
    """Load and preprocess the test set, then score predictions against it.

    Args:
        test_data_path: Path of the zip archive holding the test records;
            forwarded to ``load_data``.
        predict_fn: Callable that receives the list of texts and returns one
            0/1 label vector per text, ordered like ``TAG_NAMES``. It is
            required in practice: the module itself defines no model.

    Returns:
        The ``classification_report`` for the predictions, as a dictionary
        (``output_dict=True``) with one entry per tag name plus the averages.

    Raises:
        NotImplementedError: If ``predict_fn`` is not supplied.
        ValueError: If the test set is empty or the predictions do not have
            the same shape as the reference labels.
    """
    # Fail before touching the disk: without predictions there is nothing to
    # score (the previous code returned an undefined name at this point).
    if predict_fn is None:
        raise NotImplementedError(
            "evaluate_model needs a predict_fn to produce predictions; "
            "no model is defined in this module"
        )

    df = preprocessing(load_data(test_data_path))
    if df.empty:
        raise ValueError("the test set contains no samples")

    y_true = np.asarray(df["labels"].tolist())
    y_pred = np.asarray(predict_fn(df["text"].tolist()))
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"predictions have shape {y_pred.shape}, expected {y_true.shape}"
        )

    metrics = classification_report(
        y_true,
        y_pred,
        target_names=list(TAG_NAMES),
        output_dict=True,
    )
    return metrics