
feat: Add separate sliders for all and selected repositories in the PapersWithCode tasks tab
15420a6
import pandas as pd | |
import ast | |
import json | |
import plotly.express as px | |
import plotly.graph_objects as go | |
class TaskVisualizations:
    """Build sunburst charts of task counts grouped by area.

    Two data frames are held: one for all repositories and one for the
    selected repositories. Both are produced by joining the same
    task-to-area table with a different task-count table.
    """

    def __init__(
        self, task_counts_path, selected_task_counts_path, tasks_with_areas_path
    ):
        """Load the "all" and "selected" task tables from CSV files.

        Args:
            task_counts_path: CSV with task counts for all repositories.
            selected_task_counts_path: CSV with task counts for the selected
                repositories.
            tasks_with_areas_path: CSV joined with each count table on the
                ``task`` column (presumably the one carrying the ``area``
                column -- confirm against the data files).
        """
        self.tasks_with_areas_df = self.load_tasks_with_areas_df(
            task_counts_path, tasks_with_areas_path
        )
        self.selected_tasks_with_areas_df = self.load_tasks_with_areas_df(
            selected_task_counts_path, tasks_with_areas_path
        )

    @classmethod
    def load_tasks_with_areas_df(
        cls, task_counts_path, tasks_with_areas_path="data/paperswithcode_tasks.csv"
    ) -> pd.DataFrame:
        """Read both CSV files and join them on the ``task`` column.

        ``DataFrame.merge`` defaults to an inner join, so tasks present in
        only one of the two files are dropped.
        """
        task_counts_df = pd.read_csv(task_counts_path)
        raw_tasks_with_areas_df = pd.read_csv(tasks_with_areas_path)
        return raw_tasks_with_areas_df.merge(task_counts_df, on="task")

    @classmethod
    def get_topk_merge_others(
        cls, df, by_col, val_col, k=10, val_threshold=1000
    ) -> pd.DataFrame:
        """Keep the ``k`` largest labels and fold everything else into "other".

        Args:
            df: Frame containing ``by_col`` and ``val_col``; it is not modified.
            by_col: Column holding the labels.
            val_col: Column holding the values to rank and sum.
            k: Number of highest-valued rows whose labels may be kept.
            val_threshold: A top-``k`` label is kept only if its value is at
                least this large.

        Returns:
            A frame indexed by ``by_col`` with a single ``val_col`` column
            holding the summed values per surviving label (plus "other").
        """
        # sort_values returns a new frame, so the caller's data is untouched.
        ranked = df.sort_values(val_col, ascending=False)
        top_rows = ranked.iloc[:k]
        # If a label occurs more than once in the top rows, the last (smallest)
        # value is the one compared with the threshold.
        top_values = dict(zip(top_rows[by_col], top_rows[val_col]))
        kept_labels = {
            label for label, value in top_values.items() if value >= val_threshold
        }
        ranked[by_col] = ranked[by_col].map(
            lambda label: label if label in kept_labels else "other"
        )
        # The string alias avoids the pandas warning raised for the builtin sum.
        return ranked.groupby(by_col).agg({val_col: "sum"})

    @classmethod
    def get_displayed_tasks_with_areas_df(
        cls,
        tasks_with_areas_df,
        min_task_count,
        top_k=10,
        val_threshold=1000,
    ) -> pd.DataFrame:
        """Prepare the per-area task table shown in a sunburst chart.

        Rows containing missing values are dropped. Tasks whose count is below
        ``min_task_count`` are renamed to "other", and then each area is reduced
        with :meth:`get_topk_merge_others`. Finally every task label gets its
        summed count appended (``"<task> <count>"``).

        NOTE(review): with the default ``val_threshold`` of 1000, any task with
        fewer than 1000 occurrences ends up in "other" even when
        ``min_task_count`` is smaller -- confirm that this is intended.

        Args:
            tasks_with_areas_df: Frame with ``area``, ``task`` and ``count``
                columns.
            min_task_count: Tasks with a smaller count are merged into "other".
            top_k: Forwarded as ``k`` to :meth:`get_topk_merge_others`.
            val_threshold: Forwarded to :meth:`get_topk_merge_others`.

        Returns:
            A frame with ``area``, ``task`` and ``count`` columns.
        """
        tasks_df = tasks_with_areas_df.dropna().copy()
        if tasks_df.empty:
            # Nothing to group; return an empty frame of the expected shape.
            return pd.DataFrame(columns=["area", "task", "count"])

        tasks_df["task"] = tasks_df["task"].where(
            tasks_df["count"] >= min_task_count, "other"
        )
        # Selecting the two needed columns keeps the grouping column out of the
        # frames passed to apply, which newer pandas versions warn about.
        displayed_df = (
            tasks_df.groupby("area")[["task", "count"]]
            .apply(
                lambda area_df: cls.get_topk_merge_others(
                    area_df, "task", "count", k=top_k, val_threshold=val_threshold
                )
            )
            .reset_index()
        )
        displayed_df["task"] = (
            displayed_df["task"] + " " + displayed_df["count"].map(str)
        )
        return displayed_df

    def get_tasks_sunbursts(self, min_task_count_all, min_task_count_selected):
        """Return the (all, selected) pair of area/task sunburst figures.

        Args:
            min_task_count_all: Minimum task count for the all-repositories
                chart.
            min_task_count_selected: Minimum task count for the
                selected-repositories chart.

        Returns:
            A tuple ``(all_sunburst, selected_sunburst)`` of figures created
            with ``plotly.express.sunburst``.
        """
        displayed_tasks_all_df = self.get_displayed_tasks_with_areas_df(
            self.tasks_with_areas_df, min_task_count_all
        )
        displayed_tasks_selected_df = self.get_displayed_tasks_with_areas_df(
            self.selected_tasks_with_areas_df, min_task_count_selected
        )
        all_sunburst = px.sunburst(
            displayed_tasks_all_df, path=["area", "task"], values="count"
        )
        selected_sunburst = px.sunburst(
            displayed_tasks_selected_df, path=["area", "task"], values="count"
        )
        return all_sunburst, selected_sunburst