# NOTE(review): "Spaces: Sleeping" — apparently hosting-page status text captured with this file; not part of the program.
import matplotlib.pyplot as plt | |
import matplotlib | |
matplotlib.use('agg') | |
import plot_utils | |
from constants import * | |
class MatplotlibDataPlotter:
    """Render bar plots for single-domain and domain-pair tables.

    For every ``cds_region_id`` that passes a minimum-domain-count filter,
    the row with the highest ``cosine_similarity_<split_name>`` value is
    selected; the ``biosyn_class_index`` and ``profile_name`` of those rows
    are then handed to ``plot_utils.draw_barplots``.

    Two figures are created once in ``__init__`` and reused: each plotting
    call clears its figure and redraws on it, so the figure object returned
    by an earlier call is overwritten by the next call to the same method.
    """

    # Fixed layout settings forwarded unchanged to plot_utils.draw_barplots.
    _TOP_N = 5
    _BIN_WIDTH = 1
    _HUE_GROUP_OFFSET = 0.5
    _BAR_WIDTH = 0.9

    def __init__(self, single_df, pair_df, num_domains_in_region_df):
        """Store the input tables and allocate one figure per plot type.

        Args:
            single_df: Table read by ``plot_single_domains``. The columns
                accessed are ``cds_region_id``, ``biosyn_class``,
                ``biosyn_class_index``, ``profile_name`` and
                ``cosine_similarity_<split_name>``.
            pair_df: Table read by ``plot_pair_domains``; the same columns
                are accessed as for ``single_df``.
            num_domains_in_region_df: Table with ``cds_region_id`` and
                ``num_domains`` columns, used to filter regions.
        """
        self.single_df = single_df
        self.pair_df = pair_df
        self.num_domains_in_region_df = num_domains_in_region_df
        self.single_domains_fig = plt.figure(figsize=(5, 10))
        self.pair_domains_fig = plt.figure(figsize=(5, 10))

    def plot_single_domains(self, num_domains, split_name="stratified"):
        """Redraw and return the single-domain figure.

        Args:
            num_domains: Only regions whose ``num_domains`` value is greater
                than or equal to this threshold are plotted.
            split_name: Suffix selecting the ``cosine_similarity_<split_name>``
                column used to pick one row per region.

        Returns:
            ``self.single_domains_fig`` after it has been cleared and redrawn.
        """
        return self._plot_top_profiles(
            self.single_df, self.single_domains_fig, num_domains, split_name
        )

    def plot_pair_domains(self, num_domains, split_name="stratified"):
        """Redraw and return the domain-pair figure.

        Args:
            num_domains: Only regions whose ``num_domains`` value is greater
                than or equal to this threshold are plotted.
            split_name: Suffix selecting the ``cosine_similarity_<split_name>``
                column used to pick one row per region.

        Returns:
            ``self.pair_domains_fig`` after it has been cleared and redrawn.
        """
        return self._plot_top_profiles(
            self.pair_df, self.pair_domains_fig, num_domains, split_name
        )

    def _plot_top_profiles(self, df, fig, num_domains, split_name):
        """Filter ``df`` by region, pick one row per region, and draw on ``fig``."""
        region_counts = self.num_domains_in_region_df
        selected_region_ids = region_counts.loc[
            region_counts['num_domains'] >= num_domains,
            'cds_region_id',
        ].values
        subset = df.loc[df['cds_region_id'].isin(selected_region_ids)]

        # Number of distinct regions per biosynthetic class in the subset,
        # as a {biosyn_class: count} mapping.
        class_counts = (
            subset[['cds_region_id', 'biosyn_class']]
            .drop_duplicates()
            .groupby("biosyn_class", as_index=False)
            .count()
        )
        hue2count = dict(class_counts.values)

        # Index label of the highest-similarity row within each region.
        # NOTE(review): the .loc lookups below are label-based, so this
        # assumes the index labels of ``df`` are unique -- confirm.
        column_name = f'cosine_similarity_{split_name}'
        best_row_index = subset.groupby('cds_region_id').agg(
            {column_name: 'idxmax'}
        ).values.flatten()
        targets_list = subset.loc[best_row_index, 'biosyn_class_index'].values
        label_list = subset.loc[best_row_index, 'profile_name'].values

        # Reuse the pre-allocated figure: wipe it, then draw on a fresh axes.
        fig.clf()
        ax = fig.gca()
        plot_utils.draw_barplots(
            targets_list,
            label_list=label_list,
            top_n=self._TOP_N,
            bin_width=self._BIN_WIDTH,
            hue_group_offset=self._HUE_GROUP_OFFSET,
            hue_order=BIOSYN_CLASS_NAMES,
            hue2count=hue2count,
            width=self._BAR_WIDTH,
            ax=ax,
            show_legend=False,
            palette=COLOR_PALETTE,
        )
        fig.tight_layout()
        return fig