Benchmark
In [1]:
Copied!
import pandas as pd
import numpy as np
import seaborn as sns
import pandas as pd
import numpy as np
import seaborn as sns
Gene body¶
In [33]:
Copied!
gb_bench_martrix = pd.read_csv('./feature_metrics_gene_body_v3.csv')
gb_bench_martrix = pd.read_csv('./feature_metrics_gene_body_v3.csv')
In [34]:
Copied!
gb_bench_martrix.head()
gb_bench_martrix.head()
Out[34]:
Unnamed: 0 | ARI_Louvain | AMI_Louvain | Homogeneity_Louvain | ARI_leiden | AMI_leiden | Homogeneity_leiden | method | feature_number | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | NaN | NaN | NaN | 0.228188 | 0.554318 | 0.600083 | feature_select_random_ | 3000 |
1 | 1 | NaN | NaN | NaN | 0.229584 | 0.510317 | 0.558553 | feature_select_var_ | 3000 |
2 | 2 | NaN | NaN | NaN | 0.439136 | 0.783196 | 0.834210 | feature_select_sr_ | 3000 |
3 | 3 | NaN | NaN | NaN | 0.205842 | 0.386525 | 0.431316 | feature_select_dispersion_ | 3000 |
4 | 4 | NaN | NaN | NaN | 0.424175 | 0.764100 | 0.817355 | feature_select_residual_ | 3000 |
In [5]:
Copied!
# df = df.rename(columns={'variable':'metric','value':'metric value'})
plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=df,kind='point',col='variable',)
for ax in plot.axes.flatten():
for item in ax.get_xticklabels():
item.set_rotation(45)
#plot.savefig('./figures/gene_body_feature_metric_v3_300_1000.pdf', dpi=300, format=None)
# df = df.rename(columns={'variable':'metric','value':'metric value'})
plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=df,kind='point',col='variable',)
for ax in plot.axes.flatten():
for item in ax.get_xticklabels():
item.set_rotation(45)
#plot.savefig('./figures/gene_body_feature_metric_v3_300_1000.pdf', dpi=300, format=None)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 2 1 # df = df.rename(columns={'variable':'metric','value':'metric value'}) ----> 2 plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=df,kind='point',col='variable',) 3 for ax in plot.axes.flatten(): 4 for item in ax.get_xticklabels(): NameError: name 'df' is not defined
In [35]:
Copied!
df_metrics = gb_bench_martrix
df_metrics = gb_bench_martrix
In [36]:
Copied!
def get_plot_df(res):
gene_selection_via = []
k = []
dim_reduction_method = []
ari = []
ami = []
homogeneity = []
# 使用 DataFrame 的列名直接访问数据并添加到各自的列表
for idx, row in res.iterrows():
gene_selection_via.append(row['method'])
k.append(row['feature_number'])
ari.append(row['ARI_leiden'])
ami.append(row['AMI_leiden'])
homogeneity.append(row['Homogeneity_leiden'])
data_dict = dict(gene_selection_via=gene_selection_via,
k=k,
ari = ari,
ami = ami,
homogeneity = homogeneity
)
data = pd.DataFrame(data_dict)
return data.melt(id_vars=['gene_selection_via','k'])
# 重置索引,将索引列转为普通列
df = df_metrics
df = get_plot_df(df_metrics)
df = df.reset_index(drop=True)
def get_plot_df(res):
gene_selection_via = []
k = []
dim_reduction_method = []
ari = []
ami = []
homogeneity = []
# 使用 DataFrame 的列名直接访问数据并添加到各自的列表
for idx, row in res.iterrows():
gene_selection_via.append(row['method'])
k.append(row['feature_number'])
ari.append(row['ARI_leiden'])
ami.append(row['AMI_leiden'])
homogeneity.append(row['Homogeneity_leiden'])
data_dict = dict(gene_selection_via=gene_selection_via,
k=k,
ari = ari,
ami = ami,
homogeneity = homogeneity
)
data = pd.DataFrame(data_dict)
return data.melt(id_vars=['gene_selection_via','k'])
# 重置索引,将索引列转为普通列
df = df_metrics
df = get_plot_df(df_metrics)
df = df.reset_index(drop=True)
In [37]:
Copied!
# 删除字符末尾的下划线
df['gene_selection_via'] = df['gene_selection_via'].str.rstrip('_')
# 将 'scm' 替换为 'scMethtools'
df['gene_selection_via'] = df['gene_selection_via'].str.replace('scm', 'scMethtools')
# 删除字符末尾的下划线
df['gene_selection_via'] = df['gene_selection_via'].str.rstrip('_')
# 将 'scm' 替换为 'scMethtools'
df['gene_selection_via'] = df['gene_selection_via'].str.replace('scm', 'scMethtools')
In [38]:
Copied!
df
df
Out[38]:
gene_selection_via | k | variable | value | |
---|---|---|---|---|
0 | feature_select_random | 3000 | ari | 0.228188 |
1 | feature_select_var | 3000 | ari | 0.229584 |
2 | feature_select_sr | 3000 | ari | 0.439136 |
3 | feature_select_dispersion | 3000 | ari | 0.205842 |
4 | feature_select_residual | 3000 | ari | 0.424175 |
... | ... | ... | ... | ... |
115 | scMethtools_feature_select_random | 28537 | homogeneity | 0.847398 |
116 | scMethtools_feature_select_var | 28537 | homogeneity | 0.847398 |
117 | scMethtools_feature_select_sr | 28537 | homogeneity | 0.847398 |
118 | scMethtools_feature_select_dispersion | 28537 | homogeneity | 0.847398 |
119 | scMethtools_feature_select_residual | 28537 | homogeneity | 0.847398 |
120 rows × 4 columns
In [40]:
Copied!
method = ['feature_select_random','feature_select_var','feature_select_dispersion','scMethtools_feature_select_sr']
method = ['feature_select_random','feature_select_var','feature_select_dispersion','scMethtools_feature_select_sr']
In [85]:
Copied!
filtered_df = df[df['gene_selection_via'].isin(method)]
filtered_df = df[df['gene_selection_via'].isin(method)]
In [86]:
Copied!
filtered_df
filtered_df
Out[86]:
gene_selection_via | k | variable | value | |
---|---|---|---|---|
0 | feature_select_random | 3000 | ari | 0.228188 |
1 | feature_select_var | 3000 | ari | 0.229584 |
3 | feature_select_dispersion | 3000 | ari | 0.205842 |
5 | feature_select_random | 5000 | ari | 0.320749 |
6 | feature_select_var | 5000 | ari | 0.266127 |
8 | feature_select_dispersion | 5000 | ari | 0.228710 |
10 | feature_select_random | 10000 | ari | 0.387008 |
11 | feature_select_var | 10000 | ari | 0.352698 |
13 | feature_select_dispersion | 10000 | ari | 0.284312 |
15 | feature_select_random | 28537 | ari | 0.422763 |
16 | feature_select_var | 28537 | ari | 0.422763 |
18 | feature_select_dispersion | 28537 | ari | 0.422763 |
22 | scMethtools_feature_select_sr | 3000 | ari | 0.446674 |
27 | scMethtools_feature_select_sr | 5000 | ari | 0.438144 |
32 | scMethtools_feature_select_sr | 10000 | ari | 0.514701 |
37 | scMethtools_feature_select_sr | 28537 | ari | 0.493098 |
40 | feature_select_random | 3000 | ami | 0.554318 |
41 | feature_select_var | 3000 | ami | 0.510317 |
43 | feature_select_dispersion | 3000 | ami | 0.386525 |
45 | feature_select_random | 5000 | ami | 0.659198 |
46 | feature_select_var | 5000 | ami | 0.618409 |
48 | feature_select_dispersion | 5000 | ami | 0.516154 |
50 | feature_select_random | 10000 | ami | 0.720927 |
51 | feature_select_var | 10000 | ami | 0.716170 |
53 | feature_select_dispersion | 10000 | ami | 0.646381 |
55 | feature_select_random | 28537 | ami | 0.772037 |
56 | feature_select_var | 28537 | ami | 0.772037 |
58 | feature_select_dispersion | 28537 | ami | 0.772037 |
62 | scMethtools_feature_select_sr | 3000 | ami | 0.789079 |
67 | scMethtools_feature_select_sr | 5000 | ami | 0.788785 |
72 | scMethtools_feature_select_sr | 10000 | ami | 0.801440 |
77 | scMethtools_feature_select_sr | 28537 | ami | 0.801598 |
80 | feature_select_random | 3000 | homogeneity | 0.600083 |
81 | feature_select_var | 3000 | homogeneity | 0.558553 |
83 | feature_select_dispersion | 3000 | homogeneity | 0.431316 |
85 | feature_select_random | 5000 | homogeneity | 0.701129 |
86 | feature_select_var | 5000 | homogeneity | 0.666750 |
88 | feature_select_dispersion | 5000 | homogeneity | 0.554015 |
90 | feature_select_random | 10000 | homogeneity | 0.777818 |
91 | feature_select_var | 10000 | homogeneity | 0.772963 |
93 | feature_select_dispersion | 10000 | homogeneity | 0.702188 |
95 | feature_select_random | 28537 | homogeneity | 0.824734 |
96 | feature_select_var | 28537 | homogeneity | 0.824734 |
98 | feature_select_dispersion | 28537 | homogeneity | 0.824734 |
102 | scMethtools_feature_select_sr | 3000 | homogeneity | 0.836942 |
107 | scMethtools_feature_select_sr | 5000 | homogeneity | 0.837973 |
112 | scMethtools_feature_select_sr | 10000 | homogeneity | 0.844679 |
117 | scMethtools_feature_select_sr | 28537 | homogeneity | 0.847398 |
In [48]:
Copied!
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
In [110]:
Copied!
import scanpy as sc
import scanpy as sc
In [114]:
Copied!
sc.set_figure_params()
# 设置 Seaborn 样式为白色背景
sns.set_style('white')
sc.set_figure_params()
# 设置 Seaborn 样式为白色背景
sns.set_style('white')
In [117]:
Copied!
plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=filtered_df,kind='point',col='variable')
plot.legend.remove()
plot.add_legend(title='Gene Selection Method', title_fontsize='14', fontsize='14', ncol=4, loc='lower center')
# 设置各个子图标题的字体大小
for ax in plot.axes.flat:
ax.set_title(ax.get_title(), fontsize=16)
# 调整图形布局以确保图例不会被遮挡
plt.subplots_adjust(bottom=0.25)
plot.savefig('./figures/gene_body_feature_metric_v5_300_all.pdf', dpi=300, format=None)
plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=filtered_df,kind='point',col='variable')
plot.legend.remove()
plot.add_legend(title='Gene Selection Method', title_fontsize='14', fontsize='14', ncol=4, loc='lower center')
# 设置各个子图标题的字体大小
for ax in plot.axes.flat:
ax.set_title(ax.get_title(), fontsize=16)
# 调整图形布局以确保图例不会被遮挡
plt.subplots_adjust(bottom=0.25)
plot.savefig('./figures/gene_body_feature_metric_v5_300_all.pdf', dpi=300, format=None)
下采样结果¶
In [118]:
Copied!
downsample_bench_martrix = pd.read_csv('./downsample_feature_metrics_v1.csv')
downsample_bench_martrix = pd.read_csv('./downsample_feature_metrics_v1.csv')
In [120]:
Copied!
downsample_bench_martrix.head()
downsample_bench_martrix.head()
Out[120]:
Unnamed: 0 | Sample | Feature | ARI | AMI | Homogeneity | |
---|---|---|---|---|---|---|
0 | 0 | 10 | 1000 | 0.376579 | 0.660696 | 0.752957 |
1 | 1 | 10 | 3000 | 0.346674 | 0.659011 | 0.754903 |
2 | 2 | 10 | 5000 | 0.355119 | 0.676024 | 0.788836 |
3 | 3 | 10 | 10000 | 0.401669 | 0.695064 | 0.805697 |
4 | 4 | 10 | 28537 | 0.394273 | 0.702486 | 0.808378 |
In [121]:
Copied!
def get_plot_df(res):
sample_num = []
k = []
dim_reduction_method = []
ari = []
ami = []
homogeneity = []
# 使用 DataFrame 的列名直接访问数据并添加到各自的列表
for idx, row in res.iterrows():
sample_num.append(row['Sample'])
k.append(row['Feature'])
ari.append(row['ARI'])
ami.append(row['AMI'])
homogeneity.append(row['Homogeneity'])
data_dict = dict(sample_num=sample_num,
k=k,
ari = ari,
ami = ami,
homogeneity = homogeneity
)
data = pd.DataFrame(data_dict)
return data.melt(id_vars=['sample_num','k'])
# 重置索引,将索引列转为普通列
df = downsample_bench_martrix
df = get_plot_df(downsample_bench_martrix)
df = df.reset_index(drop=True)
def get_plot_df(res):
sample_num = []
k = []
dim_reduction_method = []
ari = []
ami = []
homogeneity = []
# 使用 DataFrame 的列名直接访问数据并添加到各自的列表
for idx, row in res.iterrows():
sample_num.append(row['Sample'])
k.append(row['Feature'])
ari.append(row['ARI'])
ami.append(row['AMI'])
homogeneity.append(row['Homogeneity'])
data_dict = dict(sample_num=sample_num,
k=k,
ari = ari,
ami = ami,
homogeneity = homogeneity
)
data = pd.DataFrame(data_dict)
return data.melt(id_vars=['sample_num','k'])
# 重置索引,将索引列转为普通列
df = downsample_bench_martrix
df = get_plot_df(downsample_bench_martrix)
df = df.reset_index(drop=True)
In [126]:
Copied!
# 将 'k' 列转换为整数类型
df['k'] = df['k'].astype(int)
# 将 'k' 列转换为整数类型
df['k'] = df['k'].astype(int)
In [134]:
Copied!
# 确保列 'k' 是浮点数类型
df['sample_num'] = df['sample_num'].astype(float)
# 将列 'k' 转换为百分数
df['k_percentage'] = df['sample_num'] * 100
# 确保列 'k' 是浮点数类型
df['sample_num'] = df['sample_num'].astype(float)
# 将列 'k' 转换为百分数
df['k_percentage'] = df['sample_num'] * 100
In [139]:
Copied!
# 创建映射字典
mapping = {
10.0: '10% (n=231)',
30.0: '30% (n=693)',
50.0: '50% (n=1156)',
70.0: '70% (n=1619)',
90.0: '90% (n=2081)'
}
# 根据 sample 列的值匹配 detail 列的内容
df['detail'] = df['sample_num'].map(mapping)
# 创建映射字典
mapping = {
10.0: '10% (n=231)',
30.0: '30% (n=693)',
50.0: '50% (n=1156)',
70.0: '70% (n=1619)',
90.0: '90% (n=2081)'
}
# 根据 sample 列的值匹配 detail 列的内容
df['detail'] = df['sample_num'].map(mapping)
In [140]:
Copied!
df
df
Out[140]:
sample_num | k | variable | value | k_percentage | detail | |
---|---|---|---|---|---|---|
0 | 10.0 | 1000 | ari | 0.376579 | 1000.0 | 10% (n=231) |
1 | 10.0 | 3000 | ari | 0.346674 | 1000.0 | 10% (n=231) |
2 | 10.0 | 5000 | ari | 0.355119 | 1000.0 | 10% (n=231) |
3 | 10.0 | 10000 | ari | 0.401669 | 1000.0 | 10% (n=231) |
4 | 10.0 | 28537 | ari | 0.394273 | 1000.0 | 10% (n=231) |
... | ... | ... | ... | ... | ... | ... |
70 | 90.0 | 1000 | homogeneity | 0.831687 | 9000.0 | 90% (n=2081) |
71 | 90.0 | 3000 | homogeneity | 0.834220 | 9000.0 | 90% (n=2081) |
72 | 90.0 | 5000 | homogeneity | 0.840011 | 9000.0 | 90% (n=2081) |
73 | 90.0 | 10000 | homogeneity | 0.839620 | 9000.0 | 90% (n=2081) |
74 | 90.0 | 28537 | homogeneity | 0.846224 | 9000.0 | 90% (n=2081) |
75 rows × 6 columns
In [141]:
Copied!
plot = sns.catplot(x='k',y='value',hue='detail',data=df,kind='point',col='variable')
plot.legend.remove()
plot.add_legend(title='Sample fraction', title_fontsize='14', fontsize='14', ncol=5, loc='lower center')
# 设置各个子图标题的字体大小
for ax in plot.axes.flat:
ax.set_title(ax.get_title(), fontsize=16)
# 调整图形布局以确保图例不会被遮挡
plt.subplots_adjust(bottom=0.25)
plot.savefig('./figures/downsample_feature_metric_all.pdf', dpi=300, format=None)
plot = sns.catplot(x='k',y='value',hue='detail',data=df,kind='point',col='variable')
plot.legend.remove()
plot.add_legend(title='Sample fraction', title_fontsize='14', fontsize='14', ncol=5, loc='lower center')
# 设置各个子图标题的字体大小
for ax in plot.axes.flat:
ax.set_title(ax.get_title(), fontsize=16)
# 调整图形布局以确保图例不会被遮挡
plt.subplots_adjust(bottom=0.25)
plot.savefig('./figures/downsample_feature_metric_all.pdf', dpi=300, format=None)
In [142]:
Copied!
df.to_csv('downsample_feature_metrics_v2.csv')
df.to_csv('downsample_feature_metrics_v2.csv')
In [ ]:
Copied!