GSE56879
In [1]:
Copied!
import pandas as pd
import numpy as np
import scanpy as sc
import scMethtools as scm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scanpy as sc
import scMethtools as scm
import matplotlib.pyplot as plt
#### #### # # ##### # # ##### #### #### # #### # # # ## ## # # # # # # # # # # #### # # ## # # ###### # # # # # # #### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #### #### # # # # # # #### #### ###### #### Version: 1.0.0, Tutorials: https://ngdc.cncb.ac.cn/methbank/scm
In [40]:
Copied!
windows = scm.pp.sliding_windows(window_size=100000,step_size=100000,ref=scm.ref.hg38)
windows = scm.pp.sliding_windows(window_size=100000,step_size=100000,ref=scm.ref.hg38)
In [21]:
Copied!
scbs = ad.read('/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scbs/scMethools.h5ad')
scbs = ad.read('/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scbs/scMethools.h5ad')
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/anndata/__init__.py:51: FutureWarning: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024. warnings.warn(
In [36]:
Copied!
scbs.X
scbs.X
Out[36]:
array([[ 0. , 0. , 0. , ..., 0. , 0. , 0. ], [ 0. , 0. , 0. , ..., 0. , 0. , 0. ], [ 0. , 0. , 0. , ..., 0. , 0. , 0. ], ..., [ 0. , 0. , 0. , ..., 0. , -0.34285364, 0. ], [ 0. , 0. , 0. , ..., 0. , 0. , 0. ], [ 0. , 0. , 0.15473133, ..., 0. , 0.23935609, 0. ]])
In [26]:
Copied!
enhancers = {'chr1':{(69960,73220),(173160,175770),(255690,257990)}}
enhancers
enhancers = {'chr1':{(69960,73220),(173160,175770),(255690,257990)}}
enhancers
Out[26]:
{'chr1': {(69960, 73220), (173160, 175770), (255690, 257990)}}
In [41]:
Copied!
scm.pp.features_to_scm(features=[windows],feature_names=["test_scbs_smooth"],meta_df=None,npz_path="/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scm/",output_dir="/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scbs/filtered_data",out_file=None,cpu=20,relative=True,smooth=False,copy=False)
scm.pp.features_to_scm(features=[windows],feature_names=["test_scbs_smooth"],meta_df=None,npz_path="/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scm/",output_dir="/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scbs/filtered_data",out_file=None,cpu=20,relative=True,smooth=False,copy=False)
Saving results in: /xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scbs/filtered_data
In [37]:
Copied!
import anndata as ad
test = ad.read('/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scbs/filtered_data/test_scbs.h5ad')
import anndata as ad
test = ad.read('/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scbs/filtered_data/test_scbs.h5ad')
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/anndata/__init__.py:51: FutureWarning: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024. warnings.warn(
In [38]:
Copied!
test.X.toarray()
test.X.toarray()
Out[38]:
array([[nan, nan, 0. ], [nan, nan, nan], [0. , nan, nan], ..., [nan, nan, 1. ], [nan, nan, nan], [nan, 0.5, nan]], dtype=float32)
In [39]:
Copied!
test.layers['relative'].toarray()
test.layers['relative'].toarray()
Out[39]:
array([[ nan, nan, 0. ], [ nan, nan, nan], [0. , nan, nan], ..., [ nan, nan, 0.5 ], [ nan, nan, nan], [ nan, 0.01698969, nan]], dtype=float32)
In [8]:
Copied!
scm_100k = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE56789/sscm/windows.h5")
scm_100k = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE56789/sscm/windows.h5")
In [10]:
Copied!
scm_1k = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE56789/sscm/w1k.h5")
scm_1k = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE56789/sscm/w1k.h5")
In [11]:
Copied!
#add meta information
scm.pp.add_meta(scm_1k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE56789/meta/GSE56879_Meta.txt',sep='\t',index_col='sample')
#add meta information
scm.pp.add_meta(scm_1k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE56789/meta/GSE56879_Meta.txt',sep='\t',index_col='sample')
No column 'label' found in metadata, 'unknown' is used as the default cell labels No column 'label_color' found in metadata, random color is generated for each cell label
In [12]:
Copied!
# filters
# filter by cell
scm.pp.filter_cells(scm_1k,min_n_features = 10000)
# filters
# filter by cell
scm.pp.filter_cells(scm_1k,min_n_features = 10000)
filter cells based on min_n_features >= 10000 after filtering out low-quality cells: 46 cells, 2725532 regions
In [13]:
Copied!
# filter by features
scm.pp.filter_features(scm_1k,min_n_cells=40)
# filter by features
scm.pp.filter_features(scm_1k,min_n_cells=40)
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scMethtools/preprocessing/scm.py:193: RuntimeWarning: Mean of empty slice feature_mean = np.nanmean(dense_X, axis=0)
Filter regions based on min_n_cells After filtering out low coveraged regions: 46 cells, 67300 regions
In [8]:
Copied!
scm_1k.var
scm_1k.var
Out[8]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | |
---|---|---|---|---|---|---|---|---|
index | ||||||||
chr1:3003001-3004000 | chr1 | 3003001 | 3004000 | 33 | 0.140508 | 0.112315 | 0.683879 | 0.717391 |
chr1:3020001-3021000 | chr1 | 3020001 | 3021000 | 31 | 0.151429 | 0.106888 | 0.545968 | 0.673913 |
chr1:3021001-3022000 | chr1 | 3021001 | 3022000 | 30 | 0.135351 | 0.079366 | 0.660300 | 0.652174 |
chr1:3027001-3028000 | chr1 | 3027001 | 3028000 | 31 | 0.140963 | 0.125479 | 0.545516 | 0.673913 |
chr1:3070001-3071000 | chr1 | 3070001 | 3071000 | 31 | 0.133552 | 0.103920 | 0.659839 | 0.673913 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:90823001-90824000 | chrY | 90823001 | 90824000 | 35 | 0.051064 | 0.020274 | 0.126743 | 0.760870 |
chrY:90824001-90825000 | chrY | 90824001 | 90825000 | 36 | 0.082315 | 0.065267 | 0.553028 | 0.782609 |
chrY:90825001-90826000 | chrY | 90825001 | 90826000 | 40 | 0.073739 | 0.080637 | 0.578650 | 0.869565 |
chrY:90828001-90829000 | chrY | 90828001 | 90829000 | 37 | 0.109403 | 0.079763 | 0.329270 | 0.804348 |
chrY:90829001-90830000 | chrY | 90829001 | 90830000 | 30 | 0.119436 | 0.066563 | 0.353967 | 0.652174 |
535825 rows × 8 columns
In [ ]:
Copied!
#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
alpha=0.05
def make_panel(x,
y,
ax,
letter='',
title='',
color_kde=True,
xlim=None,
ylim=None,
xlabel='mean methylation level',
ylabel='',
xscale='linear',
yscale='log'):
ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
if color_kde:
xtrans = np.log10(x) if xscale == 'log' else x
ytrans = np.log10(y) if yscale == 'log' else y
data = np.vstack((xtrans,ytrans))
kde = gaussian_kde(data)
logdensity = kde.logpdf(data)
points = ax.scatter(x, y, s=dotsize, c=logdensity, rasterized=True)
else:
ax.scatter(x, y, s=dotsize, rasterized=True)
ax.set_xscale(xscale)
ax.set_yscale(yscale)
ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)
if xlim:
ax.set_xlim(xlim)
if ylim:
ax.set_ylim(ylim)
ax.set_title(title)
def add_line(x,y,ax):
ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
def fix_ticks(ax):
ax.set_yticks(ax.get_yticks()[1:-1])
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,9)) #constrained_layout=True)
gs = GridSpec(1,2 , figure=fig)
ax1 = fig.add_subplot(gs[0, 0]) #var
ax2 = fig.add_subplot(gs[0, 1]) #sr_var
make_panel(x=scm_1k.var['mean'],
y=scm_1k.var['var'],
ax=ax1,
letter='a',
ylabel=r'var',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
fix_ticks(ax1)
make_panel(x=scm_1k.var['mean'],
y=scm_1k.var['sr_var'],
ax=ax2,
letter='b',
ylabel=r'shrunken residual var',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)
# ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)
sns.despine()
plt.tight_layout()
#plt.savefig('figures/Fig1.pdf', dpi=300, format=None)
plt.show()
#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
alpha=0.05
def make_panel(x,
y,
ax,
letter='',
title='',
color_kde=True,
xlim=None,
ylim=None,
xlabel='mean methylation level',
ylabel='',
xscale='linear',
yscale='log'):
ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
if color_kde:
xtrans = np.log10(x) if xscale == 'log' else x
ytrans = np.log10(y) if yscale == 'log' else y
data = np.vstack((xtrans,ytrans))
kde = gaussian_kde(data)
logdensity = kde.logpdf(data)
points = ax.scatter(x, y, s=dotsize, c=logdensity, rasterized=True)
else:
ax.scatter(x, y, s=dotsize, rasterized=True)
ax.set_xscale(xscale)
ax.set_yscale(yscale)
ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)
if xlim:
ax.set_xlim(xlim)
if ylim:
ax.set_ylim(ylim)
ax.set_title(title)
def add_line(x,y,ax):
ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
def fix_ticks(ax):
ax.set_yticks(ax.get_yticks()[1:-1])
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,9)) #constrained_layout=True)
gs = GridSpec(1,2 , figure=fig)
ax1 = fig.add_subplot(gs[0, 0]) #var
ax2 = fig.add_subplot(gs[0, 1]) #sr_var
make_panel(x=scm_1k.var['mean'],
y=scm_1k.var['var'],
ax=ax1,
letter='a',
ylabel=r'var',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
fix_ticks(ax1)
make_panel(x=scm_1k.var['mean'],
y=scm_1k.var['sr_var'],
ax=ax2,
letter='b',
ylabel=r'shrunken residual var',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)
# ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)
sns.despine()
plt.tight_layout()
#plt.savefig('figures/Fig1.pdf', dpi=300, format=None)
plt.show()
In [20]:
Copied!
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.1,0.6)
titleletter_loc = (-0.17,1.015)
hline_width = 1.5
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.1,0.6)
titleletter_loc = (-0.17,1.015)
hline_width = 1.5
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
In [60]:
Copied!
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.1,0.6)
titleletter_loc = (-0.17,1.015)
hline_width = 1.5
gene_means = scm_1k.var['mean']
sqrt_var = scm_1k.var['var']
sqrt_variable_genes_idx = scm_1k.var['feature_select_var']
residual_var = scm_1k.var['sr_var']
residuals_variable_genes_idx = scm_1k.var['feature_select']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 6, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
supaxis = fig.add_subplot(gs[:, 4:6])
ax3 = fig.add_subplot(gs[0, 4])
ax4 = fig.add_subplot(gs[1, 4])
ax5 = fig.add_subplot(gs[0, 5])
ax6 = fig.add_subplot(gs[1, 5])
tsne_axes = [ax3,ax4,ax5,ax6]
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'var')
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
# ax1.scatter(gene_means[residuals_variable_genes_idx & example_genes_idx],sqrt_var[residuals_variable_genes_idx & example_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.scatter(gene_means[~residuals_variable_genes_idx & example_genes_idx],sqrt_var[~residuals_variable_genes_idx & example_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.hlines(dataset['sqrt_lazy_threshold'],xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=sqrt_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_a,lines=example_gene_lines_a,ax=ax1)
# add_largedot_legend(ax1,'lower right',dict(fontsize=15))
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'sr_var')
ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
# ax2.scatter(gene_means[example_genes_idx & sqrt_variable_genes_idx],residual_var[example_genes_idx & sqrt_variable_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.scatter(gene_means[example_genes_idx & ~sqrt_variable_genes_idx],residual_var[example_genes_idx & ~sqrt_variable_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.hlines(dataset['residuals_theta100_threshold'],xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=residual_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_b,lines=example_gene_lines_b,ax=ax2)
# add_largedot_legend(ax2,'lower right',dict(fontsize=15))
# supaxis.text(*titleletter_loc,'c',transform=supaxis.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
# supaxis.axis('off')
# for (tsne_ax,example_gene) in zip(tsne_axes,example_genes):
# gene_idx = dataset['genes'] == example_gene
# sqrted_counts = np.squeeze(dataset['sqrt_lazy'][:,gene_idx])
# tsne_ax.set_title(example_gene.lower().capitalize())
# tsne_ax.scatter(dataset['residuals_theta100_tsne']['coords'][:,0], dataset['residuals_theta100_tsne']['coords'][:,1], s=1,c=sqrted_counts, edgecolor='none',cmap=cmap, rasterized=True)
# tsne_ax.axis('off')
sns.despine()
plt.tight_layout()
# plt.savefig('figures/FigS3.pdf', dpi=300, format=None)
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.1,0.6)
titleletter_loc = (-0.17,1.015)
hline_width = 1.5
gene_means = scm_1k.var['mean']
sqrt_var = scm_1k.var['var']
sqrt_variable_genes_idx = scm_1k.var['feature_select_var']
residual_var = scm_1k.var['sr_var']
residuals_variable_genes_idx = scm_1k.var['feature_select']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 6, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
supaxis = fig.add_subplot(gs[:, 4:6])
ax3 = fig.add_subplot(gs[0, 4])
ax4 = fig.add_subplot(gs[1, 4])
ax5 = fig.add_subplot(gs[0, 5])
ax6 = fig.add_subplot(gs[1, 5])
tsne_axes = [ax3,ax4,ax5,ax6]
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'var')
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
# ax1.scatter(gene_means[residuals_variable_genes_idx & example_genes_idx],sqrt_var[residuals_variable_genes_idx & example_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.scatter(gene_means[~residuals_variable_genes_idx & example_genes_idx],sqrt_var[~residuals_variable_genes_idx & example_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.hlines(dataset['sqrt_lazy_threshold'],xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=sqrt_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_a,lines=example_gene_lines_a,ax=ax1)
# add_largedot_legend(ax1,'lower right',dict(fontsize=15))
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'sr_var')
ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
# ax2.scatter(gene_means[example_genes_idx & sqrt_variable_genes_idx],residual_var[example_genes_idx & sqrt_variable_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.scatter(gene_means[example_genes_idx & ~sqrt_variable_genes_idx],residual_var[example_genes_idx & ~sqrt_variable_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.hlines(dataset['residuals_theta100_threshold'],xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=residual_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_b,lines=example_gene_lines_b,ax=ax2)
# add_largedot_legend(ax2,'lower right',dict(fontsize=15))
# supaxis.text(*titleletter_loc,'c',transform=supaxis.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
# supaxis.axis('off')
# for (tsne_ax,example_gene) in zip(tsne_axes,example_genes):
# gene_idx = dataset['genes'] == example_gene
# sqrted_counts = np.squeeze(dataset['sqrt_lazy'][:,gene_idx])
# tsne_ax.set_title(example_gene.lower().capitalize())
# tsne_ax.scatter(dataset['residuals_theta100_tsne']['coords'][:,0], dataset['residuals_theta100_tsne']['coords'][:,1], s=1,c=sqrted_counts, edgecolor='none',cmap=cmap, rasterized=True)
# tsne_ax.axis('off')
sns.despine()
plt.tight_layout()
# plt.savefig('figures/FigS3.pdf', dpi=300, format=None)
In [85]:
Copied!
def feature_select(adata,number=3000):
print(adata.shape)
adata.raw = adata.copy()
df = adata.var
feature_subset_sr = df.index.isin(df.sort_values('sr_var', ascending=False).index[:number])
feature_subset_var = df.index.isin(df.sort_values('var', ascending=False).index[:number])
feature_subset_dispersion = df.index.isin(df.sort_values('dispersion_norm', ascending=True).index[:number])
# 随机选择 number 个特征的索引
random_indices = np.random.choice(df.index, size=number, replace=False)
# 获取随机选择的特征子集
feature_subset_random = df.index.isin(random_indices)
df['feature_select_sr'] = feature_subset_sr
df['feature_select_var'] = feature_subset_var
df['feature_select_dispersion'] = feature_subset_dispersion
df['feature_select_random'] = feature_subset_random
adata.var = df
# data_filter = adata[:,adata.var['feature_select']].copy()
return adata
def feature_select(adata,number=3000):
print(adata.shape)
adata.raw = adata.copy()
df = adata.var
feature_subset_sr = df.index.isin(df.sort_values('sr_var', ascending=False).index[:number])
feature_subset_var = df.index.isin(df.sort_values('var', ascending=False).index[:number])
feature_subset_dispersion = df.index.isin(df.sort_values('dispersion_norm', ascending=True).index[:number])
# 随机选择 number 个特征的索引
random_indices = np.random.choice(df.index, size=number, replace=False)
# 获取随机选择的特征子集
feature_subset_random = df.index.isin(random_indices)
df['feature_select_sr'] = feature_subset_sr
df['feature_select_var'] = feature_subset_var
df['feature_select_dispersion'] = feature_subset_dispersion
df['feature_select_random'] = feature_subset_random
adata.var = df
# data_filter = adata[:,adata.var['feature_select']].copy()
return adata
In [86]:
Copied!
feature_select(scm_1k,number=5000)
feature_select(scm_1k,number=5000)
(46, 67300)
Out[86]:
AnnData object with n_obs × n_vars = 46 × 67300 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Series', 'condition', 'sample', 'Run', 'Organism', 'Cell_ID', 'Source_name', 'Cell_type', 'total_cell_type', 'subgroup', 'Development_stage', 'Strain', 'Tissue', 'Cell_line', 'Genotype', 'Treatment', 'Age', 'Sex', 'Race', 'other', 'Disease', 'CenterName', 'avgLength', 'LibraryStrategy', 'LibraryLayout', 'Platform', 'Model', 'Total_bases.Gb', 'Total_reads', 'Reads_After_trim', 'Uniq_mapping.reads', 'Mapping_ratio', 'Conversion_ratio', 'TotalCpGs(1X)', 'CpGsTotalUnique(1X)', 'CpGsMeanCov(1X)', 'CpGsMethRatio(1X)', 'CpGsMethReadRatio(1X)', 'TotalCpGs(3X)', 'CpGsTotalUnique(3X)', 'CpGsMeanCov(3X)', 'CpGsMethRatio(3X)', 'CpGsMethReadRatio(3X)', 'TotalCpGs(5X)', 'CpGsTotalUnique(5X)', 'CpGsMeanCov(5X)', 'CpGsMethRatio(5X)', 'CpGsMethReadRatio(5X)', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random' uns: 'workdir', 'label_color', 'X_pca_var_ratios', 'Cell_type_colors', 'Treatment_colors', 'neighbors', 'umap' obsm: 'X_pca', 'X_tsne', 'X_umap' layers: 'relative' obsp: 'distances', 'connectivities'
In [28]:
Copied!
import scipy.sparse as sparse
def dispersion(xdata):
if sparse.issparse(xdata.X):
n_features = xdata.X.shape[1] # features, 列
n_samples = xdata.X.shape[0] # samples, 行
# 计算每个样本中覆盖的特征数
coverage_cells = [n_features - np.isnan(line.toarray()).sum() for line in xdata.X]
# 计算每个样本的平均甲基化
mean_cell_methylation = [np.nansum(line.toarray()) / n_features for line in xdata.X]
# 将结果存储到 adata.obs
xdata.obs['coverage_cells'] = coverage_cells
xdata.obs['mean_cell_methylation'] = mean_cell_methylation
# 计算每个特征在多少个样本中有覆盖
coverage_feature = [ n_samples- np.isnan(line.toarray()).sum() for line in xdata.X.T]
# 计算每个特征的平均甲基化
mean_feature_methylation = [np.nansum(line.toarray()) / n_samples for line in xdata.X.T]
# 将结果存储到 adata.var
xdata.var['coverage_feature'] = coverage_feature
xdata.var['mean_feature_methylation'] = mean_feature_methylation
# 计算平方均值 (x * x).mean()
mean_sq = (xdata.X.power(2).mean(axis=0)).A1
# 计算均值
mean = xdata.X.mean(axis=0).A1
# 计算方差 (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
# 将均值中小于阈值的值替换为 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# 计算 dispersion
dispersion = var / mean
# 将结果存储到 adata.var
xdata.var['dispersion'] = dispersion
xdata.var['dispersion_log'] = np.log(dispersion)
# Calculate mean and coverage
mean_mean = xdata.var['mean_feature_methylation'].mean()
mean_coverage = xdata.var['coverage_feature'].mean()
xdata.var['NormalizedDispersion'] = xdata.var['dispersion'] / (mean_mean * mean_coverage)
else:
length1 = xdata.X.shape[0] # 55017 features,列
length2 = xdata.X.shape[1] # 3379 samples 行
xdata.obs['coverage_feature'] = [length1 - np.isnan(line).sum() for line in xdata.X] #每一个样本中覆盖的特征数
xdata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in xdata.X]
xdata.var['coverage_cells'] = [length2 - np.isnan(line).sum() for line in xdata.X.T] #每一个特征在多少个样本中有覆盖
xdata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in xdata.X.T]
# 计算平方均值 (x * x).mean()
mean_sq = (xdata.X.power(2).mean(axis=0)).A1
# 计算均值
mean = xdata.X.mean(axis=0).A1
# 计算方差 (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
# 将均值中小于阈值的值替换为 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# 计算 dispersion
dispersion = var / mean
# 将结果存储到 adata.var
xdata.var['dispersion'] = dispersion
xdata.var['dispersion_log'] = np.log(dispersion)
# Calculate mean and coverage
mean_mean = xdata.var['mean'].mean()
mean_coverage = xdata.var['covered_cell'].mean()
xdata.var['NormalizedDispersion'] = xdata.var['dispersion'] / (mean_mean * mean_coverage)
import scipy.sparse as sparse
def dispersion(xdata):
if sparse.issparse(xdata.X):
n_features = xdata.X.shape[1] # features, 列
n_samples = xdata.X.shape[0] # samples, 行
# 计算每个样本中覆盖的特征数
coverage_cells = [n_features - np.isnan(line.toarray()).sum() for line in xdata.X]
# 计算每个样本的平均甲基化
mean_cell_methylation = [np.nansum(line.toarray()) / n_features for line in xdata.X]
# 将结果存储到 adata.obs
xdata.obs['coverage_cells'] = coverage_cells
xdata.obs['mean_cell_methylation'] = mean_cell_methylation
# 计算每个特征在多少个样本中有覆盖
coverage_feature = [ n_samples- np.isnan(line.toarray()).sum() for line in xdata.X.T]
# 计算每个特征的平均甲基化
mean_feature_methylation = [np.nansum(line.toarray()) / n_samples for line in xdata.X.T]
# 将结果存储到 adata.var
xdata.var['coverage_feature'] = coverage_feature
xdata.var['mean_feature_methylation'] = mean_feature_methylation
# 计算平方均值 (x * x).mean()
mean_sq = (xdata.X.power(2).mean(axis=0)).A1
# 计算均值
mean = xdata.X.mean(axis=0).A1
# 计算方差 (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
# 将均值中小于阈值的值替换为 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# 计算 dispersion
dispersion = var / mean
# 将结果存储到 adata.var
xdata.var['dispersion'] = dispersion
xdata.var['dispersion_log'] = np.log(dispersion)
# Calculate mean and coverage
mean_mean = xdata.var['mean_feature_methylation'].mean()
mean_coverage = xdata.var['coverage_feature'].mean()
xdata.var['NormalizedDispersion'] = xdata.var['dispersion'] / (mean_mean * mean_coverage)
else:
length1 = xdata.X.shape[0] # 55017 features,列
length2 = xdata.X.shape[1] # 3379 samples 行
xdata.obs['coverage_feature'] = [length1 - np.isnan(line).sum() for line in xdata.X] #每一个样本中覆盖的特征数
xdata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in xdata.X]
xdata.var['coverage_cells'] = [length2 - np.isnan(line).sum() for line in xdata.X.T] #每一个特征在多少个样本中有覆盖
xdata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in xdata.X.T]
# 计算平方均值 (x * x).mean()
mean_sq = (xdata.X.power(2).mean(axis=0)).A1
# 计算均值
mean = xdata.X.mean(axis=0).A1
# 计算方差 (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
# 将均值中小于阈值的值替换为 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# 计算 dispersion
dispersion = var / mean
# 将结果存储到 adata.var
xdata.var['dispersion'] = dispersion
xdata.var['dispersion_log'] = np.log(dispersion)
# Calculate mean and coverage
mean_mean = xdata.var['mean'].mean()
mean_coverage = xdata.var['covered_cell'].mean()
xdata.var['NormalizedDispersion'] = xdata.var['dispersion'] / (mean_mean * mean_coverage)
In [29]:
Copied!
dispersion(scm_1k)
dispersion(scm_1k)
In [30]:
Copied!
scm_1k.var
scm_1k.var
Out[30]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | dispersion | dispersion_log | NormalizedDispersion | mean_bin | cov_bin | dispersion_norm | coverage_feature | mean_feature_methylation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||||
chr1:3670001-3671000 | chr1 | 3670001 | 3671000 | 42 | 0.024906 | 0.007794 | 0.048214 | 0.913043 | NaN | NaN | NaN | 0 | 4 | NaN | 42 | 0.044022 |
chr1:3671001-3672000 | chr1 | 3671001 | 3672000 | 44 | 0.000687 | 0.000623 | 0.012477 | 0.956522 | NaN | NaN | NaN | 0 | 4 | NaN | 44 | 0.011935 |
chr1:3852001-3853000 | chr1 | 3852001 | 3853000 | 40 | 0.106419 | 0.139342 | 0.697150 | 0.869565 | NaN | NaN | NaN | 13 | 4 | NaN | 40 | 0.606217 |
chr1:4071001-4072000 | chr1 | 4071001 | 4072000 | 40 | 0.168057 | 0.079962 | 0.506600 | 0.869565 | NaN | NaN | NaN | 10 | 4 | NaN | 40 | 0.440522 |
chr1:4259001-4260000 | chr1 | 4259001 | 4260000 | 40 | 0.072315 | 0.098618 | 0.810025 | 0.869565 | NaN | NaN | NaN | 16 | 4 | NaN | 40 | 0.704370 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:90809001-90810000 | chrY | 90809001 | 90810000 | 44 | 0.107024 | 0.081044 | 0.393182 | 0.956522 | NaN | NaN | NaN | 7 | 4 | NaN | 44 | 0.376087 |
chrY:90810001-90811000 | chrY | 90810001 | 90811000 | 45 | 0.075000 | 0.070901 | 0.398844 | 0.978261 | NaN | NaN | NaN | 7 | 4 | NaN | 45 | 0.390174 |
chrY:90811001-90812000 | chrY | 90811001 | 90812000 | 46 | 0.098502 | 0.076623 | 0.571891 | 1.000000 | 0.176067 | -1.736889 | 0.011269 | 11 | 4 | -1.769526 | 46 | 0.571891 |
chrY:90812001-90813000 | chrY | 90812001 | 90813000 | 45 | 0.069395 | 0.072013 | 0.847378 | 0.978261 | NaN | NaN | NaN | 16 | 4 | NaN | 45 | 0.828957 |
chrY:90825001-90826000 | chrY | 90825001 | 90826000 | 40 | 0.073739 | 0.080637 | 0.578650 | 0.869565 | NaN | NaN | NaN | 11 | 4 | NaN | 40 | 0.503174 |
67300 rows × 16 columns
In [13]:
Copied!
scm_1k.var['dispersion'].describe()
scm_1k.var['dispersion'].describe()
Out[13]:
count 179.000000 mean 0.198436 std 0.132269 min 0.000000 25% 0.083394 50% 0.207215 75% 0.270974 max 0.726533 Name: dispersion, dtype: float64
In [14]:
Copied!
df = scm_1k.var
df = scm_1k.var
In [45]:
Copied!
def calculate_dispersion_norm(df):
df['dispersion'] = df['mean'] / df['var']
df['log_dispersion'] = np.log(df['dispersion'])
mean_binsize = 0.01
cov_binsize = 10
bin_min_features=5
# instead of n_bins, use bin_size, because cov and mc are in different scale
df['mean_bin'] = (df['mean'] / mean_binsize).astype(int)
df['cov_bin'] = (df['covered_cell'] / cov_binsize).astype(int)
# save bin_count df, gather bins with more than bin_min_features features
bin_count = df.groupby(['mean_bin', 'cov_bin']) \
.apply(lambda i: i.shape[0]) \
.reset_index() \
.sort_values(0, ascending=False)
bin_count.head()
bin_more_than = bin_count[bin_count[0] > bin_min_features]
if bin_more_than.shape[0] == 0:
raise ValueError(f'No bin have more than {bin_min_features} features, use larger bin size.')
# this usually don't cause much difference (a few hundred features), but the scatter plot will look more nature
index_map = {}
for _, (mean_id, cov_id, count) in bin_count.iterrows():
if count > 1:
index_map[(mean_id, cov_id)] = (mean_id, cov_id)
manhattan_dist = (bin_more_than['mean_bin'] - mean_id).abs() + (bin_more_than['cov_bin'] - cov_id).abs()
closest_more_than = manhattan_dist.sort_values().index[0]
closest = bin_more_than.loc[closest_more_than]
index_map[(mean_id, cov_id)] = tuple(closest.tolist()[:2])
# apply index_map to original df
raw_bin = df[['mean_bin', 'cov_bin']].set_index(['mean_bin', 'cov_bin'])
raw_bin['use_mean'] = pd.Series(index_map).apply(lambda i: i[0])
raw_bin['use_cov'] = pd.Series(index_map).apply(lambda i: i[1])
df['mean_bin'] = raw_bin['use_mean'].values
df['cov_bin'] = raw_bin['use_cov'].values
# calculate bin mean and std, now disp_std_bin shouldn't have NAs
disp_grouped = df.groupby(['mean_bin', 'cov_bin'])['dispersion']
disp_mean_bin = disp_grouped.mean()
disp_std_bin = disp_grouped.std(ddof=1)
# actually do the normalization
_mean_norm = disp_mean_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
_std_norm = disp_std_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
df['dispersion_norm'] = (df['dispersion'].values # use values here as index differs
- _mean_norm.values) / _std_norm.values
dispersion_norm = df['dispersion_norm'].values.astype('float32')
return df
def calculate_dispersion_norm(df):
df['dispersion'] = df['mean'] / df['var']
df['log_dispersion'] = np.log(df['dispersion'])
mean_binsize = 0.01
cov_binsize = 10
bin_min_features=5
# instead of n_bins, use bin_size, because cov and mc are in different scale
df['mean_bin'] = (df['mean'] / mean_binsize).astype(int)
df['cov_bin'] = (df['covered_cell'] / cov_binsize).astype(int)
# save bin_count df, gather bins with more than bin_min_features features
bin_count = df.groupby(['mean_bin', 'cov_bin']) \
.apply(lambda i: i.shape[0]) \
.reset_index() \
.sort_values(0, ascending=False)
bin_count.head()
bin_more_than = bin_count[bin_count[0] > bin_min_features]
if bin_more_than.shape[0] == 0:
raise ValueError(f'No bin have more than {bin_min_features} features, use larger bin size.')
# this usually don't cause much difference (a few hundred features), but the scatter plot will look more nature
index_map = {}
for _, (mean_id, cov_id, count) in bin_count.iterrows():
if count > 1:
index_map[(mean_id, cov_id)] = (mean_id, cov_id)
manhattan_dist = (bin_more_than['mean_bin'] - mean_id).abs() + (bin_more_than['cov_bin'] - cov_id).abs()
closest_more_than = manhattan_dist.sort_values().index[0]
closest = bin_more_than.loc[closest_more_than]
index_map[(mean_id, cov_id)] = tuple(closest.tolist()[:2])
# apply index_map to original df
raw_bin = df[['mean_bin', 'cov_bin']].set_index(['mean_bin', 'cov_bin'])
raw_bin['use_mean'] = pd.Series(index_map).apply(lambda i: i[0])
raw_bin['use_cov'] = pd.Series(index_map).apply(lambda i: i[1])
df['mean_bin'] = raw_bin['use_mean'].values
df['cov_bin'] = raw_bin['use_cov'].values
# calculate bin mean and std, now disp_std_bin shouldn't have NAs
disp_grouped = df.groupby(['mean_bin', 'cov_bin'])['dispersion']
disp_mean_bin = disp_grouped.mean()
disp_std_bin = disp_grouped.std(ddof=1)
# actually do the normalization
_mean_norm = disp_mean_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
_std_norm = disp_std_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
df['dispersion_norm'] = (df['dispersion'].values # use values here as index differs
- _mean_norm.values) / _std_norm.values
dispersion_norm = df['dispersion_norm'].values.astype('float32')
return df
In [46]:
Copied!
df = calculate_dispersion_norm(df)
df = calculate_dispersion_norm(df)
/tmp/ipykernel_31166/896870716.py:13: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. bin_count = df.groupby(['mean_bin', 'cov_bin']) \
In [47]:
Copied!
df
df
Out[47]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | dispersion | log_dispersion | mean_bin | cov_bin | dispersion_norm | feature_select_sr | feature_select_var | feature_select_dispersion | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||||
chr1:3670001-3671000 | chr1 | 3670001 | 3671000 | 42 | 0.024906 | 0.007794 | 0.048214 | 0.913043 | 1.935845 | 0.660544 | 4 | 4 | -0.998066 | False | False | False |
chr1:3671001-3672000 | chr1 | 3671001 | 3672000 | 44 | 0.000687 | 0.000623 | 0.012477 | 0.956522 | 18.174609 | 2.900026 | 1 | 4 | 0.013054 | False | False | False |
chr1:3852001-3853000 | chr1 | 3852001 | 3853000 | 40 | 0.106419 | 0.139342 | 0.697150 | 0.869565 | 6.551002 | 1.879618 | 69 | 4 | 0.358447 | False | False | False |
chr1:4071001-4072000 | chr1 | 4071001 | 4072000 | 40 | 0.168057 | 0.079962 | 0.506600 | 0.869565 | 3.014449 | 1.103417 | 50 | 4 | -0.298661 | False | False | False |
chr1:4259001-4260000 | chr1 | 4259001 | 4260000 | 40 | 0.072315 | 0.098618 | 0.810025 | 0.869565 | 11.201269 | 2.416027 | 81 | 4 | 0.071335 | False | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:90809001-90810000 | chrY | 90809001 | 90810000 | 44 | 0.107024 | 0.081044 | 0.393182 | 0.956522 | 3.673783 | 1.301222 | 39 | 4 | 1.331865 | False | False | False |
chrY:90810001-90811000 | chrY | 90810001 | 90811000 | 45 | 0.075000 | 0.070901 | 0.398844 | 0.978261 | 5.317922 | 1.671083 | 39 | 4 | 4.114590 | False | False | False |
chrY:90811001-90812000 | chrY | 90811001 | 90812000 | 46 | 0.098502 | 0.076623 | 0.571891 | 1.000000 | 5.805861 | 1.758868 | 57 | 4 | 2.436943 | False | False | False |
chrY:90812001-90813000 | chrY | 90812001 | 90813000 | 45 | 0.069395 | 0.072013 | 0.847378 | 0.978261 | 12.210983 | 2.502336 | 84 | 4 | -0.541338 | False | False | False |
chrY:90825001-90826000 | chrY | 90825001 | 90826000 | 40 | 0.073739 | 0.080637 | 0.578650 | 0.869565 | 7.847285 | 2.060168 | 57 | 4 | 4.947618 | False | False | False |
67300 rows × 16 columns
In [27]:
Copied!
sparse.issparse(scm_1k.X)
sparse.issparse(scm_1k.X)
Out[27]:
True
In [31]:
Copied!
def obs(xdata):
if sparse.issparse(xdata.X):
n_features = xdata.X.shape[1] # features, 列
n_samples = xdata.X.shape[0] # samples, 行
# 计算每个样本中覆盖的特征数
coverage_cells = [n_features - np.isnan(line.toarray()).sum() for line in xdata.X]
# 计算每个样本的平均甲基化
mean_cell_methylation = [np.nansum(line.toarray()) / n_features for line in xdata.X]
# 将结果存储到 adata.obs
xdata.obs['coverage_cells'] = coverage_cells
xdata.obs['mean_cell_methylation'] = mean_cell_methylation
# 计算每个特征在多少个样本中有覆盖
coverage_feature = [ n_samples- np.isnan(line.toarray()).sum() for line in xdata.X.T]
# 计算每个特征的平均甲基化
mean_feature_methylation = [np.nansum(line.toarray()) / n_samples for line in xdata.X.T]
# 将结果存储到 adata.var
xdata.var['coverage_feature'] = coverage_feature
xdata.var['mean_feature_methylation'] = mean_feature_methylation
# 计算平方均值 (x * x).mean()
mean_sq = (xdata.X.power(2).mean(axis=0)).A1
# 计算均值
mean = xdata.X.mean(axis=0).A1
# 计算方差 (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
# 将均值中小于阈值的值替换为 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# 计算 dispersion
dispersion = var / mean
# 将结果存储到 adata.var
xdata.var['dispersion'] = dispersion
xdata.var['dispersion_log'] = np.log(dispersion)
# Calculate mean and coverage
mean_mean = xdata.var['mean_feature_methylation'].mean()
mean_coverage = xdata.var['coverage_feature'].mean()
xdata.var['NormalizedDispersion'] = xdata.var['dispersion'] / (mean_mean * mean_coverage)
else:
length1 = len(xdata.X[0,:]) # 55017 features,列
length2 = len(xdata.X[:,0]) # 3379 samples 行
xdata.obs['coverage_cells'] = [length1 - np.isnan(line).sum() for line in xdata.X] #每一个样本中覆盖的特征数
xdata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in xdata.X]
xdata.var['coverage_feature'] = [length2 - np.isnan(line).sum() for line in xdata.X.T] #每一个特征在多少个样本中有覆盖
xdata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in xdata.X.T]
# 计算平方均值 (x * x).mean()
mean_sq = (xdata.X.power(2).mean(axis=0)).A1
# 计算均值
mean = xdata.X.mean(axis=0).A1
# 计算方差 (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
# 将均值中小于阈值的值替换为 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# 计算 dispersion
dispersion = var / mean
# 将结果存储到 adata.var
xdata.var['dispersion'] = dispersion
xdata.var['dispersion_log'] = np.log(dispersion)
# Calculate mean and coverage
mean_mean = xdata.var['mean_feature_methylation'].mean()
mean_coverage = xdata.var['coverage_feature'].mean()
xdata.var['NormalizedDispersion'] = xdata.var['dispersion'] / (mean_mean * mean_coverage)
def obs(xdata):
if sparse.issparse(xdata.X):
n_features = xdata.X.shape[1] # features, 列
n_samples = xdata.X.shape[0] # samples, 行
# 计算每个样本中覆盖的特征数
coverage_cells = [n_features - np.isnan(line.toarray()).sum() for line in xdata.X]
# 计算每个样本的平均甲基化
mean_cell_methylation = [np.nansum(line.toarray()) / n_features for line in xdata.X]
# 将结果存储到 adata.obs
xdata.obs['coverage_cells'] = coverage_cells
xdata.obs['mean_cell_methylation'] = mean_cell_methylation
# 计算每个特征在多少个样本中有覆盖
coverage_feature = [ n_samples- np.isnan(line.toarray()).sum() for line in xdata.X.T]
# 计算每个特征的平均甲基化
mean_feature_methylation = [np.nansum(line.toarray()) / n_samples for line in xdata.X.T]
# 将结果存储到 adata.var
xdata.var['coverage_feature'] = coverage_feature
xdata.var['mean_feature_methylation'] = mean_feature_methylation
# 计算平方均值 (x * x).mean()
mean_sq = (xdata.X.power(2).mean(axis=0)).A1
# 计算均值
mean = xdata.X.mean(axis=0).A1
# 计算方差 (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
# 将均值中小于阈值的值替换为 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# 计算 dispersion
dispersion = var / mean
# 将结果存储到 adata.var
xdata.var['dispersion'] = dispersion
xdata.var['dispersion_log'] = np.log(dispersion)
# Calculate mean and coverage
mean_mean = xdata.var['mean_feature_methylation'].mean()
mean_coverage = xdata.var['coverage_feature'].mean()
xdata.var['NormalizedDispersion'] = xdata.var['dispersion'] / (mean_mean * mean_coverage)
else:
length1 = len(xdata.X[0,:]) # 55017 features,列
length2 = len(xdata.X[:,0]) # 3379 samples 行
xdata.obs['coverage_cells'] = [length1 - np.isnan(line).sum() for line in xdata.X] #每一个样本中覆盖的特征数
xdata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in xdata.X]
xdata.var['coverage_feature'] = [length2 - np.isnan(line).sum() for line in xdata.X.T] #每一个特征在多少个样本中有覆盖
xdata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in xdata.X.T]
# 计算平方均值 (x * x).mean()
mean_sq = (xdata.X.power(2).mean(axis=0)).A1
# 计算均值
mean = xdata.X.mean(axis=0).A1
# 计算方差 (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
# 将均值中小于阈值的值替换为 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# 计算 dispersion
dispersion = var / mean
# 将结果存储到 adata.var
xdata.var['dispersion'] = dispersion
xdata.var['dispersion_log'] = np.log(dispersion)
# Calculate mean and coverage
mean_mean = xdata.var['mean_feature_methylation'].mean()
mean_coverage = xdata.var['coverage_feature'].mean()
xdata.var['NormalizedDispersion'] = xdata.var['dispersion'] / (mean_mean * mean_coverage)
In [32]:
Copied!
obs(scm_1k)
obs(scm_1k)
In [37]:
Copied!
scm_1k.var
scm_1k.var
Out[37]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | dispersion | dispersion_log | NormalizedDispersion | mean_bin | cov_bin | dispersion_norm | coverage_feature | mean_feature_methylation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||||
chr1:3670001-3671000 | chr1 | 3670001 | 3671000 | 42 | 0.024906 | 0.007794 | 0.048214 | 0.913043 | 1.935845 | 0.660544 | 0.110874 | 0 | 4 | NaN | 42 | 0.044022 |
chr1:3671001-3672000 | chr1 | 3671001 | 3672000 | 44 | 0.000687 | 0.000623 | 0.012477 | 0.956522 | 18.174609 | 2.900026 | 1.040934 | 0 | 4 | NaN | 44 | 0.011935 |
chr1:3852001-3853000 | chr1 | 3852001 | 3853000 | 40 | 0.106419 | 0.139342 | 0.697150 | 0.869565 | 6.551002 | 1.879618 | 0.375203 | 13 | 4 | NaN | 40 | 0.606217 |
chr1:4071001-4072000 | chr1 | 4071001 | 4072000 | 40 | 0.168057 | 0.079962 | 0.506600 | 0.869565 | 3.014449 | 1.103417 | 0.172650 | 10 | 4 | NaN | 40 | 0.440522 |
chr1:4259001-4260000 | chr1 | 4259001 | 4260000 | 40 | 0.072315 | 0.098618 | 0.810025 | 0.869565 | 11.201269 | 2.416027 | 0.641543 | 16 | 4 | NaN | 40 | 0.704370 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:90809001-90810000 | chrY | 90809001 | 90810000 | 44 | 0.107024 | 0.081044 | 0.393182 | 0.956522 | 3.673783 | 1.301222 | 0.210413 | 7 | 4 | NaN | 44 | 0.376087 |
chrY:90810001-90811000 | chrY | 90810001 | 90811000 | 45 | 0.075000 | 0.070901 | 0.398844 | 0.978261 | 5.317922 | 1.671083 | 0.304579 | 7 | 4 | NaN | 45 | 0.390174 |
chrY:90811001-90812000 | chrY | 90811001 | 90812000 | 46 | 0.098502 | 0.076623 | 0.571891 | 1.000000 | 5.805861 | 1.758868 | 0.332526 | 11 | 4 | -1.769526 | 46 | 0.571891 |
chrY:90812001-90813000 | chrY | 90812001 | 90813000 | 45 | 0.069395 | 0.072013 | 0.847378 | 0.978261 | 12.210983 | 2.502336 | 0.699373 | 16 | 4 | NaN | 45 | 0.828957 |
chrY:90825001-90826000 | chrY | 90825001 | 90826000 | 40 | 0.073739 | 0.080637 | 0.578650 | 0.869565 | 7.847285 | 2.060168 | 0.449446 | 11 | 4 | NaN | 40 | 0.503174 |
67300 rows × 16 columns
In [34]:
Copied!
scm_1k.var['dispersion'] = scm_1k.var['mean'] / scm_1k.var['var']
scm_1k.var['dispersion'] = scm_1k.var['mean'] / scm_1k.var['var']
In [36]:
Copied!
scm_1k.var['dispersion_log'] = np.log(scm_1k.var['dispersion'])
# Calculate mean and coverage
mean_mean = scm_1k.var['mean'].mean()
mean_coverage = scm_1k.var['covered_cell'].mean()
scm_1k.var['NormalizedDispersion'] = scm_1k.var['dispersion'] / (mean_mean * mean_coverage)
scm_1k.var['dispersion_log'] = np.log(scm_1k.var['dispersion'])
# Calculate mean and coverage
mean_mean = scm_1k.var['mean'].mean()
mean_coverage = scm_1k.var['covered_cell'].mean()
scm_1k.var['NormalizedDispersion'] = scm_1k.var['dispersion'] / (mean_mean * mean_coverage)
In [62]:
Copied!
gene_means = scm_1k.var['mean']
sqrt_var = scm_1k.var['dispersion_log']
#sqrt_variable_genes_idx = scm_1k.var['feature_select_var']
sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = scm_1k.var['NormalizedDispersion']
#residuals_variable_genes_idx = scm_1k.var['feature_select']
residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 4, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'var')
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
# ax1.scatter(gene_means[residuals_variable_genes_idx & example_genes_idx],sqrt_var[residuals_variable_genes_idx & example_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.scatter(gene_means[~residuals_variable_genes_idx & example_genes_idx],sqrt_var[~residuals_variable_genes_idx & example_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.hlines(dataset['sqrt_lazy_threshold'],xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=sqrt_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_a,lines=example_gene_lines_a,ax=ax1)
# add_largedot_legend(ax1,'lower right',dict(fontsize=15))
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'sr_var')
ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
# ax2.scatter(gene_means[example_genes_idx & sqrt_variable_genes_idx],residual_var[example_genes_idx & sqrt_variable_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.scatter(gene_means[example_genes_idx & ~sqrt_variable_genes_idx],residual_var[example_genes_idx & ~sqrt_variable_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.hlines(dataset['residuals_theta100_threshold'],xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=residual_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_b,lines=example_gene_lines_b,ax=ax2)
# add_largedot_legend(ax2,'lower right',dict(fontsize=15))
# supaxis.text(*titleletter_loc,'c',transform=supaxis.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
# supaxis.axis('off')
# for (tsne_ax,example_gene) in zip(tsne_axes,example_genes):
# gene_idx = dataset['genes'] == example_gene
# sqrted_counts = np.squeeze(dataset['sqrt_lazy'][:,gene_idx])
# tsne_ax.set_title(example_gene.lower().capitalize())
# tsne_ax.scatter(dataset['residuals_theta100_tsne']['coords'][:,0], dataset['residuals_theta100_tsne']['coords'][:,1], s=1,c=sqrted_counts, edgecolor='none',cmap=cmap, rasterized=True)
# tsne_ax.axis('off')
sns.despine()
plt.tight_layout()
# plt.savefig('figures/FigS3.pdf', dpi=300, format=None)
gene_means = scm_1k.var['mean']
sqrt_var = scm_1k.var['dispersion_log']
#sqrt_variable_genes_idx = scm_1k.var['feature_select_var']
sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = scm_1k.var['NormalizedDispersion']
#residuals_variable_genes_idx = scm_1k.var['feature_select']
residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 4, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'var')
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
# ax1.scatter(gene_means[residuals_variable_genes_idx & example_genes_idx],sqrt_var[residuals_variable_genes_idx & example_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.scatter(gene_means[~residuals_variable_genes_idx & example_genes_idx],sqrt_var[~residuals_variable_genes_idx & example_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.hlines(dataset['sqrt_lazy_threshold'],xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=sqrt_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_a,lines=example_gene_lines_a,ax=ax1)
# add_largedot_legend(ax1,'lower right',dict(fontsize=15))
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'sr_var')
ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
# ax2.scatter(gene_means[example_genes_idx & sqrt_variable_genes_idx],residual_var[example_genes_idx & sqrt_variable_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.scatter(gene_means[example_genes_idx & ~sqrt_variable_genes_idx],residual_var[example_genes_idx & ~sqrt_variable_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.hlines(dataset['residuals_theta100_threshold'],xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=residual_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_b,lines=example_gene_lines_b,ax=ax2)
# add_largedot_legend(ax2,'lower right',dict(fontsize=15))
# supaxis.text(*titleletter_loc,'c',transform=supaxis.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
# supaxis.axis('off')
# for (tsne_ax,example_gene) in zip(tsne_axes,example_genes):
# gene_idx = dataset['genes'] == example_gene
# sqrted_counts = np.squeeze(dataset['sqrt_lazy'][:,gene_idx])
# tsne_ax.set_title(example_gene.lower().capitalize())
# tsne_ax.scatter(dataset['residuals_theta100_tsne']['coords'][:,0], dataset['residuals_theta100_tsne']['coords'][:,1], s=1,c=sqrted_counts, edgecolor='none',cmap=cmap, rasterized=True)
# tsne_ax.axis('off')
sns.despine()
plt.tight_layout()
# plt.savefig('figures/FigS3.pdf', dpi=300, format=None)
In [21]:
Copied!
gene_means = scm_1k.var['mean']
sqrt_var = scm_1k.var['log_dispersion']
vars_variable_genes_idx = scm_1k.var['feature_select_var']
#sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = scm_1k.var['dispersion_norm']
residuals_variable_genes_idx = scm_1k.var['feature_select_sr']
normdispersion_variable_genes_idx = scm_1k.var['feature_select_dispersion']
#residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(3, 4, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax3 = fig.add_subplot(gs[:, 2:4])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'var')
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
#ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'sr_var')
ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
#ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
sns.despine()
plt.tight_layout()
gene_means = scm_1k.var['mean']
sqrt_var = scm_1k.var['log_dispersion']
vars_variable_genes_idx = scm_1k.var['feature_select_var']
#sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = scm_1k.var['dispersion_norm']
residuals_variable_genes_idx = scm_1k.var['feature_select_sr']
normdispersion_variable_genes_idx = scm_1k.var['feature_select_dispersion']
#residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(3, 4, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax3 = fig.add_subplot(gs[:, 2:4])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'var')
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
#ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'sr_var')
ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
#ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
sns.despine()
plt.tight_layout()
In [88]:
Copied!
gene_means = scm_1k.var['mean']
sqrt_var = scm_1k.var['var']
vars_variable_genes_idx = scm_1k.var['feature_select_var']
#sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = scm_1k.var['dispersion_norm']
residuals_variable_genes_idx = scm_1k.var['feature_select_sr']
normdispersion_variable_genes_idx = scm_1k.var['feature_select_dispersion']
random_genes_idx = scm_1k.var['feature_select_random']
#residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 8, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax3 = fig.add_subplot(gs[:, 4:6])
ax4 = fig.add_subplot(gs[:, 6:8])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax1.set_title(r'Variance')
ax1.set_ylabel('Variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax1.scatter(gene_means[vars_variable_genes_idx],sqrt_var[vars_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'Residuals Variance')
ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax3.set_title(r'NormDispersion')
ax3.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax3.scatter(gene_means[normdispersion_variable_genes_idx],sqrt_var[normdispersion_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax4.text(*titleletter_loc,'c',transform=ax4.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax4.set_title(r'Random')
ax4.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax4.scatter(gene_means[random_genes_idx],sqrt_var[random_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
sns.despine()
plt.tight_layout()
gene_means = scm_1k.var['mean']
sqrt_var = scm_1k.var['var']
vars_variable_genes_idx = scm_1k.var['feature_select_var']
#sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = scm_1k.var['dispersion_norm']
residuals_variable_genes_idx = scm_1k.var['feature_select_sr']
normdispersion_variable_genes_idx = scm_1k.var['feature_select_dispersion']
random_genes_idx = scm_1k.var['feature_select_random']
#residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 8, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax3 = fig.add_subplot(gs[:, 4:6])
ax4 = fig.add_subplot(gs[:, 6:8])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax1.set_title(r'Variance')
ax1.set_ylabel('Variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax1.scatter(gene_means[vars_variable_genes_idx],sqrt_var[vars_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'Residuals Variance')
ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax3.set_title(r'NormDispersion')
ax3.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax3.scatter(gene_means[normdispersion_variable_genes_idx],sqrt_var[normdispersion_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax4.text(*titleletter_loc,'c',transform=ax4.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax4.set_title(r'Random')
ax4.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax4.scatter(gene_means[random_genes_idx],sqrt_var[random_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
sns.despine()
plt.tight_layout()
In [40]:
Copied!
scm_1k.var
scm_1k.var
Out[40]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | dispersion | log_dispersion | mean_bin | cov_bin | dispersion_norm | feature_select_sr | feature_select_var | feature_select_dispersion | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||||||||
chr1:3670001-3671000 | chr1 | 3670001 | 3671000 | 42 | 0.024906 | 0.007794 | 0.048214 | 0.913043 | 1.935845 | 0.660544 | 0 | 4 | -1.025961 | False | False | False |
chr1:3671001-3672000 | chr1 | 3671001 | 3672000 | 44 | 0.000687 | 0.000623 | 0.012477 | 0.956522 | 18.174609 | 2.900026 | 0 | 4 | -0.024248 | False | False | False |
chr1:3852001-3853000 | chr1 | 3852001 | 3853000 | 40 | 0.106419 | 0.139342 | 0.697150 | 0.869565 | 6.551002 | 1.879618 | 13 | 4 | 0.716161 | False | False | False |
chr1:4071001-4072000 | chr1 | 4071001 | 4072000 | 40 | 0.168057 | 0.079962 | 0.506600 | 0.869565 | 3.014449 | 1.103417 | 10 | 4 | -0.497702 | False | False | False |
chr1:4259001-4260000 | chr1 | 4259001 | 4260000 | 40 | 0.072315 | 0.098618 | 0.810025 | 0.869565 | 11.201269 | 2.416027 | 16 | 4 | -0.182424 | False | False | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:90809001-90810000 | chrY | 90809001 | 90810000 | 44 | 0.107024 | 0.081044 | 0.393182 | 0.956522 | 3.673783 | 1.301222 | 7 | 4 | 1.332989 | False | False | False |
chrY:90810001-90811000 | chrY | 90810001 | 90811000 | 45 | 0.075000 | 0.070901 | 0.398844 | 0.978261 | 5.317922 | 1.671083 | 7 | 4 | 4.020874 | False | False | True |
chrY:90811001-90812000 | chrY | 90811001 | 90812000 | 46 | 0.098502 | 0.076623 | 0.571891 | 1.000000 | 5.805861 | 1.758868 | 11 | 4 | 2.669185 | False | False | True |
chrY:90812001-90813000 | chrY | 90812001 | 90813000 | 45 | 0.069395 | 0.072013 | 0.847378 | 0.978261 | 12.210983 | 2.502336 | 16 | 4 | 0.180835 | False | False | False |
chrY:90825001-90826000 | chrY | 90825001 | 90826000 | 40 | 0.073739 | 0.080637 | 0.578650 | 0.869565 | 7.847285 | 2.060168 | 11 | 4 | 5.370180 | False | False | True |
67300 rows × 16 columns
In [60]:
Copied!
scm.pp.pca(scm_1k,n_pcs=10,impute='median',plot=False)
scm.pp.pca(scm_1k,n_pcs=10,impute='median',plot=False)
... using all features
Out[60]:
AnnData object with n_obs × n_vars = 46 × 67300 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Series', 'condition', 'sample', 'Run', 'Organism', 'Cell_ID', 'Source_name', 'Cell_type', 'total_cell_type', 'subgroup', 'Development_stage', 'Strain', 'Tissue', 'Cell_line', 'Genotype', 'Treatment', 'Age', 'Sex', 'Race', 'other', 'Disease', 'CenterName', 'avgLength', 'LibraryStrategy', 'LibraryLayout', 'Platform', 'Model', 'Total_bases.Gb', 'Total_reads', 'Reads_After_trim', 'Uniq_mapping.reads', 'Mapping_ratio', 'Conversion_ratio', 'TotalCpGs(1X)', 'CpGsTotalUnique(1X)', 'CpGsMeanCov(1X)', 'CpGsMethRatio(1X)', 'CpGsMethReadRatio(1X)', 'TotalCpGs(3X)', 'CpGsTotalUnique(3X)', 'CpGsMeanCov(3X)', 'CpGsMethRatio(3X)', 'CpGsMethReadRatio(3X)', 'TotalCpGs(5X)', 'CpGsTotalUnique(5X)', 'CpGsMeanCov(5X)', 'CpGsMethRatio(5X)', 'CpGsMethReadRatio(5X)', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion' uns: 'workdir', 'label_color', 'X_pca_var_ratios', 'Cell_type_colors', 'Treatment_colors' obsm: 'X_pca', 'X_tsne' layers: 'relative'
In [61]:
Copied!
scm.pp.tsne(scm_1k)
scm.pp.tsne(scm_1k)
In [62]:
Copied!
scm.pl.tsne(scm_1k,frameon='small',color=['Cell_type','Treatment','n_total'],size=100,wspace=0.3)
scm.pl.tsne(scm_1k,frameon='small',color=['Cell_type','Treatment','n_total'],size=100,wspace=0.3)
In [63]:
Copied!
scm.pl.pca(scm_1k,frameon='small',color=['Cell_type','Treatment','n_total'],size=100,wspace=0.3)
scm.pl.pca(scm_1k,frameon='small',color=['Cell_type','Treatment','n_total'],size=100,wspace=0.3)
In [64]:
Copied!
sc.pp.neighbors(scm_1k)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
sc.pp.neighbors(scm_1k)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
In [65]:
Copied!
pca(scm_1k,n_pcs=30,impute='median',use_hvm=True,key='feature_select_dispersion')
sc.pp.neighbors(scm_1k)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
pca(scm_1k,n_pcs=30,impute='median',use_hvm=True,key='feature_select_dispersion')
sc.pp.neighbors(scm_1k)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
... using top variable features
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) Cell In[65], line 1 ----> 1 scm.pp.pca(scm_1k,n_pcs=30,impute='median',use_hvm=True,key='feature_select_dispersion') 2 sc.pp.neighbors(scm_1k) 3 scm.pp.umap(scm_1k) File /asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scMethtools/preprocessing/dimension_reduction.py:100, in pca(adata, use_hvm, key, impute, n_pcs, layer, plot, first_pc, n_pc, fig_size, pad, w_pad, h_pad) 98 print('... using top variable features') 99 if 'feature_select' not in adata.var.keys(): --> 100 raise KeyError('Please run select_variable_genes() first!') 101 if layer is not None: 102 if layer in adata.layers: KeyError: 'Please run select_variable_genes() first!'
In [95]:
Copied!
pca(scm_1k,n_pcs=20,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion',plot=True)
pca(scm_1k,n_pcs=20,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion',plot=True)
... using top variable features based on feature_select_dispersion
Out[95]:
AnnData object with n_obs × n_vars = 46 × 67300 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Series', 'condition', 'sample', 'Run', 'Organism', 'Cell_ID', 'Source_name', 'Cell_type', 'total_cell_type', 'subgroup', 'Development_stage', 'Strain', 'Tissue', 'Cell_line', 'Genotype', 'Treatment', 'Age', 'Sex', 'Race', 'other', 'Disease', 'CenterName', 'avgLength', 'LibraryStrategy', 'LibraryLayout', 'Platform', 'Model', 'Total_bases.Gb', 'Total_reads', 'Reads_After_trim', 'Uniq_mapping.reads', 'Mapping_ratio', 'Conversion_ratio', 'TotalCpGs(1X)', 'CpGsTotalUnique(1X)', 'CpGsMeanCov(1X)', 'CpGsMethRatio(1X)', 'CpGsMethReadRatio(1X)', 'TotalCpGs(3X)', 'CpGsTotalUnique(3X)', 'CpGsMeanCov(3X)', 'CpGsMethRatio(3X)', 'CpGsMethReadRatio(3X)', 'TotalCpGs(5X)', 'CpGsTotalUnique(5X)', 'CpGsMeanCov(5X)', 'CpGsMethRatio(5X)', 'CpGsMethReadRatio(5X)', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random' uns: 'workdir', 'label_color', 'X_pca_var_ratios', 'Cell_type_colors', 'Treatment_colors', 'neighbors', 'umap' obsm: 'X_pca', 'X_tsne', 'X_umap' layers: 'relative' obsp: 'distances', 'connectivities'
In [99]:
Copied!
sc.pp.neighbors(scm_1k,n_neighbors=10)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
sc.pp.neighbors(scm_1k,n_neighbors=10)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
In [76]:
Copied!
scm.pl.pca(scm_1k,frameon='small',color=['Cell_type','Treatment','n_total'],size=100,wspace=0.3)
scm.pl.pca(scm_1k,frameon='small',color=['Cell_type','Treatment','n_total'],size=100,wspace=0.3)
In [79]:
Copied!
pca(scm_1k,n_pcs=5,impute='median',use_hvm=True,hvm_obs='feature_select_sr',plot=True)
pca(scm_1k,n_pcs=5,impute='median',use_hvm=True,hvm_obs='feature_select_sr',plot=True)
... using top variable features based on feature_select_sr
Out[79]:
AnnData object with n_obs × n_vars = 46 × 67300 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Series', 'condition', 'sample', 'Run', 'Organism', 'Cell_ID', 'Source_name', 'Cell_type', 'total_cell_type', 'subgroup', 'Development_stage', 'Strain', 'Tissue', 'Cell_line', 'Genotype', 'Treatment', 'Age', 'Sex', 'Race', 'other', 'Disease', 'CenterName', 'avgLength', 'LibraryStrategy', 'LibraryLayout', 'Platform', 'Model', 'Total_bases.Gb', 'Total_reads', 'Reads_After_trim', 'Uniq_mapping.reads', 'Mapping_ratio', 'Conversion_ratio', 'TotalCpGs(1X)', 'CpGsTotalUnique(1X)', 'CpGsMeanCov(1X)', 'CpGsMethRatio(1X)', 'CpGsMethReadRatio(1X)', 'TotalCpGs(3X)', 'CpGsTotalUnique(3X)', 'CpGsMeanCov(3X)', 'CpGsMethRatio(3X)', 'CpGsMethReadRatio(3X)', 'TotalCpGs(5X)', 'CpGsTotalUnique(5X)', 'CpGsMeanCov(5X)', 'CpGsMethRatio(5X)', 'CpGsMethReadRatio(5X)', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion' uns: 'workdir', 'label_color', 'X_pca_var_ratios', 'Cell_type_colors', 'Treatment_colors', 'neighbors', 'umap' obsm: 'X_pca', 'X_tsne', 'X_umap' layers: 'relative' obsp: 'distances', 'connectivities'
In [80]:
Copied!
sc.pp.neighbors(scm_1k)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
sc.pp.neighbors(scm_1k)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
In [93]:
Copied!
pca(scm_1k,n_pcs=20,impute='median',use_hvm=True,hvm_obs='feature_select_random',plot=True)
pca(scm_1k,n_pcs=20,impute='median',use_hvm=True,hvm_obs='feature_select_random',plot=True)
... using top variable features based on feature_select_random
Out[93]:
AnnData object with n_obs × n_vars = 46 × 67300 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Series', 'condition', 'sample', 'Run', 'Organism', 'Cell_ID', 'Source_name', 'Cell_type', 'total_cell_type', 'subgroup', 'Development_stage', 'Strain', 'Tissue', 'Cell_line', 'Genotype', 'Treatment', 'Age', 'Sex', 'Race', 'other', 'Disease', 'CenterName', 'avgLength', 'LibraryStrategy', 'LibraryLayout', 'Platform', 'Model', 'Total_bases.Gb', 'Total_reads', 'Reads_After_trim', 'Uniq_mapping.reads', 'Mapping_ratio', 'Conversion_ratio', 'TotalCpGs(1X)', 'CpGsTotalUnique(1X)', 'CpGsMeanCov(1X)', 'CpGsMethRatio(1X)', 'CpGsMethReadRatio(1X)', 'TotalCpGs(3X)', 'CpGsTotalUnique(3X)', 'CpGsMeanCov(3X)', 'CpGsMethRatio(3X)', 'CpGsMethReadRatio(3X)', 'TotalCpGs(5X)', 'CpGsTotalUnique(5X)', 'CpGsMeanCov(5X)', 'CpGsMethRatio(5X)', 'CpGsMethReadRatio(5X)', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random' uns: 'workdir', 'label_color', 'X_pca_var_ratios', 'Cell_type_colors', 'Treatment_colors', 'neighbors', 'umap' obsm: 'X_pca', 'X_tsne', 'X_umap' layers: 'relative' obsp: 'distances', 'connectivities'
In [94]:
Copied!
sc.pp.neighbors(scm_1k)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
sc.pp.neighbors(scm_1k)
scm.pp.umap(scm_1k)
scm.pl.umap(scm_1k,frameon='small',color=['Cell_type','Treatment'],size=100,wspace=0.3)
In [ ]:
Copied!