Human Neuron
In [ ]:
Copied!
In [2]:
Copied!
import pandas as pd
import numpy as np
import scanpy as sc
import scMethtools as scm
import matplotlib.pyplot as plt
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
import scMethtools as scm
import matplotlib.pyplot as plt
import anndata as ad
#### #### # # ##### # # ##### #### #### # #### # # # ## ## # # # # # # # # # # #### # # ## # # ###### # # # # # # #### # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #### #### # # # # # # #### #### ###### #### Version: 1.0.0, Tutorials: https://ngdc.cncb.ac.cn/methbank/scm
In [12]:
Copied!
scm_100k = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scm/windows.h5")
scanpy_100k = ad.read_h5ad("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scanpy/scanpy_100kb_matrix_test_CG.h5ad")
scm_100k = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scm/windows.h5")
scanpy_100k = ad.read_h5ad("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scanpy/scanpy_100kb_matrix_test_CG.h5ad")
scmat analysis¶
In [4]:
Copied!
#for scmart analysis
scm.pp.add_meta(scm_100k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
#for scmart analysis
scm.pp.add_meta(scm_100k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
No column 'label' found in metadata, 'unknown' is used as the default cell labels No column 'label_color' found in metadata, random color is generated for each cell label
In [37]:
Copied!
scm_100k.obs
scm_100k.obs
Out[37]:
cell_id | cell_name | sites | meth | n_total | global_meth_level | Sample_id | Sample | Animal age | FACS date | ... | mCG/CG | mCH/CH | Estimated mCG/CG | Estimated mCH/CH | Coverage (%) | Neuron type | tSNE x coordinate | tSNE y coordinate | n_features | pct_peaks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | GSM2556580 | 3186266 | 2496679 | 3825044 | 0.783575 | GSM2556580 | Pool_1026_AD002_indexed | 25 yr | 2012/6/29 | ... | 0.79871 | 0.02211 | 0.79762 | 0.01680 | 6.01 | hSst-2 | 20.93320 | 13.43860 | 29498 | 0.954813 |
1 | 1 | GSM2557770 | 2572709 | 1931054 | 3242784 | 0.750592 | GSM2557770 | Pool_263_AD010_indexed | 25 yr | 2012/6/29 | ... | 0.76486 | 0.03588 | 0.76305 | 0.02847 | 5.11 | hL2/3 | -7.61221 | 13.74140 | 29498 | 0.954813 |
2 | 2 | GSM2556556 | 3247648 | 2505895 | 3882099 | 0.771603 | GSM2556556 | Pool_1012_AD006_indexed | 25 yr | 2012/6/29 | ... | 0.78647 | 0.05192 | 0.78433 | 0.04240 | 6.50 | hL2/3 | -14.57080 | 4.75207 | 29497 | 0.954781 |
3 | 3 | GSM2556719 | 3593247 | 2753322 | 4304895 | 0.766249 | GSM2556719 | Pool_1090_AD006_indexed | 25 yr | 2012/6/29 | ... | 0.78164 | 0.04276 | 0.77984 | 0.03487 | 7.19 | hL2/3 | -12.23290 | 10.33370 | 29504 | 0.955007 |
4 | 4 | GSM2558838 | 1671616 | 1213688 | 1944619 | 0.726057 | GSM2558838 | Pool_771_AD008_indexed | 25 yr | 2012/6/29 | ... | 0.73986 | 0.04119 | 0.73763 | 0.03297 | 3.26 | hL2/3 | -12.52800 | 12.85390 | 29474 | 0.954036 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2360 | 2360 | GSM2557118 | 4030037 | 3049428 | 5723469 | 0.756675 | GSM2557118 | Pool_1381_AD002_indexed | 25 yr | 2012/6/29 | ... | 0.76468 | 0.05545 | 0.76216 | 0.04534 | 7.86 | hDL-1 | -9.11158 | -1.04451 | 29499 | 0.954846 |
2361 | 2361 | GSM2558140 | 3335079 | 2526078 | 4135394 | 0.757427 | GSM2558140 | Pool_441_AD008_indexed | 25 yr | 2012/6/29 | ... | 0.76573 | 0.03698 | 0.76416 | 0.03051 | 6.79 | hL2/3 | -11.26030 | 12.10890 | 29501 | 0.954910 |
2362 | 2362 | GSM2557032 | 4045664 | 3312230 | 5141670 | 0.818711 | GSM2557032 | Pool_1359_AD010_indexed | 25 yr | 2012/6/29 | ... | 0.83100 | 0.08086 | 0.82834 | 0.06641 | 8.27 | hSst-3 | 15.37830 | -9.40008 | 29498 | 0.954813 |
2363 | 2363 | GSM2558877 | 1554042 | 1192705 | 1806746 | 0.767486 | GSM2558877 | Pool_790_AD010_indexed | 25 yr | 2012/6/29 | ... | 0.78764 | 0.03502 | 0.78601 | 0.02759 | 2.96 | hNdnf | 16.07330 | 8.95018 | 29327 | 0.949278 |
2364 | 2364 | GSM2557111 | 4053094 | 3004728 | 5423767 | 0.741342 | GSM2557111 | Pool_1379_AD010_indexed | 25 yr | 2012/6/29 | ... | 0.75210 | 0.03068 | 0.75041 | 0.02408 | 8.11 | hL2/3 | -5.32003 | 13.90730 | 29496 | 0.954748 |
2364 rows × 41 columns
In [6]:
Copied!
# filters
# filter by cell
scm.pp.filter_cells(scm_100k,min_n_features = 10000)
# filter by features
scm.pp.filter_features(scm_100k,min_n_cells=2000)
# filters
# filter by cell
scm.pp.filter_cells(scm_100k,min_n_features = 10000)
# filter by features
scm.pp.filter_features(scm_100k,min_n_cells=2000)
filter cells based on min_n_features >= 10000 after filtering out low-quality cells: 2364 cells, 30894 regions
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scMethtools/preprocessing/scm.py:207: RuntimeWarning: Mean of empty slice feature_mean = np.nanmean(dense_X, axis=0)
Filter regions based on min_n_cells After filtering out low coveraged regions: 2364 cells, 29485 regions
In [14]:
Copied!
scm_100k.var
scm_100k.var
Out[14]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | |
---|---|---|---|---|---|---|---|---|
index | ||||||||
chr1:1-100000 | chr1 | 1 | 100000 | 2353 | 0.035165 | 0.005667 | 0.710558 | 0.995347 |
chr1:100001-200000 | chr1 | 100001 | 200000 | 2354 | 0.009842 | 0.002561 | 0.810938 | 0.995770 |
chr1:200001-300000 | chr1 | 200001 | 300000 | 2351 | 0.030100 | 0.009758 | 0.730608 | 0.994501 |
chr1:300001-400000 | chr1 | 300001 | 400000 | 2323 | 0.026505 | 0.015129 | 0.819564 | 0.982657 |
chr1:400001-500000 | chr1 | 400001 | 500000 | 2346 | 0.014380 | 0.009514 | 0.836147 | 0.992386 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:56800001-56900000 | chrY | 56800001 | 56900000 | 2361 | 0.002751 | 0.002164 | 0.503138 | 0.998731 |
chrY:56900001-57000000 | chrY | 56900001 | 57000000 | 2122 | 0.098136 | 0.039726 | 0.585631 | 0.897631 |
chrY:57000001-57100000 | chrY | 57000001 | 57100000 | 2252 | 0.066954 | 0.031287 | 0.664001 | 0.952623 |
chrY:57100001-57200000 | chrY | 57100001 | 57200000 | 2311 | 0.046440 | 0.026938 | 0.684819 | 0.977580 |
chrY:57200001-57227415 | chrY | 57200001 | 57227415 | 2125 | 0.040066 | 0.022826 | 0.790896 | 0.898900 |
29509 rows × 8 columns
In [7]:
Copied!
def calculate_dispersion_norm(adata):
df = adata.var
df['dispersion'] = df['var'] / df['mean']
df['log_dispersion'] = np.log(df['dispersion'])
mean_binsize = 0.01
cov_binsize = 10
bin_min_features=5
# instead of n_bins, use bin_size, because cov and mc are in different scale
df['mean_bin'] = (df['mean'] / mean_binsize).astype(int)
df['cov_bin'] = (df['covered_cell'] / cov_binsize).astype(int)
# save bin_count df, gather bins with more than bin_min_features features
bin_count = df.groupby(['mean_bin', 'cov_bin']) \
.apply(lambda i: i.shape[0]) \
.reset_index() \
.sort_values(0, ascending=False)
bin_count.head()
bin_more_than = bin_count[bin_count[0] > bin_min_features]
if bin_more_than.shape[0] == 0:
raise ValueError(f'No bin have more than {bin_min_features} features, use larger bin size.')
# this usually don't cause much difference (a few hundred features), but the scatter plot will look more nature
index_map = {}
for _, (mean_id, cov_id, count) in bin_count.iterrows():
if count > 1:
index_map[(mean_id, cov_id)] = (mean_id, cov_id)
manhattan_dist = (bin_more_than['mean_bin'] - mean_id).abs() + (bin_more_than['cov_bin'] - cov_id).abs()
closest_more_than = manhattan_dist.sort_values().index[0]
closest = bin_more_than.loc[closest_more_than]
index_map[(mean_id, cov_id)] = tuple(closest.tolist()[:2])
# apply index_map to original df
raw_bin = df[['mean_bin', 'cov_bin']].set_index(['mean_bin', 'cov_bin'])
raw_bin['use_mean'] = pd.Series(index_map).apply(lambda i: i[0])
raw_bin['use_cov'] = pd.Series(index_map).apply(lambda i: i[1])
df['mean_bin'] = raw_bin['use_mean'].values
df['cov_bin'] = raw_bin['use_cov'].values
# calculate bin mean and std, now disp_std_bin shouldn't have NAs
disp_grouped = df.groupby(['mean_bin', 'cov_bin'])['dispersion']
disp_mean_bin = disp_grouped.mean()
disp_std_bin = disp_grouped.std(ddof=1)
# actually do the normalization
_mean_norm = disp_mean_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
_std_norm = disp_std_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
df['dispersion_norm'] = (df['dispersion'].values # use values here as index differs
- _mean_norm.values) / _std_norm.values
dispersion_norm = df['dispersion_norm'].values.astype('float32')
return df
def calculate_dispersion_norm(adata):
df = adata.var
df['dispersion'] = df['var'] / df['mean']
df['log_dispersion'] = np.log(df['dispersion'])
mean_binsize = 0.01
cov_binsize = 10
bin_min_features=5
# instead of n_bins, use bin_size, because cov and mc are in different scale
df['mean_bin'] = (df['mean'] / mean_binsize).astype(int)
df['cov_bin'] = (df['covered_cell'] / cov_binsize).astype(int)
# save bin_count df, gather bins with more than bin_min_features features
bin_count = df.groupby(['mean_bin', 'cov_bin']) \
.apply(lambda i: i.shape[0]) \
.reset_index() \
.sort_values(0, ascending=False)
bin_count.head()
bin_more_than = bin_count[bin_count[0] > bin_min_features]
if bin_more_than.shape[0] == 0:
raise ValueError(f'No bin have more than {bin_min_features} features, use larger bin size.')
# this usually don't cause much difference (a few hundred features), but the scatter plot will look more nature
index_map = {}
for _, (mean_id, cov_id, count) in bin_count.iterrows():
if count > 1:
index_map[(mean_id, cov_id)] = (mean_id, cov_id)
manhattan_dist = (bin_more_than['mean_bin'] - mean_id).abs() + (bin_more_than['cov_bin'] - cov_id).abs()
closest_more_than = manhattan_dist.sort_values().index[0]
closest = bin_more_than.loc[closest_more_than]
index_map[(mean_id, cov_id)] = tuple(closest.tolist()[:2])
# apply index_map to original df
raw_bin = df[['mean_bin', 'cov_bin']].set_index(['mean_bin', 'cov_bin'])
raw_bin['use_mean'] = pd.Series(index_map).apply(lambda i: i[0])
raw_bin['use_cov'] = pd.Series(index_map).apply(lambda i: i[1])
df['mean_bin'] = raw_bin['use_mean'].values
df['cov_bin'] = raw_bin['use_cov'].values
# calculate bin mean and std, now disp_std_bin shouldn't have NAs
disp_grouped = df.groupby(['mean_bin', 'cov_bin'])['dispersion']
disp_mean_bin = disp_grouped.mean()
disp_std_bin = disp_grouped.std(ddof=1)
# actually do the normalization
_mean_norm = disp_mean_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
_std_norm = disp_std_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
df['dispersion_norm'] = (df['dispersion'].values # use values here as index differs
- _mean_norm.values) / _std_norm.values
dispersion_norm = df['dispersion_norm'].values.astype('float32')
return df
In [10]:
Copied!
X = scm_100k.layers['relative'].copy()
n_samples = X.shape[0]
df = scm_100k.var
mean_feature_methylation = [np.nansum(np.abs(line.toarray())) / n_samples for line in X.T]
df['mean_sr'] = mean_feature_methylation
df['sr_dispersion'] = df['sr_var'] / df['mean_sr']
X = scm_100k.layers['relative'].copy()
n_samples = X.shape[0]
df = scm_100k.var
mean_feature_methylation = [np.nansum(np.abs(line.toarray())) / n_samples for line in X.T]
df['mean_sr'] = mean_feature_methylation
df['sr_dispersion'] = df['sr_var'] / df['mean_sr']
In [17]:
Copied!
df
df
Out[17]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | mean_sr | sr_dispersion | |
---|---|---|---|---|---|---|---|---|---|---|
index | ||||||||||
chr1:1-100000 | chr1 | 1 | 100000 | 2353 | 0.035165 | 0.005667 | 0.710558 | 0.995347 | 0.054952 | 0.103125 |
chr1:100001-200000 | chr1 | 100001 | 200000 | 2354 | 0.009842 | 0.002561 | 0.810938 | 0.995770 | 0.037709 | 0.067909 |
chr1:200001-300000 | chr1 | 200001 | 300000 | 2351 | 0.030100 | 0.009758 | 0.730608 | 0.994501 | 0.075547 | 0.129171 |
chr1:300001-400000 | chr1 | 300001 | 400000 | 2323 | 0.026505 | 0.015129 | 0.819564 | 0.982657 | 0.085704 | 0.176523 |
chr1:400001-500000 | chr1 | 400001 | 500000 | 2346 | 0.014380 | 0.009514 | 0.836147 | 0.992386 | 0.072957 | 0.130408 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:56800001-56900000 | chrY | 56800001 | 56900000 | 2361 | 0.002751 | 0.002164 | 0.503138 | 0.998731 | 0.036448 | 0.059383 |
chrY:56900001-57000000 | chrY | 56900001 | 57000000 | 2122 | 0.098136 | 0.039726 | 0.585631 | 0.897631 | 0.147727 | 0.268912 |
chrY:57000001-57100000 | chrY | 57000001 | 57100000 | 2252 | 0.066954 | 0.031287 | 0.664001 | 0.952623 | 0.134009 | 0.233471 |
chrY:57100001-57200000 | chrY | 57100001 | 57200000 | 2311 | 0.046440 | 0.026938 | 0.684819 | 0.977580 | 0.128262 | 0.210021 |
chrY:57200001-57227415 | chrY | 57200001 | 57227415 | 2125 | 0.040066 | 0.022826 | 0.790896 | 0.898900 | 0.079380 | 0.287560 |
29509 rows × 10 columns
In [11]:
Copied!
scm_100k.var = df
scm_100k.var = df
In [8]:
Copied!
#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.1,0.6)
titleletter_loc = (-0.17,1.015)
hline_width = 1.5
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.1,0.6)
titleletter_loc = (-0.17,1.015)
hline_width = 1.5
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
In [24]:
Copied!
df['log_sr_dispersion'] = np.log(df['sr_dispersion'])
gene_means = df['mean']
sqrt_var = df['var']
sr_dispersion = df['log_sr_dispersion']
dispersion = df['log_dispersion']
# #sqrt_variable_genes_idx = scm_1k.var['feature_select_var']
# sqrt_variable_genes_idx = scm_1k.var['feature_select']
# residual_var = scm_1k.var['NormalizedDispersion']
# #residuals_variable_genes_idx = scm_1k.var['feature_select']
# residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 6, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax3 = fig.add_subplot(gs[:, 4:6])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax1.set_title(r'variance')
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
#ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'dispersion')
ax2.scatter(gene_means,dispersion,s=dotsize, rasterized=True,color='lightgrey')
#ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax3.set_title(r'residuals dispersion')
ax3.scatter(gene_means,sr_dispersion,s=dotsize, rasterized=True,color='lightgrey')
sns.despine()
plt.tight_layout()
plt.savefig('./figures/mean_var_relationship.pdf', dpi=300, format=None)
df['log_sr_dispersion'] = np.log(df['sr_dispersion'])
gene_means = df['mean']
sqrt_var = df['var']
sr_dispersion = df['log_sr_dispersion']
dispersion = df['log_dispersion']
# #sqrt_variable_genes_idx = scm_1k.var['feature_select_var']
# sqrt_variable_genes_idx = scm_1k.var['feature_select']
# residual_var = scm_1k.var['NormalizedDispersion']
# #residuals_variable_genes_idx = scm_1k.var['feature_select']
# residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 6, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax3 = fig.add_subplot(gs[:, 4:6])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax1.set_title(r'variance')
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
#ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'dispersion')
ax2.scatter(gene_means,dispersion,s=dotsize, rasterized=True,color='lightgrey')
#ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax3.set_title(r'residuals dispersion')
ax3.scatter(gene_means,sr_dispersion,s=dotsize, rasterized=True,color='lightgrey')
sns.despine()
plt.tight_layout()
plt.savefig('./figures/mean_var_relationship.pdf', dpi=300, format=None)
In [25]:
Copied!
#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
alpha=0.05
def make_panel(x,
y,
ax,
letter='',
title='',
color_kde=True,
xlim=None,
ylim=None,
xlabel='mean methylation level',
ylabel='',
xscale='linear',
yscale='log'):
ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
if color_kde:
xtrans = np.log10(x) if xscale == 'log' else x
ytrans = np.log10(y) if yscale == 'log' else y
data = np.vstack((xtrans,ytrans))
kde = gaussian_kde(data)
logdensity = kde.logpdf(data)
points = ax.scatter(x, y, s=dotsize, c=logdensity, rasterized=True)
else:
ax.scatter(x, y, s=dotsize, rasterized=True)
ax.set_xscale(xscale)
ax.set_yscale(yscale)
ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)
if xlim:
ax.set_xlim(xlim)
if ylim:
ax.set_ylim(ylim)
ax.set_title(title)
def add_line(x,y,ax):
ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
def fix_ticks(ax):
ax.set_yticks(ax.get_yticks()[1:-1])
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,9)) #constrained_layout=True)
gs = GridSpec(1,3 , figure=fig)
ax1 = fig.add_subplot(gs[0, 0]) #var
ax2 = fig.add_subplot(gs[0, 1]) #sr_var
ax3 = fig.add_subplot(gs[0, 2])
make_panel(x=df['mean'],
y=df['var'],
ax=ax1,
letter='a',
ylabel=r'var',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
fix_ticks(ax1)
make_panel(x=df['mean'],
y=df['log_dispersion'],
ax=ax2,
letter='b',
ylabel=r'shrunken residual var',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)
# ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)
make_panel(x=df['mean'],
y=df['log_sr_dispersion'],
ax=ax3,
letter='c',
ylabel=r'shrunken residual dispersion',
yscale='linear')
sns.despine()
plt.tight_layout()
plt.savefig('./figures/mean_var_ped.pdf', dpi=300, format=None)
plt.show()
#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
alpha=0.05
def make_panel(x,
y,
ax,
letter='',
title='',
color_kde=True,
xlim=None,
ylim=None,
xlabel='mean methylation level',
ylabel='',
xscale='linear',
yscale='log'):
ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
if color_kde:
xtrans = np.log10(x) if xscale == 'log' else x
ytrans = np.log10(y) if yscale == 'log' else y
data = np.vstack((xtrans,ytrans))
kde = gaussian_kde(data)
logdensity = kde.logpdf(data)
points = ax.scatter(x, y, s=dotsize, c=logdensity, rasterized=True)
else:
ax.scatter(x, y, s=dotsize, rasterized=True)
ax.set_xscale(xscale)
ax.set_yscale(yscale)
ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)
if xlim:
ax.set_xlim(xlim)
if ylim:
ax.set_ylim(ylim)
ax.set_title(title)
def add_line(x,y,ax):
ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
def fix_ticks(ax):
ax.set_yticks(ax.get_yticks()[1:-1])
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,9)) #constrained_layout=True)
gs = GridSpec(1,3 , figure=fig)
ax1 = fig.add_subplot(gs[0, 0]) #var
ax2 = fig.add_subplot(gs[0, 1]) #sr_var
ax3 = fig.add_subplot(gs[0, 2])
make_panel(x=df['mean'],
y=df['var'],
ax=ax1,
letter='a',
ylabel=r'var',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
fix_ticks(ax1)
make_panel(x=df['mean'],
y=df['log_dispersion'],
ax=ax2,
letter='b',
ylabel=r'shrunken residual var',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)
# ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)
make_panel(x=df['mean'],
y=df['log_sr_dispersion'],
ax=ax3,
letter='c',
ylabel=r'shrunken residual dispersion',
yscale='linear')
sns.despine()
plt.tight_layout()
plt.savefig('./figures/mean_var_ped.pdf', dpi=300, format=None)
plt.show()
In [9]:
Copied!
calculate_dispersion_norm(scm_100k)
calculate_dispersion_norm(scm_100k)
/tmp/ipykernel_17336/590794251.py:14: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. bin_count = df.groupby(['mean_bin', 'cov_bin']) \
Out[9]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | dispersion | log_dispersion | mean_bin | cov_bin | dispersion_norm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||
chr1:1-100000 | chr1 | 1 | 100000 | 2353 | 0.035165 | 0.005667 | 0.710558 | 0.995347 | 0.049490 | -3.005991 | 71 | 235 | 1.717898 |
chr1:100001-200000 | chr1 | 100001 | 200000 | 2354 | 0.009842 | 0.002561 | 0.810938 | 0.995770 | 0.012137 | -4.411531 | 81 | 235 | -0.480892 |
chr1:200001-300000 | chr1 | 200001 | 300000 | 2351 | 0.030100 | 0.009758 | 0.730608 | 0.994501 | 0.041198 | -3.189357 | 73 | 235 | 1.499175 |
chr1:300001-400000 | chr1 | 300001 | 400000 | 2323 | 0.026505 | 0.015129 | 0.819564 | 0.982657 | 0.032340 | -3.431454 | 81 | 233 | -0.181737 |
chr1:400001-500000 | chr1 | 400001 | 500000 | 2346 | 0.014380 | 0.009514 | 0.836147 | 0.992386 | 0.017198 | -4.062990 | 83 | 234 | -0.236255 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:56800001-56900000 | chrY | 56800001 | 56900000 | 2361 | 0.002751 | 0.002164 | 0.503138 | 0.998731 | 0.005467 | -5.209000 | 50 | 236 | -1.719908 |
chrY:56900001-57000000 | chrY | 56900001 | 57000000 | 2122 | 0.098136 | 0.039726 | 0.585631 | 0.897631 | 0.167572 | -1.786340 | 58 | 235 | 2.684742 |
chrY:57000001-57100000 | chrY | 57000001 | 57100000 | 2252 | 0.066954 | 0.031287 | 0.664001 | 0.952623 | 0.100833 | -2.294286 | 66 | 234 | 1.676650 |
chrY:57100001-57200000 | chrY | 57100001 | 57200000 | 2311 | 0.046440 | 0.026938 | 0.684819 | 0.977580 | 0.067813 | -2.690995 | 69 | 234 | 0.141551 |
chrY:57200001-57227415 | chrY | 57200001 | 57227415 | 2125 | 0.040066 | 0.022826 | 0.790896 | 0.898900 | 0.050659 | -2.982648 | 77 | 232 | -0.077491 |
29485 rows × 13 columns
In [23]:
Copied!
def feature_select(adata,number=3000):
print(adata.shape)
#adata.raw = adata.copy()
df = adata.var
feature_subset_sr = df.index.isin(df.sort_values('sr_var', ascending=False).index[:number])
feature_subset_var = df.index.isin(df.sort_values('var', ascending=False).index[:number])
feature_subset_dispersion = df.index.isin(df.sort_values('dispersion_norm', ascending=False).index[:number])
feature_subset_residual = df.index.isin(df.sort_values('sr_dispersion', ascending=False).index[:number])
# 随机选择 number 个特征的索引
random_indices = np.random.choice(df.index, size=number, replace=False)
# 获取随机选择的特征子集
feature_subset_random = df.index.isin(random_indices)
df['feature_select_sr'] = feature_subset_sr
df['feature_select_var'] = feature_subset_var
df['feature_select_dispersion'] = feature_subset_dispersion
df['feature_select_random'] = feature_subset_random
df['feature_select_residual'] = feature_subset_residual
adata.var = df
# data_filter = adata[:,adata.var['feature_select']].copy()
return adata
def feature_select(adata,number=3000):
print(adata.shape)
#adata.raw = adata.copy()
df = adata.var
feature_subset_sr = df.index.isin(df.sort_values('sr_var', ascending=False).index[:number])
feature_subset_var = df.index.isin(df.sort_values('var', ascending=False).index[:number])
feature_subset_dispersion = df.index.isin(df.sort_values('dispersion_norm', ascending=False).index[:number])
feature_subset_residual = df.index.isin(df.sort_values('sr_dispersion', ascending=False).index[:number])
# 随机选择 number 个特征的索引
random_indices = np.random.choice(df.index, size=number, replace=False)
# 获取随机选择的特征子集
feature_subset_random = df.index.isin(random_indices)
df['feature_select_sr'] = feature_subset_sr
df['feature_select_var'] = feature_subset_var
df['feature_select_dispersion'] = feature_subset_dispersion
df['feature_select_random'] = feature_subset_random
df['feature_select_residual'] = feature_subset_residual
adata.var = df
# data_filter = adata[:,adata.var['feature_select']].copy()
return adata
In [24]:
Copied!
feature_select(scm_100k,number=5000)
feature_select(scm_100k,number=5000)
(2364, 29485)
Out[24]:
AnnData object with n_obs × n_vars = 2364 × 29485 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'mean_sr', 'sr_dispersion', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random', 'feature_select_residual' uns: 'workdir', 'label_color' layers: 'relative'
In [382]:
Copied!
scm_100k.var
scm_100k.var
Out[382]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | dispersion | log_dispersion | ... | feature_select_var | feature_select_dispersion | feature_select_random | mean_sr | sr_dispersion | log_sr_dispersion | feature_select_residual | Accession | Gene | Distance | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
chr1:1-100000 | chr1 | 1 | 100000 | 2353 | 0.035220 | 0.029984 | 0.710440 | 0.995347 | 20.171644 | 3.004278 | ... | True | False | False | 0.333772 | 0.089833 | 2.400980 | True | ENSG00000186092.7 | OR4F5 | 0 |
chr1:100001-200000 | chr1 | 100001 | 200000 | 2354 | 0.009842 | 0.016549 | 0.810938 | 0.995770 | 82.395519 | 4.411531 | ... | False | False | False | 0.366151 | 0.045198 | 3.093073 | False | ENSG00000186092.7 | OR4F5 | 28417 |
chr1:200001-300000 | chr1 | 200001 | 300000 | 2351 | 0.030100 | 0.035758 | 0.730608 | 0.994501 | 24.272813 | 3.189357 | ... | True | False | False | 0.383961 | 0.093130 | 2.362730 | True | ENSG00000186092.7 | OR4F5 | 128417 |
chr1:300001-400000 | chr1 | 300001 | 400000 | 2323 | 0.026505 | 0.055943 | 0.819564 | 0.982657 | 30.921580 | 3.431454 | ... | True | False | False | 0.327454 | 0.170844 | 1.713199 | True | ENSG00000284733.2 | OR4F29 | 50740 |
chr1:400001-500000 | chr1 | 400001 | 500000 | 2346 | 0.014380 | 0.038490 | 0.836147 | 0.992386 | 58.147901 | 4.062990 | ... | False | False | False | 0.382297 | 0.100680 | 2.278761 | True | ENSG00000284733.2 | OR4F29 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:56800001-56900000 | chrY | 56800001 | 56900000 | 2361 | 0.002751 | 0.003180 | 0.503138 | 0.998731 | 182.910968 | 5.209000 | ... | False | True | False | 0.256378 | 0.012404 | 4.388891 | False | ENSG00000124333.16_PAR_Y | VAMP7 | 167865 |
chrY:56900001-57000000 | chrY | 56900001 | 57000000 | 2122 | 0.098136 | 0.067797 | 0.585631 | 0.897631 | 5.967571 | 1.786340 | ... | True | False | True | 0.251633 | 0.269430 | 1.080738 | True | ENSG00000124333.16_PAR_Y | VAMP7 | 67865 |
chrY:57000001-57100000 | chrY | 57000001 | 57100000 | 2252 | 0.066954 | 0.067381 | 0.664001 | 0.952623 | 9.917348 | 2.294286 | ... | True | False | False | 0.311553 | 0.216276 | 1.406113 | True | ENSG00000124333.16_PAR_Y | VAMP7 | 0 |
chrY:57100001-57200000 | chrY | 57100001 | 57200000 | 2311 | 0.046440 | 0.054673 | 0.684819 | 0.977580 | 14.746337 | 2.690995 | ... | True | False | False | 0.300699 | 0.181819 | 1.595794 | True | ENSG00000124333.16_PAR_Y | VAMP7 | 0 |
chrY:57200001-57227415 | chrY | 57200001 | 57227415 | 2125 | 0.040066 | 0.080984 | 0.790896 | 0.898900 | 19.740019 | 2.982648 | ... | True | False | False | 0.292204 | 0.277147 | 1.184860 | True | ENSG00000182484.15_PAR_Y | WASH6P | 0 |
29485 rows × 24 columns
In [ ]:
Copied!
In [26]:
Copied!
scm.pp.pca(scm_100k,n_pcs=30,impute='median')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k,frameon='small',color=['Neuron type'],size=40,wspace=0.3)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','Brain area'],size=100,wspace=0.3)
scm.pp.pca(scm_100k,n_pcs=30,impute='median')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k,frameon='small',color=['Neuron type'],size=40,wspace=0.3)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','Brain area'],size=100,wspace=0.3)
... using all features
In [ ]:
Copied!
#第一主成分的差异主要是Na造成的,filter一下看看结果
#第一主成分的差异主要是Na造成的,filter一下看看结果
In [25]:
Copied!
# 删除 Neuron type 为 NA 的数据
scm_100k_filter = scm_100k[~scm_100k.obs['Neuron type'].isna()].copy()
scm_100k_filter
# 删除 Neuron type 为 NA 的数据
scm_100k_filter = scm_100k[~scm_100k.obs['Neuron type'].isna()].copy()
scm_100k_filter
Out[25]:
AnnData object with n_obs × n_vars = 2313 × 29485 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'mean_sr', 'sr_dispersion', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random', 'feature_select_residual' uns: 'workdir', 'label_color' layers: 'relative'
In [28]:
Copied!
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median')
# sc.pp.neighbors(scm_100k)
# scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k_filter,frameon='small',color=['Neuron type'],size=40,wspace=0.3)
# scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','Brain area'],size=100,wspace=0.3)
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median')
# sc.pp.neighbors(scm_100k)
# scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k_filter,frameon='small',color=['Neuron type'],size=40,wspace=0.3)
# scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','Brain area'],size=100,wspace=0.3)
... using all features
In [32]:
Copied!
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3)
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3)
In [34]:
Copied!
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion')
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion')
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
... using top variable features based on feature_select_dispersion
In [513]:
Copied!
# pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion')
# sc.pp.neighbors(scm_100k)
# scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
# pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion')
# sc.pp.neighbors(scm_100k)
# scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
In [35]:
Copied!
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3,palette=scm.pl.ditto_palette())
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3,palette=scm.pl.ditto_palette())
... using top variable features based on feature_select_dispersion
In [37]:
Copied!
#使用所有的特征进行计算
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3,palette=scm.pl.ditto_palette())
#使用所有的特征进行计算
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3,palette=scm.pl.ditto_palette())
In [415]:
Copied!
pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
... using top variable features based on feature_select_sr
In [38]:
Copied!
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
... using top variable features based on feature_select_sr
In [418]:
Copied!
pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr',layer='relative')
sc.pp.neighbors(scm_100k,use_rep='Xrelative_pca', method='umap', metric='cosine')
scm.pp.umap(scm_100k)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr',layer='relative')
sc.pp.neighbors(scm_100k,use_rep='Xrelative_pca', method='umap', metric='cosine')
scm.pp.umap(scm_100k)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
... using top variable features based on feature_select_sr
In [39]:
Copied!
sc.tl.tsne(scm_100k_filter)
scm.pl.tsne(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
sc.tl.tsne(scm_100k_filter)
scm.pl.tsne(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
In [ ]:
Copied!
### using residuals data to take hvm (calculating residuals var and mean)
### using residuals data to take hvm (calculating residuals var and mean)
In [41]:
Copied!
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_residual',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3,palette=scm.pl.ditto_palette())
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_residual',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3,palette=scm.pl.ditto_palette())
... using top variable features based on feature_select_residual
In [43]:
Copied!
#using mean methylation to conduct dimension
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_residual')
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3,palette=scm.pl.ditto_palette())
#using mean methylation to conduct dimension
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_residual')
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3,palette=scm.pl.ditto_palette())
... using top variable features based on feature_select_residual
scm_100k¶
In [44]:
Copied!
annot = []
for n in scm_100k.obs['Neuron type']:
if n in ['hL6-1', 'hL6-2','hL6-3','hDL-1', 'hDL-2', 'hDL-3']:
annot.append('deep layer')
elif n in ['hL5-1', 'hL5-2','hL5-3','hL4']:
annot.append('middle layer')
elif n in ['hSst-1','hSst-2','hSst-3','hPv-1','hPv-2','hVip-1', 'hVip-2','hNdnf','hNos']:
annot.append('Inhibitory')
else:
annot.append('upper layer - L2/3')
scm_100k.obs['layer'] = annot
sc.pl.umap(scm_100k, color=['layer', 'Neuron type'],wspace =0.2)
annot = []
for n in scm_100k.obs['Neuron type']:
if n in ['hL6-1', 'hL6-2','hL6-3','hDL-1', 'hDL-2', 'hDL-3']:
annot.append('deep layer')
elif n in ['hL5-1', 'hL5-2','hL5-3','hL4']:
annot.append('middle layer')
elif n in ['hSst-1','hSst-2','hSst-3','hPv-1','hPv-2','hVip-1', 'hVip-2','hNdnf','hNos']:
annot.append('Inhibitory')
else:
annot.append('upper layer - L2/3')
scm_100k.obs['layer'] = annot
sc.pl.umap(scm_100k, color=['layer', 'Neuron type'],wspace =0.2)
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: FutureWarning: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning color_vector = pd.Categorical(values.map(color_map)) /asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: FutureWarning: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning color_vector = pd.Categorical(values.map(color_map)) /asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
In [111]:
Copied!
scm_100k.obs['Neuron type'].unique()
scm_100k.obs['Neuron type'].unique()
Out[111]:
['hSst-2', 'hL2/3', 'hDL-1', NaN, 'hL5-2', ..., 'hL5-1', 'hSst-1', 'hSst-3', 'hDL-2', 'hPv-1'] Length: 22 Categories (21, object): ['hDL-1', 'hDL-2', 'hDL-3', 'hL2/3', ..., 'hSst-2', 'hSst-3', 'hVip-1', 'hVip-2']
In [97]:
Copied!
scm_100k.var
scm_100k.var
Out[97]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | dispersion | log_dispersion | ... | cov_bin | dispersion_norm | feature_select_sr | feature_select_var | feature_select_dispersion | feature_select_random | mean_sr | sr_dispersion | log_sr_dispersion | feature_select_residual | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
chr1:1-100000 | chr1 | 1 | 100000 | 2353 | 0.035220 | 0.029984 | 0.710440 | 0.995347 | 20.171644 | 3.004278 | ... | 235 | -0.832056 | True | True | False | True | 0.330840 | 11.033983 | 2.400980 | False |
chr1:100001-200000 | chr1 | 100001 | 200000 | 2354 | 0.009842 | 0.016549 | 0.810938 | 0.995770 | 82.395519 | 4.411531 | ... | 235 | 0.051438 | False | False | False | False | 0.364824 | 22.044721 | 3.093073 | False |
chr1:200001-300000 | chr1 | 200001 | 300000 | 2351 | 0.030100 | 0.035758 | 0.730608 | 0.994501 | 24.272813 | 3.189357 | ... | 235 | -1.055196 | True | True | False | False | 0.379749 | 10.619905 | 2.362730 | False |
chr1:300001-400000 | chr1 | 300001 | 400000 | 2323 | 0.026505 | 0.055943 | 0.819564 | 0.982657 | 30.921580 | 3.431454 | ... | 233 | -0.015202 | True | True | False | True | 0.310300 | 5.546675 | 1.713199 | False |
chr1:400001-500000 | chr1 | 400001 | 500000 | 2346 | 0.014380 | 0.038490 | 0.836147 | 0.992386 | 58.147901 | 4.062990 | ... | 234 | -0.125231 | True | False | False | False | 0.375837 | 9.764573 | 2.278761 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:56800001-56900000 | chrY | 56800001 | 56900000 | 2361 | 0.002751 | 0.003180 | 0.503138 | 0.998731 | 182.910968 | 5.209000 | ... | 236 | 4.556713 | False | False | True | False | 0.256169 | 80.551067 | 4.388891 | True |
chrY:56900001-57000000 | chrY | 56900001 | 57000000 | 2122 | 0.098136 | 0.067797 | 0.585631 | 0.897631 | 5.967571 | 1.786340 | ... | 235 | -0.670919 | True | True | False | False | 0.199789 | 2.946852 | 1.080738 | False |
chrY:57000001-57100000 | chrY | 57000001 | 57100000 | 2252 | 0.066954 | 0.067381 | 0.664001 | 0.952623 | 9.917348 | 2.294286 | ... | 234 | -0.554419 | True | True | False | False | 0.274920 | 4.080066 | 1.406113 | False |
chrY:57100001-57200000 | chrY | 57100001 | 57200000 | 2311 | 0.046440 | 0.054673 | 0.684819 | 0.977580 | 14.746337 | 2.690995 | ... | 234 | -0.448433 | True | True | False | False | 0.269660 | 4.932242 | 1.595794 | False |
chrY:57200001-57227415 | chrY | 57200001 | 57227415 | 2125 | 0.040066 | 0.080984 | 0.790896 | 0.898900 | 19.740019 | 2.982648 | ... | 232 | -0.297100 | True | True | False | True | 0.264834 | 3.270228 | 1.184860 | False |
29485 rows × 21 columns
Episcanpy method¶
In [182]:
Copied!
#load 100kb windows methylation matrix
scanpy_100k = ad.read("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/sc/")
#load 100kb windows methylation matrix
scanpy_100k = ad.read("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/sc/")
FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/anndata/__init__.py:51: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024.
In [184]:
Copied!
w_mtx_0 = pd.read_csv("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/raw/test_CG_100kb",sep='\t', header=None)
w_mtx_0 = pd.read_csv("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/raw/test_CG_100kb",sep='\t', header=None)
In [185]:
Copied!
w_mtx_0 = w_mtx_0.set_index(w_mtx_0.columns[0])
w_mtx_0
w_mtx_0 = w_mtx_0.set_index(w_mtx_0.columns[0])
w_mtx_0
Out[185]:
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 30869 | 30870 | 30871 | 30872 | 30873 | 30874 | 30875 | 30876 | 30877 | 30878 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | |||||||||||||||||||||
GSM2559344_updated.bed | 0.833 | 0.987 | 0.000 | 0.000 | 0.929 | NaN | 0.963 | 0.000 | 0.000 | 0.998 | ... | NaN | NaN | NaN | 0.999 | 0.000 | 0.000 | NaN | 0.929 | 0.000 | NaN |
GSM2557929_updated.bed | 0.000 | 0.980 | 0.889 | 0.966 | 0.000 | 0.941 | 0.986 | 0.000 | 0.994 | 0.998 | ... | NaN | NaN | NaN | 1.000 | 0.000 | 0.000 | NaN | 0.889 | 0.000 | NaN |
GSM2558161_updated.bed | 0.982 | 0.993 | 0.000 | 0.000 | 0.964 | 0.964 | 0.000 | 0.000 | 0.995 | 0.999 | ... | NaN | NaN | NaN | 0.000 | 0.000 | 0.000 | 0.857 | 0.900 | 0.955 | NaN |
GSM2557053_updated.bed | 0.988 | 0.996 | 0.000 | 0.000 | 0.986 | 0.975 | 0.994 | 0.000 | 0.999 | 0.999 | ... | NaN | NaN | NaN | 1.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | NaN |
GSM2558171_updated.bed | 0.974 | 0.000 | 0.944 | 0.952 | 0.909 | 0.000 | 0.000 | 0.000 | 0.993 | 0.999 | ... | NaN | NaN | NaN | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
GSM2558767_updated.bed | 0.000 | 0.980 | 0.000 | NaN | 0.950 | 0.950 | 0.985 | 0.974 | 0.982 | 0.998 | ... | NaN | NaN | NaN | 0.999 | 0.000 | 0.992 | NaN | NaN | 0.960 | NaN |
GSM2558199_updated.bed | 0.923 | 0.972 | 0.923 | 0.962 | 0.947 | 0.000 | 0.988 | 0.982 | 0.990 | 0.998 | ... | NaN | NaN | NaN | 0.999 | 0.000 | 0.000 | 0.000 | NaN | 0.833 | NaN |
GSM2558196_updated.bed | 0.000 | 0.992 | 0.958 | 0.977 | 0.970 | 0.000 | 0.988 | 0.991 | 0.996 | 0.999 | ... | NaN | NaN | NaN | 0.999 | 0.999 | 0.000 | 0.000 | NaN | 0.950 | NaN |
GSM2556858_updated.bed | 0.000 | 0.000 | NaN | NaN | 0.952 | 0.950 | 0.979 | 0.974 | 0.000 | 0.998 | ... | NaN | NaN | NaN | 0.997 | 0.000 | 0.000 | NaN | NaN | NaN | NaN |
GSM2558354_updated.bed | 0.980 | 0.000 | 0.971 | 0.933 | 0.962 | 0.938 | 0.000 | 0.983 | 0.992 | 0.000 | ... | NaN | NaN | NaN | 0.000 | 0.000 | 0.000 | NaN | 0.909 | 0.857 | NaN |
2365 rows × 30878 columns
In [186]:
Copied!
import episcanpy as epi
import os
import glob
import anndata as ad
import pandas as pd
import episcanpy as epi
import os
import glob
import anndata as ad
import pandas as pd
In [190]:
Copied!
windows = epi.ct.make_windows(100000)
w_names = epi.ct.name_features(windows)
#w_mtx_0 = w_mtx_0.drop(columns=w_mtx_0.columns[-1])
windows = epi.ct.make_windows(100000)
w_names = epi.ct.name_features(windows)
#w_mtx_0 = w_mtx_0.drop(columns=w_mtx_0.columns[-1])
In [200]:
Copied!
var_df=pd.DataFrame(index=w_names, columns=['var_names'])
var_df=pd.DataFrame(index=w_names, columns=['var_names'])
In [203]:
Copied!
w_mtx_0.columns = w_names
w_mtx_0.columns = w_names
In [204]:
Copied!
scanpy_100k = ad.AnnData(w_mtx_0, obs=pd.DataFrame(index=w_mtx_0.index), var=var_df)
scanpy_100k = ad.AnnData(w_mtx_0, obs=pd.DataFrame(index=w_mtx_0.index), var=var_df)
In [56]:
Copied!
import episcanpy as epi
import episcanpy as epi
quality control (from episcanpy tutorials)¶
In [205]:
Copied!
xdata = scanpy_100k
length1 = len(xdata.X[0,:]) # 55017 features,列
length2 = len(xdata.X[:,0]) # 3379 samples 行
xdata.obs['coverage_feature'] = [length1 - np.isnan(line).sum() for line in xdata.X] #每一个样本中覆盖的特征数
xdata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in xdata.X]
xdata.var['coverage_cells'] = [length2 - np.isnan(line).sum() for line in xdata.X.T] #每一个特征在多少个样本中有覆盖
xdata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in xdata.X.T]
xdata = scanpy_100k
length1 = len(xdata.X[0,:]) # 55017 features,列
length2 = len(xdata.X[:,0]) # 3379 samples 行
xdata.obs['coverage_feature'] = [length1 - np.isnan(line).sum() for line in xdata.X] #每一个样本中覆盖的特征数
xdata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in xdata.X]
xdata.var['coverage_cells'] = [length2 - np.isnan(line).sum() for line in xdata.X.T] #每一个特征在多少个样本中有覆盖
xdata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in xdata.X.T]
In [206]:
Copied!
scanpy_100k.obs
scanpy_100k.obs
Out[206]:
coverage_feature | mean_cell_methylation | |
---|---|---|
0 | ||
GSM2559344_updated.bed | 29384 | 0.795401 |
GSM2557929_updated.bed | 29445 | 0.814468 |
GSM2558161_updated.bed | 29403 | 0.805425 |
GSM2557053_updated.bed | 29477 | 0.792656 |
GSM2558171_updated.bed | 29435 | 0.841988 |
... | ... | ... |
GSM2558767_updated.bed | 29306 | 0.802762 |
GSM2558199_updated.bed | 29309 | 0.827274 |
GSM2558196_updated.bed | 29489 | 0.841542 |
GSM2556858_updated.bed | 29311 | 0.789884 |
GSM2558354_updated.bed | 28520 | 0.750032 |
2365 rows × 2 columns
Filtering low quality cells¶
In [471]:
Copied!
#filter out cells with less than 10000 enhancers covered
scanpy_100k=scanpy_100k[scanpy_100k.obs['coverage_feature']>10000,:].copy()
scanpy_100k
#filter out cells with less than 10000 enhancers covered
scanpy_100k=scanpy_100k[scanpy_100k.obs['coverage_feature']>10000,:].copy()
scanpy_100k
Out[471]:
AnnData object with n_obs × n_vars = 2350 × 30877 obs: 'coverage_feature', 'mean_cell_methylation', 'coverage_cells' var: 'var_names', 'coverage_cells', 'mean_feature_methylation', 'coverage_feature'
Imputation missing data¶
In [472]:
Copied!
adata = epi.pp.imputation_met(scanpy_100k, number_cell_covered=500, imputation_value='mean', save=None, copy=True)
adata = epi.pp.imputation_met(scanpy_100k, number_cell_covered=500, imputation_value='mean', save=None, copy=True)
FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/episcanpy/preprocessing/_readimpute.py:90: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
In [473]:
Copied!
# recalculing qc values
length1 = len(adata.X[0,:])
length2 = len(adata.X[:,0])
adata.obs['coverage_cells'] = [length1 - np.isnan(line).sum() for line in adata.X]
adata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in adata.X]
adata.var['coverage_feature'] = [length2 - np.isnan(line).sum() for line in adata.X.T]
adata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in adata.X.T]
# recalculing qc values
length1 = len(adata.X[0,:])
length2 = len(adata.X[:,0])
adata.obs['coverage_cells'] = [length1 - np.isnan(line).sum() for line in adata.X]
adata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in adata.X]
adata.var['coverage_feature'] = [length2 - np.isnan(line).sum() for line in adata.X.T]
adata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in adata.X.T]
Low dimensional representation¶
In [474]:
Copied!
# perform PCA, neighbor graph, tSNE and UMAP
epi.pp.lazy(adata)
# perform PCA, neighbor graph, tSNE and UMAP
epi.pp.lazy(adata)
In [475]:
Copied!
adata
adata
Out[475]:
AnnData object with n_obs × n_vars = 2350 × 29494 obs: 'coverage_feature', 'mean_cell_methylation', 'coverage_cells' var: 'var_names', 'coverage_cells', 'mean_feature_methylation', 'coverage_feature' uns: 'pca', 'neighbors', 'tsne', 'umap' obsm: 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs' obsp: 'distances', 'connectivities'
In [476]:
Copied!
adata.obs
adata.obs
Out[476]:
coverage_feature | mean_cell_methylation | coverage_cells | |
---|---|---|---|
0 | |||
GSM2559344_updated.bed | 29384 | 0.834547 | 29494 |
GSM2557929_updated.bed | 29445 | 0.853282 | 29494 |
GSM2558161_updated.bed | 29403 | 0.845088 | 29494 |
GSM2557053_updated.bed | 29477 | 0.830104 | 29494 |
GSM2558171_updated.bed | 29435 | 0.882272 | 29494 |
... | ... | ... | ... |
GSM2558767_updated.bed | 29306 | 0.844110 | 29494 |
GSM2558199_updated.bed | 29309 | 0.869876 | 29494 |
GSM2558196_updated.bed | 29489 | 0.881006 | 29494 |
GSM2556858_updated.bed | 29311 | 0.830229 | 29494 |
GSM2558354_updated.bed | 28520 | 0.811096 | 29494 |
2350 rows × 3 columns
In [477]:
Copied!
# 假设你已经有一个 AnnData 对象 adata
# 添加 cell_name 列,内容为当前 obs 的索引值
adata.obs['cell_name'] = adata.obs.index
# 去掉索引中 _updated.bed 后缀
new_index = adata.obs.index.str.replace('_updated.bed', '', regex=False)
# 将修改后的索引赋值回 adata.obs
adata.obs.index = new_index
# 如果还需要,更新 cell_name 列,使其保持和索引一致
adata.obs['cell_name'] = new_index
# 假设你已经有一个 AnnData 对象 adata
# 添加 cell_name 列,内容为当前 obs 的索引值
adata.obs['cell_name'] = adata.obs.index
# 去掉索引中 _updated.bed 后缀
new_index = adata.obs.index.str.replace('_updated.bed', '', regex=False)
# 将修改后的索引赋值回 adata.obs
adata.obs.index = new_index
# 如果还需要,更新 cell_name 列,使其保持和索引一致
adata.obs['cell_name'] = new_index
In [478]:
Copied!
scm.pp.add_meta(adata,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
scm.pp.add_meta(adata,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
No column 'label' found in metadata, 'unknown' is used as the default cell labels No column 'label_color' found in metadata, random color is generated for each cell label
In [218]:
Copied!
adata.obs
adata.obs
Out[218]:
coverage_feature | mean_cell_methylation | coverage_cells | cell_name | Sample_id | Sample | Animal age | FACS date | Brain area | Laminar layer | ... | % Filtered reads | mCCC/CCC | mCG/CG | mCH/CH | Estimated mCG/CG | Estimated mCH/CH | Coverage (%) | Neuron type | tSNE x coordinate | tSNE y coordinate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 29384 | 0.834547 | 29494 | GSM2559344 | GSM2559344 | Pool_9_AD010_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 47.80% | 0.00742 | 0.78505 | 0.03876 | 0.78344 | 0.03157 | 4.64 | hL5-4 | -2.18678 | -12.80550 |
1 | 29445 | 0.853282 | 29494 | GSM2557929 | GSM2557929 | Pool_338_AD006_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 42.40% | 0.01529 | 0.78961 | 0.07792 | 0.78634 | 0.06360 | 5.00 | hL5-4 | -9.15026 | -14.39010 |
2 | 29403 | 0.845088 | 29494 | GSM2558161 | GSM2558161 | Pool_451_AD008_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 43.20% | 0.01145 | 0.77535 | 0.06240 | 0.77275 | 0.05154 | 7.10 | hDL-1 | -9.59999 | -1.82943 |
3 | 29477 | 0.830104 | 29494 | GSM2557053 | GSM2557053 | Pool_1365_AD006_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 39.00% | 0.00983 | 0.76679 | 0.05151 | 0.76447 | 0.04209 | 11.41 | hL2/3 | -16.85230 | 5.09134 |
4 | 29435 | 0.882272 | 29494 | GSM2558171 | GSM2558171 | Pool_457_AD008_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 47.60% | 0.01625 | 0.82785 | 0.08610 | 0.82501 | 0.07100 | 6.76 | hPv-1 | 12.97110 | -6.70419 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2345 | 29306 | 0.844110 | 29494 | GSM2558767 | GSM2558767 | Pool_737_AD002_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 44.60% | 0.01255 | 0.78907 | 0.06757 | 0.78639 | 0.05572 | 3.48 | hL5-4 | -9.11452 | -13.56870 |
2346 | 29309 | 0.869876 | 29494 | GSM2558199 | GSM2558199 | Pool_46_AD008_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 52.00% | 0.01299 | 0.83044 | 0.06731 | 0.82821 | 0.05503 | 4.58 | hVip-2 | 20.16110 | 0.64929 |
2347 | 29489 | 0.881006 | 29494 | GSM2558196 | GSM2558196 | Pool_468_AD010_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 39.70% | 0.01157 | 0.82074 | 0.06483 | 0.81864 | 0.05388 | 9.41 | hSst-2 | 18.23110 | -5.97131 |
2348 | 29311 | 0.830229 | 29494 | GSM2556858 | GSM2556858 | Pool_1315_AD010_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 64.20% | 0.00827 | 0.80346 | 0.03818 | 0.80182 | 0.03016 | 3.54 | hSst-2 | 17.80530 | -4.53349 |
2349 | 28520 | 0.811096 | 29494 | GSM2558354 | GSM2558354 | Pool_543_AD002_indexed | 25 yr | 2012/6/29 | Mid Frontal Gyrus | NaN | ... | 39.30% | 0.00344 | 0.76130 | 0.00503 | 0.76048 | 0.00160 | 4.54 | NaN | NaN | NaN |
2350 rows × 37 columns
In [219]:
Copied!
sc.pl.umap(adata, color=['Library pool', 'Mapped reads',
'Index i5', 'Index i7',
'i5 sequence', 'i7 sequence',
'mCCC/CCC', 'mCG/CG', 'mCH/CH'], wspace=0.8)
sc.pl.umap(adata, color=['Library pool', 'Mapped reads',
'Index i5', 'Index i7',
'i5 sequence', 'i7 sequence',
'mCCC/CCC', 'mCG/CG', 'mCH/CH'], wspace=0.8)
FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
In [479]:
Copied!
annot = []
for n in adata.obs['Neuron type']:
if n in ['hL6-1', 'hL6-2','hL6-3']:
annot.append('deep layer - L6')
elif n in ['hL5-1', 'hL5-2','hL5-3','hL4']:
annot.append('middle layer')
elif n in ['hDL-1', 'hDL-2', 'hDL-3']:
annot.append('deep layer - L5')
elif n in ['hSst-1','hSst-2','hSst-3','hPv-1','hPv-2','hVip-1', 'hVip-2','hNdnf','hNos']:
annot.append('Inhibitory')
else:
annot.append('upper layer - L2/3')
adata.obs['layer'] = annot
# all features
sc.pl.umap(adata, color=['layer', 'Neuron type'],wspace =0.3)
annot = []
for n in adata.obs['Neuron type']:
if n in ['hL6-1', 'hL6-2','hL6-3']:
annot.append('deep layer - L6')
elif n in ['hL5-1', 'hL5-2','hL5-3','hL4']:
annot.append('middle layer')
elif n in ['hDL-1', 'hDL-2', 'hDL-3']:
annot.append('deep layer - L5')
elif n in ['hSst-1','hSst-2','hSst-3','hPv-1','hPv-2','hVip-1', 'hVip-2','hNdnf','hNos']:
annot.append('Inhibitory')
else:
annot.append('upper layer - L2/3')
adata.obs['layer'] = annot
# all features
sc.pl.umap(adata, color=['layer', 'Neuron type'],wspace =0.3)
FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
In [ ]:
Copied!
# for top 5000 features
# for top 5000 features
In [ ]:
Copied!
adata.write('/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scanpy/windows100k.h5ad')
adata.write('/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scanpy/windows100k.h5ad')
In [234]:
Copied!
pca(scm_100k,impute='median')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
pca(scm_100k,impute='median')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
... using all features
In [235]:
Copied!
scm.pl.umap(scm_100k, color=['layer', 'Neuron type'],wspace=0.4)
scm.pl.umap(scm_100k, color=['layer', 'Neuron type'],wspace=0.4)
In [45]:
Copied!
#聚类效果评价
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
this_step = 0
this_min = float(range_min)
this_max = float(range_max)
while this_step < max_steps:
print('step ' + str(this_step))
this_resolution = this_min + ((this_max-this_min)/2)
sc.tl.leiden(adata,resolution=this_resolution)
this_clusters = adata.obs['leiden'].nunique()
print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
if this_clusters > n_cluster:
this_max = this_resolution
elif this_clusters < n_cluster:
this_min = this_resolution
else:
return(this_resolution, adata)
this_step += 1
print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + + str(this_resolution))
#聚类效果评价
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
this_step = 0
this_min = float(range_min)
this_max = float(range_max)
while this_step < max_steps:
print('step ' + str(this_step))
this_resolution = this_min + ((this_max-this_min)/2)
sc.tl.leiden(adata,resolution=this_resolution)
this_clusters = adata.obs['leiden'].nunique()
print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
if this_clusters > n_cluster:
this_max = this_resolution
elif this_clusters < n_cluster:
this_min = this_resolution
else:
return(this_resolution, adata)
this_step += 1
print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + + str(this_resolution))
In [46]:
Copied!
num_clusters = scm_100k_filter.obs['Neuron type'].nunique()
getNClusters(scm_100k_filter,n_cluster=num_clusters)
num_clusters = scm_100k_filter.obs['Neuron type'].nunique()
getNClusters(scm_100k_filter,n_cluster=num_clusters)
step 0 got 17 at resolution 1.5 step 1 got 20 at resolution 2.25 step 2 got 23 at resolution 2.625 step 3 got 23 at resolution 2.4375 step 4 got 20 at resolution 2.34375 step 5 got 22 at resolution 2.390625 step 6 got 22 at resolution 2.3671875 step 7 got 22 at resolution 2.35546875 step 8 got 22 at resolution 2.349609375 step 9 got 21 at resolution 2.3466796875
Out[46]:
(2.3466796875, AnnData object with n_obs × n_vars = 2313 × 29509 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks', 'layer', 'leiden' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'mean_sr', 'sr_dispersion', 'log_sr_dispersion', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random', 'feature_select_residual' uns: 'workdir', 'label_color', 'relative_pca_var_ratios', 'neighbors', 'umap', 'Neuron type_colors', 'X_pca_var_ratios', 'Brain area_colors', 'layer_colors', 'tsne', 'leiden' obsm: 'relative_pca', 'X_umap', 'X_pca', 'X_tsne' layers: 'relative' obsp: 'distances', 'connectivities')
In [47]:
Copied!
scm.pl.dendrogram(scm_100k_filter,groupby='leiden')
scm.pl.dendrogram(scm_100k_filter,groupby='leiden')
[ 5. 15.] [0. 0.] [25. 35.] [0. 0.] [75. 85.] [0. 0.] [65.] [0.] [55.] [0.] [45.] [0.] [ 95. 105.] [0. 0.] [125. 135.] [0. 0.] [115.] [0.] [155. 165.] [0. 0.] [145.] [0.] [175. 185.] [0. 0.] [195. 205.] [0. 0.]
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/tools/_dendrogram.py:135: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. mean_df = rep_df.groupby(level=0).mean()
Out[47]:
<Axes: >
In [48]:
Copied!
scm.pl.umap(scm_100k_filter, color=['leiden', 'Neuron type'],wspace=0.4)
scm.pl.umap(scm_100k_filter, color=['leiden', 'Neuron type'],wspace=0.4)
In [18]:
Copied!
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
Adjust metrics¶
In [40]:
Copied!
df_metrics = pd.DataFrame(columns=['ARI_Louvain',
'AMI_Louvain',
'Homogeneity_Louvain'])
df_metrics = pd.DataFrame(columns=['ARI_Louvain',
'AMI_Louvain',
'Homogeneity_Louvain'])
In [251]:
Copied!
adata = scm_100k_filter
adata.obs['label'] = adata.obs['Neuron type']
method = 'filer_residual_dispersion_all_feature'
df_metrics = pd.DataFrame(columns=['ARI_Louvain',
'AMI_Louvain',
'Homogeneity_Louvain'])
#有空值
# 添加一个新的类别 'unknown' 并用它来填充空值
adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
adata.obs['label'].fillna('unknown', inplace=True)
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
adata = scm_100k_filter
adata.obs['label'] = adata.obs['Neuron type']
method = 'filer_residual_dispersion_all_feature'
df_metrics = pd.DataFrame(columns=['ARI_Louvain',
'AMI_Louvain',
'Homogeneity_Louvain'])
#有空值
# 添加一个新的类别 'unknown' 并用它来填充空值
adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
adata.obs['label'].fillna('unknown', inplace=True)
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
FutureWarning:/tmp/ipykernel_71005/593313683.py:12: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
In [42]:
Copied!
scm_100k_filter
scm_100k_filter
Out[42]:
AnnData object with n_obs × n_vars = 2313 × 29485 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks', 'label', 'leiden' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'mean_sr', 'sr_dispersion', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random', 'feature_select_residual' uns: 'workdir', 'label_color', 'relative_pca_var_ratios', 'neighbors', 'umap', 'leiden' obsm: 'relative_pca', 'X_umap', 'scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap' layers: 'relative' obsp: 'distances', 'connectivities'
In [43]:
Copied!
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = [3000,5000,10000,29485]
clusters = scm_100k_filter.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
for i, n in enumerate(feature_num):
feature_select(scm_100k_filter,number=n)
for j, f in enumerate(feature_select_method):
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs=f)
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
adata = scm_100k_filter
adata.obs['label'] = adata.obs['Neuron type']
method = f + '_' + str(n)
sc.tl.leiden(adata)
#有空值
# 添加一个新的类别 'unknown' 并用它来填充空值
adata.obs['label'].fillna('unknown', inplace=True)
# adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
# adata.obs['label'].fillna('unknown', inplace=True)
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
scm_100k_filter.obsm[method+'_umap'] = adata.obsm["X_umap"]
# axes[j,i].scatter(adata.obsm["X_tsne"],c=colors9[celltypes_numeric],s=2,rasterized=True)
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = [3000,5000,10000,29485]
clusters = scm_100k_filter.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
for i, n in enumerate(feature_num):
feature_select(scm_100k_filter,number=n)
for j, f in enumerate(feature_select_method):
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs=f)
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
adata = scm_100k_filter
adata.obs['label'] = adata.obs['Neuron type']
method = f + '_' + str(n)
sc.tl.leiden(adata)
#有空值
# 添加一个新的类别 'unknown' 并用它来填充空值
adata.obs['label'].fillna('unknown', inplace=True)
# adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
# adata.obs['label'].fillna('unknown', inplace=True)
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
scm_100k_filter.obsm[method+'_umap'] = adata.obsm["X_umap"]
# axes[j,i].scatter(adata.obsm["X_tsne"],c=colors9[celltypes_numeric],s=2,rasterized=True)
(2313, 29485) ... using top variable features based on feature_select_random
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_var
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_sr
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_dispersion
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_residual
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
(2313, 29485) ... using top variable features based on feature_select_random
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_var
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_sr
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_dispersion
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_residual
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
(2313, 29485) ... using top variable features based on feature_select_random
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_var
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_sr
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_dispersion
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_residual
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
(2313, 29485) ... using top variable features based on feature_select_random
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_var
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_sr
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_dispersion
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_residual
/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
In [51]:
Copied!
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = [3000,5000,10000,29485]
clusters = scm_100k_filter.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
for i, n in enumerate(feature_num):
feature_select(scm_100k_filter,number=n)
for j, f in enumerate(feature_select_method):
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs=f,layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
adata = scm_100k_filter
adata.obs['label'] = adata.obs['Neuron type']
method = 'scm_'+ f + '_' + str(n)
sc.tl.leiden(adata)
#有空值
# 添加一个新的类别 'unknown' 并用它来填充空值
#adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
adata.obs['label'].fillna('unknown', inplace=True)
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
scm_100k_filter.obsm['scm_' + method + '_umap'] = adata.obsm["X_umap"]
# axes[j,i].scatter(adata.obsm["X_tsne"],c=colors9[celltypes_numeric],s=2,rasterized=True)
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = [3000,5000,10000,29485]
clusters = scm_100k_filter.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
for i, n in enumerate(feature_num):
feature_select(scm_100k_filter,number=n)
for j, f in enumerate(feature_select_method):
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs=f,layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
adata = scm_100k_filter
adata.obs['label'] = adata.obs['Neuron type']
method = 'scm_'+ f + '_' + str(n)
sc.tl.leiden(adata)
#有空值
# 添加一个新的类别 'unknown' 并用它来填充空值
#adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
adata.obs['label'].fillna('unknown', inplace=True)
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
scm_100k_filter.obsm['scm_' + method + '_umap'] = adata.obsm["X_umap"]
# axes[j,i].scatter(adata.obsm["X_tsne"],c=colors9[celltypes_numeric],s=2,rasterized=True)
(2313, 29485) ... using top variable features based on feature_select_random
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_var
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_sr
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_dispersion
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_residual
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
(2313, 29485) ... using top variable features based on feature_select_random
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_var
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_sr
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_dispersion
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_residual
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
(2313, 29485) ... using top variable features based on feature_select_random
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_var
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_sr
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_dispersion
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_residual
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
(2313, 29485) ... using top variable features based on feature_select_random
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_var
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_sr
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_dispersion
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
... using top variable features based on feature_select_residual
/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. adata.obs['label'].fillna('unknown', inplace=True)
In [38]:
Copied!
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = ['3000','5000','10000']
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = ['3000','5000','10000']
In [54]:
Copied!
df_metrics
df_metrics
Out[54]:
ARI_Louvain | AMI_Louvain | Homogeneity_Louvain | ARI_leiden | AMI_leiden | Homogeneity_leiden | method | feature_number | |
---|---|---|---|---|---|---|---|---|
0 | NaN | NaN | NaN | 0.456784 | 0.674592 | 0.600334 | NaN | NaN |
1 | NaN | NaN | NaN | 0.574402 | 0.660247 | 0.570550 | NaN | NaN |
2 | NaN | NaN | NaN | 0.491452 | 0.751388 | 0.714291 | NaN | NaN |
3 | NaN | NaN | NaN | 0.543763 | 0.675866 | 0.586256 | NaN | NaN |
4 | NaN | NaN | NaN | 0.434316 | 0.733150 | 0.724680 | NaN | NaN |
5 | NaN | NaN | NaN | 0.471703 | 0.722031 | 0.693817 | NaN | NaN |
6 | NaN | NaN | NaN | 0.513774 | 0.696885 | 0.644075 | NaN | NaN |
7 | NaN | NaN | NaN | 0.500053 | 0.773307 | 0.787560 | NaN | NaN |
8 | NaN | NaN | NaN | 0.507755 | 0.707217 | 0.637069 | NaN | NaN |
9 | NaN | NaN | NaN | 0.481533 | 0.767520 | 0.784143 | NaN | NaN |
10 | NaN | NaN | NaN | 0.448819 | 0.731739 | 0.737025 | NaN | NaN |
11 | NaN | NaN | NaN | 0.428412 | 0.724399 | 0.717524 | NaN | NaN |
12 | NaN | NaN | NaN | 0.476988 | 0.762610 | 0.781313 | NaN | NaN |
13 | NaN | NaN | NaN | 0.425266 | 0.729153 | 0.699094 | NaN | NaN |
14 | NaN | NaN | NaN | 0.482629 | 0.767571 | 0.790160 | NaN | NaN |
15 | NaN | NaN | NaN | 0.403792 | 0.745666 | 0.782104 | NaN | NaN |
16 | NaN | NaN | NaN | 0.403792 | 0.745666 | 0.782104 | NaN | NaN |
17 | NaN | NaN | NaN | 0.403792 | 0.745666 | 0.782104 | NaN | NaN |
18 | NaN | NaN | NaN | 0.403792 | 0.745666 | 0.782104 | NaN | NaN |
19 | NaN | NaN | NaN | 0.403792 | 0.745666 | 0.782104 | NaN | NaN |
20 | NaN | NaN | NaN | 0.535869 | 0.773287 | 0.795181 | scm_feature_select_random_ | 3000 |
21 | NaN | NaN | NaN | 0.476417 | 0.743845 | 0.725840 | scm_feature_select_var_ | 3000 |
22 | NaN | NaN | NaN | 0.483194 | 0.779870 | 0.819663 | scm_feature_select_sr_ | 3000 |
23 | NaN | NaN | NaN | 0.424999 | 0.732045 | 0.703400 | scm_feature_select_dispersion_ | 3000 |
24 | NaN | NaN | NaN | 0.430886 | 0.769783 | 0.815982 | scm_feature_select_residual_ | 3000 |
25 | NaN | NaN | NaN | 0.460182 | 0.764282 | 0.789593 | scm_feature_select_random_ | 5000 |
26 | NaN | NaN | NaN | 0.452659 | 0.761496 | 0.783954 | scm_feature_select_var_ | 5000 |
27 | NaN | NaN | NaN | 0.428122 | 0.777929 | 0.844399 | scm_feature_select_sr_ | 5000 |
28 | NaN | NaN | NaN | 0.462310 | 0.769854 | 0.789328 | scm_feature_select_dispersion_ | 5000 |
29 | NaN | NaN | NaN | 0.432611 | 0.776875 | 0.834491 | scm_feature_select_residual_ | 5000 |
30 | NaN | NaN | NaN | 0.466384 | 0.776299 | 0.822934 | scm_feature_select_random_ | 10000 |
31 | NaN | NaN | NaN | 0.420865 | 0.771338 | 0.830959 | scm_feature_select_var_ | 10000 |
32 | NaN | NaN | NaN | 0.427085 | 0.776544 | 0.843158 | scm_feature_select_sr_ | 10000 |
33 | NaN | NaN | NaN | 0.464543 | 0.778559 | 0.829990 | scm_feature_select_dispersion_ | 10000 |
34 | NaN | NaN | NaN | 0.429921 | 0.778979 | 0.845232 | scm_feature_select_residual_ | 10000 |
35 | NaN | NaN | NaN | 0.410057 | 0.767987 | 0.827087 | scm_feature_select_random_ | 29485 |
36 | NaN | NaN | NaN | 0.410057 | 0.767987 | 0.827087 | scm_feature_select_var_ | 29485 |
37 | NaN | NaN | NaN | 0.410057 | 0.767987 | 0.827087 | scm_feature_select_sr_ | 29485 |
38 | NaN | NaN | NaN | 0.410057 | 0.767987 | 0.827087 | scm_feature_select_dispersion_ | 29485 |
39 | NaN | NaN | NaN | 0.410057 | 0.767987 | 0.827087 | scm_feature_select_residual_ | 29485 |
In [29]:
Copied!
# 之前做了两遍所以删除前 20 行
#df_metrics = df_metrics.drop(df_metrics.index[:20])
df_metrics
# 之前做了两遍所以删除前 20 行
#df_metrics = df_metrics.drop(df_metrics.index[:20])
df_metrics
Out[29]:
ARI_Louvain | AMI_Louvain | Homogeneity_Louvain | ARI_leiden | AMI_leiden | Homogeneity_leiden | |
---|---|---|---|---|---|---|
feature_select_random_3000 | NaN | NaN | NaN | 0.458048 | 0.685203 | 0.622874 |
feature_select_var_3000 | NaN | NaN | NaN | 0.574402 | 0.660247 | 0.570550 |
feature_select_sr_3000 | NaN | NaN | NaN | 0.491452 | 0.751388 | 0.714291 |
feature_select_dispersion_3000 | NaN | NaN | NaN | 0.543763 | 0.675866 | 0.586256 |
feature_select_residual_3000 | NaN | NaN | NaN | 0.434316 | 0.733150 | 0.724680 |
feature_select_random_5000 | NaN | NaN | NaN | 0.445819 | 0.710908 | 0.683368 |
feature_select_var_5000 | NaN | NaN | NaN | 0.513774 | 0.696885 | 0.644075 |
feature_select_sr_5000 | NaN | NaN | NaN | 0.500053 | 0.773307 | 0.787560 |
feature_select_dispersion_5000 | NaN | NaN | NaN | 0.507755 | 0.707217 | 0.637069 |
feature_select_residual_5000 | NaN | NaN | NaN | 0.481533 | 0.767520 | 0.784143 |
feature_select_random_10000 | NaN | NaN | NaN | 0.485736 | 0.753611 | 0.772558 |
feature_select_var_10000 | NaN | NaN | NaN | 0.428412 | 0.724399 | 0.717524 |
feature_select_sr_10000 | NaN | NaN | NaN | 0.476988 | 0.762610 | 0.781313 |
feature_select_dispersion_10000 | NaN | NaN | NaN | 0.425266 | 0.729153 | 0.699094 |
feature_select_residual_10000 | NaN | NaN | NaN | 0.482629 | 0.767571 | 0.790160 |
scm_feature_select_random_3000 | NaN | NaN | NaN | 0.534501 | 0.785227 | 0.798622 |
scm_feature_select_var_3000 | NaN | NaN | NaN | 0.476417 | 0.743845 | 0.725840 |
scm_feature_select_sr_3000 | NaN | NaN | NaN | 0.483194 | 0.779870 | 0.819663 |
scm_feature_select_dispersion_3000 | NaN | NaN | NaN | 0.424999 | 0.732045 | 0.703400 |
scm_feature_select_residual_3000 | NaN | NaN | NaN | 0.430886 | 0.769783 | 0.815982 |
scm_feature_select_random_5000 | NaN | NaN | NaN | 0.470246 | 0.769720 | 0.800332 |
scm_feature_select_var_5000 | NaN | NaN | NaN | 0.452659 | 0.761496 | 0.783954 |
scm_feature_select_sr_5000 | NaN | NaN | NaN | 0.428122 | 0.777929 | 0.844399 |
scm_feature_select_dispersion_5000 | NaN | NaN | NaN | 0.462310 | 0.769854 | 0.789328 |
scm_feature_select_residual_5000 | NaN | NaN | NaN | 0.432611 | 0.776875 | 0.834491 |
scm_feature_select_random_10000 | NaN | NaN | NaN | 0.469592 | 0.785871 | 0.833530 |
scm_feature_select_var_10000 | NaN | NaN | NaN | 0.420865 | 0.771338 | 0.830959 |
scm_feature_select_sr_10000 | NaN | NaN | NaN | 0.427085 | 0.776544 | 0.843158 |
scm_feature_select_dispersion_10000 | NaN | NaN | NaN | 0.464543 | 0.778559 | 0.829990 |
scm_feature_select_residual_10000 | NaN | NaN | NaN | 0.429921 | 0.778979 | 0.845232 |
In [53]:
Copied!
# 重置索引,将索引列转为普通列
df = df_metrics.reset_index()
# 使用正则表达式提取前缀和数字
df[['method', 'feature_number']] = df['index'].str.extract(r'(^\D+)(\d+$)')
# 将提取出的数字列中的空值填充为 'all'
df['feature_number'] = df['feature_number'].replace('', 'all')
# 删除原始的 'index' 列
df = df.drop(columns=['index'])
df_metrics = df
# 重置索引,将索引列转为普通列
df = df_metrics.reset_index()
# 使用正则表达式提取前缀和数字
df[['method', 'feature_number']] = df['index'].str.extract(r'(^\D+)(\d+$)')
# 将提取出的数字列中的空值填充为 'all'
df['feature_number'] = df['feature_number'].replace('', 'all')
# 删除原始的 'index' 列
df = df.drop(columns=['index'])
df_metrics = df
In [31]:
Copied!
df_metrics.to_csv('feature_metrics.csv') # 不存储索引
df_metrics.to_csv('feature_metrics.csv') # 不存储索引
In [46]:
Copied!
def get_plot_df(res):
gene_selection_via = []
k = []
dim_reduction_method = []
ari = []
ami = []
homogeneity = []
# 使用 DataFrame 的列名直接访问数据并添加到各自的列表
for idx, row in res.iterrows():
gene_selection_via.append(row['method'])
k.append(row['feature_number'])
ari.append(row['ARI_leiden'])
ami.append(row['AMI_leiden'])
homogeneity.append(row['Homogeneity_leiden'])
data_dict = dict(gene_selection_via=gene_selection_via,
k=k,
ari = ari,
ami = ami,
homogeneity = homogeneity
)
data = pd.DataFrame(data_dict)
return data.melt(id_vars=['gene_selection_via','k'])
df = get_plot_df(df_metrics)
df = df.reset_index(drop=True)
def get_plot_df(res):
gene_selection_via = []
k = []
dim_reduction_method = []
ari = []
ami = []
homogeneity = []
# 使用 DataFrame 的列名直接访问数据并添加到各自的列表
for idx, row in res.iterrows():
gene_selection_via.append(row['method'])
k.append(row['feature_number'])
ari.append(row['ARI_leiden'])
ami.append(row['AMI_leiden'])
homogeneity.append(row['Homogeneity_leiden'])
data_dict = dict(gene_selection_via=gene_selection_via,
k=k,
ari = ari,
ami = ami,
homogeneity = homogeneity
)
data = pd.DataFrame(data_dict)
return data.melt(id_vars=['gene_selection_via','k'])
df = get_plot_df(df_metrics)
df = df.reset_index(drop=True)
In [47]:
Copied!
df
df
Out[47]:
gene_selection_via | k | variable | value | |
---|---|---|---|---|
0 | feature_select_random_ | 3000 | ari | 0.456784 |
1 | feature_select_var_ | 3000 | ari | 0.574402 |
2 | feature_select_sr_ | 3000 | ari | 0.491452 |
3 | feature_select_dispersion_ | 3000 | ari | 0.543763 |
4 | feature_select_residual_ | 3000 | ari | 0.434316 |
5 | feature_select_random_ | 5000 | ari | 0.471703 |
6 | feature_select_var_ | 5000 | ari | 0.513774 |
7 | feature_select_sr_ | 5000 | ari | 0.500053 |
8 | feature_select_dispersion_ | 5000 | ari | 0.507755 |
9 | feature_select_residual_ | 5000 | ari | 0.481533 |
10 | feature_select_random_ | 10000 | ari | 0.448819 |
11 | feature_select_var_ | 10000 | ari | 0.428412 |
12 | feature_select_sr_ | 10000 | ari | 0.476988 |
13 | feature_select_dispersion_ | 10000 | ari | 0.425266 |
14 | feature_select_residual_ | 10000 | ari | 0.482629 |
15 | feature_select_random_ | 29485 | ari | 0.403792 |
16 | feature_select_var_ | 29485 | ari | 0.403792 |
17 | feature_select_sr_ | 29485 | ari | 0.403792 |
18 | feature_select_dispersion_ | 29485 | ari | 0.403792 |
19 | feature_select_residual_ | 29485 | ari | 0.403792 |
20 | feature_select_random_ | 3000 | ami | 0.674592 |
21 | feature_select_var_ | 3000 | ami | 0.660247 |
22 | feature_select_sr_ | 3000 | ami | 0.751388 |
23 | feature_select_dispersion_ | 3000 | ami | 0.675866 |
24 | feature_select_residual_ | 3000 | ami | 0.733150 |
25 | feature_select_random_ | 5000 | ami | 0.722031 |
26 | feature_select_var_ | 5000 | ami | 0.696885 |
27 | feature_select_sr_ | 5000 | ami | 0.773307 |
28 | feature_select_dispersion_ | 5000 | ami | 0.707217 |
29 | feature_select_residual_ | 5000 | ami | 0.767520 |
30 | feature_select_random_ | 10000 | ami | 0.731739 |
31 | feature_select_var_ | 10000 | ami | 0.724399 |
32 | feature_select_sr_ | 10000 | ami | 0.762610 |
33 | feature_select_dispersion_ | 10000 | ami | 0.729153 |
34 | feature_select_residual_ | 10000 | ami | 0.767571 |
35 | feature_select_random_ | 29485 | ami | 0.745666 |
36 | feature_select_var_ | 29485 | ami | 0.745666 |
37 | feature_select_sr_ | 29485 | ami | 0.745666 |
38 | feature_select_dispersion_ | 29485 | ami | 0.745666 |
39 | feature_select_residual_ | 29485 | ami | 0.745666 |
40 | feature_select_random_ | 3000 | homogeneity | 0.600334 |
41 | feature_select_var_ | 3000 | homogeneity | 0.570550 |
42 | feature_select_sr_ | 3000 | homogeneity | 0.714291 |
43 | feature_select_dispersion_ | 3000 | homogeneity | 0.586256 |
44 | feature_select_residual_ | 3000 | homogeneity | 0.724680 |
45 | feature_select_random_ | 5000 | homogeneity | 0.693817 |
46 | feature_select_var_ | 5000 | homogeneity | 0.644075 |
47 | feature_select_sr_ | 5000 | homogeneity | 0.787560 |
48 | feature_select_dispersion_ | 5000 | homogeneity | 0.637069 |
49 | feature_select_residual_ | 5000 | homogeneity | 0.784143 |
50 | feature_select_random_ | 10000 | homogeneity | 0.737025 |
51 | feature_select_var_ | 10000 | homogeneity | 0.717524 |
52 | feature_select_sr_ | 10000 | homogeneity | 0.781313 |
53 | feature_select_dispersion_ | 10000 | homogeneity | 0.699094 |
54 | feature_select_residual_ | 10000 | homogeneity | 0.790160 |
55 | feature_select_random_ | 29485 | homogeneity | 0.782104 |
56 | feature_select_var_ | 29485 | homogeneity | 0.782104 |
57 | feature_select_sr_ | 29485 | homogeneity | 0.782104 |
58 | feature_select_dispersion_ | 29485 | homogeneity | 0.782104 |
59 | feature_select_residual_ | 29485 | homogeneity | 0.782104 |
In [49]:
Copied!
# df = df.rename(columns={'variable':'metric','value':'metric value'})
plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=df,kind='point',col='variable',)
for ax in plot.axes.flatten():
for item in ax.get_xticklabels():
item.set_rotation(45)
plot.savefig('./figures/feature_metric_new_mean_epi.pdf', dpi=300, format=None)
# df = df.rename(columns={'variable':'metric','value':'metric value'})
plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=df,kind='point',col='variable',)
for ax in plot.axes.flatten():
for item in ax.get_xticklabels():
item.set_rotation(45)
plot.savefig('./figures/feature_metric_new_mean_epi.pdf', dpi=300, format=None)
In [67]:
Copied!
adata.write('w100k_after.h5ad')
adata.write('w100k_after.h5ad')
In [59]:
Copied!
#tsnes = ['feature_select_random_1000_umap', 'feature_select_var_1000_umap', 'feature_select_sr_1000_umap', 'feature_select_dispersion_1000_umap', 'feature_select_residual_1000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap','feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap','X_umap']
tsnes = ['scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap', 'feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap', 'feature_select_random_29485_umap', 'feature_select_var_29485_umap', 'feature_select_sr_29485_umap', 'feature_select_dispersion_29485_umap', 'feature_select_residual_29485_umap', 'scm_scm_feature_select_random_29485_umap', 'scm_scm_feature_select_var_29485_umap', 'scm_scm_feature_select_sr_29485_umap', 'scm_scm_feature_select_dispersion_29485_umap', 'scm_scm_feature_select_residual_29485_umap']
#tsnes = ['feature_select_random_1000_umap', 'feature_select_var_1000_umap', 'feature_select_sr_1000_umap', 'feature_select_dispersion_1000_umap', 'feature_select_residual_1000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap','feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap','X_umap']
tsnes = ['scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap', 'feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap', 'feature_select_random_29485_umap', 'feature_select_var_29485_umap', 'feature_select_sr_29485_umap', 'feature_select_dispersion_29485_umap', 'feature_select_residual_29485_umap', 'scm_scm_feature_select_random_29485_umap', 'scm_scm_feature_select_var_29485_umap', 'scm_scm_feature_select_sr_29485_umap', 'scm_scm_feature_select_dispersion_29485_umap', 'scm_scm_feature_select_residual_29485_umap']
In [64]:
Copied!
len(tsnes)
len(tsnes)
Out[64]:
40
In [63]:
Copied!
import matplotlib.pyplot as plt
import seaborn as sns
title_fontsize=16
title_fontweight = "bold"
clusters = adata.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
titles = ['scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap', 'feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap', 'feature_select_random_29485_umap', 'feature_select_var_29485_umap', 'feature_select_sr_29485_umap', 'feature_select_dispersion_29485_umap', 'feature_select_residual_29485_umap', 'scm_scm_feature_select_random_29485_umap', 'scm_scm_feature_select_var_29485_umap', 'scm_scm_feature_select_sr_29485_umap', 'scm_scm_feature_select_dispersion_29485_umap', 'scm_scm_feature_select_residual_29485_umap']
with sns.plotting_context('paper'):
plt.figure(figsize=(41,41))
for i, (t,title) in enumerate(zip(tsnes,titles)):
tsne = adata.obsm[t]
plt.subplot(5,8,i +1)
ax=plt.gca()
# plt.text(0.05,1.015,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
color = points.get_facecolor().flatten()
# 在群集的中心位置添加注释,注释内容为群集的ID
text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
# 创建背景框对象,并设置背景框的样式
bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
plt.setp(text, bbox=bbox_props)
plt.tight_layout()
sns.despine(left=True,bottom=True)
plt.savefig('./figures/feature_umap_all.pdf', dpi=300, format=None)
import matplotlib.pyplot as plt
import seaborn as sns
title_fontsize=16
title_fontweight = "bold"
clusters = adata.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
titles = ['scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap', 'feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap', 'feature_select_random_29485_umap', 'feature_select_var_29485_umap', 'feature_select_sr_29485_umap', 'feature_select_dispersion_29485_umap', 'feature_select_residual_29485_umap', 'scm_scm_feature_select_random_29485_umap', 'scm_scm_feature_select_var_29485_umap', 'scm_scm_feature_select_sr_29485_umap', 'scm_scm_feature_select_dispersion_29485_umap', 'scm_scm_feature_select_residual_29485_umap']
with sns.plotting_context('paper'):
plt.figure(figsize=(41,41))
for i, (t,title) in enumerate(zip(tsnes,titles)):
tsne = adata.obsm[t]
plt.subplot(5,8,i +1)
ax=plt.gca()
# plt.text(0.05,1.015,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
color = points.get_facecolor().flatten()
# 在群集的中心位置添加注释,注释内容为群集的ID
text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
# 创建背景框对象,并设置背景框的样式
bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
plt.setp(text, bbox=bbox_props)
plt.tight_layout()
sns.despine(left=True,bottom=True)
plt.savefig('./figures/feature_umap_all.pdf', dpi=300, format=None)
In [66]:
Copied!
import matplotlib.pyplot as plt
import seaborn as sns
title_fontsize=20
title_fontweight = "bold"
tsnes = ['feature_select_random_3000_umap','feature_select_random_5000_umap','feature_select_random_10000_umap','feature_select_random_29485_umap',
'feature_select_var_3000_umap','feature_select_var_5000_umap','feature_select_var_10000_umap','feature_select_var_29485_umap',
'feature_select_dispersion_3000_umap','feature_select_dispersion_5000_umap','feature_select_dispersion_10000_umap','feature_select_dispersion_29485_umap',
'scm_scm_feature_select_random_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_var_10000_umap', 'feature_select_random_29485_umap',
'scm_scm_feature_select_var_3000_umap','scm_scm_feature_select_var_5000_umap','scm_scm_feature_select_var_10000_umap','scm_scm_feature_select_var_29485_umap',
'scm_scm_feature_select_sr_3000_umap','scm_scm_feature_select_sr_5000_umap','scm_scm_feature_select_sr_10000_umap','scm_scm_feature_select_sr_29485_umap',
'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_dispersion_5000_umap','scm_scm_feature_select_dispersion_10000_umap','scm_scm_feature_select_dispersion_29485_umap',
'scm_scm_feature_select_residual_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_residual_10000_umap','scm_scm_feature_select_residual_29485_umap',
]
clusters = adata.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
titles = ['feature_select_random_3000_umap','feature_select_random_5000_umap','feature_select_random_10000_umap','feature_select_random_29485_umap',
'feature_select_var_3000_umap','feature_select_var_5000_umap','feature_select_var_10000_umap','feature_select_var_29485_umap',
'feature_select_dispersion_3000_umap','feature_select_dispersion_5000_umap','feature_select_dispersion_10000_umap','feature_select_dispersion_29485_umap',
'scm_scm_feature_select_random_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_var_10000_umap', 'feature_select_random_29485_umap',
'scm_scm_feature_select_var_3000_umap','scm_scm_feature_select_var_5000_umap','scm_scm_feature_select_var_10000_umap','scm_scm_feature_select_var_29485_umap',
'scm_scm_feature_select_sr_3000_umap','scm_scm_feature_select_sr_5000_umap','scm_scm_feature_select_sr_10000_umap','scm_scm_feature_select_sr_29485_umap',
'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_dispersion_5000_umap','scm_scm_feature_select_dispersion_10000_umap','scm_scm_feature_select_dispersion_29485_umap',
'scm_scm_feature_select_residual_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_residual_10000_umap','scm_scm_feature_select_residual_29485_umap',
]
with sns.plotting_context('paper'):
plt.figure(figsize=(41,41))
for i, (t,title) in enumerate(zip(tsnes,titles)):
tsne = adata.obsm[t]
plt.subplot(8,4,i +1)
ax=plt.gca()
# plt.text(0.05,1.015,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
color = points.get_facecolor().flatten()
# 在群集的中心位置添加注释,注释内容为群集的ID
text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
# 创建背景框对象,并设置背景框的样式
bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
plt.setp(text, bbox=bbox_props)
plt.tight_layout()
sns.despine(left=True,bottom=True)
plt.savefig('./figures/feature_umap_all_v2.pdf', dpi=300, format=None)
import matplotlib.pyplot as plt
import seaborn as sns
title_fontsize=20
title_fontweight = "bold"
tsnes = ['feature_select_random_3000_umap','feature_select_random_5000_umap','feature_select_random_10000_umap','feature_select_random_29485_umap',
'feature_select_var_3000_umap','feature_select_var_5000_umap','feature_select_var_10000_umap','feature_select_var_29485_umap',
'feature_select_dispersion_3000_umap','feature_select_dispersion_5000_umap','feature_select_dispersion_10000_umap','feature_select_dispersion_29485_umap',
'scm_scm_feature_select_random_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_var_10000_umap', 'feature_select_random_29485_umap',
'scm_scm_feature_select_var_3000_umap','scm_scm_feature_select_var_5000_umap','scm_scm_feature_select_var_10000_umap','scm_scm_feature_select_var_29485_umap',
'scm_scm_feature_select_sr_3000_umap','scm_scm_feature_select_sr_5000_umap','scm_scm_feature_select_sr_10000_umap','scm_scm_feature_select_sr_29485_umap',
'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_dispersion_5000_umap','scm_scm_feature_select_dispersion_10000_umap','scm_scm_feature_select_dispersion_29485_umap',
'scm_scm_feature_select_residual_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_residual_10000_umap','scm_scm_feature_select_residual_29485_umap',
]
clusters = adata.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
titles = ['feature_select_random_3000_umap','feature_select_random_5000_umap','feature_select_random_10000_umap','feature_select_random_29485_umap',
'feature_select_var_3000_umap','feature_select_var_5000_umap','feature_select_var_10000_umap','feature_select_var_29485_umap',
'feature_select_dispersion_3000_umap','feature_select_dispersion_5000_umap','feature_select_dispersion_10000_umap','feature_select_dispersion_29485_umap',
'scm_scm_feature_select_random_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_var_10000_umap', 'feature_select_random_29485_umap',
'scm_scm_feature_select_var_3000_umap','scm_scm_feature_select_var_5000_umap','scm_scm_feature_select_var_10000_umap','scm_scm_feature_select_var_29485_umap',
'scm_scm_feature_select_sr_3000_umap','scm_scm_feature_select_sr_5000_umap','scm_scm_feature_select_sr_10000_umap','scm_scm_feature_select_sr_29485_umap',
'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_dispersion_5000_umap','scm_scm_feature_select_dispersion_10000_umap','scm_scm_feature_select_dispersion_29485_umap',
'scm_scm_feature_select_residual_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_residual_10000_umap','scm_scm_feature_select_residual_29485_umap',
]
with sns.plotting_context('paper'):
plt.figure(figsize=(41,41))
for i, (t,title) in enumerate(zip(tsnes,titles)):
tsne = adata.obsm[t]
plt.subplot(8,4,i +1)
ax=plt.gca()
# plt.text(0.05,1.015,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
color = points.get_facecolor().flatten()
# 在群集的中心位置添加注释,注释内容为群集的ID
text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
# 创建背景框对象,并设置背景框的样式
bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
plt.setp(text, bbox=bbox_props)
plt.tight_layout()
sns.despine(left=True,bottom=True)
plt.savefig('./figures/feature_umap_all_v2.pdf', dpi=300, format=None)
In [62]:
Copied!
plt.savefig('./figures/feature_umap_all.pdf', dpi=300, format=None)
plt.savefig('./figures/feature_umap_all.pdf', dpi=300, format=None)
<Figure size 640x480 with 0 Axes>
Feature¶
In [302]:
Copied!
gtf_file='/xtdisk/methbank_baoym/zongwt/single/genome/human/gencode.v41.basic.annotation.gtf'
reference = scm.dm.parse_gtf(gtf_file)
gtf_file='/xtdisk/methbank_baoym/zongwt/single/genome/human/gencode.v41.basic.annotation.gtf'
reference = scm.dm.parse_gtf(gtf_file)
... Loading gene references ... Done
In [303]:
Copied!
scm.dm.annotation(adata,ref=reference)
scm.dm.annotation(adata,ref=reference)
... Loading regions info ... Overlapping genes with regions chr1 100001 200000 1 chr1 HAVANA gene 65419 71585 . + . gene_id "ENSG00000186092.7";gene_type "protein_coding";gene_name "OR4F5";level 2;hgnc_id "HGNC:14825";havana_gene "OTTHUMG00000001094.4"; 28417 ...Overlapping finish
In [359]:
Copied!
# for feature
example_genes= ['RORB', 'LYZ', 'FOS', 'MALAT1']
## Labeling details panel (a)
example_gene_textoffsets_a = [np.array([-0.6,0]), np.array([-0.35,0]), np.array([0,-0.85]), np.array([0,-4.6])]
example_gene_lines_a = [[], [], [np.array([0,0.1]),np.array([0,-0.25])], [np.array([0,0.3]),np.array([0,-2])]]
## Labeling details panel (b)
example_gene_textoffsets_b = [np.array([-0.6,1]), np.array([0,8]), np.array([0,-1.8]), np.array([0,2.4])]
example_gene_lines_b = [[], [], [np.array([0,0.35]),np.array([0,-0.5])], []]
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.05,0.8)
titleletter_loc = (-0.10,1.015)
hline_width = 1.5
xmin = min(gene_means)/1.1
xmax = max(gene_means)*1.1
# for feature
example_genes= ['RORB', 'LYZ', 'FOS', 'MALAT1']
## Labeling details panel (a)
example_gene_textoffsets_a = [np.array([-0.6,0]), np.array([-0.35,0]), np.array([0,-0.85]), np.array([0,-4.6])]
example_gene_lines_a = [[], [], [np.array([0,0.1]),np.array([0,-0.25])], [np.array([0,0.3]),np.array([0,-2])]]
## Labeling details panel (b)
example_gene_textoffsets_b = [np.array([-0.6,1]), np.array([0,8]), np.array([0,-1.8]), np.array([0,2.4])]
example_gene_lines_b = [[], [], [np.array([0,0.35]),np.array([0,-0.5])], []]
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.05,0.8)
titleletter_loc = (-0.10,1.015)
hline_width = 1.5
xmin = min(gene_means)/1.1
xmax = max(gene_means)*1.1
In [361]:
Copied!
adata.var
adata.var
Out[361]:
chromosome | start | end | covered_cell | var | sr_var | mean | pct_cell | dispersion | log_dispersion | ... | feature_select_var | feature_select_dispersion | feature_select_random | mean_sr | sr_dispersion | log_sr_dispersion | feature_select_residual | Accession | Gene | Distance | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
index | |||||||||||||||||||||
chr1:1-100000 | chr1 | 1 | 100000 | 2353 | 0.035220 | 0.029984 | 0.710440 | 0.995347 | 20.171644 | 3.004278 | ... | False | False | False | 0.333772 | 11.131786 | 2.400980 | True | ENSG00000186092.7 | OR4F5 | 0 |
chr1:100001-200000 | chr1 | 100001 | 200000 | 2354 | 0.009842 | 0.016549 | 0.810938 | 0.995770 | 82.395519 | 4.411531 | ... | False | False | True | 0.366151 | 22.124874 | 3.093073 | False | ENSG00000186092.7 | OR4F5 | 28417 |
chr1:200001-300000 | chr1 | 200001 | 300000 | 2351 | 0.030100 | 0.035758 | 0.730608 | 0.994501 | 24.272813 | 3.189357 | ... | False | False | False | 0.383961 | 10.737695 | 2.362730 | True | ENSG00000186092.7 | OR4F5 | 128417 |
chr1:300001-400000 | chr1 | 300001 | 400000 | 2323 | 0.026505 | 0.055943 | 0.819564 | 0.982657 | 30.921580 | 3.431454 | ... | False | False | False | 0.327454 | 5.853303 | 1.713199 | True | ENSG00000284733.2 | OR4F29 | 50740 |
chr1:400001-500000 | chr1 | 400001 | 500000 | 2346 | 0.014380 | 0.038490 | 0.836147 | 0.992386 | 58.147901 | 4.062990 | ... | False | False | False | 0.382297 | 9.932428 | 2.278761 | True | ENSG00000284733.2 | OR4F29 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chrY:56800001-56900000 | chrY | 56800001 | 56900000 | 2361 | 0.002751 | 0.003180 | 0.503138 | 0.998731 | 182.910968 | 5.209000 | ... | True | True | True | 0.256378 | 80.616754 | 4.388891 | False | ENSG00000124333.16_PAR_Y | VAMP7 | 167865 |
chrY:56900001-57000000 | chrY | 56900001 | 57000000 | 2122 | 0.098136 | 0.067797 | 0.585631 | 0.897631 | 5.967571 | 1.786340 | ... | False | False | True | 0.251633 | 3.711540 | 1.080738 | True | ENSG00000124333.16_PAR_Y | VAMP7 | 67865 |
chrY:57000001-57100000 | chrY | 57000001 | 57100000 | 2252 | 0.066954 | 0.067381 | 0.664001 | 0.952623 | 9.917348 | 2.294286 | ... | False | False | False | 0.311553 | 4.623727 | 1.406113 | True | ENSG00000124333.16_PAR_Y | VAMP7 | 0 |
chrY:57100001-57200000 | chrY | 57100001 | 57200000 | 2311 | 0.046440 | 0.054673 | 0.684819 | 0.977580 | 14.746337 | 2.690995 | ... | False | False | False | 0.300699 | 5.499966 | 1.595794 | True | ENSG00000124333.16_PAR_Y | VAMP7 | 0 |
chrY:57200001-57227415 | chrY | 57200001 | 57227415 | 2125 | 0.040066 | 0.080984 | 0.790896 | 0.898900 | 19.740019 | 2.982648 | ... | False | False | False | 0.292204 | 3.608189 | 1.184860 | True | ENSG00000182484.15_PAR_Y | WASH6P | 0 |
29485 rows × 24 columns
In [470]:
Copied!
adata = scm_100k
xmean_label = 'mean methylation level'
residual_var = adata.var['log_sr_dispersion']
gene_means = adata.var['mean']
sqrt_var = adata.var['var']
sqrt_variable_genes_idx = adata.var['feature_select_var']
residuals_variable_genes_idx = adata.var['feature_select_residual']
dataset = adata.var
sqrt_lazy_tsne = adata.obsm['X_umap']
with sns.plotting_context("talk"):
example_genes_idx = np.isin(adata.var['Gene'],example_genes)
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 6, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
supaxis = fig.add_subplot(gs[:, 4:6])
ax3 = fig.add_subplot(gs[0, 4])
ax4 = fig.add_subplot(gs[1, 4])
ax5 = fig.add_subplot(gs[0, 5])
ax6 = fig.add_subplot(gs[1, 5])
tsne_axes = [ax3,ax4,ax5,ax6]
tsne_arrows = [(28,39),None,None,(28,39)]
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax1.set_title('sqrt(CPMedian)')
# ax1.set_xscale('log')
ax1.set_yscale('log')
ax1.set_xlim((xmin,xmax))
ax1.set_xlabel(xmean_label)
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
ax1.scatter(gene_means[sqrt_variable_genes_idx],sqrt_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax1.scatter(gene_means[sqrt_variable_genes_idx & example_genes_idx],sqrt_var[sqrt_variable_genes_idx & example_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
ax1.scatter(gene_means[~sqrt_variable_genes_idx & example_genes_idx],sqrt_var[~sqrt_variable_genes_idx & example_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.hlines(sqrt_threshold,xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=sqrt_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_a,lines=example_gene_lines_a,ax=ax1)
# add_largedot_legend(ax1,'lower right',dict(fontsize=15))
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'Pearson residuals ($\theta=5000$)')
# ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.set_xlim((xmin,xmax))
ax2.set_xlabel(xmean_label)
ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
#ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax2.scatter(gene_means[example_genes_idx & sqrt_variable_genes_idx],sqrt_var[example_genes_idx & sqrt_variable_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
ax2.scatter(gene_means[example_genes_idx & ~sqrt_variable_genes_idx],sqrt_var[example_genes_idx & ~sqrt_variable_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.hlines(residuals_threshold,xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=residual_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_b,lines=example_gene_lines_b,ax=ax2)
# add_largedot_legend(ax2,'lower right',dict(fontsize=15))
supaxis.text(*titleletter_loc,'c',transform=supaxis.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
supaxis.axis('off')
# for (tsne_ax,tsne_arrow,example_gene) in zip(tsne_axes,tsne_arrows,example_genes):
# # X = adata.X.T.toarray()
# print(dataset['Gene'] == example_gene)
# gene_idx = dataset['Gene'] == example_gene
# print(gene_idx)
# sqrt_lazy_counts = np.squeeze(adata.X[gene_idx, :].toarray())
# tsne_ax.set_title(example_gene.lower().capitalize())
# tsne_ax.scatter(sqrt_lazy_tsne[:,0], sqrt_lazy_tsne[:,1], s=1,c=sqrt_lazy_counts, edgecolor='none',cmap='Reds', rasterized=True)
# if tsne_arrow:
# tsne_ax.arrow(*tsne_arrow,-8,-8,width=0.5,head_width=6,shape='full',facecolor='k')
# tsne_ax.axis('off')
sns.despine()
plt.tight_layout()
# plt.savefig('figures/Fig2.pdf', dpi=300, format=None)
adata = scm_100k
xmean_label = 'mean methylation level'
residual_var = adata.var['log_sr_dispersion']
gene_means = adata.var['mean']
sqrt_var = adata.var['var']
sqrt_variable_genes_idx = adata.var['feature_select_var']
residuals_variable_genes_idx = adata.var['feature_select_residual']
dataset = adata.var
sqrt_lazy_tsne = adata.obsm['X_umap']
with sns.plotting_context("talk"):
example_genes_idx = np.isin(adata.var['Gene'],example_genes)
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 6, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
supaxis = fig.add_subplot(gs[:, 4:6])
ax3 = fig.add_subplot(gs[0, 4])
ax4 = fig.add_subplot(gs[1, 4])
ax5 = fig.add_subplot(gs[0, 5])
ax6 = fig.add_subplot(gs[1, 5])
tsne_axes = [ax3,ax4,ax5,ax6]
tsne_arrows = [(28,39),None,None,(28,39)]
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax1.set_title('sqrt(CPMedian)')
# ax1.set_xscale('log')
ax1.set_yscale('log')
ax1.set_xlim((xmin,xmax))
ax1.set_xlabel(xmean_label)
ax1.set_ylabel('variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
ax1.scatter(gene_means[sqrt_variable_genes_idx],sqrt_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax1.scatter(gene_means[sqrt_variable_genes_idx & example_genes_idx],sqrt_var[sqrt_variable_genes_idx & example_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
ax1.scatter(gene_means[~sqrt_variable_genes_idx & example_genes_idx],sqrt_var[~sqrt_variable_genes_idx & example_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax1.hlines(sqrt_threshold,xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=sqrt_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_a,lines=example_gene_lines_a,ax=ax1)
# add_largedot_legend(ax1,'lower right',dict(fontsize=15))
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'Pearson residuals ($\theta=5000$)')
# ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.set_xlim((xmin,xmax))
ax2.set_xlabel(xmean_label)
ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
#ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax2.scatter(gene_means[example_genes_idx & sqrt_variable_genes_idx],sqrt_var[example_genes_idx & sqrt_variable_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
ax2.scatter(gene_means[example_genes_idx & ~sqrt_variable_genes_idx],sqrt_var[example_genes_idx & ~sqrt_variable_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
# ax2.hlines(residuals_threshold,xmin,xmax,linestyle=':',linewidth=hline_width)
# add_labels(dataset,xdata=gene_means, ydata=residual_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_b,lines=example_gene_lines_b,ax=ax2)
# add_largedot_legend(ax2,'lower right',dict(fontsize=15))
supaxis.text(*titleletter_loc,'c',transform=supaxis.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
supaxis.axis('off')
# for (tsne_ax,tsne_arrow,example_gene) in zip(tsne_axes,tsne_arrows,example_genes):
# # X = adata.X.T.toarray()
# print(dataset['Gene'] == example_gene)
# gene_idx = dataset['Gene'] == example_gene
# print(gene_idx)
# sqrt_lazy_counts = np.squeeze(adata.X[gene_idx, :].toarray())
# tsne_ax.set_title(example_gene.lower().capitalize())
# tsne_ax.scatter(sqrt_lazy_tsne[:,0], sqrt_lazy_tsne[:,1], s=1,c=sqrt_lazy_counts, edgecolor='none',cmap='Reds', rasterized=True)
# if tsne_arrow:
# tsne_ax.arrow(*tsne_arrow,-8,-8,width=0.5,head_width=6,shape='full',facecolor='k')
# tsne_ax.axis('off')
sns.despine()
plt.tight_layout()
# plt.savefig('figures/Fig2.pdf', dpi=300, format=None)
In [320]:
Copied!
def add_labels(dataset, xdata, ydata, example_genes, textoffsets, lines, ax):
for example_gene, textoffset, line in zip(example_genes, textoffsets, lines):
gene_idx = dataset['Gene'] == example_gene
if not gene_idx.any():
print(f"Warning: {example_gene} not found in dataset['Gene']")
continue
gene_position = np.array([xdata[gene_idx], ydata[gene_idx]]).flatten()
text_position = np.array([gene_position[0] * 10**textoffset[0], gene_position[1] + textoffset[1]]).flatten()
ax.text(*text_position, example_gene.capitalize(), horizontalalignment='center')
if line:
ax.plot([gene_position[0], text_position[0]], [gene_position[1], text_position[1]], 'k-', linewidth=0.5)
def add_largedot_legend(ax,loc,kwargs={}):
lgnd = ax.legend(loc=loc,frameon=True,**kwargs)
for l in lgnd.legendHandles:
l._sizes = [30]
def add_labels(dataset, xdata, ydata, example_genes, textoffsets, lines, ax):
for example_gene, textoffset, line in zip(example_genes, textoffsets, lines):
gene_idx = dataset['Gene'] == example_gene
if not gene_idx.any():
print(f"Warning: {example_gene} not found in dataset['Gene']")
continue
gene_position = np.array([xdata[gene_idx], ydata[gene_idx]]).flatten()
text_position = np.array([gene_position[0] * 10**textoffset[0], gene_position[1] + textoffset[1]]).flatten()
ax.text(*text_position, example_gene.capitalize(), horizontalalignment='center')
if line:
ax.plot([gene_position[0], text_position[0]], [gene_position[1], text_position[1]], 'k-', linewidth=0.5)
def add_largedot_legend(ax,loc,kwargs={}):
lgnd = ax.legend(loc=loc,frameon=True,**kwargs)
for l in lgnd.legendHandles:
l._sizes = [30]
In [ ]:
Copied!
def auto(adata,svd_solver='arpack', nb_pcs=50, n_neighbors=15, perplexity=30,
method='umap', metric='euclidean', min_dist=0.5, spread=1.0,
n_components=2, copy=False):
def auto(adata,svd_solver='arpack', nb_pcs=50, n_neighbors=15, perplexity=30,
method='umap', metric='euclidean', min_dist=0.5, spread=1.0,
n_components=2, copy=False):
In [425]:
Copied!
gene_means = adata.var['mean']
sqrt_var = adata.var['var']
vars_variable_genes_idx = adata.var['feature_select_var']
#sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = adata.var['dispersion_norm']
residuals_variable_genes_idx = adata.var['feature_select_sr']
normdispersion_variable_genes_idx = adata.var['feature_select_dispersion']
random_genes_idx = adata.var['feature_select_random']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 8, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax3 = fig.add_subplot(gs[:, 4:6])
ax4 = fig.add_subplot(gs[:, 6:8])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax1.set_title(r'Variance')
ax1.set_ylabel('Variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax1.scatter(gene_means[vars_variable_genes_idx],sqrt_var[vars_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'Residuals Variance')
ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax3.set_title(r'NormDispersion')
ax3.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax3.scatter(gene_means[normdispersion_variable_genes_idx],sqrt_var[normdispersion_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax4.text(*titleletter_loc,'c',transform=ax4.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax4.set_title(r'Random')
ax4.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax4.scatter(gene_means[random_genes_idx],sqrt_var[random_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
sns.despine()
plt.tight_layout()
gene_means = adata.var['mean']
sqrt_var = adata.var['var']
vars_variable_genes_idx = adata.var['feature_select_var']
#sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = adata.var['dispersion_norm']
residuals_variable_genes_idx = adata.var['feature_select_sr']
normdispersion_variable_genes_idx = adata.var['feature_select_dispersion']
random_genes_idx = adata.var['feature_select_random']
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,6))
gs = GridSpec(2, 8, figure=fig)
ax1 = fig.add_subplot(gs[:, 0:2])
ax2 = fig.add_subplot(gs[:, 2:4])
ax3 = fig.add_subplot(gs[:, 4:6])
ax4 = fig.add_subplot(gs[:, 6:8])
ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax1.set_title(r'Variance')
ax1.set_ylabel('Variance')
ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax1.scatter(gene_means[vars_variable_genes_idx],sqrt_var[vars_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax2.set_title(r'Residuals Variance')
ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax3.set_title(r'NormDispersion')
ax3.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax3.scatter(gene_means[normdispersion_variable_genes_idx],sqrt_var[normdispersion_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
ax4.text(*titleletter_loc,'c',transform=ax4.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
ax4.set_title(r'Random')
ax4.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
ax4.scatter(gene_means[random_genes_idx],sqrt_var[random_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
sns.despine()
plt.tight_layout()
In [495]:
Copied!
enhancer = ad.read('./windows_filter_10000.h5ad')
enhancer = ad.read('./windows_filter_10000.h5ad')
FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/anndata/__init__.py:51: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024.
In [496]:
Copied!
enhancer
enhancer
Out[496]:
AnnData object with n_obs × n_vars = 2364 × 10000 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'pct_cell', 'mean', 'feature_select', 'feature_select_var' uns: 'label_color', 'pca', 'tsne', 'workdir' obsm: 'X_pca', 'X_tsne', 'mylayer_pca', 'relative_tsne' varm: 'PCs' layers: 'relative'
In [499]:
Copied!
scm.pl.pca(scm_100k, color=['Neuron type'],wspace=0.4)
scm.pl.pca(scm_100k, color=['Neuron type'],wspace=0.4)
In [494]:
Copied!
scm.pl.tsne(scm_100k, color=['Neuron type'],wspace=0.4)
scm.pl.tsne(scm_100k, color=['Neuron type'],wspace=0.4)
In [500]:
Copied!
scm.pl.pca(adata, color=['Neuron type'],wspace=0.4)
scm.pl.pca(adata, color=['Neuron type'],wspace=0.4)
In [501]:
Copied!
scm.pl.umap(adata, color=['Neuron type'],wspace=0.4)
scm.pl.umap(adata, color=['Neuron type'],wspace=0.4)
In [3]:
Copied!
# 新的数据
scm_100k = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/scbs/w100k.h5ad")
# 新的数据
scm_100k = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/scbs/w100k.h5ad")
In [3]:
Copied!
scm.pp.add_meta(scm_100k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
scm.pp.add_meta(scm_100k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
No column 'label' found in metadata, 'unknown' is used as the default cell labels No column 'label_color' found in metadata, random color is generated for each cell label
In [5]:
Copied!
scm_100k
scm_100k
Out[5]:
AnnData object with n_obs × n_vars = 2365 × 30894 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var' uns: 'workdir', 'label_color' layers: 'relative'
In [8]:
Copied!
scm_100k.obs
scm_100k.obs
Out[8]:
cell_id | cell_name | sites | meth | n_total | global_meth_level | Sample_id | Sample | Animal age | FACS date | ... | mCG/CG | mCH/CH | Estimated mCG/CG | Estimated mCH/CH | Coverage (%) | Neuron type | tSNE x coordinate | tSNE y coordinate | n_features | pct_peaks | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | GSM2556580 | 3186266 | 2496679 | 3825044 | 0.783575 | GSM2556580 | Pool_1026_AD002_indexed | 25 yr | 2012/6/29 | ... | 0.79871 | 0.02211 | 0.79762 | 0.01680 | 6.01 | hSst-2 | 20.93320 | 13.43860 | 29498 | 0.954813 |
1 | 1 | GSM2557770 | 2572709 | 1931054 | 3242784 | 0.750592 | GSM2557770 | Pool_263_AD010_indexed | 25 yr | 2012/6/29 | ... | 0.76486 | 0.03588 | 0.76305 | 0.02847 | 5.11 | hL2/3 | -7.61221 | 13.74140 | 29498 | 0.954813 |
2 | 2 | GSM2556556 | 3247648 | 2505895 | 3882099 | 0.771603 | GSM2556556 | Pool_1012_AD006_indexed | 25 yr | 2012/6/29 | ... | 0.78647 | 0.05192 | 0.78433 | 0.04240 | 6.50 | hL2/3 | -14.57080 | 4.75207 | 29497 | 0.954781 |
3 | 3 | GSM2556719 | 3593247 | 2753322 | 4304895 | 0.766249 | GSM2556719 | Pool_1090_AD006_indexed | 25 yr | 2012/6/29 | ... | 0.78164 | 0.04276 | 0.77984 | 0.03487 | 7.19 | hL2/3 | -12.23290 | 10.33370 | 29504 | 0.955007 |
4 | 4 | GSM2558838 | 1671616 | 1213688 | 1944619 | 0.726057 | GSM2558838 | Pool_771_AD008_indexed | 25 yr | 2012/6/29 | ... | 0.73986 | 0.04119 | 0.73763 | 0.03297 | 3.26 | hL2/3 | -12.52800 | 12.85390 | 29474 | 0.954036 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2360 | 2360 | GSM2557118 | 4030037 | 3049428 | 5723469 | 0.756675 | GSM2557118 | Pool_1381_AD002_indexed | 25 yr | 2012/6/29 | ... | 0.76468 | 0.05545 | 0.76216 | 0.04534 | 7.86 | hDL-1 | -9.11158 | -1.04451 | 29499 | 0.954846 |
2361 | 2361 | GSM2558140 | 3335079 | 2526078 | 4135394 | 0.757427 | GSM2558140 | Pool_441_AD008_indexed | 25 yr | 2012/6/29 | ... | 0.76573 | 0.03698 | 0.76416 | 0.03051 | 6.79 | hL2/3 | -11.26030 | 12.10890 | 29501 | 0.954910 |
2362 | 2362 | GSM2557032 | 4045664 | 3312230 | 5141670 | 0.818711 | GSM2557032 | Pool_1359_AD010_indexed | 25 yr | 2012/6/29 | ... | 0.83100 | 0.08086 | 0.82834 | 0.06641 | 8.27 | hSst-3 | 15.37830 | -9.40008 | 29498 | 0.954813 |
2363 | 2363 | GSM2558877 | 1554042 | 1192705 | 1806746 | 0.767486 | GSM2558877 | Pool_790_AD010_indexed | 25 yr | 2012/6/29 | ... | 0.78764 | 0.03502 | 0.78601 | 0.02759 | 2.96 | hNdnf | 16.07330 | 8.95018 | 29327 | 0.949278 |
2364 | 2364 | GSM2557111 | 4053094 | 3004728 | 5423767 | 0.741342 | GSM2557111 | Pool_1379_AD010_indexed | 25 yr | 2012/6/29 | ... | 0.75210 | 0.03068 | 0.75041 | 0.02408 | 8.11 | hL2/3 | -5.32003 | 13.90730 | 29496 | 0.954748 |
2364 rows × 41 columns
In [7]:
Copied!
scm.pp.filter_cells(scm_100k,min_n_features = 10000)
scm.pp.filter_features(scm_100k,min_n_cells = 1000)
scm.pp.filter_cells(scm_100k,min_n_features = 10000)
scm.pp.filter_features(scm_100k,min_n_cells = 1000)
filter cells based on min_n_features >= 10000 after filtering out low-quality cells: 2364 cells, 30894 regions
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scMethtools/preprocessing/scm.py:207: RuntimeWarning: Mean of empty slice feature_mean = np.nanmean(dense_X, axis=0)
Filter regions based on min_n_cells After filtering out low coveraged regions: 2364 cells, 29509 regions
In [12]:
Copied!
scm.pp.pca(scm_100k,n_pcs = 30, layer = 'relative')
sc.pp.neighbors(scm_100k, use_rep = 'relative_pca')
scm.pp.umap (scm_100k)
scm.pl.umap (scm_100k, color=['Neuron type'],size=40)
scm.pp.pca(scm_100k,n_pcs = 30, layer = 'relative')
sc.pp.neighbors(scm_100k, use_rep = 'relative_pca')
scm.pp.umap (scm_100k)
scm.pl.umap (scm_100k, color=['Neuron type'],size=40)
The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.
In [11]:
Copied!
scm.pp.pca(scm_100k,n_pcs = 30)
sc.pp.neighbors(scm_100k)
scm.pp.umap (scm_100k)
scm.pl.umap (scm_100k, color=['Neuron type'],size=40)
scm.pp.pca(scm_100k,n_pcs = 30)
sc.pp.neighbors(scm_100k)
scm.pp.umap (scm_100k)
scm.pl.umap (scm_100k, color=['Neuron type'],size=40)
... using all features
In [ ]:
Copied!