Human Neuron

In [ ]:

In [2]:

Copied!





import pandas as pd
import numpy as np
import scanpy as sc
import scMethtools as scm
import matplotlib.pyplot as plt
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
import scMethtools as scm
import matplotlib.pyplot as plt
import anndata as ad

 ####   ####  #    # ##### #    # #####  ####   ####  #       ####  
 #      #    # ##  ##   #   #    #   #   #    # #    # #      #      
  ####  #      # ## #   #   ######   #   #    # #    # #       ####  
      # #      #    #   #   #    #   #   #    # #    # #           # 
 #    # #    # #    #   #   #    #   #   #    # #    # #      #    # 
  ####   ####  #    #   #   #    #   #    ####   ####  ######  ####                                                

Version: 1.0.0, Tutorials: https://ngdc.cncb.ac.cn/methbank/scm

In [12]:

Copied!

scm_100k  = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scm/windows.h5")
scanpy_100k = ad.read_h5ad("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scanpy/scanpy_100kb_matrix_test_CG.h5ad")
scm_100k  = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scm/windows.h5")
scanpy_100k = ad.read_h5ad("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scanpy/scanpy_100kb_matrix_test_CG.h5ad")

No description has been provided for this image

scmat analysis¶

In [4]:

Copied!

#for scmart analysis
scm.pp.add_meta(scm_100k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
#for scmart analysis
scm.pp.add_meta(scm_100k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')

No column 'label' found in metadata, 'unknown' is used as the default cell labels
No column 'label_color' found in metadata, random color is generated for each cell label

In [37]:

Copied!

scm_100k.obs
scm_100k.obs

Out[37]:

	cell_id	cell_name	sites	meth	n_total	global_meth_level	Sample_id	Sample	Animal age	FACS date	...	mCG/CG	mCH/CH	Estimated mCG/CG	Estimated mCH/CH	Coverage (%)	Neuron type	tSNE x coordinate	tSNE y coordinate	n_features	pct_peaks
0	0	GSM2556580	3186266	2496679	3825044	0.783575	GSM2556580	Pool_1026_AD002_indexed	25 yr	2012/6/29	...	0.79871	0.02211	0.79762	0.01680	6.01	hSst-2	20.93320	13.43860	29498	0.954813
1	1	GSM2557770	2572709	1931054	3242784	0.750592	GSM2557770	Pool_263_AD010_indexed	25 yr	2012/6/29	...	0.76486	0.03588	0.76305	0.02847	5.11	hL2/3	-7.61221	13.74140	29498	0.954813
2	2	GSM2556556	3247648	2505895	3882099	0.771603	GSM2556556	Pool_1012_AD006_indexed	25 yr	2012/6/29	...	0.78647	0.05192	0.78433	0.04240	6.50	hL2/3	-14.57080	4.75207	29497	0.954781
3	3	GSM2556719	3593247	2753322	4304895	0.766249	GSM2556719	Pool_1090_AD006_indexed	25 yr	2012/6/29	...	0.78164	0.04276	0.77984	0.03487	7.19	hL2/3	-12.23290	10.33370	29504	0.955007
4	4	GSM2558838	1671616	1213688	1944619	0.726057	GSM2558838	Pool_771_AD008_indexed	25 yr	2012/6/29	...	0.73986	0.04119	0.73763	0.03297	3.26	hL2/3	-12.52800	12.85390	29474	0.954036
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2360	2360	GSM2557118	4030037	3049428	5723469	0.756675	GSM2557118	Pool_1381_AD002_indexed	25 yr	2012/6/29	...	0.76468	0.05545	0.76216	0.04534	7.86	hDL-1	-9.11158	-1.04451	29499	0.954846
2361	2361	GSM2558140	3335079	2526078	4135394	0.757427	GSM2558140	Pool_441_AD008_indexed	25 yr	2012/6/29	...	0.76573	0.03698	0.76416	0.03051	6.79	hL2/3	-11.26030	12.10890	29501	0.954910
2362	2362	GSM2557032	4045664	3312230	5141670	0.818711	GSM2557032	Pool_1359_AD010_indexed	25 yr	2012/6/29	...	0.83100	0.08086	0.82834	0.06641	8.27	hSst-3	15.37830	-9.40008	29498	0.954813
2363	2363	GSM2558877	1554042	1192705	1806746	0.767486	GSM2558877	Pool_790_AD010_indexed	25 yr	2012/6/29	...	0.78764	0.03502	0.78601	0.02759	2.96	hNdnf	16.07330	8.95018	29327	0.949278
2364	2364	GSM2557111	4053094	3004728	5423767	0.741342	GSM2557111	Pool_1379_AD010_indexed	25 yr	2012/6/29	...	0.75210	0.03068	0.75041	0.02408	8.11	hL2/3	-5.32003	13.90730	29496	0.954748

2364 rows × 41 columns

In [6]:

Copied!





# filters
# filter by cell
scm.pp.filter_cells(scm_100k,min_n_features = 10000)
# filter by features
scm.pp.filter_features(scm_100k,min_n_cells=2000)
# filters
# filter by cell
scm.pp.filter_cells(scm_100k,min_n_features = 10000)
# filter by features
scm.pp.filter_features(scm_100k,min_n_cells=2000)

filter cells based on min_n_features >=  10000
after filtering out low-quality cells: 
2364 cells, 30894 regions

/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scMethtools/preprocessing/scm.py:207: RuntimeWarning: Mean of empty slice
  feature_mean = np.nanmean(dense_X, axis=0)

Filter regions based on min_n_cells
After filtering out low coveraged regions: 
2364 cells, 29485 regions

In [14]:

Copied!

scm_100k.var
scm_100k.var

Out[14]:

	chromosome	start	end	covered_cell	var	sr_var	mean	pct_cell
index
chr1:1-100000	chr1	1	100000	2353	0.035165	0.005667	0.710558	0.995347
chr1:100001-200000	chr1	100001	200000	2354	0.009842	0.002561	0.810938	0.995770
chr1:200001-300000	chr1	200001	300000	2351	0.030100	0.009758	0.730608	0.994501
chr1:300001-400000	chr1	300001	400000	2323	0.026505	0.015129	0.819564	0.982657
chr1:400001-500000	chr1	400001	500000	2346	0.014380	0.009514	0.836147	0.992386
...	...	...	...	...	...	...	...	...
chrY:56800001-56900000	chrY	56800001	56900000	2361	0.002751	0.002164	0.503138	0.998731
chrY:56900001-57000000	chrY	56900001	57000000	2122	0.098136	0.039726	0.585631	0.897631
chrY:57000001-57100000	chrY	57000001	57100000	2252	0.066954	0.031287	0.664001	0.952623
chrY:57100001-57200000	chrY	57100001	57200000	2311	0.046440	0.026938	0.684819	0.977580
chrY:57200001-57227415	chrY	57200001	57227415	2125	0.040066	0.022826	0.790896	0.898900

29509 rows × 8 columns

In [7]:

Copied!





def calculate_dispersion_norm(adata):
    df = adata.var
    df['dispersion'] =  df['var'] / df['mean']
    df['log_dispersion'] = np.log(df['dispersion'])
    
    mean_binsize = 0.01
    cov_binsize = 10
    bin_min_features=5
    # instead of n_bins, use bin_size, because cov and mc are in different scale
    df['mean_bin'] = (df['mean'] / mean_binsize).astype(int)
    df['cov_bin'] = (df['covered_cell'] / cov_binsize).astype(int)
    
    # save bin_count df, gather bins with more than bin_min_features features
    bin_count = df.groupby(['mean_bin', 'cov_bin']) \
        .apply(lambda i: i.shape[0]) \
        .reset_index() \
        .sort_values(0, ascending=False)
    bin_count.head()
    bin_more_than = bin_count[bin_count[0] > bin_min_features]
    if bin_more_than.shape[0] == 0:
        raise ValueError(f'No bin have more than {bin_min_features} features, use larger bin size.')
    # this usually don't cause much difference (a few hundred features), but the scatter plot will look more nature
    index_map = {}
    for _, (mean_id, cov_id, count) in bin_count.iterrows():
        if count > 1:
            index_map[(mean_id, cov_id)] = (mean_id, cov_id)
        manhattan_dist = (bin_more_than['mean_bin'] - mean_id).abs() + (bin_more_than['cov_bin'] - cov_id).abs()
        closest_more_than = manhattan_dist.sort_values().index[0]
        closest = bin_more_than.loc[closest_more_than]
        index_map[(mean_id, cov_id)] = tuple(closest.tolist()[:2])
    # apply index_map to original df
    raw_bin = df[['mean_bin', 'cov_bin']].set_index(['mean_bin', 'cov_bin'])
    raw_bin['use_mean'] = pd.Series(index_map).apply(lambda i: i[0])
    raw_bin['use_cov'] = pd.Series(index_map).apply(lambda i: i[1])
    df['mean_bin'] = raw_bin['use_mean'].values
    df['cov_bin'] = raw_bin['use_cov'].values
    
    # calculate bin mean and std, now disp_std_bin shouldn't have NAs
    disp_grouped = df.groupby(['mean_bin', 'cov_bin'])['dispersion']
    disp_mean_bin = disp_grouped.mean()
    disp_std_bin = disp_grouped.std(ddof=1)
    
    # actually do the normalization
    _mean_norm = disp_mean_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
    _std_norm = disp_std_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
    df['dispersion_norm'] = (df['dispersion'].values  # use values here as index differs
                             - _mean_norm.values) / _std_norm.values
    dispersion_norm = df['dispersion_norm'].values.astype('float32')
    return df
def calculate_dispersion_norm(adata):
    df = adata.var
    df['dispersion'] =  df['var'] / df['mean']
    df['log_dispersion'] = np.log(df['dispersion'])
    
    mean_binsize = 0.01
    cov_binsize = 10
    bin_min_features=5
    # instead of n_bins, use bin_size, because cov and mc are in different scale
    df['mean_bin'] = (df['mean'] / mean_binsize).astype(int)
    df['cov_bin'] = (df['covered_cell'] / cov_binsize).astype(int)
    
    # save bin_count df, gather bins with more than bin_min_features features
    bin_count = df.groupby(['mean_bin', 'cov_bin']) \
        .apply(lambda i: i.shape[0]) \
        .reset_index() \
        .sort_values(0, ascending=False)
    bin_count.head()
    bin_more_than = bin_count[bin_count[0] > bin_min_features]
    if bin_more_than.shape[0] == 0:
        raise ValueError(f'No bin have more than {bin_min_features} features, use larger bin size.')
    # this usually don't cause much difference (a few hundred features), but the scatter plot will look more nature
    index_map = {}
    for _, (mean_id, cov_id, count) in bin_count.iterrows():
        if count > 1:
            index_map[(mean_id, cov_id)] = (mean_id, cov_id)
        manhattan_dist = (bin_more_than['mean_bin'] - mean_id).abs() + (bin_more_than['cov_bin'] - cov_id).abs()
        closest_more_than = manhattan_dist.sort_values().index[0]
        closest = bin_more_than.loc[closest_more_than]
        index_map[(mean_id, cov_id)] = tuple(closest.tolist()[:2])
    # apply index_map to original df
    raw_bin = df[['mean_bin', 'cov_bin']].set_index(['mean_bin', 'cov_bin'])
    raw_bin['use_mean'] = pd.Series(index_map).apply(lambda i: i[0])
    raw_bin['use_cov'] = pd.Series(index_map).apply(lambda i: i[1])
    df['mean_bin'] = raw_bin['use_mean'].values
    df['cov_bin'] = raw_bin['use_cov'].values
    
    # calculate bin mean and std, now disp_std_bin shouldn't have NAs
    disp_grouped = df.groupby(['mean_bin', 'cov_bin'])['dispersion']
    disp_mean_bin = disp_grouped.mean()
    disp_std_bin = disp_grouped.std(ddof=1)
    
    # actually do the normalization
    _mean_norm = disp_mean_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
    _std_norm = disp_std_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
    df['dispersion_norm'] = (df['dispersion'].values  # use values here as index differs
                             - _mean_norm.values) / _std_norm.values
    dispersion_norm = df['dispersion_norm'].values.astype('float32')
    return df

In [10]:

Copied!





X =  scm_100k.layers['relative'].copy()
n_samples = X.shape[0]
df = scm_100k.var
mean_feature_methylation = [np.nansum(np.abs(line.toarray())) /  n_samples for line in X.T]
df['mean_sr'] = mean_feature_methylation
df['sr_dispersion'] = df['sr_var'] / df['mean_sr'] 
X =  scm_100k.layers['relative'].copy()
n_samples = X.shape[0]
df = scm_100k.var
mean_feature_methylation = [np.nansum(np.abs(line.toarray())) /  n_samples for line in X.T]
df['mean_sr'] = mean_feature_methylation
df['sr_dispersion'] = df['sr_var'] / df['mean_sr'] 

In [17]:

Copied!

df
df

Out[17]:

	chromosome	start	end	covered_cell	var	sr_var	mean	pct_cell	mean_sr	sr_dispersion
index
chr1:1-100000	chr1	1	100000	2353	0.035165	0.005667	0.710558	0.995347	0.054952	0.103125
chr1:100001-200000	chr1	100001	200000	2354	0.009842	0.002561	0.810938	0.995770	0.037709	0.067909
chr1:200001-300000	chr1	200001	300000	2351	0.030100	0.009758	0.730608	0.994501	0.075547	0.129171
chr1:300001-400000	chr1	300001	400000	2323	0.026505	0.015129	0.819564	0.982657	0.085704	0.176523
chr1:400001-500000	chr1	400001	500000	2346	0.014380	0.009514	0.836147	0.992386	0.072957	0.130408
...	...	...	...	...	...	...	...	...	...	...
chrY:56800001-56900000	chrY	56800001	56900000	2361	0.002751	0.002164	0.503138	0.998731	0.036448	0.059383
chrY:56900001-57000000	chrY	56900001	57000000	2122	0.098136	0.039726	0.585631	0.897631	0.147727	0.268912
chrY:57000001-57100000	chrY	57000001	57100000	2252	0.066954	0.031287	0.664001	0.952623	0.134009	0.233471
chrY:57100001-57200000	chrY	57100001	57200000	2311	0.046440	0.026938	0.684819	0.977580	0.128262	0.210021
chrY:57200001-57227415	chrY	57200001	57227415	2125	0.040066	0.022826	0.790896	0.898900	0.079380	0.287560

29509 rows × 10 columns

In [11]:

Copied!

scm_100k.var = df
scm_100k.var = df

In [8]:

Copied!





#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.1,0.6)
titleletter_loc = (-0.17,1.015)
hline_width = 1.5

dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.1,0.6)
titleletter_loc = (-0.17,1.015)
hline_width = 1.5

dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True

In [24]:

Copied!





df['log_sr_dispersion'] = np.log(df['sr_dispersion'])
gene_means = df['mean']
sqrt_var =  df['var']
sr_dispersion = df['log_sr_dispersion']
dispersion = df['log_dispersion']
# #sqrt_variable_genes_idx = scm_1k.var['feature_select_var']
# sqrt_variable_genes_idx = scm_1k.var['feature_select']
# residual_var = scm_1k.var['NormalizedDispersion']
# #residuals_variable_genes_idx = scm_1k.var['feature_select']
# residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):


    fig = plt.figure(figsize=(18,6))
    gs = GridSpec(2, 6, figure=fig) 

    ax1 = fig.add_subplot(gs[:, 0:2])
    ax2 = fig.add_subplot(gs[:, 2:4])       
    ax3 = fig.add_subplot(gs[:, 4:6])

    ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax1.set_title(r'variance')
    ax1.set_ylabel('variance')
    ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    #ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)

    ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax2.set_title(r'dispersion')   
    ax2.scatter(gene_means,dispersion,s=dotsize, rasterized=True,color='lightgrey')
    #ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)    

    ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax3.set_title(r'residuals dispersion')   
    ax3.scatter(gene_means,sr_dispersion,s=dotsize, rasterized=True,color='lightgrey')
    
    sns.despine()
    plt.tight_layout()
    plt.savefig('./figures/mean_var_relationship.pdf', dpi=300, format=None)
df['log_sr_dispersion'] = np.log(df['sr_dispersion'])
gene_means = df['mean']
sqrt_var =  df['var']
sr_dispersion = df['log_sr_dispersion']
dispersion = df['log_dispersion']
# #sqrt_variable_genes_idx = scm_1k.var['feature_select_var']
# sqrt_variable_genes_idx = scm_1k.var['feature_select']
# residual_var = scm_1k.var['NormalizedDispersion']
# #residuals_variable_genes_idx = scm_1k.var['feature_select']
# residuals_variable_genes_idx = scm_1k.var['feature_select_var']
with sns.plotting_context("talk"):


    fig = plt.figure(figsize=(18,6))
    gs = GridSpec(2, 6, figure=fig) 

    ax1 = fig.add_subplot(gs[:, 0:2])
    ax2 = fig.add_subplot(gs[:, 2:4])       
    ax3 = fig.add_subplot(gs[:, 4:6])

    ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax1.set_title(r'variance')
    ax1.set_ylabel('variance')
    ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    #ax1.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)

    ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax2.set_title(r'dispersion')   
    ax2.scatter(gene_means,dispersion,s=dotsize, rasterized=True,color='lightgrey')
    #ax2.scatter(gene_means[sqrt_variable_genes_idx],residual_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)    

    ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax3.set_title(r'residuals dispersion')   
    ax3.scatter(gene_means,sr_dispersion,s=dotsize, rasterized=True,color='lightgrey')
    
    sns.despine()
    plt.tight_layout()
    plt.savefig('./figures/mean_var_relationship.pdf', dpi=300, format=None)

In [25]:

Copied!





#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True

alpha=0.05
def make_panel(x,
               y,
               ax,
               letter='',
               title='',
               color_kde=True,
               xlim=None,
               ylim=None,
               xlabel='mean methylation level',
               ylabel='',
               xscale='linear',
               yscale='log'):
    
    ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    if color_kde:
        xtrans = np.log10(x) if xscale == 'log' else x
        ytrans = np.log10(y) if yscale == 'log' else y
        data = np.vstack((xtrans,ytrans))
        kde = gaussian_kde(data)
        logdensity = kde.logpdf(data)
        points = ax.scatter(x, y, s=dotsize, c=logdensity,  rasterized=True)

    else:
        ax.scatter(x, y, s=dotsize,  rasterized=True)
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    if xlim:
        ax.set_xlim(xlim)
    if ylim:
        ax.set_ylim(ylim)
        
    ax.set_title(title)
    
def add_line(x,y,ax):
    ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
    
def fix_ticks(ax):
    ax.set_yticks(ax.get_yticks()[1:-1])


with sns.plotting_context("talk"):

    fig = plt.figure(figsize=(18,9)) #constrained_layout=True)

    gs = GridSpec(1,3 , figure=fig)
    ax1 = fig.add_subplot(gs[0, 0])  #var
    ax2 = fig.add_subplot(gs[0, 1])  #sr_var
    ax3 = fig.add_subplot(gs[0, 2])


    make_panel(x=df['mean'],
               y=df['var'],
               ax=ax1,
               letter='a',
               ylabel=r'var',
               yscale='linear')
    # beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
    # add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
    fix_ticks(ax1)
    
    make_panel(x=df['mean'],
               y=df['log_dispersion'],
               ax=ax2,
               letter='b',
               ylabel=r'shrunken residual var',
               yscale='linear')
    # beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
    # add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)   
    # ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)

    make_panel(x=df['mean'],
               y=df['log_sr_dispersion'],
               ax=ax3,
               letter='c',
               ylabel=r'shrunken residual dispersion',
               yscale='linear')
    
    sns.despine()    
    plt.tight_layout()
    plt.savefig('./figures/mean_var_ped.pdf', dpi=300, format=None)
    plt.show()
#Select hvm feature
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True

alpha=0.05
def make_panel(x,
               y,
               ax,
               letter='',
               title='',
               color_kde=True,
               xlim=None,
               ylim=None,
               xlabel='mean methylation level',
               ylabel='',
               xscale='linear',
               yscale='log'):
    
    ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    if color_kde:
        xtrans = np.log10(x) if xscale == 'log' else x
        ytrans = np.log10(y) if yscale == 'log' else y
        data = np.vstack((xtrans,ytrans))
        kde = gaussian_kde(data)
        logdensity = kde.logpdf(data)
        points = ax.scatter(x, y, s=dotsize, c=logdensity,  rasterized=True)

    else:
        ax.scatter(x, y, s=dotsize,  rasterized=True)
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    if xlim:
        ax.set_xlim(xlim)
    if ylim:
        ax.set_ylim(ylim)
        
    ax.set_title(title)
    
def add_line(x,y,ax):
    ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
    
def fix_ticks(ax):
    ax.set_yticks(ax.get_yticks()[1:-1])


with sns.plotting_context("talk"):

    fig = plt.figure(figsize=(18,9)) #constrained_layout=True)

    gs = GridSpec(1,3 , figure=fig)
    ax1 = fig.add_subplot(gs[0, 0])  #var
    ax2 = fig.add_subplot(gs[0, 1])  #sr_var
    ax3 = fig.add_subplot(gs[0, 2])


    make_panel(x=df['mean'],
               y=df['var'],
               ax=ax1,
               letter='a',
               ylabel=r'var',
               yscale='linear')
    # beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
    # add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
    fix_ticks(ax1)
    
    make_panel(x=df['mean'],
               y=df['log_dispersion'],
               ax=ax2,
               letter='b',
               ylabel=r'shrunken residual var',
               yscale='linear')
    # beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
    # add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)   
    # ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)

    make_panel(x=df['mean'],
               y=df['log_sr_dispersion'],
               ax=ax3,
               letter='c',
               ylabel=r'shrunken residual dispersion',
               yscale='linear')
    
    sns.despine()    
    plt.tight_layout()
    plt.savefig('./figures/mean_var_ped.pdf', dpi=300, format=None)
    plt.show()

In [9]:

Copied!

calculate_dispersion_norm(scm_100k)
calculate_dispersion_norm(scm_100k)

/tmp/ipykernel_17336/590794251.py:14: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  bin_count = df.groupby(['mean_bin', 'cov_bin']) \

Out[9]:

	chromosome	start	end	covered_cell	var	sr_var	mean	pct_cell	dispersion	log_dispersion	mean_bin	cov_bin	dispersion_norm
index
chr1:1-100000	chr1	1	100000	2353	0.035165	0.005667	0.710558	0.995347	0.049490	-3.005991	71	235	1.717898
chr1:100001-200000	chr1	100001	200000	2354	0.009842	0.002561	0.810938	0.995770	0.012137	-4.411531	81	235	-0.480892
chr1:200001-300000	chr1	200001	300000	2351	0.030100	0.009758	0.730608	0.994501	0.041198	-3.189357	73	235	1.499175
chr1:300001-400000	chr1	300001	400000	2323	0.026505	0.015129	0.819564	0.982657	0.032340	-3.431454	81	233	-0.181737
chr1:400001-500000	chr1	400001	500000	2346	0.014380	0.009514	0.836147	0.992386	0.017198	-4.062990	83	234	-0.236255
...	...	...	...	...	...	...	...	...	...	...	...	...	...
chrY:56800001-56900000	chrY	56800001	56900000	2361	0.002751	0.002164	0.503138	0.998731	0.005467	-5.209000	50	236	-1.719908
chrY:56900001-57000000	chrY	56900001	57000000	2122	0.098136	0.039726	0.585631	0.897631	0.167572	-1.786340	58	235	2.684742
chrY:57000001-57100000	chrY	57000001	57100000	2252	0.066954	0.031287	0.664001	0.952623	0.100833	-2.294286	66	234	1.676650
chrY:57100001-57200000	chrY	57100001	57200000	2311	0.046440	0.026938	0.684819	0.977580	0.067813	-2.690995	69	234	0.141551
chrY:57200001-57227415	chrY	57200001	57227415	2125	0.040066	0.022826	0.790896	0.898900	0.050659	-2.982648	77	232	-0.077491

29485 rows × 13 columns

In [23]:

Copied!





def feature_select(adata,number=3000):
    print(adata.shape)
    #adata.raw = adata.copy()
    df = adata.var
    feature_subset_sr = df.index.isin(df.sort_values('sr_var', ascending=False).index[:number])
    feature_subset_var = df.index.isin(df.sort_values('var', ascending=False).index[:number])
    feature_subset_dispersion = df.index.isin(df.sort_values('dispersion_norm', ascending=False).index[:number])
    feature_subset_residual = df.index.isin(df.sort_values('sr_dispersion', ascending=False).index[:number])

        # 随机选择 number 个特征的索引
    random_indices = np.random.choice(df.index, size=number, replace=False)
    
    # 获取随机选择的特征子集
    feature_subset_random = df.index.isin(random_indices)
    
    df['feature_select_sr'] = feature_subset_sr
    df['feature_select_var'] = feature_subset_var
    df['feature_select_dispersion'] = feature_subset_dispersion
    df['feature_select_random'] = feature_subset_random
    df['feature_select_residual'] = feature_subset_residual
    adata.var = df
    # data_filter = adata[:,adata.var['feature_select']].copy()
    return adata
def feature_select(adata,number=3000):
    print(adata.shape)
    #adata.raw = adata.copy()
    df = adata.var
    feature_subset_sr = df.index.isin(df.sort_values('sr_var', ascending=False).index[:number])
    feature_subset_var = df.index.isin(df.sort_values('var', ascending=False).index[:number])
    feature_subset_dispersion = df.index.isin(df.sort_values('dispersion_norm', ascending=False).index[:number])
    feature_subset_residual = df.index.isin(df.sort_values('sr_dispersion', ascending=False).index[:number])

        # 随机选择 number 个特征的索引
    random_indices = np.random.choice(df.index, size=number, replace=False)
    
    # 获取随机选择的特征子集
    feature_subset_random = df.index.isin(random_indices)
    
    df['feature_select_sr'] = feature_subset_sr
    df['feature_select_var'] = feature_subset_var
    df['feature_select_dispersion'] = feature_subset_dispersion
    df['feature_select_random'] = feature_subset_random
    df['feature_select_residual'] = feature_subset_residual
    adata.var = df
    # data_filter = adata[:,adata.var['feature_select']].copy()
    return adata

In [24]:

Copied!

feature_select(scm_100k,number=5000)
feature_select(scm_100k,number=5000)

(2364, 29485)

Out[24]:

AnnData object with n_obs × n_vars = 2364 × 29485
    obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks'
    var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'mean_sr', 'sr_dispersion', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random', 'feature_select_residual'
    uns: 'workdir', 'label_color'
    layers: 'relative'

In [382]:

Copied!

scm_100k.var
scm_100k.var

Out[382]:

	chromosome	start	end	covered_cell	var	sr_var	mean	pct_cell	dispersion	log_dispersion	...	feature_select_var	feature_select_dispersion	feature_select_random	mean_sr	sr_dispersion	log_sr_dispersion	feature_select_residual	Accession	Gene	Distance
index
chr1:1-100000	chr1	1	100000	2353	0.035220	0.029984	0.710440	0.995347	20.171644	3.004278	...	True	False	False	0.333772	0.089833	2.400980	True	ENSG00000186092.7	OR4F5	0
chr1:100001-200000	chr1	100001	200000	2354	0.009842	0.016549	0.810938	0.995770	82.395519	4.411531	...	False	False	False	0.366151	0.045198	3.093073	False	ENSG00000186092.7	OR4F5	28417
chr1:200001-300000	chr1	200001	300000	2351	0.030100	0.035758	0.730608	0.994501	24.272813	3.189357	...	True	False	False	0.383961	0.093130	2.362730	True	ENSG00000186092.7	OR4F5	128417
chr1:300001-400000	chr1	300001	400000	2323	0.026505	0.055943	0.819564	0.982657	30.921580	3.431454	...	True	False	False	0.327454	0.170844	1.713199	True	ENSG00000284733.2	OR4F29	50740
chr1:400001-500000	chr1	400001	500000	2346	0.014380	0.038490	0.836147	0.992386	58.147901	4.062990	...	False	False	False	0.382297	0.100680	2.278761	True	ENSG00000284733.2	OR4F29	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
chrY:56800001-56900000	chrY	56800001	56900000	2361	0.002751	0.003180	0.503138	0.998731	182.910968	5.209000	...	False	True	False	0.256378	0.012404	4.388891	False	ENSG00000124333.16_PAR_Y	VAMP7	167865
chrY:56900001-57000000	chrY	56900001	57000000	2122	0.098136	0.067797	0.585631	0.897631	5.967571	1.786340	...	True	False	True	0.251633	0.269430	1.080738	True	ENSG00000124333.16_PAR_Y	VAMP7	67865
chrY:57000001-57100000	chrY	57000001	57100000	2252	0.066954	0.067381	0.664001	0.952623	9.917348	2.294286	...	True	False	False	0.311553	0.216276	1.406113	True	ENSG00000124333.16_PAR_Y	VAMP7	0
chrY:57100001-57200000	chrY	57100001	57200000	2311	0.046440	0.054673	0.684819	0.977580	14.746337	2.690995	...	True	False	False	0.300699	0.181819	1.595794	True	ENSG00000124333.16_PAR_Y	VAMP7	0
chrY:57200001-57227415	chrY	57200001	57227415	2125	0.040066	0.080984	0.790896	0.898900	19.740019	2.982648	...	True	False	False	0.292204	0.277147	1.184860	True	ENSG00000182484.15_PAR_Y	WASH6P	0

29485 rows × 24 columns

In [ ]:

In [26]:

Copied!





scm.pp.pca(scm_100k,n_pcs=30,impute='median')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k,frameon='small',color=['Neuron type'],size=40,wspace=0.3)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','Brain area'],size=100,wspace=0.3)
scm.pp.pca(scm_100k,n_pcs=30,impute='median')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k,frameon='small',color=['Neuron type'],size=40,wspace=0.3)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','Brain area'],size=100,wspace=0.3)

... using all features

In [ ]:

Copied!

#第一主成分的差异主要是Na造成的，filter一下看看结果
#第一主成分的差异主要是Na造成的，filter一下看看结果

In [25]:

Copied!

# 删除 Neuron type 为 NA 的数据
scm_100k_filter = scm_100k[~scm_100k.obs['Neuron type'].isna()].copy()
scm_100k_filter
# 删除 Neuron type 为 NA 的数据
scm_100k_filter = scm_100k[~scm_100k.obs['Neuron type'].isna()].copy()
scm_100k_filter

Out[25]:

AnnData object with n_obs × n_vars = 2313 × 29485
    obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks'
    var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'mean_sr', 'sr_dispersion', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random', 'feature_select_residual'
    uns: 'workdir', 'label_color'
    layers: 'relative'

In [28]:

Copied!





scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median')
# sc.pp.neighbors(scm_100k)
# scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k_filter,frameon='small',color=['Neuron type'],size=40,wspace=0.3)
# scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','Brain area'],size=100,wspace=0.3)
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median')
# sc.pp.neighbors(scm_100k)
# scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k_filter,frameon='small',color=['Neuron type'],size=40,wspace=0.3)
# scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','Brain area'],size=100,wspace=0.3)

... using all features

In [32]:

Copied!

sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3)
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3)

In [34]:

Copied!





scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion')
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion')
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)

... using top variable features based on feature_select_dispersion

In [513]:

Copied!





# pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion')
# sc.pp.neighbors(scm_100k)
# scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
# pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion')
# sc.pp.neighbors(scm_100k)
# scm.pp.umap(scm_100k)
scm.pl.pca(scm_100k,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)

In [35]:

Copied!





scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3,palette=scm.pl.ditto_palette())
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs='feature_select_dispersion',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3,palette=scm.pl.ditto_palette())

... using top variable features based on feature_select_dispersion

In [37]:

Copied!





#使用所有的特征进行计算
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3,palette=scm.pl.ditto_palette())
#使用所有的特征进行计算
scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3,palette=scm.pl.ditto_palette())

In [415]:

Copied!





pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)

... using top variable features based on feature_select_sr

In [38]:

Copied!





scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)

... using top variable features based on feature_select_sr

In [418]:

Copied!





pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr',layer='relative')
sc.pp.neighbors(scm_100k,use_rep='Xrelative_pca', method='umap', metric='cosine')
scm.pp.umap(scm_100k)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)
pca(scm_100k,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_sr',layer='relative')
sc.pp.neighbors(scm_100k,use_rep='Xrelative_pca', method='umap', metric='cosine')
scm.pp.umap(scm_100k)
scm.pl.umap(scm_100k,frameon='small',color=['Neuron type','layer'],size=100,wspace=0.3)

... using top variable features based on feature_select_sr

In [39]:

Copied!

sc.tl.tsne(scm_100k_filter)
scm.pl.tsne(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)
sc.tl.tsne(scm_100k_filter)
scm.pl.tsne(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=40,wspace=0.3)

In [ ]:

Copied!

### using residuals data to take hvm (calculating residuals var and mean)
### using residuals data to take hvm (calculating residuals var and mean)

In [41]:

Copied!





scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_residual',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3,palette=scm.pl.ditto_palette())
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_residual',layer='relative')
sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3,palette=scm.pl.ditto_palette())

... using top variable features based on feature_select_residual

In [43]:

Copied!





#using mean methylation to conduct dimension
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_residual')
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3,palette=scm.pl.ditto_palette())
#using mean methylation to conduct dimension
scm.pp.pca(scm_100k_filter,n_pcs=50,impute='median',use_hvm=True,hvm_obs='feature_select_residual')
sc.pp.neighbors(scm_100k_filter)
scm.pp.umap(scm_100k_filter)
scm.pl.umap(scm_100k_filter,frameon='small',color=['Neuron type','layer'],size=30,wspace=0.3,palette=scm.pl.ditto_palette())

... using top variable features based on feature_select_residual

scm_100k¶

In [44]:

Copied!





annot = []
for n in scm_100k.obs['Neuron type']:
    if n in ['hL6-1', 'hL6-2','hL6-3','hDL-1', 'hDL-2', 'hDL-3']:
        annot.append('deep layer')
    elif n in ['hL5-1', 'hL5-2','hL5-3','hL4']:
        annot.append('middle layer')
    elif n in ['hSst-1','hSst-2','hSst-3','hPv-1','hPv-2','hVip-1', 'hVip-2','hNdnf','hNos']:
        annot.append('Inhibitory')
    else:
        annot.append('upper layer - L2/3')
        
        
scm_100k.obs['layer'] = annot        
sc.pl.umap(scm_100k, color=['layer', 'Neuron type'],wspace =0.2)
annot = []
for n in scm_100k.obs['Neuron type']:
    if n in ['hL6-1', 'hL6-2','hL6-3','hDL-1', 'hDL-2', 'hDL-3']:
        annot.append('deep layer')
    elif n in ['hL5-1', 'hL5-2','hL5-3','hL4']:
        annot.append('middle layer')
    elif n in ['hSst-1','hSst-2','hSst-3','hPv-1','hPv-2','hVip-1', 'hVip-2','hNdnf','hNos']:
        annot.append('Inhibitory')
    else:
        annot.append('upper layer - L2/3')
        
        
scm_100k.obs['layer'] = annot        
sc.pl.umap(scm_100k, color=['layer', 'Neuron type'],wspace =0.2)

/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: FutureWarning: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
  color_vector = pd.Categorical(values.map(color_map))
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: FutureWarning: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
  color_vector = pd.Categorical(values.map(color_map))
/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

In [111]:

Copied!

scm_100k.obs['Neuron type'].unique()
scm_100k.obs['Neuron type'].unique()

Out[111]:

['hSst-2', 'hL2/3', 'hDL-1', NaN, 'hL5-2', ..., 'hL5-1', 'hSst-1', 'hSst-3', 'hDL-2', 'hPv-1']
Length: 22
Categories (21, object): ['hDL-1', 'hDL-2', 'hDL-3', 'hL2/3', ..., 'hSst-2', 'hSst-3', 'hVip-1', 'hVip-2']

In [97]:

Copied!

scm_100k.var
scm_100k.var

Out[97]:

	chromosome	start	end	covered_cell	var	sr_var	mean	pct_cell	dispersion	log_dispersion	...	cov_bin	dispersion_norm	feature_select_sr	feature_select_var	feature_select_dispersion	feature_select_random	mean_sr	sr_dispersion	log_sr_dispersion	feature_select_residual
index
chr1:1-100000	chr1	1	100000	2353	0.035220	0.029984	0.710440	0.995347	20.171644	3.004278	...	235	-0.832056	True	True	False	True	0.330840	11.033983	2.400980	False
chr1:100001-200000	chr1	100001	200000	2354	0.009842	0.016549	0.810938	0.995770	82.395519	4.411531	...	235	0.051438	False	False	False	False	0.364824	22.044721	3.093073	False
chr1:200001-300000	chr1	200001	300000	2351	0.030100	0.035758	0.730608	0.994501	24.272813	3.189357	...	235	-1.055196	True	True	False	False	0.379749	10.619905	2.362730	False
chr1:300001-400000	chr1	300001	400000	2323	0.026505	0.055943	0.819564	0.982657	30.921580	3.431454	...	233	-0.015202	True	True	False	True	0.310300	5.546675	1.713199	False
chr1:400001-500000	chr1	400001	500000	2346	0.014380	0.038490	0.836147	0.992386	58.147901	4.062990	...	234	-0.125231	True	False	False	False	0.375837	9.764573	2.278761	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
chrY:56800001-56900000	chrY	56800001	56900000	2361	0.002751	0.003180	0.503138	0.998731	182.910968	5.209000	...	236	4.556713	False	False	True	False	0.256169	80.551067	4.388891	True
chrY:56900001-57000000	chrY	56900001	57000000	2122	0.098136	0.067797	0.585631	0.897631	5.967571	1.786340	...	235	-0.670919	True	True	False	False	0.199789	2.946852	1.080738	False
chrY:57000001-57100000	chrY	57000001	57100000	2252	0.066954	0.067381	0.664001	0.952623	9.917348	2.294286	...	234	-0.554419	True	True	False	False	0.274920	4.080066	1.406113	False
chrY:57100001-57200000	chrY	57100001	57200000	2311	0.046440	0.054673	0.684819	0.977580	14.746337	2.690995	...	234	-0.448433	True	True	False	False	0.269660	4.932242	1.595794	False
chrY:57200001-57227415	chrY	57200001	57227415	2125	0.040066	0.080984	0.790896	0.898900	19.740019	2.982648	...	232	-0.297100	True	True	False	True	0.264834	3.270228	1.184860	False

29485 rows × 21 columns

Episcanpy method¶

In [182]:

Copied!

#load 100kb windows methylation matrix
scanpy_100k = ad.read("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/sc/")
#load 100kb windows methylation matrix
scanpy_100k = ad.read("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/sc/")

 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/anndata/__init__.py:51: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024.

In [184]:

Copied!

w_mtx_0 = pd.read_csv("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/raw/test_CG_100kb",sep='\t', header=None)
w_mtx_0 = pd.read_csv("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/raw/test_CG_100kb",sep='\t', header=None)

In [185]:

Copied!

w_mtx_0 = w_mtx_0.set_index(w_mtx_0.columns[0])
w_mtx_0
w_mtx_0 = w_mtx_0.set_index(w_mtx_0.columns[0])
w_mtx_0

Out[185]:

	1	2	3	4	5	6	7	8	9	10	...	30869	30870	30871	30872	30873	30874	30875	30876	30877	30878
0
GSM2559344_updated.bed	0.833	0.987	0.000	0.000	0.929	NaN	0.963	0.000	0.000	0.998	...	NaN	NaN	NaN	0.999	0.000	0.000	NaN	0.929	0.000	NaN
GSM2557929_updated.bed	0.000	0.980	0.889	0.966	0.000	0.941	0.986	0.000	0.994	0.998	...	NaN	NaN	NaN	1.000	0.000	0.000	NaN	0.889	0.000	NaN
GSM2558161_updated.bed	0.982	0.993	0.000	0.000	0.964	0.964	0.000	0.000	0.995	0.999	...	NaN	NaN	NaN	0.000	0.000	0.000	0.857	0.900	0.955	NaN
GSM2557053_updated.bed	0.988	0.996	0.000	0.000	0.986	0.975	0.994	0.000	0.999	0.999	...	NaN	NaN	NaN	1.000	0.000	0.000	0.000	0.000	0.000	NaN
GSM2558171_updated.bed	0.974	0.000	0.944	0.952	0.909	0.000	0.000	0.000	0.993	0.999	...	NaN	NaN	NaN	0.000	0.000	0.000	0.000	0.000	0.000	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
GSM2558767_updated.bed	0.000	0.980	0.000	NaN	0.950	0.950	0.985	0.974	0.982	0.998	...	NaN	NaN	NaN	0.999	0.000	0.992	NaN	NaN	0.960	NaN
GSM2558199_updated.bed	0.923	0.972	0.923	0.962	0.947	0.000	0.988	0.982	0.990	0.998	...	NaN	NaN	NaN	0.999	0.000	0.000	0.000	NaN	0.833	NaN
GSM2558196_updated.bed	0.000	0.992	0.958	0.977	0.970	0.000	0.988	0.991	0.996	0.999	...	NaN	NaN	NaN	0.999	0.999	0.000	0.000	NaN	0.950	NaN
GSM2556858_updated.bed	0.000	0.000	NaN	NaN	0.952	0.950	0.979	0.974	0.000	0.998	...	NaN	NaN	NaN	0.997	0.000	0.000	NaN	NaN	NaN	NaN
GSM2558354_updated.bed	0.980	0.000	0.971	0.933	0.962	0.938	0.000	0.983	0.992	0.000	...	NaN	NaN	NaN	0.000	0.000	0.000	NaN	0.909	0.857	NaN

2365 rows × 30878 columns

In [186]:

Copied!





import episcanpy as epi
import os
import glob
import anndata as ad
import pandas as pd
import episcanpy as epi
import os
import glob
import anndata as ad
import pandas as pd

In [190]:

Copied!

windows = epi.ct.make_windows(100000)
w_names = epi.ct.name_features(windows)
#w_mtx_0 = w_mtx_0.drop(columns=w_mtx_0.columns[-1])
windows = epi.ct.make_windows(100000)
w_names = epi.ct.name_features(windows)
#w_mtx_0 = w_mtx_0.drop(columns=w_mtx_0.columns[-1])

In [200]:

Copied!

var_df=pd.DataFrame(index=w_names, columns=['var_names'])
var_df=pd.DataFrame(index=w_names, columns=['var_names'])

In [203]:

Copied!

w_mtx_0.columns = w_names
w_mtx_0.columns = w_names

In [204]:

Copied!

scanpy_100k =  ad.AnnData(w_mtx_0, obs=pd.DataFrame(index=w_mtx_0.index), var=var_df)
scanpy_100k =  ad.AnnData(w_mtx_0, obs=pd.DataFrame(index=w_mtx_0.index), var=var_df)

In [56]:

Copied!

import episcanpy as epi
import episcanpy as epi

quality control (from episcanpy tutorials)¶

In [205]:

Copied!





xdata = scanpy_100k
length1 = len(xdata.X[0,:]) # 55017 features,列
length2 = len(xdata.X[:,0]) # 3379 samples 行
xdata.obs['coverage_feature'] = [length1 - np.isnan(line).sum() for line in xdata.X] #每一个样本中覆盖的特征数
xdata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in xdata.X]
xdata.var['coverage_cells'] = [length2 - np.isnan(line).sum() for line in xdata.X.T] #每一个特征在多少个样本中有覆盖
xdata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in xdata.X.T]
xdata = scanpy_100k
length1 = len(xdata.X[0,:]) # 55017 features,列
length2 = len(xdata.X[:,0]) # 3379 samples 行
xdata.obs['coverage_feature'] = [length1 - np.isnan(line).sum() for line in xdata.X] #每一个样本中覆盖的特征数
xdata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in xdata.X]
xdata.var['coverage_cells'] = [length2 - np.isnan(line).sum() for line in xdata.X.T] #每一个特征在多少个样本中有覆盖
xdata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in xdata.X.T]

In [206]:

Copied!

scanpy_100k.obs
scanpy_100k.obs

Out[206]:

	coverage_feature	mean_cell_methylation
0
GSM2559344_updated.bed	29384	0.795401
GSM2557929_updated.bed	29445	0.814468
GSM2558161_updated.bed	29403	0.805425
GSM2557053_updated.bed	29477	0.792656
GSM2558171_updated.bed	29435	0.841988
...	...	...
GSM2558767_updated.bed	29306	0.802762
GSM2558199_updated.bed	29309	0.827274
GSM2558196_updated.bed	29489	0.841542
GSM2556858_updated.bed	29311	0.789884
GSM2558354_updated.bed	28520	0.750032

2365 rows × 2 columns

Filtering low quality cells¶

In [471]:

Copied!

#filter out cells with less than 10000 enhancers covered
scanpy_100k=scanpy_100k[scanpy_100k.obs['coverage_feature']>10000,:].copy()
scanpy_100k
#filter out cells with less than 10000 enhancers covered
scanpy_100k=scanpy_100k[scanpy_100k.obs['coverage_feature']>10000,:].copy()
scanpy_100k

Out[471]:

AnnData object with n_obs × n_vars = 2350 × 30877
    obs: 'coverage_feature', 'mean_cell_methylation', 'coverage_cells'
    var: 'var_names', 'coverage_cells', 'mean_feature_methylation', 'coverage_feature'

Imputation missing data¶

In [472]:

Copied!

adata = epi.pp.imputation_met(scanpy_100k, number_cell_covered=500, imputation_value='mean', save=None, copy=True)
adata = epi.pp.imputation_met(scanpy_100k, number_cell_covered=500, imputation_value='mean', save=None, copy=True)

 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/episcanpy/preprocessing/_readimpute.py:90: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

In [473]:

Copied!





# recalculing qc values
length1 = len(adata.X[0,:])
length2 = len(adata.X[:,0])
adata.obs['coverage_cells'] = [length1 - np.isnan(line).sum() for line in adata.X]
adata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in adata.X]
adata.var['coverage_feature'] = [length2 - np.isnan(line).sum() for line in adata.X.T]
adata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in adata.X.T]
# recalculing qc values
length1 = len(adata.X[0,:])
length2 = len(adata.X[:,0])
adata.obs['coverage_cells'] = [length1 - np.isnan(line).sum() for line in adata.X]
adata.obs['mean_cell_methylation'] = [np.nansum(line)/length1 for line in adata.X]
adata.var['coverage_feature'] = [length2 - np.isnan(line).sum() for line in adata.X.T]
adata.var['mean_feature_methylation'] = [np.nansum(line)/length2 for line in adata.X.T]

Low dimensional representation¶

In [474]:

Copied!

# perform PCA, neighbor graph, tSNE and UMAP
epi.pp.lazy(adata)
# perform PCA, neighbor graph, tSNE and UMAP
epi.pp.lazy(adata)

In [475]:

Copied!

adata
adata

Out[475]:

AnnData object with n_obs × n_vars = 2350 × 29494
    obs: 'coverage_feature', 'mean_cell_methylation', 'coverage_cells'
    var: 'var_names', 'coverage_cells', 'mean_feature_methylation', 'coverage_feature'
    uns: 'pca', 'neighbors', 'tsne', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [476]:

Copied!

adata.obs
adata.obs

Out[476]:

	coverage_feature	mean_cell_methylation	coverage_cells
0
GSM2559344_updated.bed	29384	0.834547	29494
GSM2557929_updated.bed	29445	0.853282	29494
GSM2558161_updated.bed	29403	0.845088	29494
GSM2557053_updated.bed	29477	0.830104	29494
GSM2558171_updated.bed	29435	0.882272	29494
...	...	...	...
GSM2558767_updated.bed	29306	0.844110	29494
GSM2558199_updated.bed	29309	0.869876	29494
GSM2558196_updated.bed	29489	0.881006	29494
GSM2556858_updated.bed	29311	0.830229	29494
GSM2558354_updated.bed	28520	0.811096	29494

2350 rows × 3 columns

In [477]:

Copied!





# 假设你已经有一个 AnnData 对象 adata
# 添加 cell_name 列，内容为当前 obs 的索引值
adata.obs['cell_name'] = adata.obs.index

# 去掉索引中 _updated.bed 后缀
new_index = adata.obs.index.str.replace('_updated.bed', '', regex=False)

# 将修改后的索引赋值回 adata.obs
adata.obs.index = new_index

# 如果还需要，更新 cell_name 列，使其保持和索引一致
adata.obs['cell_name'] = new_index
# 假设你已经有一个 AnnData 对象 adata
# 添加 cell_name 列，内容为当前 obs 的索引值
adata.obs['cell_name'] = adata.obs.index

# 去掉索引中 _updated.bed 后缀
new_index = adata.obs.index.str.replace('_updated.bed', '', regex=False)

# 将修改后的索引赋值回 adata.obs
adata.obs.index = new_index

# 如果还需要，更新 cell_name 列，使其保持和索引一致
adata.obs['cell_name'] = new_index

In [478]:

Copied!

scm.pp.add_meta(adata,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
scm.pp.add_meta(adata,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')

No column 'label' found in metadata, 'unknown' is used as the default cell labels
No column 'label_color' found in metadata, random color is generated for each cell label

In [218]:

Copied!

adata.obs
adata.obs

Out[218]:

	coverage_feature	mean_cell_methylation	coverage_cells	cell_name	Sample_id	Sample	Animal age	FACS date	Brain area	Laminar layer	...	% Filtered reads	mCCC/CCC	mCG/CG	mCH/CH	Estimated mCG/CG	Estimated mCH/CH	Coverage (%)	Neuron type	tSNE x coordinate	tSNE y coordinate
0	29384	0.834547	29494	GSM2559344	GSM2559344	Pool_9_AD010_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	47.80%	0.00742	0.78505	0.03876	0.78344	0.03157	4.64	hL5-4	-2.18678	-12.80550
1	29445	0.853282	29494	GSM2557929	GSM2557929	Pool_338_AD006_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	42.40%	0.01529	0.78961	0.07792	0.78634	0.06360	5.00	hL5-4	-9.15026	-14.39010
2	29403	0.845088	29494	GSM2558161	GSM2558161	Pool_451_AD008_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	43.20%	0.01145	0.77535	0.06240	0.77275	0.05154	7.10	hDL-1	-9.59999	-1.82943
3	29477	0.830104	29494	GSM2557053	GSM2557053	Pool_1365_AD006_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	39.00%	0.00983	0.76679	0.05151	0.76447	0.04209	11.41	hL2/3	-16.85230	5.09134
4	29435	0.882272	29494	GSM2558171	GSM2558171	Pool_457_AD008_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	47.60%	0.01625	0.82785	0.08610	0.82501	0.07100	6.76	hPv-1	12.97110	-6.70419
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2345	29306	0.844110	29494	GSM2558767	GSM2558767	Pool_737_AD002_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	44.60%	0.01255	0.78907	0.06757	0.78639	0.05572	3.48	hL5-4	-9.11452	-13.56870
2346	29309	0.869876	29494	GSM2558199	GSM2558199	Pool_46_AD008_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	52.00%	0.01299	0.83044	0.06731	0.82821	0.05503	4.58	hVip-2	20.16110	0.64929
2347	29489	0.881006	29494	GSM2558196	GSM2558196	Pool_468_AD010_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	39.70%	0.01157	0.82074	0.06483	0.81864	0.05388	9.41	hSst-2	18.23110	-5.97131
2348	29311	0.830229	29494	GSM2556858	GSM2556858	Pool_1315_AD010_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	64.20%	0.00827	0.80346	0.03818	0.80182	0.03016	3.54	hSst-2	17.80530	-4.53349
2349	28520	0.811096	29494	GSM2558354	GSM2558354	Pool_543_AD002_indexed	25 yr	2012/6/29	Mid Frontal Gyrus	NaN	...	39.30%	0.00344	0.76130	0.00503	0.76048	0.00160	4.54	NaN	NaN	NaN

2350 rows × 37 columns

In [219]:

Copied!





sc.pl.umap(adata, color=['Library pool', 'Mapped reads',
                         'Index i5', 'Index i7',
                         'i5 sequence', 'i7 sequence',
                         'mCCC/CCC', 'mCG/CG', 'mCH/CH'], wspace=0.8)
sc.pl.umap(adata, color=['Library pool', 'Mapped reads',
                         'Index i5', 'Index i7',
                         'i5 sequence', 'i7 sequence',
                         'mCCC/CCC', 'mCG/CG', 'mCH/CH'], wspace=0.8)

 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
 UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
 UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
 UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
 UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
 UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

In [479]:

Copied!





annot = []
for n in adata.obs['Neuron type']:
    if n in ['hL6-1', 'hL6-2','hL6-3']:
        annot.append('deep layer - L6')
    elif n in ['hL5-1', 'hL5-2','hL5-3','hL4']:
        annot.append('middle layer')
    elif n in ['hDL-1', 'hDL-2', 'hDL-3']:
        annot.append('deep layer - L5')
    elif n in ['hSst-1','hSst-2','hSst-3','hPv-1','hPv-2','hVip-1', 'hVip-2','hNdnf','hNos']:
        annot.append('Inhibitory')
    else:
        annot.append('upper layer - L2/3')
        
        
adata.obs['layer'] = annot

# all features

sc.pl.umap(adata, color=['layer', 'Neuron type'],wspace =0.3)
annot = []
for n in adata.obs['Neuron type']:
    if n in ['hL6-1', 'hL6-2','hL6-3']:
        annot.append('deep layer - L6')
    elif n in ['hL5-1', 'hL5-2','hL5-3','hL4']:
        annot.append('middle layer')
    elif n in ['hDL-1', 'hDL-2', 'hDL-3']:
        annot.append('deep layer - L5')
    elif n in ['hSst-1','hSst-2','hSst-3','hPv-1','hPv-2','hVip-1', 'hVip-2','hNdnf','hNos']:
        annot.append('Inhibitory')
    else:
        annot.append('upper layer - L2/3')
        
        
adata.obs['layer'] = annot

# all features

sc.pl.umap(adata, color=['layer', 'Neuron type'],wspace =0.3)

 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
 UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:1234: The default value of 'ignore' for the `na_action` parameter in pandas.Categorical.map is deprecated and will be changed to 'None' in a future version. Please set na_action to the desired value to avoid seeing this warning
 UserWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

In [ ]:

Copied!

# for top 5000 features
# for top 5000 features

In [ ]:

Copied!

adata.write('/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scanpy/windows100k.h5ad')
adata.write('/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scanpy/windows100k.h5ad')

In [234]:

Copied!

pca(scm_100k,impute='median')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)
pca(scm_100k,impute='median')
sc.pp.neighbors(scm_100k)
scm.pp.umap(scm_100k)

... using all features

In [235]:

Copied!

scm.pl.umap(scm_100k, color=['layer', 'Neuron type'],wspace=0.4)
scm.pl.umap(scm_100k, color=['layer', 'Neuron type'],wspace=0.4)

In [45]:

Copied!





#聚类效果评价
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
    this_step = 0
    this_min = float(range_min)
    this_max = float(range_max)
    while this_step < max_steps:
        print('step ' + str(this_step))
        this_resolution = this_min + ((this_max-this_min)/2)
        sc.tl.leiden(adata,resolution=this_resolution)
        this_clusters = adata.obs['leiden'].nunique()
        
        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
        
        if this_clusters > n_cluster:
            this_max = this_resolution
        elif this_clusters < n_cluster:
            this_min = this_resolution
        else:
            return(this_resolution, adata)
        this_step += 1
    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + + str(this_resolution))
#聚类效果评价
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
    this_step = 0
    this_min = float(range_min)
    this_max = float(range_max)
    while this_step < max_steps:
        print('step ' + str(this_step))
        this_resolution = this_min + ((this_max-this_min)/2)
        sc.tl.leiden(adata,resolution=this_resolution)
        this_clusters = adata.obs['leiden'].nunique()
        
        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
        
        if this_clusters > n_cluster:
            this_max = this_resolution
        elif this_clusters < n_cluster:
            this_min = this_resolution
        else:
            return(this_resolution, adata)
        this_step += 1
    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + + str(this_resolution))

In [46]:

Copied!

num_clusters = scm_100k_filter.obs['Neuron type'].nunique()
getNClusters(scm_100k_filter,n_cluster=num_clusters)
num_clusters = scm_100k_filter.obs['Neuron type'].nunique()
getNClusters(scm_100k_filter,n_cluster=num_clusters)

step 0
got 17 at resolution 1.5
step 1
got 20 at resolution 2.25
step 2
got 23 at resolution 2.625
step 3
got 23 at resolution 2.4375
step 4
got 20 at resolution 2.34375
step 5
got 22 at resolution 2.390625
step 6
got 22 at resolution 2.3671875
step 7
got 22 at resolution 2.35546875
step 8
got 22 at resolution 2.349609375
step 9
got 21 at resolution 2.3466796875

Out[46]:

(2.3466796875,
 AnnData object with n_obs × n_vars = 2313 × 29509
     obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks', 'layer', 'leiden'
     var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'mean_sr', 'sr_dispersion', 'log_sr_dispersion', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random', 'feature_select_residual'
     uns: 'workdir', 'label_color', 'relative_pca_var_ratios', 'neighbors', 'umap', 'Neuron type_colors', 'X_pca_var_ratios', 'Brain area_colors', 'layer_colors', 'tsne', 'leiden'
     obsm: 'relative_pca', 'X_umap', 'X_pca', 'X_tsne'
     layers: 'relative'
     obsp: 'distances', 'connectivities')

In [47]:

Copied!

scm.pl.dendrogram(scm_100k_filter,groupby='leiden')
scm.pl.dendrogram(scm_100k_filter,groupby='leiden')

[ 5. 15.] [0. 0.]
[25. 35.] [0. 0.]
[75. 85.] [0. 0.]
[65.] [0.]
[55.] [0.]
[45.] [0.]
[ 95. 105.] [0. 0.]
[125. 135.] [0. 0.]
[115.] [0.]
[155. 165.] [0. 0.]
[145.] [0.]
[175. 185.] [0. 0.]
[195. 205.] [0. 0.]

/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scanpy/tools/_dendrogram.py:135: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  mean_df = rep_df.groupby(level=0).mean()

Out[47]:

<Axes: >

In [48]:

Copied!

scm.pl.umap(scm_100k_filter, color=['leiden', 'Neuron type'],wspace=0.4)
scm.pl.umap(scm_100k_filter, color=['leiden', 'Neuron type'],wspace=0.4)

In [18]:

Copied!





from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score

Adjust metrics¶

In [40]:

Copied!

df_metrics = pd.DataFrame(columns=['ARI_Louvain',
                                   'AMI_Louvain',
                                   'Homogeneity_Louvain'])
df_metrics = pd.DataFrame(columns=['ARI_Louvain',
                                   'AMI_Louvain',
                                   'Homogeneity_Louvain'])

In [251]:

Copied!





adata = scm_100k_filter
adata.obs['label'] = adata.obs['Neuron type']
method = 'filer_residual_dispersion_all_feature'
df_metrics = pd.DataFrame(columns=['ARI_Louvain',
                                   'AMI_Louvain',
                                   'Homogeneity_Louvain'])


#有空值
# 添加一个新的类别 'unknown' 并用它来填充空值
adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
adata.obs['label'].fillna('unknown', inplace=True)

#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])

# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')

# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])

df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain] 
adata = scm_100k_filter
adata.obs['label'] = adata.obs['Neuron type']
method = 'filer_residual_dispersion_all_feature'
df_metrics = pd.DataFrame(columns=['ARI_Louvain',
                                   'AMI_Louvain',
                                   'Homogeneity_Louvain'])


#有空值
# 添加一个新的类别 'unknown' 并用它来填充空值
adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
adata.obs['label'].fillna('unknown', inplace=True)

#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])

# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')

# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])

df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain] 

 FutureWarning:/tmp/ipykernel_71005/593313683.py:12: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.

In [42]:

Copied!

scm_100k_filter
scm_100k_filter

Out[42]:

AnnData object with n_obs × n_vars = 2313 × 29485
    obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks', 'label', 'leiden'
    var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'mean', 'pct_cell', 'dispersion', 'log_dispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'mean_sr', 'sr_dispersion', 'feature_select_sr', 'feature_select_var', 'feature_select_dispersion', 'feature_select_random', 'feature_select_residual'
    uns: 'workdir', 'label_color', 'relative_pca_var_ratios', 'neighbors', 'umap', 'leiden'
    obsm: 'relative_pca', 'X_umap', 'scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap'
    layers: 'relative'
    obsp: 'distances', 'connectivities'

In [43]:

Copied!





feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = [3000,5000,10000,29485]
clusters = scm_100k_filter.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型

for i, n in enumerate(feature_num):
    feature_select(scm_100k_filter,number=n)
    
    for j, f in enumerate(feature_select_method):
        scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs=f)
        sc.pp.neighbors(scm_100k_filter)
        scm.pp.umap(scm_100k_filter)
    
        adata = scm_100k_filter
        adata.obs['label'] = adata.obs['Neuron type']
        method = f + '_' + str(n)

        sc.tl.leiden(adata)
        #有空值
        # 添加一个新的类别 'unknown' 并用它来填充空值
        adata.obs['label'].fillna('unknown', inplace=True)
        # adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
        # adata.obs['label'].fillna('unknown', inplace=True)
        
        #adjusted rank index
        ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
        
        # ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
        #adjusted mutual information
        ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
        
        # ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
        #homogeneity
        homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
        # homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
        
        df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
        df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
        df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
    
        scm_100k_filter.obsm[method+'_umap'] = adata.obsm["X_umap"]
            # axes[j,i].scatter(adata.obsm["X_tsne"],c=colors9[celltypes_numeric],s=2,rasterized=True)
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = [3000,5000,10000,29485]
clusters = scm_100k_filter.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型

for i, n in enumerate(feature_num):
    feature_select(scm_100k_filter,number=n)
    
    for j, f in enumerate(feature_select_method):
        scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs=f)
        sc.pp.neighbors(scm_100k_filter)
        scm.pp.umap(scm_100k_filter)
    
        adata = scm_100k_filter
        adata.obs['label'] = adata.obs['Neuron type']
        method = f + '_' + str(n)

        sc.tl.leiden(adata)
        #有空值
        # 添加一个新的类别 'unknown' 并用它来填充空值
        adata.obs['label'].fillna('unknown', inplace=True)
        # adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
        # adata.obs['label'].fillna('unknown', inplace=True)
        
        #adjusted rank index
        ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
        
        # ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
        #adjusted mutual information
        ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
        
        # ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
        #homogeneity
        homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
        # homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
        
        df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
        df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
        df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
    
        scm_100k_filter.obsm[method+'_umap'] = adata.obsm["X_umap"]
            # axes[j,i].scatter(adata.obsm["X_tsne"],c=colors9[celltypes_numeric],s=2,rasterized=True)

(2313, 29485)
... using top variable features based on feature_select_random

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_var

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_sr

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_dispersion

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_residual

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

(2313, 29485)
... using top variable features based on feature_select_random

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_var

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_sr

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_dispersion

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_residual

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

(2313, 29485)
... using top variable features based on feature_select_random

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_var

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_sr

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_dispersion

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_residual

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

(2313, 29485)
... using top variable features based on feature_select_random

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_var

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_sr

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_dispersion

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_residual

/tmp/ipykernel_17336/1835152923.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

In [51]:

Copied!





feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = [3000,5000,10000,29485]
clusters = scm_100k_filter.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型

for i, n in enumerate(feature_num):
    feature_select(scm_100k_filter,number=n)
    
    for j, f in enumerate(feature_select_method):
        scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs=f,layer='relative')
        sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
        scm.pp.umap(scm_100k_filter)
    
        adata = scm_100k_filter
        adata.obs['label'] = adata.obs['Neuron type']
        method = 'scm_'+ f + '_' + str(n)

        sc.tl.leiden(adata)
        #有空值
        # 添加一个新的类别 'unknown' 并用它来填充空值
        #adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
        adata.obs['label'].fillna('unknown', inplace=True)
        
        #adjusted rank index
        ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
        
        # ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
        #adjusted mutual information
        ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
        
        # ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
        #homogeneity
        homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
        # homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
        
        df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
        df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
        df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
    
        scm_100k_filter.obsm['scm_' + method + '_umap'] = adata.obsm["X_umap"]
            # axes[j,i].scatter(adata.obsm["X_tsne"],c=colors9[celltypes_numeric],s=2,rasterized=True)
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = [3000,5000,10000,29485]
clusters = scm_100k_filter.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型

for i, n in enumerate(feature_num):
    feature_select(scm_100k_filter,number=n)
    
    for j, f in enumerate(feature_select_method):
        scm.pp.pca(scm_100k_filter,n_pcs=30,impute='median',use_hvm=True,hvm_obs=f,layer='relative')
        sc.pp.neighbors(scm_100k_filter,use_rep='relative_pca')
        scm.pp.umap(scm_100k_filter)
    
        adata = scm_100k_filter
        adata.obs['label'] = adata.obs['Neuron type']
        method = 'scm_'+ f + '_' + str(n)

        sc.tl.leiden(adata)
        #有空值
        # 添加一个新的类别 'unknown' 并用它来填充空值
        #adata.obs['label'] = adata.obs['label'].cat.add_categories('unknown')
        adata.obs['label'].fillna('unknown', inplace=True)
        
        #adjusted rank index
        ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['leiden'])
        
        # ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
        #adjusted mutual information
        ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['leiden'],average_method='arithmetic')
        
        # ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
        #homogeneity
        homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['leiden'])
        # homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
        
        df_metrics.loc[method,['ARI_leiden']] = [ari_louvain]
        df_metrics.loc[method,['AMI_leiden']] = [ami_louvain]
        df_metrics.loc[method,['Homogeneity_leiden']] = [homo_louvain]
    
        scm_100k_filter.obsm['scm_' + method + '_umap'] = adata.obsm["X_umap"]
            # axes[j,i].scatter(adata.obsm["X_tsne"],c=colors9[celltypes_numeric],s=2,rasterized=True)

(2313, 29485)
... using top variable features based on feature_select_random

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_var

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_sr

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_dispersion

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_residual

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

(2313, 29485)
... using top variable features based on feature_select_random

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_var

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_sr

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_dispersion

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_residual

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

(2313, 29485)
... using top variable features based on feature_select_random

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_var

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_sr

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_dispersion

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_residual

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

(2313, 29485)
... using top variable features based on feature_select_random

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_var

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_sr

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_dispersion

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

... using top variable features based on feature_select_residual

/tmp/ipykernel_17336/3271770054.py:21: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adata.obs['label'].fillna('unknown', inplace=True)

In [38]:

Copied!

feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = ['3000','5000','10000']
feature_select_method = ['feature_select_random','feature_select_var','feature_select_sr','feature_select_dispersion','feature_select_residual']
feature_num = ['3000','5000','10000']

In [54]:

Copied!

df_metrics
df_metrics

Out[54]:

	ARI_Louvain	AMI_Louvain	Homogeneity_Louvain	ARI_leiden	AMI_leiden	Homogeneity_leiden	method	feature_number
0	NaN	NaN	NaN	0.456784	0.674592	0.600334	NaN	NaN
1	NaN	NaN	NaN	0.574402	0.660247	0.570550	NaN	NaN
2	NaN	NaN	NaN	0.491452	0.751388	0.714291	NaN	NaN
3	NaN	NaN	NaN	0.543763	0.675866	0.586256	NaN	NaN
4	NaN	NaN	NaN	0.434316	0.733150	0.724680	NaN	NaN
5	NaN	NaN	NaN	0.471703	0.722031	0.693817	NaN	NaN
6	NaN	NaN	NaN	0.513774	0.696885	0.644075	NaN	NaN
7	NaN	NaN	NaN	0.500053	0.773307	0.787560	NaN	NaN
8	NaN	NaN	NaN	0.507755	0.707217	0.637069	NaN	NaN
9	NaN	NaN	NaN	0.481533	0.767520	0.784143	NaN	NaN
10	NaN	NaN	NaN	0.448819	0.731739	0.737025	NaN	NaN
11	NaN	NaN	NaN	0.428412	0.724399	0.717524	NaN	NaN
12	NaN	NaN	NaN	0.476988	0.762610	0.781313	NaN	NaN
13	NaN	NaN	NaN	0.425266	0.729153	0.699094	NaN	NaN
14	NaN	NaN	NaN	0.482629	0.767571	0.790160	NaN	NaN
15	NaN	NaN	NaN	0.403792	0.745666	0.782104	NaN	NaN
16	NaN	NaN	NaN	0.403792	0.745666	0.782104	NaN	NaN
17	NaN	NaN	NaN	0.403792	0.745666	0.782104	NaN	NaN
18	NaN	NaN	NaN	0.403792	0.745666	0.782104	NaN	NaN
19	NaN	NaN	NaN	0.403792	0.745666	0.782104	NaN	NaN
20	NaN	NaN	NaN	0.535869	0.773287	0.795181	scm_feature_select_random_	3000
21	NaN	NaN	NaN	0.476417	0.743845	0.725840	scm_feature_select_var_	3000
22	NaN	NaN	NaN	0.483194	0.779870	0.819663	scm_feature_select_sr_	3000
23	NaN	NaN	NaN	0.424999	0.732045	0.703400	scm_feature_select_dispersion_	3000
24	NaN	NaN	NaN	0.430886	0.769783	0.815982	scm_feature_select_residual_	3000
25	NaN	NaN	NaN	0.460182	0.764282	0.789593	scm_feature_select_random_	5000
26	NaN	NaN	NaN	0.452659	0.761496	0.783954	scm_feature_select_var_	5000
27	NaN	NaN	NaN	0.428122	0.777929	0.844399	scm_feature_select_sr_	5000
28	NaN	NaN	NaN	0.462310	0.769854	0.789328	scm_feature_select_dispersion_	5000
29	NaN	NaN	NaN	0.432611	0.776875	0.834491	scm_feature_select_residual_	5000
30	NaN	NaN	NaN	0.466384	0.776299	0.822934	scm_feature_select_random_	10000
31	NaN	NaN	NaN	0.420865	0.771338	0.830959	scm_feature_select_var_	10000
32	NaN	NaN	NaN	0.427085	0.776544	0.843158	scm_feature_select_sr_	10000
33	NaN	NaN	NaN	0.464543	0.778559	0.829990	scm_feature_select_dispersion_	10000
34	NaN	NaN	NaN	0.429921	0.778979	0.845232	scm_feature_select_residual_	10000
35	NaN	NaN	NaN	0.410057	0.767987	0.827087	scm_feature_select_random_	29485
36	NaN	NaN	NaN	0.410057	0.767987	0.827087	scm_feature_select_var_	29485
37	NaN	NaN	NaN	0.410057	0.767987	0.827087	scm_feature_select_sr_	29485
38	NaN	NaN	NaN	0.410057	0.767987	0.827087	scm_feature_select_dispersion_	29485
39	NaN	NaN	NaN	0.410057	0.767987	0.827087	scm_feature_select_residual_	29485

In [29]:

Copied!

# 之前做了两遍所以删除前 20 行
#df_metrics = df_metrics.drop(df_metrics.index[:20])
df_metrics
# 之前做了两遍所以删除前 20 行
#df_metrics = df_metrics.drop(df_metrics.index[:20])
df_metrics

Out[29]:

	ARI_Louvain	AMI_Louvain	Homogeneity_Louvain	ARI_leiden	AMI_leiden	Homogeneity_leiden
feature_select_random_3000	NaN	NaN	NaN	0.458048	0.685203	0.622874
feature_select_var_3000	NaN	NaN	NaN	0.574402	0.660247	0.570550
feature_select_sr_3000	NaN	NaN	NaN	0.491452	0.751388	0.714291
feature_select_dispersion_3000	NaN	NaN	NaN	0.543763	0.675866	0.586256
feature_select_residual_3000	NaN	NaN	NaN	0.434316	0.733150	0.724680
feature_select_random_5000	NaN	NaN	NaN	0.445819	0.710908	0.683368
feature_select_var_5000	NaN	NaN	NaN	0.513774	0.696885	0.644075
feature_select_sr_5000	NaN	NaN	NaN	0.500053	0.773307	0.787560
feature_select_dispersion_5000	NaN	NaN	NaN	0.507755	0.707217	0.637069
feature_select_residual_5000	NaN	NaN	NaN	0.481533	0.767520	0.784143
feature_select_random_10000	NaN	NaN	NaN	0.485736	0.753611	0.772558
feature_select_var_10000	NaN	NaN	NaN	0.428412	0.724399	0.717524
feature_select_sr_10000	NaN	NaN	NaN	0.476988	0.762610	0.781313
feature_select_dispersion_10000	NaN	NaN	NaN	0.425266	0.729153	0.699094
feature_select_residual_10000	NaN	NaN	NaN	0.482629	0.767571	0.790160
scm_feature_select_random_3000	NaN	NaN	NaN	0.534501	0.785227	0.798622
scm_feature_select_var_3000	NaN	NaN	NaN	0.476417	0.743845	0.725840
scm_feature_select_sr_3000	NaN	NaN	NaN	0.483194	0.779870	0.819663
scm_feature_select_dispersion_3000	NaN	NaN	NaN	0.424999	0.732045	0.703400
scm_feature_select_residual_3000	NaN	NaN	NaN	0.430886	0.769783	0.815982
scm_feature_select_random_5000	NaN	NaN	NaN	0.470246	0.769720	0.800332
scm_feature_select_var_5000	NaN	NaN	NaN	0.452659	0.761496	0.783954
scm_feature_select_sr_5000	NaN	NaN	NaN	0.428122	0.777929	0.844399
scm_feature_select_dispersion_5000	NaN	NaN	NaN	0.462310	0.769854	0.789328
scm_feature_select_residual_5000	NaN	NaN	NaN	0.432611	0.776875	0.834491
scm_feature_select_random_10000	NaN	NaN	NaN	0.469592	0.785871	0.833530
scm_feature_select_var_10000	NaN	NaN	NaN	0.420865	0.771338	0.830959
scm_feature_select_sr_10000	NaN	NaN	NaN	0.427085	0.776544	0.843158
scm_feature_select_dispersion_10000	NaN	NaN	NaN	0.464543	0.778559	0.829990
scm_feature_select_residual_10000	NaN	NaN	NaN	0.429921	0.778979	0.845232

In [53]:

Copied!





# 重置索引，将索引列转为普通列
df = df_metrics.reset_index()

# 使用正则表达式提取前缀和数字
df[['method', 'feature_number']] = df['index'].str.extract(r'(^\D+)(\d+$)')

# 将提取出的数字列中的空值填充为 'all'
df['feature_number'] = df['feature_number'].replace('', 'all')

# 删除原始的 'index' 列
df = df.drop(columns=['index'])
df_metrics = df
# 重置索引，将索引列转为普通列
df = df_metrics.reset_index()

# 使用正则表达式提取前缀和数字
df[['method', 'feature_number']] = df['index'].str.extract(r'(^\D+)(\d+$)')

# 将提取出的数字列中的空值填充为 'all'
df['feature_number'] = df['feature_number'].replace('', 'all')

# 删除原始的 'index' 列
df = df.drop(columns=['index'])
df_metrics = df

In [31]:

Copied!

df_metrics.to_csv('feature_metrics.csv')  # 不存储索引
df_metrics.to_csv('feature_metrics.csv')  # 不存储索引

In [46]:

Copied!





def get_plot_df(res):
    gene_selection_via = []
    k = []
    dim_reduction_method = []
    ari = []
    ami = []
    homogeneity = []
    # 使用 DataFrame 的列名直接访问数据并添加到各自的列表
    for idx, row in res.iterrows():
        gene_selection_via.append(row['method'])
        k.append(row['feature_number'])
        ari.append(row['ARI_leiden'])
        ami.append(row['AMI_leiden'])
        homogeneity.append(row['Homogeneity_leiden'])

    data_dict = dict(gene_selection_via=gene_selection_via,
                     k=k,
                     ari = ari,
                     ami = ami,
                     homogeneity = homogeneity
                        )
    data = pd.DataFrame(data_dict)
    return data.melt(id_vars=['gene_selection_via','k'])

df = get_plot_df(df_metrics)
df = df.reset_index(drop=True)
def get_plot_df(res):
    gene_selection_via = []
    k = []
    dim_reduction_method = []
    ari = []
    ami = []
    homogeneity = []
    # 使用 DataFrame 的列名直接访问数据并添加到各自的列表
    for idx, row in res.iterrows():
        gene_selection_via.append(row['method'])
        k.append(row['feature_number'])
        ari.append(row['ARI_leiden'])
        ami.append(row['AMI_leiden'])
        homogeneity.append(row['Homogeneity_leiden'])

    data_dict = dict(gene_selection_via=gene_selection_via,
                     k=k,
                     ari = ari,
                     ami = ami,
                     homogeneity = homogeneity
                        )
    data = pd.DataFrame(data_dict)
    return data.melt(id_vars=['gene_selection_via','k'])

df = get_plot_df(df_metrics)
df = df.reset_index(drop=True)

In [47]:

Copied!

df
df

Out[47]:

	gene_selection_via	k	variable	value
0	feature_select_random_	3000	ari	0.456784
1	feature_select_var_	3000	ari	0.574402
2	feature_select_sr_	3000	ari	0.491452
3	feature_select_dispersion_	3000	ari	0.543763
4	feature_select_residual_	3000	ari	0.434316
5	feature_select_random_	5000	ari	0.471703
6	feature_select_var_	5000	ari	0.513774
7	feature_select_sr_	5000	ari	0.500053
8	feature_select_dispersion_	5000	ari	0.507755
9	feature_select_residual_	5000	ari	0.481533
10	feature_select_random_	10000	ari	0.448819
11	feature_select_var_	10000	ari	0.428412
12	feature_select_sr_	10000	ari	0.476988
13	feature_select_dispersion_	10000	ari	0.425266
14	feature_select_residual_	10000	ari	0.482629
15	feature_select_random_	29485	ari	0.403792
16	feature_select_var_	29485	ari	0.403792
17	feature_select_sr_	29485	ari	0.403792
18	feature_select_dispersion_	29485	ari	0.403792
19	feature_select_residual_	29485	ari	0.403792
20	feature_select_random_	3000	ami	0.674592
21	feature_select_var_	3000	ami	0.660247
22	feature_select_sr_	3000	ami	0.751388
23	feature_select_dispersion_	3000	ami	0.675866
24	feature_select_residual_	3000	ami	0.733150
25	feature_select_random_	5000	ami	0.722031
26	feature_select_var_	5000	ami	0.696885
27	feature_select_sr_	5000	ami	0.773307
28	feature_select_dispersion_	5000	ami	0.707217
29	feature_select_residual_	5000	ami	0.767520
30	feature_select_random_	10000	ami	0.731739
31	feature_select_var_	10000	ami	0.724399
32	feature_select_sr_	10000	ami	0.762610
33	feature_select_dispersion_	10000	ami	0.729153
34	feature_select_residual_	10000	ami	0.767571
35	feature_select_random_	29485	ami	0.745666
36	feature_select_var_	29485	ami	0.745666
37	feature_select_sr_	29485	ami	0.745666
38	feature_select_dispersion_	29485	ami	0.745666
39	feature_select_residual_	29485	ami	0.745666
40	feature_select_random_	3000	homogeneity	0.600334
41	feature_select_var_	3000	homogeneity	0.570550
42	feature_select_sr_	3000	homogeneity	0.714291
43	feature_select_dispersion_	3000	homogeneity	0.586256
44	feature_select_residual_	3000	homogeneity	0.724680
45	feature_select_random_	5000	homogeneity	0.693817
46	feature_select_var_	5000	homogeneity	0.644075
47	feature_select_sr_	5000	homogeneity	0.787560
48	feature_select_dispersion_	5000	homogeneity	0.637069
49	feature_select_residual_	5000	homogeneity	0.784143
50	feature_select_random_	10000	homogeneity	0.737025
51	feature_select_var_	10000	homogeneity	0.717524
52	feature_select_sr_	10000	homogeneity	0.781313
53	feature_select_dispersion_	10000	homogeneity	0.699094
54	feature_select_residual_	10000	homogeneity	0.790160
55	feature_select_random_	29485	homogeneity	0.782104
56	feature_select_var_	29485	homogeneity	0.782104
57	feature_select_sr_	29485	homogeneity	0.782104
58	feature_select_dispersion_	29485	homogeneity	0.782104
59	feature_select_residual_	29485	homogeneity	0.782104

In [49]:

Copied!





# df = df.rename(columns={'variable':'metric','value':'metric value'})
plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=df,kind='point',col='variable',)
for ax in plot.axes.flatten():
    for item in ax.get_xticklabels():
        item.set_rotation(45)
plot.savefig('./figures/feature_metric_new_mean_epi.pdf', dpi=300, format=None)
# df = df.rename(columns={'variable':'metric','value':'metric value'})
plot = sns.catplot(x='k',y='value',hue='gene_selection_via',data=df,kind='point',col='variable',)
for ax in plot.axes.flatten():
    for item in ax.get_xticklabels():
        item.set_rotation(45)
plot.savefig('./figures/feature_metric_new_mean_epi.pdf', dpi=300, format=None)

In [67]:

Copied!

adata.write('w100k_after.h5ad')
adata.write('w100k_after.h5ad')

In [59]:

Copied!

#tsnes = ['feature_select_random_1000_umap', 'feature_select_var_1000_umap', 'feature_select_sr_1000_umap', 'feature_select_dispersion_1000_umap', 'feature_select_residual_1000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap','feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap','X_umap']
tsnes = ['scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap', 'feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap', 'feature_select_random_29485_umap', 'feature_select_var_29485_umap', 'feature_select_sr_29485_umap', 'feature_select_dispersion_29485_umap', 'feature_select_residual_29485_umap', 'scm_scm_feature_select_random_29485_umap', 'scm_scm_feature_select_var_29485_umap', 'scm_scm_feature_select_sr_29485_umap', 'scm_scm_feature_select_dispersion_29485_umap', 'scm_scm_feature_select_residual_29485_umap']
#tsnes = ['feature_select_random_1000_umap', 'feature_select_var_1000_umap', 'feature_select_sr_1000_umap', 'feature_select_dispersion_1000_umap', 'feature_select_residual_1000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap','feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap','X_umap']
tsnes = ['scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap', 'feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap', 'feature_select_random_29485_umap', 'feature_select_var_29485_umap', 'feature_select_sr_29485_umap', 'feature_select_dispersion_29485_umap', 'feature_select_residual_29485_umap', 'scm_scm_feature_select_random_29485_umap', 'scm_scm_feature_select_var_29485_umap', 'scm_scm_feature_select_sr_29485_umap', 'scm_scm_feature_select_dispersion_29485_umap', 'scm_scm_feature_select_residual_29485_umap']

In [64]:

Copied!

len(tsnes)
len(tsnes)

Out[64]:

In [63]:

Copied!





import matplotlib.pyplot as plt
import seaborn as sns

title_fontsize=16
title_fontweight = "bold"

clusters = adata.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型
titles = ['scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap', 'feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap', 'feature_select_random_29485_umap', 'feature_select_var_29485_umap', 'feature_select_sr_29485_umap', 'feature_select_dispersion_29485_umap', 'feature_select_residual_29485_umap', 'scm_scm_feature_select_random_29485_umap', 'scm_scm_feature_select_var_29485_umap', 'scm_scm_feature_select_sr_29485_umap', 'scm_scm_feature_select_dispersion_29485_umap', 'scm_scm_feature_select_residual_29485_umap']
with sns.plotting_context('paper'):
    plt.figure(figsize=(41,41))
    
    for i, (t,title) in enumerate(zip(tsnes,titles)):
        tsne = adata.obsm[t]

        plt.subplot(5,8,i +1)
        ax=plt.gca()
        # plt.text(0.05,1.015,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
                    # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                color = points.get_facecolor().flatten()
                # 在群集的中心位置添加注释，注释内容为群集的ID
                text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
                # 创建背景框对象，并设置背景框的样式
                bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
                plt.setp(text, bbox=bbox_props)
    plt.tight_layout()
    
    sns.despine(left=True,bottom=True)

    plt.savefig('./figures/feature_umap_all.pdf', dpi=300, format=None)
import matplotlib.pyplot as plt
import seaborn as sns

title_fontsize=16
title_fontweight = "bold"

clusters = adata.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型
titles = ['scm_scm_feature_select_random_3000_umap', 'scm_scm_feature_select_var_3000_umap', 'scm_scm_feature_select_sr_3000_umap', 'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_residual_3000_umap', 'scm_scm_feature_select_random_5000_umap', 'scm_scm_feature_select_var_5000_umap', 'scm_scm_feature_select_sr_5000_umap', 'scm_scm_feature_select_dispersion_5000_umap', 'scm_scm_feature_select_residual_5000_umap', 'scm_scm_feature_select_random_10000_umap', 'scm_scm_feature_select_var_10000_umap', 'scm_scm_feature_select_sr_10000_umap', 'scm_scm_feature_select_dispersion_10000_umap', 'scm_scm_feature_select_residual_10000_umap', 'feature_select_random_3000_umap', 'feature_select_var_3000_umap', 'feature_select_sr_3000_umap', 'feature_select_dispersion_3000_umap', 'feature_select_residual_3000_umap', 'feature_select_random_5000_umap', 'feature_select_var_5000_umap', 'feature_select_sr_5000_umap', 'feature_select_dispersion_5000_umap', 'feature_select_residual_5000_umap', 'feature_select_random_10000_umap', 'feature_select_var_10000_umap', 'feature_select_sr_10000_umap', 'feature_select_dispersion_10000_umap', 'feature_select_residual_10000_umap', 'feature_select_random_29485_umap', 'feature_select_var_29485_umap', 'feature_select_sr_29485_umap', 'feature_select_dispersion_29485_umap', 'feature_select_residual_29485_umap', 'scm_scm_feature_select_random_29485_umap', 'scm_scm_feature_select_var_29485_umap', 'scm_scm_feature_select_sr_29485_umap', 'scm_scm_feature_select_dispersion_29485_umap', 'scm_scm_feature_select_residual_29485_umap']
with sns.plotting_context('paper'):
    plt.figure(figsize=(41,41))
    
    for i, (t,title) in enumerate(zip(tsnes,titles)):
        tsne = adata.obsm[t]

        plt.subplot(5,8,i +1)
        ax=plt.gca()
        # plt.text(0.05,1.015,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
                    # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                color = points.get_facecolor().flatten()
                # 在群集的中心位置添加注释，注释内容为群集的ID
                text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
                # 创建背景框对象，并设置背景框的样式
                bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
                plt.setp(text, bbox=bbox_props)
    plt.tight_layout()
    
    sns.despine(left=True,bottom=True)

    plt.savefig('./figures/feature_umap_all.pdf', dpi=300, format=None)

In [66]:

Copied!





import matplotlib.pyplot as plt
import seaborn as sns

title_fontsize=20
title_fontweight = "bold"

tsnes = ['feature_select_random_3000_umap','feature_select_random_5000_umap','feature_select_random_10000_umap','feature_select_random_29485_umap',
        'feature_select_var_3000_umap','feature_select_var_5000_umap','feature_select_var_10000_umap','feature_select_var_29485_umap',
        'feature_select_dispersion_3000_umap','feature_select_dispersion_5000_umap','feature_select_dispersion_10000_umap','feature_select_dispersion_29485_umap',
        'scm_scm_feature_select_random_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_var_10000_umap', 'feature_select_random_29485_umap', 
         'scm_scm_feature_select_var_3000_umap','scm_scm_feature_select_var_5000_umap','scm_scm_feature_select_var_10000_umap','scm_scm_feature_select_var_29485_umap',
        'scm_scm_feature_select_sr_3000_umap','scm_scm_feature_select_sr_5000_umap','scm_scm_feature_select_sr_10000_umap','scm_scm_feature_select_sr_29485_umap',
         'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_dispersion_5000_umap','scm_scm_feature_select_dispersion_10000_umap','scm_scm_feature_select_dispersion_29485_umap',
         'scm_scm_feature_select_residual_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_residual_10000_umap','scm_scm_feature_select_residual_29485_umap',
        ]

clusters = adata.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型
titles = ['feature_select_random_3000_umap','feature_select_random_5000_umap','feature_select_random_10000_umap','feature_select_random_29485_umap',
        'feature_select_var_3000_umap','feature_select_var_5000_umap','feature_select_var_10000_umap','feature_select_var_29485_umap',
        'feature_select_dispersion_3000_umap','feature_select_dispersion_5000_umap','feature_select_dispersion_10000_umap','feature_select_dispersion_29485_umap',
        'scm_scm_feature_select_random_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_var_10000_umap', 'feature_select_random_29485_umap', 
         'scm_scm_feature_select_var_3000_umap','scm_scm_feature_select_var_5000_umap','scm_scm_feature_select_var_10000_umap','scm_scm_feature_select_var_29485_umap',
        'scm_scm_feature_select_sr_3000_umap','scm_scm_feature_select_sr_5000_umap','scm_scm_feature_select_sr_10000_umap','scm_scm_feature_select_sr_29485_umap',
         'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_dispersion_5000_umap','scm_scm_feature_select_dispersion_10000_umap','scm_scm_feature_select_dispersion_29485_umap',
         'scm_scm_feature_select_residual_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_residual_10000_umap','scm_scm_feature_select_residual_29485_umap',
        ]
with sns.plotting_context('paper'):
    plt.figure(figsize=(41,41))
    
    for i, (t,title) in enumerate(zip(tsnes,titles)):
        tsne = adata.obsm[t]

        plt.subplot(8,4,i +1)
        ax=plt.gca()
        # plt.text(0.05,1.015,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
                    # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                color = points.get_facecolor().flatten()
                # 在群集的中心位置添加注释，注释内容为群集的ID
                text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
                # 创建背景框对象，并设置背景框的样式
                bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
                plt.setp(text, bbox=bbox_props)
    plt.tight_layout()
    
    sns.despine(left=True,bottom=True)

    plt.savefig('./figures/feature_umap_all_v2.pdf', dpi=300, format=None)
import matplotlib.pyplot as plt
import seaborn as sns

title_fontsize=20
title_fontweight = "bold"

tsnes = ['feature_select_random_3000_umap','feature_select_random_5000_umap','feature_select_random_10000_umap','feature_select_random_29485_umap',
        'feature_select_var_3000_umap','feature_select_var_5000_umap','feature_select_var_10000_umap','feature_select_var_29485_umap',
        'feature_select_dispersion_3000_umap','feature_select_dispersion_5000_umap','feature_select_dispersion_10000_umap','feature_select_dispersion_29485_umap',
        'scm_scm_feature_select_random_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_var_10000_umap', 'feature_select_random_29485_umap', 
         'scm_scm_feature_select_var_3000_umap','scm_scm_feature_select_var_5000_umap','scm_scm_feature_select_var_10000_umap','scm_scm_feature_select_var_29485_umap',
        'scm_scm_feature_select_sr_3000_umap','scm_scm_feature_select_sr_5000_umap','scm_scm_feature_select_sr_10000_umap','scm_scm_feature_select_sr_29485_umap',
         'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_dispersion_5000_umap','scm_scm_feature_select_dispersion_10000_umap','scm_scm_feature_select_dispersion_29485_umap',
         'scm_scm_feature_select_residual_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_residual_10000_umap','scm_scm_feature_select_residual_29485_umap',
        ]

clusters = adata.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型
titles = ['feature_select_random_3000_umap','feature_select_random_5000_umap','feature_select_random_10000_umap','feature_select_random_29485_umap',
        'feature_select_var_3000_umap','feature_select_var_5000_umap','feature_select_var_10000_umap','feature_select_var_29485_umap',
        'feature_select_dispersion_3000_umap','feature_select_dispersion_5000_umap','feature_select_dispersion_10000_umap','feature_select_dispersion_29485_umap',
        'scm_scm_feature_select_random_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_var_10000_umap', 'feature_select_random_29485_umap', 
         'scm_scm_feature_select_var_3000_umap','scm_scm_feature_select_var_5000_umap','scm_scm_feature_select_var_10000_umap','scm_scm_feature_select_var_29485_umap',
        'scm_scm_feature_select_sr_3000_umap','scm_scm_feature_select_sr_5000_umap','scm_scm_feature_select_sr_10000_umap','scm_scm_feature_select_sr_29485_umap',
         'scm_scm_feature_select_dispersion_3000_umap', 'scm_scm_feature_select_dispersion_5000_umap','scm_scm_feature_select_dispersion_10000_umap','scm_scm_feature_select_dispersion_29485_umap',
         'scm_scm_feature_select_residual_3000_umap','scm_scm_feature_select_residual_5000_umap','scm_scm_feature_select_residual_10000_umap','scm_scm_feature_select_residual_29485_umap',
        ]
with sns.plotting_context('paper'):
    plt.figure(figsize=(41,41))
    
    for i, (t,title) in enumerate(zip(tsnes,titles)):
        tsne = adata.obsm[t]

        plt.subplot(8,4,i +1)
        ax=plt.gca()
        # plt.text(0.05,1.015,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
                    # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                color = points.get_facecolor().flatten()
                # 在群集的中心位置添加注释，注释内容为群集的ID
                text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
                # 创建背景框对象，并设置背景框的样式
                bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
                plt.setp(text, bbox=bbox_props)
    plt.tight_layout()
    
    sns.despine(left=True,bottom=True)

    plt.savefig('./figures/feature_umap_all_v2.pdf', dpi=300, format=None)

In [62]:

Copied!

plt.savefig('./figures/feature_umap_all.pdf', dpi=300, format=None)
plt.savefig('./figures/feature_umap_all.pdf', dpi=300, format=None)

<Figure size 640x480 with 0 Axes>

Feature¶

In [302]:

Copied!

gtf_file='/xtdisk/methbank_baoym/zongwt/single/genome/human/gencode.v41.basic.annotation.gtf'
reference = scm.dm.parse_gtf(gtf_file)
gtf_file='/xtdisk/methbank_baoym/zongwt/single/genome/human/gencode.v41.basic.annotation.gtf'
reference = scm.dm.parse_gtf(gtf_file)

... Loading gene references
... Done

In [303]:

Copied!

scm.dm.annotation(adata,ref=reference)
scm.dm.annotation(adata,ref=reference)

... Loading regions info
... Overlapping genes with regions
chr1	100001	200000	1	chr1	HAVANA	gene	65419	71585	.	+	.	gene_id "ENSG00000186092.7";gene_type "protein_coding";gene_name "OR4F5";level 2;hgnc_id "HGNC:14825";havana_gene "OTTHUMG00000001094.4";	28417

...Overlapping finish

In [359]:

Copied!





# for feature
example_genes=               ['RORB',             'LYZ',             'FOS',                                      'MALAT1'] 
## Labeling details panel (a)
example_gene_textoffsets_a = [np.array([-0.6,0]),   np.array([-0.35,0]), np.array([0,-0.85]),                       np.array([0,-4.6])]
example_gene_lines_a =       [[],                  [],                 [np.array([0,0.1]),np.array([0,-0.25])], [np.array([0,0.3]),np.array([0,-2])]]
## Labeling details panel (b)
example_gene_textoffsets_b = [np.array([-0.6,1]),  np.array([0,8]),  np.array([0,-1.8]),                        np.array([0,2.4])]
example_gene_lines_b =       [[],                  [],                 [np.array([0,0.35]),np.array([0,-0.5])], []]

dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.05,0.8)
titleletter_loc = (-0.10,1.015)
hline_width = 1.5
xmin = min(gene_means)/1.1
xmax = max(gene_means)*1.1
# for feature
example_genes=               ['RORB',             'LYZ',             'FOS',                                      'MALAT1'] 
## Labeling details panel (a)
example_gene_textoffsets_a = [np.array([-0.6,0]),   np.array([-0.35,0]), np.array([0,-0.85]),                       np.array([0,-4.6])]
example_gene_lines_a =       [[],                  [],                 [np.array([0,0.1]),np.array([0,-0.25])], [np.array([0,0.3]),np.array([0,-2])]]
## Labeling details panel (b)
example_gene_textoffsets_b = [np.array([-0.6,1]),  np.array([0,8]),  np.array([0,-1.8]),                        np.array([0,2.4])]
example_gene_lines_b =       [[],                  [],                 [np.array([0,0.35]),np.array([0,-0.5])], []]

dotsize = 2
starsize = 125
staredges = 0.85
legend_loc = (0.05,0.8)
titleletter_loc = (-0.10,1.015)
hline_width = 1.5
xmin = min(gene_means)/1.1
xmax = max(gene_means)*1.1

In [361]:

Copied!

adata.var
adata.var

Out[361]:

	chromosome	start	end	covered_cell	var	sr_var	mean	pct_cell	dispersion	log_dispersion	...	feature_select_var	feature_select_dispersion	feature_select_random	mean_sr	sr_dispersion	log_sr_dispersion	feature_select_residual	Accession	Gene	Distance
index
chr1:1-100000	chr1	1	100000	2353	0.035220	0.029984	0.710440	0.995347	20.171644	3.004278	...	False	False	False	0.333772	11.131786	2.400980	True	ENSG00000186092.7	OR4F5	0
chr1:100001-200000	chr1	100001	200000	2354	0.009842	0.016549	0.810938	0.995770	82.395519	4.411531	...	False	False	True	0.366151	22.124874	3.093073	False	ENSG00000186092.7	OR4F5	28417
chr1:200001-300000	chr1	200001	300000	2351	0.030100	0.035758	0.730608	0.994501	24.272813	3.189357	...	False	False	False	0.383961	10.737695	2.362730	True	ENSG00000186092.7	OR4F5	128417
chr1:300001-400000	chr1	300001	400000	2323	0.026505	0.055943	0.819564	0.982657	30.921580	3.431454	...	False	False	False	0.327454	5.853303	1.713199	True	ENSG00000284733.2	OR4F29	50740
chr1:400001-500000	chr1	400001	500000	2346	0.014380	0.038490	0.836147	0.992386	58.147901	4.062990	...	False	False	False	0.382297	9.932428	2.278761	True	ENSG00000284733.2	OR4F29	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
chrY:56800001-56900000	chrY	56800001	56900000	2361	0.002751	0.003180	0.503138	0.998731	182.910968	5.209000	...	True	True	True	0.256378	80.616754	4.388891	False	ENSG00000124333.16_PAR_Y	VAMP7	167865
chrY:56900001-57000000	chrY	56900001	57000000	2122	0.098136	0.067797	0.585631	0.897631	5.967571	1.786340	...	False	False	True	0.251633	3.711540	1.080738	True	ENSG00000124333.16_PAR_Y	VAMP7	67865
chrY:57000001-57100000	chrY	57000001	57100000	2252	0.066954	0.067381	0.664001	0.952623	9.917348	2.294286	...	False	False	False	0.311553	4.623727	1.406113	True	ENSG00000124333.16_PAR_Y	VAMP7	0
chrY:57100001-57200000	chrY	57100001	57200000	2311	0.046440	0.054673	0.684819	0.977580	14.746337	2.690995	...	False	False	False	0.300699	5.499966	1.595794	True	ENSG00000124333.16_PAR_Y	VAMP7	0
chrY:57200001-57227415	chrY	57200001	57227415	2125	0.040066	0.080984	0.790896	0.898900	19.740019	2.982648	...	False	False	False	0.292204	3.608189	1.184860	True	ENSG00000182484.15_PAR_Y	WASH6P	0

29485 rows × 24 columns

In [470]:

Copied!





adata = scm_100k
xmean_label = 'mean methylation level'
residual_var = adata.var['log_sr_dispersion']
gene_means = adata.var['mean']
sqrt_var =  adata.var['var']
sqrt_variable_genes_idx = adata.var['feature_select_var']
residuals_variable_genes_idx = adata.var['feature_select_residual']
dataset = adata.var
sqrt_lazy_tsne = adata.obsm['X_umap']
with sns.plotting_context("talk"):
    
    example_genes_idx = np.isin(adata.var['Gene'],example_genes)

    fig = plt.figure(figsize=(18,6))
    gs = GridSpec(2, 6, figure=fig) 
    
    ax1 = fig.add_subplot(gs[:, 0:2])
    ax2 = fig.add_subplot(gs[:, 2:4])    
    supaxis = fig.add_subplot(gs[:, 4:6])      
    ax3 = fig.add_subplot(gs[0, 4])  
    ax4 = fig.add_subplot(gs[1, 4])  
    ax5 = fig.add_subplot(gs[0, 5])  
    ax6 = fig.add_subplot(gs[1, 5])  
    
    tsne_axes = [ax3,ax4,ax5,ax6]
    tsne_arrows = [(28,39),None,None,(28,39)]
    
    ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax1.set_title('sqrt(CPMedian)')
    # ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlim((xmin,xmax))
    ax1.set_xlabel(xmean_label)
    ax1.set_ylabel('variance')
    ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
    ax1.scatter(gene_means[sqrt_variable_genes_idx],sqrt_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
    ax1.scatter(gene_means[sqrt_variable_genes_idx & example_genes_idx],sqrt_var[sqrt_variable_genes_idx & example_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
    ax1.scatter(gene_means[~sqrt_variable_genes_idx & example_genes_idx],sqrt_var[~sqrt_variable_genes_idx & example_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
    # ax1.hlines(sqrt_threshold,xmin,xmax,linestyle=':',linewidth=hline_width)
    
    # add_labels(dataset,xdata=gene_means, ydata=sqrt_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_a,lines=example_gene_lines_a,ax=ax1)
    # add_largedot_legend(ax1,'lower right',dict(fontsize=15))
  
    ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax2.set_title(r'Pearson residuals ($\theta=5000$)')
    # ax2.set_xscale('log')
    ax2.set_yscale('log')
    ax2.set_xlim((xmin,xmax))
    ax2.set_xlabel(xmean_label)
    ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
    #ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
    ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)    
    ax2.scatter(gene_means[example_genes_idx & sqrt_variable_genes_idx],sqrt_var[example_genes_idx & sqrt_variable_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
    ax2.scatter(gene_means[example_genes_idx & ~sqrt_variable_genes_idx],sqrt_var[example_genes_idx & ~sqrt_variable_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
    # ax2.hlines(residuals_threshold,xmin,xmax,linestyle=':',linewidth=hline_width)    

    # add_labels(dataset,xdata=gene_means, ydata=residual_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_b,lines=example_gene_lines_b,ax=ax2)
    # add_largedot_legend(ax2,'lower right',dict(fontsize=15))    
        
    supaxis.text(*titleletter_loc,'c',transform=supaxis.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)    
    supaxis.axis('off')
    # for (tsne_ax,tsne_arrow,example_gene) in zip(tsne_axes,tsne_arrows,example_genes):        
    #     # X = adata.X.T.toarray()
    #     print(dataset['Gene'] == example_gene)
    #     gene_idx = dataset['Gene'] == example_gene
    #     print(gene_idx)
    #     sqrt_lazy_counts = np.squeeze(adata.X[gene_idx, :].toarray())
    #     tsne_ax.set_title(example_gene.lower().capitalize())
    #     tsne_ax.scatter(sqrt_lazy_tsne[:,0], sqrt_lazy_tsne[:,1], s=1,c=sqrt_lazy_counts, edgecolor='none',cmap='Reds', rasterized=True)
    #     if tsne_arrow:
    #         tsne_ax.arrow(*tsne_arrow,-8,-8,width=0.5,head_width=6,shape='full',facecolor='k')
    #     tsne_ax.axis('off')
        
    sns.despine()
    plt.tight_layout()
    # plt.savefig('figures/Fig2.pdf', dpi=300, format=None)
adata = scm_100k
xmean_label = 'mean methylation level'
residual_var = adata.var['log_sr_dispersion']
gene_means = adata.var['mean']
sqrt_var =  adata.var['var']
sqrt_variable_genes_idx = adata.var['feature_select_var']
residuals_variable_genes_idx = adata.var['feature_select_residual']
dataset = adata.var
sqrt_lazy_tsne = adata.obsm['X_umap']
with sns.plotting_context("talk"):
    
    example_genes_idx = np.isin(adata.var['Gene'],example_genes)

    fig = plt.figure(figsize=(18,6))
    gs = GridSpec(2, 6, figure=fig) 
    
    ax1 = fig.add_subplot(gs[:, 0:2])
    ax2 = fig.add_subplot(gs[:, 2:4])    
    supaxis = fig.add_subplot(gs[:, 4:6])      
    ax3 = fig.add_subplot(gs[0, 4])  
    ax4 = fig.add_subplot(gs[1, 4])  
    ax5 = fig.add_subplot(gs[0, 5])  
    ax6 = fig.add_subplot(gs[1, 5])  
    
    tsne_axes = [ax3,ax4,ax5,ax6]
    tsne_arrows = [(28,39),None,None,(28,39)]
    
    ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax1.set_title('sqrt(CPMedian)')
    # ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlim((xmin,xmax))
    ax1.set_xlabel(xmean_label)
    ax1.set_ylabel('variance')
    ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
    ax1.scatter(gene_means[sqrt_variable_genes_idx],sqrt_var[sqrt_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)
    ax1.scatter(gene_means[sqrt_variable_genes_idx & example_genes_idx],sqrt_var[sqrt_variable_genes_idx & example_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
    ax1.scatter(gene_means[~sqrt_variable_genes_idx & example_genes_idx],sqrt_var[~sqrt_variable_genes_idx & example_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
    # ax1.hlines(sqrt_threshold,xmin,xmax,linestyle=':',linewidth=hline_width)
    
    # add_labels(dataset,xdata=gene_means, ydata=sqrt_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_a,lines=example_gene_lines_a,ax=ax1)
    # add_largedot_legend(ax1,'lower right',dict(fontsize=15))
  
    ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax2.set_title(r'Pearson residuals ($\theta=5000$)')
    # ax2.set_xscale('log')
    ax2.set_yscale('log')
    ax2.set_xlim((xmin,xmax))
    ax2.set_xlabel(xmean_label)
    ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True)
    #ax2.scatter(gene_means,residual_var,s=dotsize, rasterized=True)
    ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)    
    ax2.scatter(gene_means[example_genes_idx & sqrt_variable_genes_idx],sqrt_var[example_genes_idx & sqrt_variable_genes_idx],color='tab:red',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
    ax2.scatter(gene_means[example_genes_idx & ~sqrt_variable_genes_idx],sqrt_var[example_genes_idx & ~sqrt_variable_genes_idx],color='tab:blue',s=starsize,marker='*',edgecolors='k',linewidths=staredges, rasterized=True)
    # ax2.hlines(residuals_threshold,xmin,xmax,linestyle=':',linewidth=hline_width)    

    # add_labels(dataset,xdata=gene_means, ydata=residual_var,example_genes=example_genes,textoffsets=example_gene_textoffsets_b,lines=example_gene_lines_b,ax=ax2)
    # add_largedot_legend(ax2,'lower right',dict(fontsize=15))    
        
    supaxis.text(*titleletter_loc,'c',transform=supaxis.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)    
    supaxis.axis('off')
    # for (tsne_ax,tsne_arrow,example_gene) in zip(tsne_axes,tsne_arrows,example_genes):        
    #     # X = adata.X.T.toarray()
    #     print(dataset['Gene'] == example_gene)
    #     gene_idx = dataset['Gene'] == example_gene
    #     print(gene_idx)
    #     sqrt_lazy_counts = np.squeeze(adata.X[gene_idx, :].toarray())
    #     tsne_ax.set_title(example_gene.lower().capitalize())
    #     tsne_ax.scatter(sqrt_lazy_tsne[:,0], sqrt_lazy_tsne[:,1], s=1,c=sqrt_lazy_counts, edgecolor='none',cmap='Reds', rasterized=True)
    #     if tsne_arrow:
    #         tsne_ax.arrow(*tsne_arrow,-8,-8,width=0.5,head_width=6,shape='full',facecolor='k')
    #     tsne_ax.axis('off')
        
    sns.despine()
    plt.tight_layout()
    # plt.savefig('figures/Fig2.pdf', dpi=300, format=None)

In [320]:

Copied!





def add_labels(dataset, xdata, ydata, example_genes, textoffsets, lines, ax):
    for example_gene, textoffset, line in zip(example_genes, textoffsets, lines):
        gene_idx = dataset['Gene'] == example_gene
        if not gene_idx.any():
            print(f"Warning: {example_gene} not found in dataset['Gene']")
            continue
        gene_position = np.array([xdata[gene_idx], ydata[gene_idx]]).flatten()
        text_position = np.array([gene_position[0] * 10**textoffset[0], gene_position[1] + textoffset[1]]).flatten()
        ax.text(*text_position, example_gene.capitalize(), horizontalalignment='center')
        if line:
            ax.plot([gene_position[0], text_position[0]], [gene_position[1], text_position[1]], 'k-', linewidth=0.5)

def add_largedot_legend(ax,loc,kwargs={}):
    lgnd = ax.legend(loc=loc,frameon=True,**kwargs)
    for l in lgnd.legendHandles:
        l._sizes = [30]
def add_labels(dataset, xdata, ydata, example_genes, textoffsets, lines, ax):
    for example_gene, textoffset, line in zip(example_genes, textoffsets, lines):
        gene_idx = dataset['Gene'] == example_gene
        if not gene_idx.any():
            print(f"Warning: {example_gene} not found in dataset['Gene']")
            continue
        gene_position = np.array([xdata[gene_idx], ydata[gene_idx]]).flatten()
        text_position = np.array([gene_position[0] * 10**textoffset[0], gene_position[1] + textoffset[1]]).flatten()
        ax.text(*text_position, example_gene.capitalize(), horizontalalignment='center')
        if line:
            ax.plot([gene_position[0], text_position[0]], [gene_position[1], text_position[1]], 'k-', linewidth=0.5)

def add_largedot_legend(ax,loc,kwargs={}):
    lgnd = ax.legend(loc=loc,frameon=True,**kwargs)
    for l in lgnd.legendHandles:
        l._sizes = [30]

In [ ]:

Copied!

def auto(adata,svd_solver='arpack', nb_pcs=50, n_neighbors=15, perplexity=30, 
         method='umap', metric='euclidean', min_dist=0.5, spread=1.0, 
         n_components=2, copy=False):
    
def auto(adata,svd_solver='arpack', nb_pcs=50, n_neighbors=15, perplexity=30, 
         method='umap', metric='euclidean', min_dist=0.5, spread=1.0, 
         n_components=2, copy=False):

In [425]:

Copied!





gene_means = adata.var['mean']
sqrt_var =  adata.var['var']
vars_variable_genes_idx = adata.var['feature_select_var']
#sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = adata.var['dispersion_norm']
residuals_variable_genes_idx = adata.var['feature_select_sr']
normdispersion_variable_genes_idx = adata.var['feature_select_dispersion']
random_genes_idx = adata.var['feature_select_random']

with sns.plotting_context("talk"):

    fig = plt.figure(figsize=(18,6))
    gs = GridSpec(2, 8, figure=fig) 

    ax1 = fig.add_subplot(gs[:, 0:2])
    ax2 = fig.add_subplot(gs[:, 2:4])
    ax3 = fig.add_subplot(gs[:, 4:6])
    ax4 = fig.add_subplot(gs[:, 6:8]) 

    ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax1.set_title(r'Variance')
    ax1.set_ylabel('Variance') 
    ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    ax1.scatter(gene_means[vars_variable_genes_idx],sqrt_var[vars_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)

    ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax2.set_title(r'Residuals Variance')   
    ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)    

    ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax3.set_title(r'NormDispersion')   
    ax3.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    ax3.scatter(gene_means[normdispersion_variable_genes_idx],sqrt_var[normdispersion_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)

    ax4.text(*titleletter_loc,'c',transform=ax4.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax4.set_title(r'Random')   
    ax4.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    ax4.scatter(gene_means[random_genes_idx],sqrt_var[random_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
    
    sns.despine()
    plt.tight_layout()
gene_means = adata.var['mean']
sqrt_var =  adata.var['var']
vars_variable_genes_idx = adata.var['feature_select_var']
#sqrt_variable_genes_idx = scm_1k.var['feature_select']
residual_var = adata.var['dispersion_norm']
residuals_variable_genes_idx = adata.var['feature_select_sr']
normdispersion_variable_genes_idx = adata.var['feature_select_dispersion']
random_genes_idx = adata.var['feature_select_random']

with sns.plotting_context("talk"):

    fig = plt.figure(figsize=(18,6))
    gs = GridSpec(2, 8, figure=fig) 

    ax1 = fig.add_subplot(gs[:, 0:2])
    ax2 = fig.add_subplot(gs[:, 2:4])
    ax3 = fig.add_subplot(gs[:, 4:6])
    ax4 = fig.add_subplot(gs[:, 6:8]) 

    ax1.text(*titleletter_loc,'a',transform=ax1.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax1.set_title(r'Variance')
    ax1.set_ylabel('Variance') 
    ax1.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    ax1.scatter(gene_means[vars_variable_genes_idx],sqrt_var[vars_variable_genes_idx],color='tab:red',s=dotsize,label='selection by Pearson residuals', rasterized=True)

    ax2.text(*titleletter_loc,'b',transform=ax2.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax2.set_title(r'Residuals Variance')   
    ax2.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    ax2.scatter(gene_means[residuals_variable_genes_idx],sqrt_var[residuals_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)    

    ax3.text(*titleletter_loc,'c',transform=ax3.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax3.set_title(r'NormDispersion')   
    ax3.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    ax3.scatter(gene_means[normdispersion_variable_genes_idx],sqrt_var[normdispersion_variable_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)

    ax4.text(*titleletter_loc,'c',transform=ax4.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    ax4.set_title(r'Random')   
    ax4.scatter(gene_means,sqrt_var,s=dotsize, rasterized=True,color='lightgrey')
    ax4.scatter(gene_means[random_genes_idx],sqrt_var[random_genes_idx],color='tab:red',s=dotsize,label='selection by sqrt(CPMedian)', rasterized=True)
    
    sns.despine()
    plt.tight_layout()

In [495]:

Copied!

enhancer = ad.read('./windows_filter_10000.h5ad')
enhancer = ad.read('./windows_filter_10000.h5ad')

 FutureWarning:/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/anndata/__init__.py:51: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024.

In [496]:

Copied!

enhancer
enhancer

Out[496]:

AnnData object with n_obs × n_vars = 2364 × 10000
    obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks'
    var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'pct_cell', 'mean', 'feature_select', 'feature_select_var'
    uns: 'label_color', 'pca', 'tsne', 'workdir'
    obsm: 'X_pca', 'X_tsne', 'mylayer_pca', 'relative_tsne'
    varm: 'PCs'
    layers: 'relative'

In [499]:

Copied!

scm.pl.pca(scm_100k, color=['Neuron type'],wspace=0.4)
scm.pl.pca(scm_100k, color=['Neuron type'],wspace=0.4)

In [494]:

Copied!

scm.pl.tsne(scm_100k, color=['Neuron type'],wspace=0.4)
scm.pl.tsne(scm_100k, color=['Neuron type'],wspace=0.4)

In [500]:

Copied!

scm.pl.pca(adata, color=['Neuron type'],wspace=0.4)
scm.pl.pca(adata, color=['Neuron type'],wspace=0.4)

In [501]:

Copied!

scm.pl.umap(adata, color=['Neuron type'],wspace=0.4)
scm.pl.umap(adata, color=['Neuron type'],wspace=0.4)

In [3]:

Copied!

# 新的数据
scm_100k  = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/scbs/w100k.h5ad")
# 新的数据
scm_100k  = scm.pp.load_scm("/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/scbs/w100k.h5ad")

In [3]:

Copied!

scm.pp.add_meta(scm_100k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')
scm.pp.add_meta(scm_100k,meta_file='/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',index_col='Sample_id')

No column 'label' found in metadata, 'unknown' is used as the default cell labels
No column 'label_color' found in metadata, random color is generated for each cell label

In [5]:

Copied!

scm_100k
scm_100k

Out[5]:

AnnData object with n_obs × n_vars = 2365 × 30894
    obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate'
    var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var'
    uns: 'workdir', 'label_color'
    layers: 'relative'

In [8]:

Copied!

scm_100k.obs
scm_100k.obs

Out[8]:

	cell_id	cell_name	sites	meth	n_total	global_meth_level	Sample_id	Sample	Animal age	FACS date	...	mCG/CG	mCH/CH	Estimated mCG/CG	Estimated mCH/CH	Coverage (%)	Neuron type	tSNE x coordinate	tSNE y coordinate	n_features	pct_peaks
0	0	GSM2556580	3186266	2496679	3825044	0.783575	GSM2556580	Pool_1026_AD002_indexed	25 yr	2012/6/29	...	0.79871	0.02211	0.79762	0.01680	6.01	hSst-2	20.93320	13.43860	29498	0.954813
1	1	GSM2557770	2572709	1931054	3242784	0.750592	GSM2557770	Pool_263_AD010_indexed	25 yr	2012/6/29	...	0.76486	0.03588	0.76305	0.02847	5.11	hL2/3	-7.61221	13.74140	29498	0.954813
2	2	GSM2556556	3247648	2505895	3882099	0.771603	GSM2556556	Pool_1012_AD006_indexed	25 yr	2012/6/29	...	0.78647	0.05192	0.78433	0.04240	6.50	hL2/3	-14.57080	4.75207	29497	0.954781
3	3	GSM2556719	3593247	2753322	4304895	0.766249	GSM2556719	Pool_1090_AD006_indexed	25 yr	2012/6/29	...	0.78164	0.04276	0.77984	0.03487	7.19	hL2/3	-12.23290	10.33370	29504	0.955007
4	4	GSM2558838	1671616	1213688	1944619	0.726057	GSM2558838	Pool_771_AD008_indexed	25 yr	2012/6/29	...	0.73986	0.04119	0.73763	0.03297	3.26	hL2/3	-12.52800	12.85390	29474	0.954036
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2360	2360	GSM2557118	4030037	3049428	5723469	0.756675	GSM2557118	Pool_1381_AD002_indexed	25 yr	2012/6/29	...	0.76468	0.05545	0.76216	0.04534	7.86	hDL-1	-9.11158	-1.04451	29499	0.954846
2361	2361	GSM2558140	3335079	2526078	4135394	0.757427	GSM2558140	Pool_441_AD008_indexed	25 yr	2012/6/29	...	0.76573	0.03698	0.76416	0.03051	6.79	hL2/3	-11.26030	12.10890	29501	0.954910
2362	2362	GSM2557032	4045664	3312230	5141670	0.818711	GSM2557032	Pool_1359_AD010_indexed	25 yr	2012/6/29	...	0.83100	0.08086	0.82834	0.06641	8.27	hSst-3	15.37830	-9.40008	29498	0.954813
2363	2363	GSM2558877	1554042	1192705	1806746	0.767486	GSM2558877	Pool_790_AD010_indexed	25 yr	2012/6/29	...	0.78764	0.03502	0.78601	0.02759	2.96	hNdnf	16.07330	8.95018	29327	0.949278
2364	2364	GSM2557111	4053094	3004728	5423767	0.741342	GSM2557111	Pool_1379_AD010_indexed	25 yr	2012/6/29	...	0.75210	0.03068	0.75041	0.02408	8.11	hL2/3	-5.32003	13.90730	29496	0.954748

2364 rows × 41 columns

In [7]:

Copied!

scm.pp.filter_cells(scm_100k,min_n_features = 10000)
scm.pp.filter_features(scm_100k,min_n_cells = 1000)
scm.pp.filter_cells(scm_100k,min_n_features = 10000)
scm.pp.filter_features(scm_100k,min_n_cells = 1000)

filter cells based on min_n_features >=  10000
after filtering out low-quality cells: 
2364 cells, 30894 regions

/asnas/baoym_group/zongwt/software/miniconda3/envs/py39/lib/python3.9/site-packages/scMethtools/preprocessing/scm.py:207: RuntimeWarning: Mean of empty slice
  feature_mean = np.nanmean(dense_X, axis=0)

Filter regions based on min_n_cells
After filtering out low coveraged regions: 
2364 cells, 29509 regions

In [12]:

Copied!





scm.pp.pca(scm_100k,n_pcs = 30, layer = 'relative')
sc.pp.neighbors(scm_100k, use_rep = 'relative_pca')
scm.pp.umap (scm_100k)
scm.pl.umap (scm_100k, color=['Neuron type'],size=40)
scm.pp.pca(scm_100k,n_pcs = 30, layer = 'relative')
sc.pp.neighbors(scm_100k, use_rep = 'relative_pca')
scm.pp.umap (scm_100k)
scm.pl.umap (scm_100k, color=['Neuron type'],size=40)

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.

In [11]:

Copied!





scm.pp.pca(scm_100k,n_pcs = 30)
sc.pp.neighbors(scm_100k)
scm.pp.umap (scm_100k)
scm.pl.umap (scm_100k, color=['Neuron type'],size=40)
scm.pp.pca(scm_100k,n_pcs = 30)
sc.pp.neighbors(scm_100k)
scm.pp.umap (scm_100k)
scm.pl.umap (scm_100k, color=['Neuron type'],size=40)

... using all features

In [ ]: