Feature Selection

In [2]:

Copied!





import pandas as pd
import numpy as np
import scanpy as sc
import episcanpy as epi
import scMethtools as scm
import pandas as pd
import numpy as np
import scanpy as sc
import episcanpy as epi
import scMethtools as scm

In [11]:

Copied!

%load_ext autoreload
%autoreload 2
%load_ext autoreload
%autoreload 2

In [3]:

Copied!





import os
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
import os
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde

In [4]:

Copied!





#setting workdir
workdir = 'D://Test/GSE97179/scMethtools'
os.chdir(workdir)
# path_fm = os.path.join(workdir,'feature_matrices/')
# path_clusters = os.path.join(workdir,'clusters/')
# path_metrics = os.path.join(workdir,'metrics/')
# os.system('mkdir -p '+path_clusters)
# os.system('mkdir -p '+path_metrics)
#setting workdir
workdir = 'D://Test/GSE97179/scMethtools'
os.chdir(workdir)
# path_fm = os.path.join(workdir,'feature_matrices/')
# path_clusters = os.path.join(workdir,'clusters/')
# path_metrics = os.path.join(workdir,'metrics/')
# os.system('mkdir -p '+path_clusters)
# os.system('mkdir -p '+path_metrics)

In [5]:

Copied!

winows100k_scm  = scm.pp.load_scm('./windows.h5')
winows100k_scm  = scm.pp.load_scm('./windows.h5')

No description has been provided for this image

Single Feature -- Fixed Windows(10kb)¶

In [6]:

Copied!

winows100k_scm
winows100k_scm

Out[6]:

AnnData object with n_obs × n_vars = 2365 × 30894
    obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level'
    var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var'
    uns: 'workdir'
    layers: 'relative'

In [6]:

Copied!

scm.pp.add_meta(winows100k_scm,meta_file='../meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',sep='\t',index_col='Sample_id')
scm.pp.add_meta(winows100k_scm,meta_file='../meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',sep='\t',index_col='Sample_id')

No column 'label' found in metadata, 'unknown' is used as the default cell labels
No column 'label_color' found in metadata, random color is generated for each cell label

 UserWarning:D:\conda\envs\py39\lib\site-packages\anndata\_core\anndata.py:798: 
AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

In [8]:

Copied!

winows100k_scm.obs
winows100k_scm.obs

Out[8]:

	cell_id	cell_name	sites	meth	n_total	global_meth_level	Sample_id	Sample	Animal age	FACS date	...	% Filtered reads	mCCC/CCC	mCG/CG	mCH/CH	Estimated mCG/CG	Estimated mCH/CH	Coverage (%)	Neuron type	tSNE x coordinate	tSNE y coordinate
0	0	GSM2556580	3186266	2496679	3825044	0.783575	GSM2556580	Pool_1026_AD002_indexed	25 yr	2012/6/29	...	48.40%	0.00540	0.79871	0.02211	0.79762	0.01680	6.01	hSst-2	20.93320	13.43860
1	1	GSM2557770	2572709	1931054	3242784	0.750592	GSM2557770	Pool_263_AD010_indexed	25 yr	2012/6/29	...	51.30%	0.00763	0.76486	0.03588	0.76305	0.02847	5.11	hL2/3	-7.61221	13.74140
2	2	GSM2556556	3247648	2505895	3882099	0.771603	GSM2556556	Pool_1012_AD006_indexed	25 yr	2012/6/29	...	53.40%	0.00994	0.78647	0.05192	0.78433	0.04240	6.50	hL2/3	-14.57080	4.75207
3	3	GSM2556719	3593247	2753322	4304895	0.766249	GSM2556719	Pool_1090_AD006_indexed	25 yr	2012/6/29	...	47.80%	0.00818	0.78164	0.04276	0.77984	0.03487	7.19	hL2/3	-12.23290	10.33370
4	4	GSM2558838	1671616	1213688	1944619	0.726057	GSM2558838	Pool_771_AD008_indexed	25 yr	2012/6/29	...	50.30%	0.00850	0.73986	0.04119	0.73763	0.03297	3.26	hL2/3	-12.52800	12.85390
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2360	2360	GSM2557118	4030037	3049428	5723469	0.756675	GSM2557118	Pool_1381_AD002_indexed	25 yr	2012/6/29	...	38.90%	0.01059	0.76468	0.05545	0.76216	0.04534	7.86	hDL-1	-9.11158	-1.04451
2361	2361	GSM2558140	3335079	2526078	4135394	0.757427	GSM2558140	Pool_441_AD008_indexed	25 yr	2012/6/29	...	41.10%	0.00667	0.76573	0.03698	0.76416	0.03051	6.79	hL2/3	-11.26030	12.10890
2362	2362	GSM2557032	4045664	3312230	5141670	0.818711	GSM2557032	Pool_1359_AD010_indexed	25 yr	2012/6/29	...	43.10%	0.01548	0.83100	0.08086	0.82834	0.06641	8.27	hSst-3	15.37830	-9.40008
2363	2363	GSM2558877	1554042	1192705	1806746	0.767486	GSM2558877	Pool_790_AD010_indexed	25 yr	2012/6/29	...	49.00%	0.00764	0.78764	0.03502	0.78601	0.02759	2.96	hNdnf	16.07330	8.95018
2364	2364	GSM2557111	4053094	3004728	5423767	0.741342	GSM2557111	Pool_1379_AD010_indexed	25 yr	2012/6/29	...	41.10%	0.00676	0.75210	0.03068	0.75041	0.02408	8.11	hL2/3	-5.32003	13.90730

2365 rows × 39 columns

In [9]:

Copied!

winows100k_scm.uns['workdir']
winows100k_scm.uns['workdir']

Out[9]:

'/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scm'

Filter¶

In [8]:

Copied!

filter_cells(winows100k_scm,min_n_features = 10000)
filter_features(winows100k_scm,min_n_cells=500)
filter_cells(winows100k_scm,min_n_features = 10000)
filter_features(winows100k_scm,min_n_cells=500)

filter cells based on min_n_features >=  10000

 ImplicitModificationWarning:D:\conda\envs\py39\lib\site-packages\anndata\_core\anndata.py:121: Transforming to str index.

after filtering out low-quality cells: 
2364 cells, 30894 regions

 RuntimeWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_15988\2075018014.py:11: Mean of empty slice

Filter regions based on min_n_cells
After filtering out low coveraged regions: 
2364 cells, 29510 regions

In [7]:

Copied!





def filter_features(adata,
                    min_n_cells = None, max_n_cells=None,
                    min_pct_cells = None, max_pct_cells=None,
                    assay=None):
    feature = 'regions'
    # if('covered_cell' in adata.var_keys()):
    #     n_cells = adata.var['covered_cell']
    # else:
    if sparse.issparse(adata.X):
        dense_X = adata.X.toarray()
        feature_mean = np.nanmean(dense_X, axis=0)
        n_cells = np.sum(~np.isnan(dense_X), axis=0).astype(int)
    else:
        n_cells = np.sum(~np.isnan(adata.X), axis=0).astype(int)
        feature_mean = np.nanmean(adata.X, axis=0)
    adata.var['covered_cell'] = n_cells
    adata.var['mean'] = feature_mean
        
    if('pct_cell' in adata.var_keys()): 
        pct_cells = adata.var['pct_cell']
    else:
        pct_cells = n_cells/adata.shape[0]
        adata.var['pct_cell'] = pct_cells

        
    if(sum(list(map(lambda x: x is None,[min_n_cells,min_pct_cells,
                                         max_n_cells,max_pct_cells,])))==4):
        print('No filtering')
    else:
        feature_subset = np.ones(len(adata.var_names),dtype=bool)
        if(min_n_cells!=None):
            print('Filter '+feature+' based on min_n_cells')
            feature_subset = (n_cells>=min_n_cells) & feature_subset
        if(max_n_cells!=None):
            print('Filter '+feature+' based on max_n_cells')
            feature_subset = (n_cells<=max_n_cells) & feature_subset
        if(min_pct_cells!=None):
            print('Filter '+feature+' based on min_pct_cells')
            feature_subset = (pct_cells>=min_pct_cells) & feature_subset
        if(max_pct_cells!=None):
            print('Filter '+feature+' based on max_pct_cells')
            feature_subset = (pct_cells<=max_pct_cells) & feature_subset
        adata._inplace_subset_var(feature_subset)
        print('After filtering out low coveraged '+feature+': ')
        print(str(adata.shape[0])+' cells, ' + str(adata.shape[1])+' '+feature)
    return None

def filter_cells(adata,
                 min_n_features = None, max_n_features = None,
                 min_pct_features = None, max_pct_features = None,
                 min_mc_level =None, max_mc_level =None,
                 min_n_sites = None,
                 assay=None):
    feature = 'regions'         
    if('sites' in adata.obs_keys()):
        n_sites = adata.obs['sites']                
    if('global_meth_level' in adata.obs_keys()):
        methylation_level = adata.obs['global_meth_level']
    if('features_number' in adata.obs_keys()):
        n_features = adata.obs['features_number']
    else:
        if sparse.issparse(adata.X):
            n_features = np.sum(~np.isnan(adata.X.toarray()), axis=1).astype(int)
        else:
            n_features = np.sum(~np.isnan(adata.X), axis=1).astype(int)
        adata.obs['n_features'] = n_features       
    if('pct_features' in adata.obs_keys()):
        pct_features = adata.obs['pct_features']
    else:
        pct_features = n_features/adata.shape[1]
        adata.obs['pct_peaks'] = pct_features           

    if(sum(list(map(lambda x: x is None,[min_n_features,min_pct_features,min_n_sites,min_mc_level,
                                         max_n_features,max_pct_features,max_mc_level])))==7):
        print('No filtering')    
    else:
        cell_subset = np.ones(len(adata.obs_names),dtype=bool)
        if(min_n_features!=None):
            print('filter cells based on min_n_features >= ', min_n_features)
            cell_subset = (n_features>=min_n_features) & cell_subset
        if(max_n_features!=None):
            print('filter cells based on max_n_features <=', max_n_features)
            cell_subset = (n_features<=max_n_features) & cell_subset
        if(min_pct_features!=None):
            print('filter cells based on min_pct_features >= ',min_pct_features)
            cell_subset = (pct_features>=min_pct_features) & cell_subset
        if(max_pct_features!=None):
            print('filter cells based on max_pct_features <= ',max_pct_features)
            cell_subset = (pct_features<=max_pct_features) & cell_subset
        adata._inplace_subset_obs(cell_subset)
        print('after filtering out low-quality cells: ')
        print(str(adata.shape[0])+' cells, ' + str(adata.shape[1])+' '+feature)
    return None
def filter_features(adata,
                    min_n_cells = None, max_n_cells=None,
                    min_pct_cells = None, max_pct_cells=None,
                    assay=None):
    feature = 'regions'
    # if('covered_cell' in adata.var_keys()):
    #     n_cells = adata.var['covered_cell']
    # else:
    if sparse.issparse(adata.X):
        dense_X = adata.X.toarray()
        feature_mean = np.nanmean(dense_X, axis=0)
        n_cells = np.sum(~np.isnan(dense_X), axis=0).astype(int)
    else:
        n_cells = np.sum(~np.isnan(adata.X), axis=0).astype(int)
        feature_mean = np.nanmean(adata.X, axis=0)
    adata.var['covered_cell'] = n_cells
    adata.var['mean'] = feature_mean
        
    if('pct_cell' in adata.var_keys()): 
        pct_cells = adata.var['pct_cell']
    else:
        pct_cells = n_cells/adata.shape[0]
        adata.var['pct_cell'] = pct_cells

        
    if(sum(list(map(lambda x: x is None,[min_n_cells,min_pct_cells,
                                         max_n_cells,max_pct_cells,])))==4):
        print('No filtering')
    else:
        feature_subset = np.ones(len(adata.var_names),dtype=bool)
        if(min_n_cells!=None):
            print('Filter '+feature+' based on min_n_cells')
            feature_subset = (n_cells>=min_n_cells) & feature_subset
        if(max_n_cells!=None):
            print('Filter '+feature+' based on max_n_cells')
            feature_subset = (n_cells<=max_n_cells) & feature_subset
        if(min_pct_cells!=None):
            print('Filter '+feature+' based on min_pct_cells')
            feature_subset = (pct_cells>=min_pct_cells) & feature_subset
        if(max_pct_cells!=None):
            print('Filter '+feature+' based on max_pct_cells')
            feature_subset = (pct_cells<=max_pct_cells) & feature_subset
        adata._inplace_subset_var(feature_subset)
        print('After filtering out low coveraged '+feature+': ')
        print(str(adata.shape[0])+' cells, ' + str(adata.shape[1])+' '+feature)
    return None

def filter_cells(adata,
                 min_n_features = None, max_n_features = None,
                 min_pct_features = None, max_pct_features = None,
                 min_mc_level =None, max_mc_level =None,
                 min_n_sites = None,
                 assay=None):
    feature = 'regions'         
    if('sites' in adata.obs_keys()):
        n_sites = adata.obs['sites']                
    if('global_meth_level' in adata.obs_keys()):
        methylation_level = adata.obs['global_meth_level']
    if('features_number' in adata.obs_keys()):
        n_features = adata.obs['features_number']
    else:
        if sparse.issparse(adata.X):
            n_features = np.sum(~np.isnan(adata.X.toarray()), axis=1).astype(int)
        else:
            n_features = np.sum(~np.isnan(adata.X), axis=1).astype(int)
        adata.obs['n_features'] = n_features       
    if('pct_features' in adata.obs_keys()):
        pct_features = adata.obs['pct_features']
    else:
        pct_features = n_features/adata.shape[1]
        adata.obs['pct_peaks'] = pct_features           

    if(sum(list(map(lambda x: x is None,[min_n_features,min_pct_features,min_n_sites,min_mc_level,
                                         max_n_features,max_pct_features,max_mc_level])))==7):
        print('No filtering')    
    else:
        cell_subset = np.ones(len(adata.obs_names),dtype=bool)
        if(min_n_features!=None):
            print('filter cells based on min_n_features >= ', min_n_features)
            cell_subset = (n_features>=min_n_features) & cell_subset
        if(max_n_features!=None):
            print('filter cells based on max_n_features <=', max_n_features)
            cell_subset = (n_features<=max_n_features) & cell_subset
        if(min_pct_features!=None):
            print('filter cells based on min_pct_features >= ',min_pct_features)
            cell_subset = (pct_features>=min_pct_features) & cell_subset
        if(max_pct_features!=None):
            print('filter cells based on max_pct_features <= ',max_pct_features)
            cell_subset = (pct_features<=max_pct_features) & cell_subset
        adata._inplace_subset_obs(cell_subset)
        print('after filtering out low-quality cells: ')
        print(str(adata.shape[0])+' cells, ' + str(adata.shape[1])+' '+feature)
    return None

Dimension Reduction¶

In [61]:

Copied!





from sklearn.decomposition import PCA as sklearnPCA
def pca(adata,layer=None,feature=None,n_pc = 30,max_pc = 50,first_pc = False,use_precomputed=True,
                                    save_fig=False,fig_name='top_pcs.pdf',fig_path=None,fig_size=(4,4),
                                    pad=1.08,w_pad=None,h_pad=None):
    
    # if(fig_path is None):
    #     fig_path = adata.uns['workdir']
    # fig_size = mpl.rcParams['figure.figsize'] if fig_size is None else fig_size  
    if(feature is None):
        feature = 'all'
    if(feature not in ['all','hvm']):
        raise ValueError("unrecognized feature '%s'" % feature)

    # 检查主数据矩阵中完全为空的列
    # 检查主数据矩阵并移除完全为空的列
    if sparse.issparse(adata.X):
        all_nan_columns = np.isnan(adata.X.toarray()).all(axis=0)
    else:
        all_nan_columns = np.isnan(adata.X).all(axis=0)
    
    # 移除主矩阵和所有层中的相应列
    adata[:, ~all_nan_columns].X
    # for layer_name in adata.layers.keys():
    #     adata.layers[layer_name] = adata.layers[layer_name][:, ~all_nan_columns]
    # 移除所有层中的相应列
    for layer_name in adata.layers.keys():
        adata[:, ~all_nan_columns].layers[layer_name]
        
    max_pc = min(min(adata.shape), n_pc) # in case cells are smaller than n_pcs
    sklearn_pca = sklearnPCA(n_components=max_pc,svd_solver='arpack',random_state=42)
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    #！！行缺失值填充或其他转换时，如果存在完全为空的列，那么在转换过程中这些列可能会被删除，导致最终的数据矩阵列数少于原始数据。
    if layer is not None:
        adata.layers[layer]  = imp.fit_transform(adata.layers[layer])
        if(feature == 'hvm'):
            print('using top variable regions ...')
            adata_filter = adata[:,adata.var['feature_select']].copy()
            sc.tl.pca(adata_filter, svd_solver='arpack')
        else:
            print('using all the features ...')
            adata.obsm["mylayer_pca"] = sc.tl.pca(adata.layers[layer], svd_solver='arpack')
    else:
        adata.X = imp.fit_transform(adata.X)
        if(feature == 'hvm'):
            print('using top variable regions ...')
            adata_filter = adata[:,adata.var['feature_select']].copy()
            sc.tl.pca(adata_filter, svd_solver='arpack')     # ！！！Error 
        else:
            print('using all the features ...')
            sc.tl.pca(adata, svd_solver='arpack')
    return None
from sklearn.decomposition import PCA as sklearnPCA
def pca(adata,layer=None,feature=None,n_pc = 30,max_pc = 50,first_pc = False,use_precomputed=True,
                                    save_fig=False,fig_name='top_pcs.pdf',fig_path=None,fig_size=(4,4),
                                    pad=1.08,w_pad=None,h_pad=None):
    
    # if(fig_path is None):
    #     fig_path = adata.uns['workdir']
    # fig_size = mpl.rcParams['figure.figsize'] if fig_size is None else fig_size  
    if(feature is None):
        feature = 'all'
    if(feature not in ['all','hvm']):
        raise ValueError("unrecognized feature '%s'" % feature)

    # 检查主数据矩阵中完全为空的列
    # 检查主数据矩阵并移除完全为空的列
    if sparse.issparse(adata.X):
        all_nan_columns = np.isnan(adata.X.toarray()).all(axis=0)
    else:
        all_nan_columns = np.isnan(adata.X).all(axis=0)
    
    # 移除主矩阵和所有层中的相应列
    adata[:, ~all_nan_columns].X
    # for layer_name in adata.layers.keys():
    #     adata.layers[layer_name] = adata.layers[layer_name][:, ~all_nan_columns]
    # 移除所有层中的相应列
    for layer_name in adata.layers.keys():
        adata[:, ~all_nan_columns].layers[layer_name]
        
    max_pc = min(min(adata.shape), n_pc) # in case cells are smaller than n_pcs
    sklearn_pca = sklearnPCA(n_components=max_pc,svd_solver='arpack',random_state=42)
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    #！！行缺失值填充或其他转换时，如果存在完全为空的列，那么在转换过程中这些列可能会被删除，导致最终的数据矩阵列数少于原始数据。
    if layer is not None:
        adata.layers[layer]  = imp.fit_transform(adata.layers[layer])
        if(feature == 'hvm'):
            print('using top variable regions ...')
            adata_filter = adata[:,adata.var['feature_select']].copy()
            sc.tl.pca(adata_filter, svd_solver='arpack')
        else:
            print('using all the features ...')
            adata.obsm["mylayer_pca"] = sc.tl.pca(adata.layers[layer], svd_solver='arpack')
    else:
        adata.X = imp.fit_transform(adata.X)
        if(feature == 'hvm'):
            print('using top variable regions ...')
            adata_filter = adata[:,adata.var['feature_select']].copy()
            sc.tl.pca(adata_filter, svd_solver='arpack')     # ！！！Error 
        else:
            print('using all the features ...')
            sc.tl.pca(adata, svd_solver='arpack')
    return None

In [62]:

Copied!





#winows100k_scm
pca(winows100k_scm,layer="relative")
sc.tl.tsne(winows100k_scm,use_rep="mylayer_pca")
winows100k_scm.obsm["relative_tsne"] = winows100k_scm.obsm['X_tsne']
pca(winows100k_scm)
sc.tl.tsne(winows100k_scm) 
#winows100k_scm
pca(winows100k_scm,layer="relative")
sc.tl.tsne(winows100k_scm,use_rep="mylayer_pca")
winows100k_scm.obsm["relative_tsne"] = winows100k_scm.obsm['X_tsne']
pca(winows100k_scm)
sc.tl.tsne(winows100k_scm) 

using all the features ...
using all the features ...

In [111]:

Copied!





import matplotlib.pyplot as plt
import seaborn as sns

title_fontsize=16
title_fontweight = "bold"
marker_offset = {1:[-10,-25],         2:[0,-20],     3: [-20,+8],              4: [3,-30],               5: [+23,-3]}
clusters = winows100k_scm.obs['Neuron type'].astype(str)  # 将数值类型转换为字符串类型
tsnes = [winows100k_scm.obsm["X_tsne"],winows100k_scm.obsm["relative_tsne"]]
titles = ['Mean methylation level, PCA', 'Relative methylation level, PCA']
ylabels = ['','']
letters = ['a','b']
with sns.plotting_context('paper'):
    plt.figure(figsize=(18,18))
    
    for i, (tsne,title,letter,ylabel) in enumerate(zip(tsnes,titles,letters,ylabels)):
        
        plt.subplot(331+i)
        ax=plt.gca()
        plt.text(0.05,1.015,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
        ax.set_ylabel(ylabel)
                    # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne_coords[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                color = points.get_facecolor().flatten()
                # 在群集的中心位置添加注释，注释内容为群集的ID
                text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center',fontweight='bold')
                # 创建背景框对象，并设置背景框的样式
                # bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
                # plt.setp(text, bbox=bbox_props)
    plt.subplot(333)
    ax=plt.gca()
    plt.text(0.05,1.015,'c',transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    plt.title('raw tsne, Luo')
    plt.xticks([])
    plt.yticks([])
    for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
            points= plt.scatter(promoters.obs['tSNE x coordinate'][cluster_idx], promoters.obs['tSNE y coordinate'][cluster_idx], s=5, edgecolor='none',rasterized=True)
            centroid_x = np.mean(promoters.obs['tSNE x coordinate'][cluster_idx])
            centroid_y = np.mean(promoters.obs['tSNE y coordinate'][cluster_idx])
            plt.text(centroid_x, centroid_y-1, cluster_id, fontsize=8, ha='center', va='center')
    plt.tight_layout()
    sns.despine(left=True,bottom=True)
import matplotlib.pyplot as plt
import seaborn as sns

title_fontsize=16
title_fontweight = "bold"
marker_offset = {1:[-10,-25],         2:[0,-20],     3: [-20,+8],              4: [3,-30],               5: [+23,-3]}
clusters = winows100k_scm.obs['Neuron type'].astype(str)  # 将数值类型转换为字符串类型
tsnes = [winows100k_scm.obsm["X_tsne"],winows100k_scm.obsm["relative_tsne"]]
titles = ['Mean methylation level, PCA', 'Relative methylation level, PCA']
ylabels = ['','']
letters = ['a','b']
with sns.plotting_context('paper'):
    plt.figure(figsize=(18,18))
    
    for i, (tsne,title,letter,ylabel) in enumerate(zip(tsnes,titles,letters,ylabels)):
        
        plt.subplot(331+i)
        ax=plt.gca()
        plt.text(0.05,1.015,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
        ax.set_ylabel(ylabel)
                    # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne_coords[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                color = points.get_facecolor().flatten()
                # 在群集的中心位置添加注释，注释内容为群集的ID
                text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center',fontweight='bold')
                # 创建背景框对象，并设置背景框的样式
                # bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
                # plt.setp(text, bbox=bbox_props)
    plt.subplot(333)
    ax=plt.gca()
    plt.text(0.05,1.015,'c',transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    plt.title('raw tsne, Luo')
    plt.xticks([])
    plt.yticks([])
    for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
            points= plt.scatter(promoters.obs['tSNE x coordinate'][cluster_idx], promoters.obs['tSNE y coordinate'][cluster_idx], s=5, edgecolor='none',rasterized=True)
            centroid_x = np.mean(promoters.obs['tSNE x coordinate'][cluster_idx])
            centroid_y = np.mean(promoters.obs['tSNE y coordinate'][cluster_idx])
            plt.text(centroid_x, centroid_y-1, cluster_id, fontsize=8, ha='center', va='center')
    plt.tight_layout()
    sns.despine(left=True,bottom=True)

posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values

In [121]:

Copied!





import matplotlib.pyplot as plt
import seaborn as sns

title_fontsize=16
title_fontweight = "bold"
marker_offset = {1:[-10,-25],         2:[0,-20],     3: [-20,+8],              4: [3,-30],               5: [+23,-3]}
clusters = promoters.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型
tsnes = [promoters.obsm["X_tsne"], promoters.obsm["relative_tsne"]
        ]
titles = ['Mean methylation level, TSNE', 'Relative methylation level, TSNE']
ylabels = ['','']
letters = ['a','b']
with sns.plotting_context('paper'):
    plt.figure(figsize=(21,21))
    
    for i, (tsne,title,letter,ylabel) in enumerate(zip(tsnes,titles,letters,ylabels)):
        print(i,tsne)
        plt.subplot(331+i)
        ax=plt.gca()
        plt.text(0.05,1.015,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
        ax.set_ylabel(ylabel)
                    # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                color = points.get_facecolor().flatten()
                # 在群集的中心位置添加注释，注释内容为群集的ID
                text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
                # 创建背景框对象，并设置背景框的样式
                # bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
                # plt.setp(text, bbox=bbox_props)
    plt.subplot(333)
    ax=plt.gca()
    plt.text(0.05,1.015,'c',transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    plt.title('raw tsne, Luo')
    plt.xticks([])
    plt.yticks([])
    for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
            points= plt.scatter(promoters.obs['tSNE x coordinate'][cluster_idx], promoters.obs['tSNE y coordinate'][cluster_idx], s=5, edgecolor='none',rasterized=True)
            centroid_x = np.mean(promoters.obs['tSNE x coordinate'][cluster_idx])
            centroid_y = np.mean(promoters.obs['tSNE y coordinate'][cluster_idx])
            plt.text(centroid_x, centroid_y-1, cluster_id, fontsize=8, ha='center', va='center')
    plt.tight_layout()
    sns.despine(left=True,bottom=True)
import matplotlib.pyplot as plt
import seaborn as sns

title_fontsize=16
title_fontweight = "bold"
marker_offset = {1:[-10,-25],         2:[0,-20],     3: [-20,+8],              4: [3,-30],               5: [+23,-3]}
clusters = promoters.obs['Neuron type'].astype(str)   # 将数值类型转换为字符串类型
tsnes = [promoters.obsm["X_tsne"], promoters.obsm["relative_tsne"]
        ]
titles = ['Mean methylation level, TSNE', 'Relative methylation level, TSNE']
ylabels = ['','']
letters = ['a','b']
with sns.plotting_context('paper'):
    plt.figure(figsize=(21,21))
    
    for i, (tsne,title,letter,ylabel) in enumerate(zip(tsnes,titles,letters,ylabels)):
        print(i,tsne)
        plt.subplot(331+i)
        ax=plt.gca()
        plt.text(0.05,1.015,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
        ax.set_ylabel(ylabel)
                    # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                color = points.get_facecolor().flatten()
                # 在群集的中心位置添加注释，注释内容为群集的ID
                text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
                # 创建背景框对象，并设置背景框的样式
                # bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
                # plt.setp(text, bbox=bbox_props)
    plt.subplot(333)
    ax=plt.gca()
    plt.text(0.05,1.015,'c',transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    plt.title('raw tsne, Luo')
    plt.xticks([])
    plt.yticks([])
    for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
            points= plt.scatter(promoters.obs['tSNE x coordinate'][cluster_idx], promoters.obs['tSNE y coordinate'][cluster_idx], s=5, edgecolor='none',rasterized=True)
            centroid_x = np.mean(promoters.obs['tSNE x coordinate'][cluster_idx])
            centroid_y = np.mean(promoters.obs['tSNE y coordinate'][cluster_idx])
            plt.text(centroid_x, centroid_y-1, cluster_id, fontsize=8, ha='center', va='center')
    plt.tight_layout()
    sns.despine(left=True,bottom=True)

0 [[-24.962395   24.903795 ]
 [ 44.67266   -24.92722  ]
 [ 55.89689    10.648388 ]
 ...
 [-44.904964   12.718646 ]
 [  7.6758413  48.378998 ]
 [ 43.32974   -35.63923  ]]
1 [[-11.600098  28.261988]
 [ 46.33883   -6.064981]
 [ 34.6808   -32.11615 ]
 ...
 [-20.024689  57.827988]
 [-42.27409   14.973227]
 [ 51.225082   2.255647]]

posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values

In [53]:

Out[53]:

0       hSst-2
1        hL2/3
2        hL2/3
3        hL2/3
4        hL2/3
         ...  
2360     hDL-1
2361     hL2/3
2362    hSst-3
2363     hNdnf
2364     hL2/3
Name: Neuron type, Length: 2364, dtype: object

In [85]:

Copied!





def feature_select(adata,number=3000):
    print(adata.shape)
    adata.raw = adata.copy()
    
    df = adata.var
    feature_subset = df.index.isin(df.sort_values('sr_var', ascending=True).index[:number])
    df['feature_select'] = feature_subset
    adata.var = df
    data_filter = adata[:,adata.var['feature_select']].copy()
    return data_filter
def feature_select(adata,number=3000):
    print(adata.shape)
    adata.raw = adata.copy()
    
    df = adata.var
    feature_subset = df.index.isin(df.sort_values('sr_var', ascending=True).index[:number])
    df['feature_select'] = feature_subset
    adata.var = df
    data_filter = adata[:,adata.var['feature_select']].copy()
    return data_filter

In [86]:

Copied!





promoters = winows100k_scm
promoters_filter = feature_select(promoters,number=10000)
pca(promoters_filter)
pca(promoters_filter,layer="relative")
sc.tl.tsne(promoters_filter,use_rep="mylayer_pca")
promoters_filter.obsm["relative_tsne"] = promoters_filter.obsm['X_tsne']
sc.tl.tsne(promoters_filter) 
promoters = winows100k_scm
promoters_filter = feature_select(promoters,number=10000)
pca(promoters_filter)
pca(promoters_filter,layer="relative")
sc.tl.tsne(promoters_filter,use_rep="mylayer_pca")
promoters_filter.obsm["relative_tsne"] = promoters_filter.obsm['X_tsne']
sc.tl.tsne(promoters_filter) 

using all the features ...
using all the features ...

In [87]:

Copied!





promoters_filter_6000 = feature_select(promoters,number=10000)
promoters_filter_6000
pca(promoters_filter_6000)
pca(promoters_filter_6000,layer="relative")
sc.tl.tsne(promoters_filter_6000,use_rep="mylayer_pca")
promoters_filter_6000.obsm["relative_tsne"] = promoters_filter_6000.obsm['X_tsne']
sc.tl.tsne(promoters_filter_6000)
promoters_filter_6000 = feature_select(promoters,number=10000)
promoters_filter_6000
pca(promoters_filter_6000)
pca(promoters_filter_6000,layer="relative")
sc.tl.tsne(promoters_filter_6000,use_rep="mylayer_pca")
promoters_filter_6000.obsm["relative_tsne"] = promoters_filter_6000.obsm['X_tsne']
sc.tl.tsne(promoters_filter_6000)

using all the features ...
using all the features ...

In [114]:

Copied!





# 设置风格和大小
title_fontsize = 12
title_fontweight = "bold"
clusters = promoters.obs['Neuron type'].astype(str)  
tsnes = [promoters.obsm["X_tsne"], promoters.obsm["relative_tsne"],
         promoters_filter.obsm["X_tsne"], promoters_filter.obsm["relative_tsne"],
         promoters_filter_6000.obsm["X_tsne"], promoters_filter_6000.obsm["relative_tsne"]
        ]
titles = ['Mean methylation level, (All Feature)', 'Relative methylation level, (All Feature)',
          'Mean methylation level, (HVM_3000)', 'Relative methylation level, (HVM_3000)',
         'Mean methylation level, (HVM_6000)', 'Relative methylation level, (HVM_6000)']
ylabels = ['', '','', '','', '']
letters = ['a', 'b','c','d','e','f']

with sns.plotting_context('talk'):
    plt.figure(figsize=(18, 18))
    
    for i, (tsne_coords, title, letter, ylabel) in enumerate(zip(tsnes, titles, letters, ylabels)):
        
        ax = plt.subplot(320 + i +1)
        plt.text(0.05, 1.015, letter, transform=ax.transAxes, fontsize=title_fontsize, fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
        ax.set_ylabel(ylabel)
    
        # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne_coords[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                # 在群集的中心位置添加注释，注释内容为群集的ID
                plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center')
                
            #     color = points.get_facecolor().flatten()
            #     this_cluster_mean = np.mean(tsne[cluster_idx,:],axis=0)
            #     plt.text(this_cluster_mean[0],this_cluster_mean[1],cluster_id,fontsize=10,fontweight='bold',color=color)

            plt.tight_layout()
                
            
        plt.tight_layout()
        sns.despine(left=True, bottom=True)
# 设置风格和大小
title_fontsize = 12
title_fontweight = "bold"
clusters = promoters.obs['Neuron type'].astype(str)  
tsnes = [promoters.obsm["X_tsne"], promoters.obsm["relative_tsne"],
         promoters_filter.obsm["X_tsne"], promoters_filter.obsm["relative_tsne"],
         promoters_filter_6000.obsm["X_tsne"], promoters_filter_6000.obsm["relative_tsne"]
        ]
titles = ['Mean methylation level, (All Feature)', 'Relative methylation level, (All Feature)',
          'Mean methylation level, (HVM_3000)', 'Relative methylation level, (HVM_3000)',
         'Mean methylation level, (HVM_6000)', 'Relative methylation level, (HVM_6000)']
ylabels = ['', '','', '','', '']
letters = ['a', 'b','c','d','e','f']

with sns.plotting_context('talk'):
    plt.figure(figsize=(18, 18))
    
    for i, (tsne_coords, title, letter, ylabel) in enumerate(zip(tsnes, titles, letters, ylabels)):
        
        ax = plt.subplot(320 + i +1)
        plt.text(0.05, 1.015, letter, transform=ax.transAxes, fontsize=title_fontsize, fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
        ax.set_ylabel(ylabel)
    
        # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne_coords[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
            if i == 0:
                # 在群集的中心位置添加注释，注释内容为群集的ID
                plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center')
                
            #     color = points.get_facecolor().flatten()
            #     this_cluster_mean = np.mean(tsne[cluster_idx,:],axis=0)
            #     plt.text(this_cluster_mean[0],this_cluster_mean[1],cluster_id,fontsize=10,fontweight='bold',color=color)

            plt.tight_layout()
                
            
        plt.tight_layout()
        sns.despine(left=True, bottom=True)

 UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\1118335867.py:43: The figure layout has changed to tight
 UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\1118335867.py:46: The figure layout has changed to tight

In [89]:

Copied!





sc.pp.neighbors(promoters, use_rep='mylayer_pca')
sc.tl.umap(promoters)
promoters.obsm["relative_umap"] = promoters.obsm['X_umap']
sc.pp.neighbors(promoters)
sc.tl.umap(promoters)
sc.pp.neighbors(promoters, use_rep='mylayer_pca')
sc.tl.umap(promoters)
promoters.obsm["relative_umap"] = promoters.obsm['X_umap']
sc.pp.neighbors(promoters)
sc.tl.umap(promoters)

In [77]:

Copied!

promoters
promoters

Out[77]:

AnnData object with n_obs × n_vars = 2364 × 29510
    obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks'
    var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'pct_cell', 'feature_select'
    uns: 'workdir', 'label_color', 'pca', 'tsne', 'neighbors', 'umap'
    obsm: 'X_pca', 'mylayer_pca', 'X_tsne', 'relative_tsne', 'X_umap', 'relative_umap'
    varm: 'PCs'
    layers: 'relative'
    obsp: 'distances', 'connectivities'

In [90]:

Copied!





#3000 feature
sc.pp.neighbors(promoters_filter, use_rep='mylayer_pca')
sc.tl.umap(promoters_filter)
promoters_filter.obsm["relative_umap"] = promoters_filter.obsm['X_umap']
sc.pp.neighbors(promoters_filter)
sc.tl.umap(promoters_filter)
#6000 feature
sc.pp.neighbors(promoters_filter_6000, use_rep='mylayer_pca')
sc.tl.umap(promoters_filter_6000)
promoters_filter_6000.obsm["relative_umap"] = promoters_filter_6000.obsm['X_umap']
sc.pp.neighbors(promoters_filter_6000)
sc.tl.umap(promoters_filter_6000)
#3000 feature
sc.pp.neighbors(promoters_filter, use_rep='mylayer_pca')
sc.tl.umap(promoters_filter)
promoters_filter.obsm["relative_umap"] = promoters_filter.obsm['X_umap']
sc.pp.neighbors(promoters_filter)
sc.tl.umap(promoters_filter)
#6000 feature
sc.pp.neighbors(promoters_filter_6000, use_rep='mylayer_pca')
sc.tl.umap(promoters_filter_6000)
promoters_filter_6000.obsm["relative_umap"] = promoters_filter_6000.obsm['X_umap']
sc.pp.neighbors(promoters_filter_6000)
sc.tl.umap(promoters_filter_6000)

In [95]:

Copied!





# 设置风格和大小
title_fontsize = 12
title_fontweight = "bold"
clusters = promoters.obs['Labeling'].astype(str)  
tsnes = [promoters.obsm["X_umap"], promoters.obsm["relative_umap"],
         promoters_filter.obsm["X_umap"], promoters_filter.obsm["relative_umap"],
         promoters_filter_6000.obsm["X_umap"], promoters_filter_6000.obsm["relative_umap"]
        ]
titles = ['Mean methylation level, (All Feature)', 'Relative methylation level, (All Feature)',
          'Mean methylation level, (HVM_3000)', 'Relative methylation level, (HVM_3000)',
         'Mean methylation level, (HVM_6000)', 'Relative methylation level, (HVM_6000)']
ylabels = ['', '','', '','', '']
letters = ['a', 'b','c','d','e','f']

with sns.plotting_context('talk'):
    plt.figure(figsize=(18, 18))
    
    for i, (tsne_coords, title, letter, ylabel) in enumerate(zip(tsnes, titles, letters, ylabels)):
        
        ax = plt.subplot(320 + i +1)
        plt.text(0.05, 1.015, letter, transform=ax.transAxes, fontsize=title_fontsize, fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
        ax.set_ylabel(ylabel)
    
        # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne_coords[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True, cmap='tab20')
            if i == 0:
                # 在群集的中心位置添加注释，注释内容为群集的ID
                plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center')
            #     color = points.get_facecolor().flatten()
            #     this_cluster_mean = np.mean(tsne[cluster_idx,:],axis=0)
            #     plt.text(this_cluster_mean[0],this_cluster_mean[1],cluster_id,fontsize=10,fontweight='bold',color=color)

            plt.tight_layout()
                
            
        plt.tight_layout()
        sns.despine(left=True, bottom=True)
# 设置风格和大小
title_fontsize = 12
title_fontweight = "bold"
clusters = promoters.obs['Labeling'].astype(str)  
tsnes = [promoters.obsm["X_umap"], promoters.obsm["relative_umap"],
         promoters_filter.obsm["X_umap"], promoters_filter.obsm["relative_umap"],
         promoters_filter_6000.obsm["X_umap"], promoters_filter_6000.obsm["relative_umap"]
        ]
titles = ['Mean methylation level, (All Feature)', 'Relative methylation level, (All Feature)',
          'Mean methylation level, (HVM_3000)', 'Relative methylation level, (HVM_3000)',
         'Mean methylation level, (HVM_6000)', 'Relative methylation level, (HVM_6000)']
ylabels = ['', '','', '','', '']
letters = ['a', 'b','c','d','e','f']

with sns.plotting_context('talk'):
    plt.figure(figsize=(18, 18))
    
    for i, (tsne_coords, title, letter, ylabel) in enumerate(zip(tsnes, titles, letters, ylabels)):
        
        ax = plt.subplot(320 + i +1)
        plt.text(0.05, 1.015, letter, transform=ax.transAxes, fontsize=title_fontsize, fontweight=title_fontweight)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
        ax.set_ylabel(ylabel)
    
        # 绘制每个群集
        for cluster_id in np.unique(clusters):
            cluster_idx = clusters == cluster_id
                    # 选取当前群集的所有点
            cluster_points = tsne_coords[cluster_idx]
            # 计算当前群集所有点的均值，这将作为注释的位置
            centroid = np.mean(cluster_points, axis=0)
            points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True, cmap='tab20')
            if i == 0:
                # 在群集的中心位置添加注释，注释内容为群集的ID
                plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center')
            #     color = points.get_facecolor().flatten()
            #     this_cluster_mean = np.mean(tsne[cluster_idx,:],axis=0)
            #     plt.text(this_cluster_mean[0],this_cluster_mean[1],cluster_id,fontsize=10,fontweight='bold',color=color)

            plt.tight_layout()
                
            
        plt.tight_layout()
        sns.despine(left=True, bottom=True)

 UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\3479663459.py:34: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
 UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\3479663459.py:42: The figure layout has changed to tight
 UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\3479663459.py:45: The figure layout has changed to tight

Feature Selectiom¶

In [130]:

Copied!





#gene selcect

with sns.plotting_context("talk"):
    fig,axes = plt.subplots(2,2,figsize=(18,18))
    for ax in axes:
        ax.scatter(promoters.var["var"],promoters.var["sr_var"],s=8, rasterized=True)
    sns.despine()
    plt.tight_layout()
#gene selcect

with sns.plotting_context("talk"):
    fig,axes = plt.subplots(2,2,figsize=(18,18))
    for ax in axes:
        ax.scatter(promoters.var["var"],promoters.var["sr_var"],s=8, rasterized=True)
    sns.despine()
    plt.tight_layout()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[130], line 6
      4 fig,axes = plt.subplots(2,2,figsize=(18,18))
      5 for ax in axes:
----> 6     ax.scatter(promoters.var["var"],promoters.var["sr_var"],s=8, rasterized=True)
      7 sns.despine()
      8 plt.tight_layout()

AttributeError: 'numpy.ndarray' object has no attribute 'scatter'

In [15]:

Copied!

promoters = winows100k_scm
promoters.var
promoters = winows100k_scm
promoters.var

Out[15]:

	chromosome	start	end	covered_cell	var	sr_var	pct_cell
index
chr1:1-100000	chr1	1	100000	2353	0.035220	0.029984	0.995347
chr1:100001-200000	chr1	100001	200000	2354	0.009842	0.016549	0.995770
chr1:200001-300000	chr1	200001	300000	2351	0.030100	0.035758	0.994501
chr1:300001-400000	chr1	300001	400000	2323	0.026505	0.055943	0.982657
chr1:400001-500000	chr1	400001	500000	2346	0.014380	0.038490	0.992386
...	...	...	...	...	...	...	...
chrY:56800001-56900000	chrY	56800001	56900000	2361	0.002751	0.003180	0.998731
chrY:56900001-57000000	chrY	56900001	57000000	2122	0.098136	0.067797	0.897631
chrY:57000001-57100000	chrY	57000001	57100000	2252	0.066954	0.067381	0.952623
chrY:57100001-57200000	chrY	57100001	57200000	2311	0.046440	0.054673	0.977580
chrY:57200001-57227415	chrY	57200001	57227415	2125	0.040066	0.080984	0.898900

29510 rows × 7 columns

In [21]:

Copied!

winows100k_scm
winows100k_scm

Out[21]:

AnnData object with n_obs × n_vars = 2364 × 29510
    obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks'
    var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'pct_cell', 'mean'
    uns: 'workdir', 'label_color'
    layers: 'relative'

In [14]:

Copied!





dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True

alpha=0.05
def make_panel(x,
               y,
               ax,
               letter='',
               title='',
               color_kde=True,
               xlim=None,
               ylim=None,
               xlabel='mean expression',
               ylabel='',
               xscale='log',
               yscale='log',):
    
    ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    if color_kde:
        xtrans = np.log10(x) if xscale == 'log' else x
        ytrans = np.log10(y) if yscale == 'log' else y
        data = np.vstack((xtrans,ytrans))
        kde = gaussian_kde(data)
        logdensity = kde.logpdf(data)
        points = ax.scatter(x, y, s=dotsize, c=logdensity,  rasterized=True)

    else:
        ax.scatter(x, y, s=dotsize,  rasterized=True)
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    if xlim:
        ax.set_xlim(xlim)
    if ylim:
        ax.set_ylim(ylim)
        
    ax.set_title(title)
    
def add_line(x,y,ax):
    ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
    
def fix_ticks(ax):
    ax.set_yticks(ax.get_yticks()[1:-1])
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True

alpha=0.05
def make_panel(x,
               y,
               ax,
               letter='',
               title='',
               color_kde=True,
               xlim=None,
               ylim=None,
               xlabel='mean expression',
               ylabel='',
               xscale='log',
               yscale='log',):
    
    ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
    if color_kde:
        xtrans = np.log10(x) if xscale == 'log' else x
        ytrans = np.log10(y) if yscale == 'log' else y
        data = np.vstack((xtrans,ytrans))
        kde = gaussian_kde(data)
        logdensity = kde.logpdf(data)
        points = ax.scatter(x, y, s=dotsize, c=logdensity,  rasterized=True)

    else:
        ax.scatter(x, y, s=dotsize,  rasterized=True)
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    if xlim:
        ax.set_xlim(xlim)
    if ylim:
        ax.set_ylim(ylim)
        
    ax.set_title(title)
    
def add_line(x,y,ax):
    ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
    
def fix_ticks(ax):
    ax.set_yticks(ax.get_yticks()[1:-1])

In [ ]:

Copied!





#relarionship between mean and var/sr_var, sr_var are the variation of shrunken risiduals
with sns.plotting_context("talk"):

    fig = plt.figure(figsize=(18,9)) #constrained_layout=True)

    gs = GridSpec(1,2 , figure=fig)
    ax1 = fig.add_subplot(gs[0, 0])  #var
    ax2 = fig.add_subplot(gs[0, 1])  #sr_var


    make_panel(x=winows100k_scm.var['mean'],
               y=winows100k_scm.var['var'],
               ax=ax1,
               letter='a',
               color_kde=False,
               ylabel=r'Var ($\hat\beta_{0g}$)',
               yscale='linear')
    # beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
    # add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
    fix_ticks(ax1)
    
    make_panel(x=winows100k_scm.var['mean'],
               y=winows100k_scm.var['sr_var'],
               ax=ax2,
               letter='b',
               ylabel=r'sr_Var ($\hat\beta_{1g}$)',
               yscale='linear')
    # beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
    # add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)   
    # ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)
        
    sns.despine()    
    plt.tight_layout()
    #plt.savefig('figures/Fig1.pdf', dpi=300, format=None)
    plt.show()
#relarionship between mean and var/sr_var, sr_var are the variation of shrunken risiduals
with sns.plotting_context("talk"):

    fig = plt.figure(figsize=(18,9)) #constrained_layout=True)

    gs = GridSpec(1,2 , figure=fig)
    ax1 = fig.add_subplot(gs[0, 0])  #var
    ax2 = fig.add_subplot(gs[0, 1])  #sr_var


    make_panel(x=winows100k_scm.var['mean'],
               y=winows100k_scm.var['var'],
               ax=ax1,
               letter='a',
               color_kde=False,
               ylabel=r'Var ($\hat\beta_{0g}$)',
               yscale='linear')
    # beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
    # add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
    fix_ticks(ax1)
    
    make_panel(x=winows100k_scm.var['mean'],
               y=winows100k_scm.var['sr_var'],
               ax=ax2,
               letter='b',
               ylabel=r'sr_Var ($\hat\beta_{1g}$)',
               yscale='linear')
    # beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
    # add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)   
    # ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)
        
    sns.despine()    
    plt.tight_layout()
    #plt.savefig('figures/Fig1.pdf', dpi=300, format=None)
    plt.show()

In [ ]: