Feature Selection
In [2]:
Copied!
import pandas as pd
import numpy as np
import scanpy as sc
import episcanpy as epi
import scMethtools as scm
import pandas as pd
import numpy as np
import scanpy as sc
import episcanpy as epi
import scMethtools as scm
In [11]:
Copied!
%load_ext autoreload
%autoreload 2
%load_ext autoreload
%autoreload 2
In [3]:
Copied!
import os
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
import os
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
from scipy.stats import gaussian_kde
In [4]:
Copied!
#setting workdir
workdir = 'D://Test/GSE97179/scMethtools'
os.chdir(workdir)
# path_fm = os.path.join(workdir,'feature_matrices/')
# path_clusters = os.path.join(workdir,'clusters/')
# path_metrics = os.path.join(workdir,'metrics/')
# os.system('mkdir -p '+path_clusters)
# os.system('mkdir -p '+path_metrics)
#setting workdir
workdir = 'D://Test/GSE97179/scMethtools'
os.chdir(workdir)
# path_fm = os.path.join(workdir,'feature_matrices/')
# path_clusters = os.path.join(workdir,'clusters/')
# path_metrics = os.path.join(workdir,'metrics/')
# os.system('mkdir -p '+path_clusters)
# os.system('mkdir -p '+path_metrics)
In [5]:
Copied!
winows100k_scm = scm.pp.load_scm('./windows.h5')
winows100k_scm = scm.pp.load_scm('./windows.h5')
Single Feature -- Fixed Windows(10kb)¶
In [6]:
Copied!
winows100k_scm
winows100k_scm
Out[6]:
AnnData object with n_obs × n_vars = 2365 × 30894 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var' uns: 'workdir' layers: 'relative'
In [6]:
Copied!
scm.pp.add_meta(winows100k_scm,meta_file='../meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',sep='\t',index_col='Sample_id')
scm.pp.add_meta(winows100k_scm,meta_file='../meta/Luo_GSE97179_human_mid_cortex_raw_meta.txt',sep='\t',index_col='Sample_id')
No column 'label' found in metadata, 'unknown' is used as the default cell labels No column 'label_color' found in metadata, random color is generated for each cell label
UserWarning:D:\conda\envs\py39\lib\site-packages\anndata\_core\anndata.py:798: AnnData expects .obs.index to contain strings, but got values like: [0, 1, 2, 3, 4] Inferred to be: integer
In [8]:
Copied!
winows100k_scm.obs
winows100k_scm.obs
Out[8]:
cell_id | cell_name | sites | meth | n_total | global_meth_level | Sample_id | Sample | Animal age | FACS date | ... | % Filtered reads | mCCC/CCC | mCG/CG | mCH/CH | Estimated mCG/CG | Estimated mCH/CH | Coverage (%) | Neuron type | tSNE x coordinate | tSNE y coordinate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | GSM2556580 | 3186266 | 2496679 | 3825044 | 0.783575 | GSM2556580 | Pool_1026_AD002_indexed | 25 yr | 2012/6/29 | ... | 48.40% | 0.00540 | 0.79871 | 0.02211 | 0.79762 | 0.01680 | 6.01 | hSst-2 | 20.93320 | 13.43860 |
1 | 1 | GSM2557770 | 2572709 | 1931054 | 3242784 | 0.750592 | GSM2557770 | Pool_263_AD010_indexed | 25 yr | 2012/6/29 | ... | 51.30% | 0.00763 | 0.76486 | 0.03588 | 0.76305 | 0.02847 | 5.11 | hL2/3 | -7.61221 | 13.74140 |
2 | 2 | GSM2556556 | 3247648 | 2505895 | 3882099 | 0.771603 | GSM2556556 | Pool_1012_AD006_indexed | 25 yr | 2012/6/29 | ... | 53.40% | 0.00994 | 0.78647 | 0.05192 | 0.78433 | 0.04240 | 6.50 | hL2/3 | -14.57080 | 4.75207 |
3 | 3 | GSM2556719 | 3593247 | 2753322 | 4304895 | 0.766249 | GSM2556719 | Pool_1090_AD006_indexed | 25 yr | 2012/6/29 | ... | 47.80% | 0.00818 | 0.78164 | 0.04276 | 0.77984 | 0.03487 | 7.19 | hL2/3 | -12.23290 | 10.33370 |
4 | 4 | GSM2558838 | 1671616 | 1213688 | 1944619 | 0.726057 | GSM2558838 | Pool_771_AD008_indexed | 25 yr | 2012/6/29 | ... | 50.30% | 0.00850 | 0.73986 | 0.04119 | 0.73763 | 0.03297 | 3.26 | hL2/3 | -12.52800 | 12.85390 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2360 | 2360 | GSM2557118 | 4030037 | 3049428 | 5723469 | 0.756675 | GSM2557118 | Pool_1381_AD002_indexed | 25 yr | 2012/6/29 | ... | 38.90% | 0.01059 | 0.76468 | 0.05545 | 0.76216 | 0.04534 | 7.86 | hDL-1 | -9.11158 | -1.04451 |
2361 | 2361 | GSM2558140 | 3335079 | 2526078 | 4135394 | 0.757427 | GSM2558140 | Pool_441_AD008_indexed | 25 yr | 2012/6/29 | ... | 41.10% | 0.00667 | 0.76573 | 0.03698 | 0.76416 | 0.03051 | 6.79 | hL2/3 | -11.26030 | 12.10890 |
2362 | 2362 | GSM2557032 | 4045664 | 3312230 | 5141670 | 0.818711 | GSM2557032 | Pool_1359_AD010_indexed | 25 yr | 2012/6/29 | ... | 43.10% | 0.01548 | 0.83100 | 0.08086 | 0.82834 | 0.06641 | 8.27 | hSst-3 | 15.37830 | -9.40008 |
2363 | 2363 | GSM2558877 | 1554042 | 1192705 | 1806746 | 0.767486 | GSM2558877 | Pool_790_AD010_indexed | 25 yr | 2012/6/29 | ... | 49.00% | 0.00764 | 0.78764 | 0.03502 | 0.78601 | 0.02759 | 2.96 | hNdnf | 16.07330 | 8.95018 |
2364 | 2364 | GSM2557111 | 4053094 | 3004728 | 5423767 | 0.741342 | GSM2557111 | Pool_1379_AD010_indexed | 25 yr | 2012/6/29 | ... | 41.10% | 0.00676 | 0.75210 | 0.03068 | 0.75041 | 0.02408 | 8.11 | hL2/3 | -5.32003 | 13.90730 |
2365 rows × 39 columns
In [9]:
Copied!
winows100k_scm.uns['workdir']
winows100k_scm.uns['workdir']
Out[9]:
'/xtdisk/methbank_baoym/zongwt/single/data/GSE97179/out/scm'
Filter¶
In [8]:
Copied!
filter_cells(winows100k_scm,min_n_features = 10000)
filter_features(winows100k_scm,min_n_cells=500)
filter_cells(winows100k_scm,min_n_features = 10000)
filter_features(winows100k_scm,min_n_cells=500)
filter cells based on min_n_features >= 10000
ImplicitModificationWarning:D:\conda\envs\py39\lib\site-packages\anndata\_core\anndata.py:121: Transforming to str index.
after filtering out low-quality cells: 2364 cells, 30894 regions
RuntimeWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_15988\2075018014.py:11: Mean of empty slice
Filter regions based on min_n_cells After filtering out low coveraged regions: 2364 cells, 29510 regions
In [7]:
Copied!
def filter_features(adata,
min_n_cells = None, max_n_cells=None,
min_pct_cells = None, max_pct_cells=None,
assay=None):
feature = 'regions'
# if('covered_cell' in adata.var_keys()):
# n_cells = adata.var['covered_cell']
# else:
if sparse.issparse(adata.X):
dense_X = adata.X.toarray()
feature_mean = np.nanmean(dense_X, axis=0)
n_cells = np.sum(~np.isnan(dense_X), axis=0).astype(int)
else:
n_cells = np.sum(~np.isnan(adata.X), axis=0).astype(int)
feature_mean = np.nanmean(adata.X, axis=0)
adata.var['covered_cell'] = n_cells
adata.var['mean'] = feature_mean
if('pct_cell' in adata.var_keys()):
pct_cells = adata.var['pct_cell']
else:
pct_cells = n_cells/adata.shape[0]
adata.var['pct_cell'] = pct_cells
if(sum(list(map(lambda x: x is None,[min_n_cells,min_pct_cells,
max_n_cells,max_pct_cells,])))==4):
print('No filtering')
else:
feature_subset = np.ones(len(adata.var_names),dtype=bool)
if(min_n_cells!=None):
print('Filter '+feature+' based on min_n_cells')
feature_subset = (n_cells>=min_n_cells) & feature_subset
if(max_n_cells!=None):
print('Filter '+feature+' based on max_n_cells')
feature_subset = (n_cells<=max_n_cells) & feature_subset
if(min_pct_cells!=None):
print('Filter '+feature+' based on min_pct_cells')
feature_subset = (pct_cells>=min_pct_cells) & feature_subset
if(max_pct_cells!=None):
print('Filter '+feature+' based on max_pct_cells')
feature_subset = (pct_cells<=max_pct_cells) & feature_subset
adata._inplace_subset_var(feature_subset)
print('After filtering out low coveraged '+feature+': ')
print(str(adata.shape[0])+' cells, ' + str(adata.shape[1])+' '+feature)
return None
def filter_cells(adata,
min_n_features = None, max_n_features = None,
min_pct_features = None, max_pct_features = None,
min_mc_level =None, max_mc_level =None,
min_n_sites = None,
assay=None):
feature = 'regions'
if('sites' in adata.obs_keys()):
n_sites = adata.obs['sites']
if('global_meth_level' in adata.obs_keys()):
methylation_level = adata.obs['global_meth_level']
if('features_number' in adata.obs_keys()):
n_features = adata.obs['features_number']
else:
if sparse.issparse(adata.X):
n_features = np.sum(~np.isnan(adata.X.toarray()), axis=1).astype(int)
else:
n_features = np.sum(~np.isnan(adata.X), axis=1).astype(int)
adata.obs['n_features'] = n_features
if('pct_features' in adata.obs_keys()):
pct_features = adata.obs['pct_features']
else:
pct_features = n_features/adata.shape[1]
adata.obs['pct_peaks'] = pct_features
if(sum(list(map(lambda x: x is None,[min_n_features,min_pct_features,min_n_sites,min_mc_level,
max_n_features,max_pct_features,max_mc_level])))==7):
print('No filtering')
else:
cell_subset = np.ones(len(adata.obs_names),dtype=bool)
if(min_n_features!=None):
print('filter cells based on min_n_features >= ', min_n_features)
cell_subset = (n_features>=min_n_features) & cell_subset
if(max_n_features!=None):
print('filter cells based on max_n_features <=', max_n_features)
cell_subset = (n_features<=max_n_features) & cell_subset
if(min_pct_features!=None):
print('filter cells based on min_pct_features >= ',min_pct_features)
cell_subset = (pct_features>=min_pct_features) & cell_subset
if(max_pct_features!=None):
print('filter cells based on max_pct_features <= ',max_pct_features)
cell_subset = (pct_features<=max_pct_features) & cell_subset
adata._inplace_subset_obs(cell_subset)
print('after filtering out low-quality cells: ')
print(str(adata.shape[0])+' cells, ' + str(adata.shape[1])+' '+feature)
return None
def filter_features(adata,
min_n_cells = None, max_n_cells=None,
min_pct_cells = None, max_pct_cells=None,
assay=None):
feature = 'regions'
# if('covered_cell' in adata.var_keys()):
# n_cells = adata.var['covered_cell']
# else:
if sparse.issparse(adata.X):
dense_X = adata.X.toarray()
feature_mean = np.nanmean(dense_X, axis=0)
n_cells = np.sum(~np.isnan(dense_X), axis=0).astype(int)
else:
n_cells = np.sum(~np.isnan(adata.X), axis=0).astype(int)
feature_mean = np.nanmean(adata.X, axis=0)
adata.var['covered_cell'] = n_cells
adata.var['mean'] = feature_mean
if('pct_cell' in adata.var_keys()):
pct_cells = adata.var['pct_cell']
else:
pct_cells = n_cells/adata.shape[0]
adata.var['pct_cell'] = pct_cells
if(sum(list(map(lambda x: x is None,[min_n_cells,min_pct_cells,
max_n_cells,max_pct_cells,])))==4):
print('No filtering')
else:
feature_subset = np.ones(len(adata.var_names),dtype=bool)
if(min_n_cells!=None):
print('Filter '+feature+' based on min_n_cells')
feature_subset = (n_cells>=min_n_cells) & feature_subset
if(max_n_cells!=None):
print('Filter '+feature+' based on max_n_cells')
feature_subset = (n_cells<=max_n_cells) & feature_subset
if(min_pct_cells!=None):
print('Filter '+feature+' based on min_pct_cells')
feature_subset = (pct_cells>=min_pct_cells) & feature_subset
if(max_pct_cells!=None):
print('Filter '+feature+' based on max_pct_cells')
feature_subset = (pct_cells<=max_pct_cells) & feature_subset
adata._inplace_subset_var(feature_subset)
print('After filtering out low coveraged '+feature+': ')
print(str(adata.shape[0])+' cells, ' + str(adata.shape[1])+' '+feature)
return None
def filter_cells(adata,
min_n_features = None, max_n_features = None,
min_pct_features = None, max_pct_features = None,
min_mc_level =None, max_mc_level =None,
min_n_sites = None,
assay=None):
feature = 'regions'
if('sites' in adata.obs_keys()):
n_sites = adata.obs['sites']
if('global_meth_level' in adata.obs_keys()):
methylation_level = adata.obs['global_meth_level']
if('features_number' in adata.obs_keys()):
n_features = adata.obs['features_number']
else:
if sparse.issparse(adata.X):
n_features = np.sum(~np.isnan(adata.X.toarray()), axis=1).astype(int)
else:
n_features = np.sum(~np.isnan(adata.X), axis=1).astype(int)
adata.obs['n_features'] = n_features
if('pct_features' in adata.obs_keys()):
pct_features = adata.obs['pct_features']
else:
pct_features = n_features/adata.shape[1]
adata.obs['pct_peaks'] = pct_features
if(sum(list(map(lambda x: x is None,[min_n_features,min_pct_features,min_n_sites,min_mc_level,
max_n_features,max_pct_features,max_mc_level])))==7):
print('No filtering')
else:
cell_subset = np.ones(len(adata.obs_names),dtype=bool)
if(min_n_features!=None):
print('filter cells based on min_n_features >= ', min_n_features)
cell_subset = (n_features>=min_n_features) & cell_subset
if(max_n_features!=None):
print('filter cells based on max_n_features <=', max_n_features)
cell_subset = (n_features<=max_n_features) & cell_subset
if(min_pct_features!=None):
print('filter cells based on min_pct_features >= ',min_pct_features)
cell_subset = (pct_features>=min_pct_features) & cell_subset
if(max_pct_features!=None):
print('filter cells based on max_pct_features <= ',max_pct_features)
cell_subset = (pct_features<=max_pct_features) & cell_subset
adata._inplace_subset_obs(cell_subset)
print('after filtering out low-quality cells: ')
print(str(adata.shape[0])+' cells, ' + str(adata.shape[1])+' '+feature)
return None
Dimension Reduction¶
In [61]:
Copied!
from sklearn.decomposition import PCA as sklearnPCA
def pca(adata,layer=None,feature=None,n_pc = 30,max_pc = 50,first_pc = False,use_precomputed=True,
save_fig=False,fig_name='top_pcs.pdf',fig_path=None,fig_size=(4,4),
pad=1.08,w_pad=None,h_pad=None):
# if(fig_path is None):
# fig_path = adata.uns['workdir']
# fig_size = mpl.rcParams['figure.figsize'] if fig_size is None else fig_size
if(feature is None):
feature = 'all'
if(feature not in ['all','hvm']):
raise ValueError("unrecognized feature '%s'" % feature)
# 检查主数据矩阵中完全为空的列
# 检查主数据矩阵并移除完全为空的列
if sparse.issparse(adata.X):
all_nan_columns = np.isnan(adata.X.toarray()).all(axis=0)
else:
all_nan_columns = np.isnan(adata.X).all(axis=0)
# 移除主矩阵和所有层中的相应列
adata[:, ~all_nan_columns].X
# for layer_name in adata.layers.keys():
# adata.layers[layer_name] = adata.layers[layer_name][:, ~all_nan_columns]
# 移除所有层中的相应列
for layer_name in adata.layers.keys():
adata[:, ~all_nan_columns].layers[layer_name]
max_pc = min(min(adata.shape), n_pc) # in case cells are smaller than n_pcs
sklearn_pca = sklearnPCA(n_components=max_pc,svd_solver='arpack',random_state=42)
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
#!!行缺失值填充或其他转换时,如果存在完全为空的列,那么在转换过程中这些列可能会被删除,导致最终的数据矩阵列数少于原始数据。
if layer is not None:
adata.layers[layer] = imp.fit_transform(adata.layers[layer])
if(feature == 'hvm'):
print('using top variable regions ...')
adata_filter = adata[:,adata.var['feature_select']].copy()
sc.tl.pca(adata_filter, svd_solver='arpack')
else:
print('using all the features ...')
adata.obsm["mylayer_pca"] = sc.tl.pca(adata.layers[layer], svd_solver='arpack')
else:
adata.X = imp.fit_transform(adata.X)
if(feature == 'hvm'):
print('using top variable regions ...')
adata_filter = adata[:,adata.var['feature_select']].copy()
sc.tl.pca(adata_filter, svd_solver='arpack') # !!!Error
else:
print('using all the features ...')
sc.tl.pca(adata, svd_solver='arpack')
return None
from sklearn.decomposition import PCA as sklearnPCA
def pca(adata,layer=None,feature=None,n_pc = 30,max_pc = 50,first_pc = False,use_precomputed=True,
save_fig=False,fig_name='top_pcs.pdf',fig_path=None,fig_size=(4,4),
pad=1.08,w_pad=None,h_pad=None):
# if(fig_path is None):
# fig_path = adata.uns['workdir']
# fig_size = mpl.rcParams['figure.figsize'] if fig_size is None else fig_size
if(feature is None):
feature = 'all'
if(feature not in ['all','hvm']):
raise ValueError("unrecognized feature '%s'" % feature)
# 检查主数据矩阵中完全为空的列
# 检查主数据矩阵并移除完全为空的列
if sparse.issparse(adata.X):
all_nan_columns = np.isnan(adata.X.toarray()).all(axis=0)
else:
all_nan_columns = np.isnan(adata.X).all(axis=0)
# 移除主矩阵和所有层中的相应列
adata[:, ~all_nan_columns].X
# for layer_name in adata.layers.keys():
# adata.layers[layer_name] = adata.layers[layer_name][:, ~all_nan_columns]
# 移除所有层中的相应列
for layer_name in adata.layers.keys():
adata[:, ~all_nan_columns].layers[layer_name]
max_pc = min(min(adata.shape), n_pc) # in case cells are smaller than n_pcs
sklearn_pca = sklearnPCA(n_components=max_pc,svd_solver='arpack',random_state=42)
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy="mean")
#!!行缺失值填充或其他转换时,如果存在完全为空的列,那么在转换过程中这些列可能会被删除,导致最终的数据矩阵列数少于原始数据。
if layer is not None:
adata.layers[layer] = imp.fit_transform(adata.layers[layer])
if(feature == 'hvm'):
print('using top variable regions ...')
adata_filter = adata[:,adata.var['feature_select']].copy()
sc.tl.pca(adata_filter, svd_solver='arpack')
else:
print('using all the features ...')
adata.obsm["mylayer_pca"] = sc.tl.pca(adata.layers[layer], svd_solver='arpack')
else:
adata.X = imp.fit_transform(adata.X)
if(feature == 'hvm'):
print('using top variable regions ...')
adata_filter = adata[:,adata.var['feature_select']].copy()
sc.tl.pca(adata_filter, svd_solver='arpack') # !!!Error
else:
print('using all the features ...')
sc.tl.pca(adata, svd_solver='arpack')
return None
In [62]:
Copied!
#winows100k_scm
pca(winows100k_scm,layer="relative")
sc.tl.tsne(winows100k_scm,use_rep="mylayer_pca")
winows100k_scm.obsm["relative_tsne"] = winows100k_scm.obsm['X_tsne']
pca(winows100k_scm)
sc.tl.tsne(winows100k_scm)
#winows100k_scm
pca(winows100k_scm,layer="relative")
sc.tl.tsne(winows100k_scm,use_rep="mylayer_pca")
winows100k_scm.obsm["relative_tsne"] = winows100k_scm.obsm['X_tsne']
pca(winows100k_scm)
sc.tl.tsne(winows100k_scm)
using all the features ... using all the features ...
In [111]:
Copied!
import matplotlib.pyplot as plt
import seaborn as sns
title_fontsize=16
title_fontweight = "bold"
marker_offset = {1:[-10,-25], 2:[0,-20], 3: [-20,+8], 4: [3,-30], 5: [+23,-3]}
clusters = winows100k_scm.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
tsnes = [winows100k_scm.obsm["X_tsne"],winows100k_scm.obsm["relative_tsne"]]
titles = ['Mean methylation level, PCA', 'Relative methylation level, PCA']
ylabels = ['','']
letters = ['a','b']
with sns.plotting_context('paper'):
plt.figure(figsize=(18,18))
for i, (tsne,title,letter,ylabel) in enumerate(zip(tsnes,titles,letters,ylabels)):
plt.subplot(331+i)
ax=plt.gca()
plt.text(0.05,1.015,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
ax.set_ylabel(ylabel)
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne_coords[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
color = points.get_facecolor().flatten()
# 在群集的中心位置添加注释,注释内容为群集的ID
text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center',fontweight='bold')
# 创建背景框对象,并设置背景框的样式
# bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
# plt.setp(text, bbox=bbox_props)
plt.subplot(333)
ax=plt.gca()
plt.text(0.05,1.015,'c',transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title('raw tsne, Luo')
plt.xticks([])
plt.yticks([])
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
points= plt.scatter(promoters.obs['tSNE x coordinate'][cluster_idx], promoters.obs['tSNE y coordinate'][cluster_idx], s=5, edgecolor='none',rasterized=True)
centroid_x = np.mean(promoters.obs['tSNE x coordinate'][cluster_idx])
centroid_y = np.mean(promoters.obs['tSNE y coordinate'][cluster_idx])
plt.text(centroid_x, centroid_y-1, cluster_id, fontsize=8, ha='center', va='center')
plt.tight_layout()
sns.despine(left=True,bottom=True)
import matplotlib.pyplot as plt
import seaborn as sns
title_fontsize=16
title_fontweight = "bold"
marker_offset = {1:[-10,-25], 2:[0,-20], 3: [-20,+8], 4: [3,-30], 5: [+23,-3]}
clusters = winows100k_scm.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
tsnes = [winows100k_scm.obsm["X_tsne"],winows100k_scm.obsm["relative_tsne"]]
titles = ['Mean methylation level, PCA', 'Relative methylation level, PCA']
ylabels = ['','']
letters = ['a','b']
with sns.plotting_context('paper'):
plt.figure(figsize=(18,18))
for i, (tsne,title,letter,ylabel) in enumerate(zip(tsnes,titles,letters,ylabels)):
plt.subplot(331+i)
ax=plt.gca()
plt.text(0.05,1.015,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
ax.set_ylabel(ylabel)
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne_coords[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
color = points.get_facecolor().flatten()
# 在群集的中心位置添加注释,注释内容为群集的ID
text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center',fontweight='bold')
# 创建背景框对象,并设置背景框的样式
# bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
# plt.setp(text, bbox=bbox_props)
plt.subplot(333)
ax=plt.gca()
plt.text(0.05,1.015,'c',transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title('raw tsne, Luo')
plt.xticks([])
plt.yticks([])
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
points= plt.scatter(promoters.obs['tSNE x coordinate'][cluster_idx], promoters.obs['tSNE y coordinate'][cluster_idx], s=5, edgecolor='none',rasterized=True)
centroid_x = np.mean(promoters.obs['tSNE x coordinate'][cluster_idx])
centroid_y = np.mean(promoters.obs['tSNE y coordinate'][cluster_idx])
plt.text(centroid_x, centroid_y-1, cluster_id, fontsize=8, ha='center', va='center')
plt.tight_layout()
sns.despine(left=True,bottom=True)
posx and posy should be finite values posx and posy should be finite values posx and posy should be finite values
In [121]:
Copied!
import matplotlib.pyplot as plt
import seaborn as sns
title_fontsize=16
title_fontweight = "bold"
marker_offset = {1:[-10,-25], 2:[0,-20], 3: [-20,+8], 4: [3,-30], 5: [+23,-3]}
clusters = promoters.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
tsnes = [promoters.obsm["X_tsne"], promoters.obsm["relative_tsne"]
]
titles = ['Mean methylation level, TSNE', 'Relative methylation level, TSNE']
ylabels = ['','']
letters = ['a','b']
with sns.plotting_context('paper'):
plt.figure(figsize=(21,21))
for i, (tsne,title,letter,ylabel) in enumerate(zip(tsnes,titles,letters,ylabels)):
print(i,tsne)
plt.subplot(331+i)
ax=plt.gca()
plt.text(0.05,1.015,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
ax.set_ylabel(ylabel)
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
color = points.get_facecolor().flatten()
# 在群集的中心位置添加注释,注释内容为群集的ID
text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
# 创建背景框对象,并设置背景框的样式
# bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
# plt.setp(text, bbox=bbox_props)
plt.subplot(333)
ax=plt.gca()
plt.text(0.05,1.015,'c',transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title('raw tsne, Luo')
plt.xticks([])
plt.yticks([])
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
points= plt.scatter(promoters.obs['tSNE x coordinate'][cluster_idx], promoters.obs['tSNE y coordinate'][cluster_idx], s=5, edgecolor='none',rasterized=True)
centroid_x = np.mean(promoters.obs['tSNE x coordinate'][cluster_idx])
centroid_y = np.mean(promoters.obs['tSNE y coordinate'][cluster_idx])
plt.text(centroid_x, centroid_y-1, cluster_id, fontsize=8, ha='center', va='center')
plt.tight_layout()
sns.despine(left=True,bottom=True)
import matplotlib.pyplot as plt
import seaborn as sns
title_fontsize=16
title_fontweight = "bold"
marker_offset = {1:[-10,-25], 2:[0,-20], 3: [-20,+8], 4: [3,-30], 5: [+23,-3]}
clusters = promoters.obs['Neuron type'].astype(str) # 将数值类型转换为字符串类型
tsnes = [promoters.obsm["X_tsne"], promoters.obsm["relative_tsne"]
]
titles = ['Mean methylation level, TSNE', 'Relative methylation level, TSNE']
ylabels = ['','']
letters = ['a','b']
with sns.plotting_context('paper'):
plt.figure(figsize=(21,21))
for i, (tsne,title,letter,ylabel) in enumerate(zip(tsnes,titles,letters,ylabels)):
print(i,tsne)
plt.subplot(331+i)
ax=plt.gca()
plt.text(0.05,1.015,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
ax.set_ylabel(ylabel)
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne[cluster_idx, 0], tsne[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
color = points.get_facecolor().flatten()
# 在群集的中心位置添加注释,注释内容为群集的ID
text = plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=10, ha='center', va='center')
# 创建背景框对象,并设置背景框的样式
# bbox_props = dict(boxstyle="round,pad=0.3", fc="white", alpha=0.2)
# plt.setp(text, bbox=bbox_props)
plt.subplot(333)
ax=plt.gca()
plt.text(0.05,1.015,'c',transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
plt.title('raw tsne, Luo')
plt.xticks([])
plt.yticks([])
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
points= plt.scatter(promoters.obs['tSNE x coordinate'][cluster_idx], promoters.obs['tSNE y coordinate'][cluster_idx], s=5, edgecolor='none',rasterized=True)
centroid_x = np.mean(promoters.obs['tSNE x coordinate'][cluster_idx])
centroid_y = np.mean(promoters.obs['tSNE y coordinate'][cluster_idx])
plt.text(centroid_x, centroid_y-1, cluster_id, fontsize=8, ha='center', va='center')
plt.tight_layout()
sns.despine(left=True,bottom=True)
0 [[-24.962395 24.903795 ] [ 44.67266 -24.92722 ] [ 55.89689 10.648388 ] ... [-44.904964 12.718646 ] [ 7.6758413 48.378998 ] [ 43.32974 -35.63923 ]] 1 [[-11.600098 28.261988] [ 46.33883 -6.064981] [ 34.6808 -32.11615 ] ... [-20.024689 57.827988] [-42.27409 14.973227] [ 51.225082 2.255647]]
posx and posy should be finite values posx and posy should be finite values posx and posy should be finite values
In [53]:
Copied!
Out[53]:
0 hSst-2 1 hL2/3 2 hL2/3 3 hL2/3 4 hL2/3 ... 2360 hDL-1 2361 hL2/3 2362 hSst-3 2363 hNdnf 2364 hL2/3 Name: Neuron type, Length: 2364, dtype: object
In [85]:
Copied!
def feature_select(adata,number=3000):
print(adata.shape)
adata.raw = adata.copy()
df = adata.var
feature_subset = df.index.isin(df.sort_values('sr_var', ascending=True).index[:number])
df['feature_select'] = feature_subset
adata.var = df
data_filter = adata[:,adata.var['feature_select']].copy()
return data_filter
def feature_select(adata,number=3000):
print(adata.shape)
adata.raw = adata.copy()
df = adata.var
feature_subset = df.index.isin(df.sort_values('sr_var', ascending=True).index[:number])
df['feature_select'] = feature_subset
adata.var = df
data_filter = adata[:,adata.var['feature_select']].copy()
return data_filter
In [86]:
Copied!
promoters = winows100k_scm
promoters_filter = feature_select(promoters,number=10000)
pca(promoters_filter)
pca(promoters_filter,layer="relative")
sc.tl.tsne(promoters_filter,use_rep="mylayer_pca")
promoters_filter.obsm["relative_tsne"] = promoters_filter.obsm['X_tsne']
sc.tl.tsne(promoters_filter)
promoters = winows100k_scm
promoters_filter = feature_select(promoters,number=10000)
pca(promoters_filter)
pca(promoters_filter,layer="relative")
sc.tl.tsne(promoters_filter,use_rep="mylayer_pca")
promoters_filter.obsm["relative_tsne"] = promoters_filter.obsm['X_tsne']
sc.tl.tsne(promoters_filter)
using all the features ... using all the features ...
In [87]:
Copied!
promoters_filter_6000 = feature_select(promoters,number=10000)
promoters_filter_6000
pca(promoters_filter_6000)
pca(promoters_filter_6000,layer="relative")
sc.tl.tsne(promoters_filter_6000,use_rep="mylayer_pca")
promoters_filter_6000.obsm["relative_tsne"] = promoters_filter_6000.obsm['X_tsne']
sc.tl.tsne(promoters_filter_6000)
promoters_filter_6000 = feature_select(promoters,number=10000)
promoters_filter_6000
pca(promoters_filter_6000)
pca(promoters_filter_6000,layer="relative")
sc.tl.tsne(promoters_filter_6000,use_rep="mylayer_pca")
promoters_filter_6000.obsm["relative_tsne"] = promoters_filter_6000.obsm['X_tsne']
sc.tl.tsne(promoters_filter_6000)
using all the features ... using all the features ...
In [114]:
Copied!
# 设置风格和大小
title_fontsize = 12
title_fontweight = "bold"
clusters = promoters.obs['Neuron type'].astype(str)
tsnes = [promoters.obsm["X_tsne"], promoters.obsm["relative_tsne"],
promoters_filter.obsm["X_tsne"], promoters_filter.obsm["relative_tsne"],
promoters_filter_6000.obsm["X_tsne"], promoters_filter_6000.obsm["relative_tsne"]
]
titles = ['Mean methylation level, (All Feature)', 'Relative methylation level, (All Feature)',
'Mean methylation level, (HVM_3000)', 'Relative methylation level, (HVM_3000)',
'Mean methylation level, (HVM_6000)', 'Relative methylation level, (HVM_6000)']
ylabels = ['', '','', '','', '']
letters = ['a', 'b','c','d','e','f']
with sns.plotting_context('talk'):
plt.figure(figsize=(18, 18))
for i, (tsne_coords, title, letter, ylabel) in enumerate(zip(tsnes, titles, letters, ylabels)):
ax = plt.subplot(320 + i +1)
plt.text(0.05, 1.015, letter, transform=ax.transAxes, fontsize=title_fontsize, fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
ax.set_ylabel(ylabel)
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne_coords[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
# 在群集的中心位置添加注释,注释内容为群集的ID
plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center')
# color = points.get_facecolor().flatten()
# this_cluster_mean = np.mean(tsne[cluster_idx,:],axis=0)
# plt.text(this_cluster_mean[0],this_cluster_mean[1],cluster_id,fontsize=10,fontweight='bold',color=color)
plt.tight_layout()
plt.tight_layout()
sns.despine(left=True, bottom=True)
# 设置风格和大小
title_fontsize = 12
title_fontweight = "bold"
clusters = promoters.obs['Neuron type'].astype(str)
tsnes = [promoters.obsm["X_tsne"], promoters.obsm["relative_tsne"],
promoters_filter.obsm["X_tsne"], promoters_filter.obsm["relative_tsne"],
promoters_filter_6000.obsm["X_tsne"], promoters_filter_6000.obsm["relative_tsne"]
]
titles = ['Mean methylation level, (All Feature)', 'Relative methylation level, (All Feature)',
'Mean methylation level, (HVM_3000)', 'Relative methylation level, (HVM_3000)',
'Mean methylation level, (HVM_6000)', 'Relative methylation level, (HVM_6000)']
ylabels = ['', '','', '','', '']
letters = ['a', 'b','c','d','e','f']
with sns.plotting_context('talk'):
plt.figure(figsize=(18, 18))
for i, (tsne_coords, title, letter, ylabel) in enumerate(zip(tsnes, titles, letters, ylabels)):
ax = plt.subplot(320 + i +1)
plt.text(0.05, 1.015, letter, transform=ax.transAxes, fontsize=title_fontsize, fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
ax.set_ylabel(ylabel)
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne_coords[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True)
if i == 0:
# 在群集的中心位置添加注释,注释内容为群集的ID
plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center')
# color = points.get_facecolor().flatten()
# this_cluster_mean = np.mean(tsne[cluster_idx,:],axis=0)
# plt.text(this_cluster_mean[0],this_cluster_mean[1],cluster_id,fontsize=10,fontweight='bold',color=color)
plt.tight_layout()
plt.tight_layout()
sns.despine(left=True, bottom=True)
UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\1118335867.py:43: The figure layout has changed to tight UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\1118335867.py:46: The figure layout has changed to tight
In [89]:
Copied!
sc.pp.neighbors(promoters, use_rep='mylayer_pca')
sc.tl.umap(promoters)
promoters.obsm["relative_umap"] = promoters.obsm['X_umap']
sc.pp.neighbors(promoters)
sc.tl.umap(promoters)
sc.pp.neighbors(promoters, use_rep='mylayer_pca')
sc.tl.umap(promoters)
promoters.obsm["relative_umap"] = promoters.obsm['X_umap']
sc.pp.neighbors(promoters)
sc.tl.umap(promoters)
In [77]:
Copied!
promoters
promoters
Out[77]:
AnnData object with n_obs × n_vars = 2364 × 29510 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'pct_cell', 'feature_select' uns: 'workdir', 'label_color', 'pca', 'tsne', 'neighbors', 'umap' obsm: 'X_pca', 'mylayer_pca', 'X_tsne', 'relative_tsne', 'X_umap', 'relative_umap' varm: 'PCs' layers: 'relative' obsp: 'distances', 'connectivities'
In [90]:
Copied!
#3000 feature
sc.pp.neighbors(promoters_filter, use_rep='mylayer_pca')
sc.tl.umap(promoters_filter)
promoters_filter.obsm["relative_umap"] = promoters_filter.obsm['X_umap']
sc.pp.neighbors(promoters_filter)
sc.tl.umap(promoters_filter)
#6000 feature
sc.pp.neighbors(promoters_filter_6000, use_rep='mylayer_pca')
sc.tl.umap(promoters_filter_6000)
promoters_filter_6000.obsm["relative_umap"] = promoters_filter_6000.obsm['X_umap']
sc.pp.neighbors(promoters_filter_6000)
sc.tl.umap(promoters_filter_6000)
#3000 feature
sc.pp.neighbors(promoters_filter, use_rep='mylayer_pca')
sc.tl.umap(promoters_filter)
promoters_filter.obsm["relative_umap"] = promoters_filter.obsm['X_umap']
sc.pp.neighbors(promoters_filter)
sc.tl.umap(promoters_filter)
#6000 feature
sc.pp.neighbors(promoters_filter_6000, use_rep='mylayer_pca')
sc.tl.umap(promoters_filter_6000)
promoters_filter_6000.obsm["relative_umap"] = promoters_filter_6000.obsm['X_umap']
sc.pp.neighbors(promoters_filter_6000)
sc.tl.umap(promoters_filter_6000)
In [95]:
Copied!
# 设置风格和大小
title_fontsize = 12
title_fontweight = "bold"
clusters = promoters.obs['Labeling'].astype(str)
tsnes = [promoters.obsm["X_umap"], promoters.obsm["relative_umap"],
promoters_filter.obsm["X_umap"], promoters_filter.obsm["relative_umap"],
promoters_filter_6000.obsm["X_umap"], promoters_filter_6000.obsm["relative_umap"]
]
titles = ['Mean methylation level, (All Feature)', 'Relative methylation level, (All Feature)',
'Mean methylation level, (HVM_3000)', 'Relative methylation level, (HVM_3000)',
'Mean methylation level, (HVM_6000)', 'Relative methylation level, (HVM_6000)']
ylabels = ['', '','', '','', '']
letters = ['a', 'b','c','d','e','f']
with sns.plotting_context('talk'):
plt.figure(figsize=(18, 18))
for i, (tsne_coords, title, letter, ylabel) in enumerate(zip(tsnes, titles, letters, ylabels)):
ax = plt.subplot(320 + i +1)
plt.text(0.05, 1.015, letter, transform=ax.transAxes, fontsize=title_fontsize, fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
ax.set_ylabel(ylabel)
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne_coords[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True, cmap='tab20')
if i == 0:
# 在群集的中心位置添加注释,注释内容为群集的ID
plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center')
# color = points.get_facecolor().flatten()
# this_cluster_mean = np.mean(tsne[cluster_idx,:],axis=0)
# plt.text(this_cluster_mean[0],this_cluster_mean[1],cluster_id,fontsize=10,fontweight='bold',color=color)
plt.tight_layout()
plt.tight_layout()
sns.despine(left=True, bottom=True)
# 设置风格和大小
title_fontsize = 12
title_fontweight = "bold"
clusters = promoters.obs['Labeling'].astype(str)
tsnes = [promoters.obsm["X_umap"], promoters.obsm["relative_umap"],
promoters_filter.obsm["X_umap"], promoters_filter.obsm["relative_umap"],
promoters_filter_6000.obsm["X_umap"], promoters_filter_6000.obsm["relative_umap"]
]
titles = ['Mean methylation level, (All Feature)', 'Relative methylation level, (All Feature)',
'Mean methylation level, (HVM_3000)', 'Relative methylation level, (HVM_3000)',
'Mean methylation level, (HVM_6000)', 'Relative methylation level, (HVM_6000)']
ylabels = ['', '','', '','', '']
letters = ['a', 'b','c','d','e','f']
with sns.plotting_context('talk'):
plt.figure(figsize=(18, 18))
for i, (tsne_coords, title, letter, ylabel) in enumerate(zip(tsnes, titles, letters, ylabels)):
ax = plt.subplot(320 + i +1)
plt.text(0.05, 1.015, letter, transform=ax.transAxes, fontsize=title_fontsize, fontweight=title_fontweight)
plt.title(title)
plt.xticks([])
plt.yticks([])
ax.set_ylabel(ylabel)
# 绘制每个群集
for cluster_id in np.unique(clusters):
cluster_idx = clusters == cluster_id
# 选取当前群集的所有点
cluster_points = tsne_coords[cluster_idx]
# 计算当前群集所有点的均值,这将作为注释的位置
centroid = np.mean(cluster_points, axis=0)
points=plt.scatter(tsne_coords[cluster_idx, 0], tsne_coords[cluster_idx, 1], s=6, edgecolor='none', rasterized=True, cmap='tab20')
if i == 0:
# 在群集的中心位置添加注释,注释内容为群集的ID
plt.text(centroid[0], centroid[1]-1, cluster_id, fontsize=8, ha='center', va='center')
# color = points.get_facecolor().flatten()
# this_cluster_mean = np.mean(tsne[cluster_idx,:],axis=0)
# plt.text(this_cluster_mean[0],this_cluster_mean[1],cluster_id,fontsize=10,fontweight='bold',color=color)
plt.tight_layout()
plt.tight_layout()
sns.despine(left=True, bottom=True)
UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\3479663459.py:34: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\3479663459.py:42: The figure layout has changed to tight UserWarning:C:\Users\zong\AppData\Local\Temp\ipykernel_4492\3479663459.py:45: The figure layout has changed to tight
Feature Selectiom¶
In [130]:
Copied!
#gene selcect
with sns.plotting_context("talk"):
fig,axes = plt.subplots(2,2,figsize=(18,18))
for ax in axes:
ax.scatter(promoters.var["var"],promoters.var["sr_var"],s=8, rasterized=True)
sns.despine()
plt.tight_layout()
#gene selcect
with sns.plotting_context("talk"):
fig,axes = plt.subplots(2,2,figsize=(18,18))
for ax in axes:
ax.scatter(promoters.var["var"],promoters.var["sr_var"],s=8, rasterized=True)
sns.despine()
plt.tight_layout()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[130], line 6 4 fig,axes = plt.subplots(2,2,figsize=(18,18)) 5 for ax in axes: ----> 6 ax.scatter(promoters.var["var"],promoters.var["sr_var"],s=8, rasterized=True) 7 sns.despine() 8 plt.tight_layout() AttributeError: 'numpy.ndarray' object has no attribute 'scatter'
In [15]:
Copied!
promoters = winows100k_scm
promoters.var
promoters = winows100k_scm
promoters.var
Out[15]:
chromosome | start | end | covered_cell | var | sr_var | pct_cell | |
---|---|---|---|---|---|---|---|
index | |||||||
chr1:1-100000 | chr1 | 1 | 100000 | 2353 | 0.035220 | 0.029984 | 0.995347 |
chr1:100001-200000 | chr1 | 100001 | 200000 | 2354 | 0.009842 | 0.016549 | 0.995770 |
chr1:200001-300000 | chr1 | 200001 | 300000 | 2351 | 0.030100 | 0.035758 | 0.994501 |
chr1:300001-400000 | chr1 | 300001 | 400000 | 2323 | 0.026505 | 0.055943 | 0.982657 |
chr1:400001-500000 | chr1 | 400001 | 500000 | 2346 | 0.014380 | 0.038490 | 0.992386 |
... | ... | ... | ... | ... | ... | ... | ... |
chrY:56800001-56900000 | chrY | 56800001 | 56900000 | 2361 | 0.002751 | 0.003180 | 0.998731 |
chrY:56900001-57000000 | chrY | 56900001 | 57000000 | 2122 | 0.098136 | 0.067797 | 0.897631 |
chrY:57000001-57100000 | chrY | 57000001 | 57100000 | 2252 | 0.066954 | 0.067381 | 0.952623 |
chrY:57100001-57200000 | chrY | 57100001 | 57200000 | 2311 | 0.046440 | 0.054673 | 0.977580 |
chrY:57200001-57227415 | chrY | 57200001 | 57227415 | 2125 | 0.040066 | 0.080984 | 0.898900 |
29510 rows × 7 columns
In [21]:
Copied!
winows100k_scm
winows100k_scm
Out[21]:
AnnData object with n_obs × n_vars = 2364 × 29510 obs: 'cell_id', 'cell_name', 'sites', 'meth', 'n_total', 'global_meth_level', 'Sample_id', 'Sample', 'Animal age', 'FACS date', 'Brain area', 'Laminar layer', 'Labeling', 'FACS channel', 'FACS count', 'Bisulfite conversion method', 'Library type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Sequencing run mode', 'Total reads', 'Mapped reads', '% Mapped reads', 'Filtered reads', '% Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Coverage (%)', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate', 'n_features', 'pct_peaks' var: 'chromosome', 'start', 'end', 'covered_cell', 'var', 'sr_var', 'pct_cell', 'mean' uns: 'workdir', 'label_color' layers: 'relative'
In [14]:
Copied!
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
alpha=0.05
def make_panel(x,
y,
ax,
letter='',
title='',
color_kde=True,
xlim=None,
ylim=None,
xlabel='mean expression',
ylabel='',
xscale='log',
yscale='log',):
ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
if color_kde:
xtrans = np.log10(x) if xscale == 'log' else x
ytrans = np.log10(y) if yscale == 'log' else y
data = np.vstack((xtrans,ytrans))
kde = gaussian_kde(data)
logdensity = kde.logpdf(data)
points = ax.scatter(x, y, s=dotsize, c=logdensity, rasterized=True)
else:
ax.scatter(x, y, s=dotsize, rasterized=True)
ax.set_xscale(xscale)
ax.set_yscale(yscale)
ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)
if xlim:
ax.set_xlim(xlim)
if ylim:
ax.set_ylim(ylim)
ax.set_title(title)
def add_line(x,y,ax):
ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
def fix_ticks(ax):
ax.set_yticks(ax.get_yticks()[1:-1])
dotsize=3
#for broken lines
linewidth_broken = 2
color_broken = 'tab:red'
#axis limits
xlim_satija = [1e-4,1e2]
ylim_satija_theta = [10**-7.5,10**1.5]
ylim_satija_beta0 = [-60,20]
title_fontsize = 22
title_fontweight = "bold"
title_position = (-0.2,1)
title_location = 'center'
title_y_padding = -10
markerfirst = True
alpha=0.05
def make_panel(x,
y,
ax,
letter='',
title='',
color_kde=True,
xlim=None,
ylim=None,
xlabel='mean expression',
ylabel='',
xscale='log',
yscale='log',):
ax.text(*title_position,letter,transform=ax.transAxes,fontsize=title_fontsize,fontweight=title_fontweight)
if color_kde:
xtrans = np.log10(x) if xscale == 'log' else x
ytrans = np.log10(y) if yscale == 'log' else y
data = np.vstack((xtrans,ytrans))
kde = gaussian_kde(data)
logdensity = kde.logpdf(data)
points = ax.scatter(x, y, s=dotsize, c=logdensity, rasterized=True)
else:
ax.scatter(x, y, s=dotsize, rasterized=True)
ax.set_xscale(xscale)
ax.set_yscale(yscale)
ax.set_ylabel(ylabel)
ax.set_xlabel(xlabel)
if xlim:
ax.set_xlim(xlim)
if ylim:
ax.set_ylim(ylim)
ax.set_title(title)
def add_line(x,y,ax):
ax.plot(x,y, '--',c=color_broken, linewidth=linewidth_broken)
def fix_ticks(ax):
ax.set_yticks(ax.get_yticks()[1:-1])
In [ ]:
Copied!
#relarionship between mean and var/sr_var, sr_var are the variation of shrunken risiduals
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,9)) #constrained_layout=True)
gs = GridSpec(1,2 , figure=fig)
ax1 = fig.add_subplot(gs[0, 0]) #var
ax2 = fig.add_subplot(gs[0, 1]) #sr_var
make_panel(x=winows100k_scm.var['mean'],
y=winows100k_scm.var['var'],
ax=ax1,
letter='a',
color_kde=False,
ylabel=r'Var ($\hat\beta_{0g}$)',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
fix_ticks(ax1)
make_panel(x=winows100k_scm.var['mean'],
y=winows100k_scm.var['sr_var'],
ax=ax2,
letter='b',
ylabel=r'sr_Var ($\hat\beta_{1g}$)',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)
# ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)
sns.despine()
plt.tight_layout()
#plt.savefig('figures/Fig1.pdf', dpi=300, format=None)
plt.show()
#relarionship between mean and var/sr_var, sr_var are the variation of shrunken risiduals
with sns.plotting_context("talk"):
fig = plt.figure(figsize=(18,9)) #constrained_layout=True)
gs = GridSpec(1,2 , figure=fig)
ax1 = fig.add_subplot(gs[0, 0]) #var
ax2 = fig.add_subplot(gs[0, 1]) #sr_var
make_panel(x=winows100k_scm.var['mean'],
y=winows100k_scm.var['var'],
ax=ax1,
letter='a',
color_kde=False,
ylabel=r'Var ($\hat\beta_{0g}$)',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=mean_range_orig,y=beta0_prediction,ax=ax1)
fix_ticks(ax1)
make_panel(x=winows100k_scm.var['mean'],
y=winows100k_scm.var['sr_var'],
ax=ax2,
letter='b',
ylabel=r'sr_Var ($\hat\beta_{1g}$)',
yscale='linear')
# beta0_prediction = np.log(mean_range_orig / np.mean(ns_orig))
# add_line(x=winows100k_scm.var['mean'],y=winows100k_scm.var['sr_var'],ax=ax1)
# ax2.hlines(np.log(10),mean_min_orig,mean_max_orig,linestyles='--', colors=color_broken, linewidths=linewidth_broken)
sns.despine()
plt.tight_layout()
#plt.savefig('figures/Fig1.pdf', dpi=300, format=None)
plt.show()
In [ ]:
Copied!