Quality control
In [5]:
import pandas as pd
import numpy as np
import os
import glob
import multiprocessing as mp
from multiprocessing import Pool
from pathlib import Path
from anndata import ImplicitModificationWarning
from scipy.sparse import coo_matrix
from scipy import sparse
import datetime
from typing_extensions import Literal
from typing import Union
import anndata as ad
import subprocess
import pathlib
import collections
import click
import time
import warnings
In [6]:
adata = ad.read(
'D://Test/GSE56789/temp/result_anndata.h5'
)
In [7]:
adata
Out[7]:
AnnData object with n_obs × n_vars = 52 × 4912
    obs: 'methylated_CG', 'mc_class_CG', 'total_CG', 'overall_mc_level_CG', 'sample', 'sample_id'
    var: 'feature_name'
In [8]:
adata.X
Out[8]:
<52x4912 sparse matrix of type '<class 'numpy.float64'>' with 168981 stored elements in Compressed Sparse Row format>
In [9]:
# adding QC values
if sparse.issparse(adata.X):
    length1 = adata.X.shape[1]  # number of features (columns)
    length2 = adata.X.shape[0]  # number of samples (rows)
    # number of features covered in each sample
    coverage_cells = [length1 - np.isnan(line.toarray()).sum() for line in adata.X]
    # mean methylation of each sample
    mean_cell_methylation = [np.nansum(line.toarray()) / length1 for line in adata.X]
    # store the results in adata.obs
    adata.obs['coverage_cells'] = coverage_cells
    adata.obs['mean_cell_methylation'] = mean_cell_methylation
    # number of samples in which each feature is covered
    coverage_feature = [length2 - np.isnan(line.toarray()).sum() for line in adata.X.T]
    # mean methylation of each feature
    mean_feature_methylation = [np.nansum(line.toarray()) / length2 for line in adata.X.T]
    # store the results in adata.var
    adata.var['coverage_feature'] = coverage_feature
    adata.var['mean_feature_methylation'] = mean_feature_methylation
else:
    # dense matrix: operate on the rows and columns directly
    length1 = adata.X.shape[1]  # features (columns), e.g. 55017
    length2 = adata.X.shape[0]  # samples (rows), e.g. 3379
    adata.obs['coverage_cells'] = [length1 - np.isnan(line).sum() for line in adata.X]  # features covered per sample
    adata.obs['mean_cell_methylation'] = [np.nansum(line) / length1 for line in adata.X]
    adata.var['coverage_feature'] = [length2 - np.isnan(line).sum() for line in adata.X.T]  # samples covering each feature
    adata.var['mean_feature_methylation'] = [np.nansum(line) / length2 for line in adata.X.T]
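The row-by-row `toarray()` loop above densifies every row, which gets slow on large matrices. A minimal vectorized sketch of the same per-cell metrics, assuming (as above) that missing values are stored as explicit NaNs in a CSR matrix; `qc_metrics_sparse` is a hypothetical helper, not part of any library:

import numpy as np
from scipy import sparse

def qc_metrics_sparse(X):
    # X: CSR matrix in which missing values are stored as explicit NaNs
    n_obs, n_var = X.shape
    Xc = X.copy()
    nan_mask = np.isnan(Xc.data)
    # count NaNs per row via an indicator matrix with the same sparsity pattern
    indicator = sparse.csr_matrix((nan_mask.astype(np.int64), Xc.indices, Xc.indptr), shape=Xc.shape)
    nan_per_row = np.asarray(indicator.sum(axis=1)).ravel()
    # zero out the NaNs so the row sums behave like np.nansum
    Xc.data[nan_mask] = 0.0
    coverage_cells = n_var - nan_per_row  # features covered per cell
    mean_cell_methylation = np.asarray(Xc.sum(axis=1)).ravel() / n_var
    return coverage_cells, mean_cell_methylation

The per-feature metrics follow the same pattern on `X.T.tocsr()`.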
In [13]:
adata.obs
Out[13]:
 | methylated_CG | mc_class_CG | total_CG | overall_mc_level_CG | sample | sample_id | coverage_cells | mean_cell_methylation
---|---|---|---|---|---|---|---|---
8 | 2773841.0 | 6389935.0 | 10058585.0 | 0.275769 | GSM1370523 | 0 | 4912 | 0.255770 |
9 | 4207956.0 | 9056780.0 | 14749706.0 | 0.285291 | GSM1370524 | 1 | 4912 | 0.267832 |
2 | 1062414.0 | 3024372.0 | 3687362.0 | 0.288123 | GSM1370525 | 2 | 4912 | 0.270414 |
7 | 2174791.0 | 5617340.0 | 7468542.0 | 0.291194 | GSM1370526 | 3 | 4912 | 0.257664 |
6 | 2250723.0 | 5478455.0 | 7718445.0 | 0.291603 | GSM1370527 | 4 | 4912 | 0.256831 |
0 | 676706.0 | 1997630.0 | 2305401.0 | 0.293531 | GSM1370528 | 5 | 4912 | 0.246128 |
3 | 1235199.0 | 3266783.0 | 4241536.0 | 0.291215 | GSM1370529 | 6 | 4912 | 0.247983 |
4 | 1602124.0 | 3903117.0 | 5507096.0 | 0.290920 | GSM1370530 | 7 | 4912 | 0.261821 |
1 | 1229421.0 | 3204150.0 | 4160911.0 | 0.295469 | GSM1370531 | 8 | 4912 | 0.254784 |
5 | 1149598.0 | 2822474.0 | 3733805.0 | 0.307889 | GSM1370532 | 9 | 4912 | 0.257592 |
10 | 1586218.0 | 3869842.0 | 5403788.0 | 0.293538 | GSM1370533 | 10 | 4912 | 0.253991 |
11 | 1948937.0 | 3800961.0 | 5242714.0 | 0.371742 | GSM1370535 | 11 | 4912 | 0.393481 |
12 | 1271969.0 | 2797560.0 | 3506515.0 | 0.362744 | GSM1370536 | 12 | 4912 | 0.346206 |
13 | 2105457.0 | 3372625.0 | 4537703.0 | 0.463992 | GSM1370537 | 13 | 4912 | 0.454491 |
14 | 1521358.0 | 4242155.0 | 5799310.0 | 0.262334 | GSM1370538 | 14 | 4912 | 0.236155 |
15 | 1974452.0 | 3917329.0 | 5546459.0 | 0.355984 | GSM1370539 | 15 | 4912 | 0.337848 |
16 | 372428.0 | 3297855.0 | 4538546.0 | 0.082059 | GSM1370540 | 16 | 4912 | 0.066197 |
17 | 2068775.0 | 4283144.0 | 5851328.0 | 0.353556 | GSM1370541 | 17 | 4912 | 0.276954 |
18 | 730843.0 | 3950391.0 | 5450398.0 | 0.134090 | GSM1370542 | 18 | 4912 | 0.127163 |
19 | 1318842.0 | 4442023.0 | 5843535.0 | 0.225692 | GSM1370543 | 19 | 4912 | 0.214378 |
20 | 1109405.0 | 3710145.0 | 5011395.0 | 0.221376 | GSM1370544 | 20 | 4912 | 0.201409 |
21 | 721897.0 | 3110061.0 | 4463927.0 | 0.161718 | GSM1370545 | 21 | 4912 | 0.148684 |
22 | 1828675.0 | 3532788.0 | 4807341.0 | 0.380392 | GSM1370546 | 22 | 4912 | 0.352224 |
23 | 1472.0 | 2953.0 | 3040.0 | 0.484211 | GSM1370548 | 23 | 4912 | 0.018360 |
24 | 10225.0 | 18155.0 | 18548.0 | 0.551272 | GSM1370549 | 24 | 4912 | 0.118256 |
25 | 2859.0 | 5231.0 | 5319.0 | 0.537507 | GSM1370550 | 25 | 4912 | 0.037595 |
26 | 9729.0 | 17741.0 | 17977.0 | 0.541192 | GSM1370551 | 26 | 4912 | 0.114153 |
27 | 3672.0 | 6739.0 | 6882.0 | 0.533566 | GSM1370552 | 27 | 4912 | 0.053580 |
28 | 8398.0 | 29932.0 | 36007.0 | 0.233232 | GSM1370553 | 28 | 4912 | 0.086694 |
29 | 1598.0 | 4820.0 | 5447.0 | 0.293372 | GSM1370554 | 29 | 4912 | 0.021323 |
30 | 5078262.0 | 4424268.0 | 7002192.0 | 0.725239 | GSM1370555 | 30 | 4912 | 0.633859 |
31 | 2883177.0 | 3262923.0 | 4525444.0 | 0.637104 | GSM1370556 | 31 | 4912 | 0.569044 |
32 | 3010405.0 | 4728963.0 | 7751272.0 | 0.388376 | GSM1370557 | 32 | 4912 | 0.338542 |
33 | 4478594.0 | 4965673.0 | 7571138.0 | 0.591535 | GSM1370558 | 33 | 4912 | 0.570422 |
34 | 3200120.0 | 3848428.0 | 5622442.0 | 0.569169 | GSM1370559 | 34 | 4912 | 0.498544 |
36 | 1814649.0 | 5795602.0 | 8946794.0 | 0.202827 | GSM1370560 | 35 | 4912 | 0.187398 |
35 | 3755105.0 | 3468770.0 | 5304515.0 | 0.707907 | GSM1370561 | 36 | 4912 | 0.629427 |
37 | 3243439.0 | 3557629.0 | 4771783.0 | 0.679712 | GSM1370562 | 37 | 4912 | 0.611443 |
38 | 3826163.0 | 5262688.0 | 7106219.0 | 0.538425 | GSM1370563 | 38 | 4912 | 0.508737 |
39 | 4527218.0 | 4938305.0 | 7210573.0 | 0.627858 | GSM1370564 | 39 | 4912 | 0.596066 |
40 | 3487010.0 | 3698022.0 | 5098903.0 | 0.683875 | GSM1370565 | 40 | 4912 | 0.615996 |
41 | 1670947.0 | 2077704.0 | 2662177.0 | 0.627662 | GSM1370566 | 41 | 4912 | 0.585674 |
42 | 2014271.0 | 3017804.0 | 3828676.0 | 0.526101 | GSM1370567 | 42 | 4912 | 0.468503 |
44 | 6249395.0 | 7502459.0 | 10319408.0 | 0.605596 | GSM1370568 | 43 | 4912 | 0.539304 |
43 | 2641859.0 | 3607443.0 | 4826163.0 | 0.547404 | GSM1370569 | 44 | 4912 | 0.477331 |
45 | 3859515.0 | 5568802.0 | 7575752.0 | 0.509456 | GSM1370570 | 45 | 4912 | 0.469847 |
46 | 4079333.0 | 4145869.0 | 6017116.0 | 0.677955 | GSM1370571 | 46 | 4912 | 0.616277 |
47 | 3410149.0 | 3967729.0 | 5362681.0 | 0.635904 | GSM1370572 | 47 | 4912 | 0.577196 |
48 | 3488749.0 | 4433266.0 | 6109535.0 | 0.571033 | GSM1370573 | 48 | 4912 | 0.552650 |
49 | 3588924.0 | 4523416.0 | 6085020.0 | 0.589797 | GSM1370574 | 49 | 4912 | 0.513039 |
51 | 37221650.0 | 22641666.0 | 53520217.0 | 0.695469 | GSM1370575 | 50 | 4912 | 0.581207 |
50 | 5484605.0 | 9727286.0 | 19593619.0 | 0.279918 | GSM1413827 | 51 | 4912 | 0.257857 |
In [14]:
import scvelo as scv
data = adata.X.toarray()
data
Out[14]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
In [24]:
adata.var
Out[24]:
 | feature_name | coverage_feature | mean_feature_methylation
---|---|---|---
0 | chr1_0_100000 | 52 | 0.0 |
1 | chr1_100000_200000 | 52 | 0.0 |
2 | chr1_200000_300000 | 52 | 0.0 |
3 | chr1_300000_400000 | 52 | 0.0 |
4 | chr1_400000_500000 | 52 | 0.0 |
... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 |
4908 | chr2_241800000_241900000 | 52 | 0.0 |
4909 | chr2_241900000_242000000 | 52 | 0.0 |
4910 | chr2_242000000_242100000 | 52 | 0.0 |
4911 | chr2_242100000_242193529 | 52 | 0.0 |
4912 rows × 3 columns
In [20]:
adata.X.T
Out[20]:
<4912x52 sparse matrix of type '<class 'numpy.float64'>' with 168981 stored elements in Compressed Sparse Column format>
In [10]:
import matplotlib.pyplot as plt
In [36]:
def plot_hist(adata, term, title, y_label, x_label):
    # create a new figure
    fig, ax = plt.subplots()
    # histogram of the chosen per-cell metric
    # plt.axvline(x=10000, color='r')  # minimum number of features covered per cell
    plt.hist(adata.obs[term], alpha=0.4)
    # adding horizontal grid lines
    ax.yaxis.grid(True)
    # remove axis spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)
    # labels
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    return fig
In [2]:
!pip install --upgrade scbs
Collecting scbs
  Downloading scbs-0.6.2-py3-none-any.whl (61 kB)
Collecting click<8.1,>=7.1.2 (from scbs)
  Downloading click-8.0.4-py3-none-any.whl (97 kB)
Collecting click-help-colors<1,>=0.9 (from scbs)
  Downloading click_help_colors-0.9.2-py3-none-any.whl (5.5 kB)
Collecting pandas<2,>=1.2.3 (from scbs)
  Downloading pandas-1.5.3-cp39-cp39-win_amd64.whl (10.9 MB)
Requirement already satisfied: colorama<1,>=0.3.9 in d:\conda\envs\py39\lib\site-packages (from scbs) (0.4.6)
Requirement already satisfied: numba<1,>=0.53.0 in d:\conda\envs\py39\lib\site-packages (from scbs) (0.57.1)
Requirement already satisfied: numpy<2,>=1.20.1 in d:\conda\envs\py39\lib\site-packages (from scbs) (1.24.4)
Requirement already satisfied: scipy<2,>=1.6.1 in d:\conda\envs\py39\lib\site-packages (from scbs) (1.11.2)
Requirement already satisfied: statsmodels<1,>=0.12.2 in d:\conda\envs\py39\lib\site-packages (from scbs) (0.14.0)
Installing collected packages: click, pandas, click-help-colors, scbs
  Attempting uninstall: click
    Found existing installation: click 8.1.7
    Successfully uninstalled click-8.1.7
  Attempting uninstall: pandas
    Found existing installation: pandas 2.0.3
    Successfully uninstalled pandas-2.0.3
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied.: 'D:\\conda\\envs\\py39\\Lib\\site-packages\\~andas\\_libs\\algos.cp39-win_amd64.pyd' Consider using the `--user` option or check the permissions.
In [12]:
# pull the global methylation level and CpG count columns from obs
y_values = adata.obs['total_CG']
x_values = adata.obs['overall_mc_level_CG']
# scatter plot; alpha sets the point transparency, adjust as needed
plt.scatter(x_values, y_values, alpha=0.5)
# labels
plt.xlabel('Global DNA methylation %')
plt.ylabel('Observed CpG Sites')
# show the figure
plt.show()
In [67]:
def plot_hist_cutoff(adata,name='',bins=100,xlim_quantile=(0.001, 0.999)):
fig, ax1 = plt.subplots()
# Create histogram
data = adata.obs['total_CG']
ax1 = plt.hist(data, bins=bins, color='blue', alpha=0.7)
ax1.set_xlabel(name)
ax1.set_ylabel('Count', color='blue')
# # Calculate the percentage of data that meets the cutoff value
# total_data = data.size
# print(data[total_data])
# x = np.linspace(data[0], data[total_data], bins)
# passed_data = np.array([(data > i).sum() for i in x])
# percentage_passed = (passed_data / total_data) * 100
# print (percentage_passed)
xlim = tuple(np.quantile(data, xlim_quantile))
x = np.linspace(xlim[0], xlim[1], 500)
count_list = np.array([(data > i).sum() for i in x])
original_total_data = data.size
count_list = count_list / original_total_data * 100
data = data[(data < xlim[1]) & (data > xlim[0])]
# Plot a dashed line for the percentage
ax2 = ax1.twinx()
ax2.axhline(y=percentage_passed, color='red', linestyle='--')
ax2.set_ylabel('Percentage', color='red')
ax2.set_ylim(0, 100)
plt.title(f'Data Distribution with Cutoff at {cutoff_value}')
plt.legend(loc='upper right')
plt.show()
In [68]:
plot_hist_cutoff(adata)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[68], line 1
----> 1 plot_hist_cutoff(adata)

Cell In[67], line 6, in plot_hist_cutoff(adata, name, bins, xlim_quantile)
      4 data = adata.obs['total_CG']
      5 ax1 = plt.hist(data, bins=bins, color='blue', alpha=0.7)
----> 6 ax1.set_xlabel(name)

AttributeError: 'tuple' object has no attribute 'set_xlabel'
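`plt.hist` returns a `(counts, bin_edges, patches)` tuple rather than an `Axes`, which is what the traceback is pointing at. A minimal sketch of the fix, keeping the `Axes` created by `plt.subplots` instead of overwriting it:

fig, ax1 = plt.subplots()
counts, bin_edges, patches = ax1.hist(adata.obs['total_CG'], bins=100, color='blue', alpha=0.7)
ax1.set_xlabel('total_CG')
ax1.set_ylabel('Count', color='blue')

The reworked version of `plot_hist_cutoff` further below sidesteps this by plotting through the `plt` state machine instead.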
In [24]:
#plot_hist(adata,'mean_cell_methylation','global mean methylation level','count','methylation level')
plot_hist(adata,'mc_class_CG','CG Count','count','count')
In [39]:
def plot_cell(adata):
    # create a 2x2 grid layout
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    # call plot_hist four times to get the individual figures
    fig1 = plot_hist(adata, 'mean_cell_methylation', 'global mean methylation level', 'count', 'methylation level')
    fig2 = plot_hist(adata, 'mc_class_CG', 'CG Count', 'count', 'count')
    fig3 = plot_hist(adata, 'mc_class_CG', 'CG Count', 'count', 'count')
    fig4 = plot_hist(adata, 'mc_class_CG', 'CG Count', 'count', 'count')
    # place the four figures into the subplots
    axes[0, 0].imshow(fig1)
    axes[0, 1].imshow(fig2)
    axes[1, 0].imshow(fig3)
    axes[1, 1].imshow(fig4)
    # adjust the spacing between subplots
    plt.tight_layout()
    # show the figure
    plt.show()
In [40]:
plot_cell(adata)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[40], line 1
----> 1 plot_cell(adata)

Cell In[39], line 12, in plot_cell(adata)
     11 # place the four figures into the subplots
---> 12 axes[0, 0].imshow(fig1)

File D:\conda\envs\py39\lib\site-packages\matplotlib\image.py:701, in _ImageBase.set_data(self, A)
--> 701     raise TypeError("Image data of dtype {} cannot be converted to "
    702                     "float".format(self._A.dtype))

TypeError: Image data of dtype object cannot be converted to float
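`imshow` expects image data, not a `Figure` object, so the returned figures cannot be placed into the grid this way. One workable pattern is to let the plotting helper draw onto a caller-supplied `Axes`; `plot_hist_on_ax` below is a hypothetical variant of `plot_hist`, sketched under that assumption:

def plot_hist_on_ax(ax, adata, term, title, y_label, x_label):
    # draw the histogram directly onto the given Axes
    ax.hist(adata.obs[term], alpha=0.4)
    ax.yaxis.grid(True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

fig, axes = plt.subplots(2, 2, figsize=(10, 8))
plot_hist_on_ax(axes[0, 0], adata, 'mean_cell_methylation', 'global mean methylation level', 'count', 'methylation level')
plot_hist_on_ax(axes[0, 1], adata, 'mc_class_CG', 'CG Count', 'count', 'count')
plot_hist_on_ax(axes[1, 0], adata, 'mc_class_CG', 'CG Count', 'count', 'count')
plot_hist_on_ax(axes[1, 1], adata, 'mc_class_CG', 'CG Count', 'count', 'count')
plt.tight_layout()
plt.show()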
In [69]:
import matplotlib.pyplot as plt
import numpy as np
# create some example data (random here; replace with your actual data)
data = np.random.randn(1000)
# draw the histogram
plt.hist(data, bins=30, alpha=0.5, color='b', density=True, label='data distribution')
# compute percentiles of the data
percentiles = [25, 50, 75]  # customize the percentiles as needed
percentile_values = np.percentile(data, percentiles)
# add dashed lines and percentile labels
for percentile, value in zip(percentiles, percentile_values):
    plt.axvline(value, color='r', linestyle='--', label=f'{percentile}th Percentile: {value:.2f}')
# add a percentage y-axis on the right
plt.twinx()
percentile_labels = [f'{percentile}%' for percentile in percentiles]
percentile_positions = np.percentile(data, percentiles)
plt.yticks(percentile_positions, percentile_labels)
plt.ylabel('percentage')
# legend
plt.legend(loc='upper left')
# show the figure
plt.show()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
In [93]:
def plot_hist_cutoff(adata, name='', bins=100, xlim_quantile=(0.001, 0.999)):
    # the QC metric to plot (replace with your actual data)
    data = adata.obs['total_CG']
    # draw the histogram
    n, bins, patches = plt.hist(data, bins=30, alpha=0.5, color='b', density=False, label='data distribution')
    # fraction of cells retained at each cutoff value
    cutoffs = np.sort(data)
    percentiles = 1 - (np.arange(1, len(cutoffs) + 1) / len(cutoffs))
    # add a percentage y-axis on the right
    ax2 = plt.twinx()
    plt.ylabel('cell passed %')
    # plot the retained-percentage curve
    ax2 = plt.plot(cutoffs, percentiles, 'r--', label='cell passed percent')
    # legend
    plt.legend(loc='upper right')
    # show the figure
    plt.show()
In [94]:
plot_hist_cutoff(adata)
In [98]:
def plot_stat(adata):
nrows=2
ncols=2
dpi=300
fig, axes = plt.subplots(figsize=(ncols * 3, nrows * 3),
nrows=nrows,
ncols=ncols,
dpi=300)
x = np.linspace(0, 2*np.pi, 100)
ax1 = axes[0, 0]
ax2 = axes[0, 1]
ax3 = axes[1, 1]
ax4 = axes[1, 0]
ax1.plot(plot_hist_cutoff(adata))
ax2.plot(plot_hist_cutoff(adata))
ax3.plot(plot_hist_cutoff(adata))
ax4.plot(plot_hist(adata,'mc_class_CG','CG Count','count','count'))
plt.show()
In [99]:
plot_stat(adata)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[99], line 1
----> 1 plot_stat(adata)

Cell In[98], line 14, in plot_stat(adata)
     13 ax4 = axes[1, 0]
---> 14 ax1.plot(plot_hist_cutoff(adata))

File D:\conda\envs\py39\lib\site-packages\matplotlib\axes\_base.py:465, in _process_plot_var_args._plot_args(self, tup, kwargs, return_kwargs, ambiguous_fmt_datakey)
--> 465     raise ValueError("x, y, and format string must not be None")

ValueError: x, y, and format string must not be None
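`plot_hist_cutoff` draws and shows its own figure and returns `None`, so `ax1.plot(plot_hist_cutoff(adata))` ends up passing `None` to `plot`. For this grid approach to work, the helpers would need to either return plottable data or draw directly onto the `Axes` they are given, as in the `plot_hist_on_ax` sketch above.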
In [103]:
cdata = ad.read("D://Test/scMeth/enhancer_c1_CG_paper.h5ad")
In [110]:
length1 = len(cdata.X[0, :])  # 55017 features (columns)
length2 = len(cdata.X[:, 0])  # 3379 samples (rows)
cdata.var['coverage_cells'] = [length2 - np.isnan(line).sum() for line in cdata.X.T]  # number of samples covering each feature
cdata.var['mean_feature_methylation'] = [np.nansum(line) / length2 for line in cdata.X.T]
In [121]:
# the per-feature coverage to plot (replace with your actual data)
data = cdata.var['coverage_cells']
ax = plt.subplot()
# draw the histogram
n, bins, patches = plt.hist(data, bins=30, alpha=0.5, color='b', density=False, label='data distribution')
# fraction of features retained at each cutoff value
cutoffs = np.sort(data)
percentiles = 1 - np.arange(1, len(cutoffs) + 1) / len(cutoffs)
# convert the fractions to percentages rounded to two decimals
percentiles_decimal = [round(100 * p, 2) for p in percentiles]
# add a percentage y-axis on the right
ax2 = plt.twinx()
# adding horizontal grid lines
ax2.yaxis.grid(True)
plt.ylabel('cell passed %')
# plot the retained-percentage curve
ax2 = plt.plot(cutoffs, percentiles_decimal, 'r--', label='cell passed percent')
# legend
plt.legend(loc='upper right')
# show the figure
plt.show()
In [111]:
cdata.var
Out[111]:
 | coverage_cells | mean_feature_methylation
---|---|---
0_enhancer_10000235_10002235 | 1030 | 0.111873 |
1_enhancer_10000473_10002473 | 602 | 0.074983 |
2_enhancer_100010706_100012706 | 664 | 0.085918 |
3_enhancer_100019316_100021316 | 816 | 0.126313 |
4_enhancer_100024481_100026481 | 651 | 0.112446 |
... | ... | ... |
55012_enhancer_99982269_99984269 | 403 | 0.045218 |
55013_enhancer_99982881_99984881 | 661 | 0.103071 |
55014_enhancer_99983956_99985956 | 988 | 0.235307 |
55015_enhancer_99990006_99992006 | 383 | 0.057098 |
55016_enhancer_99995790_99997790 | 629 | 0.085066 |
55017 rows × 2 columns
In [122]:
adata.var
Out[122]:
 | feature_name | coverage_feature | mean_feature_methylation
---|---|---|---
0 | chr1_0_100000 | 52 | 0.0 |
1 | chr1_100000_200000 | 52 | 0.0 |
2 | chr1_200000_300000 | 52 | 0.0 |
3 | chr1_300000_400000 | 52 | 0.0 |
4 | chr1_400000_500000 | 52 | 0.0 |
... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 |
4908 | chr2_241800000_241900000 | 52 | 0.0 |
4909 | chr2_241900000_242000000 | 52 | 0.0 |
4910 | chr2_242000000_242100000 | 52 | 0.0 |
4911 | chr2_242100000_242193529 | 52 | 0.0 |
4912 rows × 3 columns
In [123]:
# the per-feature mean methylation to plot (replace with your actual data)
data = adata.var['mean_feature_methylation']
ax = plt.subplot()
# draw the histogram
n, bins, patches = plt.hist(data, bins=30, alpha=0.5, color='b', density=False, label='data distribution')
# fraction of features retained at each cutoff value
cutoffs = np.sort(data)
percentiles = 1 - np.arange(1, len(cutoffs) + 1) / len(cutoffs)
# convert the fractions to percentages rounded to two decimals
percentiles_decimal = [round(100 * p, 2) for p in percentiles]
# add a percentage y-axis on the right
ax2 = plt.twinx()
# adding horizontal grid lines
ax2.yaxis.grid(True)
plt.ylabel('cell passed %')
# plot the retained-percentage curve
ax2 = plt.plot(cutoffs, percentiles_decimal, 'r--', label='cell passed percent')
# legend
plt.legend(loc='upper right')
# show the figure
plt.show()
In [124]:
cdata
Out[124]:
AnnData object with n_obs × n_vars = 3379 × 55017
    obs: 'cell_type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Total reads', 'Mapped reads', 'Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate\n', 'binary'
    var: 'coverage_cells', 'mean_feature_methylation'
    uns: 'imputation', 'omic'
In [ ]:
def _find_range_features(
    adata,
    feature_count,
    filter_lower_quantile,
    filter_upper_quantile,
    total_features,
) -> np.ndarray:
    # sort features by coverage count and drop features with zero coverage
    idx = np.argsort(feature_count)
    for i in range(idx.size):
        if feature_count[idx[i]] > 0:
            break
    idx = idx[i:]
    # trim the lower and upper quantiles, then keep the most-covered features
    n = idx.size
    n_lower = int(filter_lower_quantile * n)
    n_upper = int(filter_upper_quantile * n)
    idx = idx[n_lower:n - n_upper]
    return idx[::-1][:total_features]
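A usage sketch for the cell above, assuming the per-feature coverage computed earlier serves as `feature_count`; the quantiles and feature count here are illustrative, not values from the original analysis:

# trim the 5% least- and most-covered features, then keep at most 2000
keep_idx = _find_range_features(
    adata,
    feature_count=adata.var['coverage_feature'].to_numpy(),
    filter_lower_quantile=0.05,
    filter_upper_quantile=0.05,
    total_features=2000,
)
adata_filtered = adata[:, np.sort(keep_idx)].copy()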
In [126]:
adata.var.shape
Out[126]:
(4912, 3)
In [128]:
obs_dim, var_dim = adata.X.shape
In [129]:
# mean
mean = adata.X.mean(dim=obs_dim).load()
# var
# enforce R convention (unbiased estimator) for variance
mean_sq = (x * x).mean(dim=obs_dim).load()
var = (mean_sq - mean ** 2) * (x.sizes[obs_dim] / (x.sizes[obs_dim] - 1))
# now actually compute the dispersion
mean.where(mean > 1e-12, 1e-12) # set entries equal to zero to small value
# raw dispersion is the variance normalized by mean
dispersion = var / mean
return mean, dispersion
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[129], line 2
      1 # mean
----> 2 mean = adata.X.mean(dim=obs_dim).load()

TypeError: mean() got an unexpected keyword argument 'dim'
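`mean(dim=...)` and `.load()` belong to the xarray `DataArray` API; a scipy sparse matrix takes `axis=` instead and returns a `np.matrix`, which `.A1` flattens. A minimal sparse equivalent of the intended computation, using the same calls as the cells below:

mean = adata.X.mean(axis=0).A1              # per-feature mean
mean_sq = adata.X.power(2).mean(axis=0).A1  # per-feature mean of squares
n = adata.X.shape[0]
var = (mean_sq - mean**2) * (n / (n - 1))   # unbiased variance (R convention)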
In [130]:
adata.X.mean()
Out[130]:
0.34296776073338897
In [135]:
xdata.X
Out[135]:
array([[1, 4, 7],
       [2, 5, 8],
       [3, 6, 9]], dtype=int64)
In [139]:
adata.var
Out[139]:
 | feature_name | coverage_feature | mean_feature_methylation
---|---|---|---
0 | chr1_0_100000 | 52 | 0.0 |
1 | chr1_100000_200000 | 52 | 0.0 |
2 | chr1_200000_300000 | 52 | 0.0 |
3 | chr1_300000_400000 | 52 | 0.0 |
4 | chr1_400000_500000 | 52 | 0.0 |
... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 |
4908 | chr2_241800000_241900000 | 52 | 0.0 |
4909 | chr2_241900000_242000000 | 52 | 0.0 |
4910 | chr2_242000000_242100000 | 52 | 0.0 |
4911 | chr2_242100000_242193529 | 52 | 0.0 |
4912 rows × 3 columns
In [140]:
mean_sq = (adata.X.power(2).mean(axis=0)).A1
In [141]:
mean_sq
Out[141]:
array([0., 0., 0., ..., 0., 0., 0.])
In [142]:
mean = adata.X.mean(axis=0).A1
In [143]:
mean
Out[143]:
array([0., 0., 0., ..., 0., 0., 0.])
In [145]:
# number of samples and features
n_samples = adata.X.shape[0]
n_features = adata.X.shape[1]
In [146]:
# unbiased variance: (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
In [150]:
# store the results in adata.var
adata.var['mean_sq'] = mean_sq
adata.var['variance'] = var
In [151]:
adata.var.variance
Out[151]:
0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ...
4907    0.0
4908    0.0
4909    0.0
4910    0.0
4911    0.0
Name: variance, Length: 4912, dtype: float64
In [188]:
if sparse.issparse(adata.X):
    # mean of squares, (x * x).mean()
    mean_sq = (adata.X.power(2).mean(axis=0)).A1
    # mean
    mean = adata.X.mean(axis=0).A1
    # number of samples and features
    n_samples = adata.X.shape[0]
    n_features = adata.X.shape[1]
    # unbiased variance: (mean_sq - mean**2) * (n_samples / (n_samples - 1))
    var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
    # clip means below the threshold to 1e-12 to avoid division by zero
    threshold = 1e-12
    mean[mean < threshold] = threshold
    # dispersion is the variance normalized by the mean
    dispersion = var / mean
    # store the results in adata.var
    adata.var['dispersion'] = dispersion
    adata.var['dispersion_log'] = np.log(dispersion)
    # normalize the dispersion by the mean methylation and mean coverage
    mean_mean = adata.var['mean_feature_methylation'].mean()
    mean_coverage = adata.var['coverage_feature'].mean()
    adata.var['NormalizedDispersion'] = adata.var['dispersion'] / (mean_mean * mean_coverage)
C:\Users\zong\AppData\Local\Temp\ipykernel_7604\4287774659.py:23: RuntimeWarning: divide by zero encountered in log adata.var['dispersion_log'] = np.log(dispersion)
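The RuntimeWarning comes from taking `np.log` of features whose dispersion is exactly zero, which yields `-inf` and later breaks `hist2d` (see below). A small guard, assuming it is acceptable to mark zero-dispersion features as missing rather than `-inf`:

# replace zero dispersions with NaN before the log so downstream plots can drop them
dispersion_safe = np.where(dispersion > 0, dispersion, np.nan)
adata.var['dispersion_log'] = np.log(dispersion_safe)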
In [191]:
adata.var['NormalizedDispersion']
Out[191]:
0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ...
4907    0.0
4908    0.0
4909    0.0
4910    0.0
4911    0.0
Name: NormalizedDispersion, Length: 4912, dtype: float64
In [158]:
type(adata.var['dispersion'])
Out[158]:
pandas.core.series.Series
In [189]:
adata.var
Out[189]:
 | feature_name | coverage_feature | mean_feature_methylation | mean_sq | var | variance | dispersion | dispersion_log | NormalizedCoverage | ResidualDispersion | NormalizedDispersion
---|---|---|---|---|---|---|---|---|---|---|---
0 | chr1_0_100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
1 | chr1_100000_200000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
2 | chr1_200000_300000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
3 | chr1_300000_400000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4 | chr1_400000_500000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4908 | chr2_241800000_241900000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4909 | chr2_241900000_242000000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4910 | chr2_242000000_242100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4911 | chr2_242100000_242193529 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4912 rows × 11 columns
In [164]:
# the per-feature coverage to plot (replace with your actual data)
data = adata.var['coverage_feature']
ax = plt.subplot()
# draw the histogram
n, bins, patches = plt.hist(data, bins=30, alpha=0.5, color='b', density=False, label='data distribution')
# fraction of features retained at each cutoff value
cutoffs = np.sort(data)
percentiles = 1 - np.arange(1, len(cutoffs) + 1) / len(cutoffs)
# convert the fractions to percentages rounded to two decimals
percentiles_decimal = [round(100 * p, 2) for p in percentiles]
# add a percentage y-axis on the right
ax2 = plt.twinx()
# adding horizontal grid lines
ax2.yaxis.grid(True)
plt.ylabel('cell passed %')
# plot the retained-percentage curve
ax2 = plt.plot(cutoffs, percentiles_decimal, 'r--', label='cell passed percent')
# legend
plt.legend(loc='upper right')
# show the figure
plt.show()
In [166]:
# per-feature mean methylation and dispersion from var
x_values = adata.var['mean_feature_methylation']
y_values = adata.var['dispersion']
# scatter plot; alpha sets the point transparency, adjust as needed
plt.scatter(x_values, y_values, alpha=0.5)
# labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion')
# show the figure
plt.show()
In [168]:
# per-feature mean methylation and log dispersion from var
x_values = adata.var['mean_feature_methylation']
y_values = adata.var['dispersion_log']
# scatter plot; alpha sets the point transparency, adjust as needed
plt.scatter(x_values, y_values, alpha=0.5)
# labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_log')
# show the figure
plt.show()
In [190]:
# per-feature mean methylation and normalized dispersion from var
x_values = adata.var['mean_feature_methylation']
y_values = adata.var['NormalizedDispersion']
# scatter plot; alpha sets the point transparency, adjust as needed
plt.scatter(x_values, y_values, alpha=0.5)
# labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('NormalizedDispersion')
# show the figure
plt.show()
In [169]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
# generate random example data
np.random.seed(0)
mean_values = np.random.rand(100)
coverage_values = np.random.randint(1, 10, 100)
true_dispersion = 0.5 * mean_values + 0.2 * coverage_values + np.random.normal(0, 0.1, 100)
# build the design matrix X with mean and coverage as covariates
X = np.column_stack((mean_values, coverage_values))
X = sm.add_constant(X)  # add an intercept term
# fit a linear regression of dispersion on mean and coverage
model = sm.OLS(true_dispersion, X)
results = model.fit()
# residuals of the fit
residuals = results.resid
# corrected dispersion is the original minus the residuals (i.e., the fitted values)
corrected_dispersion = true_dispersion - residuals
# plot original vs corrected dispersion
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.scatter(mean_values, true_dispersion, label='original dispersion')
plt.xlabel('mean')
plt.ylabel('dispersion')
plt.legend()
plt.subplot(1, 2, 2)
plt.scatter(mean_values, corrected_dispersion, label='corrected dispersion')
plt.xlabel('mean')
plt.ylabel('dispersion')
plt.legend()
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[169], line 2
      1 import numpy as np
----> 2 import statsmodels.api as sm

File D:\conda\envs\py39\lib\site-packages\scipy\signal\signaltools.py:12
---> 12 from scipy._lib._util import prod as _prod

ImportError: cannot import name 'prod' from 'scipy._lib._util' (D:\conda\envs\py39\lib\site-packages\scipy\_lib\_util.py)
In [173]:
# per-feature mean methylation and log dispersion from var
x_values = adata.var['mean_feature_methylation']
y_values = adata.var['dispersion_log']
# scatter plot
plt.figure(figsize=(10, 5))  # figure size
plt.subplot(1, 2, 1)  # first subplot
plt.scatter(x_values, y_values, alpha=0.5)  # scatter points
# add a linear regression line
polyfit = np.polyfit(x_values, y_values, 1)
plt.plot(x_values, np.polyval(polyfit, x_values), color='red')
# labels and title
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_log')
plt.title('Scatter Plot with Linear Fit')
# second subplot: 2D histogram of the point density
plt.subplot(1, 2, 2)
plt.hist2d(x_values, y_values, bins=(50, 50), cmap=plt.cm.Blues)
# labels and title
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_log')
plt.title('Heatmap of Point Distribution')
# adjust the spacing between subplots
plt.tight_layout()
# show the figure
plt.colorbar()
plt.show()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[173], line 21
     19 # create the second subplot for the heatmap
     20 plt.subplot(1, 2, 2)
---> 21 plt.hist2d(x_values, y_values, bins=(50, 50), cmap=plt.cm.Blues)
...
File D:\conda\envs\py39\lib\site-packages\numpy\lib\histograms.py:323, in _get_outer_edges(a, range)
--> 323 raise ValueError(
    324     "autodetected range of [{}, {}] is not finite".format(first_edge, last_edge))
ValueError: autodetected range of [-inf, -0.12516314295400616] is not finite
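The `-inf` values come from `dispersion_log`, which is `log(0)` for the many zero-dispersion features, so `np.histogram2d` cannot autodetect a finite range. A minimal guard, assuming `x_values` and `y_values` are the two arrays passed to `hist2d` above: drop non-finite pairs before binning.

# keep only pairs where both coordinates are finite (no NaN/inf)
finite = np.isfinite(np.asarray(x_values)) & np.isfinite(np.asarray(y_values))
plt.hist2d(np.asarray(x_values)[finite], np.asarray(y_values)[finite],
           bins=(50, 50), cmap=plt.cm.Blues)
plt.colorbar()
plt.show()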
In [174]:
df = adata.var
min_coverage = df['coverage_feature'].min()
max_coverage = df['coverage_feature'].max()
df['NormalizedCoverage'] = (df['coverage_feature'] - min_coverage) / (max_coverage - min_coverage)
# compute a residual dispersion score
df['ResidualDispersion'] = df['mean_feature_methylation'].var() / (1 - df['NormalizedCoverage'])
# set the threshold for highly variable features
threshold = 0.1
# pick the highly variable features
highly_variable_features = df[df['ResidualDispersion'] > threshold]['feature_name']
print("Highly variable features:")
print(highly_variable_features)
Highly variable features: Series([], Name: feature_name, dtype: object)
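The returned Series is empty because every feature has `coverage_feature == 52` (see the `df` display in Out[175] below), so `max_coverage - min_coverage` is zero, `NormalizedCoverage` becomes NaN everywhere, and `NaN > threshold` is always False. A hedged guard for this degenerate case:

coverage_range = max_coverage - min_coverage
if coverage_range == 0:
    # all features share the same coverage; nothing to normalize
    df['NormalizedCoverage'] = 0.0
else:
    df['NormalizedCoverage'] = (df['coverage_feature'] - min_coverage) / coverage_range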
In [175]:
df
Out[175]:
feature_name | coverage_feature | mean_feature_methylation | mean_sq | var | variance | dispersion | dispersion_log | NormalizedCoverage | ResidualDispersion | |
---|---|---|---|---|---|---|---|---|---|---|
0 | chr1_0_100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
1 | chr1_100000_200000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
2 | chr1_200000_300000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
3 | chr1_300000_400000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4 | chr1_400000_500000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4908 | chr2_241800000_241900000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4909 | chr2_241900000_242000000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4910 | chr2_242000000_242100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4911 | chr2_242100000_242193529 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4912 rows × 10 columns
In [183]:
# Sample data with feature name, mean, and number of samples
data = {
    'FeatureName': ['Feature1', 'Feature2', 'Feature3'],
    'MeanValue': [0.5, 0.6, 0.7],
    'NumSamples': [100, 200, 50]
}
df = pd.DataFrame(data)
# Calculate variance (var) using the updated formula
n_samples = df['NumSamples']
mean = df['MeanValue']
var = (mean ** 2) * (n_samples / (n_samples - 1))
# Set a threshold for mean values below which they will be replaced with 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# Calculate dispersion using the updated formula
dispersion = var / mean
# Store the computed dispersion values in the DataFrame
df['Dispersion'] = dispersion
# Calculate mean and coverage
mean_mean = df['MeanValue'].mean()
mean_coverage = df['NumSamples'].mean()
# Normalize dispersion by both mean and coverage
df['NormalizedDispersion'] = df['Dispersion'] / (mean_mean * mean_coverage)
# Display the DataFrame with calculated dispersion
print(df)
FeatureName MeanValue NumSamples Dispersion NormalizedDispersion 0 Feature1 0.5 100 0.505051 0.007215 1 Feature2 0.6 200 0.603015 0.008615 2 Feature3 0.7 50 0.714286 0.010204
C:\Users\zong\AppData\Local\Temp\ipykernel_7604\256784523.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy mean[mean < threshold] = threshold
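The warning is raised because `mean` is a view of `df['MeanValue']`, so the in-place assignment may not propagate back to the DataFrame. A minimal sketch of the usual fix, working on an explicit copy and using `clip`:

mean = df['MeanValue'].copy()
mean = mean.clip(lower=threshold)  # same effect as mean[mean < threshold] = threshold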
In [179]:
df
Out[179]:
FeatureName | MeanValue | Coverage | Dispersion | NormalizedDispersion | |
---|---|---|---|---|---|
0 | Feature1 | 0.1 | 100 | 0.063333 | 0.001481 |
1 | Feature2 | 0.6 | 200 | 0.063333 | 0.001481 |
2 | Feature3 | 0.4 | 50 | 0.063333 | 0.001481 |
In [185]:
# extract the mean and dispersion columns
x_values = df['MeanValue']
y_values = df['Dispersion']
# create the scatter plot; alpha controls point transparency
plt.scatter(x_values, y_values, alpha=0.5)
# add axis labels
plt.xlabel('MeanValue')
plt.ylabel('Dispersion')
# display the figure
plt.show()
In [186]:
# extract the mean and normalized dispersion columns
x_values = df['MeanValue']
y_values = df['NormalizedDispersion']
# create the scatter plot; alpha controls point transparency
plt.scatter(x_values, y_values, alpha=0.5)
# add axis labels
plt.xlabel('MeanValue')
plt.ylabel('NormalizedDispersion')
# display the figure
plt.show()
In [195]:
df = adata.var
mean_binsize = 0.05
cov_binsize = 10
bin_min_features = 5
# instead of n_bins, use bin_size, because cov and mc are on different scales
df['mean_bin'] = (df['mean_feature_methylation'] / mean_binsize).astype(int)
df['cov_bin'] = (df['coverage_feature'] / cov_binsize).astype(int)
# save the bin_count df; gather bins with more than bin_min_features features
bin_count = df.groupby(['mean_bin', 'cov_bin']) \
    .apply(lambda i: i.shape[0]) \
    .reset_index() \
    .sort_values(0, ascending=False)
bin_count.head()
bin_more_than = bin_count[bin_count[0] > bin_min_features]
if bin_more_than.shape[0] == 0:
    raise ValueError(f'No bin has more than {bin_min_features} features; use a larger bin size.')
# reassigning sparse bins usually doesn't change much (a few hundred features),
# but the scatter plot will look more natural
index_map = {}
for _, (mean_id, cov_id, count) in bin_count.iterrows():
    if count > 1:
        index_map[(mean_id, cov_id)] = (mean_id, cov_id)
    manhattan_dist = (bin_more_than['mean_bin'] - mean_id).abs() + (bin_more_than['cov_bin'] - cov_id).abs()
    closest_more_than = manhattan_dist.sort_values().index[0]
    closest = bin_more_than.loc[closest_more_than]
    index_map[(mean_id, cov_id)] = tuple(closest.tolist()[:2])
# apply index_map to the original df
raw_bin = df[['mean_bin', 'cov_bin']].set_index(['mean_bin', 'cov_bin'])
raw_bin['use_mean'] = pd.Series(index_map).apply(lambda i: i[0])
raw_bin['use_cov'] = pd.Series(index_map).apply(lambda i: i[1])
df['mean_bin'] = raw_bin['use_mean'].values
df['cov_bin'] = raw_bin['use_cov'].values
# calculate per-bin mean and std; disp_std_bin should not contain NAs now
disp_grouped = df.groupby(['mean_bin', 'cov_bin'])['dispersion']
disp_mean_bin = disp_grouped.mean()
disp_std_bin = disp_grouped.std(ddof=1)
# actually do the normalization (z-score of dispersion within each bin)
_mean_norm = disp_mean_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
_std_norm = disp_std_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
df['dispersion_norm'] = (df['dispersion'].values  # use values here as the indexes differ
                         - _mean_norm.values) / _std_norm.values
dispersion_norm = df['dispersion_norm'].values.astype('float32')
In [196]:
df
Out[196]:
feature_name | coverage_feature | mean_feature_methylation | mean_sq | var | variance | dispersion | dispersion_log | NormalizedCoverage | ResidualDispersion | NormalizedDispersion | mean_bin | cov_bin | dispersion_norm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr1_0_100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
1 | chr1_100000_200000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
2 | chr1_200000_300000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
3 | chr1_300000_400000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4 | chr1_400000_500000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4908 | chr2_241800000_241900000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4909 | chr2_241900000_242000000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4910 | chr2_242000000_242100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4911 | chr2_242100000_242193529 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4912 rows × 14 columns
In [266]:
feature_subset = df.index.isin(df.sort_values('dispersion_norm', ascending=False).index[:1000])
In [199]:
feature_subset
Out[199]:
array([False, False, False, ..., False, False, False])
In [267]:
df['feature_select'] = feature_subset
In [201]:
df
Out[201]:
feature_name | coverage_feature | mean_feature_methylation | mean_sq | var | variance | dispersion | dispersion_log | NormalizedCoverage | ResidualDispersion | NormalizedDispersion | mean_bin | cov_bin | dispersion_norm | feature_select | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr1_0_100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
1 | chr1_100000_200000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
2 | chr1_200000_300000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
3 | chr1_300000_400000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4 | chr1_400000_500000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4908 | chr2_241800000_241900000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4909 | chr2_241900000_242000000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4910 | chr2_242000000_242100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4911 | chr2_242100000_242193529 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4912 rows × 15 columns
In [203]:
feature_subset.size
Out[203]:
4912
In [206]:
df['feature_select'].sum()
Out[206]:
100
In [207]:
# extract the mean methylation and normalized dispersion columns
x_values = df['mean_feature_methylation']
y_values = df['dispersion_norm']
# create the scatter plot; alpha controls point transparency
plt.scatter(x_values, y_values, alpha=0.5)
# add axis labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_norm')
# display the figure
plt.show()
In [208]:
# remove outliers
# extract the mean methylation and normalized dispersion columns
x_values = df['mean_feature_methylation']
y_values = df['dispersion_norm']
# compute the means and standard deviations of x and y
x_mean, y_mean = np.mean(x_values), np.mean(y_values)
x_std, y_std = np.std(x_values), np.std(y_values)
# use 3 standard deviations as the outlier threshold
threshold = 3
# select the non-outlier points
# NOTE: df[mask] returns a whole DataFrame rather than a single column,
# which is what makes plt.scatter fail below; see the fix after the traceback
filtered_x = df[(x_values >= x_mean - threshold * x_std) & (x_values <= x_mean + threshold * x_std)]
filtered_y = df[(y_values >= y_mean - threshold * y_std) & (y_values <= y_mean + threshold * y_std)]
# create the scatter plot; alpha controls point transparency
plt.scatter(filtered_x, filtered_y, alpha=0.5)
# add axis labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_norm')
# display the figure
plt.show()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[208], line 18
---> 18 plt.scatter(filtered_x, filtered_y, alpha=0.5)
...
File D:\conda\envs\py39\lib\site-packages\matplotlib\category.py:214, in UnitData.update(self, data)
--> 214 for val in OrderedDict.fromkeys(data):
TypeError: unhashable type: 'numpy.ndarray'
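As the traceback shows, `filtered_x` and `filtered_y` are whole DataFrames (`df[mask]` keeps every column), while `plt.scatter` expects 1-D sequences. A hedged rewrite that combines both masks and plots only the two columns:

mask = ((x_values >= x_mean - threshold * x_std) & (x_values <= x_mean + threshold * x_std) &
        (y_values >= y_mean - threshold * y_std) & (y_values <= y_mean + threshold * y_std))
plt.scatter(df.loc[mask, 'mean_feature_methylation'], df.loc[mask, 'dispersion_norm'], alpha=0.5)
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_norm')
plt.show()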
In [210]:
selected_features = df.loc[df['feature_select'], 'feature_name']
In [211]:
selected_features
Out[211]:
203 chr1_20300000_20400000 218 chr1_21800000_21900000 242 chr1_24200000_24300000 245 chr1_24500000_24600000 280 chr1_28000000_28100000 ... 4256 chr2_176600000_176700000 4266 chr2_177600000_177700000 4273 chr2_178300000_178400000 4283 chr2_179300000_179400000 4310 chr2_182000000_182100000 Name: feature_name, Length: 100, dtype: object
In [214]:
# extract the mean methylation and normalized dispersion columns
x_values = df['mean_feature_methylation'].tolist()
y_values = df['dispersion_norm'].tolist()
# define a dispersion threshold for flagging outliers
threshold = 2.0  # adjust as needed
# drop the outliers
filtered_x_values = []
filtered_y_values = []
for x, y in zip(x_values, y_values):
    if y < threshold:
        filtered_x_values.append(x)
        filtered_y_values.append(y)
# redraw the scatter plot without the outliers
plt.scatter(filtered_x_values, filtered_y_values, alpha=0.5)
# display the figure
plt.show()
In [220]:
# extract the mean methylation and dispersion columns
x_values = adata.var['mean_feature_methylation'].tolist()
y_values = adata.var['dispersion'].tolist()
# define a dispersion threshold for flagging outliers
threshold = 0.4  # adjust as needed
# drop the outliers
filtered_x_values = []
filtered_y_values = []
for x, y in zip(x_values, y_values):
    if y < threshold:
        filtered_x_values.append(x)
        filtered_y_values.append(y)
plt.scatter(filtered_x_values, filtered_y_values, alpha=0.5)
# add axis labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion')
# display the figure
plt.show()
In [221]:
from scipy.stats import linregress

# extract the mean methylation and dispersion columns
x_values = adata.var['mean_feature_methylation'].tolist()
y_values = adata.var['dispersion'].tolist()
# define a dispersion threshold for flagging outliers
threshold = 0.4  # adjust as needed
# drop the outliers
filtered_x_values = []
filtered_y_values = []
for x, y in zip(x_values, y_values):
    if y < threshold:
        filtered_x_values.append(x)
        filtered_y_values.append(y)
plt.scatter(filtered_x_values, filtered_y_values, alpha=0.5)
# fit a linear regression to the (unfiltered) data
slope, intercept, r_value, p_value, std_err = linregress(x_values, y_values)
# draw the trend line
plt.plot(x_values, intercept + slope * np.array(x_values), color='red', label='Trend Line')
# add axis labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion')
# display the figure
plt.show()
In [222]:
# extract the mean methylation and normalized dispersion columns
x_values = df['mean_feature_methylation'].tolist()
y_values = df['dispersion_norm'].tolist()
# define a dispersion threshold for flagging outliers
threshold = 2.0  # adjust as needed
# drop the outliers
filtered_x_values = []
filtered_y_values = []
for x, y in zip(x_values, y_values):
    if y < threshold:
        filtered_x_values.append(x)
        filtered_y_values.append(y)
# redraw the scatter plot without the outliers
plt.scatter(filtered_x_values, filtered_y_values, alpha=0.5)
# fit a linear regression to the (unfiltered) data
slope, intercept, r_value, p_value, std_err = linregress(x_values, y_values)
# draw the trend line
plt.plot(x_values, intercept + slope * np.array(x_values), color='red', label='Trend Line')
# display the figure
plt.show()
In [223]:
import scanpy as sc
In [226]:
#sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color=['overall_mc_level_CG'], wspace=0.5)
In [228]:
sc.pl.pca(adata, color=['mc_class_CG'], wspace=0.5)
In [268]:
adata_filter = adata[:, adata.var['feature_select']].copy()
adata_filter2 = adata_filter[adata_filter.obs['mc_class_CG'] > 2000000, :].copy()
In [269]:
adata_filter
Out[269]:
AnnData object with n_obs × n_vars = 52 × 1000 obs: 'methylated_CG', 'mc_class_CG', 'total_CG', 'overall_mc_level_CG', 'sample', 'sample_id', 'coverage_cells', 'mean_cell_methylation' var: 'feature_name', 'coverage_feature', 'mean_feature_methylation', 'mean_sq', 'var', 'variance', 'dispersion', 'dispersion_log', 'NormalizedCoverage', 'ResidualDispersion', 'NormalizedDispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select' uns: 'pca' obsm: 'X_pca', 'X_tsne' varm: 'PCs'
In [233]:
sc.tl.pca(adata_filter, svd_solver='arpack')
sc.pl.pca(adata_filter, color=['overall_mc_level_CG'], wspace=0.5)
In [236]:
adata_filter.varm['PCs']
Out[236]:
array([[-0.06958627, 0.17975275, 0.03491881, ..., 0.02417051, -0.03574859, 0.1777849 ], [-0.07811921, 0.18211309, 0.03563234, ..., 0.15430381, -0.00066009, 0.18689157], [-0.10578167, -0.0394036 , 0.1402062 , ..., -0.15728749, -0.06590174, 0.20133028], ..., [-0.06801507, 0.19917281, 0.03803346, ..., 0.02431435, -0.12863403, -0.04797525], [-0.10333498, -0.07916441, 0.28298015, ..., 0.08951086, 0.08185068, 0.03221035], [-0.13062945, -0.02340198, -0.08192839, ..., -0.03865004, 0.05464122, -0.02124793]])
In [237]:
adata.varm['PCs']
Out[237]:
array([[ 1.43114687e-17, 1.38777878e-17, 1.00830802e-17, ..., -6.50521303e-18, -3.68628739e-18, -1.03757191e-30], [-4.77048956e-17, -3.08997619e-17, -1.61004023e-17, ..., -1.38777878e-17, 2.13391703e-30, 9.01991226e-31], [-9.24601288e-18, -8.18398806e-21, 3.68935473e-17, ..., 5.16849603e-31, 8.44948759e-31, 9.30009766e-31], ..., [ 0.00000000e+00, -0.00000000e+00, 0.00000000e+00, ..., -0.00000000e+00, -0.00000000e+00, -0.00000000e+00], [ 0.00000000e+00, -0.00000000e+00, 0.00000000e+00, ..., -0.00000000e+00, -0.00000000e+00, -0.00000000e+00], [ 0.00000000e+00, -0.00000000e+00, 0.00000000e+00, ..., -0.00000000e+00, -0.00000000e+00, -0.00000000e+00]])
In [238]:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[238], line 1
----> 1 sc.tl.tsne(adata,
      2            obsm='X_pca',
      3            metric='euclidean',
      4            exaggeration=-1,  # auto determined
      5            perplexity=30,
      6            n_jobs=-1)
      7 sc.pl.tsne(cdata_mean, color=['Neuron type'], wspace=0.8)
TypeError: tsne() got an unexpected keyword argument 'obsm'
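`sc.tl.tsne` has no `obsm` keyword; scanpy selects the input representation through `use_rep` (and `cdata_mean` in the plotting line does not appear anywhere else in this notebook). A minimal equivalent call with scanpy's own API would be:

sc.tl.tsne(adata, use_rep='X_pca', perplexity=30, n_jobs=-1)
sc.pl.tsne(adata, color=['overall_mc_level_CG'])

The next cell instead defines a custom `tsne` wrapper around openTSNE, which is what the later calls use.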
In [240]:
import logging
from typing import Callable, Union

from openTSNE import TSNEEmbedding, affinity, initialization


# tsne methods
def art_of_tsne(X: np.ndarray,
                metric: Union[str, Callable] = "euclidean",
                exaggeration: float = -1,
                perplexity: int = 30,
                n_jobs: int = -1) -> TSNEEmbedding:
    """
    Implementation of Dmitry Kobak and Philipp Berens
    "The art of using t-SNE for single-cell transcriptomics" based on openTSNE.
    See https://doi.org/10.1038/s41467-019-13056-x | www.nature.com/naturecommunications

    Args:
        X             The data matrix of shape (n_cells, n_genes) i.e. (n_samples, n_features)
        metric        Any metric allowed by PyNNDescent (default: 'euclidean')
        exaggeration  The exaggeration to use for the embedding
        perplexity    The perplexity to use for the embedding

    Returns:
        The embedding as an opentsne.TSNEEmbedding object (which can be cast to an np.ndarray)
    """
    n = X.shape[0]
    if n > 100_000:
        if exaggeration == -1:
            exaggeration = 1 + n / 333_333
        # Subsample, optimize, then add the remaining cells and optimize again
        # Also, use exaggeration == 4
        logging.info(f"Creating subset of {n // 40} elements")
        # Subsample and run a regular art_of_tsne on the subset
        indices = np.random.permutation(n)
        reverse = np.argsort(indices)
        X_sample, X_rest = X[indices[:n // 40]], X[indices[n // 40:]]
        logging.info("Embedding subset")
        Z_sample = art_of_tsne(X_sample)
        logging.info(f"Preparing partial initial embedding of the {n - n // 40} remaining elements")
        if isinstance(Z_sample.affinities, affinity.Multiscale):
            rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexities=[1 / 3, 1 / 3])
        else:
            rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexity=1 / 3)
        logging.info("Combining the initial embeddings, and standardizing")
        init_full = np.vstack((Z_sample, rest_init))[reverse]
        init_full = init_full / (np.std(init_full[:, 0]) * 10000)
        logging.info("Creating multiscale affinities")
        affinities = affinity.PerplexityBasedNN(X,
                                                perplexity=perplexity,
                                                metric=metric,
                                                method="approx",
                                                n_jobs=n_jobs)
        logging.info("Creating TSNE embedding")
        Z = TSNEEmbedding(init_full,
                          affinities,
                          negative_gradient_method="fft",
                          n_jobs=n_jobs)
        logging.info("Optimizing, stage 1")
        Z.optimize(n_iter=250,
                   inplace=True,
                   exaggeration=12,
                   momentum=0.5,
                   learning_rate=n / 12,
                   n_jobs=n_jobs)
        logging.info("Optimizing, stage 2")
        Z.optimize(n_iter=750,
                   inplace=True,
                   exaggeration=exaggeration,
                   momentum=0.8,
                   learning_rate=n / 12,
                   n_jobs=n_jobs)
    elif n > 3_000:
        if exaggeration == -1:
            exaggeration = 1
        # Use multiscale perplexity
        affinities_multiscale_mixture = affinity.Multiscale(X,
                                                            perplexities=[perplexity, n / 100],
                                                            metric=metric,
                                                            method="approx",
                                                            n_jobs=n_jobs)
        init = initialization.pca(X)
        Z = TSNEEmbedding(init,
                          affinities_multiscale_mixture,
                          negative_gradient_method="fft",
                          n_jobs=n_jobs)
        Z.optimize(n_iter=250,
                   inplace=True,
                   exaggeration=12,
                   momentum=0.5,
                   learning_rate=n / 12,
                   n_jobs=n_jobs)
        Z.optimize(n_iter=750,
                   inplace=True,
                   exaggeration=exaggeration,
                   momentum=0.8,
                   learning_rate=n / 12,
                   n_jobs=n_jobs)
    else:
        if exaggeration == -1:
            exaggeration = 1
        # Just a plain TSNE with high learning rate
        lr = max(200, n / 12)
        aff = affinity.PerplexityBasedNN(X,
                                         perplexity=perplexity,
                                         metric=metric,
                                         method="approx",
                                         n_jobs=n_jobs)
        init = initialization.pca(X)
        Z = TSNEEmbedding(init,
                          aff,
                          learning_rate=lr,
                          n_jobs=n_jobs,
                          negative_gradient_method="fft")
        Z.optimize(250, exaggeration=12, momentum=0.5, inplace=True, n_jobs=n_jobs)
        Z.optimize(750,
                   exaggeration=exaggeration,
                   momentum=0.8,
                   inplace=True,
                   n_jobs=n_jobs)
    return Z


def tsne(adata, obsm='X_pca',
         metric: Union[str, Callable] = "euclidean",
         exaggeration: float = -1,
         perplexity: int = 30,
         n_jobs: int = -1):
    # run art_of_tsne on the chosen representation and store the embedding in obsm
    X = adata.obsm[obsm]
    Z = art_of_tsne(X=X, metric=metric, exaggeration=exaggeration, perplexity=perplexity, n_jobs=n_jobs)
    adata.obsm['X_tsne'] = Z
    return
In [249]:
#tsne(adata)
sc.pl.tsne(adata, color=['overall_mc_level_CG'], wspace=0.3)
In [245]:
tsne(adata_filter)
In [250]:
sc.pl.tsne(adata_filter, color=['overall_mc_level_CG'], wspace=0.1)
In [270]:
adata_filter2
Out[270]:
AnnData object with n_obs × n_vars = 44 × 1000 obs: 'methylated_CG', 'mc_class_CG', 'total_CG', 'overall_mc_level_CG', 'sample', 'sample_id', 'coverage_cells', 'mean_cell_methylation' var: 'feature_name', 'coverage_feature', 'mean_feature_methylation', 'mean_sq', 'var', 'variance', 'dispersion', 'dispersion_log', 'NormalizedCoverage', 'ResidualDispersion', 'NormalizedDispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select' uns: 'pca' obsm: 'X_pca', 'X_tsne' varm: 'PCs'
In [272]:
sc.tl.pca(adata_filter2, svd_solver='arpack')
sc.pl.pca(adata_filter2, color=['total_CG'], wspace=0.5)
In [273]:
tsne(adata_filter2)
In [274]:
sc.pl.tsne(adata_filter2, color=['overall_mc_level_CG'], wspace=0.1)
In [263]:
adata_filter2.obs
Out[263]:
methylated_CG | mc_class_CG | total_CG | overall_mc_level_CG | sample | sample_id | coverage_cells | mean_cell_methylation | Cell_type | Treatment | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2773841.0 | 6389935.0 | 10058585.0 | 0.275769 | GSM1370523 | 0 | 4912 | 0.255770 | MII oocyte | NaN |
1 | 4207956.0 | 9056780.0 | 14749706.0 | 0.285291 | GSM1370524 | 1 | 4912 | 0.267832 | MII oocyte | NaN |
2 | 1062414.0 | 3024372.0 | 3687362.0 | 0.288123 | GSM1370525 | 2 | 4912 | 0.270414 | MII oocyte | NaN |
3 | 2174791.0 | 5617340.0 | 7468542.0 | 0.291194 | GSM1370526 | 3 | 4912 | 0.257664 | MII oocyte | NaN |
4 | 2250723.0 | 5478455.0 | 7718445.0 | 0.291603 | GSM1370527 | 4 | 4912 | 0.256831 | MII oocyte | NaN |
5 | 1235199.0 | 3266783.0 | 4241536.0 | 0.291215 | GSM1370529 | 6 | 4912 | 0.247983 | MII oocyte | NaN |
6 | 1602124.0 | 3903117.0 | 5507096.0 | 0.290920 | GSM1370530 | 7 | 4912 | 0.261821 | MII oocyte | NaN |
7 | 1229421.0 | 3204150.0 | 4160911.0 | 0.295469 | GSM1370531 | 8 | 4912 | 0.254784 | MII oocyte | NaN |
8 | 1149598.0 | 2822474.0 | 3733805.0 | 0.307889 | GSM1370532 | 9 | 4912 | 0.257592 | MII oocyte | NaN |
9 | 1586218.0 | 3869842.0 | 5403788.0 | 0.293538 | GSM1370533 | 10 | 4912 | 0.253991 | MII oocyte | NaN |
10 | 1948937.0 | 3800961.0 | 5242714.0 | 0.371742 | GSM1370535 | 11 | 4912 | 0.393481 | ESC | 2i |
11 | 1271969.0 | 2797560.0 | 3506515.0 | 0.362744 | GSM1370536 | 12 | 4912 | 0.346206 | ESC | 2i |
12 | 2105457.0 | 3372625.0 | 4537703.0 | 0.463992 | GSM1370537 | 13 | 4912 | 0.454491 | ESC | 2i |
13 | 1521358.0 | 4242155.0 | 5799310.0 | 0.262334 | GSM1370538 | 14 | 4912 | 0.236155 | ESC | 2i |
14 | 1974452.0 | 3917329.0 | 5546459.0 | 0.355984 | GSM1370539 | 15 | 4912 | 0.337848 | ESC | 2i |
15 | 372428.0 | 3297855.0 | 4538546.0 | 0.082059 | GSM1370540 | 16 | 4912 | 0.066197 | ESC | 2i |
16 | 2068775.0 | 4283144.0 | 5851328.0 | 0.353556 | GSM1370541 | 17 | 4912 | 0.276954 | ESC | 2i |
17 | 730843.0 | 3950391.0 | 5450398.0 | 0.134090 | GSM1370542 | 18 | 4912 | 0.127163 | ESC | 2i |
18 | 1318842.0 | 4442023.0 | 5843535.0 | 0.225692 | GSM1370543 | 19 | 4912 | 0.214378 | ESC | 2i |
19 | 1109405.0 | 3710145.0 | 5011395.0 | 0.221376 | GSM1370544 | 20 | 4912 | 0.201409 | ESC | 2i |
20 | 721897.0 | 3110061.0 | 4463927.0 | 0.161718 | GSM1370545 | 21 | 4912 | 0.148684 | ESC | 2i |
21 | 1828675.0 | 3532788.0 | 4807341.0 | 0.380392 | GSM1370546 | 22 | 4912 | 0.352224 | ESC | 2i |
22 | 5078262.0 | 4424268.0 | 7002192.0 | 0.725239 | GSM1370555 | 30 | 4912 | 0.633859 | ESC | serum/LIF |
23 | 2883177.0 | 3262923.0 | 4525444.0 | 0.637104 | GSM1370556 | 31 | 4912 | 0.569044 | ESC | serum/LIF |
24 | 3010405.0 | 4728963.0 | 7751272.0 | 0.388376 | GSM1370557 | 32 | 4912 | 0.338542 | ESC | serum/LIF |
25 | 4478594.0 | 4965673.0 | 7571138.0 | 0.591535 | GSM1370558 | 33 | 4912 | 0.570422 | ESC | serum/LIF |
26 | 3200120.0 | 3848428.0 | 5622442.0 | 0.569169 | GSM1370559 | 34 | 4912 | 0.498544 | ESC | serum/LIF |
27 | 1814649.0 | 5795602.0 | 8946794.0 | 0.202827 | GSM1370560 | 35 | 4912 | 0.187398 | ESC | serum/LIF |
28 | 3755105.0 | 3468770.0 | 5304515.0 | 0.707907 | GSM1370561 | 36 | 4912 | 0.629427 | ESC | serum/LIF |
29 | 3243439.0 | 3557629.0 | 4771783.0 | 0.679712 | GSM1370562 | 37 | 4912 | 0.611443 | ESC | serum/LIF |
30 | 3826163.0 | 5262688.0 | 7106219.0 | 0.538425 | GSM1370563 | 38 | 4912 | 0.508737 | ESC | serum/LIF |
31 | 4527218.0 | 4938305.0 | 7210573.0 | 0.627858 | GSM1370564 | 39 | 4912 | 0.596066 | ESC | serum/LIF |
32 | 3487010.0 | 3698022.0 | 5098903.0 | 0.683875 | GSM1370565 | 40 | 4912 | 0.615996 | ESC | serum/LIF |
33 | 1670947.0 | 2077704.0 | 2662177.0 | 0.627662 | GSM1370566 | 41 | 4912 | 0.585674 | ESC | serum/LIF |
34 | 2014271.0 | 3017804.0 | 3828676.0 | 0.526101 | GSM1370567 | 42 | 4912 | 0.468503 | ESC | serum/LIF |
35 | 6249395.0 | 7502459.0 | 10319408.0 | 0.605596 | GSM1370568 | 43 | 4912 | 0.539304 | ESC | serum/LIF |
36 | 2641859.0 | 3607443.0 | 4826163.0 | 0.547404 | GSM1370569 | 44 | 4912 | 0.477331 | ESC | serum/LIF |
37 | 3859515.0 | 5568802.0 | 7575752.0 | 0.509456 | GSM1370570 | 45 | 4912 | 0.469847 | ESC | serum/LIF |
38 | 4079333.0 | 4145869.0 | 6017116.0 | 0.677955 | GSM1370571 | 46 | 4912 | 0.616277 | ESC | serum/LIF |
39 | 3410149.0 | 3967729.0 | 5362681.0 | 0.635904 | GSM1370572 | 47 | 4912 | 0.577196 | ESC | serum/LIF |
40 | 3488749.0 | 4433266.0 | 6109535.0 | 0.571033 | GSM1370573 | 48 | 4912 | 0.552650 | ESC | serum/LIF |
41 | 3588924.0 | 4523416.0 | 6085020.0 | 0.589797 | GSM1370574 | 49 | 4912 | 0.513039 | ESC | serum/LIF |
42 | 37221650.0 | 22641666.0 | 53520217.0 | 0.695469 | GSM1370575 | 50 | 4912 | 0.581207 | ESC | serum/LIF |
43 | 5484605.0 | 9727286.0 | 19593619.0 | 0.279918 | GSM1413827 | 51 | 4912 | 0.257857 | MII oocyte | NaN |
In [277]:
# load metadata
metadata = pd.read_csv("D://Test/GSE56789/GSE56879_Meta.txt", sep='\t')
adata_filter2.obs = adata_filter2.obs.merge(metadata[['sample', 'Cell_type', 'Treatment']], left_on='sample', right_on='sample', how='left')
D:\conda\envs\py39\lib\site-packages\anndata\_core\anndata.py:788: UserWarning: AnnData expects .obs.index to contain strings, but got values like: [0, 1, 2, 3, 4] Inferred to be: integer value_idx = self._prep_dim_index(value.index, attr)
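The warning appears because `merge` returns a DataFrame with a fresh integer RangeIndex, while AnnData expects string observation names. A hedged one-line cleanup:

adata_filter2.obs.index = adata_filter2.obs.index.astype(str)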
In [279]:
sc.pl.tsne(adata_filter2, color=['overall_mc_level_CG'], wspace=0.1)
In [286]:
sc.pp.neighbors(adata_filter2, n_neighbors=15, use_rep='X')
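`use_rep='X'` builds the kNN graph directly on the selected feature matrix. A common alternative, sketched here as an option rather than what this notebook does, is to reuse the PCA embedding computed above:

sc.pp.neighbors(adata_filter2, n_neighbors=15, use_rep='X_pca')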
In [308]:
# umap
sc.tl.umap(adata_filter2)
In [291]:
def getNClusters(adata, n_cluster, range_min=0, range_max=3, max_steps=20):
    # binary-search the Louvain resolution that yields n_cluster clusters
    this_step = 0
    this_min = float(range_min)
    this_max = float(range_max)
    while this_step < max_steps:
        print('step ' + str(this_step))
        this_resolution = this_min + ((this_max - this_min) / 2)
        sc.tl.louvain(adata, resolution=this_resolution)
        this_clusters = adata.obs['louvain'].nunique()
        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
        if this_clusters > n_cluster:
            this_max = this_resolution
        elif this_clusters < n_cluster:
            this_min = this_resolution
        else:
            return this_resolution, adata
        this_step += 1
    print('Cannot find the number of clusters')
    print('Clustering solution from the last iteration is used: ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
In [341]:
#num_clusters = len(np.unique(metadata['Treatment']))
#Louvain
getNClusters(adata_filter2, n_cluster=3)
step 0
got 4 at resolution 1.5
step 1
got 2 at resolution 0.75
step 2
got 3 at resolution 1.125
Out[341]:
(1.125, AnnData object with n_obs × n_vars = 44 × 1000 obs: 'methylated_CG', 'mc_class_CG', 'total_CG', 'overall_mc_level_CG', 'sample', 'sample_id', 'coverage_cells', 'mean_cell_methylation', 'Cell_type', 'Treatment', 'louvain', 'kmeans', 'label' var: 'feature_name', 'coverage_feature', 'mean_feature_methylation', 'mean_sq', 'var', 'variance', 'dispersion', 'dispersion_log', 'NormalizedCoverage', 'ResidualDispersion', 'NormalizedDispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select' uns: 'pca', 'Treatment_colors', 'neighbors', 'umap', 'louvain', 'louvain_colors', 'Cell_type_colors' obsm: 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs' obsp: 'distances', 'connectivities')
In [295]:
!pip install louvain
Collecting louvain
  Downloading louvain-0.8.1-cp39-cp39-win_amd64.whl (89 kB)
Requirement already satisfied: igraph<0.11,>=0.10.0 in d:\conda\envs\py39\lib\site-packages (from louvain) (0.10.8)
Requirement already satisfied: texttable>=1.6.2 in d:\conda\envs\py39\lib\site-packages (from igraph<0.11,>=0.10.0->louvain) (1.6.7)
Installing collected packages: louvain
Successfully installed louvain-0.8.1
In [298]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
In [362]:
num_clusters = 3
adata = adata_filter
adata.obs['label'] = adata.obs['Cell_type']
method = 'Test2'
df_metrics = pd.DataFrame(columns=['ARI_Louvain', 'ARI_kmeans',
                                   'AMI_Louvain', 'AMI_kmeans',
                                   'Homogeneity_Louvain', 'Homogeneity_kmeans'])
# kmeans
kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
adata.obs['kmeans'] = pd.Series(kmeans.labels_, index=adata.obs.index).astype('category')
# # hierarchical clustering
# # ERROR: TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
# hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
# adata.obs['hc'] = pd.Series(hc.labels_, index=adata.obs.index).astype('category')
# clustering metrics
# adjusted Rand index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])
ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])
# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
# adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'], average_method='arithmetic')
ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'], average_method='arithmetic')
# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'], average_method='arithmetic')
# homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])
homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method, ['ARI_Louvain', 'ARI_kmeans']] = [ari_louvain, ari_kmeans]
df_metrics.loc[method, ['AMI_Louvain', 'AMI_kmeans']] = [ami_louvain, ami_kmeans]
df_metrics.loc[method, ['Homogeneity_Louvain', 'Homogeneity_kmeans']] = [homo_louvain, homo_kmeans]
# adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters, method + '_clusters.tsv'), sep='\t')
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[362], line 3
      1 num_clusters = 3
      2 adata = adata_filter
----> 3 adata.obs['label'] = adata.obs['Cell_type']
...
KeyError: 'Cell_type'
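The KeyError occurs because the metadata (`Cell_type`, `Treatment`) was merged into `adata_filter2.obs` only; `adata_filter.obs` never received those columns. A hedged fix is to point the metrics block at the annotated object (the `df_metrics` shown below comes from an earlier run, as the execution counts indicate):

adata = adata_filter2
adata.obs['label'] = adata.obs['Cell_type']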
In [307]:
df_metrics
Out[307]:
ARI_Louvain | ARI_kmeans | AMI_Louvain | AMI_kmeans | Homogeneity_Louvain | Homogeneity_kmeans | |
---|---|---|---|---|---|---|
Test | 0.191548 | 0.501999 | 0.341999 | 0.686136 | 0.598372 | 1.0 |
In [325]:
# !pip install plotly
import plotly.express as px

def interactive_scatter(adata, hue=None, coord_base='umap', continous_cmap='viridis', size=5):
    # NOTE: coord_base is currently unused; the function always reads obsm['X_umap']
    fig = px.scatter(data_frame=pd.DataFrame(adata.obsm['X_umap'], columns=['umap1', 'umap2']),
                     x='umap1',
                     y='umap2',
                     color_continuous_scale=continous_cmap,
                     color=hue)
    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)',
                      xaxis_showticklabels=False,
                      yaxis_showticklabels=False,
                      width=800,
                      height=800)
    fig.update_traces(marker_size=size)
    return fig
In [327]:
fig = interactive_scatter(adata_filter2)
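`px.scatter` resolves `color=hue` against the columns of the plotted frame, which here only contains `umap1` and `umap2`, so coloring by a cluster label requires adding that column first. A hypothetical usage sketch:

df_plot = pd.DataFrame(adata_filter2.obsm['X_umap'], columns=['umap1', 'umap2'])
df_plot['louvain'] = adata_filter2.obs['louvain'].values
px.scatter(df_plot, x='umap1', y='umap2', color='louvain', width=800, height=800)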
In [315]:
adata.obsm['X_umap']
Out[315]:
array([[12.484789 , 16.581722 ], [12.22508 , 17.664066 ], [11.952932 , 17.353918 ], [13.217374 , 17.035156 ], [13.089491 , 16.30659 ], [12.687077 , 15.8422365 ], [12.456281 , 16.944857 ], [11.835731 , 16.619175 ], [12.000754 , 16.130224 ], [11.402539 , 17.039957 ], [14.687245 , 3.0443153 ], [14.237143 , 3.9717314 ], [15.468485 , 2.4151003 ], [12.659276 , 5.184509 ], [13.543063 , 3.7920406 ], [11.166269 , 5.66698 ], [12.79142 , 4.5101447 ], [11.2506485 , 4.9574623 ], [11.915785 , 4.4838223 ], [11.663357 , 5.2958064 ], [11.917299 , 5.6766925 ], [13.8773365 , 3.3940747 ], [19.046078 , -0.17343952], [17.776234 , 0.2363771 ], [13.7890005 , 4.4060965 ], [18.572798 , 1.0591211 ], [16.609453 , 1.7374704 ], [12.103427 , 5.0119734 ], [19.234375 , 0.59901553], [19.425552 , 0.03022308], [17.176418 , 1.5065898 ], [19.219236 , 1.0854653 ], [19.821619 , 0.5332009 ], [18.489052 , 0.6563332 ], [15.494957 , 1.7379029 ], [17.372566 , 0.9697068 ], [16.158829 , 2.1730094 ], [16.147123 , 1.4657835 ], [19.583767 , -0.44952655], [17.990234 , 0.36184123], [18.134104 , 1.4666414 ], [16.548273 , 0.8359049 ], [18.279852 , -0.17635442], [12.905921 , 17.463284 ]], dtype=float32)
In [323]:
data_frame=pd.DataFrame(adata.obsm['X_umap'],columns=['umap1', 'umap2'])
In [324]:
data_frame
Out[324]:
| | umap1 | umap2 |
|---|---|---|
| 0 | 12.484789 | 16.581722 |
| 1 | 12.225080 | 17.664066 |
| 2 | 11.952932 | 17.353918 |
| 3 | 13.217374 | 17.035156 |
| 4 | 13.089491 | 16.306589 |
| 5 | 12.687077 | 15.842237 |
| 6 | 12.456281 | 16.944857 |
| 7 | 11.835731 | 16.619175 |
| 8 | 12.000754 | 16.130224 |
| 9 | 11.402539 | 17.039957 |
| 10 | 14.687245 | 3.044315 |
| 11 | 14.237143 | 3.971731 |
| 12 | 15.468485 | 2.415100 |
| 13 | 12.659276 | 5.184509 |
| 14 | 13.543063 | 3.792041 |
| 15 | 11.166269 | 5.666980 |
| 16 | 12.791420 | 4.510145 |
| 17 | 11.250648 | 4.957462 |
| 18 | 11.915785 | 4.483822 |
| 19 | 11.663357 | 5.295806 |
| 20 | 11.917299 | 5.676692 |
| 21 | 13.877337 | 3.394075 |
| 22 | 19.046078 | -0.173440 |
| 23 | 17.776234 | 0.236377 |
| 24 | 13.789001 | 4.406096 |
| 25 | 18.572798 | 1.059121 |
| 26 | 16.609453 | 1.737470 |
| 27 | 12.103427 | 5.011973 |
| 28 | 19.234375 | 0.599016 |
| 29 | 19.425552 | 0.030223 |
| 30 | 17.176418 | 1.506590 |
| 31 | 19.219236 | 1.085465 |
| 32 | 19.821619 | 0.533201 |
| 33 | 18.489052 | 0.656333 |
| 34 | 15.494957 | 1.737903 |
| 35 | 17.372566 | 0.969707 |
| 36 | 16.158829 | 2.173009 |
| 37 | 16.147123 | 1.465783 |
| 38 | 19.583767 | -0.449527 |
| 39 | 17.990234 | 0.361841 |
| 40 | 18.134104 | 1.466641 |
| 41 | 16.548273 | 0.835905 |
| 42 | 18.279852 | -0.176354 |
| 43 | 12.905921 | 17.463284 |
In [344]:
# settings for the plots
import scanpy as sc  # used by all of the sc.* plotting calls below

sc.set_figure_params(scanpy=True, dpi=80, dpi_save=250,
                     frameon=True, vector_friendly=True,
                     color_map="YlGnBu", format='pdf', transparent=False,
                     ipython_format='png2x')
sc.pl.umap(adata_filter2, color=['louvain', 'Cell_type', 'Treatment', 'kmeans', 'total_CG'], wspace=0.5)
D:\conda\envs\py39\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:391: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored (emitted four times, once per categorical panel)
In [346]:
sc.pl.pca_overview(adata_filter2, color=['kmeans', 'mc_class_CG', 'mCG/CG', 'mCH/CH'])
D:\conda\envs\py39\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:391: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
D:\conda\envs\py39\lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Glyph 8942 (\N{VERTICAL ELLIPSIS}) missing from current font.
In [347]:
#!pip install adjustText
def _text_anno_scatter(data: pd.DataFrame,
                       ax,
                       x: str,
                       y: str,
                       edge_color=(0.5, 0.5, 0.5, 0.2),
                       face_color=(0.8, 0.8, 0.8, 0.2),
                       palette: dict = None,
                       dodge_text=False,
                       anno_col='text_anno',
                       text_anno_kws=None,
                       text_transform=None,
                       dodge_kws=None,
                       linewidth=0.5,
                       labelsize=5):
    """Add text annotation to a scatter plot."""
    # prepare text kws
    _text_anno_kws = dict(fontsize=labelsize,
                          fontweight='black',
                          horizontalalignment='center',
                          verticalalignment='center')
    if text_anno_kws is not None:
        _text_anno_kws.update(text_anno_kws)

    # draw one label per annotation category, placed at the category's median position
    text_list = []
    for text, sub_df in data.groupby(anno_col):
        if text_transform is None:
            text = str(text)
        else:
            text = text_transform(text)
        if text.lower() in ['', 'nan']:
            continue
        _x, _y = sub_df[[x, y]].median()
        _fc = palette[text] if palette is not None else face_color
        # use a separate name for the Text artist so the loop variable is not shadowed
        text_obj = ax.text(_x, _y, text,
                           fontdict=_text_anno_kws,
                           bbox=dict(boxstyle="round",
                                     ec=edge_color,
                                     fc=_fc,
                                     linewidth=linewidth))
        text_list.append(text_obj)

    if dodge_text:
        try:
            from adjustText import adjust_text
            _dodge_parms = dict(force_points=(0.02, 0.05),
                                arrowprops=dict(arrowstyle="->",
                                                fc=edge_color,
                                                ec="none",
                                                connectionstyle="angle,angleA=-90,angleB=180,rad=5"),
                                autoalign='xy')
            if dodge_kws is not None:
                _dodge_parms.update(dodge_kws)
            # use the caller's column names rather than hard-coded 'x'/'y'
            adjust_text(text_list, x=data[x], y=data[y], **_dodge_parms)
        except ModuleNotFoundError:
            print('Install the adjustText package to dodge text; see its GitHub page for help')
    return
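A quick, self-contained illustration of how `_text_anno_scatter` is meant to be used (toy data, not from this dataset; in the notebook it is only called internally by `categorical_scatter` below):

# Toy usage sketch for _text_anno_scatter (hypothetical points and labels).
import matplotlib.pyplot as plt

toy = pd.DataFrame({'x': [0.0, 1.0, 0.2, 5.0, 5.5],
                    'y': [0.0, 0.5, 0.8, 4.0, 4.2],
                    'text_anno': ['A', 'A', 'A', 'B', 'B']})
fig, ax = plt.subplots()
ax.scatter(toy['x'], toy['y'], s=10)
_text_anno_scatter(toy, ax=ax, x='x', y='y', anno_col='text_anno')  # labels at group medians
plt.show()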
Collecting adjustText
  Downloading adjustText-0.8-py3-none-any.whl (9.1 kB)
Requirement already satisfied: numpy in d:\conda\envs\py39\lib\site-packages (from adjustText) (1.22.1)
Requirement already satisfied: matplotlib in d:\conda\envs\py39\lib\site-packages (from adjustText) (3.7.2)
Requirement already satisfied: contourpy>=1.0.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (1.1.0)
Requirement already satisfied: cycler>=0.10 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (4.42.1)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (1.4.5)
Requirement already satisfied: packaging>=20.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (23.1)
Requirement already satisfied: pillow>=6.2.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (10.0.0)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (6.0.1)
Requirement already satisfied: zipp>=3.1.0 in d:\conda\envs\py39\lib\site-packages (from importlib-resources>=3.2.0->matplotlib->adjustText) (3.16.2)
Requirement already satisfied: six>=1.5 in d:\conda\envs\py39\lib\site-packages (from python-dateutil>=2.7->matplotlib->adjustText) (1.16.0)
Installing collected packages: adjustText
Successfully installed adjustText-0.8
In [350]:
# NOTE: besides seaborn and matplotlib, this function relies on helpers
# (_extract_coords, _density_based_sample, density_contour,
# _make_tiny_axis_label, zoom_ax) that are assumed to come from the
# ALLCools plotting utilities and are not redefined in this notebook.
from matplotlib.lines import Line2D

def categorical_scatter(
data,
ax,
# coords
coord_base='umap',
x=None,
y=None,
# color
hue=None,
palette='auto',
# text annotation
text_anno=None,
text_anno_kws=None,
text_anno_palette=None,
text_transform=None,
dodge_text=False,
dodge_kws=None,
# legend
show_legend=False,
legend_kws=None,
# size
s=5,
size=None,
sizes: dict = None,
size_norm=None,
# other
axis_format='tiny',
max_points=5000,
labelsize=4,
linewidth=0,
zoomxy=1.05,
outline=None,
outline_pad=3,
outline_kws=None,
scatter_kws=None,
return_fig=False
):
"""
Plot scatter plot with these options:
- Color by a categorical variable, and generate legend of the variable if needed
- Add text annotation using a categorical variable
- Circle categories with outlines
Parameters
----------
data
Dataframe that contains coordinates and categorical variables
ax
this function do not generate ax, must provide an ax
coord_base
coords name, if provided, will automatically search for x and y
x
x coord name
y
y coord name
hue
categorical col name or series for color hue
palette
palette of the hue, str or dict
text_anno
categorical col name or series for text annotation
text_anno_kws
text_anno_palette
text_transform
dodge_text
dodge_kws
show_legend
legend_kws
s
size
sizes
size_norm
axis_format
max_points
labelsize
linewidth
zoomxy
outline
outline_pad
outline_kws
scatter_kws
kws dict pass to sns.scatterplot
Returns
-------
"""
if isinstance(data, ad.AnnData):
adata = data
data = adata.obs
x = f'{coord_base}_0'
y = f'{coord_base}_1'
_data = pd.DataFrame({'x': adata.obsm[f'X_{coord_base}'][:, 0],
'y': adata.obsm[f'X_{coord_base}'][:, 1]},
index=adata.obs_names)
else:
# add coords
_data, x, y = _extract_coords(data, coord_base, x, y)
# _data has 2 cols: "x" and "y"
# down sample plot data if needed.
if max_points is not None:
if _data.shape[0] > max_points:
_data = _density_based_sample(_data, seed=1, size=max_points,
coords=['x', 'y'])
# default scatter options
_scatter_kws = {'linewidth': 0, 's': s, 'legend': None, 'palette': palette}
if scatter_kws is not None:
_scatter_kws.update(scatter_kws)
# deal with color
palette_dict = None
if hue is not None:
if isinstance(hue, str):
_data['hue'] = data[hue].copy()
else:
_data['hue'] = hue.copy()
hue = 'hue'
_data['hue'] = _data['hue'].astype('category').cat.remove_unused_categories()
# deal with color palette
palette = _scatter_kws['palette']
if isinstance(palette, str) or isinstance(palette, list):
palette_dict = level_one_palette(_data['hue'], order=None, palette=palette)
elif isinstance(palette, dict):
palette_dict = palette
else:
raise TypeError(f'Palette can only be str, list or dict, '
f'got {type(palette)}')
_scatter_kws['palette'] = palette_dict
# deal with size
if size is not None:
# discard s from _scatter_kws and use size in sns.scatterplot
_scatter_kws.pop('s')
sns.scatterplot(x='x', y='y', data=_data, ax=ax, hue=hue,
size=size, sizes=sizes, size_norm=size_norm,
**_scatter_kws)
# deal with text annotation
if text_anno is not None:
if isinstance(text_anno, str):
_data['text_anno'] = data[text_anno].copy()
else:
_data['text_anno'] = text_anno.copy()
_text_anno_scatter(data=_data[['x', 'y', 'text_anno']],
ax=ax,
x='x',
y='y',
dodge_text=dodge_text,
dodge_kws=dodge_kws,
palette=text_anno_palette,
text_transform=text_transform,
anno_col='text_anno',
text_anno_kws=text_anno_kws,
labelsize=labelsize)
# deal with outline
if outline:
if isinstance(outline, str):
_data['outline'] = data[outline].copy()
else:
_data['outline'] = outline.copy()
_outline_kws = {'linewidth': linewidth,
'palette': None,
'c': 'lightgray',
'single_contour_pad': outline_pad}
if outline_kws is not None:
_outline_kws.update(outline_kws)
density_contour(ax=ax, data=_data, x='x', y='y', groupby='outline', **_outline_kws)
# clean axis
if axis_format == 'tiny':
_make_tiny_axis_label(ax, x, y, arrow_kws=None, fontsize=labelsize)
elif (axis_format == 'empty') or (axis_format is None):
sns.despine(ax=ax, left=True, bottom=True)
ax.set(xticks=[], yticks=[], xlabel=None, ylabel=None)
else:
pass
# deal with legend
if show_legend and (hue is not None):
n_hue = len(palette_dict)
_legend_kws = dict(ncol=(1 if n_hue <= 20 else 2 if n_hue <= 40 else 3),
fontsize=labelsize,
bbox_to_anchor=(1.05, 1),
loc='upper left',
borderaxespad=0.)
if legend_kws is not None:
_legend_kws.update(legend_kws)
handles = []
labels = []
exist_hues = _data['hue'].unique()
for hue_name, color in palette_dict.items():
if hue_name not in exist_hues:
# skip hue_name that do not appear in the plot
continue
handle = Line2D([0], [0], marker='o', color='w',
markerfacecolor=color, markersize=_legend_kws['fontsize'])
handles.append(handle)
labels.append(hue_name)
_legend_kws['handles'] = handles
_legend_kws['labels'] = labels
ax.legend(**_legend_kws)
if zoomxy is not None:
zoom_ax(ax, zoomxy)
if return_fig:
return ax, _data
else:
return
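To run `categorical_scatter` outside of ALLCools, minimal stand-ins for two of the assumed helpers can look like the sketch below (`_extract_coords` pulling `{coord_base}_0`/`{coord_base}_1` columns, `zoom_ax` scaling the axis limits). The downsampling, contour, and tiny-axis helpers are only exercised when `max_points` triggers, `outline` is set, or the default `axis_format='tiny'` is used, so passing `axis_format='empty'` and `max_points=None` avoids them; with only 44 cells here the downsampling never triggers anyway.

# Hedged stand-ins for helpers assumed above; names and behavior are guesses,
# not the actual ALLCools implementations.
def _extract_coords(data, coord_base, x, y):
    # Resolve coordinate column names like 'umap_0'/'umap_1' unless given explicitly.
    if x is None or y is None:
        x, y = f'{coord_base}_0', f'{coord_base}_1'
    _data = pd.DataFrame({'x': data[x], 'y': data[y]}, index=data.index)
    return _data, x, y

def zoom_ax(ax, zoom_scale):
    # Symmetrically widen (or shrink) the x/y limits around their centers.
    for get_lim, set_lim in ((ax.get_xlim, ax.set_xlim), (ax.get_ylim, ax.set_ylim)):
        lo, hi = get_lim()
        center, half = (lo + hi) / 2, (hi - lo) / 2 * zoom_scale
        set_lim(center - half, center + half)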
In [357]:
import seaborn as sns

def level_one_palette(name_list, order=None, palette='auto'):
    """Map each distinct category in name_list to one color; returns a dict."""
    name_set = set(name_list)
    # pick a qualitative palette sized to the number of categories
    if palette == 'auto':
        if len(name_set) < 10:
            palette = 'tab10'
        elif len(name_set) < 20:
            palette = 'tab20'
        else:
            palette = 'rainbow'
    if order is None:
        order = list(sorted(name_set))
    else:
        if (set(order) != name_set) or (len(order) != len(name_set)):
            raise ValueError('Order is not equal to set(name_list).')
    n = len(order)
    colors = sns.color_palette(palette, n)
    color_palette = {name: color for name, color in zip(order, colors)}
    return color_palette
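A quick check of what the helper returns: each distinct category is mapped to one color from the chosen qualitative palette.

# Example with hypothetical categories: two classes -> the first two tab10 colors.
pal = level_one_palette(['ctrl', 'treated', 'ctrl'])
print(pal)  # e.g. {'ctrl': (0.12, 0.47, 0.71), 'treated': (1.0, 0.50, 0.05)}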
In [361]:
fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
adata.obs['label'].fillna('nan', inplace=True)
_ = categorical_scatter(data = adata_filter2,
ax = ax,
coord_base = 'umap',
hue = 'Treatment',
text_anno = 'Treatment',
show_legend = 'True')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[361], line 3
      1 fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
      2 adata.obs['label'].fillna('nan', inplace=True)
----> 3 _ = categorical_scatter(data = adata_filter2,
      4                         ax = ax,
      5                         coord_base = 'umap',
      6                         hue = 'Treatment',
      7                         text_anno = 'Treatment',
      8                         show_legend = 'True')

Cell In[350], line 125, in categorical_scatter(data, ax, coord_base, x, y, hue, palette, text_anno, text_anno_kws, text_anno_palette, text_transform, dodge_text, dodge_kws, show_legend, legend_kws, s, size, sizes, size_norm, axis_format, max_points, labelsize, linewidth, zoomxy, outline, outline_pad, outline_kws, scatter_kws, return_fig)
    123 palette = _scatter_kws['palette']
    124 if isinstance(palette, str) or isinstance(palette, list):
--> 125     palette_dict = level_one_palette(_data['hue'], order=None, palette=palette)
    126 elif isinstance(palette, dict):
    127     palette_dict = palette

Cell In[357], line 13, in level_one_palette(name_list, order, palette)
     10     palette = 'rainbow'
     12 if order is None:
---> 13     order = list(sorted(name_set))
     14 else:
     15     if (set(order) != name_set) or (len(order) != len(name_set)):

TypeError: '<' not supported between instances of 'str' and 'float'
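The TypeError comes from `sorted(name_set)` in `level_one_palette`: the `Treatment` column apparently contains missing values, so `name_set` mixes strings with float `NaN`, and Python cannot order the two types. Note also that the `fillna` in the failing cell patched `adata.obs['label']`, not `adata_filter2.obs['Treatment']`, and that `show_legend` should be the boolean `True` rather than the string `'True'`. A minimal fix sketch:

# Option 1 (inside level_one_palette): make the sort robust to mixed types.
order = sorted(name_set, key=str)  # replaces `order = list(sorted(name_set))`

# Option 2: clean the hue column on the object actually being plotted;
# astype(str) turns NaN into the string 'nan', which _text_anno_scatter already skips.
adata_filter2.obs['Treatment'] = adata_filter2.obs['Treatment'].astype(str)

fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
_ = categorical_scatter(data=adata_filter2,
                        ax=ax,
                        coord_base='umap',
                        hue='Treatment',
                        text_anno='Treatment',
                        show_legend=True)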
In [354]:
!pip install seaborn
Requirement already satisfied: seaborn in d:\conda\envs\py39\lib\site-packages (0.12.2)
Requirement already satisfied: numpy!=1.24.0,>=1.17 in d:\conda\envs\py39\lib\site-packages (from seaborn) (1.22.1)
Requirement already satisfied: pandas>=0.25 in d:\conda\envs\py39\lib\site-packages (from seaborn) (1.5.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in d:\conda\envs\py39\lib\site-packages (from seaborn) (3.7.2)
Requirement already satisfied: contourpy>=1.0.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.1.0)
Requirement already satisfied: cycler>=0.10 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.42.1)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.5)
Requirement already satisfied: packaging>=20.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.1)
Requirement already satisfied: pillow>=6.2.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (10.0.0)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (6.0.1)
Requirement already satisfied: pytz>=2020.1 in d:\conda\envs\py39\lib\site-packages (from pandas>=0.25->seaborn) (2023.3)
Requirement already satisfied: zipp>=3.1.0 in d:\conda\envs\py39\lib\site-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.1->seaborn) (3.16.2)
Requirement already satisfied: six>=1.5 in d:\conda\envs\py39\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0)