Quality control
In [5]:
import pandas as pd
import numpy as np
import os
import glob
import multiprocessing as mp
from multiprocessing import Pool
from pathlib import Path
from anndata import ImplicitModificationWarning
from scipy.sparse import coo_matrix
from scipy import sparse
import datetime
from typing_extensions import Literal
from typing import Union
import anndata as ad
import subprocess
import pathlib
import collections
import click
import time
import warnings
In [6]:
adata = ad.read(
'D://Test/GSE56789/temp/result_anndata.h5'
)
In [7]:
adata
Out[7]:
AnnData object with n_obs × n_vars = 52 × 4912
    obs: 'methylated_CG', 'mc_class_CG', 'total_CG', 'overall_mc_level_CG', 'sample', 'sample_id'
    var: 'feature_name'
In [8]:
adata.X
Out[8]:
<52x4912 sparse matrix of type '<class 'numpy.float64'>' with 168981 stored elements in Compressed Sparse Row format>
In [9]:
# adding QC values
if sparse.issparse(adata.X):
    length1 = adata.X.shape[1]  # number of features (columns)
    length2 = adata.X.shape[0]  # number of samples (rows)
    # number of features covered in each sample
    coverage_cells = [length1 - np.isnan(line.toarray()).sum() for line in adata.X]
    # mean methylation of each sample
    mean_cell_methylation = [np.nansum(line.toarray()) / length1 for line in adata.X]
    # store the results in adata.obs
    adata.obs['coverage_cells'] = coverage_cells
    adata.obs['mean_cell_methylation'] = mean_cell_methylation
    # number of samples in which each feature is covered
    coverage_feature = [length2 - np.isnan(line.toarray()).sum() for line in adata.X.T]
    # mean methylation of each feature
    mean_feature_methylation = [np.nansum(line.toarray()) / length2 for line in adata.X.T]
    # store the results in adata.var
    adata.var['coverage_feature'] = coverage_feature
    adata.var['mean_feature_methylation'] = mean_feature_methylation
else:
    # dense matrix: operate on the rows and columns directly
    length1 = adata.X.shape[1]  # features (columns), e.g. 55017
    length2 = adata.X.shape[0]  # samples (rows), e.g. 3379
    adata.obs['coverage_cells'] = [length1 - np.isnan(line).sum() for line in adata.X]  # features covered per sample
    adata.obs['mean_cell_methylation'] = [np.nansum(line) / length1 for line in adata.X]
    adata.var['coverage_feature'] = [length2 - np.isnan(line).sum() for line in adata.X.T]  # samples covering each feature
    adata.var['mean_feature_methylation'] = [np.nansum(line) / length2 for line in adata.X.T]
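The row-by-row `toarray()` loop above densifies every row, which gets slow on large matrices. A minimal vectorized sketch of the same per-cell metrics, assuming (as above) that missing values are stored as explicit NaNs in a CSR matrix; `qc_metrics_sparse` is a hypothetical helper, not part of any library:

import numpy as np
from scipy import sparse

def qc_metrics_sparse(X):
    # X: CSR matrix in which missing values are stored as explicit NaNs
    n_obs, n_var = X.shape
    Xc = X.copy()
    nan_mask = np.isnan(Xc.data)
    # count NaNs per row via an indicator matrix with the same sparsity pattern
    indicator = sparse.csr_matrix((nan_mask.astype(np.int64), Xc.indices, Xc.indptr), shape=Xc.shape)
    nan_per_row = np.asarray(indicator.sum(axis=1)).ravel()
    # zero out the NaNs so the row sums behave like np.nansum
    Xc.data[nan_mask] = 0.0
    coverage_cells = n_var - nan_per_row  # features covered per cell
    mean_cell_methylation = np.asarray(Xc.sum(axis=1)).ravel() / n_var
    return coverage_cells, mean_cell_methylation

The per-feature metrics follow the same pattern on `X.T.tocsr()`.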
In [13]:
adata.obs
Out[13]:
 | methylated_CG | mc_class_CG | total_CG | overall_mc_level_CG | sample | sample_id | coverage_cells | mean_cell_methylation
---|---|---|---|---|---|---|---|---
8 | 2773841.0 | 6389935.0 | 10058585.0 | 0.275769 | GSM1370523 | 0 | 4912 | 0.255770 |
9 | 4207956.0 | 9056780.0 | 14749706.0 | 0.285291 | GSM1370524 | 1 | 4912 | 0.267832 |
2 | 1062414.0 | 3024372.0 | 3687362.0 | 0.288123 | GSM1370525 | 2 | 4912 | 0.270414 |
7 | 2174791.0 | 5617340.0 | 7468542.0 | 0.291194 | GSM1370526 | 3 | 4912 | 0.257664 |
6 | 2250723.0 | 5478455.0 | 7718445.0 | 0.291603 | GSM1370527 | 4 | 4912 | 0.256831 |
0 | 676706.0 | 1997630.0 | 2305401.0 | 0.293531 | GSM1370528 | 5 | 4912 | 0.246128 |
3 | 1235199.0 | 3266783.0 | 4241536.0 | 0.291215 | GSM1370529 | 6 | 4912 | 0.247983 |
4 | 1602124.0 | 3903117.0 | 5507096.0 | 0.290920 | GSM1370530 | 7 | 4912 | 0.261821 |
1 | 1229421.0 | 3204150.0 | 4160911.0 | 0.295469 | GSM1370531 | 8 | 4912 | 0.254784 |
5 | 1149598.0 | 2822474.0 | 3733805.0 | 0.307889 | GSM1370532 | 9 | 4912 | 0.257592 |
10 | 1586218.0 | 3869842.0 | 5403788.0 | 0.293538 | GSM1370533 | 10 | 4912 | 0.253991 |
11 | 1948937.0 | 3800961.0 | 5242714.0 | 0.371742 | GSM1370535 | 11 | 4912 | 0.393481 |
12 | 1271969.0 | 2797560.0 | 3506515.0 | 0.362744 | GSM1370536 | 12 | 4912 | 0.346206 |
13 | 2105457.0 | 3372625.0 | 4537703.0 | 0.463992 | GSM1370537 | 13 | 4912 | 0.454491 |
14 | 1521358.0 | 4242155.0 | 5799310.0 | 0.262334 | GSM1370538 | 14 | 4912 | 0.236155 |
15 | 1974452.0 | 3917329.0 | 5546459.0 | 0.355984 | GSM1370539 | 15 | 4912 | 0.337848 |
16 | 372428.0 | 3297855.0 | 4538546.0 | 0.082059 | GSM1370540 | 16 | 4912 | 0.066197 |
17 | 2068775.0 | 4283144.0 | 5851328.0 | 0.353556 | GSM1370541 | 17 | 4912 | 0.276954 |
18 | 730843.0 | 3950391.0 | 5450398.0 | 0.134090 | GSM1370542 | 18 | 4912 | 0.127163 |
19 | 1318842.0 | 4442023.0 | 5843535.0 | 0.225692 | GSM1370543 | 19 | 4912 | 0.214378 |
20 | 1109405.0 | 3710145.0 | 5011395.0 | 0.221376 | GSM1370544 | 20 | 4912 | 0.201409 |
21 | 721897.0 | 3110061.0 | 4463927.0 | 0.161718 | GSM1370545 | 21 | 4912 | 0.148684 |
22 | 1828675.0 | 3532788.0 | 4807341.0 | 0.380392 | GSM1370546 | 22 | 4912 | 0.352224 |
23 | 1472.0 | 2953.0 | 3040.0 | 0.484211 | GSM1370548 | 23 | 4912 | 0.018360 |
24 | 10225.0 | 18155.0 | 18548.0 | 0.551272 | GSM1370549 | 24 | 4912 | 0.118256 |
25 | 2859.0 | 5231.0 | 5319.0 | 0.537507 | GSM1370550 | 25 | 4912 | 0.037595 |
26 | 9729.0 | 17741.0 | 17977.0 | 0.541192 | GSM1370551 | 26 | 4912 | 0.114153 |
27 | 3672.0 | 6739.0 | 6882.0 | 0.533566 | GSM1370552 | 27 | 4912 | 0.053580 |
28 | 8398.0 | 29932.0 | 36007.0 | 0.233232 | GSM1370553 | 28 | 4912 | 0.086694 |
29 | 1598.0 | 4820.0 | 5447.0 | 0.293372 | GSM1370554 | 29 | 4912 | 0.021323 |
30 | 5078262.0 | 4424268.0 | 7002192.0 | 0.725239 | GSM1370555 | 30 | 4912 | 0.633859 |
31 | 2883177.0 | 3262923.0 | 4525444.0 | 0.637104 | GSM1370556 | 31 | 4912 | 0.569044 |
32 | 3010405.0 | 4728963.0 | 7751272.0 | 0.388376 | GSM1370557 | 32 | 4912 | 0.338542 |
33 | 4478594.0 | 4965673.0 | 7571138.0 | 0.591535 | GSM1370558 | 33 | 4912 | 0.570422 |
34 | 3200120.0 | 3848428.0 | 5622442.0 | 0.569169 | GSM1370559 | 34 | 4912 | 0.498544 |
36 | 1814649.0 | 5795602.0 | 8946794.0 | 0.202827 | GSM1370560 | 35 | 4912 | 0.187398 |
35 | 3755105.0 | 3468770.0 | 5304515.0 | 0.707907 | GSM1370561 | 36 | 4912 | 0.629427 |
37 | 3243439.0 | 3557629.0 | 4771783.0 | 0.679712 | GSM1370562 | 37 | 4912 | 0.611443 |
38 | 3826163.0 | 5262688.0 | 7106219.0 | 0.538425 | GSM1370563 | 38 | 4912 | 0.508737 |
39 | 4527218.0 | 4938305.0 | 7210573.0 | 0.627858 | GSM1370564 | 39 | 4912 | 0.596066 |
40 | 3487010.0 | 3698022.0 | 5098903.0 | 0.683875 | GSM1370565 | 40 | 4912 | 0.615996 |
41 | 1670947.0 | 2077704.0 | 2662177.0 | 0.627662 | GSM1370566 | 41 | 4912 | 0.585674 |
42 | 2014271.0 | 3017804.0 | 3828676.0 | 0.526101 | GSM1370567 | 42 | 4912 | 0.468503 |
44 | 6249395.0 | 7502459.0 | 10319408.0 | 0.605596 | GSM1370568 | 43 | 4912 | 0.539304 |
43 | 2641859.0 | 3607443.0 | 4826163.0 | 0.547404 | GSM1370569 | 44 | 4912 | 0.477331 |
45 | 3859515.0 | 5568802.0 | 7575752.0 | 0.509456 | GSM1370570 | 45 | 4912 | 0.469847 |
46 | 4079333.0 | 4145869.0 | 6017116.0 | 0.677955 | GSM1370571 | 46 | 4912 | 0.616277 |
47 | 3410149.0 | 3967729.0 | 5362681.0 | 0.635904 | GSM1370572 | 47 | 4912 | 0.577196 |
48 | 3488749.0 | 4433266.0 | 6109535.0 | 0.571033 | GSM1370573 | 48 | 4912 | 0.552650 |
49 | 3588924.0 | 4523416.0 | 6085020.0 | 0.589797 | GSM1370574 | 49 | 4912 | 0.513039 |
51 | 37221650.0 | 22641666.0 | 53520217.0 | 0.695469 | GSM1370575 | 50 | 4912 | 0.581207 |
50 | 5484605.0 | 9727286.0 | 19593619.0 | 0.279918 | GSM1413827 | 51 | 4912 | 0.257857 |
In [14]:
import scvelo as scv
data = adata.X.toarray()
data
Out[14]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
In [24]:
adata.var
Out[24]:
 | feature_name | coverage_feature | mean_feature_methylation
---|---|---|---
0 | chr1_0_100000 | 52 | 0.0 |
1 | chr1_100000_200000 | 52 | 0.0 |
2 | chr1_200000_300000 | 52 | 0.0 |
3 | chr1_300000_400000 | 52 | 0.0 |
4 | chr1_400000_500000 | 52 | 0.0 |
... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 |
4908 | chr2_241800000_241900000 | 52 | 0.0 |
4909 | chr2_241900000_242000000 | 52 | 0.0 |
4910 | chr2_242000000_242100000 | 52 | 0.0 |
4911 | chr2_242100000_242193529 | 52 | 0.0 |
4912 rows × 3 columns
In [20]:
adata.X.T
Out[20]:
<4912x52 sparse matrix of type '<class 'numpy.float64'>' with 168981 stored elements in Compressed Sparse Column format>
In [10]:
import matplotlib.pyplot as plt
In [36]:
def plot_hist(adata, term, title, y_label, x_label):
    # create a new figure
    fig, ax = plt.subplots()
    # histogram of the chosen per-cell metric
    # plt.axvline(x=10000, color='r')  # minimum number of features covered per cell
    plt.hist(adata.obs[term], alpha=0.4)
    # adding horizontal grid lines
    ax.yaxis.grid(True)
    # remove axis spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)
    # labels
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    return fig
In [2]:
!pip install --upgrade scbs
Collecting scbs
  Downloading scbs-0.6.2-py3-none-any.whl (61 kB)
Collecting click<8.1,>=7.1.2 (from scbs)
  Downloading click-8.0.4-py3-none-any.whl (97 kB)
Collecting click-help-colors<1,>=0.9 (from scbs)
  Downloading click_help_colors-0.9.2-py3-none-any.whl (5.5 kB)
Collecting pandas<2,>=1.2.3 (from scbs)
  Downloading pandas-1.5.3-cp39-cp39-win_amd64.whl (10.9 MB)
Requirement already satisfied: colorama<1,>=0.3.9 in d:\conda\envs\py39\lib\site-packages (from scbs) (0.4.6)
Requirement already satisfied: numba<1,>=0.53.0 in d:\conda\envs\py39\lib\site-packages (from scbs) (0.57.1)
Requirement already satisfied: numpy<2,>=1.20.1 in d:\conda\envs\py39\lib\site-packages (from scbs) (1.24.4)
Requirement already satisfied: scipy<2,>=1.6.1 in d:\conda\envs\py39\lib\site-packages (from scbs) (1.11.2)
Requirement already satisfied: statsmodels<1,>=0.12.2 in d:\conda\envs\py39\lib\site-packages (from scbs) (0.14.0)
Installing collected packages: click, pandas, click-help-colors, scbs
  Attempting uninstall: click
    Found existing installation: click 8.1.7
    Successfully uninstalled click-8.1.7
  Attempting uninstall: pandas
    Found existing installation: pandas 2.0.3
    Successfully uninstalled pandas-2.0.3
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied.: 'D:\\conda\\envs\\py39\\Lib\\site-packages\\~andas\\_libs\\algos.cp39-win_amd64.pyd' Consider using the `--user` option or check the permissions.
In [12]:
# pull the global methylation level and CpG count columns from obs
y_values = adata.obs['total_CG']
x_values = adata.obs['overall_mc_level_CG']
# scatter plot; alpha sets the point transparency, adjust as needed
plt.scatter(x_values, y_values, alpha=0.5)
# labels
plt.xlabel('Global DNA methylation %')
plt.ylabel('Observed CpG Sites')
# show the figure
plt.show()
In [67]:
def plot_hist_cutoff(adata,name='',bins=100,xlim_quantile=(0.001, 0.999)):
fig, ax1 = plt.subplots()
# Create histogram
data = adata.obs['total_CG']
ax1 = plt.hist(data, bins=bins, color='blue', alpha=0.7)
ax1.set_xlabel(name)
ax1.set_ylabel('Count', color='blue')
# # Calculate the percentage of data that meets the cutoff value
# total_data = data.size
# print(data[total_data])
# x = np.linspace(data[0], data[total_data], bins)
# passed_data = np.array([(data > i).sum() for i in x])
# percentage_passed = (passed_data / total_data) * 100
# print (percentage_passed)
xlim = tuple(np.quantile(data, xlim_quantile))
x = np.linspace(xlim[0], xlim[1], 500)
count_list = np.array([(data > i).sum() for i in x])
original_total_data = data.size
count_list = count_list / original_total_data * 100
data = data[(data < xlim[1]) & (data > xlim[0])]
# Plot a dashed line for the percentage
ax2 = ax1.twinx()
ax2.axhline(y=percentage_passed, color='red', linestyle='--')
ax2.set_ylabel('Percentage', color='red')
ax2.set_ylim(0, 100)
plt.title(f'Data Distribution with Cutoff at {cutoff_value}')
plt.legend(loc='upper right')
plt.show()
In [68]:
plot_hist_cutoff(adata)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[68], line 1
----> 1 plot_hist_cutoff(adata)

Cell In[67], line 6, in plot_hist_cutoff(adata, name, bins, xlim_quantile)
      4 data = adata.obs['total_CG']
      5 ax1 = plt.hist(data, bins=bins, color='blue', alpha=0.7)
----> 6 ax1.set_xlabel(name)

AttributeError: 'tuple' object has no attribute 'set_xlabel'
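`plt.hist` returns a `(counts, bin_edges, patches)` tuple rather than an `Axes`, which is what the traceback is pointing at. A minimal sketch of the fix, keeping the `Axes` created by `plt.subplots` instead of overwriting it:

fig, ax1 = plt.subplots()
counts, bin_edges, patches = ax1.hist(adata.obs['total_CG'], bins=100, color='blue', alpha=0.7)
ax1.set_xlabel('total_CG')
ax1.set_ylabel('Count', color='blue')

The reworked version of `plot_hist_cutoff` further below sidesteps this by plotting through the `plt` state machine instead.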
In [24]:
#plot_hist(adata,'mean_cell_methylation','global mean methylation level','count','methylation level')
plot_hist(adata,'mc_class_CG','CG Count','count','count')
In [39]:
def plot_cell(adata):
    # create a 2x2 grid layout
    fig, axes = plt.subplots(2, 2, figsize=(10, 8))
    # call plot_hist four times to get the individual figures
    fig1 = plot_hist(adata, 'mean_cell_methylation', 'global mean methylation level', 'count', 'methylation level')
    fig2 = plot_hist(adata, 'mc_class_CG', 'CG Count', 'count', 'count')
    fig3 = plot_hist(adata, 'mc_class_CG', 'CG Count', 'count', 'count')
    fig4 = plot_hist(adata, 'mc_class_CG', 'CG Count', 'count', 'count')
    # place the four figures into the subplots
    axes[0, 0].imshow(fig1)
    axes[0, 1].imshow(fig2)
    axes[1, 0].imshow(fig3)
    axes[1, 1].imshow(fig4)
    # adjust the spacing between subplots
    plt.tight_layout()
    # show the figure
    plt.show()
In [40]:
plot_cell(adata)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[40], line 1
----> 1 plot_cell(adata)

Cell In[39], line 12, in plot_cell(adata)
     11 # place the four figures into the subplots
---> 12 axes[0, 0].imshow(fig1)

File D:\conda\envs\py39\lib\site-packages\matplotlib\image.py:701, in _ImageBase.set_data(self, A)
--> 701     raise TypeError("Image data of dtype {} cannot be converted to "
    702                     "float".format(self._A.dtype))

TypeError: Image data of dtype object cannot be converted to float
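`imshow` expects image data, not a `Figure` object, so the returned figures cannot be placed into the grid this way. One workable pattern is to let the plotting helper draw onto a caller-supplied `Axes`; `plot_hist_on_ax` below is a hypothetical variant of `plot_hist`, sketched under that assumption:

def plot_hist_on_ax(ax, adata, term, title, y_label, x_label):
    # draw the histogram directly onto the given Axes
    ax.hist(adata.obs[term], alpha=0.4)
    ax.yaxis.grid(True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

fig, axes = plt.subplots(2, 2, figsize=(10, 8))
plot_hist_on_ax(axes[0, 0], adata, 'mean_cell_methylation', 'global mean methylation level', 'count', 'methylation level')
plot_hist_on_ax(axes[0, 1], adata, 'mc_class_CG', 'CG Count', 'count', 'count')
plot_hist_on_ax(axes[1, 0], adata, 'mc_class_CG', 'CG Count', 'count', 'count')
plot_hist_on_ax(axes[1, 1], adata, 'mc_class_CG', 'CG Count', 'count', 'count')
plt.tight_layout()
plt.show()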
In [69]:
import matplotlib.pyplot as plt
import numpy as np
# create some example data (random here; replace with your actual data)
data = np.random.randn(1000)
# draw the histogram
plt.hist(data, bins=30, alpha=0.5, color='b', density=True, label='data distribution')
# compute percentiles of the data
percentiles = [25, 50, 75]  # customize the percentiles as needed
percentile_values = np.percentile(data, percentiles)
# add dashed lines and percentile labels
for percentile, value in zip(percentiles, percentile_values):
    plt.axvline(value, color='r', linestyle='--', label=f'{percentile}th Percentile: {value:.2f}')
# add a percentage y-axis on the right
plt.twinx()
percentile_labels = [f'{percentile}%' for percentile in percentiles]
percentile_positions = np.percentile(data, percentiles)
plt.yticks(percentile_positions, percentile_labels)
plt.ylabel('percentage')
# legend
plt.legend(loc='upper left')
# show the figure
plt.show()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
In [93]:
def plot_hist_cutoff(adata, name='', bins=100, xlim_quantile=(0.001, 0.999)):
    # the QC metric to plot (replace with your actual data)
    data = adata.obs['total_CG']
    # draw the histogram
    n, bins, patches = plt.hist(data, bins=30, alpha=0.5, color='b', density=False, label='data distribution')
    # fraction of cells retained at each cutoff value
    cutoffs = np.sort(data)
    percentiles = 1 - (np.arange(1, len(cutoffs) + 1) / len(cutoffs))
    # add a percentage y-axis on the right
    ax2 = plt.twinx()
    plt.ylabel('cell passed %')
    # plot the retained-percentage curve
    ax2 = plt.plot(cutoffs, percentiles, 'r--', label='cell passed percent')
    # legend
    plt.legend(loc='upper right')
    # show the figure
    plt.show()
In [94]:
plot_hist_cutoff(adata)
In [98]:
def plot_stat(adata):
nrows=2
ncols=2
dpi=300
fig, axes = plt.subplots(figsize=(ncols * 3, nrows * 3),
nrows=nrows,
ncols=ncols,
dpi=300)
x = np.linspace(0, 2*np.pi, 100)
ax1 = axes[0, 0]
ax2 = axes[0, 1]
ax3 = axes[1, 1]
ax4 = axes[1, 0]
ax1.plot(plot_hist_cutoff(adata))
ax2.plot(plot_hist_cutoff(adata))
ax3.plot(plot_hist_cutoff(adata))
ax4.plot(plot_hist(adata,'mc_class_CG','CG Count','count','count'))
plt.show()
In [99]:
plot_stat(adata)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[99], line 1
----> 1 plot_stat(adata)

Cell In[98], line 14, in plot_stat(adata)
     13 ax4 = axes[1, 0]
---> 14 ax1.plot(plot_hist_cutoff(adata))

File D:\conda\envs\py39\lib\site-packages\matplotlib\axes\_base.py:465, in _process_plot_var_args._plot_args(self, tup, kwargs, return_kwargs, ambiguous_fmt_datakey)
--> 465     raise ValueError("x, y, and format string must not be None")

ValueError: x, y, and format string must not be None
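`plot_hist_cutoff` draws and shows its own figure and returns `None`, so `ax1.plot(plot_hist_cutoff(adata))` ends up passing `None` to `plot`. For this grid approach to work, the helpers would need to either return plottable data or draw directly onto the `Axes` they are given, as in the `plot_hist_on_ax` sketch above.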
In [103]:
cdata = ad.read("D://Test/scMeth/enhancer_c1_CG_paper.h5ad")
In [110]:
length1 = len(cdata.X[0, :])  # 55017 features (columns)
length2 = len(cdata.X[:, 0])  # 3379 samples (rows)
cdata.var['coverage_cells'] = [length2 - np.isnan(line).sum() for line in cdata.X.T]  # number of samples covering each feature
cdata.var['mean_feature_methylation'] = [np.nansum(line) / length2 for line in cdata.X.T]
In [121]:
# the per-feature coverage to plot (replace with your actual data)
data = cdata.var['coverage_cells']
ax = plt.subplot()
# draw the histogram
n, bins, patches = plt.hist(data, bins=30, alpha=0.5, color='b', density=False, label='data distribution')
# fraction of features retained at each cutoff value
cutoffs = np.sort(data)
percentiles = 1 - np.arange(1, len(cutoffs) + 1) / len(cutoffs)
# convert the fractions to percentages rounded to two decimals
percentiles_decimal = [round(100 * p, 2) for p in percentiles]
# add a percentage y-axis on the right
ax2 = plt.twinx()
# adding horizontal grid lines
ax2.yaxis.grid(True)
plt.ylabel('cell passed %')
# plot the retained-percentage curve
ax2 = plt.plot(cutoffs, percentiles_decimal, 'r--', label='cell passed percent')
# legend
plt.legend(loc='upper right')
# show the figure
plt.show()
In [111]:
cdata.var
Out[111]:
 | coverage_cells | mean_feature_methylation
---|---|---
0_enhancer_10000235_10002235 | 1030 | 0.111873 |
1_enhancer_10000473_10002473 | 602 | 0.074983 |
2_enhancer_100010706_100012706 | 664 | 0.085918 |
3_enhancer_100019316_100021316 | 816 | 0.126313 |
4_enhancer_100024481_100026481 | 651 | 0.112446 |
... | ... | ... |
55012_enhancer_99982269_99984269 | 403 | 0.045218 |
55013_enhancer_99982881_99984881 | 661 | 0.103071 |
55014_enhancer_99983956_99985956 | 988 | 0.235307 |
55015_enhancer_99990006_99992006 | 383 | 0.057098 |
55016_enhancer_99995790_99997790 | 629 | 0.085066 |
55017 rows × 2 columns
In [122]:
adata.var
Out[122]:
 | feature_name | coverage_feature | mean_feature_methylation
---|---|---|---
0 | chr1_0_100000 | 52 | 0.0 |
1 | chr1_100000_200000 | 52 | 0.0 |
2 | chr1_200000_300000 | 52 | 0.0 |
3 | chr1_300000_400000 | 52 | 0.0 |
4 | chr1_400000_500000 | 52 | 0.0 |
... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 |
4908 | chr2_241800000_241900000 | 52 | 0.0 |
4909 | chr2_241900000_242000000 | 52 | 0.0 |
4910 | chr2_242000000_242100000 | 52 | 0.0 |
4911 | chr2_242100000_242193529 | 52 | 0.0 |
4912 rows × 3 columns
In [123]:
# the per-feature mean methylation to plot (replace with your actual data)
data = adata.var['mean_feature_methylation']
ax = plt.subplot()
# draw the histogram
n, bins, patches = plt.hist(data, bins=30, alpha=0.5, color='b', density=False, label='data distribution')
# fraction of features retained at each cutoff value
cutoffs = np.sort(data)
percentiles = 1 - np.arange(1, len(cutoffs) + 1) / len(cutoffs)
# convert the fractions to percentages rounded to two decimals
percentiles_decimal = [round(100 * p, 2) for p in percentiles]
# add a percentage y-axis on the right
ax2 = plt.twinx()
# adding horizontal grid lines
ax2.yaxis.grid(True)
plt.ylabel('cell passed %')
# plot the retained-percentage curve
ax2 = plt.plot(cutoffs, percentiles_decimal, 'r--', label='cell passed percent')
# legend
plt.legend(loc='upper right')
# show the figure
plt.show()
In [124]:
cdata
Out[124]:
AnnData object with n_obs × n_vars = 3379 × 55017
    obs: 'cell_type', 'Library pool', 'Index i5', 'Index i7', 'i5 sequence', 'i7 sequence', 'random primer index', 'random primer index sequence', 'Total reads', 'Mapped reads', 'Filtered reads', 'mCCC/CCC', 'mCG/CG', 'mCH/CH', 'Estimated mCG/CG', 'Estimated mCH/CH', 'Neuron type', 'tSNE x coordinate', 'tSNE y coordinate\n', 'binary'
    var: 'coverage_cells', 'mean_feature_methylation'
    uns: 'imputation', 'omic'
In [ ]:
def _find_range_features(
    adata,
    feature_count,
    filter_lower_quantile,
    filter_upper_quantile,
    total_features,
) -> np.ndarray:
    # sort features by coverage count and drop features with zero coverage
    idx = np.argsort(feature_count)
    for i in range(idx.size):
        if feature_count[idx[i]] > 0:
            break
    idx = idx[i:]
    # trim the lower and upper quantiles, then keep the most-covered features
    n = idx.size
    n_lower = int(filter_lower_quantile * n)
    n_upper = int(filter_upper_quantile * n)
    idx = idx[n_lower:n - n_upper]
    return idx[::-1][:total_features]
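A usage sketch for the cell above, assuming the per-feature coverage computed earlier serves as `feature_count`; the quantiles and feature count here are illustrative, not values from the original analysis:

# trim the 5% least- and most-covered features, then keep at most 2000
keep_idx = _find_range_features(
    adata,
    feature_count=adata.var['coverage_feature'].to_numpy(),
    filter_lower_quantile=0.05,
    filter_upper_quantile=0.05,
    total_features=2000,
)
adata_filtered = adata[:, np.sort(keep_idx)].copy()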
In [126]:
adata.var.shape
Out[126]:
(4912, 3)
In [128]:
obs_dim, var_dim = adata.X.shape
In [129]:
# mean
mean = adata.X.mean(dim=obs_dim).load()
# var
# enforce R convention (unbiased estimator) for variance
mean_sq = (x * x).mean(dim=obs_dim).load()
var = (mean_sq - mean ** 2) * (x.sizes[obs_dim] / (x.sizes[obs_dim] - 1))
# now actually compute the dispersion
mean.where(mean > 1e-12, 1e-12) # set entries equal to zero to small value
# raw dispersion is the variance normalized by mean
dispersion = var / mean
return mean, dispersion
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[129], line 2
      1 # mean
----> 2 mean = adata.X.mean(dim=obs_dim).load()

TypeError: mean() got an unexpected keyword argument 'dim'
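`mean(dim=...)` and `.load()` belong to the xarray `DataArray` API; a scipy sparse matrix takes `axis=` instead and returns a `np.matrix`, which `.A1` flattens. A minimal sparse equivalent of the intended computation, using the same calls as the cells below:

mean = adata.X.mean(axis=0).A1              # per-feature mean
mean_sq = adata.X.power(2).mean(axis=0).A1  # per-feature mean of squares
n = adata.X.shape[0]
var = (mean_sq - mean**2) * (n / (n - 1))   # unbiased variance (R convention)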
In [130]:
adata.X.mean()
Out[130]:
0.34296776073338897
In [135]:
xdata.X
Out[135]:
array([[1, 4, 7],
       [2, 5, 8],
       [3, 6, 9]], dtype=int64)
In [139]:
adata.var
Out[139]:
 | feature_name | coverage_feature | mean_feature_methylation
---|---|---|---
0 | chr1_0_100000 | 52 | 0.0 |
1 | chr1_100000_200000 | 52 | 0.0 |
2 | chr1_200000_300000 | 52 | 0.0 |
3 | chr1_300000_400000 | 52 | 0.0 |
4 | chr1_400000_500000 | 52 | 0.0 |
... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 |
4908 | chr2_241800000_241900000 | 52 | 0.0 |
4909 | chr2_241900000_242000000 | 52 | 0.0 |
4910 | chr2_242000000_242100000 | 52 | 0.0 |
4911 | chr2_242100000_242193529 | 52 | 0.0 |
4912 rows × 3 columns
In [140]:
mean_sq = (adata.X.power(2).mean(axis=0)).A1
In [141]:
mean_sq
Out[141]:
array([0., 0., 0., ..., 0., 0., 0.])
In [142]:
mean = adata.X.mean(axis=0).A1
In [143]:
mean
Out[143]:
array([0., 0., 0., ..., 0., 0., 0.])
In [145]:
# number of samples and features
n_samples = adata.X.shape[0]
n_features = adata.X.shape[1]
In [146]:
# unbiased variance: (mean_sq - mean**2) * (n_samples / (n_samples - 1))
var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
In [150]:
# store the results in adata.var
adata.var['mean_sq'] = mean_sq
adata.var['variance'] = var
In [151]:
adata.var.variance
Out[151]:
0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ...
4907    0.0
4908    0.0
4909    0.0
4910    0.0
4911    0.0
Name: variance, Length: 4912, dtype: float64
In [188]:
if sparse.issparse(adata.X):
    # mean of squares, (x * x).mean()
    mean_sq = (adata.X.power(2).mean(axis=0)).A1
    # mean
    mean = adata.X.mean(axis=0).A1
    # number of samples and features
    n_samples = adata.X.shape[0]
    n_features = adata.X.shape[1]
    # unbiased variance: (mean_sq - mean**2) * (n_samples / (n_samples - 1))
    var = (mean_sq - mean**2) * (n_samples / (n_samples - 1))
    # clip means below the threshold to 1e-12 to avoid division by zero
    threshold = 1e-12
    mean[mean < threshold] = threshold
    # dispersion is the variance normalized by the mean
    dispersion = var / mean
    # store the results in adata.var
    adata.var['dispersion'] = dispersion
    adata.var['dispersion_log'] = np.log(dispersion)
    # normalize the dispersion by the mean methylation and mean coverage
    mean_mean = adata.var['mean_feature_methylation'].mean()
    mean_coverage = adata.var['coverage_feature'].mean()
    adata.var['NormalizedDispersion'] = adata.var['dispersion'] / (mean_mean * mean_coverage)
C:\Users\zong\AppData\Local\Temp\ipykernel_7604\4287774659.py:23: RuntimeWarning: divide by zero encountered in log adata.var['dispersion_log'] = np.log(dispersion)
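The RuntimeWarning comes from taking `np.log` of features whose dispersion is exactly zero, which yields `-inf` and later breaks `hist2d` (see below). A small guard, assuming it is acceptable to mark zero-dispersion features as missing rather than `-inf`:

# replace zero dispersions with NaN before the log so downstream plots can drop them
dispersion_safe = np.where(dispersion > 0, dispersion, np.nan)
adata.var['dispersion_log'] = np.log(dispersion_safe)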
In [191]:
adata.var['NormalizedDispersion']
Out[191]:
0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ...
4907    0.0
4908    0.0
4909    0.0
4910    0.0
4911    0.0
Name: NormalizedDispersion, Length: 4912, dtype: float64
In [158]:
type(adata.var['dispersion'])
Out[158]:
pandas.core.series.Series
In [189]:
adata.var
Out[189]:
 | feature_name | coverage_feature | mean_feature_methylation | mean_sq | var | variance | dispersion | dispersion_log | NormalizedCoverage | ResidualDispersion | NormalizedDispersion
---|---|---|---|---|---|---|---|---|---|---|---
0 | chr1_0_100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
1 | chr1_100000_200000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
2 | chr1_200000_300000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
3 | chr1_300000_400000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4 | chr1_400000_500000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4908 | chr2_241800000_241900000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4909 | chr2_241900000_242000000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4910 | chr2_242000000_242100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4911 | chr2_242100000_242193529 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 |
4912 rows × 11 columns
In [164]:
# the per-feature coverage to plot (replace with your actual data)
data = adata.var['coverage_feature']
ax = plt.subplot()
# draw the histogram
n, bins, patches = plt.hist(data, bins=30, alpha=0.5, color='b', density=False, label='data distribution')
# fraction of features retained at each cutoff value
cutoffs = np.sort(data)
percentiles = 1 - np.arange(1, len(cutoffs) + 1) / len(cutoffs)
# convert the fractions to percentages rounded to two decimals
percentiles_decimal = [round(100 * p, 2) for p in percentiles]
# add a percentage y-axis on the right
ax2 = plt.twinx()
# adding horizontal grid lines
ax2.yaxis.grid(True)
plt.ylabel('cell passed %')
# plot the retained-percentage curve
ax2 = plt.plot(cutoffs, percentiles_decimal, 'r--', label='cell passed percent')
# legend
plt.legend(loc='upper right')
# show the figure
plt.show()
In [166]:
# per-feature mean methylation and dispersion from var
x_values = adata.var['mean_feature_methylation']
y_values = adata.var['dispersion']
# scatter plot; alpha sets the point transparency, adjust as needed
plt.scatter(x_values, y_values, alpha=0.5)
# labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion')
# show the figure
plt.show()
In [168]:
# per-feature mean methylation and log dispersion from var
x_values = adata.var['mean_feature_methylation']
y_values = adata.var['dispersion_log']
# scatter plot; alpha sets the point transparency, adjust as needed
plt.scatter(x_values, y_values, alpha=0.5)
# labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_log')
# show the figure
plt.show()
In [190]:
# per-feature mean methylation and normalized dispersion from var
x_values = adata.var['mean_feature_methylation']
y_values = adata.var['NormalizedDispersion']
# scatter plot; alpha sets the point transparency, adjust as needed
plt.scatter(x_values, y_values, alpha=0.5)
# labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('NormalizedDispersion')
# show the figure
plt.show()
In [169]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
# generate random example data
np.random.seed(0)
mean_values = np.random.rand(100)
coverage_values = np.random.randint(1, 10, 100)
true_dispersion = 0.5 * mean_values + 0.2 * coverage_values + np.random.normal(0, 0.1, 100)
# build the design matrix X with mean and coverage as covariates
X = np.column_stack((mean_values, coverage_values))
X = sm.add_constant(X)  # add an intercept term
# fit a linear regression of dispersion on mean and coverage
model = sm.OLS(true_dispersion, X)
results = model.fit()
# residuals of the fit
residuals = results.resid
# corrected dispersion is the original minus the residuals (i.e., the fitted values)
corrected_dispersion = true_dispersion - residuals
# plot original vs corrected dispersion
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.scatter(mean_values, true_dispersion, label='original dispersion')
plt.xlabel('mean')
plt.ylabel('dispersion')
plt.legend()
plt.subplot(1, 2, 2)
plt.scatter(mean_values, corrected_dispersion, label='corrected dispersion')
plt.xlabel('mean')
plt.ylabel('dispersion')
plt.legend()
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[169], line 2
      1 import numpy as np
----> 2 import statsmodels.api as sm

File D:\conda\envs\py39\lib\site-packages\scipy\signal\signaltools.py:12
---> 12 from scipy._lib._util import prod as _prod

ImportError: cannot import name 'prod' from 'scipy._lib._util' (D:\conda\envs\py39\lib\site-packages\scipy\_lib\_util.py)
In [173]:
# per-feature mean methylation and log dispersion from var
x_values = adata.var['mean_feature_methylation']
y_values = adata.var['dispersion_log']
# scatter plot
plt.figure(figsize=(10, 5))  # figure size
plt.subplot(1, 2, 1)  # first subplot
plt.scatter(x_values, y_values, alpha=0.5)  # scatter points
# add a linear regression line
polyfit = np.polyfit(x_values, y_values, 1)
plt.plot(x_values, np.polyval(polyfit, x_values), color='red')
# labels and title
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_log')
plt.title('Scatter Plot with Linear Fit')
# second subplot: 2D histogram of the point density
plt.subplot(1, 2, 2)
plt.hist2d(x_values, y_values, bins=(50, 50), cmap=plt.cm.Blues)
# labels and title
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_log')
plt.title('Heatmap of Point Distribution')
# adjust the spacing between subplots
plt.tight_layout()
# show the figure
plt.colorbar()
plt.show()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[173], line 21
     19 # create the second subplot for the heatmap
     20 plt.subplot(1, 2, 2)
---> 21 plt.hist2d(x_values, y_values, bins=(50, 50), cmap=plt.cm.Blues)
...
File D:\conda\envs\py39\lib\site-packages\numpy\lib\histograms.py:323, in _get_outer_edges(a, range)
--> 323 raise ValueError(
    324     "autodetected range of [{}, {}] is not finite".format(first_edge, last_edge))
ValueError: autodetected range of [-inf, -0.12516314295400616] is not finite
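The `-inf` values come from `dispersion_log`, which is `log(0)` for the many zero-dispersion features, so `np.histogram2d` cannot autodetect a finite range. A minimal guard, assuming `x_values` and `y_values` are the two arrays passed to `hist2d` above: drop non-finite pairs before binning.

# keep only pairs where both coordinates are finite (no NaN/inf)
finite = np.isfinite(np.asarray(x_values)) & np.isfinite(np.asarray(y_values))
plt.hist2d(np.asarray(x_values)[finite], np.asarray(y_values)[finite],
           bins=(50, 50), cmap=plt.cm.Blues)
plt.colorbar()
plt.show()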
In [174]:
df = adata.var
min_coverage = df['coverage_feature'].min()
max_coverage = df['coverage_feature'].max()
df['NormalizedCoverage'] = (df['coverage_feature'] - min_coverage) / (max_coverage - min_coverage)
# compute a residual dispersion score
df['ResidualDispersion'] = df['mean_feature_methylation'].var() / (1 - df['NormalizedCoverage'])
# set the threshold for highly variable features
threshold = 0.1
# pick the highly variable features
highly_variable_features = df[df['ResidualDispersion'] > threshold]['feature_name']
print("Highly variable features:")
print(highly_variable_features)
Highly variable features: Series([], Name: feature_name, dtype: object)
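The returned Series is empty because every feature has `coverage_feature == 52` (see the `df` display in Out[175] below), so `max_coverage - min_coverage` is zero, `NormalizedCoverage` becomes NaN everywhere, and `NaN > threshold` is always False. A hedged guard for this degenerate case:

coverage_range = max_coverage - min_coverage
if coverage_range == 0:
    # all features share the same coverage; nothing to normalize
    df['NormalizedCoverage'] = 0.0
else:
    df['NormalizedCoverage'] = (df['coverage_feature'] - min_coverage) / coverage_range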
In [175]:
df
Out[175]:
feature_name | coverage_feature | mean_feature_methylation | mean_sq | var | variance | dispersion | dispersion_log | NormalizedCoverage | ResidualDispersion | |
---|---|---|---|---|---|---|---|---|---|---|
0 | chr1_0_100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
1 | chr1_100000_200000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
2 | chr1_200000_300000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
3 | chr1_300000_400000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4 | chr1_400000_500000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4908 | chr2_241800000_241900000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4909 | chr2_241900000_242000000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4910 | chr2_242000000_242100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4911 | chr2_242100000_242193529 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN |
4912 rows × 10 columns
In [183]:
# Sample data with feature name, mean, and number of samples
data = {
    'FeatureName': ['Feature1', 'Feature2', 'Feature3'],
    'MeanValue': [0.5, 0.6, 0.7],
    'NumSamples': [100, 200, 50]
}
df = pd.DataFrame(data)
# Calculate variance (var) using the updated formula
n_samples = df['NumSamples']
mean = df['MeanValue']
var = (mean ** 2) * (n_samples / (n_samples - 1))
# Set a threshold for mean values below which they will be replaced with 1e-12
threshold = 1e-12
mean[mean < threshold] = threshold
# Calculate dispersion using the updated formula
dispersion = var / mean
# Store the computed dispersion values in the DataFrame
df['Dispersion'] = dispersion
# Calculate mean and coverage
mean_mean = df['MeanValue'].mean()
mean_coverage = df['NumSamples'].mean()
# Normalize dispersion by both mean and coverage
df['NormalizedDispersion'] = df['Dispersion'] / (mean_mean * mean_coverage)
# Display the DataFrame with calculated dispersion
print(df)
FeatureName MeanValue NumSamples Dispersion NormalizedDispersion 0 Feature1 0.5 100 0.505051 0.007215 1 Feature2 0.6 200 0.603015 0.008615 2 Feature3 0.7 50 0.714286 0.010204
C:\Users\zong\AppData\Local\Temp\ipykernel_7604\256784523.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy mean[mean < threshold] = threshold
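The warning is raised because `mean` is a view of `df['MeanValue']`, so the in-place assignment may not propagate back to the DataFrame. A minimal sketch of the usual fix, working on an explicit copy and using `clip`:

mean = df['MeanValue'].copy()
mean = mean.clip(lower=threshold)  # same effect as mean[mean < threshold] = threshold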
In [179]:
df
Out[179]:
FeatureName | MeanValue | Coverage | Dispersion | NormalizedDispersion | |
---|---|---|---|---|---|
0 | Feature1 | 0.1 | 100 | 0.063333 | 0.001481 |
1 | Feature2 | 0.6 | 200 | 0.063333 | 0.001481 |
2 | Feature3 | 0.4 | 50 | 0.063333 | 0.001481 |
In [185]:
# extract the mean and dispersion columns
x_values = df['MeanValue']
y_values = df['Dispersion']
# create the scatter plot; alpha controls point transparency
plt.scatter(x_values, y_values, alpha=0.5)
# add axis labels
plt.xlabel('MeanValue')
plt.ylabel('Dispersion')
# display the figure
plt.show()
In [186]:
# extract the mean and normalized dispersion columns
x_values = df['MeanValue']
y_values = df['NormalizedDispersion']
# create the scatter plot; alpha controls point transparency
plt.scatter(x_values, y_values, alpha=0.5)
# add axis labels
plt.xlabel('MeanValue')
plt.ylabel('NormalizedDispersion')
# display the figure
plt.show()
In [195]:
df = adata.var
mean_binsize = 0.05
cov_binsize = 10
bin_min_features = 5
# instead of n_bins, use bin_size, because cov and mc are on different scales
df['mean_bin'] = (df['mean_feature_methylation'] / mean_binsize).astype(int)
df['cov_bin'] = (df['coverage_feature'] / cov_binsize).astype(int)
# save the bin_count df; gather bins with more than bin_min_features features
bin_count = df.groupby(['mean_bin', 'cov_bin']) \
    .apply(lambda i: i.shape[0]) \
    .reset_index() \
    .sort_values(0, ascending=False)
bin_count.head()
bin_more_than = bin_count[bin_count[0] > bin_min_features]
if bin_more_than.shape[0] == 0:
    raise ValueError(f'No bin has more than {bin_min_features} features; use a larger bin size.')
# reassigning sparse bins usually doesn't change much (a few hundred features),
# but the scatter plot will look more natural
index_map = {}
for _, (mean_id, cov_id, count) in bin_count.iterrows():
    if count > 1:
        index_map[(mean_id, cov_id)] = (mean_id, cov_id)
    manhattan_dist = (bin_more_than['mean_bin'] - mean_id).abs() + (bin_more_than['cov_bin'] - cov_id).abs()
    closest_more_than = manhattan_dist.sort_values().index[0]
    closest = bin_more_than.loc[closest_more_than]
    index_map[(mean_id, cov_id)] = tuple(closest.tolist()[:2])
# apply index_map to the original df
raw_bin = df[['mean_bin', 'cov_bin']].set_index(['mean_bin', 'cov_bin'])
raw_bin['use_mean'] = pd.Series(index_map).apply(lambda i: i[0])
raw_bin['use_cov'] = pd.Series(index_map).apply(lambda i: i[1])
df['mean_bin'] = raw_bin['use_mean'].values
df['cov_bin'] = raw_bin['use_cov'].values
# calculate per-bin mean and std; disp_std_bin should not contain NAs now
disp_grouped = df.groupby(['mean_bin', 'cov_bin'])['dispersion']
disp_mean_bin = disp_grouped.mean()
disp_std_bin = disp_grouped.std(ddof=1)
# actually do the normalization (z-score of dispersion within each bin)
_mean_norm = disp_mean_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
_std_norm = disp_std_bin.loc[list(zip(df['mean_bin'], df['cov_bin']))]
df['dispersion_norm'] = (df['dispersion'].values  # use values here as the indexes differ
                         - _mean_norm.values) / _std_norm.values
dispersion_norm = df['dispersion_norm'].values.astype('float32')
In [196]:
df
Out[196]:
feature_name | coverage_feature | mean_feature_methylation | mean_sq | var | variance | dispersion | dispersion_log | NormalizedCoverage | ResidualDispersion | NormalizedDispersion | mean_bin | cov_bin | dispersion_norm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr1_0_100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
1 | chr1_100000_200000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
2 | chr1_200000_300000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
3 | chr1_300000_400000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4 | chr1_400000_500000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4908 | chr2_241800000_241900000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4909 | chr2_241900000_242000000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4910 | chr2_242000000_242100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4911 | chr2_242100000_242193529 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 |
4912 rows × 14 columns
In [266]:
feature_subset = df.index.isin(df.sort_values('dispersion_norm', ascending=False).index[:1000])
In [199]:
feature_subset
Out[199]:
array([False, False, False, ..., False, False, False])
In [267]:
df['feature_select'] = feature_subset
In [201]:
df
Out[201]:
feature_name | coverage_feature | mean_feature_methylation | mean_sq | var | variance | dispersion | dispersion_log | NormalizedCoverage | ResidualDispersion | NormalizedDispersion | mean_bin | cov_bin | dispersion_norm | feature_select | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | chr1_0_100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
1 | chr1_100000_200000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
2 | chr1_200000_300000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
3 | chr1_300000_400000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4 | chr1_400000_500000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4907 | chr2_241700000_241800000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4908 | chr2_241800000_241900000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4909 | chr2_241900000_242000000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4910 | chr2_242000000_242100000 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4911 | chr2_242100000_242193529 | 52 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -inf | NaN | NaN | 0.0 | 0 | 5 | -0.031001 | False |
4912 rows × 15 columns
In [203]:
feature_subset.size
Out[203]:
4912
In [206]:
df['feature_select'].sum()
Out[206]:
100
In [207]:
# extract the mean methylation and normalized dispersion columns
x_values = df['mean_feature_methylation']
y_values = df['dispersion_norm']
# create the scatter plot; alpha controls point transparency
plt.scatter(x_values, y_values, alpha=0.5)
# add axis labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_norm')
# display the figure
plt.show()
In [208]:
# remove outliers
# extract the mean methylation and normalized dispersion columns
x_values = df['mean_feature_methylation']
y_values = df['dispersion_norm']
# compute the means and standard deviations of x and y
x_mean, y_mean = np.mean(x_values), np.mean(y_values)
x_std, y_std = np.std(x_values), np.std(y_values)
# use 3 standard deviations as the outlier threshold
threshold = 3
# select the non-outlier points
# NOTE: df[mask] returns a whole DataFrame rather than a single column,
# which is what makes plt.scatter fail below; see the fix after the traceback
filtered_x = df[(x_values >= x_mean - threshold * x_std) & (x_values <= x_mean + threshold * x_std)]
filtered_y = df[(y_values >= y_mean - threshold * y_std) & (y_values <= y_mean + threshold * y_std)]
# create the scatter plot; alpha controls point transparency
plt.scatter(filtered_x, filtered_y, alpha=0.5)
# add axis labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_norm')
# display the figure
plt.show()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[208], line 18
---> 18 plt.scatter(filtered_x, filtered_y, alpha=0.5)
...
File D:\conda\envs\py39\lib\site-packages\matplotlib\category.py:214, in UnitData.update(self, data)
--> 214 for val in OrderedDict.fromkeys(data):
TypeError: unhashable type: 'numpy.ndarray'
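As the traceback shows, `filtered_x` and `filtered_y` are whole DataFrames (`df[mask]` keeps every column), while `plt.scatter` expects 1-D sequences. A hedged rewrite that combines both masks and plots only the two columns:

mask = ((x_values >= x_mean - threshold * x_std) & (x_values <= x_mean + threshold * x_std) &
        (y_values >= y_mean - threshold * y_std) & (y_values <= y_mean + threshold * y_std))
plt.scatter(df.loc[mask, 'mean_feature_methylation'], df.loc[mask, 'dispersion_norm'], alpha=0.5)
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion_norm')
plt.show()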
In [210]:
selected_features = df.loc[df['feature_select'], 'feature_name']
In [211]:
selected_features
Out[211]:
203 chr1_20300000_20400000 218 chr1_21800000_21900000 242 chr1_24200000_24300000 245 chr1_24500000_24600000 280 chr1_28000000_28100000 ... 4256 chr2_176600000_176700000 4266 chr2_177600000_177700000 4273 chr2_178300000_178400000 4283 chr2_179300000_179400000 4310 chr2_182000000_182100000 Name: feature_name, Length: 100, dtype: object
In [214]:
# extract the mean methylation and normalized dispersion columns
x_values = df['mean_feature_methylation'].tolist()
y_values = df['dispersion_norm'].tolist()
# define a dispersion threshold for flagging outliers
threshold = 2.0  # adjust as needed
# drop the outliers
filtered_x_values = []
filtered_y_values = []
for x, y in zip(x_values, y_values):
    if y < threshold:
        filtered_x_values.append(x)
        filtered_y_values.append(y)
# redraw the scatter plot without the outliers
plt.scatter(filtered_x_values, filtered_y_values, alpha=0.5)
# display the figure
plt.show()
In [220]:
# extract the mean methylation and dispersion columns
x_values = adata.var['mean_feature_methylation'].tolist()
y_values = adata.var['dispersion'].tolist()
# define a dispersion threshold for flagging outliers
threshold = 0.4  # adjust as needed
# drop the outliers
filtered_x_values = []
filtered_y_values = []
for x, y in zip(x_values, y_values):
    if y < threshold:
        filtered_x_values.append(x)
        filtered_y_values.append(y)
plt.scatter(filtered_x_values, filtered_y_values, alpha=0.5)
# add axis labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion')
# display the figure
plt.show()
In [221]:
from scipy.stats import linregress

# extract the mean methylation and dispersion columns
x_values = adata.var['mean_feature_methylation'].tolist()
y_values = adata.var['dispersion'].tolist()
# define a dispersion threshold for flagging outliers
threshold = 0.4  # adjust as needed
# drop the outliers
filtered_x_values = []
filtered_y_values = []
for x, y in zip(x_values, y_values):
    if y < threshold:
        filtered_x_values.append(x)
        filtered_y_values.append(y)
plt.scatter(filtered_x_values, filtered_y_values, alpha=0.5)
# fit a linear regression to the (unfiltered) data
slope, intercept, r_value, p_value, std_err = linregress(x_values, y_values)
# draw the trend line
plt.plot(x_values, intercept + slope * np.array(x_values), color='red', label='Trend Line')
# add axis labels
plt.xlabel('mean_feature_methylation')
plt.ylabel('dispersion')
# display the figure
plt.show()
In [222]:
# extract the mean methylation and normalized dispersion columns
x_values = df['mean_feature_methylation'].tolist()
y_values = df['dispersion_norm'].tolist()
# define a dispersion threshold for flagging outliers
threshold = 2.0  # adjust as needed
# drop the outliers
filtered_x_values = []
filtered_y_values = []
for x, y in zip(x_values, y_values):
    if y < threshold:
        filtered_x_values.append(x)
        filtered_y_values.append(y)
# redraw the scatter plot without the outliers
plt.scatter(filtered_x_values, filtered_y_values, alpha=0.5)
# fit a linear regression to the (unfiltered) data
slope, intercept, r_value, p_value, std_err = linregress(x_values, y_values)
# draw the trend line
plt.plot(x_values, intercept + slope * np.array(x_values), color='red', label='Trend Line')
# display the figure
plt.show()
In [223]:
import scanpy as sc
In [226]:
#sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color=['overall_mc_level_CG'], wspace=0.5)
In [228]:
sc.pl.pca(adata, color=['mc_class_CG'], wspace=0.5)
In [268]:
adata_filter = adata[:, adata.var['feature_select']].copy()
adata_filter2 = adata_filter[adata_filter.obs['mc_class_CG'] > 2000000, :].copy()
In [269]:
adata_filter
Out[269]:
AnnData object with n_obs × n_vars = 52 × 1000 obs: 'methylated_CG', 'mc_class_CG', 'total_CG', 'overall_mc_level_CG', 'sample', 'sample_id', 'coverage_cells', 'mean_cell_methylation' var: 'feature_name', 'coverage_feature', 'mean_feature_methylation', 'mean_sq', 'var', 'variance', 'dispersion', 'dispersion_log', 'NormalizedCoverage', 'ResidualDispersion', 'NormalizedDispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select' uns: 'pca' obsm: 'X_pca', 'X_tsne' varm: 'PCs'
In [233]:
sc.tl.pca(adata_filter, svd_solver='arpack')
sc.pl.pca(adata_filter, color=['overall_mc_level_CG'], wspace=0.5)
In [236]:
adata_filter.varm['PCs']
Out[236]:
array([[-0.06958627, 0.17975275, 0.03491881, ..., 0.02417051, -0.03574859, 0.1777849 ], [-0.07811921, 0.18211309, 0.03563234, ..., 0.15430381, -0.00066009, 0.18689157], [-0.10578167, -0.0394036 , 0.1402062 , ..., -0.15728749, -0.06590174, 0.20133028], ..., [-0.06801507, 0.19917281, 0.03803346, ..., 0.02431435, -0.12863403, -0.04797525], [-0.10333498, -0.07916441, 0.28298015, ..., 0.08951086, 0.08185068, 0.03221035], [-0.13062945, -0.02340198, -0.08192839, ..., -0.03865004, 0.05464122, -0.02124793]])
In [237]:
adata.varm['PCs']
Out[237]:
array([[ 1.43114687e-17, 1.38777878e-17, 1.00830802e-17, ..., -6.50521303e-18, -3.68628739e-18, -1.03757191e-30], [-4.77048956e-17, -3.08997619e-17, -1.61004023e-17, ..., -1.38777878e-17, 2.13391703e-30, 9.01991226e-31], [-9.24601288e-18, -8.18398806e-21, 3.68935473e-17, ..., 5.16849603e-31, 8.44948759e-31, 9.30009766e-31], ..., [ 0.00000000e+00, -0.00000000e+00, 0.00000000e+00, ..., -0.00000000e+00, -0.00000000e+00, -0.00000000e+00], [ 0.00000000e+00, -0.00000000e+00, 0.00000000e+00, ..., -0.00000000e+00, -0.00000000e+00, -0.00000000e+00], [ 0.00000000e+00, -0.00000000e+00, 0.00000000e+00, ..., -0.00000000e+00, -0.00000000e+00, -0.00000000e+00]])
In [238]:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[238], line 1
----> 1 sc.tl.tsne(adata,
      2            obsm='X_pca',
      3            metric='euclidean',
      4            exaggeration=-1,  # auto determined
      5            perplexity=30,
      6            n_jobs=-1)
      7 sc.pl.tsne(cdata_mean, color=['Neuron type'], wspace=0.8)
TypeError: tsne() got an unexpected keyword argument 'obsm'
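`sc.tl.tsne` has no `obsm` keyword; scanpy selects the input representation through `use_rep` (and `cdata_mean` in the plotting line does not appear anywhere else in this notebook). A minimal equivalent call with scanpy's own API would be:

sc.tl.tsne(adata, use_rep='X_pca', perplexity=30, n_jobs=-1)
sc.pl.tsne(adata, color=['overall_mc_level_CG'])

The next cell instead defines a custom `tsne` wrapper around openTSNE, which is what the later calls use.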
In [240]:
import logging
from typing import Callable, Union

from openTSNE import TSNEEmbedding, affinity, initialization


# tsne methods
def art_of_tsne(X: np.ndarray,
                metric: Union[str, Callable] = "euclidean",
                exaggeration: float = -1,
                perplexity: int = 30,
                n_jobs: int = -1) -> TSNEEmbedding:
    """
    Implementation of Dmitry Kobak and Philipp Berens
    "The art of using t-SNE for single-cell transcriptomics" based on openTSNE.
    See https://doi.org/10.1038/s41467-019-13056-x | www.nature.com/naturecommunications

    Args:
        X             The data matrix of shape (n_cells, n_genes) i.e. (n_samples, n_features)
        metric        Any metric allowed by PyNNDescent (default: 'euclidean')
        exaggeration  The exaggeration to use for the embedding
        perplexity    The perplexity to use for the embedding

    Returns:
        The embedding as an opentsne.TSNEEmbedding object (which can be cast to an np.ndarray)
    """
    n = X.shape[0]
    if n > 100_000:
        if exaggeration == -1:
            exaggeration = 1 + n / 333_333
        # Subsample, optimize, then add the remaining cells and optimize again
        # Also, use exaggeration == 4
        logging.info(f"Creating subset of {n // 40} elements")
        # Subsample and run a regular art_of_tsne on the subset
        indices = np.random.permutation(n)
        reverse = np.argsort(indices)
        X_sample, X_rest = X[indices[:n // 40]], X[indices[n // 40:]]
        logging.info("Embedding subset")
        Z_sample = art_of_tsne(X_sample)
        logging.info(f"Preparing partial initial embedding of the {n - n // 40} remaining elements")
        if isinstance(Z_sample.affinities, affinity.Multiscale):
            rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexities=[1 / 3, 1 / 3])
        else:
            rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexity=1 / 3)
        logging.info("Combining the initial embeddings, and standardizing")
        init_full = np.vstack((Z_sample, rest_init))[reverse]
        init_full = init_full / (np.std(init_full[:, 0]) * 10000)
        logging.info("Creating multiscale affinities")
        affinities = affinity.PerplexityBasedNN(X,
                                                perplexity=perplexity,
                                                metric=metric,
                                                method="approx",
                                                n_jobs=n_jobs)
        logging.info("Creating TSNE embedding")
        Z = TSNEEmbedding(init_full,
                          affinities,
                          negative_gradient_method="fft",
                          n_jobs=n_jobs)
        logging.info("Optimizing, stage 1")
        Z.optimize(n_iter=250,
                   inplace=True,
                   exaggeration=12,
                   momentum=0.5,
                   learning_rate=n / 12,
                   n_jobs=n_jobs)
        logging.info("Optimizing, stage 2")
        Z.optimize(n_iter=750,
                   inplace=True,
                   exaggeration=exaggeration,
                   momentum=0.8,
                   learning_rate=n / 12,
                   n_jobs=n_jobs)
    elif n > 3_000:
        if exaggeration == -1:
            exaggeration = 1
        # Use multiscale perplexity
        affinities_multiscale_mixture = affinity.Multiscale(X,
                                                            perplexities=[perplexity, n / 100],
                                                            metric=metric,
                                                            method="approx",
                                                            n_jobs=n_jobs)
        init = initialization.pca(X)
        Z = TSNEEmbedding(init,
                          affinities_multiscale_mixture,
                          negative_gradient_method="fft",
                          n_jobs=n_jobs)
        Z.optimize(n_iter=250,
                   inplace=True,
                   exaggeration=12,
                   momentum=0.5,
                   learning_rate=n / 12,
                   n_jobs=n_jobs)
        Z.optimize(n_iter=750,
                   inplace=True,
                   exaggeration=exaggeration,
                   momentum=0.8,
                   learning_rate=n / 12,
                   n_jobs=n_jobs)
    else:
        if exaggeration == -1:
            exaggeration = 1
        # Just a plain TSNE with high learning rate
        lr = max(200, n / 12)
        aff = affinity.PerplexityBasedNN(X,
                                         perplexity=perplexity,
                                         metric=metric,
                                         method="approx",
                                         n_jobs=n_jobs)
        init = initialization.pca(X)
        Z = TSNEEmbedding(init,
                          aff,
                          learning_rate=lr,
                          n_jobs=n_jobs,
                          negative_gradient_method="fft")
        Z.optimize(250, exaggeration=12, momentum=0.5, inplace=True, n_jobs=n_jobs)
        Z.optimize(750,
                   exaggeration=exaggeration,
                   momentum=0.8,
                   inplace=True,
                   n_jobs=n_jobs)
    return Z


def tsne(adata, obsm='X_pca',
         metric: Union[str, Callable] = "euclidean",
         exaggeration: float = -1,
         perplexity: int = 30,
         n_jobs: int = -1):
    # run art_of_tsne on the chosen representation and store the embedding in obsm
    X = adata.obsm[obsm]
    Z = art_of_tsne(X=X, metric=metric, exaggeration=exaggeration, perplexity=perplexity, n_jobs=n_jobs)
    adata.obsm['X_tsne'] = Z
    return
In [249]:
#tsne(adata)
sc.pl.tsne(adata, color=['overall_mc_level_CG'], wspace=0.3)
In [245]:
tsne(adata_filter)
In [250]:
sc.pl.tsne(adata_filter, color=['overall_mc_level_CG'], wspace=0.1)
In [270]:
adata_filter2
Out[270]:
AnnData object with n_obs × n_vars = 44 × 1000 obs: 'methylated_CG', 'mc_class_CG', 'total_CG', 'overall_mc_level_CG', 'sample', 'sample_id', 'coverage_cells', 'mean_cell_methylation' var: 'feature_name', 'coverage_feature', 'mean_feature_methylation', 'mean_sq', 'var', 'variance', 'dispersion', 'dispersion_log', 'NormalizedCoverage', 'ResidualDispersion', 'NormalizedDispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select' uns: 'pca' obsm: 'X_pca', 'X_tsne' varm: 'PCs'
In [272]:
sc.tl.pca(adata_filter2, svd_solver='arpack')
sc.pl.pca(adata_filter2, color=['total_CG'], wspace=0.5)
In [273]:
tsne(adata_filter2)
In [274]:
sc.pl.tsne(adata_filter2, color=['overall_mc_level_CG'], wspace=0.1)
In [263]:
adata_filter2.obs
Out[263]:
methylated_CG | mc_class_CG | total_CG | overall_mc_level_CG | sample | sample_id | coverage_cells | mean_cell_methylation | Cell_type | Treatment | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2773841.0 | 6389935.0 | 10058585.0 | 0.275769 | GSM1370523 | 0 | 4912 | 0.255770 | MII oocyte | NaN |
1 | 4207956.0 | 9056780.0 | 14749706.0 | 0.285291 | GSM1370524 | 1 | 4912 | 0.267832 | MII oocyte | NaN |
2 | 1062414.0 | 3024372.0 | 3687362.0 | 0.288123 | GSM1370525 | 2 | 4912 | 0.270414 | MII oocyte | NaN |
3 | 2174791.0 | 5617340.0 | 7468542.0 | 0.291194 | GSM1370526 | 3 | 4912 | 0.257664 | MII oocyte | NaN |
4 | 2250723.0 | 5478455.0 | 7718445.0 | 0.291603 | GSM1370527 | 4 | 4912 | 0.256831 | MII oocyte | NaN |
5 | 1235199.0 | 3266783.0 | 4241536.0 | 0.291215 | GSM1370529 | 6 | 4912 | 0.247983 | MII oocyte | NaN |
6 | 1602124.0 | 3903117.0 | 5507096.0 | 0.290920 | GSM1370530 | 7 | 4912 | 0.261821 | MII oocyte | NaN |
7 | 1229421.0 | 3204150.0 | 4160911.0 | 0.295469 | GSM1370531 | 8 | 4912 | 0.254784 | MII oocyte | NaN |
8 | 1149598.0 | 2822474.0 | 3733805.0 | 0.307889 | GSM1370532 | 9 | 4912 | 0.257592 | MII oocyte | NaN |
9 | 1586218.0 | 3869842.0 | 5403788.0 | 0.293538 | GSM1370533 | 10 | 4912 | 0.253991 | MII oocyte | NaN |
10 | 1948937.0 | 3800961.0 | 5242714.0 | 0.371742 | GSM1370535 | 11 | 4912 | 0.393481 | ESC | 2i |
11 | 1271969.0 | 2797560.0 | 3506515.0 | 0.362744 | GSM1370536 | 12 | 4912 | 0.346206 | ESC | 2i |
12 | 2105457.0 | 3372625.0 | 4537703.0 | 0.463992 | GSM1370537 | 13 | 4912 | 0.454491 | ESC | 2i |
13 | 1521358.0 | 4242155.0 | 5799310.0 | 0.262334 | GSM1370538 | 14 | 4912 | 0.236155 | ESC | 2i |
14 | 1974452.0 | 3917329.0 | 5546459.0 | 0.355984 | GSM1370539 | 15 | 4912 | 0.337848 | ESC | 2i |
15 | 372428.0 | 3297855.0 | 4538546.0 | 0.082059 | GSM1370540 | 16 | 4912 | 0.066197 | ESC | 2i |
16 | 2068775.0 | 4283144.0 | 5851328.0 | 0.353556 | GSM1370541 | 17 | 4912 | 0.276954 | ESC | 2i |
17 | 730843.0 | 3950391.0 | 5450398.0 | 0.134090 | GSM1370542 | 18 | 4912 | 0.127163 | ESC | 2i |
18 | 1318842.0 | 4442023.0 | 5843535.0 | 0.225692 | GSM1370543 | 19 | 4912 | 0.214378 | ESC | 2i |
19 | 1109405.0 | 3710145.0 | 5011395.0 | 0.221376 | GSM1370544 | 20 | 4912 | 0.201409 | ESC | 2i |
20 | 721897.0 | 3110061.0 | 4463927.0 | 0.161718 | GSM1370545 | 21 | 4912 | 0.148684 | ESC | 2i |
21 | 1828675.0 | 3532788.0 | 4807341.0 | 0.380392 | GSM1370546 | 22 | 4912 | 0.352224 | ESC | 2i |
22 | 5078262.0 | 4424268.0 | 7002192.0 | 0.725239 | GSM1370555 | 30 | 4912 | 0.633859 | ESC | serum/LIF |
23 | 2883177.0 | 3262923.0 | 4525444.0 | 0.637104 | GSM1370556 | 31 | 4912 | 0.569044 | ESC | serum/LIF |
24 | 3010405.0 | 4728963.0 | 7751272.0 | 0.388376 | GSM1370557 | 32 | 4912 | 0.338542 | ESC | serum/LIF |
25 | 4478594.0 | 4965673.0 | 7571138.0 | 0.591535 | GSM1370558 | 33 | 4912 | 0.570422 | ESC | serum/LIF |
26 | 3200120.0 | 3848428.0 | 5622442.0 | 0.569169 | GSM1370559 | 34 | 4912 | 0.498544 | ESC | serum/LIF |
27 | 1814649.0 | 5795602.0 | 8946794.0 | 0.202827 | GSM1370560 | 35 | 4912 | 0.187398 | ESC | serum/LIF |
28 | 3755105.0 | 3468770.0 | 5304515.0 | 0.707907 | GSM1370561 | 36 | 4912 | 0.629427 | ESC | serum/LIF |
29 | 3243439.0 | 3557629.0 | 4771783.0 | 0.679712 | GSM1370562 | 37 | 4912 | 0.611443 | ESC | serum/LIF |
30 | 3826163.0 | 5262688.0 | 7106219.0 | 0.538425 | GSM1370563 | 38 | 4912 | 0.508737 | ESC | serum/LIF |
31 | 4527218.0 | 4938305.0 | 7210573.0 | 0.627858 | GSM1370564 | 39 | 4912 | 0.596066 | ESC | serum/LIF |
32 | 3487010.0 | 3698022.0 | 5098903.0 | 0.683875 | GSM1370565 | 40 | 4912 | 0.615996 | ESC | serum/LIF |
33 | 1670947.0 | 2077704.0 | 2662177.0 | 0.627662 | GSM1370566 | 41 | 4912 | 0.585674 | ESC | serum/LIF |
34 | 2014271.0 | 3017804.0 | 3828676.0 | 0.526101 | GSM1370567 | 42 | 4912 | 0.468503 | ESC | serum/LIF |
35 | 6249395.0 | 7502459.0 | 10319408.0 | 0.605596 | GSM1370568 | 43 | 4912 | 0.539304 | ESC | serum/LIF |
36 | 2641859.0 | 3607443.0 | 4826163.0 | 0.547404 | GSM1370569 | 44 | 4912 | 0.477331 | ESC | serum/LIF |
37 | 3859515.0 | 5568802.0 | 7575752.0 | 0.509456 | GSM1370570 | 45 | 4912 | 0.469847 | ESC | serum/LIF |
38 | 4079333.0 | 4145869.0 | 6017116.0 | 0.677955 | GSM1370571 | 46 | 4912 | 0.616277 | ESC | serum/LIF |
39 | 3410149.0 | 3967729.0 | 5362681.0 | 0.635904 | GSM1370572 | 47 | 4912 | 0.577196 | ESC | serum/LIF |
40 | 3488749.0 | 4433266.0 | 6109535.0 | 0.571033 | GSM1370573 | 48 | 4912 | 0.552650 | ESC | serum/LIF |
41 | 3588924.0 | 4523416.0 | 6085020.0 | 0.589797 | GSM1370574 | 49 | 4912 | 0.513039 | ESC | serum/LIF |
42 | 37221650.0 | 22641666.0 | 53520217.0 | 0.695469 | GSM1370575 | 50 | 4912 | 0.581207 | ESC | serum/LIF |
43 | 5484605.0 | 9727286.0 | 19593619.0 | 0.279918 | GSM1413827 | 51 | 4912 | 0.257857 | MII oocyte | NaN |
In [277]:
# load metadata
metadata = pd.read_csv("D://Test/GSE56789/GSE56879_Meta.txt", sep='\t')
adata_filter2.obs = adata_filter2.obs.merge(metadata[['sample', 'Cell_type', 'Treatment']], left_on='sample', right_on='sample', how='left')
D:\conda\envs\py39\lib\site-packages\anndata\_core\anndata.py:788: UserWarning: AnnData expects .obs.index to contain strings, but got values like: [0, 1, 2, 3, 4] Inferred to be: integer value_idx = self._prep_dim_index(value.index, attr)
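The warning appears because `merge` returns a DataFrame with a fresh integer RangeIndex, while AnnData expects string observation names. A hedged one-line cleanup:

adata_filter2.obs.index = adata_filter2.obs.index.astype(str)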
In [279]:
sc.pl.tsne(adata_filter2, color=['overall_mc_level_CG'], wspace=0.1)
In [286]:
sc.pp.neighbors(adata_filter2, n_neighbors=15, use_rep='X')
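`use_rep='X'` builds the kNN graph directly on the selected feature matrix. A common alternative, sketched here as an option rather than what this notebook does, is to reuse the PCA embedding computed above:

sc.pp.neighbors(adata_filter2, n_neighbors=15, use_rep='X_pca')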
In [308]:
# umap
sc.tl.umap(adata_filter2)
In [291]:
def getNClusters(adata, n_cluster, range_min=0, range_max=3, max_steps=20):
    # binary-search the Louvain resolution that yields n_cluster clusters
    this_step = 0
    this_min = float(range_min)
    this_max = float(range_max)
    while this_step < max_steps:
        print('step ' + str(this_step))
        this_resolution = this_min + ((this_max - this_min) / 2)
        sc.tl.louvain(adata, resolution=this_resolution)
        this_clusters = adata.obs['louvain'].nunique()
        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
        if this_clusters > n_cluster:
            this_max = this_resolution
        elif this_clusters < n_cluster:
            this_min = this_resolution
        else:
            return this_resolution, adata
        this_step += 1
    print('Cannot find the number of clusters')
    print('Clustering solution from the last iteration is used: ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
In [341]:
#num_clusters = len(np.unique(metadata['Treatment']))
#Louvain
getNClusters(adata_filter2, n_cluster=3)
step 0
got 4 at resolution 1.5
step 1
got 2 at resolution 0.75
step 2
got 3 at resolution 1.125
Out[341]:
(1.125, AnnData object with n_obs × n_vars = 44 × 1000 obs: 'methylated_CG', 'mc_class_CG', 'total_CG', 'overall_mc_level_CG', 'sample', 'sample_id', 'coverage_cells', 'mean_cell_methylation', 'Cell_type', 'Treatment', 'louvain', 'kmeans', 'label' var: 'feature_name', 'coverage_feature', 'mean_feature_methylation', 'mean_sq', 'var', 'variance', 'dispersion', 'dispersion_log', 'NormalizedCoverage', 'ResidualDispersion', 'NormalizedDispersion', 'mean_bin', 'cov_bin', 'dispersion_norm', 'feature_select' uns: 'pca', 'Treatment_colors', 'neighbors', 'umap', 'louvain', 'louvain_colors', 'Cell_type_colors' obsm: 'X_pca', 'X_tsne', 'X_umap' varm: 'PCs' obsp: 'distances', 'connectivities')
In [295]:
!pip install louvain
Collecting louvain
  Downloading louvain-0.8.1-cp39-cp39-win_amd64.whl (89 kB)
Requirement already satisfied: igraph<0.11,>=0.10.0 in d:\conda\envs\py39\lib\site-packages (from louvain) (0.10.8)
Requirement already satisfied: texttable>=1.6.2 in d:\conda\envs\py39\lib\site-packages (from igraph<0.11,>=0.10.0->louvain) (1.6.7)
Installing collected packages: louvain
Successfully installed louvain-0.8.1
In [298]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
In [362]:
num_clusters = 3
adata = adata_filter
adata.obs['label'] = adata.obs['Cell_type']
method = 'Test2'
df_metrics = pd.DataFrame(columns=['ARI_Louvain', 'ARI_kmeans',
                                   'AMI_Louvain', 'AMI_kmeans',
                                   'Homogeneity_Louvain', 'Homogeneity_kmeans'])
# kmeans
kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
adata.obs['kmeans'] = pd.Series(kmeans.labels_, index=adata.obs.index).astype('category')
# # hierarchical clustering
# # ERROR: TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
# hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
# adata.obs['hc'] = pd.Series(hc.labels_, index=adata.obs.index).astype('category')
# clustering metrics
# adjusted Rand index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])
ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])
# ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
# adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'], average_method='arithmetic')
ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'], average_method='arithmetic')
# ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'], average_method='arithmetic')
# homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])
homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])
# homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method, ['ARI_Louvain', 'ARI_kmeans']] = [ari_louvain, ari_kmeans]
df_metrics.loc[method, ['AMI_Louvain', 'AMI_kmeans']] = [ami_louvain, ami_kmeans]
df_metrics.loc[method, ['Homogeneity_Louvain', 'Homogeneity_kmeans']] = [homo_louvain, homo_kmeans]
# adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters, method + '_clusters.tsv'), sep='\t')
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[362], line 3
      1 num_clusters = 3
      2 adata = adata_filter
----> 3 adata.obs['label'] = adata.obs['Cell_type']
...
KeyError: 'Cell_type'
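The KeyError occurs because the metadata (`Cell_type`, `Treatment`) was merged into `adata_filter2.obs` only; `adata_filter.obs` never received those columns. A hedged fix is to point the metrics block at the annotated object (the `df_metrics` shown below comes from an earlier run, as the execution counts indicate):

adata = adata_filter2
adata.obs['label'] = adata.obs['Cell_type']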
In [307]:
df_metrics
Out[307]:
ARI_Louvain | ARI_kmeans | AMI_Louvain | AMI_kmeans | Homogeneity_Louvain | Homogeneity_kmeans | |
---|---|---|---|---|---|---|
Test | 0.191548 | 0.501999 | 0.341999 | 0.686136 | 0.598372 | 1.0 |
In [325]:
# !pip install plotly
import plotly.express as px

def interactive_scatter(adata, hue=None, coord_base='umap', continous_cmap='viridis', size=5):
    # NOTE: coord_base is currently unused; the function always reads obsm['X_umap']
    fig = px.scatter(data_frame=pd.DataFrame(adata.obsm['X_umap'], columns=['umap1', 'umap2']),
                     x='umap1',
                     y='umap2',
                     color_continuous_scale=continous_cmap,
                     color=hue)
    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)',
                      xaxis_showticklabels=False,
                      yaxis_showticklabels=False,
                      width=800,
                      height=800)
    fig.update_traces(marker_size=size)
    return fig
In [327]:
fig = interactive_scatter(adata_filter2)
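`px.scatter` resolves `color=hue` against the columns of the plotted frame, which here only contains `umap1` and `umap2`, so coloring by a cluster label requires adding that column first. A hypothetical usage sketch:

df_plot = pd.DataFrame(adata_filter2.obsm['X_umap'], columns=['umap1', 'umap2'])
df_plot['louvain'] = adata_filter2.obs['louvain'].values
px.scatter(df_plot, x='umap1', y='umap2', color='louvain', width=800, height=800)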
In [315]:
adata.obsm['X_umap']
Out[315]:
array([[12.484789 , 16.581722 ], [12.22508 , 17.664066 ], [11.952932 , 17.353918 ], [13.217374 , 17.035156 ], [13.089491 , 16.30659 ], [12.687077 , 15.8422365 ], [12.456281 , 16.944857 ], [11.835731 , 16.619175 ], [12.000754 , 16.130224 ], [11.402539 , 17.039957 ], [14.687245 , 3.0443153 ], [14.237143 , 3.9717314 ], [15.468485 , 2.4151003 ], [12.659276 , 5.184509 ], [13.543063 , 3.7920406 ], [11.166269 , 5.66698 ], [12.79142 , 4.5101447 ], [11.2506485 , 4.9574623 ], [11.915785 , 4.4838223 ], [11.663357 , 5.2958064 ], [11.917299 , 5.6766925 ], [13.8773365 , 3.3940747 ], [19.046078 , -0.17343952], [17.776234 , 0.2363771 ], [13.7890005 , 4.4060965 ], [18.572798 , 1.0591211 ], [16.609453 , 1.7374704 ], [12.103427 , 5.0119734 ], [19.234375 , 0.59901553], [19.425552 , 0.03022308], [17.176418 , 1.5065898 ], [19.219236 , 1.0854653 ], [19.821619 , 0.5332009 ], [18.489052 , 0.6563332 ], [15.494957 , 1.7379029 ], [17.372566 , 0.9697068 ], [16.158829 , 2.1730094 ], [16.147123 , 1.4657835 ], [19.583767 , -0.44952655], [17.990234 , 0.36184123], [18.134104 , 1.4666414 ], [16.548273 , 0.8359049 ], [18.279852 , -0.17635442], [12.905921 , 17.463284 ]], dtype=float32)
In [323]:
data_frame=pd.DataFrame(adata.obsm['X_umap'],columns=['umap1', 'umap2'])
In [324]:
data_frame
Out[324]:
| | umap1 | umap2 |
|---|---|---|
| 0 | 12.484789 | 16.581722 |
| 1 | 12.225080 | 17.664066 |
| 2 | 11.952932 | 17.353918 |
| 3 | 13.217374 | 17.035156 |
| 4 | 13.089491 | 16.306589 |
| 5 | 12.687077 | 15.842237 |
| 6 | 12.456281 | 16.944857 |
| 7 | 11.835731 | 16.619175 |
| 8 | 12.000754 | 16.130224 |
| 9 | 11.402539 | 17.039957 |
| 10 | 14.687245 | 3.044315 |
| 11 | 14.237143 | 3.971731 |
| 12 | 15.468485 | 2.415100 |
| 13 | 12.659276 | 5.184509 |
| 14 | 13.543063 | 3.792041 |
| 15 | 11.166269 | 5.666980 |
| 16 | 12.791420 | 4.510145 |
| 17 | 11.250648 | 4.957462 |
| 18 | 11.915785 | 4.483822 |
| 19 | 11.663357 | 5.295806 |
| 20 | 11.917299 | 5.676692 |
| 21 | 13.877337 | 3.394075 |
| 22 | 19.046078 | -0.173440 |
| 23 | 17.776234 | 0.236377 |
| 24 | 13.789001 | 4.406096 |
| 25 | 18.572798 | 1.059121 |
| 26 | 16.609453 | 1.737470 |
| 27 | 12.103427 | 5.011973 |
| 28 | 19.234375 | 0.599016 |
| 29 | 19.425552 | 0.030223 |
| 30 | 17.176418 | 1.506590 |
| 31 | 19.219236 | 1.085465 |
| 32 | 19.821619 | 0.533201 |
| 33 | 18.489052 | 0.656333 |
| 34 | 15.494957 | 1.737903 |
| 35 | 17.372566 | 0.969707 |
| 36 | 16.158829 | 2.173009 |
| 37 | 16.147123 | 1.465783 |
| 38 | 19.583767 | -0.449527 |
| 39 | 17.990234 | 0.361841 |
| 40 | 18.134104 | 1.466641 |
| 41 | 16.548273 | 0.835905 |
| 42 | 18.279852 | -0.176354 |
| 43 | 12.905921 | 17.463284 |
In [344]:
# settings for the plots
import scanpy as sc  # used by all of the sc.* plotting calls below

sc.set_figure_params(scanpy=True, dpi=80, dpi_save=250,
                     frameon=True, vector_friendly=True,
                     color_map="YlGnBu", format='pdf', transparent=False,
                     ipython_format='png2x')
sc.pl.umap(adata_filter2, color=['louvain', 'Cell_type', 'Treatment', 'kmeans', 'total_CG'], wspace=0.5)
D:\conda\envs\py39\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:391: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored (emitted four times, once per categorical panel)
In [346]:
sc.pl.pca_overview(adata_filter2, color=['kmeans', 'mc_class_CG', 'mCG/CG', 'mCH/CH'])
D:\conda\envs\py39\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:391: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
D:\conda\envs\py39\lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Glyph 8942 (\N{VERTICAL ELLIPSIS}) missing from current font.
In [347]:
#!pip install adjustText
def _text_anno_scatter(data: pd.DataFrame,
                       ax,
                       x: str,
                       y: str,
                       edge_color=(0.5, 0.5, 0.5, 0.2),
                       face_color=(0.8, 0.8, 0.8, 0.2),
                       palette: dict = None,
                       dodge_text=False,
                       anno_col='text_anno',
                       text_anno_kws=None,
                       text_transform=None,
                       dodge_kws=None,
                       linewidth=0.5,
                       labelsize=5):
    """Add text annotation to a scatter plot."""
    # prepare text kws
    _text_anno_kws = dict(fontsize=labelsize,
                          fontweight='black',
                          horizontalalignment='center',
                          verticalalignment='center')
    if text_anno_kws is not None:
        _text_anno_kws.update(text_anno_kws)

    # draw one label per annotation category, placed at the category's median position
    text_list = []
    for text, sub_df in data.groupby(anno_col):
        if text_transform is None:
            text = str(text)
        else:
            text = text_transform(text)
        if text.lower() in ['', 'nan']:
            continue
        _x, _y = sub_df[[x, y]].median()
        _fc = palette[text] if palette is not None else face_color
        # use a separate name for the Text artist so the loop variable is not shadowed
        text_obj = ax.text(_x, _y, text,
                           fontdict=_text_anno_kws,
                           bbox=dict(boxstyle="round",
                                     ec=edge_color,
                                     fc=_fc,
                                     linewidth=linewidth))
        text_list.append(text_obj)

    if dodge_text:
        try:
            from adjustText import adjust_text
            _dodge_parms = dict(force_points=(0.02, 0.05),
                                arrowprops=dict(arrowstyle="->",
                                                fc=edge_color,
                                                ec="none",
                                                connectionstyle="angle,angleA=-90,angleB=180,rad=5"),
                                autoalign='xy')
            if dodge_kws is not None:
                _dodge_parms.update(dodge_kws)
            # use the caller's column names rather than hard-coded 'x'/'y'
            adjust_text(text_list, x=data[x], y=data[y], **_dodge_parms)
        except ModuleNotFoundError:
            print('Install the adjustText package to dodge text; see its GitHub page for help')
    return
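A quick, self-contained illustration of how `_text_anno_scatter` is meant to be used (toy data, not from this dataset; in the notebook it is only called internally by `categorical_scatter` below):

# Toy usage sketch for _text_anno_scatter (hypothetical points and labels).
import matplotlib.pyplot as plt

toy = pd.DataFrame({'x': [0.0, 1.0, 0.2, 5.0, 5.5],
                    'y': [0.0, 0.5, 0.8, 4.0, 4.2],
                    'text_anno': ['A', 'A', 'A', 'B', 'B']})
fig, ax = plt.subplots()
ax.scatter(toy['x'], toy['y'], s=10)
_text_anno_scatter(toy, ax=ax, x='x', y='y', anno_col='text_anno')  # labels at group medians
plt.show()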
Collecting adjustText
  Downloading adjustText-0.8-py3-none-any.whl (9.1 kB)
Requirement already satisfied: numpy in d:\conda\envs\py39\lib\site-packages (from adjustText) (1.22.1)
Requirement already satisfied: matplotlib in d:\conda\envs\py39\lib\site-packages (from adjustText) (3.7.2)
Requirement already satisfied: contourpy>=1.0.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (1.1.0)
Requirement already satisfied: cycler>=0.10 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (4.42.1)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (1.4.5)
Requirement already satisfied: packaging>=20.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (23.1)
Requirement already satisfied: pillow>=6.2.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (10.0.0)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib->adjustText) (6.0.1)
Requirement already satisfied: zipp>=3.1.0 in d:\conda\envs\py39\lib\site-packages (from importlib-resources>=3.2.0->matplotlib->adjustText) (3.16.2)
Requirement already satisfied: six>=1.5 in d:\conda\envs\py39\lib\site-packages (from python-dateutil>=2.7->matplotlib->adjustText) (1.16.0)
Installing collected packages: adjustText
Successfully installed adjustText-0.8
In [350]:
# NOTE: besides seaborn and matplotlib, this function relies on helpers
# (_extract_coords, _density_based_sample, density_contour,
# _make_tiny_axis_label, zoom_ax) that are assumed to come from the
# ALLCools plotting utilities and are not redefined in this notebook.
from matplotlib.lines import Line2D

def categorical_scatter(
data,
ax,
# coords
coord_base='umap',
x=None,
y=None,
# color
hue=None,
palette='auto',
# text annotation
text_anno=None,
text_anno_kws=None,
text_anno_palette=None,
text_transform=None,
dodge_text=False,
dodge_kws=None,
# legend
show_legend=False,
legend_kws=None,
# size
s=5,
size=None,
sizes: dict = None,
size_norm=None,
# other
axis_format='tiny',
max_points=5000,
labelsize=4,
linewidth=0,
zoomxy=1.05,
outline=None,
outline_pad=3,
outline_kws=None,
scatter_kws=None,
return_fig=False
):
"""
Plot scatter plot with these options:
- Color by a categorical variable, and generate legend of the variable if needed
- Add text annotation using a categorical variable
- Circle categories with outlines
Parameters
----------
data
Dataframe that contains coordinates and categorical variables
ax
this function do not generate ax, must provide an ax
coord_base
coords name, if provided, will automatically search for x and y
x
x coord name
y
y coord name
hue
categorical col name or series for color hue
palette
palette of the hue, str or dict
text_anno
categorical col name or series for text annotation
text_anno_kws
text_anno_palette
text_transform
dodge_text
dodge_kws
show_legend
legend_kws
s
size
sizes
size_norm
axis_format
max_points
labelsize
linewidth
zoomxy
outline
outline_pad
outline_kws
scatter_kws
kws dict pass to sns.scatterplot
Returns
-------
"""
if isinstance(data, ad.AnnData):
adata = data
data = adata.obs
x = f'{coord_base}_0'
y = f'{coord_base}_1'
_data = pd.DataFrame({'x': adata.obsm[f'X_{coord_base}'][:, 0],
'y': adata.obsm[f'X_{coord_base}'][:, 1]},
index=adata.obs_names)
else:
# add coords
_data, x, y = _extract_coords(data, coord_base, x, y)
# _data has 2 cols: "x" and "y"
# down sample plot data if needed.
if max_points is not None:
if _data.shape[0] > max_points:
_data = _density_based_sample(_data, seed=1, size=max_points,
coords=['x', 'y'])
# default scatter options
_scatter_kws = {'linewidth': 0, 's': s, 'legend': None, 'palette': palette}
if scatter_kws is not None:
_scatter_kws.update(scatter_kws)
# deal with color
palette_dict = None
if hue is not None:
if isinstance(hue, str):
_data['hue'] = data[hue].copy()
else:
_data['hue'] = hue.copy()
hue = 'hue'
_data['hue'] = _data['hue'].astype('category').cat.remove_unused_categories()
# deal with color palette
palette = _scatter_kws['palette']
if isinstance(palette, str) or isinstance(palette, list):
palette_dict = level_one_palette(_data['hue'], order=None, palette=palette)
elif isinstance(palette, dict):
palette_dict = palette
else:
raise TypeError(f'Palette can only be str, list or dict, '
f'got {type(palette)}')
_scatter_kws['palette'] = palette_dict
# deal with size
if size is not None:
# discard s from _scatter_kws and use size in sns.scatterplot
_scatter_kws.pop('s')
sns.scatterplot(x='x', y='y', data=_data, ax=ax, hue=hue,
size=size, sizes=sizes, size_norm=size_norm,
**_scatter_kws)
# deal with text annotation
if text_anno is not None:
if isinstance(text_anno, str):
_data['text_anno'] = data[text_anno].copy()
else:
_data['text_anno'] = text_anno.copy()
_text_anno_scatter(data=_data[['x', 'y', 'text_anno']],
ax=ax,
x='x',
y='y',
dodge_text=dodge_text,
dodge_kws=dodge_kws,
palette=text_anno_palette,
text_transform=text_transform,
anno_col='text_anno',
text_anno_kws=text_anno_kws,
labelsize=labelsize)
# deal with outline
if outline:
if isinstance(outline, str):
_data['outline'] = data[outline].copy()
else:
_data['outline'] = outline.copy()
_outline_kws = {'linewidth': linewidth,
'palette': None,
'c': 'lightgray',
'single_contour_pad': outline_pad}
if outline_kws is not None:
_outline_kws.update(outline_kws)
density_contour(ax=ax, data=_data, x='x', y='y', groupby='outline', **_outline_kws)
# clean axis
if axis_format == 'tiny':
_make_tiny_axis_label(ax, x, y, arrow_kws=None, fontsize=labelsize)
elif (axis_format == 'empty') or (axis_format is None):
sns.despine(ax=ax, left=True, bottom=True)
ax.set(xticks=[], yticks=[], xlabel=None, ylabel=None)
else:
pass
# deal with legend
if show_legend and (hue is not None):
n_hue = len(palette_dict)
_legend_kws = dict(ncol=(1 if n_hue <= 20 else 2 if n_hue <= 40 else 3),
fontsize=labelsize,
bbox_to_anchor=(1.05, 1),
loc='upper left',
borderaxespad=0.)
if legend_kws is not None:
_legend_kws.update(legend_kws)
handles = []
labels = []
exist_hues = _data['hue'].unique()
for hue_name, color in palette_dict.items():
if hue_name not in exist_hues:
# skip hue_name that do not appear in the plot
continue
handle = Line2D([0], [0], marker='o', color='w',
markerfacecolor=color, markersize=_legend_kws['fontsize'])
handles.append(handle)
labels.append(hue_name)
_legend_kws['handles'] = handles
_legend_kws['labels'] = labels
ax.legend(**_legend_kws)
if zoomxy is not None:
zoom_ax(ax, zoomxy)
if return_fig:
return ax, _data
else:
return
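To run `categorical_scatter` outside of ALLCools, minimal stand-ins for two of the assumed helpers can look like the sketch below (`_extract_coords` pulling `{coord_base}_0`/`{coord_base}_1` columns, `zoom_ax` scaling the axis limits). The downsampling, contour, and tiny-axis helpers are only exercised when `max_points` triggers, `outline` is set, or the default `axis_format='tiny'` is used, so passing `axis_format='empty'` and `max_points=None` avoids them; with only 44 cells here the downsampling never triggers anyway.

# Hedged stand-ins for helpers assumed above; names and behavior are guesses,
# not the actual ALLCools implementations.
def _extract_coords(data, coord_base, x, y):
    # Resolve coordinate column names like 'umap_0'/'umap_1' unless given explicitly.
    if x is None or y is None:
        x, y = f'{coord_base}_0', f'{coord_base}_1'
    _data = pd.DataFrame({'x': data[x], 'y': data[y]}, index=data.index)
    return _data, x, y

def zoom_ax(ax, zoom_scale):
    # Symmetrically widen (or shrink) the x/y limits around their centers.
    for get_lim, set_lim in ((ax.get_xlim, ax.set_xlim), (ax.get_ylim, ax.set_ylim)):
        lo, hi = get_lim()
        center, half = (lo + hi) / 2, (hi - lo) / 2 * zoom_scale
        set_lim(center - half, center + half)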
In [357]:
import seaborn as sns

def level_one_palette(name_list, order=None, palette='auto'):
    """Map each distinct category in name_list to one color; returns a dict."""
    name_set = set(name_list)
    # pick a qualitative palette sized to the number of categories
    if palette == 'auto':
        if len(name_set) < 10:
            palette = 'tab10'
        elif len(name_set) < 20:
            palette = 'tab20'
        else:
            palette = 'rainbow'
    if order is None:
        order = list(sorted(name_set))
    else:
        if (set(order) != name_set) or (len(order) != len(name_set)):
            raise ValueError('Order is not equal to set(name_list).')
    n = len(order)
    colors = sns.color_palette(palette, n)
    color_palette = {name: color for name, color in zip(order, colors)}
    return color_palette
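A quick check of what the helper returns: each distinct category is mapped to one color from the chosen qualitative palette.

# Example with hypothetical categories: two classes -> the first two tab10 colors.
pal = level_one_palette(['ctrl', 'treated', 'ctrl'])
print(pal)  # e.g. {'ctrl': (0.12, 0.47, 0.71), 'treated': (1.0, 0.50, 0.05)}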
In [361]:
fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
adata.obs['label'].fillna('nan', inplace=True)
_ = categorical_scatter(data = adata_filter2,
ax = ax,
coord_base = 'umap',
hue = 'Treatment',
text_anno = 'Treatment',
show_legend = 'True')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[361], line 3
      1 fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
      2 adata.obs['label'].fillna('nan', inplace=True)
----> 3 _ = categorical_scatter(data = adata_filter2,
      4                         ax = ax,
      5                         coord_base = 'umap',
      6                         hue = 'Treatment',
      7                         text_anno = 'Treatment',
      8                         show_legend = 'True')

Cell In[350], line 125, in categorical_scatter(data, ax, coord_base, x, y, hue, palette, text_anno, text_anno_kws, text_anno_palette, text_transform, dodge_text, dodge_kws, show_legend, legend_kws, s, size, sizes, size_norm, axis_format, max_points, labelsize, linewidth, zoomxy, outline, outline_pad, outline_kws, scatter_kws, return_fig)
    123 palette = _scatter_kws['palette']
    124 if isinstance(palette, str) or isinstance(palette, list):
--> 125     palette_dict = level_one_palette(_data['hue'], order=None, palette=palette)
    126 elif isinstance(palette, dict):
    127     palette_dict = palette

Cell In[357], line 13, in level_one_palette(name_list, order, palette)
     10     palette = 'rainbow'
     12 if order is None:
---> 13     order = list(sorted(name_set))
     14 else:
     15     if (set(order) != name_set) or (len(order) != len(name_set)):

TypeError: '<' not supported between instances of 'str' and 'float'
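The TypeError comes from `sorted(name_set)` in `level_one_palette`: the `Treatment` column apparently contains missing values, so `name_set` mixes strings with float `NaN`, and Python cannot order the two types. Note also that the `fillna` in the failing cell patched `adata.obs['label']`, not `adata_filter2.obs['Treatment']`, and that `show_legend` should be the boolean `True` rather than the string `'True'`. A minimal fix sketch:

# Option 1 (inside level_one_palette): make the sort robust to mixed types.
order = sorted(name_set, key=str)  # replaces `order = list(sorted(name_set))`

# Option 2: clean the hue column on the object actually being plotted;
# astype(str) turns NaN into the string 'nan', which _text_anno_scatter already skips.
adata_filter2.obs['Treatment'] = adata_filter2.obs['Treatment'].astype(str)

fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
_ = categorical_scatter(data=adata_filter2,
                        ax=ax,
                        coord_base='umap',
                        hue='Treatment',
                        text_anno='Treatment',
                        show_legend=True)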
In [354]:
!pip install seaborn
Requirement already satisfied: seaborn in d:\conda\envs\py39\lib\site-packages (0.12.2)
Requirement already satisfied: numpy!=1.24.0,>=1.17 in d:\conda\envs\py39\lib\site-packages (from seaborn) (1.22.1)
Requirement already satisfied: pandas>=0.25 in d:\conda\envs\py39\lib\site-packages (from seaborn) (1.5.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in d:\conda\envs\py39\lib\site-packages (from seaborn) (3.7.2)
Requirement already satisfied: contourpy>=1.0.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.1.0)
Requirement already satisfied: cycler>=0.10 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.42.1)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.5)
Requirement already satisfied: packaging>=20.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.1)
Requirement already satisfied: pillow>=6.2.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (10.0.0)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in d:\conda\envs\py39\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (6.0.1)
Requirement already satisfied: pytz>=2020.1 in d:\conda\envs\py39\lib\site-packages (from pandas>=0.25->seaborn) (2023.3)
Requirement already satisfied: zipp>=3.1.0 in d:\conda\envs\py39\lib\site-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.1->seaborn) (3.16.2)
Requirement already satisfied: six>=1.5 in d:\conda\envs\py39\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0)