v2023-11-15
This tutorial continues from Tutorial 1) Processing and analysis of the 3k PBMCs dataset using SC-Elephant
"""
Reset the current tutorial
"""
# delete the output to reset the tutorials
!rm -rf output/pbmc3k.combined_with_eldb.v20221214.aws.subset_320000.ram/
"""
Import necessary packages
"""
# not use GPU (only inference will be performed, and using CPUs are sufficient)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# import scelephant
import scelephant as el
from scelephant import RamData
import scanpy as sc
# set figure parameters
sc.set_figure_params( dpi = 200, figsize = ( 8, 6 ), frameon = True )
# plotly export settings
import plotly.io as pio
pio.renderers.default = "notebook"
"""
configure the jupyter notebook environment
"""
el.bk.Wide( 100 ) # adjust the jupyter notebook cell width
import matplotlib as mpl
mpl.rcParams[ "figure.dpi" ] = 100
# set image resolution
# load memory profiler
# %load_ext memory_profiler
"""
Open a Combined RamData - combine ELDB and PBMC3k datasets
"""
# settings
""" [WARNING]
Changing 'flag_use_full_ELDB' to True will result in downloading
a large amount of data from a requester-pay bucket (s3://scelephant/).
On the other hand, RamData objects in s3://scelephant-free/ bucket is freely available. """
flag_use_full_ELDB = False
# initialize a pool of managed operators
fop = el.managers.FileSystemOperatorPool( 8 ) # this pool of managed operators will be used throughout the tutorials
if flag_use_full_ELDB :
ram_ref = RamData( 's3://scelephant/ELDB.combined_v20221214.ramdata/', file_system_operator_pool = fop, name_layer = None, flag_enable_synchronization_through_locking = False, ) # contain full ELDB data, but it is requester-pay bucket, and it is not FREE.
else :
ram_ref = RamData( 's3://scelephant-free/ELDB.combined_v20221214.subset_320000_barcodes.ramdata/', file_system_operator_pool = fop, name_layer = None, flag_enable_synchronization_through_locking = False, ) # does not contain full ELDB data (full dataset is a requester-pay bucket)
ram_query = RamData( f'output/pbmc3k.ram/', file_system_operator_pool = fop )
# build feature mapping using 'id_gene', not 'name_gene'
ram_ref.ft.int_index_str_rep = 0
ram_query.ft.int_index_str_rep = 0
ram = RamData(
"output/pbmc3k.combined_with_eldb.v20221214.aws.subset_320000.ram/", # output path of the Combined RamData object
l_ramdata = [ ram_ref, ram_query ], # list of RamData object to combine
name_layer = None,
int_total_weight_for_each_batch = 10_000_000,
int_index_component_reference = 0, # the index of the RamData object to use as a reference (which will activate Reference Alignment mode)
file_system_operator_pool = fop,
flag_enable_synchronization_through_locking = False, # You can set it to False for better performance, but for RamData objects that are frequently updated, setting to True might be needed to avoid collisions
)
2023-11-18 21:26:58,728 [SC-Elephant] <INFO> (layer) - 'raw' layer has been loaded
ram
"""
Get PCA values of the new barcodes
"""
# settings of the pipeline
name_anno_ori = 'leiden_scanpy'
name_col_bc_all = 'filtered_barcodes' # name of the column containing filtered cells
name_model = 'ipca_filter_normalized_log1p_for_embedding_10504_features_normalized_log1p_capped_100comp' # name of the PCA model of the ELDB
name_col_filter_ft = "filter_pca_filter_normalized_log1p_for_embedding_10504_features_normalized_log1p_capped_100comp" # name of the filter containing genes for the PCA model
ram.ft.change_filter( name_col_filter_ft ) # select the features
# set the suffix of the run
str_suffix_new = '_aligned'
str_suffix = '_filter_normalized_log1p_for_embedding_10504_features_normalized_log1p_capped_100comp'
name_col_pca = f'X_pca{str_suffix}{str_suffix_new}' # new PCA column name that will contains PBMC 3k's PCA values, calculated with a iPCA model of the ELDB
"""
Get PCA values
"""
ram.bc.change_filter( name_col_bc_all ) # select filtered cells
model = ram.apply_pca(
name_model = name_model,
name_layer = 'normalized_log1p_capped',
name_col = name_col_pca,
int_n_components_in_a_chunk = 20,
int_num_threads = 5,
int_index_component_reference = 0, # currently, index 0 contains the reference RamData, the ELDB (or a subset of ELDB).
)
2023-11-18 21:27:27,356 [aiobotocore.credentials] <INFO> (load) - Found credentials in shared credentials file: ~/.aws/credentials 2023-11-18 21:27:43,862 [SC-Elephant] <INFO> (layer) - loading a layer 'normalized_log1p_capped' of RamData component 0 2023-11-18 21:27:52,014 [SC-Elephant] <INFO> (layer) - 'normalized_log1p_capped' layer has been loaded 2023-11-18 21:27:52,016 [SC-Elephant] <INFO> (layer) - loading a layer 'normalized_log1p_capped' of RamData component 1 2023-11-18 21:27:52,108 [SC-Elephant] <INFO> (layer) - 'normalized_log1p_capped' layer has been loaded 2023-11-18 21:27:52,216 [SC-Elephant] <INFO> (layer) - 'normalized_log1p_capped' layer has been loaded 2023-11-18 21:28:02,258 [SC-Elephant] <INFO> (load_str) - completed loading of 10504 number of strings 2023-11-18 21:28:07,877 [SC-Elephant] <INFO> (load_str) - completed loading of 10504 number of strings 100 PCs from 10504 features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 169553230/169553230 [00:07<00:00, 22353973.89it/s]
"""
co-embedding of new barcodes
"""
str_num_subsampled_cells = "160000" if flag_use_full_ELDB else '20480000' # during embedding process, align the cells of interest to a subset of cells of the ELDB
name_model = f"knnindex__filtered_barcodes_subsampled_{str_num_subsampled_cells}_entries_clus_info__X_pca_filter_normalized_log1p_for_embedding_10504_features_normalized_log1p_capped_100comp__100comp__30_neighbors" # kNN graph for embedding the cells of interest to ELDB
name_col_filter_bc_subsampled = f'filtered_barcodes_subsampled_{str_num_subsampled_cells}_entries_clus_info' # name of the column containing the filter of subsampled cells
name_umap = 'X_umap_filter_normalized_log1p_for_embedding_10504_features_normalized_log1p_capped_100comp_filtered_barcodes_subsampled_18097526_entries_30knn100comp_scanpy_spectral' # name of the column containing UMAP coordinates
name_umap_combined = f'{name_umap}_combined' # the output column name that will contain the UMAP coordinates of the new cells
ram.bc.change_filter( name_col_filter_bc_subsampled )
ram.bc.meta[ name_umap_combined ] = ram.bc.meta[ name_umap ] # copy original values
# perform embedding, while retrieving primary & secondary neighbors
ram.bc.change_filter( name_col_bc_all )
ram.apply_knn(
name_model,
name_col_x = name_col_pca,
name_col_y_input = name_umap,
name_col_y_output = name_umap_combined,
axis = 'barcodes',
operation = 'embedder',
float_std_ratio_for_outlier_detection = 0.03,
name_col_filter_neighbors_of_the_query = 'filter_barcodes_aligned',
int_index_component_reference = 0,
)
# retrieve primary neighbors only
ram.bc.change_filter( name_col_bc_all )
ram.apply_knn(
name_model,
name_col_x = name_col_pca,
name_col_y_input = name_umap,
name_col_y_output = name_umap_combined,
axis = 'barcodes',
operation = 'embedder',
float_std_ratio_for_outlier_detection = 0.03,
name_col_filter_neighbors_of_the_query = 'filter_barcodes_aligned_primary',
flag_include_secondary_neighbors_of_the_query = False,
int_index_component_reference = 0,
)
2023-11-18 21:32:49,923 [root] <INFO> (apply_knn) - [Info] [RamData.apply_label] the nearest-neighbor search started 2023-11-18 21:32:49,925 [SC-Elephant] <INFO> (apply_knn) - Starting kNN-based embedder operation using 259736 entries in the index embedder using 259736 entries: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2695/2695 [00:01<00:00, 1780.74it/s] 2023-11-18 21:32:54,281 [root] <INFO> (apply_knn) - [Info] [RamData.apply_label] the nearest-neighbor search started 2023-11-18 21:32:54,282 [SC-Elephant] <INFO> (apply_knn) - Starting kNN-based embedder operation using 259736 entries in the index embedder using 259736 entries: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2695/2695 [00:01<00:00, 1827.30it/s]
"""
Deep-learning based annotation prediction - using the cell types annotations from the Extended PBMC3k dataset
"""
name_anno_predicted = f'extended_pbmc3k_predicted_cell_type' # the output column names
name_anno = f'metadata__pbmc3k_alignment_primary_and_secondary_neighbors__cell_type' # a column name containing the annotation data that were used to train the Deep Learning model
# copy original values
ram.bc.filter = None # copy all values without lazy loading
ram.bc.meta[ name_anno_predicted ] = ram.bc.meta[ name_anno ] # copy original values
ram.bc.filter = ram.bc.select_component( 1 ) & ram.bc.get_filter( name_col_bc_all ) # select the component 1
ram.apply_dl(
name_model = f'metadata__pbmc3k_alignment_primary_and_secondary_neighbors__cell_type.classifier',
name_col_x = name_col_pca,
name_col_y = name_anno_predicted,
operation = 'classifier',
flag_apply_to_entries_used_for_training = True,
int_num_entries_in_a_batch = 500000,
int_num_threads = 5
)
2023-11-18 21:33:05.493838: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-11-18 21:33:05.646905: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2023-11-18 21:33:05.683942: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered 2023-11-18 21:33:06.508717: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/mpi/gcc-9.3.0/openmpi-4.1.2/lib:/usr/local/cuda-11.6/lib64: 2023-11-18 21:33:06.508812: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/mpi/gcc-9.3.0/openmpi-4.1.2/lib:/usr/local/cuda-11.6/lib64: 2023-11-18 21:33:06.508819: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly. 2023-11-18 21:33:10.576809: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected 2023-11-18 21:33:10.576854: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: node211 2023-11-18 21:33:10.576862: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: node211 2023-11-18 21:33:10.577048: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 510.108.3 2023-11-18 21:33:10.577072: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 510.108.3 2023-11-18 21:33:10.577078: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 510.108.3 2023-11-18 21:33:10.577521: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags. 2023-11-18 21:33:10,938 [SC-Elephant] <INFO> (save_model) - metadata__pbmc3k_alignment_primary_and_secondary_neighbors__cell_type.classifier|deep_learning.keras.classifier model saved. 2023-11-18 21:33:10,941 [SC-Elephant] <INFO> (apply_dl) - [RamData.apply_dl] applying deep-learning model started deep-learning classifier: 0%| | 0/2695 [00:00<?, ?it/s]
85/85 [==============================] - 0s 2ms/step
deep-learning classifier: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2695/2695 [00:00<00:00, 3363.64it/s]
'''
set filter of the external data (dataset of interest)
'''
# set filter
ram.bc.filter = ( ram.bc.select_component( 0 ) & ram.bc.get_filter( f'filtered_barcodes_subsampled_{str_num_subsampled_cells}_entries_clus_info' ) ) | ( ram.bc.select_component( 1 ) & ram.bc.get_filter( name_col_bc_all ) )
ram.ft.load_str( )
# save as a filter
ram.bc.save_as_filter(
ram.bc.select_component( 1 ) & ram.bc.get_filter( name_col_bc_all ),
'filter_external_data'
)
"""
plot co-embedded cells
"""
# settings
# ram.int_index_component_reference = None # in order to retrieve gene expression data from ELDB, set index to the reference RamData to None (if ram.int_index_component_reference == 0, only PBMC3k dataset's count data will be retrieved)
ram.ft.filter = None # retrieve expression of any genes
# set filter
ram.bc.filter = ( ram.bc.select_component( 0 ) & ram.bc.get_filter( f'filtered_barcodes_subsampled_{str_num_subsampled_cells}_entries_clus_info' ) ) | ( ram.bc.select_component( 1 ) & ram.bc.get_filter( name_col_bc_all ) ) # for RamData-0 (ELDB), select the subsampled cells. for RamData-1 (PBMC3k), select all filtered cells
# define a list of genes of interest
dict_marker_genes = {
'Proliferating' : [ 'MKI67' ],
'T-cell' : [ 'CD3D' ],
'CD4 T-cell' : [ 'CD4' ],
'CD8 T-cell' : [ 'CD8A', 'CD8B' ],
'Naive T / TCM' : [ 'CCR7', 'SELL' ],
'Effector Phenotype' : [ 'FAS', 'CD44', 'IL2RG', 'IL7R', 'CD69' ],
'TREG' : [ 'FOXP3', 'IL2RA' ],
'MAIT' : [ 'KLRB1', 'SLC4A10' ],
'SOX4+ CD4 T' : [ 'SOX4' ],
'gamma delta T' : [ 'TNFSF13B' ],
'NK' : [ 'GNLY', 'GZMB' ],
'NK Recruiting (CD56 bright)' : [ 'XCL1', 'GZMK' ],
'B cell' : [ 'MS4A1', 'CD79A' ],
'B memory' : [ 'CD27' ],
'Plasma cell' : [ 'JCHAIN', 'TXNDC5' ],
'Monocyte' : [ 'CD86' ],
'Classical Monocyte' : [ 'CD14' ],
'Non-classical Monocyte' : [ 'FCGR3A' ],
'Macrophage' : [ 'CD68' ],
'Macrophage (M1)' : [ 'CD86', 'FCGR2A', ],
'Macrophage (M2)' : [ 'CD163', 'MSR1' ],
'Dendritic' : [ 'FCER1A' ],
"cDC1" : [ 'THBD', 'CLEC9A' ],
'cDC2' : [ 'CD1C', 'SIRPA' ],
'pDC' : [ 'CLEC4C', 'IL3RA', 'LILRA4' ],
'Platelet' : [ 'ITGA2B' ],
'HPSC' : [ 'CD34' ],
}
l_marker_genes = [ ] # collect the marker genes
for e in dict_marker_genes :
l_marker_genes += dict_marker_genes[ e ]
adata = ram[
'normalized_log1p_capped',
:,
[ "str", 'raw_sum', 'filter_external_data', name_anno_predicted, name_anno_ori, { name_umap_combined } ],
l_marker_genes,
[ 'str' ]
] # for detailed descriptions, please see "Tutorial 1) Processing and analysis of the 3k PBMCs dataset using SC-Elephant.""
adata.obsm[ 'X_umap' ] = adata.obsm[ name_umap_combined ]
adata.obs[ 'filter_external_data' ] = adata.obs[ 'filter_external_data' ].astype( str ).astype( object )
sc.pl.umap( adata, color = 'filter_external_data', size = 1 )
"""
plot predicted annotations
"""
sc.pl.umap( adata, color = name_anno_ori, size = 5 ) # cell types annotations from the Extended PBMC3k dataset
sc.pl.umap( adata, color = name_anno_predicted, size = 5 ) # cell types annotations from the Extended PBMC3k dataset
2023-11-18 21:33:13,362 [SC-Elephant] <INFO> (load_str) - completed loading of 10504 number of strings 2023-11-18 21:33:28,991 [SC-Elephant] <INFO> (load_str) - completed loading of 262431 number of strings 2023-11-18 21:33:35,612 [SC-Elephant] <INFO> (load_str) - completed loading of 62914 number of strings ... storing 'filter_external_data' as categorical
"""
Compare original annotations and predicted annotations using a deep-learning model from ELDB
"""
def Shankey_Compare_Annotations( l_anno_1, l_anno_2, int_min_num_entries_for_an_overlap : int = 3, flag_show_label : bool = True, font_size : int = 10, title : str = '', dict_color : dict = dict( ), fig_height : int = 850 ) :
""" # 2023-03-05 16:32:49
draw a Shankey diagram using Plotly for the given lists of annotations
l_anno_1 # first list of annotations
l_anno_2 # second list of annotations
int_min_num_entries_for_an_overlap : int = 3 # the minmum number of entries for a link (overlaps between two annotations) to be valid.
title : Union[ None, str ] = None # the name of the figure. if None is given, no title will be shown
font_size : int = 10, # the font size of the title
dict_color : dict = dict( ), # name of the label to color mapping
fig_height # height of the output graph
"""
import numpy as np
def _map( arr_anno, start_pos : int = 0 ) :
"""
return a dictionary for mapping annotation to its integer representation and a list of unique annotation labels
"""
l_anno_unique = el.bk.LIST_COUNT( arr_anno, duplicate_filter = None ).index.values # retrieve a list of unique annotations
return dict( ( e, i + start_pos ) for i, e in enumerate( l_anno_unique ) ), l_anno_unique
dict_map_1, arr_anno_unique_1 = _map( l_anno_1, start_pos = 0 )
dict_map_2, arr_anno_unique_2 = _map( l_anno_2, start_pos = len( dict_map_1 ) )
label = list( arr_anno_unique_1 ) + list( arr_anno_unique_2 ) if flag_show_label else None # compose a list of unique labels # does not show labels if 'flag_show_label' is False
# retrieve values for drawing the diagram
source, target, value = el.bk.LIST_COUNT( np.array( [ list( dict_map_1[ e ] for e in l_anno_1 ), list( dict_map_2[ e ] for e in l_anno_2 ) ], dtype = int ).T, duplicate_filter = int_min_num_entries_for_an_overlap ).reset_index( drop = False ).values.T
# compose a dataframe
# draw a plot
import plotly.graph_objects as go
fig = go.Figure( data = [ go.Sankey(
node = dict(
pad = 15,
thickness = 20,
line = dict( color = "black", width = 0.5 ),
label = label,
color = list( dict_color[ e ] if e in dict_color else 'blue' for e in label ),
),
link = dict(
source = source,
target = target,
value = value
))])
if title is not None :
fig.update_layout( title_text = title, font_size = font_size, height = fig_height )
return fig
def Get_dict_color( adata, name_col : str ) :
''' get name_category > color mapping of a given column in the .obs of a given AnnData '''
try :
return dict( ( n, c ) for n, c in zip( adata.obs[ name_col ].cat.categories, adata.uns[ name_col + '_colors' ] ) )
except :
return dict( )
# define the folder containing the data
path_folder_graph = f'output/graph/'
os.makedirs( path_folder_graph, exist_ok = True ) # create the output folder
# cell types annotations from the Extended PBMC3k dataset
l_anno_1, l_anno_2 = adata.obs.loc[ adata.obs[ 'filter_external_data' ] == 'True', [ name_anno_ori, name_anno_predicted ] ].values.T
fig = Shankey_Compare_Annotations( l_anno_1, l_anno_2, int_min_num_entries_for_an_overlap = 3, flag_show_label = True, title = name_anno_predicted, dict_color = { ** Get_dict_color( adata, name_anno_ori ), ** Get_dict_color( adata, name_anno_predicted ) } )
fig.show( )
fig.write_html( f"{path_folder_graph}plotly.shankey_comparison.{name_anno_ori}.{name_anno_predicted}.html" )
''' subset PBMC 3k dataset '''
# select only the cells of the PBMC 3k dataset
adata_subset = adata[ adata.obs.filter_external_data == 'True' ]
# add the number of 'PBMC 3k' cells predicted to each cell type label
dict_mapping = dict( ( e, f"{e} ({c:,} cells)" ) for e, c in el.bk.LIST_COUNT( adata_subset.obs[ name_anno_predicted ], duplicate_filter = None ).reset_index( ).values )
name_anno_predicted_for_plotting = f"{name_anno_predicted}_with_numbers"
adata_subset.obs[ name_anno_predicted_for_plotting ] = list( dict_mapping[ e ] for e in adata_subset.obs[ name_anno_predicted ].values )
''' show dotplot '''
sc.pl.dotplot( adata_subset, dict_marker_genes, groupby = name_anno_predicted_for_plotting, figsize = ( 15, 8 ) ) # draw a dot plot using the count data of the PBMC 3k dataset
... storing 'extended_pbmc3k_predicted_cell_type_with_numbers' as categorical
ram