v2023-11-20
To view all the HTML components of this Jupyter Notebook, please view Tutorial 3) Combine 10x MEX count matrices memory-efficiently using SC-Elephant
"""
Reset the tutorial
"""
# delete the intermediate output folders to reset the tutorials
# NOTE: this permanently removes the whole "combine_mtx/" folder (resolved relative to the current working directory),
# i.e. the downloaded data and every output written by the cells below
!rm -rf "combine_mtx/"
"""
Download multiple datasets and create output folders
"""
# create one data folder per dataset
!mkdir -p combine_mtx/data/neurons_cell_900
!mkdir -p combine_mtx/data/neurons_cell_2000
!mkdir -p combine_mtx/data/neuron_cell_9k
!mkdir -p combine_mtx/data/neuron_nuclei_2k
!mkdir -p combine_mtx/data/neuron_nuclei_900
# download the filtered gene-barcode matrix archive of each dataset into its folder (-O sets the local file name)
!wget https://cf.10xgenomics.com/samples/cell-exp/2.1.0/neurons_900/neurons_900_filtered_gene_bc_matrices.tar.gz -O combine_mtx/data/neurons_cell_900/neurons_cell_900.tar.gz
!wget https://cf.10xgenomics.com/samples/cell-exp/2.1.0/neurons_2000/neurons_2000_filtered_gene_bc_matrices.tar.gz -O combine_mtx/data/neurons_cell_2000/neurons_cell_2000.tar.gz
!wget https://cf.10xgenomics.com/samples/cell-exp/2.1.0/neuron_9k/neuron_9k_filtered_gene_bc_matrices.tar.gz -O combine_mtx/data/neuron_cell_9k/neuron_cell_9k.tar.gz
!wget https://cf.10xgenomics.com/samples/cell-exp/2.1.0/nuclei_2k/nuclei_2k_filtered_gene_bc_matrices.tar.gz -O combine_mtx/data/neuron_nuclei_2k/neuron_nuclei_2k.tar.gz
!wget https://cf.10xgenomics.com/samples/cell-exp/2.1.0/nuclei_900/nuclei_900_filtered_gene_bc_matrices.tar.gz -O combine_mtx/data/neuron_nuclei_900/neuron_nuclei_900.tar.gz
# for each dataset: extract the archive, gzip every file in the extracted mm10/ folder,
# then rename genes.tsv.gz to features.tsv.gz
# NOTE(review): the rename presumably matches the file name expected by the MTX functions used below - confirm
!cd combine_mtx/data/neurons_cell_900/ && tar -xzf neurons_cell_900.tar.gz && cd filtered_gene_bc_matrices/mm10/ && gzip * && mv genes.tsv.gz features.tsv.gz
!cd combine_mtx/data/neurons_cell_2000/ && tar -xzf neurons_cell_2000.tar.gz && cd filtered_gene_bc_matrices/mm10/ && gzip * && mv genes.tsv.gz features.tsv.gz
!cd combine_mtx/data/neuron_cell_9k/ && tar -xzf neuron_cell_9k.tar.gz && cd filtered_gene_bc_matrices/mm10/ && gzip * && mv genes.tsv.gz features.tsv.gz
!cd combine_mtx/data/neuron_nuclei_2k/ && tar -xzf neuron_nuclei_2k.tar.gz && cd filtered_gene_bc_matrices/mm10/ && gzip * && mv genes.tsv.gz features.tsv.gz
!cd combine_mtx/data/neuron_nuclei_900/ && tar -xzf neuron_nuclei_900.tar.gz && cd filtered_gene_bc_matrices/mm10/ && gzip * && mv genes.tsv.gz features.tsv.gz
# create the folder that will hold the RamData output
!mkdir -p combine_mtx/output
--2023-11-19 18:33:35-- https://cf.10xgenomics.com/samples/cell-exp/2.1.0/neurons_900/neurons_900_filtered_gene_bc_matrices.tar.gz Resolving cf.10xgenomics.com (cf.10xgenomics.com)... 104.18.0.173, 104.18.1.173, 2606:4700::6812:ad, ... Connecting to cf.10xgenomics.com (cf.10xgenomics.com)|104.18.0.173|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 8969223 (8.6M) [application/x-tar] Saving to: ‘combine_mtx/data/neurons_cell_900/neurons_cell_900.tar.gz’ combine_mtx/data/ne 100%[===================>] 8.55M 2.72MB/s in 3.1s 2023-11-19 18:33:39 (2.72 MB/s) - ‘combine_mtx/data/neurons_cell_900/neurons_cell_900.tar.gz’ saved [8969223/8969223] --2023-11-19 18:33:39-- https://cf.10xgenomics.com/samples/cell-exp/2.1.0/neurons_2000/neurons_2000_filtered_gene_bc_matrices.tar.gz Resolving cf.10xgenomics.com (cf.10xgenomics.com)... 104.18.1.173, 104.18.0.173, 2606:4700::6812:1ad, ... Connecting to cf.10xgenomics.com (cf.10xgenomics.com)|104.18.1.173|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 16189449 (15M) [application/x-tar] Saving to: ‘combine_mtx/data/neurons_cell_2000/neurons_cell_2000.tar.gz’ combine_mtx/data/ne 100%[===================>] 15.44M 10.8MB/s in 1.4s 2023-11-19 18:33:41 (10.8 MB/s) - ‘combine_mtx/data/neurons_cell_2000/neurons_cell_2000.tar.gz’ saved [16189449/16189449] --2023-11-19 18:33:41-- https://cf.10xgenomics.com/samples/cell-exp/2.1.0/neuron_9k/neuron_9k_filtered_gene_bc_matrices.tar.gz Resolving cf.10xgenomics.com (cf.10xgenomics.com)... 104.18.1.173, 104.18.0.173, 2606:4700::6812:1ad, ... Connecting to cf.10xgenomics.com (cf.10xgenomics.com)|104.18.1.173|:443... connected. HTTP request sent, awaiting response... 
200 OK Length: 72358212 (69M) [application/x-tar] Saving to: ‘combine_mtx/data/neuron_cell_9k/neuron_cell_9k.tar.gz’ combine_mtx/data/ne 100%[===================>] 69.01M 7.01MB/s in 22s 2023-11-19 18:34:04 (3.18 MB/s) - ‘combine_mtx/data/neuron_cell_9k/neuron_cell_9k.tar.gz’ saved [72358212/72358212] --2023-11-19 18:34:04-- https://cf.10xgenomics.com/samples/cell-exp/2.1.0/nuclei_2k/nuclei_2k_filtered_gene_bc_matrices.tar.gz Resolving cf.10xgenomics.com (cf.10xgenomics.com)... 104.18.1.173, 104.18.0.173, 2606:4700::6812:1ad, ... Connecting to cf.10xgenomics.com (cf.10xgenomics.com)|104.18.1.173|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 5676940 (5.4M) [application/x-tar] Saving to: ‘combine_mtx/data/neuron_nuclei_2k/neuron_nuclei_2k.tar.gz’ combine_mtx/data/ne 100%[===================>] 5.41M 3.38MB/s in 1.6s 2023-11-19 18:34:07 (3.38 MB/s) - ‘combine_mtx/data/neuron_nuclei_2k/neuron_nuclei_2k.tar.gz’ saved [5676940/5676940] --2023-11-19 18:34:07-- https://cf.10xgenomics.com/samples/cell-exp/2.1.0/nuclei_900/nuclei_900_filtered_gene_bc_matrices.tar.gz Resolving cf.10xgenomics.com (cf.10xgenomics.com)... 104.18.1.173, 104.18.0.173, 2606:4700::6812:1ad, ... Connecting to cf.10xgenomics.com (cf.10xgenomics.com)|104.18.1.173|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 8557322 (8.2M) [application/x-tar] Saving to: ‘combine_mtx/data/neuron_nuclei_900/neuron_nuclei_900.tar.gz’ combine_mtx/data/ne 100%[===================>] 8.16M 6.76MB/s in 1.2s 2023-11-19 18:34:09 (6.76 MB/s) - ‘combine_mtx/data/neuron_nuclei_900/neuron_nuclei_900.tar.gz’ saved [8557322/8557322]
"""
Import necessary packages
"""
# do not use GPUs (only inference will be performed, so CPUs are sufficient)
# NOTE: CUDA_VISIBLE_DEVICES is set before the imports below, presumably so that it is already in effect when they load - keep this order
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# import scelephant (aliased as `el`), its RamData class, and scanpy
import scelephant as el
from scelephant import RamData
import scanpy as sc
# set scanpy figure parameters
sc.set_figure_params( dpi = 200, figsize = ( 6, 5 ), frameon = True )
# plotly export settings: use the "notebook" renderer by default
import plotly.io as pio
pio.renderers.default = "notebook"
"""
configure the jupyter notebook environment
"""
import matplotlib as mpl

# adjust the jupyter notebook cell width
el.bk.Wide(100)

# set image resolution
mpl.rcParams["figure.dpi"] = 100

# load memory profiler (left disabled)
# %load_ext memory_profiler
''' define inputs and output '''
# 10x MEX folders to combine, one folder per downloaded dataset
l_path_folder_mtx_input = [
    "combine_mtx/data/neurons_cell_900/filtered_gene_bc_matrices/mm10/",
    "combine_mtx/data/neurons_cell_2000/filtered_gene_bc_matrices/mm10/",
    "combine_mtx/data/neuron_cell_9k/filtered_gene_bc_matrices/mm10/",
    "combine_mtx/data/neuron_nuclei_2k/filtered_gene_bc_matrices/mm10/",
    "combine_mtx/data/neuron_nuclei_900/filtered_gene_bc_matrices/mm10/",
]

# folder of the combined matrix
path_folder_mtx_output = "combine_mtx/data/combined_mtx/"
''' add a prefix to the barcodes so that barcodes are unique across the datasets '''
# Prefix every barcode with "<name_dataset>-", where name_dataset is the folder name matched by the '*' wildcard.
# Each row yielded by `.values` is unpacked as (string matched by the wildcard, matched folder path);
# the folder path is expected to end with '/', since the file name is appended to it directly.
# NOTE(review): presumably barcodes.tsv.gz is rewritten in place, so re-running this cell without
# resetting the tutorial would add the prefix a second time - confirm.
for name_dataset, path_folder_mtx in el.bk.GLOB_Retrive_Strings_in_Wildcards( 'combine_mtx/data/*/filtered_gene_bc_matrices/mm10/' ).values :
    el.MTX_10X_Barcode_add_prefix_or_suffix( f"{path_folder_mtx}barcodes.tsv.gz", barcode_prefix = f"{name_dataset}-" )
''' combine matrices '''
# Combine every input matrix folder; `path_folder_mtx_output` is passed first, followed by the unpacked input folders.
# Since no cells are shared across the datasets (and barcodes are unique across the datasets),
# it is safe to turn on the low-memory option.
el.MTX_10X_Combine(
    path_folder_mtx_output,
    *l_path_folder_mtx_input,
    flag_low_memory_mode_because_there_is_no_shared_cell_between_mtxs=True,
)
# remove the RamData output folder (if it exists) before it is created below
!rm -rf 'combine_mtx/output/mouse_brain_5_datasets_combined.ram/'
# initialize a pool of managed operators
# (this pool of managed operators will be used throughout the tutorials)
# NOTE(review): 8 is presumably the number of operators in the pool - confirm
fop = el.managers.FileSystemOperatorPool(8)

# create RamData from the 10X-formatted MTX folder of the combined matrix
el.create_ramdata_from_mtx(
    path_folder_mtx_10x_input="combine_mtx/data/combined_mtx/",
    path_folder_ramdata_output="combine_mtx/output/mouse_brain_5_datasets_combined.ram/",
    file_system_operator_pool=fop,
)
# open the RamData object stored in the output folder, reusing the shared operator pool `fop`
ram = RamData(
    "combine_mtx/output/mouse_brain_5_datasets_combined.ram/",
    # increase the batch size for better performance
    int_total_weight_for_each_batch=350_000,
    int_num_cpus=8,
    file_system_operator_pool=fop,
    # You can set it to False for better performance, but when this RamData object is being
    # modified by other researchers, setting it to True is highly recommended to avoid collisions
    flag_enable_synchronization_through_locking=False,
)
2023-11-19 18:39:04,551 [SC-Elephant] <INFO> (layer) - 'raw' layer has been loaded
# reset the barcode filter (presumably so that no barcode is excluded from the load below)
ram.bc.filter = None
# load string representations of all barcodes
arr_str_bc = ram.bc.load_str()
# annotate each barcode with its dataset name: the part of the barcode before the first '-',
# i.e. the "<name_dataset>-" prefix added to the barcodes before the matrices were combined
ram.bc.meta["name_dataset"] = [e.split("-", 1)[0] for e in arr_str_bc]
2023-11-19 18:39:04,717 [SC-Elephant] <INFO> (load_str) - completed loading of 15387 number of strings
# evaluate `ram` as the last expression of the cell so that the notebook displays its representation
ram