Spleen Dataset

[1]:
import os
import sys
import warnings

# Silence every warning category so library chatter does not clutter the
# tutorial output.
warnings.filterwarnings("ignore")

# Put the DREAM_stage1 directory (resolved against the current working
# directory) at the front of the import path. notebook_pipeline is imported
# further down -- presumably from that directory; TODO confirm.
sys.path.insert(0, os.path.join(os.getcwd(), 'DREAM_stage1'))

# These variables are exported *before* torch is imported, as in the original
# ordering. The thread caps pin OpenMP/MKL to one thread; the cuBLAS workspace
# setting is the one PyTorch documents as required for deterministic cuBLAS
# behaviour on CUDA.
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
        "CUBLAS_WORKSPACE_CONFIG": ":4096:8",
    }
)

import torch

# Request deterministic algorithm implementations and a single intra-op
# CPU thread for reproducible runs.
torch.use_deterministic_algorithms(True)
torch.set_num_threads(1)

from notebook_pipeline import (
    SpleenTutorialConfig,
    prepare_adata,
    attach_pseudo_labels,
    build_model_and_optimizer,
    train_embedding,
    cluster_and_report,
)

print("Tutorial environment initialized")
Tutorial environment initialized
[2]:
# Machine-specific locations. Each one can be overridden through an
# environment variable so the notebook can run on another machine without
# editing this cell; when the variable is unset, the original hard-coded
# path is used unchanged.
# NOTE(review): the recorded output of this cell shows different data_dir /
# dnn_model values than the literals below -- re-run the cell to refresh it.
DATA_DIR = os.environ.get("SPLEEN_DATA_DIR", '/home//Data/')
DNN_MODEL_PATH = os.environ.get(
    "SPLEEN_DNN_MODEL", '/home//Result/Spleen/DNN_model.pth'
)

# Only the fields below are set explicitly; every other field keeps the
# default defined by SpleenTutorialConfig.
cfg = SpleenTutorialConfig(
    data_dir=DATA_DIR,
    dnn_model=DNN_MODEL_PATH,
    batches=['BALBc-1', 'BALBc-2', 'BALBc-3'],
    epochs=1000,
    n_cluster=4,
)

# Printing the config shows the full set of fields, including the defaults.
print(cfg)
SpleenTutorialConfig(data_dir='/home/zhangdaijun/Data/scNiche_data/', batches=['BALBc-1', 'BALBc-2', 'BALBc-3'], dnn_model='/home/zhangdaijun/Code/spatialID-main/result/Spleen/best_DNN_model_all.pth', seed=2025, n_layers=4, agg_method='Mean', prune_long_links=False, model_name='Muti', gae_dim=[128, 64], dae_dim=[128, 64], feat_dim=64, include_cat_covariates_contrastive_loss=False, epochs=1000, optimizer='Adam', use_dnn=True, lr=0.001, attr_loss_weight=1.0, bottleneck=False, n_attributes=1, edge_weight=True, kd_T=1, w_dae=1.0, w_gae=1.0, n_cluster=4, batch_size=4096, weight_decay=0.0001, scheduler_step=20, device=device(type='cuda'))

Step 1: Load and preprocess the multi-batch data

[3]:
# Build the combined data object from the configuration.
adata = prepare_adata(cfg)

# Quick sanity check: overall shape first, then the batch categories found
# in adata.obs['batch'].
print(f"adata shape: {adata.shape}")
batch_categories = adata.obs['batch'].cat.categories
print(f"batches: {batch_categories.tolist()}")
100%|██████████| 5/5 [00:00<00:00, 12.91it/s]
100%|██████████| 5/5 [00:00<00:00, 13.90it/s]
100%|██████████| 5/5 [00:00<00:00, 13.95it/s]
adata shape: (244233, 30)
batches: ['BALBc-1', 'BALBc-2', 'BALBc-3']

Step 2: Generate pseudo-labels using the DNN

[4]:
# The return value is not used; the 'pseudo_class' column read just below is
# expected to be present on adata.obs after this call.
attach_pseudo_labels(adata, cfg)

# Preview the first rows of the batch label next to the pseudo label.
preview_columns = ['batch', 'pseudo_class']
adata.obs[preview_columns].head()
[4]:
batch pseudo_class
BALBc_1_Cell1 BALBc-1 CD106(+)CD16/32(+)CD31(-)Ly6C(-) stroma
BALBc_1_Cell2 BALBc-1 ERTR7(+) stroma
BALBc_1_Cell3 BALBc-1 ERTR7(+) stroma
BALBc_1_Cell4 BALBc-1 CD106(+)CD16/32(+)CD31(-)Ly6C(-) stroma
BALBc_1_Cell5 BALBc-1 ERTR7(+) stroma

Step 3: Build model and optimizer

[5]:
# NOTE(review): `scheduler` is returned here but is not passed to
# train_embedding in Step 4 -- confirm whether the learning-rate schedule is
# applied elsewhere or is intentionally unused.
model, optimizer, scheduler = build_model_and_optimizer(adata, cfg)

# Show which model class was instantiated.
model_class = model.__class__
print(model_class.__name__)

cat_covariates_keys: ['batch']
CATEGORICAL COVARIATES EMBEDDINGS INJECTION -> ['decoder']
self.include_cat_covariates_contrastive_loss False
['decoder']
Decoder embedding effective!
SpatialModel_cov

Step 4: Train and write back latent representations

[6]:
# Run training; `history` is indexed by 'losses' below.
history = train_embedding(model, optimizer, adata, cfg)

# Summarise the run: number of recorded loss values and the last one.
# NOTE(review): the recorded final loss is on the order of 1e8 -- confirm the
# loss scaling is as intended.
losses = history['losses']
print(f"Training epochs: {len(losses)}")
print(f"Final total loss: {losses[-1]:.4f}")
Training Epoch: 100%|██████████| 1000/1000 [15:01<00:00,  1.11it/s]
Training epochs: 1000
Final total loss: 148503168.0000

Step 5: GMM clustering and ARI/NMI summary per batch

[7]:
# Evaluation settings: batch list, cluster count and seed come from the
# config; the prediction and ground-truth keys are fixed for this dataset.
report_options = {
    "batches": cfg.batches,
    "n_cluster": cfg.n_cluster,
    "seed": cfg.seed,
    "pred_key": "GM",
    "truth_key": "Compartment",
}
metrics_df = cluster_and_report(adata, **report_options)

# Display the per-batch metrics table as the cell's rich output.
metrics_df

[7]:
batch ari nmi
0 BALBc-1 0.619094 0.592073
1 BALBc-2 0.657278 0.618800
2 BALBc-3 0.564992 0.564693