Spleen Dataset
[1]:
# Environment setup for the tutorial. Statement order matters in this cell:
# the environment variables below are assigned before `import torch` runs.
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")
import os
import sys
# Put <current working directory>/DREAM_stage1 at the front of the import path.
# Presumably this is where `notebook_pipeline` lives, so the notebook has to be
# started from the directory that contains DREAM_stage1 -- TODO confirm.
sys.path.insert(0, os.path.join(os.getcwd(), 'DREAM_stage1'))
# Limit OpenMP and MKL to one thread each.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
# cuBLAS workspace configuration; the PyTorch reproducibility notes list
# ":4096:8" as one of the accepted values for deterministic cuBLAS behaviour.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
import torch
# One intra-op thread, and ask PyTorch to use deterministic algorithms.
# NOTE(review): no RNG seed is set in this cell; seeding presumably happens
# inside the pipeline via cfg.seed -- confirm.
torch.set_num_threads(1)
torch.use_deterministic_algorithms(True)
# Pipeline entry points, used one per step in the cells that follow.
from notebook_pipeline import (
SpleenTutorialConfig,
prepare_adata,
attach_pseudo_labels,
build_model_and_optimizer,
train_embedding,
cluster_and_report,
)
print("Tutorial environment initialized")
Tutorial environment initialized
[2]:
# Machine-specific locations. Set SPLEEN_DATA_DIR / SPLEEN_DNN_MODEL in the
# environment to point the tutorial at your own files without editing the
# notebook; when they are unset the original literal paths are used unchanged.
DATA_DIR = os.environ.get("SPLEEN_DATA_DIR", '/home//Data/')
DNN_MODEL_PATH = os.environ.get("SPLEEN_DNN_MODEL", '/home//Result/Spleen/DNN_model.pth')

# Tutorial configuration: only the fields below are set explicitly; every other
# field keeps the default defined by SpleenTutorialConfig.
cfg = SpleenTutorialConfig(
    data_dir=DATA_DIR,
    dnn_model=DNN_MODEL_PATH,
    batches=['BALBc-1', 'BALBc-2', 'BALBc-3'],
    epochs=1000,
    n_cluster=4,
)
# Echo the fully resolved configuration so the run is self-describing.
print(cfg)
SpleenTutorialConfig(data_dir='/home/zhangdaijun/Data/scNiche_data/', batches=['BALBc-1', 'BALBc-2', 'BALBc-3'], dnn_model='/home/zhangdaijun/Code/spatialID-main/result/Spleen/best_DNN_model_all.pth', seed=2025, n_layers=4, agg_method='Mean', prune_long_links=False, model_name='Muti', gae_dim=[128, 64], dae_dim=[128, 64], feat_dim=64, include_cat_covariates_contrastive_loss=False, epochs=1000, optimizer='Adam', use_dnn=True, lr=0.001, attr_loss_weight=1.0, bottleneck=False, n_attributes=1, edge_weight=True, kd_T=1, w_dae=1.0, w_gae=1.0, n_cluster=4, batch_size=4096, weight_decay=0.0001, scheduler_step=20, device=device(type='cuda'))
Step 1: Load and preprocess data from multiple batches
[3]:
# Build the AnnData-style object for the configured batches.
adata = prepare_adata(cfg)

# Quick sanity summary: overall shape first, then the batch categories found.
print(f"adata shape: {adata.shape}")
batch_categories = adata.obs['batch'].cat.categories
print(f"batches: {batch_categories.tolist()}")
100%|██████████| 5/5 [00:00<00:00, 12.91it/s]
100%|██████████| 5/5 [00:00<00:00, 13.90it/s]
100%|██████████| 5/5 [00:00<00:00, 13.95it/s]
adata shape: (244233, 30)
batches: ['BALBc-1', 'BALBc-2', 'BALBc-3']
Step 2: Generate pseudo-labels using the DNN
[4]:
# Called for its effect on `adata`; the return value is not used.
attach_pseudo_labels(adata, cfg)

# Preview the batch label next to the pseudo label for the first rows.
preview_columns = ['batch', 'pseudo_class']
adata.obs[preview_columns].head()
[4]:
| batch | pseudo_class | |
|---|---|---|
| BALBc_1_Cell1 | BALBc-1 | CD106(+)CD16/32(+)CD31(-)Ly6C(-) stroma |
| BALBc_1_Cell2 | BALBc-1 | ERTR7(+) stroma |
| BALBc_1_Cell3 | BALBc-1 | ERTR7(+) stroma |
| BALBc_1_Cell4 | BALBc-1 | CD106(+)CD16/32(+)CD31(-)Ly6C(-) stroma |
| BALBc_1_Cell5 | BALBc-1 | ERTR7(+) stroma |
Step 3: Build model and optimizer
[5]:
# Construct the network together with its optimizer and LR scheduler.
# NOTE(review): `scheduler` is not passed to any later cell shown here --
# confirm whether the training step is expected to use it.
model, optimizer, scheduler = build_model_and_optimizer(adata, cfg)

# Show which model class was instantiated.
print(type(model).__name__)
cat_covariates_keys: ['batch']
CATEGORICAL COVARIATES EMBEDDINGS INJECTION -> ['decoder']
self.include_cat_covariates_contrastive_loss False
['decoder']
Decoder embedding effective!
SpatialModel_cov
Step 4: Train the model and write back the latent representations
[6]:
# Run training; the returned history exposes the recorded losses under 'losses'.
history = train_embedding(model, optimizer, adata, cfg)

# Report how many loss values were recorded and the last one.
loss_curve = history['losses']
print(f"Training epochs: {len(loss_curve)}")
print(f"Final total loss: {loss_curve[-1]:.4f}")
Training Epoch: 100%|██████████| 1000/1000 [15:01<00:00, 1.11it/s]
Training epochs: 1000
Final total loss: 148503168.0000
Step 5: GMM clustering and ARI/NMI summary per batch
[7]:
# Keyword arguments for the clustering report: batch list, cluster count and
# seed come from the config; predictions are stored/read under "GM" and
# compared against the "Compartment" annotation.
report_options = dict(
    batches=cfg.batches,
    n_cluster=cfg.n_cluster,
    seed=cfg.seed,
    pred_key="GM",
    truth_key="Compartment",
)
metrics_df = cluster_and_report(adata, **report_options)

# Last expression of the cell: display the metrics table.
metrics_df
[7]:
| batch | ari | nmi | |
|---|---|---|---|
| 0 | BALBc-1 | 0.619094 | 0.592073 |
| 1 | BALBc-2 | 0.657278 | 0.618800 |
| 2 | BALBc-3 | 0.564992 | 0.564693 |