Tutorial 5: Identifying FVGs in 10X Visium DLPFC dataset¶

In this tutorial, we show how to apply scSTADE to identify FVGs on 10X Visium DLPFC dataset. As a example, we analyse the Visium dataset.

The source code package is freely available at https://github.com/cuiyaxuan/scSTADE/tree/master. The datasets used in this study can be found at https://drive.google.com/drive/folders/1H-ymfCqlDR1wpMRX-bCewAjG5nOrIF51?usp=sharing.

First, cd /home/.../scSTADE_FVG_Functions/div>

# we execute the FVG model in the R environment.
conda create -n r4
source activate r4

conda search r-base
conda install r-base=4.2.0
conda install conda-forge::r-seurat==4.4.0
conda install conda-forge::r-hdf5r
onda install r-devtools

Using R virtual environment with conda。

library("devtools")
devtools::install_github("shaoqiangzhang/DEGman")

install.packages("hdf5r")
install.packages('philentropy')
install.packages('dplyr')
install.packages('foreach')
install.packages('parallel')
install.packages('doParallel')

# You can find the estimate clustering labels and the running results in the scSTADE/scSTADE_FVG_Functions/.

# You can run "Rscript step1_run_DEG.R"
# =========================
# 1. Load libraries
# =========================
source("functions_DEG.R")
library(DEGman)
library(Seurat)
library(dplyr)
library(hdf5r)
library(philentropy)
library(foreach)
library(doParallel)
library(doSNOW)



# =========================
# 3. Load data
# =========================
hc1 <- Read10X_h5(
  "/home/cuiyaxuan/spatialLIBD/151673/151673_filtered_feature_bc_matrix.h5" #### to your path and project name
)

label <- read.csv(
  "/home/cuiyaxuan/metric_change/revise_R2/est_151673/conlabel.csv", #### to your path of cluster label
  header = TRUE,
  row.names = 1
)

# =========================
# 4. Preprocess (once)
# =========================
matlist <- prep_matlist(hc1, label)
k <- length(matlist)    # number of clusters

# =========================
# 5. Parallel + progress bar
# =========================
ncore <- min(k, parallel::detectCores() - 1)
cl <- makeCluster(ncore)
registerDoSNOW(cl)

pb <- txtProgressBar(min = 0, max = k, style = 3)
progress <- function(n) setTxtProgressBar(pb, n)
opts <- list(progress = progress)

files <- foreach(
  n = 1:k,
  .packages = c("DEGman"),
  .options.snow = opts
) %dopar% {
  run_one_cluster(matlist, n)
}

close(pb)
stopCluster(cl)

# =========================
# 6. Done
# =========================

#################################Functionally variable genes weight compute################################


# You can run "Rscript step2_run_FVG.R"

source("functions_FVG.R")

library(tidyverse)

rounded_number <- 3

fre <- list.files(
  path = ".",
  pattern = "\\.csv$",
  full.names = TRUE
) %>%
  map_dfr(read.csv) %>%        # 自动 rbind
  count(x) %>%                 # table(vec1$x)
  filter(n >= rounded_number) %>%
  pull(x)

write.csv(fre, "df1.csv", row.names = FALSE)


# ==============================================================================
# File: main.R
# Description: Main execution script for FVG calculation
# Usage: Make sure functions_FVG.R is in the same directory before running
# ==============================================================================

library(Seurat)
library(dplyr)
library(hdf5r)
library(foreach)
library(parallel)
library(doParallel)

# ==============================================================================
# 1. Set paths and load data
# ==============================================================================
h5_path <- "/home/cuiyaxuan/spatialLIBD/151673/151673_filtered_feature_bc_matrix.h5"  #### to your path and project name
spatial_path <- "/home/cuiyaxuan/spatialLIBD/151673/spatial/tissue_positions_list.csv"  #### to your path and project name
gene_list_path <- "df1.csv"

cat("Loading data...\n")
hc1 <- Read10X_h5(h5_path)
tissue_local <- read.csv(spatial_path, row.names = 1, header = FALSE)

# ==============================================================================
# 2. Seurat preprocessing
# ==============================================================================
pbmc <- CreateSeuratObject(counts = hc1, project = "HC_1", min.cells = 10)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 30000)

# ==============================================================================
# 3. Data alignment
# ==============================================================================
common_cells <- intersect(colnames(pbmc), rownames(tissue_local))
cat(sprintf("Common cells found: %d\n", length(common_cells)))

pbmc <- subset(pbmc, cells = common_cells)
tissue_local <- tissue_local[common_cells, ]

# Load target gene list
df1 <- read.csv(gene_list_path, header = TRUE)
target_genes <- df1[, 1]

# Keep genes present in expression matrix
valid_genes <- intersect(target_genes, rownames(pbmc))

# Extract expression matrix (cells × genes)
mat_sparse <- GetAssayData(pbmc, layer = "data")[valid_genes, ]
dat1 <- as.matrix(t(mat_sparse))

# Extract spatial coordinates (first two columns)
x_y_list <- tissue_local[, 1:2]

if (!all(rownames(dat1) == rownames(x_y_list))) {
  stop("Data alignment error!")
}

# ==============================================================================
# 4. Parallel computation
# ==============================================================================
n_cores <- 24
cat(sprintf("Starting parallel processing with %d cores...\n", n_cores))

cls <- makeCluster(n_cores)
registerDoParallel(cls)

# Export required objects to worker nodes
clusterExport(cls, c("cripar_opt", "getmode", "x_y_list"))

crinum <- foreach(
  gene_vec = iter(dat1, by = "col"),
  .combine = "c",
  .packages = "stats"
) %dopar% {
  cripar_opt(gene_vec, x_y_list, Ccri = 10, highval = 500, lowval = 50) # Users can freely adjust the following three parameters: Ccri = 10, highval = 500, and lowval = 50.
}

stopCluster(cls)

# ==============================================================================
# 5. Save results
# ==============================================================================
result_df <- data.frame(gene = valid_genes, crinum = crinum)

# Merge with original gene list (keep original order)
final_res <- merge(df1, result_df,
                   by.x = names(df1)[1],
                   by.y = "gene",
                   all.x = TRUE)

fvg_sort <- final_res[order(-final_res$crinum), ]
write.csv(fvg_sort, "fvg.csv", row.names = FALSE)

cat("Done! Results saved to 'fvg.csv'.\n")

/home/cuiyaxuan/anaconda3/envs/pytorch/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
/home/cuiyaxuan/anaconda3/envs/pytorch/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call .var_names_make_unique.
  utils.warn_names_duplicates("var")
/home/cuiyaxuan/anaconda3/envs/pytorch/lib/python3.8/site-packages/scanpy/readwrite.py:413: DtypeWarning: Columns (1,2,3,4,5) have mixed types. Specify dtype option on import or set low_memory=False.
  positions = pd.read_csv(files['tissue_positions_file'], header=None)