上一節我們研究了如何對單樣本進行分析,這節我們就著重來研究一下如何對多樣本整合進行研究分析!
1. 導入相關包
library(Seurat)
library(tidyverse)
library(patchwork)
2. 數據準備
# 導入單樣本文件
dir = c('~/Desktop/diversity intergration/scRNA_26-0_filtered_feature_bc_matrix','~/Desktop/diversity intergration/scRNA_27-2_filtered_feature_bc_matrix','~/Desktop/diversity intergration/scRNA_28-0_filtered_feature_bc_matrix'
)
2.1 方法一
# 導入數據
counts <- Read10X(data.dir = dir)
# 轉換為seurat對象
scRNA1 <- CreateSeuratObject(counts = counts,min.cells = 3,min.features = 200)
dim(scRNA1)
# 查看每個樣本的細胞
table(scRNA1@meta.data$orig.ident)
# healthy1 healthy2 healthy3
# 6037 6343 11657
2.2 方法二
scRNAlist <- list() # 創建一個空的列表,其中包含三個seurat對象
for (i in 1:length(dir)){counts <- Read10X(data.dir = dir[i])scRNAlist[[i]] <- CreateSeuratObject(counts = counts,min.cells = 3,min.features = 200)scRNAlist[[i]] <- RenameCells(scRNAlist[[i]],add.cell.id = sample_name[i])
}
3. 合并數據
scRNA <- merge(scRNAlist[[1]],y = c(scRNAlist[[2]],scRNAlist[[3]]))
dim(scRNA)
# [1] 18037 24037
4. 數據標準化和選擇高變基因
# 每一個樣本分別進行數據標準化和提取高變基因
for (i in 1:length(scRNAlist)){scRNAlist[[i]] <- NormalizeData(scRNAlist[[i]])scRNAlist[[i]] <- FindVariableFeatures(scRNAlist[[i]],selection.method = 'vst',nfeatures = 2000) # 這里我們只需要2000的基因
}
5. 提取錨點
# 以variableFeatures為基礎尋找錨點
scRNA.anchors <- FindIntegrationAnchors(object.list = scRNAlist)
dim(scRNA.anchors)
6. 利用錨點整合數據
# 利用錨點整合數據
scRNA.integrated <- IntegrateData(anchorset = scRNA.anchors)
dim(scRNA.integrated)
# [1] 2000 24037
7. 數據縮放
# 設置默認的分析方法為integrated
# ScaleData 函數的作用是對數據進行標準化或歸一化處理,以確保不同特征之間具有可比性,并減少數據的偏差和方差。通過縮放數據,可以使后續的分析更加穩定和可靠。
DefaultAssay(scRNA.integrated) <- 'integrated'
scRNA.integrated <-ScaleData(scRNA.integrated,verbose = FALSE)
# scRNA.integrated 對象中的數據將被縮放,并可用于后續的分析步驟,如降維、聚類、差異表達分析等。
8. PCA降維
scRNA.integrated <- RunPCA(scRNA.integrated,features=VariableFeatures(scRNA.integrated))
8.1 熱圖
DimPlot(scRNA.integrated,reduction = 'pca',group.by = 'orig.ident')
8.2 肘圖
ElbowPlot(scRNA.integrated,ndims = 30,reduction = 'pca')
# 從圖中可以看出我們最好應該選擇前20個維度的數據
9. 細胞聚類
scRNA <- FindNeighbors(scRNA.integrated,dims = 1:20)
scRNA <- FindClusters(scRNA,resolution = 0.1) # 0.5聚類的結果太多
table(scRNA@meta.data$seurat_clusters)
metadata <- scRNA@meta.data
# 單獨將數據提取出來
cell_cluster <- data.frame(cell_ID=rownames(metadata),cluster_ID=metadata$seurat_clusters)
10. 降維
10.1 umap降維
scRNA <- RunUMAP(scRNA,dims = 1:20)
embed_umap <- Embeddings(scRNA,'umap')
10.1.1 group_by_cluster
DimPlot(scRNA,reduction = 'umap')
10.1.2 group_by_sample
DimPlot(scRNA,reduction = 'umap',group.by = 'orig.ident')
10.2 tsne降維
scRNA <- RunTSNE(scRNA,dims = 1:20)
embed_tsne <- Embeddings(scRNA,'tsne')
10.2.1 group_by_cluster
DimPlot(scRNA,reduction = 'tsne')
10.2.2 group_by_sample
DimPlot(scRNA,reduction = 'umap',group.by = 'orig.ident')
10.3 umap和tsne的探究
UMAP(Uniform Manifold Approximation and Projection)和 t-SNE(t-Distributed Stochastic Neighbor Embedding)都是用于降維和數據可視化的技術,它們有一些區別和聯系:
區別:
- 算法原理:UMAP 和 t-SNE 采用了不同的數學方法來實現降維和可視化。UMAP 基于流形學習的思想,試圖保持數據的全局結構和局部鄰域關系;而 t-SNE 則主要關注數據點之間的局部相似性,并將高維數據映射到低維空間中,使得相似的數據點在低維空間中更接近。
- 結果解釋:UMAP 通常能夠更好地保持數據的全局結構,對于具有復雜拓撲結構的數據可能更有效;而 t-SNE 更擅長捕捉數據的局部結構和聚類信息,但可能對全局結構的保持相對較弱。
- 計算效率:UMAP 在處理大規模數據集時通常比 t-SNE 更高效,因為它的計算復雜度較低。
- 參數調整:UMAP 和 t-SNE 都有一些參數需要調整,例如鄰居數量、 perplexity 等。然而,UMAP 對參數的選擇相對較不敏感,而 t-SNE 的結果可能對參數的選擇更為敏感。
聯系:
- 數據可視化:UMAP 和 t-SNE 都可以用于將高維數據可視化在低維空間中,幫助人們理解數據的分布和結構。
- 探索性數據分析:它們都可以作為探索性數據分析的工具,幫助發現數據中的模式、聚類和異常值。
- 與其他分析結合:UMAP 和 t-SNE 的結果可以與其他分析方法結合使用,例如聚類分析、分類器等,以進一步挖掘數據的信息。
11. 質控
# 切換數據集
DefaultAssay(scRNA) <- 'RNA'
# 計算線粒體和紅細胞的基因比例
scRNA[['percent.mt']] <- PercentageFeatureSet(scRNA,pattern = 'MT-')# 通常是指與血紅蛋白(Hemoglobin)相關的基因
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ")
HB_m <- match(HB.genes, rownames(scRNA@assays$RNA)) # 匹配已經擁有的基因,返回一個含有下標的向量
HB.genes <- rownames(scRNA@assays$RNA)[HB_m]
scRNA[["percent.HB"]]<-PercentageFeatureSet(scRNA, features=HB.genes)##meta.data添加信息
proj_name <- data.frame(proj_name=rep("demo2",ncol(scRNA)))
rownames(proj_name) <- row.names(scRNA@meta.data)
scRNA <- AddMetaData(scRNA, proj_name)
11.1 可視化
# 繪制小提琴圖
# 所有樣本一個小提琴圖用group.by="proj_name",每個樣本一個小提琴圖用group.by="orig.ident"
plot7 <-VlnPlot(scRNA, group.by = "proj_name", raster=FALSE,features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), pt.size = 0, #不需要顯示點,可以設置pt.size = 0
) + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) # 將x軸的標題,文本和刻度線都設置為空,這樣x軸就不會顯示任何內容
plot1 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 <- FeatureScatter(scRNA, feature1 = "nCount_RNA", feature2 = "percent.HB")
pearplot <- CombinePlots(plots = list(plot1, plot2), nrow=1, legend="none")
pearplot
11.2 去除極端細胞
# 去除細胞特征過高和過低的細胞
scRNA <- subset(scRNA, subset = nFeature_RNA > 200 & nFeature_RNA < 4000 & percent.mt < 10)
12. 歸一化
# 數據歸一化
scRNA <- NormalizeData(scRNA, normalization.method = "LogNormalize", scale.factor = 10000)
13. 鑒定高變基因
# 鑒定高變基因
# 這一步的目的是鑒定出細胞與細胞之間表達相差很大的基因,用于后續鑒定細胞類型
# 我們使用默認參數,用vst方法選出2000個高變基因
scRNA <- FindVariableFeatures(scRNA,selection.method = 'vst',nfeatures = 2000)
dim(scRNA) # 但是這里跑程序的時候基因的數量不對,還沒有找到原因
# [1] 18037 21402# 前十個高變基因
top10 <- head(VariableFeatures(scRNA),10)
top10
# [1] "PTGDS" "S100A9" "S100A8" "CST3" "TRBV11-2" "HLA-DQA1" "HLA-DRA" "C1QA" "LILRA4" "LYZ"
可視化
plot1 <- VariableFeaturePlot(scRNA)
plot2 <- LabelPoints(plot = plot1,points = top10,repel = TRUE,xnudge = 0,ynudge = 0)
14. 細胞注釋
Idents(scRNA) <- 'integrated_snn_res.0.5'
plot3 = DimPlot(scRNA, reduction = "umap", label=T)
# 鑒定細胞類型
# 為了后續分析的方便,我們先用singleR來預測每個cluster的細胞類型
library(celldex)
library(SingleR)
cg <- ImmGenData() # 選定一個參考集數據,ImmGenData是一個免疫細胞的數據集
cellpred <- SingleR(test=testdata,ref=cg, labels=cg$label.main)
table(cellpred$labels) # 看看都注釋到了哪些細胞
# B cells Basophils Endothelial cells Eosinophils Epithelial cells
# 20337 1 224 595 31
# Fibroblasts ILC Macrophages Mast cells
# 3 205 5 1
# 由得到的結果可以看出,用SingerR注釋的結果太拉胯了,雖然手動注釋比較麻煩,但是為了數據的準確性還是手動注釋吧cellType=data.frame(seurat=scRNA@meta.data$seurat_clusters,predict=cellpred$labels)
table(cellType[,1:2]) # 訪問celltyple的2~3列####細胞注釋
scRNA.sub <- subset(scRNA, idents = c(1,3,4,7), invert = TRUE) # 挑選需要的簇
scRNA.sub
new.cluster.ids <- c("1" = "CD4 T","2" = "Monocyte","3" = "CD4 T","4" = "NK","5" = "Monocyte","6" = "CD8 T","7" = "B","8" = "CD8 T","9" ="Monocyte","11"="NK","13" = "Monocyte","14" = "Monocyte","16" = "CD4 T","17" = "DC","18" = "B","19"="NK","20"="Monocyte","21"="DC","22" = "B","25"="NK"
)
scRNA.sub <- RenameIdents(scRNA.sub, new.cluster.ids)
options(repr.plot.height = 6, repr.plot.width = 8)
scRNA.sub$cell_type <- Idents(scRNA.sub)
Idents(scRNA.sub) <- "cell_type"
DimPlot(scRNA.sub, reduction = "umap", label = TRUE,raster=FALSE)