1 Abstract

Here we mainly show how to perform umap in R with proteomics/TMT data and some comparisons between PCA and umap, as well as some tweaks of the parameters.

2 What is umap

Uniform Manifold Approximation and Projection (UMAP) is an newly developed algorithm for dimensional reduction. UMAP is good for large scale data, as well as traditional small dataset. More and more publications have used umap to replace PCA. Generally umap performs better than PCA.

reference1

There are 3 r packages for UMAP umap and uwot, umapr

umapr needs python. developed in 2018, discontinued and archived on github.
umap has two integrated function, one is python, but the default is r, no python needed. For more usage and details, check the vignettes. Easy to use.
uwot is still actively maintained. the author is co-author fo the second version of the umap publication. it is a full implementation of the python version in R.

3 Preparation

umapr is not supported and not used here.

Both umap and uwot are availabe at CRAN, and will be used here.

test TMT data can be downloaded from here:

meta.txt

proteinGroups.txt

peptide.txt

## install the two package, only run for the first time
install.packages("umap")
install.packages("uwot")

library(umap)
library(uwot)
library(ggpubr)
library(tidyverse)

4 iris data test

4.1 input format

First thing first, the data matrix is supposed to be a wide table instead of long table, which means that features are columns, samples are rows. The usual protein/peptide tables needs to be transposed before analysis

4.2 parameter

All using default for fair comparison

4.3 PCA

pca_iris <- stats::prcomp(iris[, -5], retx = TRUE, rank. = 2)

df <- data.frame(pca_iris$x, Species = iris$Species)
plotly::ggplotly(ggpubr::ggscatter(df, x = "PC1", y = "PC2", color = "Species") %>%  ggpubr::ggpar(legend = "right", title = "PCA of iris"))

4.4 using umap::umap

NOTE! the result is in $layout

iris.umap <- umap::umap(iris[, -5])

df <- data.frame(iris.umap$layout, Species = iris$Species)
plotly::ggplotly(ggpubr::ggscatter(df, x = "X1", y = "X2", color = "Species") %>%  ggpubr::ggpar(legend = "right", title = "umap::umap of iris"))

4.5 using uwot::umap

value is a matrix of optimized coordinates

iris_uwot_umap <- uwot::umap(iris[, -5])
df <- data.frame(iris_uwot_umap, Species = iris$Species)
plotly::ggplotly(ggpubr::ggscatter(df, x = "X1", y = "X2", color = "Species") %>%  ggpubr::ggpar(legend = "right", title = "umap::umap of iris"))

5 TMT result from Maxquant

5.1 protein level

meta <- rio::import("IRI_MQ/meta.txt")

MQ_IRI <- rio::import("IRI_MQ/proteinGroups_report.txt")
MQ_IRI <- t(MQ_IRI[,grep("Intensity ",colnames(MQ_IRI))])

# check the meta format
knitr::kable(head(meta))

label	sample	plate	row	plate_row	chanel
Intensity_R_TMT_Plate1_A_TMT10plex126C	R	Plate1	A	Plate1_A	126C
Intensity_S23_TMT_Plate1_A_TMT10plex127N	S23	Plate1	A	Plate1_A	127N
Intensity_S24_TMT_Plate1_A_TMT10plex127C	S24	Plate1	A	Plate1_A	127C
Intensity_A5-PBS2_TMT_Plate1_A_TMT10plex128N	A5-PBS2	Plate1	A	Plate1_A	128N
Intensity_S26_TMT_Plate1_A_TMT10plex128C	S26	Plate1	A	Plate1_A	128C
Intensity_S27_TMT_Plate1_A_TMT10plex129N	S27	Plate1	A	Plate1_A	129N

5.1.1 using raw intensity

5.1.1.1 PCA

pca <- stats::prcomp(MQ_IRI, retx = TRUE, rank. = 2, center = FALSE)

df_pca <- data.frame(pca$x, meta)
plotly::ggplotly(ggpubr::ggscatter(df_pca, x = "PC1", y = "PC2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "PCA (raw intensity)"))

5.1.1.2 umap::umap

umap <- umap:::umap(MQ_IRI)
df_umap <- data.frame(umap$layout, meta)
ggpubr::ggscatter(df_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "UMAP (raw intensity)")

5.1.1.3 uwot:umap

uwot_umap <- uwot::umap(MQ_IRI)
df_uwot_umap <- data.frame(uwot_umap,  meta)
plotly::ggplotly(ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "UWOT::UMAP (raw intensity)"))

5.1.2 using log10(intensity)

MQ_IRI <- log10(MQ_IRI+1)

5.1.2.1 PCA

pca <- stats::prcomp(MQ_IRI, retx = TRUE, rank. = 2, center = FALSE)

df_pca <- data.frame(pca$x, meta)
plotly::ggplotly(ggpubr::ggscatter(df_pca, x = "PC1", y = "PC2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "PCA (log10 raw intensity)"))

5.1.2.2 umap::umap

umap <- umap:::umap(MQ_IRI)
df_umap <- data.frame(umap$layout, meta)
ggpubr::ggscatter(df_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "UMAP (log10 raw intensity)")

5.1.2.3 uwot:umap

uwot_umap <- uwot::umap(MQ_IRI)
df_uwot_umap <- data.frame(uwot_umap,  meta)
plotly::ggplotly(ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "UWOT::UMAP (log10 raw intensity)"))

5.2 peptide level

MQ_IRI <- rio::import("IRI_MQ/peptides_report.txt")
MQ_IRI <- t(MQ_IRI[,grep("Intensity ",colnames(MQ_IRI))])

5.2.1 using raw intensity

5.2.1.1 PCA

pca <- stats::prcomp(MQ_IRI, retx = TRUE, rank. = 2, center = FALSE)

df_pca <- data.frame(pca$x, meta)
plotly::ggplotly(ggpubr::ggscatter(df_pca, x = "PC1", y = "PC2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "PCA (raw intensity)"))

5.2.1.2 umap::umap

umap <- umap:::umap(MQ_IRI)
df_umap <- data.frame(umap$layout, meta)
ggpubr::ggscatter(df_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "UMAP (raw intensity)")

5.2.1.3 uwot:umap

uwot_umap <- uwot::umap(MQ_IRI)
df_uwot_umap <- data.frame(uwot_umap,  meta)
plotly::ggplotly(ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "UWOT::UMAP (raw intensity)"))

5.2.2 using log10(intensity)

MQ_IRI <- log10(MQ_IRI+1)

5.2.2.1 PCA

pca <- stats::prcomp(MQ_IRI, retx = TRUE, rank. = 2, center = FALSE)

df_pca <- data.frame(pca$x, meta)
plotly::ggplotly(ggpubr::ggscatter(df_pca, x = "PC1", y = "PC2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "PCA (log10 raw intensity)"))

5.2.2.2 umap::umap

umap <- umap:::umap(MQ_IRI)

## Warning: failed creating initial embedding; using random embedding instead

df_umap <- data.frame(umap$layout, meta)
ggpubr::ggscatter(df_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "UMAP (log10 raw intensity)")

5.2.2.3 uwot:umap

uwot_umap <- uwot::umap(MQ_IRI)
df_uwot_umap <- data.frame(uwot_umap,  meta)
plotly::ggplotly(ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "right", title = "UWOT::UMAP (log10 raw intensity)"))

6 PCA tweak

some parameters of PCA are worth to tweak, like center and scale. we use the log10 peptide intensity data for test, as we can see that scale does kind of centering, therefore at least one set as true will help the separation

Remeber to remove all 0 features first, otherwise not able to do scale

MQ_IRI <- rio::import("IRI_MQ/peptides_report.txt")
MQ_IRI <- MQ_IRI %>% select(starts_with("Intensity ")) %>% t() %>% as.data.frame() %>% select(where(~ sum(.) != 0))
MQ_IRI <- log10(MQ_IRI+1)

pca <- stats::prcomp(MQ_IRI, retx = TRUE, center = FALSE, scale = FALSE)
df_pca <- data.frame(pca$x, meta)
p1 <- ggpubr::ggscatter(df_pca, x = "PC1", y = "PC2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "center:FALSE  scale:FALSE")

pca <- stats::prcomp(MQ_IRI, retx = TRUE, center = TRUE, scale = FALSE)
df_pca <- data.frame(pca$x, meta)
p2 <- ggpubr::ggscatter(df_pca, x = "PC1", y = "PC2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "center:TRUE  scale:FALSE")

pca <- stats::prcomp(MQ_IRI, retx = TRUE, center = FALSE, scale = TRUE)
df_pca <- data.frame(pca$x, meta)
p3 <- ggpubr::ggscatter(df_pca, x = "PC1", y = "PC2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "center:FLASE  scale:TRUE")

pca <- stats::prcomp(MQ_IRI, retx = TRUE, center = TRUE, scale = TRUE)
df_pca <- data.frame(pca$x, meta)
p4<- ggpubr::ggscatter(df_pca, x = "PC1", y = "PC2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "center:TRUE  scale:TRUE")


ggarrange(p1, p2, p3,p4, ncol = 2, nrow = 2)

7 uwot::umap tweak

Deliberately we choose a raw protein intensity which performs worst from the above check for the test.

There are lots of parameters to modulate

umap can also be used ina supervised way by defining y

7.1 neighbour

setup n_neighbors

MQ_IRI <- rio::import("IRI_MQ/proteinGroups_report.txt")
MQ_IRI <- MQ_IRI %>% select(starts_with("Intensity ")) %>% t() %>% as.data.frame() %>% select(where(~ sum(.) != 0))
#MQ_IRI <- log10(MQ_IRI+1)


uwot_umap <- uwot::umap(MQ_IRI,n_neighbors = 5)
df_uwot_umap <- data.frame(uwot_umap,  meta)
p1 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "neighbors = 5")

uwot_umap <- uwot::umap(MQ_IRI,n_neighbors = 10)
df_uwot_umap <- data.frame(uwot_umap,  meta)
p2 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "neighbors = 10")


uwot_umap <- uwot::umap(MQ_IRI,n_neighbors = 15)
df_uwot_umap <- data.frame(uwot_umap,  meta)
p3 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "neighbors = 15 (default)")

uwot_umap <- uwot::umap(MQ_IRI,n_neighbors = 20)
df_uwot_umap <- data.frame(uwot_umap,  meta)
p4 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "neighbors = 20")


uwot_umap <- uwot::umap(MQ_IRI,n_neighbors = 40)
df_uwot_umap <- data.frame(uwot_umap,  meta)
p5 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "neighbors = 40")

uwot_umap <- uwot::umap(MQ_IRI,n_neighbors = 60)
df_uwot_umap <- data.frame(uwot_umap,  meta)
p6 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "neighbors = 60")

ggarrange(p1, p2, p3,p4,p5,p6, ncol = 2, nrow = 3)

7.2 metric

uwot_umap <- uwot::umap(MQ_IRI,metric = "euclidean")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p1 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "metric: euclidean(default)")

uwot_umap <- uwot::umap(MQ_IRI,metric = "cosine")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p2 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "metric: cosine")


uwot_umap <- uwot::umap(MQ_IRI,metric = "manhattan")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p3 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "metric: manhattan")

uwot_umap <- uwot::umap(MQ_IRI,metric = "hamming")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p4 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "metric: hamming")


uwot_umap <- uwot::umap(MQ_IRI,metric = "correlation")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p5 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "metric: correlation")


ggarrange(p1, p2, p3,p4,p5, ncol = 2, nrow = 3)

8 init

uwot_umap <- uwot::umap(MQ_IRI,init = "spectral")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p1 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "init: spectral(default)")

uwot_umap <- uwot::umap(MQ_IRI,init = "normlaplacian")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p2 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "init: normlaplacian")


uwot_umap <- uwot::umap(MQ_IRI,init = "random")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p3 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "init: random")

uwot_umap <- uwot::umap(MQ_IRI,init = "laplacian")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p4 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "init: laplacian")


uwot_umap <- uwot::umap(MQ_IRI,init = "pca")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p5 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "init: pca")

uwot_umap <- uwot::umap(MQ_IRI,init = "spca")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p6 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "init: spca")


uwot_umap <- uwot::umap(MQ_IRI,init = "agspectral")
df_uwot_umap <- data.frame(uwot_umap,  meta)
p7 <-ggpubr::ggscatter(df_uwot_umap, x = "X1", y = "X2", color = "plate_row") %>%  ggpubr::ggpar(legend = "none", title = "init: agspectral")

ggarrange(p1, p2, p3,p4,p5,p6,p7, ncol = 2, nrow = 4)

9 sessionInfo

sessionInfo()

## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Canada.1252  LC_CTYPE=English_Canada.1252   
## [3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C                   
## [5] LC_TIME=English_Canada.1252    
## system code page: 65001
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.0   stringr_1.4.0   dplyr_1.0.2     purrr_0.3.4    
##  [5] readr_1.4.0     tidyr_1.1.2     tibble_3.0.4    tidyverse_1.3.0
##  [9] ggpubr_0.4.0    ggplot2_3.3.2   uwot_0.1.10     Matrix_1.2-18  
## [13] umap_0.2.7.0   
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.5.0          lubridate_1.7.9.2 RcppAnnoy_0.0.18  httr_1.4.2       
##  [5] tools_4.0.3       backports_1.2.0   R6_2.5.0          irlba_2.3.3      
##  [9] DBI_1.1.0         lazyeval_0.2.2    colorspace_2.0-0  withr_2.3.0      
## [13] tidyselect_1.1.0  curl_4.3          compiler_4.0.3    cli_2.2.0        
## [17] rvest_0.3.6       Cairo_1.5-12.2    xml2_1.3.2        plotly_4.9.2.1   
## [21] labeling_0.4.2    scales_1.1.1      askpass_1.1       digest_0.6.27    
## [25] foreign_0.8-80    rmarkdown_2.5     rio_0.5.16        pkgconfig_2.0.3  
## [29] htmltools_0.5.1.1 dbplyr_2.0.0      highr_0.8         htmlwidgets_1.5.2
## [33] rlang_0.4.11      readxl_1.3.1      rstudioapi_0.13   FNN_1.1.3        
## [37] generics_0.1.0    farver_2.0.3      jsonlite_1.7.1    crosstalk_1.1.0.1
## [41] zip_2.1.1         car_3.0-10        magrittr_2.0.1    Rcpp_1.0.5       
## [45] munsell_0.5.0     fansi_0.4.1       abind_1.4-5       reticulate_1.18  
## [49] lifecycle_0.2.0   stringi_1.5.3     yaml_2.2.1        carData_3.0-4    
## [53] grid_4.0.3        crayon_1.3.4      lattice_0.20-41   cowplot_1.1.0    
## [57] haven_2.3.1       hms_0.5.3         knitr_1.30        pillar_1.4.7     
## [61] ggsignif_0.6.0    codetools_0.2-18  reprex_0.3.0      glue_1.4.2       
## [65] evaluate_0.14     data.table_1.14.0 modelr_0.1.8      vctrs_0.3.5      
## [69] cellranger_1.1.0  gtable_0.3.0      openssl_1.4.3     assertthat_0.2.1 
## [73] xfun_0.19         openxlsx_4.2.3    broom_0.7.2       RSpectra_0.16-0  
## [77] rstatix_0.6.0     viridisLite_0.3.0 ellipsis_0.3.1

UMAP VS PCA

Suggestions to ningzhibin@gmail.com

Report generated @2021-07-15 10:35:53