1 Abstract

It is a frequently asked question that how many replicates do we need? (hidden message is to pick up enough/all significantly changed proteins/features)

This document is to simulate that, trying to answer this question.

Readers can use this function to see effects of some other parameters on the number of replicates with different change (mean difference)

2 Simulation function

Idea:

Simulate the proteomics result from a two-samples experiment with defined number of replicates.
Most of the proteins/peptides will be nonchangeing backgroud, sampled from a large normal distributed pool.
A fraction of the proteins/peptides are significantly changed between the two samples, with defined meand difference.
with SD set 1, it is easy to estimate the difference between control and sample
pvalue is calculated based on two-sample t-test, while the qvalue here is the FDR adjusted p-value
In the output figure, the y is normalized to percentage, for easy comparison

Default parameters:

pool_sample_size = 100000 # the size of background pool, be bigger the better, but the slower
mean_difference = 3 # the difference between control and sample, with SD set 1, the mean of difference can be used to interpretate the amplitude of the change, 1 is the same as to sd, while 2 means 2 folds of sd.
sd = 1 #the sd of the sample, leave it as default, unless you know what you are doing
number_features_nochange = 1000 # as name says, the number of background, no change
number_features_changed = 200 # the number of changed features
number_of_replicate = 20 # number of replicate in the experiment, 20 should be good enough for most simulation to see the plateu
number_of_permutation = 20 # number of permutation to see the mean and sd of the simulation

library(ggplot2)
library(reshape2)
library(tidyverse)
library(plotly)



simulation_function<- function(pool_sample_size = 100000,
                    mean_difference  = 3,
                    sd = 1,
                    number_features_nochange = 1000,
                    number_features_changed = 200,
                    number_of_replicate = 20,
                    number_of_permutation = 20){
  
  
  # initialize
  summary_list <- list()
  
  for(n_rep in 2: number_of_replicate){
    #print(paste("analyzing experiment with replicate number:", n_rep))
    p_value_list <- list()
    q_value_list <- list()
    
    for(perm in 1:number_of_permutation){
      
      # generate sampling data
      pool_normal_1 <- rnorm(pool_sample_size, mean = 0, sd =sd)
      pool_normal_2 <- rnorm(pool_sample_size, mean = mean_difference, sd =sd)
      
      
      # background_ctrl, from pool_normal_1
      matrix_background_control <- as.data.frame(matrix(sample(pool_normal_1, n_rep* number_features_nochange, replace = TRUE), ncol =  n_rep))
      colnames(matrix_background_control) <- paste0(rep("ctr_rep", n_rep), 1:n_rep)
      
      
      # background_sample from pool_normal_1
      matrix_background_sample <- as.data.frame(matrix(sample(pool_normal_1, n_rep* number_features_nochange, replace = TRUE), ncol =  n_rep))
      colnames(matrix_background_sample) <- paste0(rep("sample_rep", n_rep), 1:n_rep)
      
      
      # control
      matrix_control <- as.data.frame(matrix(sample(pool_normal_1, n_rep* number_features_changed, replace = TRUE), ncol =  n_rep))
      colnames(matrix_control) <- paste0(rep("ctr_rep", n_rep), 1:n_rep)
      
      matrix_control <- rbind(matrix_control, matrix_background_control)
      
      # sample
      matrix_sample <- as.data.frame(matrix(sample(pool_normal_2, n_rep* number_features_changed, replace = TRUE), ncol =  n_rep))
      colnames(matrix_sample) <- paste0(rep("sample_rep", n_rep), 1:n_rep)
      
      matrix_sample <- rbind(matrix_sample, matrix_background_sample)
      
      
      # combine into one matrix
      sample_combined <- cbind(matrix_control, matrix_sample)
      #sample_compare_df <- data.frame(control, sample)
      #sample_compare_df_melt <- melt(sample_compare_df)
      
      p_value <- apply(sample_combined,1,function(x) t.test(x[1:n_rep],x[1:n_rep + n_rep])$p.value)
      q_value <- p.adjust(p_value, "fdr")
      
      
      p_value_list[[perm]] <- p_value
      q_value_list[[perm]] <- q_value
      
    }
    
    
    # format the pvalue and qvalue into data.frame and melt
      # p_value_combined <- data.frame(p_value_list)
      # names(p_value_combined) <- paste0( "perm", 1:perm)
      # q_value_combined <- data.frame(q_value_list)
      # names(q_value_combined) <- paste0( "perm", 1:perm)
    
    # summary the number of qualified pvalue 
    All_qualified_pvalue <- unlist(lapply(p_value_list, function(x) {length(which(x < 0.05))}))
    True_Positive_pvalue <- unlist(lapply(p_value_list, function(x) {length( which((which(x < 0.05) <= number_features_changed)))}))
    False_Positive_pvalue <- All_qualified_pvalue - True_Positive_pvalue 
    
    
    # we put the significant changed items at the top, therefore there index should be equal or less than number_features_changed
    
    All_qualified_qvalue <- unlist(lapply(q_value_list, function(x) {length(which(x < 0.05))}))
    True_Positive_qvalue <- unlist(lapply(q_value_list, function(x) {length( which((which(x < 0.05) <= number_features_changed)))}))
    False_Positive_qvalue <- All_qualified_qvalue - True_Positive_qvalue 
    
    # change to percentatge
    summary <- 100*data.frame(All_qualified_pvalue,All_qualified_qvalue,True_Positive_pvalue,True_Positive_qvalue,False_Positive_pvalue,False_Positive_qvalue)/number_features_changed
    
    row.names(summary) <- paste0( "perm", 1:perm)
    
    
    # store in the list and return
    summary_list[[paste0("rep", n_rep)]] <- summary
    
  }
  

  
  data_plot_allperm <- summary_list %>% 
    as.data.frame() %>% 
    rownames_to_column() %>% 
    melt() %>% 
    separate(variable, c("rep", "Measures"),sep = "\\.") %>% 
    mutate(rep = as.factor(rep), rep = factor(rep, levels =paste0("rep", 2:number_of_replicate), labels = 2:number_of_replicate)) %>% 
    group_by(rep, Measures) %>% 
    summarise(mean = mean(value), sd = sd(value)) 
  
  
  
  p<- ggplot(data_plot_allperm, aes(x=rep, y=mean, group=Measures, color = Measures)) + 
    geom_line() +
    geom_point(stat="identity")+
    geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=.2, position=position_dodge(0.05))+
    ggtitle("How Many Replicates do We Need?") + ylab("Percentage of Significantly changed features identified")+ xlab("Number of Replicates Performed")+
    annotate("text", x = c(13,13, 13, 13, 13), y = c(80, 70, 60, 50, 40), label = c(paste0("Mean difference: ",mean_difference), 
                                                                                   paste0("SD: ", sd),
                                                                                   paste0("Number of significantly changed features: ",number_features_changed),
                                                                                   paste0("Number of not changed features: ", number_features_nochange),
                                                                                   paste0("Number of permutations: ", number_of_permutation)
    ))
  
  
  return(ggplotly(p))
  
}

3 Simluation 1: 200 out of 1000

Now we fix the sd=1, background sample size without significant change as 800, sample size with significantly change as 200, sliding mean difference from 1 to 2.5

3.1 difference of mean: 1

simulation_function(mean_difference = 1, number_features_nochange = 800)

3.2 difference of mean: 1.5

simulation_function(mean_difference = 1.5, number_features_nochange = 800)

3.3 difference of mean: 2

simulation_function(mean_difference = 2, number_features_nochange = 800)

3.4 difference of mean: 2.5

simulation_function(mean_difference = 2.5, number_features_nochange = 800)

4 Simluation 2: 200 out of 10000

Now we fix the sd=1, background sample size without significant change as 9800, sample size with significantly change as 200, sliding mean difference from 1 to 2.5

4.1 difference of mean: 1

simulation_function(mean_difference = 1, number_features_nochange = 9800)

4.2 difference of mean: 1.5

simulation_function(mean_difference = 1.5, number_features_nochange = 9800)

4.3 difference of mean: 2

simulation_function(mean_difference = 2, number_features_nochange = 9800)

4.4 difference of mean: 2.5

simulation_function(mean_difference = 2.5, number_features_nochange = 9800)

5 Conclusion

The conclusion is pretty depressing
Three replicates in any casese are not good enough.
For a small sample size with 200 out of 1000, with a difference of 2.5 fold SD, up to 4, 5, 6 replicates are fairly good.
If the difference is big enough(200 out of 10000), with the difference of 2.5 fold of SD, more than 10 replicates are needed to pickup all the significantly changed protein/peptides
The conclusion can also be explained in a reversed angle, that replicates can only pickup some very significantly changed ones.
When the background is large (compared to the number of significantly changed ones), using q-value is mandatory to enruse low FPR.

6 sessionInfo

sessionInfo()

## R version 4.0.4 (2021-02-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Canada.1252  LC_CTYPE=English_Canada.1252   
## [3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C                   
## [5] LC_TIME=English_Canada.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] plotly_4.9.3    forcats_0.5.1   stringr_1.4.0   dplyr_1.0.5    
##  [5] purrr_0.3.4     readr_1.4.0     tidyr_1.1.3     tibble_3.1.0   
##  [9] tidyverse_1.3.0 reshape2_1.4.4  ggplot2_3.3.3  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6        lubridate_1.7.10  assertthat_0.2.1  digest_0.6.27    
##  [5] utf8_1.1.4        R6_2.5.0          cellranger_1.1.0  plyr_1.8.6       
##  [9] backports_1.2.1   reprex_1.0.0      evaluate_0.14     httr_1.4.2       
## [13] pillar_1.5.1      rlang_0.4.10      lazyeval_0.2.2    readxl_1.3.1     
## [17] rstudioapi_0.13   data.table_1.14.0 jquerylib_0.1.3   rmarkdown_2.7    
## [21] labeling_0.4.2    htmlwidgets_1.5.3 munsell_0.5.0     broom_0.7.5      
## [25] compiler_4.0.4    modelr_0.1.8      xfun_0.22         pkgconfig_2.0.3  
## [29] htmltools_0.5.1.1 tidyselect_1.1.0  fansi_0.4.2       viridisLite_0.3.0
## [33] crayon_1.4.1      dbplyr_2.1.0      withr_2.4.1       grid_4.0.4       
## [37] jsonlite_1.7.2    gtable_0.3.0      lifecycle_1.0.0   DBI_1.1.1        
## [41] magrittr_2.0.1    scales_1.1.1      cli_2.3.1         stringi_1.5.3    
## [45] farver_2.1.0      fs_1.5.0          xml2_1.3.2        bslib_0.2.4      
## [49] ellipsis_0.3.1    generics_0.1.0    vctrs_0.3.6       tools_4.0.4      
## [53] glue_1.4.2        crosstalk_1.1.1   hms_1.0.0         yaml_2.2.1       
## [57] colorspace_2.0-0  rvest_1.0.0       knitr_1.33        haven_2.3.1      
## [61] sass_0.3.1

How many replicates needed

Suggestions to ningzhibin@gmail.com

Report generated @2021-07-15 22:26:45