95% interval mean residual plot added to get_residuals

gladstone-institutes · Nov 15, 2022 · 11fa2a2 · 11fa2a2
1 parent 8713d69
commit 11fa2a2
Show file tree

Hide file tree

Showing 8 changed files with 228 additions and 14 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: RMeDPower
 Type: Package
 Title: Power Calculation for biological experiments
-Version: 0.1.7
+Version: 0.1.8
 Author: Min-Gyoung Shin 
 Maintainer: The package maintainer <yourself@somewhere.net>
 Description: It uses simulation to perform power analysis. It is designed to explore the power of biological experiments and to suggest an optimal number of experimental variables with reasonable power. The backbone of the function is based on simr package, which fits a fixed effect or mixed effect model based on the observed data and simulates response variables. Users can test the power of different combinations of experimental variables and parameters.
@@ -17,7 +17,8 @@ Imports:
     multtest,
     simr,
     lmerTest,
-    lme4
+    lme4,
+    ggplot2
 Suggests: 
     knitr,
     rmarkdown
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,3 +12,4 @@ import(lmerTest)
 import(multtest)
 import(readxl)
 import(simr)
+import(ggplot2)
diff --git a/R/calculate_lmer_estimates.R b/R/calculate_lmer_estimates.R
@@ -19,12 +19,19 @@
 #' @param repeatable_columns Name of experimental variables that may appear repeatedly with the same ID. For example, cell_line C1 may appear in multiple experiments, but plate P1 cannot appear in more than one experiment
 #' @param response_is_categorical Default: the observed variable is continuous Categorical response variable will be implemented in the future. TRUE: Categorical , FALSE: Continuous (default).
 #' @param family The type of distribution family to specify when the response is categorical. If family is "binary" then binary(link="log") is used, if family is "poisson" then poisson(link="logit") is used, if family is "poisson_log" then poisson(link=") log") is used.
-#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not ollowed only in condition, experimental, and response columns
+#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not allowed only in condition, experimental, and response columns. Selecting "complete" removes an entire row when there is one or more missing values, which may affect the distribution of other features.
 #'
 #' @return A linear mixed model result
 #'
 #' @export
-#' @examples
+#' @examples result=calculate_lmer_estimates(data=RMeDPower_data1,
+#' @examples condition_column="classification",
+#' @examples experimental_columns=c("experiment", "line"),
+#' @examples response_column="cell_size1",
+#' @examples condition_is_categorical=TRUE,
+#' @examples repeatable_columns = "line",
+#' @examples family=NULL,
+#' @examples response_is_categorical=FALSE)
 
 
 

diff --git a/R/calculate_power.R b/R/calculate_power.R
@@ -19,13 +19,13 @@
 #' @param condition_is_categorical Specify whether the condition variable is categorical. TRUE: Categorical, FALSE: Continuous.
 #' @param repeatable_columns Name of experimental variables that may appear repeatedly with the same ID. For example, cell_line C1 may appear in multiple experiments, but plate P1 cannot appear in more than one experiment
 #' @param response_is_categorical Default: the observed variable is continuous TRUE: Categorical , FALSE: Continuous (default).
-#' @param nsim The number of simulations to run. Default=1000
+#' @param nsimn The number of simulations to run. Default=1000
 #' @param family The type of distribution family to specify when the response is categorical. If family is "binary" then binary(link="log") is used, if family is "poisson" then poisson(link="logit") is used, if family is "poisson_log" then poisson(link=") log") is used.
 #' @param target_columns Name of the experimental parameters to use for the power calculation.
 #' @param levels 1: Amplify the number of corresponding target parameter. 0: Amplify the number of samples from the corresponding target parameter, ex) If target_columns = c("experiment","cell_line") and if you want to expand the number of experiment and sample more cells from each cell line, levels = c(1,0).
 #' @param max_size Maximum levels or sample sizes to test. Default: the current level or the current sample size x 5. ex) If max_levels = c(10,5), it will test upto 10 experiments and 5 cell lines.
 #' @param breaks Levels /sample sizes of the variable to be specified along the power curve.. Default: max(1, round( the number of current levels / 5 ))
-#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not ollowed only in condition, experimental, response, and target columns
+#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not allowed only in condition, experimental, response, and target columns. Selecting "complete" removes an entire row when there is one or more missing values, which may affect the distribution of other features.
 #' @param output Output file name
 ##### If variance estimates should be estimated from data
 #' @param  effect_size If you know the effect size of your condition variable, provided it. If the effect size is not provided, it will be estimated from your data
@@ -34,7 +34,16 @@
 #' @return A power curve image or a power calculation result printed in a text file
 #'
 #' @export
-#' @examples
+#' @examples result=calculate_power(data=RMeDPower_data1,
+#' @examples condition_column="classification",
+#' @examples experimental_columns=c("experiment", "line"),
+#' @examples response_column="cell_size1",
+#' @examples target_columns="experiment",
+#' @examples power_curve=1,
+#' @examples condition_is_categorical=TRUE,
+#' @examples repeatable_columns = "line",
+#' @examples response_is_categorical=FALSE,
+#' @examples levels=1)
 
 
 calculate_power <- function(data, condition_column, experimental_columns, response_column, target_columns, power_curve, condition_is_categorical,

diff --git a/R/check_normality.R b/R/check_normality.R
@@ -11,13 +11,15 @@
 #' @param repeatable_columns Name of experimental variables that may appear repeatedly with the same ID. For example, cell_line C1 may appear in multiple experiments, but plate P1 cannot appear in more than one experiment
 #' @param response_is_categorical Default: the observed variable is continuous Categorical response variable will be implemented in the future. TRUE: Categorical , FALSE: Continuous (default).
 #' @param image_title title of the qq plot
-#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not ollowed only in condition, experimental, and response columns
+#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not allowed only in condition, experimental, and response columns. Selecting "complete" removes an entire row when there is one or more missing values, which may affect the distribution of other features.
 #'
 #' @return A quantile-quantile (qq) plot of residual values in a mixed-effects model
 #'
 #' @export
 #'
-#' @examples check_normality(data,"classif",c("experiment","line"),"feature1","TRUE")
+#' @examples check_normality(data=RMeDPower_data1, condition_column="classification", experimental_columns=c("experiment","line"),
+#' @examples  response_column="cell_size1", condition_is_categorical=TRUE, repeatable_columns="line")
+
 
 
 

diff --git a/R/get_model_and_data.R b/R/get_model_and_data.R
@@ -0,0 +1,155 @@
+#' @title calculate_lmer_estimates
+#'
+#'
+#' @description This function performs a linear mixed model analysis using lmer.
+#'
+#' Note: The current version does not accept categorical response variables, sample size parameters smaller than the observed samples size
+#'
+#' @import multtest
+#' @import simr
+#' @import lme4
+#' @import lmerTest
+#' @import readxl
+#'
+#' @param data Input data
+#' @param condition_column Name of the condition variable (ex variable with values such as control/case). The input file has to have a corresponding column name
+#' @param experimental_columns Name of variables related to experimental design such as "experiment", "plate", and "cell_line". "experiment" should come always first
+#' @param response_column Name of the variable observed by performing the experiment. ex) intensity.
+#' @param condition_is_categorical Specify whether the condition variable is categorical. TRUE: Categorical, FALSE: Continuous.
+#' @param repeatable_columns Name of experimental variables that may appear repeatedly with the same ID. For example, cell_line C1 may appear in multiple experiments, but plate P1 cannot appear in more than one experiment
+#' @param response_is_categorical Default: the observed variable is continuous Categorical response variable will be implemented in the future. TRUE: Categorical , FALSE: Continuous (default).
+#' @param family The type of distribution family to specify when the response is categorical. If family is "binary" then binary(link="log") is used, if family is "poisson" then poisson(link="logit") is used, if family is "poisson_log" then poisson(link=") log") is used.
+#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not allowed only in condition, experimental, and response columns. Selecting "complete" removes an entire row when there is one or more missing values, which may affect the distribution of other features.
+#'
+#' @return A list of the linear mixed model result, original data, experimental column names, and residual values
+#'
+#' @export
+#' @examples
+
+
+
+get_model_and_data <- function(data, condition_column, experimental_columns, response_column, condition_is_categorical,
+                                     repeatable_columns=NA, response_is_categorical=FALSE, family_p=NULL, na.action="complete"){
+
+
+
+  ######input error handler
+  if(!condition_column%in%colnames(data)){ print("condition_column should be one of the column names");return(NULL) }
+  if(sum(experimental_columns%in%colnames(data))!=length(experimental_columns) ){ print("experimental_columns must match column names");return(NULL) }
+  if(!response_column%in%colnames(data)){  print("response_column should be one of the column names");return(NULL) }
+  if(is.null(condition_is_categorical) | !condition_is_categorical%in%c(TRUE,FALSE)){ print("condition_is_categorical must be TRUE or FALSE");return(NULL) }
+  if(!is.na(repeatable_columns)){if(sum(repeatable_columns%in%colnames(data))!=length(repeatable_columns) ){ print("repeatable_columns must match column names");return(NULL) }}
+
+  if(response_is_categorical==TRUE){
+    family_p=switch(family_p, "poisson" = poisson(link="log"), "binomial" = binomial(link="logit"), "bionomial_log" = binomial(link="log") )
+  }
+
+
+  if(na.action=="complete"){
+
+    notNAindex=which( rowSums(is.na(data)) == 0 )
+
+  }else if(na.action=="unique"){
+
+    notNAindex=which( rowSums(is.na(data[,c(condition_column, experimental_columns, response_column)])) == 0 )
+
+  }
+
+
+
+  Data=data[notNAindex,]
+
+
+  cat("\n")
+  print("__________________________________________________________________Summary of data:")
+  print(summary(Data))
+  cat("\n")
+
+  colnames_original=colnames(Data)
+  experimental_columns_index=NULL
+  ####### assign categorical variables
+  if(condition_is_categorical==TRUE) Data[,condition_column]=as.factor(Data[,condition_column])
+
+  cat("\n")
+
+
+  nonrepeatable_columns=NULL
+
+  for(i in 1:length(experimental_columns)){
+    Data[,experimental_columns[i]]=as.factor(Data[,experimental_columns[i]])
+    experimental_columns_index=c(experimental_columns_index,which(colnames(Data)==experimental_columns[i]))
+    colnames(Data)[experimental_columns_index[i]]=paste("experimental_column",i,sep="")
+
+    if(i!=1&&!experimental_columns[i]%in%repeatable_columns){
+      nonrepeatable_columns=c(nonrepeatable_columns, paste("experimental_column",i,sep=""))
+    }
+
+
+    cat("\n")
+    print(paste("_________________________________",experimental_columns[i]," is assigned to experimental_column",i,sep=""))
+    cat("\n")
+  }
+
+
+
+  if(length(experimental_columns)>=2){
+      for(r in 2:length(experimental_columns)){
+      if(colnames(Data)[experimental_columns_index[r]]%in%nonrepeatable_columns){
+        Data[,experimental_columns_index[r]]=paste(Data[,experimental_columns_index[r-1]], Data[,experimental_columns_index[r]],sep="_")
+      }
+    }
+
+  }
+
+
+
+  colnames(Data)[which(colnames(Data)==condition_column)]="condition_column"
+  colnames(Data)[which(colnames(Data)==response_column)]="response_column"
+
+
+  ####### run the formula
+
+  if(response_is_categorical==FALSE){
+    if(length(experimental_columns)==1){
+      lmerFit <- lmerTest::lmer(response_column ~ condition_column + (1 | experimental_column1), data=Data)
+    }else if(length(experimental_columns)==2){
+      lmerFit <- lmerTest::lmer(response_column ~ condition_column + (1 | experimental_column1) + (1 | experimental_column2), data=Data)
+    }else if(length(experimental_columns)==3){
+      lmerFit <- lmerTest::lmer(response_column ~ condition_column + (1 | experimental_column1) + (1 | experimental_column2) + (1 | experimental_column3), data=Data)
+    }else if(length(experimental_columns)==4){
+      lmerFit <- lmerTest::lmer(response_column ~ condition_column + (1 | experimental_column1) + (1 | experimental_column2) + (1 | experimental_column3) + (1 | experimental_column4), data=Data)
+    }else if(length(experimental_columns)==5){
+      lmerFit <- lmerTest::lmer(response_column ~ condition_column + (1 | experimental_column1) + (1 | experimental_column2) + (1 | experimental_column3) + (1 | experimental_column4) + (1 | experimental_column5), data=Data)
+    }
+  }else{
+    if(length(experimental_columns)==1){
+      lmerFit <- lme4::glmer(response_column ~ condition_column + (1 | experimental_column1), data=Data, family=family_p)
+    }else if(length(experimental_columns)==2){
+      lmerFit <- lme4::glmer(response_column ~ condition_column + (1 | experimental_column1) + (1 | experimental_column2), data=Data, family=family_p)
+    }else if(length(experimental_columns)==3){
+      lmerFit <- lme4::glmer(response_column ~ condition_column + (1 | experimental_column1) + (1 | experimental_column2) + (1 | experimental_column3), data=Data, family=family_p)
+    }else if(length(experimental_columns)==4){
+      lmerFit <- lme4::glmer(response_column ~ condition_column + (1 | experimental_column1) + (1 | experimental_column2) + (1 | experimental_column3) + (1 | experimental_column4), data=Data, family=family_p)
+    }else if(length(experimental_columns)==5){
+      lmerFit <- lme4::glmer(response_column ~ condition_column + (1 | experimental_column1) + (1 | experimental_column2) + (1 | experimental_column3) + (1 | experimental_column4) + (1 | experimental_column5), data=Data, family=family_p)
+    }
+  }
+
+
+
+
+
+
+
+  slmerFit <- summary(lmerFit)
+
+  cat("\n")
+  print("__________________________________________________________________Model statistics:")
+  print(slmerFit)
+  cat("\n")
+
+
+
+  return(list(lmerFit, Data, colnames(Data)[experimental_columns_index], slmerFit$residuals))
+}
+
diff --git a/R/get_residuals.R b/R/get_residuals.R
@@ -16,11 +16,20 @@
 #' @param repeatable_columns Name of experimental variables that may appear repeatedly with the same ID. For example, cell_line C1 may appear in multiple experiments, but plate P1 cannot appear in more than one experiment
 #' @param response_is_categorical Default: the observed variable is continuous Categorical response variable will be implemented in the future. TRUE: Categorical , FALSE: Continuous (default).
 #' @param family The type of distribution family to specify when the response is categorical. If family is "binary" then binary(link="log") is used, if family is "poisson" then poisson(link="logit") is used, if family is "poisson_log" then poisson(link=") log") is used.
-#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not ollowed only in condition, experimental, and response columns
-#' @return A linear mixed model result
+#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not allowed only in condition, experimental, and response columns. Selecting "complete" removes an entire row when there is one or more missing values, which may affect the distribution of other features.
+#' @return The adjusted residual values for the experimental variables will be returned along with the original input data. A boxplot of residual values and a plot of 95% confidence interval mean residual values adjusted for the experimental variables will be returned.
 #'
 #' @export
 #'
+#' @examples result=get_residuals(data=RMeDPower_data1,
+#' @examples condition_column="classification",
+#' @examples experimental_columns=c("experiment", "line"),
+#' @examples response_column="cell_size1",
+#' @examples condition_is_categorical=TRUE,
+#' @examples repeatable_columns = "line",
+#' @examples response_is_categorical=FALSE)
+
+
 get_residuals<-function(data, condition_column, experimental_columns, response_column, condition_is_categorical,
                         repeatable_columns=NA, response_is_categorical=FALSE, family=NULL, na.action="complete"){
 
@@ -151,14 +160,39 @@ get_residuals<-function(data, condition_column, experimental_columns, response_c
 
   if(condition_is_categorical==TRUE){
 
+    #boxplot
     boxplot( as.formula( paste0( "residual ~  condition_column") ), data=Data , xlab=condition_column, ylab=paste0(response_column, " Residual Value"), main=NULL)
 
+    #95% CI plot
+    plotData = Data %>%group_by(as.factor(condition_column))%>%summarise(mean_residual = mean(residual), se = sd(residual)/sqrt(n()))
+
+    colnames(plotData)[1]="condition"
+
+    gp=ggplot2::ggplot(plotData,                ### The data frame to use.
+           aes(x = condition,
+               y = mean_residual)) +
+      geom_errorbar(aes(ymin = mean_residual - 1.96*se,
+                        ymax = mean_residual + 1.96*se),
+                    width = 0.05,
+                    size  = 0.5) +
+      geom_point(shape = 15,
+                 size  = 4) +
+      theme_bw() +
+      theme(axis.title   = element_text(face  = "bold")) + ylab("Mean Residual Value") + xlab("Classification")
+
+    print(gp)
 
   }else{
 
+
     plot(Data[,"condition_column"], Data[,"residual"], xlab=condition_column, ylab=paste0(response_column, " Residual Value"), main=NULL)
     abline(lm( as.formula( paste0( "residual ~  condition_column") ), data=Data),col='blue')
 
+
+
+
+
+
     }
 
   return(Data)

diff --git a/R/transform_data.R b/R/transform_data.R
@@ -11,15 +11,20 @@
 #' @param repeatable_columns Name of experimental variables that may appear repeatedly with the same ID. For example, cell_line C1 may appear in multiple experiments, but plate P1 cannot appear in more than one experiment
 #' @param response_is_categorical Default: the observed variable is continuous Categorical response variable will be implemented in the future. TRUE: Categorical , FALSE: Continuous (default).
 #' @param alpha numeric scalar between 0 and 1 indicating the Type I error associated with the test of outliers
-#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not ollowed only in condition, experimental, and response columns
+#' @param na.action "complete": missing data is not allowed in all columns (default), "unique": missing data is not allowed only in condition, experimental, and response columns. Selecting "complete" removes an entire row when there is one or more missing values, which may affect the distribution of other features.
 #'
 #' @return Quantile-quanitle (qq) plots of i) raw residual values ii) log-transformed residual values iii) raw residual values after removing outliers, and iv) log-transformed residual values
 #' @return This function returns a matrix with original columns and additional columns with transformed values. Users can select a feature column for power analysis or regression analysis based on QQ plot results.
 #'
 #' @export
 #'
-#' @examples transform_data(data,"classif",c("experiment","line"),"feature1","TRUE")
-
+#' @examples result=transform_data(data=RMeDPower_data1,
+#' @examples condition_column="classification",
+#' @examples experimental_columns=c("experiment", "line"),
+#' @examples response_column="cell_size1",
+#' @examples condition_is_categorical=TRUE,
+#' @examples repeatable_columns = "line",
+#' @examples response_is_categorical=FALSE)
 
 
 transform_data<-function(data, condition_column, experimental_columns, response_column, condition_is_categorical,