knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(ggplot2)
library(caret)
library(pROC)
library(R.matlab)
library(kableExtra)
int.config <- list()
ext.config <- list()
if( params$cohort == "MESA" ) {
  int.config$color <- "dodgerblue3"
  ext.config$color <- "firebrick3"
} else {
  int.config$color <- "firebrick3"
  ext.config$color <- "dodgerblue3"
}

Perform internal & external validations for a shape-based atlas PLSR study.

Prepare Data

dt <- readRDS(file.path(params$input_folder, sprintf("PLS_%s_%s.rds", params$cohort, params$rf_name)))
if( params$cohort == "MESA" ) {
  ext_cohort <- "UKBB"
} else {
  ext_cohort <- "MESA"
}
message(sprintf("The external cohort: %s", ext_cohort))
## The external cohort: UKBB

Prepare the external cohort input variables. We will read it from the same training data result of the external cohort, and select only the independent variables without the principal scores, because we want to use the projected principal scores from the external cohort into the model atlas.

dt.test <- readRDS(file.path(
  params$input_folder, sprintf("PLS_%s_%s.rds", ext_cohort, params$rf_name)
))$dt.train %>%
  select(c(ID, Age, Sex, sym(params$rf_name))) %>% 
  inner_join(
    cbind(
      read_csv(file.path("PCA", sprintf("%s_ids.csv", ext_cohort)), show_col_types = FALSE),
      readMat(file.path("PCA", params$cohort, sprintf("%s_score.mat", ext_cohort)))$proj[, 1:dt$npcs]
    ),
    by = c("ID" = "ID")
  )

message(sprintf("The external cohort data: %d x %d", nrow(dt.test), ncol(dt.test)))
## The external cohort data: 1827 x 180

Frequency table

# print frequency table
dt$dt.train %>% select(sym(params$rf_name)) %>% table() %>%
  kbl(caption = "Internal test data:") %>% kable_styling("hover", full_width = F, position="left")
Internal test data:
Smoking Freq
NO_RISK 397
RISK 891
# print frequency table
dt.test %>% select(sym(params$rf_name)) %>% table() %>%
  kbl(caption = "External test data:") %>% kable_styling("hover", full_width = F, position="left")
External test data:
Smoking Freq
NO_RISK 645
RISK 1182

Predictions

The prediction from internal cohort has been stored in dt$pls$pred.

int.pred <- dt$pls$pred %>% transmute(
  pred = RISK,
  obs = obs
)

Internal ROC

(int.roc <- roc(dt$pls$pred$obs, dt$pls$pred$RISK, levels = rev(levels(dt$pls$pred$obs))))
## Setting direction: controls > cases
## 
## Call:
## roc.default(response = dt$pls$pred$obs, predictor = dt$pls$pred$RISK,     levels = rev(levels(dt$pls$pred$obs)))
## 
## Data: dt$pls$pred$RISK in 891 controls (dt$pls$pred$obs RISK) > 397 cases (dt$pls$pred$obs NO_RISK).
## Area under the curve: 0.7061
(int.ci <- ci.auc(int.roc, method = "bootstrap"))
## 95% CI: 0.6755-0.7342 (2000 stratified bootstrap replicates)

We need to calculate the external predictions.

# let's retrain again
pls <- train(
  form = sprintf("%s ~ Age + Sex + .", params$rf_name) %>% as.formula(),
  data = dt$dt.train %>% select(-ID),
  method = "pls", 
  metric = "ROC",
  preProc = c("center"),
  tuneGrid = data.frame(ncomp=dt$ncomp),
  probMethod = "softmax",
  trControl = trainControl(
    method="none", savePredictions = "final", classProbs = TRUE, verboseIter=FALSE,
    summaryFunction = twoClassSummary)
)

ext.pred <- data.frame(
  pred = predict(pls, newdata = dt.test, type="prob") %>% pull(RISK),
  obs = pull(dt.test, !!sym(params$rf_name))
)

External ROC

(ext.roc <- roc(ext.pred$obs, ext.pred$pred, levels = rev(levels(as.factor(ext.pred$obs)))))
## Setting direction: controls > cases
## 
## Call:
## roc.default(response = ext.pred$obs, predictor = ext.pred$pred,     levels = rev(levels(as.factor(ext.pred$obs))))
## 
## Data: ext.pred$pred in 1182 controls (ext.pred$obs RISK) > 645 cases (ext.pred$obs NO_RISK).
## Area under the curve: 0.6378
(ext.ci <- ci.auc(ext.roc, method="bootstrap"))
## 95% CI: 0.6116-0.664 (2000 stratified bootstrap replicates)

Compare the two ROC’s

(test.roc <- roc.test(int.roc, ext.roc, paired=FALSE))
## 
##  DeLong's test for two ROC curves
## 
## data:  int.roc and ext.roc
## D = 3.3728, df = 2843.8, p-value = 0.0007539
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.7060643   0.6378009

Confusion matrices

Internal

Use the best index from ‘closest.topleft’ to balance between sensitivity & specificity.

(int.cutoff <- coords(int.roc, x="best", best.method="c"))
int.pred <- int.pred %>%
  mutate(class = ifelse(pred > int.cutoff$threshold, "RISK", "NO_RISK") %>% as.factor())

(int.cm <- confusionMatrix(data = int.pred$class, reference = int.pred$obs, positive = "RISK"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO_RISK RISK
##    NO_RISK     262  292
##    RISK        135  599
##                                          
##                Accuracy : 0.6685         
##                  95% CI : (0.642, 0.6942)
##     No Information Rate : 0.6918         
##     P-Value [Acc > NIR] : 0.9665         
##                                          
##                   Kappa : 0.2994         
##                                          
##  Mcnemar's Test P-Value : 4.374e-14      
##                                          
##             Sensitivity : 0.6723         
##             Specificity : 0.6599         
##          Pos Pred Value : 0.8161         
##          Neg Pred Value : 0.4729         
##              Prevalence : 0.6918         
##          Detection Rate : 0.4651         
##    Detection Prevalence : 0.5699         
##       Balanced Accuracy : 0.6661         
##                                          
##        'Positive' Class : RISK           
## 

External

(ext.cutoff <- coords(ext.roc, x="best", best.method="c"))
ext.pred <- ext.pred %>%
  mutate(class = ifelse(pred > ext.cutoff$threshold, "RISK", "NO_RISK") %>% as.factor())

(ext.cm <- confusionMatrix(data = ext.pred$class, reference = ext.pred$obs, positive = "RISK"))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO_RISK RISK
##    NO_RISK     385  453
##    RISK        260  729
##                                           
##                Accuracy : 0.6097          
##                  95% CI : (0.5869, 0.6322)
##     No Information Rate : 0.647           
##     P-Value [Acc > NIR] : 0.9996          
##                                           
##                   Kappa : 0.2001          
##                                           
##  Mcnemar's Test P-Value : 6.458e-13       
##                                           
##             Sensitivity : 0.6168          
##             Specificity : 0.5969          
##          Pos Pred Value : 0.7371          
##          Neg Pred Value : 0.4594          
##              Prevalence : 0.6470          
##          Detection Rate : 0.3990          
##    Detection Prevalence : 0.5413          
##       Balanced Accuracy : 0.6068          
##                                           
##        'Positive' Class : RISK            
## 

Saving

Save the analysis for collation use

This is ROC-only (for backward compatibility)

roc_filename <- file.path(params$output_folder, sprintf("ROC_%s_%s.rds", params$cohort, params$rf_name))
write_rds(list(
  cohort = params$cohort,
  risk_factor = params$rf_name,
  internal = int.roc,
  external = ext.roc
), file=roc_filename, compress = "gz")
message(sprintf("Saved ROC to %s", roc_filename))
## Saved ROC to Validation/ROC_MESA_Smoking.rds

This is all with the new predictions

pred_filename <- file.path(params$output_folder, sprintf("Prediction_%s_%s.rds", params$cohort, params$rf_name))
write_rds(list(
  cohort = params$cohort,
  risk_factor = params$rf_name,
  internal = list(roc=int.roc, ci=int.ci, pred=int.pred, cutoff=int.cutoff, cm=int.cm),
  external = list(roc=ext.roc, ci=ext.ci, pred=ext.pred, cutoff=ext.cutoff, cm=ext.cm)
), file=pred_filename, compress = "gz")
message(sprintf("Saved predicions to %s", pred_filename))
## Saved predicions to Validation/Prediction_MESA_Smoking.rds

Plot

# change the name for better title of the plot on the paper
if( params$cohort == "MESA" ) {
  cohort_name <- "MESA"
} else {
  cohort_name <- "UK Biobank"
}

plot.roc(int.roc, legacy.axes = FALSE, lty = 1,
         main = sprintf("%s with %s Atlas", params$rf_name, cohort_name))
plot.roc(ext.roc, add=TRUE, lty = 2)

# print p-value
text(1, 1, labels=paste("p-value =", format.pval(test.roc$p.value)), adj=c(0, .5))

legend("bottomright", 
       legend=c(
         sprintf("Internal cohort (AUC: %.2f, 95%% CI: [%.2f, %.2f])", 
                 int.roc$auc, int.ci[1], int.ci[3]), 
         sprintf("External cohort (AUC: %.2f, 95%% CI: [%.2f, %.2f])", 
                 ext.roc$auc, ext.ci[1], ext.ci[3])), 
       lwd=2, lty=c(1,2))

Save outputs

(int.fname <- sprintf("%s/%s_%s_Shapes_on_%s_Scores.csv", 
                      params$output_folder, params$rf_name, params$cohort, params$cohort))
## [1] "Validation/Smoking_MESA_Shapes_on_MESA_Scores.csv"
write_csv(int.pred %>% mutate(id = dt$dt.train$ID[dt$pls$pred$rowIndex]), int.fname)
(ext.fname <- sprintf("%s/%s_%s_Shapes_on_%s_Scores.csv", 
                      params$output_folder, params$rf_name, ext_cohort, params$cohort))
## [1] "Validation/Smoking_UKBB_Shapes_on_MESA_Scores.csv"
write_csv(ext.pred %>% mutate(id = dt.test$ID), ext.fname)