UKBB PLSR Validation on Diabetes

Prepare Data
Predictions
Confusion matrices
- Internal
- External
Saving
Plot
Save outputs

knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(ggplot2)
library(caret)
library(pROC)
library(R.matlab)
library(kableExtra)

int.config <- list()
ext.config <- list()
if( params$cohort == "MESA" ) {
  int.config$color <- "dodgerblue3"
  ext.config$color <- "firebrick3"
} else {
  int.config$color <- "firebrick3"
  ext.config$color <- "dodgerblue3"
}

Perform internal & external validations for a shape-based atlas PLSR study.

Prepare Data

dt <- readRDS(file.path(params$input_folder, sprintf("PLS_%s_%s.rds", params$cohort, params$rf_name)))

if( params$cohort == "MESA" ) {
  ext_cohort <- "UKBB"
} else {
  ext_cohort <- "MESA"
}
message(sprintf("The external cohort: %s", ext_cohort))

## The external cohort: MESA

Prepare the external cohort input variables. We will read it from the same training data result of the external cohort, and select only the independent variables without the principal scores, because we want to use the projected principal scores from the external cohort into the model atlas.

dt.test <- readRDS(file.path(
  params$input_folder, sprintf("PLS_%s_%s.rds", ext_cohort, params$rf_name)
))$dt.train %>%
  select(c(ID, Age, Sex, sym(params$rf_name))) %>% 
  inner_join(
    cbind(
      read_csv(file.path("PCA", sprintf("%s_ids.csv", ext_cohort)), show_col_types = FALSE),
      readMat(file.path("PCA", params$cohort, sprintf("%s_score.mat", ext_cohort)))$proj[, 1:dt$npcs]
    ),
    by = c("ID" = "ID")
  )

message(sprintf("The external cohort data: %d x %d", nrow(dt.test), ncol(dt.test)))

## The external cohort data: 770 x 214

Frequency table

# print frequency table
dt$dt.train %>% select(sym(params$rf_name)) %>% table() %>%
  kbl(caption = "Internal test data:") %>% kable_styling("hover", full_width = F, position="left")

Internal test data:
Diabetes	Freq
NO_RISK	645
RISK	159

# print frequency table
dt.test %>% select(sym(params$rf_name)) %>% table() %>%
  kbl(caption = "External test data:") %>% kable_styling("hover", full_width = F, position="left")

External test data:
Diabetes	Freq
NO_RISK	397
RISK	373

Predictions

The prediction from internal cohort has been stored in dt$pls$pred.

int.pred <- dt$pls$pred %>% transmute(
  pred = RISK,
  obs = obs
)

Internal ROC

(int.roc <- roc(dt$pls$pred$obs, dt$pls$pred$RISK, levels = rev(levels(dt$pls$pred$obs))))

## Setting direction: controls > cases

## 
## Call:
## roc.default(response = dt$pls$pred$obs, predictor = dt$pls$pred$RISK,     levels = rev(levels(dt$pls$pred$obs)))
## 
## Data: dt$pls$pred$RISK in 159 controls (dt$pls$pred$obs RISK) > 645 cases (dt$pls$pred$obs NO_RISK).
## Area under the curve: 0.7909

(int.ci <- ci.auc(int.roc, method = "bootstrap"))

## 95% CI: 0.7479-0.8307 (2000 stratified bootstrap replicates)

We need to calculate the external predictions.

# let's retrain again
pls <- train(
  form = sprintf("%s ~ Age + Sex + .", params$rf_name) %>% as.formula(),
  data = dt$dt.train %>% select(-ID),
  method = "pls", 
  metric = "ROC",
  preProc = c("center"),
  tuneGrid = data.frame(ncomp=dt$ncomp),
  probMethod = "softmax",
  trControl = trainControl(
    method="none", savePredictions = "final", classProbs = TRUE, verboseIter=FALSE,
    summaryFunction = twoClassSummary)
)

ext.pred <- data.frame(
  pred = predict(pls, newdata = dt.test, type="prob") %>% pull(RISK),
  obs = pull(dt.test, !!sym(params$rf_name))
)

External ROC

(ext.roc <- roc(ext.pred$obs, ext.pred$pred, levels = rev(levels(as.factor(ext.pred$obs)))))

## Setting direction: controls > cases

## 
## Call:
## roc.default(response = ext.pred$obs, predictor = ext.pred$pred,     levels = rev(levels(as.factor(ext.pred$obs))))
## 
## Data: ext.pred$pred in 373 controls (ext.pred$obs RISK) > 397 cases (ext.pred$obs NO_RISK).
## Area under the curve: 0.772

(ext.ci <- ci.auc(ext.roc, method="bootstrap"))

## 95% CI: 0.7387-0.8048 (2000 stratified bootstrap replicates)

Compare the two ROC’s

(test.roc <- roc.test(int.roc, ext.roc, paired=FALSE))

## 
##  DeLong's test for two ROC curves
## 
## data:  int.roc and ext.roc
## D = 0.71104, df = 1517.2, p-value = 0.4772
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.7908927   0.7719694

Confusion matrices

Internal

Use the best index from ‘closest.topleft’ to balance between sensitivity & specificity.

(int.cutoff <- coords(int.roc, x="best", best.method="c"))

int.pred <- int.pred %>%
  mutate(class = ifelse(pred > int.cutoff$threshold, "RISK", "NO_RISK") %>% as.factor())

(int.cm <- confusionMatrix(data = int.pred$class, reference = int.pred$obs, positive = "RISK"))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO_RISK RISK
##    NO_RISK     492   46
##    RISK        153  113
##                                          
##                Accuracy : 0.7525         
##                  95% CI : (0.7211, 0.782)
##     No Information Rate : 0.8022         
##     P-Value [Acc > NIR] : 0.9998         
##                                          
##                   Kappa : 0.3777         
##                                          
##  Mcnemar's Test P-Value : 5.729e-14      
##                                          
##             Sensitivity : 0.7107         
##             Specificity : 0.7628         
##          Pos Pred Value : 0.4248         
##          Neg Pred Value : 0.9145         
##              Prevalence : 0.1978         
##          Detection Rate : 0.1405         
##    Detection Prevalence : 0.3308         
##       Balanced Accuracy : 0.7367         
##                                          
##        'Positive' Class : RISK           
##

External

(ext.cutoff <- coords(ext.roc, x="best", best.method="c"))

ext.pred <- ext.pred %>%
  mutate(class = ifelse(pred > ext.cutoff$threshold, "RISK", "NO_RISK") %>% as.factor())

(ext.cm <- confusionMatrix(data = ext.pred$class, reference = ext.pred$obs, positive = "RISK"))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction NO_RISK RISK
##    NO_RISK     276  102
##    RISK        121  271
##                                           
##                Accuracy : 0.7104          
##                  95% CI : (0.6769, 0.7422)
##     No Information Rate : 0.5156          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4211          
##                                           
##  Mcnemar's Test P-Value : 0.2281          
##                                           
##             Sensitivity : 0.7265          
##             Specificity : 0.6952          
##          Pos Pred Value : 0.6913          
##          Neg Pred Value : 0.7302          
##              Prevalence : 0.4844          
##          Detection Rate : 0.3519          
##    Detection Prevalence : 0.5091          
##       Balanced Accuracy : 0.7109          
##                                           
##        'Positive' Class : RISK            
##

Saving

Save the analysis for collation use

This is ROC-only (for backward compatibility)

roc_filename <- file.path(params$output_folder, sprintf("ROC_%s_%s.rds", params$cohort, params$rf_name))
write_rds(list(
  cohort = params$cohort,
  risk_factor = params$rf_name,
  internal = int.roc,
  external = ext.roc
), file=roc_filename, compress = "gz")
message(sprintf("Saved ROC to %s", roc_filename))

## Saved ROC to Validation/ROC_UKBB_Diabetes.rds

This is all with the new predictions

pred_filename <- file.path(params$output_folder, sprintf("Prediction_%s_%s.rds", params$cohort, params$rf_name))
write_rds(list(
  cohort = params$cohort,
  risk_factor = params$rf_name,
  internal = list(roc=int.roc, ci=int.ci, pred=int.pred, cutoff=int.cutoff, cm=int.cm),
  external = list(roc=ext.roc, ci=ext.ci, pred=ext.pred, cutoff=ext.cutoff, cm=ext.cm)
), file=pred_filename, compress = "gz")
message(sprintf("Saved predicions to %s", pred_filename))

## Saved predicions to Validation/Prediction_UKBB_Diabetes.rds

Plot

# change the name for better title of the plot on the paper
if( params$cohort == "MESA" ) {
  cohort_name <- "MESA"
} else {
  cohort_name <- "UK Biobank"
}

plot.roc(int.roc, legacy.axes = FALSE, lty = 1,
         main = sprintf("%s with %s Atlas", params$rf_name, cohort_name))
plot.roc(ext.roc, add=TRUE, lty = 2)

# print p-value
text(1, 1, labels=paste("p-value =", format.pval(test.roc$p.value)), adj=c(0, .5))

legend("bottomright", 
       legend=c(
         sprintf("Internal cohort (AUC: %.2f, 95%% CI: [%.2f, %.2f])", 
                 int.roc$auc, int.ci[1], int.ci[3]), 
         sprintf("External cohort (AUC: %.2f, 95%% CI: [%.2f, %.2f])", 
                 ext.roc$auc, ext.ci[1], ext.ci[3])), 
       lwd=2, lty=c(1,2))

Save outputs

(int.fname <- sprintf("%s/%s_%s_Shapes_on_%s_Scores.csv", 
                      params$output_folder, params$rf_name, params$cohort, params$cohort))

## [1] "Validation/Diabetes_UKBB_Shapes_on_UKBB_Scores.csv"

write_csv(int.pred %>% mutate(id = dt$dt.train$ID[dt$pls$pred$rowIndex]), int.fname)

(ext.fname <- sprintf("%s/%s_%s_Shapes_on_%s_Scores.csv", 
                      params$output_folder, params$rf_name, ext_cohort, params$cohort))

## [1] "Validation/Diabetes_MESA_Shapes_on_UKBB_Scores.csv"

write_csv(ext.pred %>% mutate(id = dt.test$ID), ext.fname)