knitr::opts_chunk$set(echo = TRUE)

library(caret)
library(R.matlab)
library(tidyverse)
library(pROC)
library(broom)
library(kableExtra)
library(rlang)
library(testthat)
library(doParallel)

source("train_pls_formulae.R")
n_cores <- detectCores()
message(sprintf("Using %d cores", n_cores))
## Using 20 cores

Note: the output HTML & the training RDS data are saved in PLS.

Parameters
name value
Cohort name UKBB
Risk factor Hypertension
Maximum number of PLS components 30
Pct PCA variance explained (%) 99.9
Data file MESA_UKBB_FinalData.csv

Get the data

  1. Read the variable and select only cases within the cohort UKBB
  2. Select ID, Age, Gender & the risk factor Hypertension.
  3. Recode the risk factor values where TRUE -> RISK and FALSE -> NO_RISK
dt.ori <- read_csv(datafile, show_col_types = FALSE) %>% 
  filter(Cohort == params$cohort) %>%
  select(c(ID, Age, Sex, NoRisk, sym(params$rf_name))) %>%
  mutate(
    !!sym(params$rf_name) := recode_factor(as.factor(!!sym(params$rf_name)), "FALSE" = "NO_RISK", "TRUE" = "RISK"),
    Sex = as.factor(Sex)
  )

message(sprintf("Original %s: %d rows", params$cohort, nrow(dt.ori)))
## Original UKBB: 2960 rows
  1. Define the NO RISK sub-cohort, which is all cases without any risk factor at all
dt.norf <- filter(dt.ori, NoRisk)

message(sprintf("No risk sub-cohort %s: %d rows", params$cohort, nrow(dt.norf)))
## No risk sub-cohort UKBB: 645 rows
  1. Filter cases having Hypertension
dt.rf <- filter(dt.ori,  !!sym(params$rf_name) == "RISK")

message(sprintf("Risk sub-cohort %s: %d rows", params$cohort, nrow(dt.rf)))
## Risk sub-cohort UKBB: 1634 rows

Sanity check: the intersection between NO_RISK and RISK id’s should be zero

test_that("Unique IDs", {
  expect_equal(length(intersect(dt.rf$ID, dt.norf$ID)), 0)
})
## Test passed 🥳
  1. Final data
dt <- bind_rows(dt.norf, dt.rf) %>% 
  select(-c(NoRisk))

message(sprintf("Training data: %d rows", nrow(dt)))
## Training data: 2279 rows
dt %>% select(c(Sex, sym(params$rf_name))) %>% table() %>% 
  kbl(caption = "Risk factor distributions") %>% kable_styling("hover", full_width = F, position="left")
Risk factor distributions
NO_RISK RISK
female 395 738
male 250 896

just for fixing plot title

cohort_title <- if_else(params$cohort == "UKBB", "UK Biobank", params$cohort)  

Prepare the PC scores

  1. Determine how many PCA components to use
model <- list()

expl_vars <- readMat(paste("PCA", params$cohort, "PCA_explained.mat", sep = "/"))$explained
model$npcs <- length(expl_vars[cumsum(expl_vars) < params$pca_var])
message(sprintf("Number of PC components with %.2f%% variance explained: %d", params$pca_var, model$npcs))
## Number of PC components with 99.90% variance explained: 210
rm(expl_vars)
  1. Read PCA components
  2. Combine with Age, Sex & the risk factor using the same ID
# this is a combined X (Age + Sex + PCScores) & Y (Risk Factor)
model$dt.train <- inner_join(
  dt %>% 
    select(c(ID, Age, Sex, sym(params$rf_name))) %>% 
    mutate(
      Sex = as.numeric(Sex)
    ),
  cbind(
    read.csv(paste("PCA", sprintf("%s_ids.csv", params$cohort), sep="/")),
    readMat(paste("PCA", params$cohort, "PCA_score.mat", sep="/"))$score[, 1:model$npcs]
  ),
  by = c("ID" = "ID"))

# check the cohort
message(sprintf("Number of cases = %d, columns = %d", nrow(model$dt.train), ncol(model$dt.train)))
## Number of cases = 2279, columns = 214
  1. Sanity checks.

This should only include that specific cohort

test_that("Check cohort", {
  expect_equal(substr(params$cohort, 1, 3), (model$dt.train %>% transmute(cohort = substr(ID, 1, 3)) %>% distinct())$cohort)
})
## Test passed 🥳

No NA rows

test_that("No NA rows", {
  expect_equal(0, sum(is.na(model$dt.train)))
})
## Test passed 🎉

Find optimal number of PLS components

We’re going to use 5-fold cross validation to determine the optimal parameter for PLS, which is the ncomp.

(m_kcv <- train_pls(as.formula(sprintf("%s ~ Age + Sex + .", params$rf_name)), dt = model$dt.train %>% select(-c(ID)), n_comps=params$max_ncomps))
## + Fold1: ncomp=30 
## - Fold1: ncomp=30 
## + Fold2: ncomp=30 
## - Fold2: ncomp=30 
## + Fold3: ncomp=30 
## - Fold3: ncomp=30 
## + Fold4: ncomp=30 
## - Fold4: ncomp=30 
## + Fold5: ncomp=30 
## - Fold5: ncomp=30 
## Aggregating results
## Selecting tuning parameters
## Fitting ncomp = 16 on full training set
## Partial Least Squares 
## 
## 2279 samples
##  212 predictor
##    2 classes: 'NO_RISK', 'RISK' 
## 
## Pre-processing: centered (212) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1824, 1823, 1823, 1823, 1823 
## Resampling results across tuning parameters:
## 
##   ncomp  ROC        Sens       Spec     
##    1     0.6278493  0.0000000  0.9993884
##    2     0.7108966  0.1565891  0.9650982
##    3     0.7393386  0.2232558  0.9583629
##    4     0.7535149  0.2465116  0.9491886
##    5     0.7545107  0.2852713  0.9442900
##    6     0.7588778  0.3038760  0.9375603
##    7     0.7597515  0.3162791  0.9296017
##    8     0.7610148  0.3193798  0.9357235
##    9     0.7613941  0.3333333  0.9247125
##   10     0.7626121  0.3457364  0.9210371
##   11     0.7630882  0.3534884  0.9143131
##   12     0.7631279  0.3658915  0.9118628
##   13     0.7627485  0.3643411  0.9075683
##   14     0.7642430  0.3844961  0.9051275
##   15     0.7665614  0.3860465  0.9002270
##   16     0.7673511  0.3829457  0.8989944
##   17     0.7669539  0.3937984  0.9014446
##   18     0.7666258  0.3937984  0.8989944
##   19     0.7642834  0.3953488  0.8953228
##   20     0.7626260  0.4000000  0.8989944
##   21     0.7601222  0.3953488  0.8916474
##   22     0.7573785  0.4000000  0.8892028
##   23     0.7560762  0.3953488  0.8928744
##   24     0.7546065  0.3922481  0.8928782
##   25     0.7536798  0.3937984  0.8941033
##   26     0.7524160  0.3937984  0.8947093
##   27     0.7511495  0.3953488  0.8873642
##   28     0.7511527  0.4000000  0.8843080
##   29     0.7509824  0.4000000  0.8861466
##   30     0.7500918  0.4062016  0.8836945
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 16.

Get the number of component

(model$ncomp <- m_kcv$bestTune$ncomp)
## [1] 16

Plot for analysis

plot(m_kcv, main=sprintf("PLS components (%s - %s)", params$rf_name, params$cohort))

Train PLS

After we get the optimal parameter, we train the PLS using leave-one-out cross validation. Since this will take time, we need multiple cores to compute.

Create train control with LOOCV

tc_loo <- trainControl(method="LOOCV", savePredictions = "all", classProbs = TRUE, verboseIter=FALSE,
                       summaryFunction = twoClassSummary, allowParallel = TRUE)
# parallel cores
if( n_cores > 1 ) {
  cl <- makeCluster(n_cores - 1, outfile = "")
  registerDoParallel(cl)
  getDoParWorkers()
}
## [1] 19
# train (take a while)
model$pls <- train(
  form = sprintf("%s ~ Age + Sex + .", params$rf_name) %>% as.formula(),
  data = model$dt.train %>% select(-ID),
  method = "pls", 
  metric = "ROC",
  preProc = c("center"),
  tuneGrid = data.frame(ncomp=model$ncomp),
  probMethod = "softmax",
  trControl = tc_loo
)

# unregister cores
if( n_cores > 1 ) {
  stopCluster(cl)
  registerDoSEQ()
}
model$pls
## Partial Least Squares 
## 
## 2279 samples
##  212 predictor
##    2 classes: 'NO_RISK', 'RISK' 
## 
## Pre-processing: centered (212) 
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 2278, 2278, 2278, 2278, 2278, 2278, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7714962  0.3844961  0.9039168
## 
## Tuning parameter 'ncomp' was held constant at a value of 16

Or use getTrainPerf to get the results

getTrainPerf(model$pls)
##    TrainROC TrainSens TrainSpec method
## 1 0.7714962 0.3844961 0.9039168    pls

The prediction is in model$pls$pred data frame. Let’s plot the training ROC

Check the level order, because RISK should be the first, since that’s the prediction target.

levels(model$pls$pred$obs)
## [1] "NO_RISK" "RISK"

Calculate the ROC. Note we need to reverse the category level.

(m_roc <- roc(model$pls$pred$obs, model$pls$pred$RISK, levels = rev(levels(model$pls$pred$obs))))
## Setting direction: controls > cases
## 
## Call:
## roc.default(response = model$pls$pred$obs, predictor = model$pls$pred$RISK,     levels = rev(levels(model$pls$pred$obs)))
## 
## Data: model$pls$pred$RISK in 1634 controls (model$pls$pred$obs RISK) > 645 cases (model$pls$pred$obs NO_RISK).
## Area under the curve: 0.7715
plot(m_roc, legacy_axes=TRUE, main = sprintf("Training ROC: %s (%s)", params$rf_name, params$cohort))

Show the first 10 coefficients

coef(model$pls$finalModel)[1:10,,]
##           NO_RISK          RISK
## Age -0.0140922681  0.0140922681
## Sex  0.0004229782 -0.0004229782
## `1`  0.0004834256 -0.0004834256
## `2`  0.0001143037 -0.0001143037
## `3` -0.0003947266  0.0003947266
## `4` -0.0005029031  0.0005029031
## `5`  0.0007313294 -0.0007313294
## `6`  0.0006590670 -0.0006590670
## `7`  0.0004506432 -0.0004506432
## `8`  0.0001354043 -0.0001354043

Save the results

Add some other information too in the model

model$cohort <- params$cohort
model$risk_factor <- params$rf_name
model$pca_var <- params$pca_var
model_filename <- paste(params$output_folder, sprintf("PLS_%s_%s.rds", params$cohort, params$rf_name), sep="/")
write_rds(model, file=model_filename, compress = "gz")
message(sprintf("Saved model to %s", model_filename))
## Saved model to PLS/PLS_UKBB_Hypertension.rds