knitr::opts_chunk$set(echo = TRUE)
library(caret)
library(R.matlab)
library(tidyverse)
library(pROC)
library(broom)
library(kableExtra)
library(rlang)
library(testthat)
library(doParallel)
source("train_pls_formulae.R")
n_cores <- detectCores()
message(sprintf("Using %d cores", n_cores))
## Using 20 cores
Note: the output HTML & the training RDS data are saved in PLS.
name | value |
---|---|
Cohort name | UKBB |
Risk factor | Hypercholesterolemia |
Maximum number of PLS components | 30 |
Pct PCA variance explained (%) | 99.9 |
Data file | MESA_UKBB_FinalData.csv |
dt.ori <- read_csv(datafile, show_col_types = FALSE) %>%
filter(Cohort == params$cohort) %>%
select(c(ID, Age, Sex, NoRisk, sym(params$rf_name))) %>%
mutate(
!!sym(params$rf_name) := recode_factor(as.factor(!!sym(params$rf_name)), "FALSE" = "NO_RISK", "TRUE" = "RISK"),
Sex = as.factor(Sex)
)
message(sprintf("Original %s: %d rows", params$cohort, nrow(dt.ori)))
## Original UKBB: 2960 rows
dt.norf <- filter(dt.ori, NoRisk)
message(sprintf("No risk sub-cohort %s: %d rows", params$cohort, nrow(dt.norf)))
## No risk sub-cohort UKBB: 645 rows
dt.rf <- filter(dt.ori, !!sym(params$rf_name) == "RISK")
message(sprintf("Risk sub-cohort %s: %d rows", params$cohort, nrow(dt.rf)))
## Risk sub-cohort UKBB: 439 rows
Sanity check: the intersection between NO_RISK and RISK id’s should be zero
test_that("Unique IDs", {
expect_equal(length(intersect(dt.rf$ID, dt.norf$ID)), 0)
})
## Test passed 🎉
dt <- bind_rows(dt.norf, dt.rf) %>%
select(-c(NoRisk))
message(sprintf("Training data: %d rows", nrow(dt)))
## Training data: 1084 rows
dt %>% select(c(Sex, sym(params$rf_name))) %>% table() %>%
kbl(caption = "Risk factor distributions") %>% kable_styling("hover", full_width = F, position="left")
NO_RISK | RISK | |
---|---|---|
female | 395 | 209 |
male | 250 | 230 |
just for fixing plot title
cohort_title <- if_else(params$cohort == "UKBB", "UK Biobank", params$cohort)
model <- list()
expl_vars <- readMat(paste("PCA", params$cohort, "PCA_explained.mat", sep = "/"))$explained
model$npcs <- length(expl_vars[cumsum(expl_vars) < params$pca_var])
message(sprintf("Number of PC components with %.2f%% variance explained: %d", params$pca_var, model$npcs))
## Number of PC components with 99.90% variance explained: 210
rm(expl_vars)
# this is a combined X (Age + Sex + PCScores) & Y (Risk Factor)
model$dt.train <- inner_join(
dt %>%
select(c(ID, Age, Sex, sym(params$rf_name))) %>%
mutate(
Sex = as.numeric(Sex)
),
cbind(
read.csv(paste("PCA", sprintf("%s_ids.csv", params$cohort), sep="/")),
readMat(paste("PCA", params$cohort, "PCA_score.mat", sep="/"))$score[, 1:model$npcs]
),
by = c("ID" = "ID"))
# check the cohort
message(sprintf("Number of cases = %d, columns = %d", nrow(model$dt.train), ncol(model$dt.train)))
## Number of cases = 1084, columns = 214
This should only include that specific cohort
test_that("Check cohort", {
expect_equal(substr(params$cohort, 1, 3), (model$dt.train %>% transmute(cohort = substr(ID, 1, 3)) %>% distinct())$cohort)
})
## Test passed 😸
No NA rows
test_that("No NA rows", {
expect_equal(0, sum(is.na(model$dt.train)))
})
## Test passed 😸
We’re going to use 5-fold cross validation to determine the optimal
parameter for PLS, which is the ncomp
.
(m_kcv <- train_pls(as.formula(sprintf("%s ~ Age + Sex + .", params$rf_name)), dt = model$dt.train %>% select(-c(ID)), n_comps=params$max_ncomps))
## + Fold1: ncomp=30
## - Fold1: ncomp=30
## + Fold2: ncomp=30
## - Fold2: ncomp=30
## + Fold3: ncomp=30
## - Fold3: ncomp=30
## + Fold4: ncomp=30
## - Fold4: ncomp=30
## + Fold5: ncomp=30
## - Fold5: ncomp=30
## Aggregating results
## Selecting tuning parameters
## Fitting ncomp = 14 on full training set
## Partial Least Squares
##
## 1084 samples
## 212 predictor
## 2 classes: 'NO_RISK', 'RISK'
##
## Pre-processing: centered (212)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 867, 867, 868, 867, 867
## Resampling results across tuning parameters:
##
## ncomp ROC Sens Spec
## 1 0.6191747 0.8372093 0.2755747
## 2 0.6929981 0.8232558 0.4238245
## 3 0.7165292 0.8279070 0.4830460
## 4 0.7392575 0.8155039 0.4875392
## 5 0.7447988 0.8093023 0.5102665
## 6 0.7488793 0.8077519 0.5125392
## 7 0.7439726 0.8015504 0.5420585
## 8 0.7459260 0.8062016 0.5398642
## 9 0.7473030 0.7906977 0.5466562
## 10 0.7473885 0.7875969 0.5603448
## 11 0.7474031 0.7798450 0.5807210
## 12 0.7502114 0.7736434 0.5807994
## 13 0.7548387 0.7751938 0.5853187
## 14 0.7575199 0.7798450 0.6012800
## 15 0.7571138 0.7813953 0.6012800
## 16 0.7569609 0.7813953 0.6013062
## 17 0.7530724 0.7891473 0.5853448
## 18 0.7479747 0.7891473 0.5830199
## 19 0.7442363 0.7782946 0.5625653
## 20 0.7405891 0.7798450 0.5739551
## 21 0.7357300 0.7860465 0.5602926
## 22 0.7337219 0.7813953 0.5625653
## 23 0.7317813 0.7798450 0.5489028
## 24 0.7317473 0.7782946 0.5670585
## 25 0.7276939 0.7782946 0.5488506
## 26 0.7257288 0.7643411 0.5510972
## 27 0.7258096 0.7813953 0.5625392
## 28 0.7258187 0.7736434 0.5580199
## 29 0.7236444 0.7736434 0.5511233
## 30 0.7222485 0.7720930 0.5488767
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 14.
Get the number of component
(model$ncomp <- m_kcv$bestTune$ncomp)
## [1] 14
Plot for analysis
plot(m_kcv, main=sprintf("PLS components (%s - %s)", params$rf_name, params$cohort))
After we get the optimal parameter, we train the PLS using leave-one-out cross validation. Since this will take time, we need multiple cores to compute.
Create train control with LOOCV
tc_loo <- trainControl(method="LOOCV", savePredictions = "all", classProbs = TRUE, verboseIter=FALSE,
summaryFunction = twoClassSummary, allowParallel = TRUE)
# parallel cores
if( n_cores > 1 ) {
cl <- makeCluster(n_cores - 1, outfile = "")
registerDoParallel(cl)
getDoParWorkers()
}
## [1] 19
# train (take a while)
model$pls <- train(
form = sprintf("%s ~ Age + Sex + .", params$rf_name) %>% as.formula(),
data = model$dt.train %>% select(-ID),
method = "pls",
metric = "ROC",
preProc = c("center"),
tuneGrid = data.frame(ncomp=model$ncomp),
probMethod = "softmax",
trControl = tc_loo
)
# unregister cores
if( n_cores > 1 ) {
stopCluster(cl)
registerDoSEQ()
}
model$pls
## Partial Least Squares
##
## 1084 samples
## 212 predictor
## 2 classes: 'NO_RISK', 'RISK'
##
## Pre-processing: centered (212)
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 1083, 1083, 1083, 1083, 1083, 1083, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7631545 0.7705426 0.5876993
##
## Tuning parameter 'ncomp' was held constant at a value of 14
Or use getTrainPerf to get the results
getTrainPerf(model$pls)
## TrainROC TrainSens TrainSpec method
## 1 0.7631545 0.7705426 0.5876993 pls
The prediction is in model$pls$pred
data frame. Let’s
plot the training ROC
Check the level order, because RISK should be the first, since that’s the prediction target.
levels(model$pls$pred$obs)
## [1] "NO_RISK" "RISK"
Calculate the ROC. Note we need to reverse the category level.
(m_roc <- roc(model$pls$pred$obs, model$pls$pred$RISK, levels = rev(levels(model$pls$pred$obs))))
## Setting direction: controls > cases
##
## Call:
## roc.default(response = model$pls$pred$obs, predictor = model$pls$pred$RISK, levels = rev(levels(model$pls$pred$obs)))
##
## Data: model$pls$pred$RISK in 439 controls (model$pls$pred$obs RISK) > 645 cases (model$pls$pred$obs NO_RISK).
## Area under the curve: 0.7632
plot(m_roc, legacy_axes=TRUE, main = sprintf("Training ROC: %s (%s)", params$rf_name, params$cohort))
coef(model$pls$finalModel)[1:10,,]
## NO_RISK RISK
## Age -0.0231668150 0.0231668150
## Sex 0.0001204298 -0.0001204298
## `1` 0.0003525963 -0.0003525963
## `2` 0.0003842034 -0.0003842034
## `3` -0.0001461091 0.0001461091
## `4` -0.0006980778 0.0006980778
## `5` 0.0008185680 -0.0008185680
## `6` 0.0010667158 -0.0010667158
## `7` 0.0005619117 -0.0005619117
## `8` 0.0007838335 -0.0007838335
Add some other information too in the model
model$cohort <- params$cohort
model$risk_factor <- params$rf_name
model$pca_var <- params$pca_var
model_filename <- paste(params$output_folder, sprintf("PLS_%s_%s.rds", params$cohort, params$rf_name), sep="/")
write_rds(model, file=model_filename, compress = "gz")
message(sprintf("Saved model to %s", model_filename))
## Saved model to PLS/PLS_UKBB_Hypercholesterolemia.rds