knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)  # attaches ggplot2, dplyr, readr, etc.
library(caret)
library(pROC)
library(R.matlab)
library(kableExtra)

# Plot colours for the internal/external cohorts: MESA is always blue and
# UKBB always red, regardless of which one is internal for this run.
int.config <- list()
ext.config <- list()
if( params$cohort == "MESA" ) {
  int.config$color <- "dodgerblue3"
  ext.config$color <- "firebrick3"
} else {
  int.config$color <- "firebrick3"
  ext.config$color <- "dodgerblue3"
}
This report performs internal and external validation for a shape-based atlas PLSR study.
# Load this cohort's PLSR training result and determine the external cohort.
dt <- readRDS(file.path(params$input_folder, sprintf("PLS_%s_%s.rds", params$cohort, params$rf_name)))
if( params$cohort == "MESA" ) {
  ext_cohort <- "UKBB"
} else {
  ext_cohort <- "MESA"
}
message(sprintf("The external cohort: %s", ext_cohort))
## The external cohort: MESA
Prepare the external cohort's input variables. We read them from the external cohort's own training-data result and keep only the independent variables, dropping its native principal scores, because we instead use the external cohort's shapes projected onto this cohort's model atlas.
dt.test <- readRDS(file.path(
  params$input_folder, sprintf("PLS_%s_%s.rds", ext_cohort, params$rf_name)
))$dt.train %>%
  # keep only the independent variables; drop the external cohort's native scores
  select(ID, Age, Sex, all_of(params$rf_name)) %>%
  # attach the external shapes projected onto this cohort's atlas
  inner_join(
    cbind(
      read_csv(file.path("PCA", sprintf("%s_ids.csv", ext_cohort)), show_col_types = FALSE),
      readMat(file.path("PCA", params$cohort, sprintf("%s_score.mat", ext_cohort)))$proj[, 1:dt$npcs]
    ),
    by = "ID"
  )
message(sprintf("The external cohort data: %d x %d", nrow(dt.test), ncol(dt.test)))
## The external cohort data: 770 x 214
Frequency tables
# frequency table of the outcome in the internal training data
dt$dt.train %>% select(all_of(params$rf_name)) %>% table() %>%
  kbl(caption = "Internal test data:") %>%
  kable_styling("hover", full_width = FALSE, position = "left")
Diabetes | Freq |
---|---|
NO_RISK | 645 |
RISK | 159 |
# frequency table of the outcome in the external test data
dt.test %>% select(all_of(params$rf_name)) %>% table() %>%
  kbl(caption = "External test data:") %>%
  kable_styling("hover", full_width = FALSE, position = "left")
Diabetes | Freq |
---|---|
NO_RISK | 397 |
RISK | 373 |
The internal-cohort predictions are already stored in `dt$pls$pred`.
Internal ROC
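The chunk computing this ROC is not shown; the sketch below reconstructs it from the Call line in the output. The CI computation is an assumption consistent with the 2000-replicate stratified bootstrap reported.

```r
# internal ROC: RISK probabilities vs. observed classes, RISK as positive level
int.roc <- roc(response = dt$pls$pred$obs, predictor = dt$pls$pred$RISK,
               levels = rev(levels(dt$pls$pred$obs)))
# 95% CI of the AUC (assumed: stratified bootstrap, pROC's default boot.n = 2000)
int.ci <- ci.auc(int.roc, method = "bootstrap")
int.roc
int.ci
```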
## Setting direction: controls > cases
##
## Call:
## roc.default(response = dt$pls$pred$obs, predictor = dt$pls$pred$RISK, levels = rev(levels(dt$pls$pred$obs)))
##
## Data: dt$pls$pred$RISK in 159 controls (dt$pls$pred$obs RISK) > 645 cases (dt$pls$pred$obs NO_RISK).
## Area under the curve: 0.7909
## 95% CI: 0.7479-0.8307 (2000 stratified bootstrap replicates)
We still need to calculate the external predictions, so we first refit the final PLS model on the training data.
# refit the final PLS model on the full training set (no resampling),
# reusing the number of components tuned during training
pls <- train(
  form = sprintf("%s ~ Age + Sex + .", params$rf_name) %>% as.formula(),
  data = dt$dt.train %>% select(-ID),
  method = "pls",
  metric = "ROC",
  preProc = c("center"),
  tuneGrid = data.frame(ncomp = dt$ncomp),
  probMethod = "softmax",
  trControl = trainControl(
    method = "none", savePredictions = "final", classProbs = TRUE, verboseIter = FALSE,
    summaryFunction = twoClassSummary)
)
# predicted RISK probabilities on the external cohort, paired with observed labels
ext.pred <- data.frame(
  pred = predict(pls, newdata = dt.test, type = "prob") %>% pull(RISK),
  obs = pull(dt.test, !!sym(params$rf_name))
)
External ROC
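Likewise for the external cohort: this sketch mirrors the Call line printed below, and the CI call is again an assumption matching the reported bootstrap.

```r
# external ROC: projected-shape predictions vs. observed classes
ext.roc <- roc(response = ext.pred$obs, predictor = ext.pred$pred,
               levels = rev(levels(as.factor(ext.pred$obs))))
ext.ci <- ci.auc(ext.roc, method = "bootstrap")
ext.roc
ext.ci
```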
## Setting direction: controls > cases
##
## Call:
## roc.default(response = ext.pred$obs, predictor = ext.pred$pred, levels = rev(levels(as.factor(ext.pred$obs))))
##
## Data: ext.pred$pred in 373 controls (ext.pred$obs RISK) > 397 cases (ext.pred$obs NO_RISK).
## Area under the curve: 0.772
## 95% CI: 0.7387-0.8048 (2000 stratified bootstrap replicates)
Compare the two ROCs
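A sketch of the comparison, assuming pROC's `roc.test()`, which applies DeLong's test by default for two ROC curves (matching the output below):

```r
# DeLong's test for the difference between the internal and external AUCs
(test.roc <- roc.test(int.roc, ext.roc))
```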
##
## DeLong's test for two ROC curves
##
## data: int.roc and ext.roc
## D = 0.71104, df = 1517.2, p-value = 0.4772
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2
## 0.7908927 0.7719694
Use the best threshold from the 'closest.topleft' criterion to balance sensitivity and specificity, as sketched below.
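The prediction frames and cutoffs used next are not defined in the chunks shown; a minimal reconstruction, assuming `int.pred` mirrors the layout of `ext.pred` and the thresholds come from pROC's `coords()`:

```r
# assumed: internal probability/label frame, mirroring ext.pred above
int.pred <- data.frame(pred = dt$pls$pred$RISK, obs = dt$pls$pred$obs)
# best threshold per ROC by the closest.topleft criterion
int.cutoff <- coords(int.roc, "best", best.method = "closest.topleft")
ext.cutoff <- coords(ext.roc, "best", best.method = "closest.topleft")
```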
# classify internal predictions at the internal threshold and tabulate
int.pred <- int.pred %>%
  mutate(class = ifelse(pred > int.cutoff$threshold, "RISK", "NO_RISK") %>% as.factor())
(int.cm <- confusionMatrix(data = int.pred$class, reference = int.pred$obs, positive = "RISK"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO_RISK RISK
## NO_RISK 492 46
## RISK 153 113
##
## Accuracy : 0.7525
## 95% CI : (0.7211, 0.782)
## No Information Rate : 0.8022
## P-Value [Acc > NIR] : 0.9998
##
## Kappa : 0.3777
##
## Mcnemar's Test P-Value : 5.729e-14
##
## Sensitivity : 0.7107
## Specificity : 0.7628
## Pos Pred Value : 0.4248
## Neg Pred Value : 0.9145
## Prevalence : 0.1978
## Detection Rate : 0.1405
## Detection Prevalence : 0.3308
## Balanced Accuracy : 0.7367
##
## 'Positive' Class : RISK
##
# classify external predictions at the external threshold and tabulate
ext.pred <- ext.pred %>%
  mutate(class = ifelse(pred > ext.cutoff$threshold, "RISK", "NO_RISK") %>% as.factor())
(ext.cm <- confusionMatrix(data = ext.pred$class, reference = ext.pred$obs, positive = "RISK"))
## Confusion Matrix and Statistics
##
## Reference
## Prediction NO_RISK RISK
## NO_RISK 276 102
## RISK 121 271
##
## Accuracy : 0.7104
## 95% CI : (0.6769, 0.7422)
## No Information Rate : 0.5156
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4211
##
## Mcnemar's Test P-Value : 0.2281
##
## Sensitivity : 0.7265
## Specificity : 0.6952
## Pos Pred Value : 0.6913
## Neg Pred Value : 0.7302
## Prevalence : 0.4844
## Detection Rate : 0.3519
## Detection Prevalence : 0.5091
## Balanced Accuracy : 0.7109
##
## 'Positive' Class : RISK
##
Save the analysis for later collation
This first file is ROC-only (kept for backward compatibility).
roc_filename <- file.path(params$output_folder, sprintf("ROC_%s_%s.rds", params$cohort, params$rf_name))
write_rds(list(
  cohort = params$cohort,
  risk_factor = params$rf_name,
  internal = int.roc,
  external = ext.roc
), file = roc_filename, compress = "gz")
message(sprintf("Saved ROC to %s", roc_filename))
## Saved ROC to Validation/ROC_UKBB_Diabetes.rds
This second file contains everything: the ROCs, CIs, new predictions, cutoffs, and confusion matrices.
pred_filename <- file.path(params$output_folder, sprintf("Prediction_%s_%s.rds", params$cohort, params$rf_name))
write_rds(list(
  cohort = params$cohort,
  risk_factor = params$rf_name,
  internal = list(roc = int.roc, ci = int.ci, pred = int.pred, cutoff = int.cutoff, cm = int.cm),
  external = list(roc = ext.roc, ci = ext.ci, pred = ext.pred, cutoff = ext.cutoff, cm = ext.cm)
), file = pred_filename, compress = "gz")
message(sprintf("Saved predicions to %s", pred_filename))
## Saved predicions to Validation/Prediction_UKBB_Diabetes.rds
# use the full cohort name for a nicer plot title in the paper
if( params$cohort == "MESA" ) {
  cohort_name <- "MESA"
} else {
  cohort_name <- "UK Biobank"
}
plot.roc(int.roc, legacy.axes = FALSE, lty = 1,
         main = sprintf("%s with %s Atlas", params$rf_name, cohort_name))
plot.roc(ext.roc, add = TRUE, lty = 2)
# print DeLong's p-value at the top-left (the specificity axis runs from 1 to 0)
text(1, 1, labels = paste("p-value =", format.pval(test.roc$p.value)), adj = c(0, .5))
legend("bottomright",
       legend = c(
         sprintf("Internal cohort (AUC: %.2f, 95%% CI: [%.2f, %.2f])",
                 int.roc$auc, int.ci[1], int.ci[3]),
         sprintf("External cohort (AUC: %.2f, 95%% CI: [%.2f, %.2f])",
                 ext.roc$auc, ext.ci[1], ext.ci[3])),
       lwd = 2, lty = c(1, 2))
# CSV file names for the score exports, following the pattern
# <risk factor>_<shape cohort>_Shapes_on_<atlas cohort>_Scores.csv
(int.fname <- sprintf("%s/%s_%s_Shapes_on_%s_Scores.csv",
                      params$output_folder, params$rf_name, params$cohort, params$cohort))
## [1] "Validation/Diabetes_UKBB_Shapes_on_UKBB_Scores.csv"
(ext.fname <- sprintf("%s/%s_%s_Shapes_on_%s_Scores.csv",
                      params$output_folder, params$rf_name, ext_cohort, params$cohort))
## [1] "Validation/Diabetes_MESA_Shapes_on_UKBB_Scores.csv"