--- title: "BayesSUR with random effects" author: "Zhi Zhao" output: rmarkdown::html_vignette vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{BayesSUR with random effects} \usepackage[utf8]{inputenc} --- ```{css, echo=FALSE} pre { overflow-y: auto; } pre[class] { max-height: 350px; } ``` ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, eval = FALSE) options(rmarkdown.html_vignette.check_title = FALSE) ``` The BayesSUR model has been extended to include mandatory variables by assigning Gaussian priors as random effects rather than spike-and-slab priors, named as **SSUR-MRF with random effects** in [Zhao et al. 2023](https://doi.org/10.1093/jrsssc/qlad102). The R code for the simulated data and real data analyses in [Zhao et al. 2023](https://doi.org/10.48550/arXiv.2101.05899) can be found at the GitHub repository [BayesSUR-RE](https://github.com/zhizuio/BayesSUR-RE). Here, we show some small examples to run the BayesSUR mdoel with random effects. To get started, load the package with ```{r, eval=TRUE} library("BayesSUR") ``` ## Simulate data We design a network as the following figure (a) to construct a complex structure between $20$ response variables and $300$ predictors. It assumes that the responses are divided into six groups, and the first $120$ predictors are divided into nine groups. ![_Simulation scenarios: True relationships between response variables and predictors. (a) Network structure between $\mathbf Y$ and $\mathbf X$. (b) Spare latent indicator variable $\Gamma$ for the associations between $\mathbf Y$ and $\mathbf X$ in the SUR model. Black blocks indicate nonzero coefficients and white blocks indicate zero coefficients. (c) Additional structure in the residual covariance matrix between response variables not explained by $\mathbf X\mathbf B$. Black blocks indicate correlated residuals of the corresponding response variables and white blocks indicate uncorrelated residuals of the corresponding response variables._](../man/figures/figure2.png){width=90%}
Load the simulation function `sim.ssur()` as follows. ```{r} library("gRbase") sim.ssur <- function(n, s, p, t0 = 0, seed = 123, mv = TRUE, t.df = Inf, random.intercept = 0, intercept = TRUE) { # set seed to fix coefficients set.seed(7193) sd_b <- 1 mu_b <- 1 b <- matrix(rnorm((p + ifelse(t0 == 0, 1, 0)) * s, mu_b, sd_b), p + ifelse(t0 == 0, 1, 0), s) # design groups and pathways of Gamma matrix gamma <- matrix(FALSE, p + ifelse(t0 == 0, 1, 0), s) if (t0 == 0) gamma[1, ] <- TRUE gamma[2:6 - ifelse(t0 == 0, 0, 1), 1:5] <- TRUE gamma[11:21 - ifelse(t0 == 0, 0, 1), 6:12] <- TRUE gamma[31:51 - ifelse(t0 == 0, 0, 1), 1:5] <- TRUE gamma[31:51 - ifelse(t0 == 0, 0, 1), 13:15] <- TRUE gamma[52:61 - ifelse(t0 == 0, 0, 1), 1:12] <- TRUE gamma[71:91 - ifelse(t0 == 0, 0, 1), 6:15] <- TRUE gamma[111:121 - ifelse(t0 == 0, 0, 1), 1:15] <- TRUE gamma[122 - ifelse(t0 == 0, 0, 1), 16:18] <- TRUE gamma[123 - ifelse(t0 == 0, 0, 1), 19] <- TRUE gamma[124 - ifelse(t0 == 0, 0, 1), 20] <- TRUE G_kron <- matrix(0, s * p, s * p) G_m <- bdiag(matrix(1, ncol = 5, nrow = 5), matrix(1, ncol = 7, nrow = 7), matrix(1, ncol = 8, nrow = 8)) G_p <- bdiag(matrix(1, ncol = 5, nrow = 5), diag(3), matrix(1, ncol = 11, nrow = 11), diag(9), matrix(1, ncol = 21, nrow = 21), matrix(1, ncol = 10, nrow = 10), diag(9), matrix(1, ncol = 21, nrow = 21), diag(19), matrix(1, ncol = 11, nrow = 11), diag(181)) G_kron <- kronecker(G_m, G_p) combn11 <- combn(rep((1:5 - 1) * p, each = length(1:5)) + rep(1:5, times = length(1:5)), 2) combn12 <- combn(rep((1:5 - 1) * p, each = length(30:60)) + rep(30:60, times = length(1:5)), 2) combn13 <- combn(rep((1:5 - 1) * p, each = length(110:120)) + rep(110:120, times = length(1:5)), 2) combn21 <- combn(rep((6:12 - 1) * p, each = length(10:20)) + rep(10:20, times = length(6:12)), 2) combn22 <- combn(rep((6:12 - 1) * p, each = length(51:60)) + rep(51:60, times = length(6:12)), 2) combn23 <- combn(rep((6:12 - 1) * p, each = length(70:90)) + rep(70:90, times = length(6:12)), 2) combn24 <- combn(rep((6:12 - 1) * p, each = length(110:120)) + rep(110:120, times = length(6:12)), 2) combn31 <- combn(rep((13:15 - 1) * p, each = length(30:50)) + rep(30:50, times = length(13:15)), 2) combn32 <- combn(rep((13:15 - 1) * p, each = length(70:90)) + rep(70:90, times = length(13:15)), 2) combn33 <- combn(rep((13:15 - 1) * p, each = length(110:120)) + rep(110:120, times = length(13:15)), 2) combn4 <- combn(rep((16:18 - 1) * p, each = length(121)) + rep(121, times = length(16:18)), 2) combn5 <- matrix(rep((19 - 1) * p, each = length(122)) + rep(122, times = length(19)), nrow = 1, ncol = 2) combn6 <- matrix(rep((20 - 1) * p, each = length(123)) + rep(123, times = length(20)), nrow = 1, ncol = 2) combnAll <- rbind(t(combn11), t(combn12), t(combn13), t(combn21), t(combn22), t(combn23), t(combn24), t(combn31), t(combn32), t(combn33), t(combn4), combn5, combn6) set.seed(seed + 7284) sd_x <- 1 x <- matrix(rnorm(n * p, 0, sd_x), n, p) if (t0 == 0 & intercept) x <- cbind(rep(1, n), x) if (!intercept) { gamma <- gamma[-1, ] b <- b[-1, ] } xb <- matrix(NA, n, s) if (mv) { for (i in 1:s) { if (sum(gamma[, i]) >= 1) { if (sum(gamma[, i]) == 1) { xb[, i] <- x[, gamma[, i]] * b[gamma[, i], i] } else { xb[, i] <- x[, gamma[, i]] %*% b[gamma[, i], i] } } else { xb[, i] <- sapply(1:s, function(i) rep(1, n) * b[1, i]) } } } else { if (sum(gamma) >= 1) { xb <- x[, gamma] %*% b[gamma, ] } else { xb <- sapply(1:s, function(i) rep(1, n) * b[1, i]) } } corr_param <- 0.9 M <- matrix(corr_param, s, s) diag(M) <- rep(1, s) ## wanna make it decomposable Prime <- list(c(1:(s * .4), (s * .8):s), c((s * .4):(s * .6)), c((s * .65):(s * .75)), c((s * .8):s)) G <- matrix(0, s, s) for (i in 1:length(Prime)) { G[Prime[[i]], Prime[[i]]] <- 1 } # check dimnames(G) <- list(1:s, 1:s) length(gRbase::mcsMAT(G - diag(s))) > 0 var <- solve(BDgraph::rgwish(n = 1, adj = G, b = 3, D = M)) # change seeds to add randomness on error set.seed(seed + 8493) sd_err <- 0.5 if (is.infinite(t.df)) { err <- matrix(rnorm(n * s, 0, sd_err), n, s) %*% chol(as.matrix(var)) } else { err <- matrix(rt(n * s, t.df), n, s) %*% chol(as.matrix(var)) } if (t0 == 0) { b.re <- NA z <- NA y <- xb + err if (random.intercept != 0) { y <- y + matrix(rnorm(n * s, 0, sqrt(random.intercept)), n, s) } z <- sample(1:4, n, replace = T, prob = rep(1 / 4, 4)) return(list(y = y, x = x, b = b, gamma = gamma, z = model.matrix(~ factor(z) + 0)[, ], b.re = b.re, Gy = G, mrfG = combnAll)) } else { # add random effects z <- t(rmultinom(n, size = 1, prob = c(.1, .2, .3, .4))) z <- sample(1:t0, n, replace = T, prob = rep(1 / t0, t0)) set.seed(1683) b.re <- rnorm(t0, 0, 2) y <- matrix(b.re[z], nrow = n, ncol = s) + xb + err return(list( y = y, x = x, b = b, gamma = gamma, z = model.matrix(~ factor(z) + 0)[, ], b.re = b.re, Gy = G, mrfG = combnAll )) } } ``` To simulate data with sample size $n=250$, responsible variables $s=20$ and covariates $p=300$, we can specify the corresponding parameters in the function `sim.ssur()` as follows. ```{r} library("Matrix") n <- 250 s <- 20 p <- 300 sim1 <- sim.ssur(n, s, p, seed = 1) ``` To simulate data from $4$ individual groups with group indicator variables following the defaul multinomial distribution $multinomial(0.1,0.2,0.3,0.4)$, we can simply add the argument `t0 = 4` in the function `sim.ssur()` as follows. ```{r} t0 <- 4 sim2 <- sim.ssur(n, s, p, t0, seed = 1) # learning data sim2.val <- sim.ssur(n, s, p, t0, seed=101) # validation data ``` ## Run BayesSUR model with random effects According to the guideline of prior specification in [Zhao et al. 2023](https://doi.org/10.1093/jrsssc/qlad102), we first set the following parameters `hyperpar` and then running the BayesSUR model with random effects via `betaPrior = "reGroup"` (default `betaPrior = "independent"` with spike-and-slab priors for all coefficients). **For illustration, we run a short MCMC** with `nIter = 300` and `burnin = 100`. Note that here the graph used for the Markov random field prior is the true graph from the returned object of the simulation `sim2$mrfG`. ```{r} hyperpar <- list(mrf_d = -2, mrf_e = 1.6, a_w0 = 100, b_w0 = 500, a_w = 15, b_w = 60) set.seed(1038) fit2 <- BayesSUR( data = cbind(sim2$y, sim2$z, sim2$x), Y = 1:s, X_0 = s + 1:t0, X = s + t0 + 1:p, outFilePath = "sim2_mrf_re", hyperpar = hyperpar, gammaInit = "0", betaPrior = "reGroup", nIter = 300, burnin = 100, covariancePrior = "HIW", standardize = F, standardize.response = F, gammaPrior = "MRF", mrfG = sim2$mrfG, output_CPO = T ) ``` ``` ## BayesSUR -- Bayesian Seemingly Unrelated Regression Modelling ## Reading input files ... ... successfull! ## Clearing and initialising output files ## Initialising the (SUR) MCMC Chain ... ... DONE! ## Drafting the output files with the start of the chain ... DONE! ## ## Starting 2 (parallel) chain(s) for 300 iterations: ## Temperature ladder updated, new temperature ratio : 1.1 ## MCMC ends. --- Saving results and exiting ## Saved to : sim2_mrf_re1/data_SSUR_****_out.txt ## Final w0 : 5.43872 ## Final w : 0.151529 ## Final tau : 5.03502 w/ proposal variance: 1.25175 ## Final eta : 0.0404965 ## -- Average Omega : 0 ## Final temperature ratio : 1.1 ## ## DONE, exiting! ``` Check some summarized information of the results: ```{r} summary(fit2) ``` ``` ## Call: ## BayesSUR(data = cbind(sim2$y, sim2$z, sim2$x), ...) ## ## CPOs: ## Min. 1st Qu. Median 3rd Qu. Max. ## 0.0001880944 0.0242389626 0.0347986252 0.0465162558 0.1307315429 ## ## Number of selected predictors (mPIP > 0.5): 2823 of 20x300 ## ## Top 10 predictors on average mPIP across all responses: ## X.251 X.27 X.296 X.196 X.285 X.130 X.32 X.104 X.58 X.10 ## 0.729580 0.702225 0.695755 0.672865 0.656705 0.653220 0.651730 0.643770 0.638795 0.635315 ## ## Top 10 responses on average mPIP across all predictors: ## X.5 X.8 X.19 X.12 X.4 X.11 X.10 X.14 X.16 X.9 ## 0.5099717 0.4958283 0.4896067 0.4811993 0.4784647 0.4766230 0.4744843 0.4743030 0.4742693 0.4740880 ## ## Expected log pointwise predictive density (elpd) estimates: ## elpd.LOO = -16836.31, elpd.WAIC = -16834.33 ## ## MCMC specification: ## iterations = 300, burn-in = 100, chains = 2 ## gamma local move sampler: bandit ## gamma initialisation: 0 ## ## Model specification: ## covariance prior: HIW ## gamma prior: MRF ## ## Hyper-parameters: ## a_w b_w nu a_tau b_tau a_eta b_eta mrf_d mrf_e a_w0 b_w0 ## 15.0 60.0 22.0 0.1 10.0 0.1 1.0 -2.0 1.6 100.0 500.0 ``` Compute the model performance with respect to **variable selection** ```{r} # compute accuracy, sensitivity, specificity of variable selection gamma <- getEstimator(fit2) (accuracy <- sum(data.matrix(gamma > 0.5) == sim2$gamma) / prod(dim(gamma))) ``` ``` ## [1] 0.5371667 ``` ```{r} (sensitivity <- sum((data.matrix(gamma > 0.5) == 1) & (sim2$gamma == 1)) / sum(sim2$gamma == 1)) ``` ``` ## [1] 0.5298701 ``` ```{r} (specificity <- sum((data.matrix(gamma > 0.5) == 0) & (sim2$gamma == 0)) / sum(sim2$gamma == 0)) ``` ``` ## [1] 0.5382409 ``` Compute the model performance with respect to **response prediction** ```{r} # compute RMSE and RMSPE for prediction performance beta <- getEstimator(fit2, estimator = "beta", Pmax = .5, beta.type = "conditional") (RMSE <- sqrt(sum((sim2$y - cbind(sim2$z, sim2$x) %*% beta)^2) / prod(dim(sim2$y)))) ``` ``` ## [1] 7.134064 ``` ```{r} (RMSPE <- sqrt(sum((sim2.val$y - cbind(sim2.val$z, sim2.val$x) %*% beta)^2) / prod(dim(sim2.val$y)))) ``` ``` ## [1] 8.269975 ``` Compute the model performance with respect to **coefficient bias** ```{r} # compute bias of beta estimates b <- sim2$b b[sim2$gamma == 0] <- 0 (beta.l2 <- sqrt(sum((beta[-c(1:4), ] - b)^2) / prod(dim(b)))) ``` ``` ## [1] 0.4617231 ``` Compute the model performance with respect to **covariance selection** ```{r} g.re <- getEstimator(fit2, estimator = "Gy") (g.accuracy <- sum((g.re > 0.5) == sim2$Gy) / prod(dim(g.re))) ``` ``` ## [1] 0.51 ``` ```{r} (g.sensitivity <- sum(((g.re > 0.5) == sim2$Gy)[sim2$Gy == 1]) / sum(sim2$Gy == 1)) ``` ``` ## [1] 0.1089109 ``` ```{r} (g.specificity <- sum(((g.re > 0.5) == sim2$Gy)[sim2$Gy == 0]) / sum(sim2$Gy == 0)) ``` ``` ## [1] 0.9191919 ``` ## Referrence > Zhi Zhao, Marco Banterle, Alex Lewin, Manuela Zucknick (2023). > Multivariate Bayesian structured variable selection for pharmacogenomic studies. > _Journal of the Royal Statistical Society: Series C (Applied Statistics)_, qlad102. DOI: [10.1093/jrsssc/qlad102](https://doi.org/10.1093/jrsssc/qlad102).