The spamtree package runs multivariate spatial regressions based on spatial multivariate trees, using a non-separable cross-covariance function on latent dimensions. In this vignette we simulate two spatially referenced outcomes.
library(abind)
library(magrittr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(ggplot2)
library(spamtree)
# Fix the RNG seed so the simulated data set is reproducible.
set.seed(2021)

SS <- 30 # number of grid points along each coordinate axis
n <- SS^2 # total number of locations, including those later set to missing
xlocs <- seq(0.0, 1, length.out = SS)

# Regular SS x SS grid on the unit square, sorted by Var1 and then Var2.
coords <- expand.grid(xlocs, xlocs) %>%
  as.data.frame() %>%
  arrange(Var1, Var2)

# Stack two copies of the grid, one per outcome; mv_id labels the outcome.
c1 <- coords %>% mutate(mv_id = 1)
c2 <- coords %>% mutate(mv_id = 2)
coords <- bind_rows(c1, c2)

# Coordinates only (outcome label dropped), as a data frame and as a matrix.
coords_q <- coords %>% dplyr::select(-mv_id)
cx <- coords_q %>% as.matrix()

# Zero-based row index. seq_len() is used instead of 1:nrow(cx) so the
# sequence stays correct (empty) if `cx` ever has zero rows.
ix <- seq_len(nrow(cx)) - 1
mv_id <- coords$mv_id
q <- 2 # number of outcomes
sigma.sq <- 1 # NOTE(review): not referenced by the code shown in this vignette
tau.sq <- c(.03, .1) # measurement-error variance, one entry per outcome

# Expand the per-outcome variances to one value per row by indexing tau.sq
# with the outcome label (mv_id takes values 1 and 2).
tausq_long <- tau.sq[mv_id]

# some true values for the non-separable multivariate cross-covariance
# implemented here
ai1 <- c(1, 1.5)
ai2 <- c(.1, .51)
phi_i <- c(1, 2)
thetamv <- 5

# Symmetric q x q matrix passed to the cross-covariance function; only the
# off-diagonal entry is non-zero.
Dmat <- matrix(0, q, q)
Dmat[2, 1] <- 1
# Mirror the lower triangle into the upper one. Going through t(Dmat) keeps
# the result symmetric for any q; copying Dmat[lower.tri(Dmat)] directly
# pairs up the wrong entries once q > 3, because the two triangles are
# traversed in different column-major orders.
Dmat[upper.tri(Dmat)] <- t(Dmat)[upper.tri(Dmat)]

# Two standard-normal covariates and their true regression coefficients.
X <- cbind(rnorm(nrow(coords)), rnorm(nrow(coords)))
B <- c(-.9, .05)
# Dense cross-covariance over every (location, outcome) pair of the full GP,
# timed for reference.
system.time({
  CC <- CrossCovarianceAG10(
    cx, mv_id, cx, mv_id,
    ai1, ai2, phi_i, thetamv, Dmat
  )
})
#> user system elapsed
#> 0.065 0.015 0.081

# Lower-triangular Cholesky factor, used to draw the correlated latent field.
LC <- t(chol(CC))

# Sample the outcomes at all locations. The draw order (latent field first,
# then measurement noise) is kept fixed so the seed reproduces the same data.
n_all <- nrow(cx)
w_latent <- LC %*% rnorm(n_all)
noise <- sqrt(tausq_long) * rnorm(n_all)
y_full <- X %*% B + w_latent + noise

# The dense covariance and its factor are no longer needed.
rm(CC, LC)
# Blank out one rectangle per outcome: lna is 1 where the response is kept
# and NA where it is removed. The two rectangles differ, so the outcomes end
# up observed at different locations (misalignment).
var1 <- coords_q$Var1
var2 <- coords_q$Var2
in_band <- (var2 > .4) & (var2 < .7)
gap_1 <- (mv_id == 1) & in_band & (var1 > .4) & (var1 < .9)
gap_2 <- (mv_id == 2) & in_band & (var1 > .2) & (var1 < .7)

lna <- rep(1, nrow(coords))
lna[gap_1 | gap_2] <- NA

# Multiplying by NA turns the selected responses into missing values.
y <- y_full * lna
simdata <- coords %>% cbind(y) %>% as.data.frame()

We now run spamtree. In practice, the data size would be much larger and we would run many more MCMC iterations.
# MCMC settings: samples to keep, burn-in length, and thinning interval.
mcmc_keep <- 200
mcmc_burn <- 200
mcmc_thin <- 2
mcmc_settings <- list(
  keep = mcmc_keep,
  burn = mcmc_burn,
  thin = mcmc_thin
)

# Fit the model to the partly missing outcomes `y`, with covariates `X`,
# coordinates `cx`, and outcome labels `mv_id`.
# NOTE(review): num_threads is hard-coded to 10 -- confirm this suits the
# machine the vignette is built on.
spamtree_done <- spamtree(
  y, X,
  cx, mv_id,
  mcmc = mcmc_settings,
  num_threads = 10,
  verbose = TRUE
)
#> Building reference set.
#> Branching the tree 1 ( 1 ) 2 ( 2 ) 3 ( 3 ) 4 ( 4 ).
#> Finalizing with leaves.
#> Building graph.
#> Running MCMC for 600 iterations.
#> 10.0% 226ms (total: 257ms) ~ MCMC acceptance 9.50% (total: 31.15%)
#> 20.0% 199ms (total: 456ms) ~ MCMC acceptance 12.50% (total: 20.66%)
#> 30.0% 193ms (total: 649ms) ~ MCMC acceptance 14.00% (total: 15.47%)
#> 40.0% 200ms (total: 850ms) ~ MCMC acceptance 7.00% (total: 11.62%)
#> 50.0% 196ms (total: 1047ms) ~ MCMC acceptance 5.00% (total: 11.30%)
#> 60.0% 201ms (total: 1249ms) ~ MCMC acceptance 6.50% (total: 11.08%)
#> 70.0% 203ms (total: 1452ms) ~ MCMC acceptance 8.50% (total: 10.69%)
#> 80.0% 209ms (total: 1661ms) ~ MCMC acceptance 11.00% (total: 10.81%)
#> 90.0% 216ms (total: 1878ms) ~ MCMC acceptance 15.00% (total: 12.57%)
#> MCMC done [2074ms]

Finally, we do some postprocessing and plot the predictions for both outcomes, as well as the latent process.
# Average a list of stored MCMC draws into one value per row.
# The draws are bound along a third dimension, the first column is kept,
# and the mean is taken across draws for each row.
# NOTE(review): presumably `samples` is a list with one matrix per kept
# iteration -- confirm against the spamtree() return value.
posterior_mean <- function(samples) {
  samples %>%
    abind(along = 3) %>%
    `[`(, 1, ) %>%
    apply(1, mean)
}

# Raster map of the column named `fill_var` over (Var1, Var2), with one
# facet per outcome (mv_id) and the legend hidden.
plot_field <- function(df, fill_var) {
  ggplot(df, aes(Var1, Var2, fill = .data[[fill_var]])) +
    geom_raster() +
    facet_grid(~mv_id) +
    scale_fill_viridis_c() +
    theme_minimal() +
    theme(legend.position = "none")
}

# predictions
y_out <- posterior_mean(spamtree_done$yhat_mcmc)
w_out <- posterior_mean(spamtree_done$w_mcmc)

# Attach the posterior means to the coordinate table returned by spamtree(),
# renaming its outcome label column to mv_id for faceting.
outdf <- spamtree_done$coordsinfo %>%
  rename(mv_id = sort_mv_id) %>%
  cbind(data.frame(
    w_spamtree = w_out,
    y_spamtree = y_out
  ))

# plot predictions
plot_field(outdf, "y_spamtree")

# plot latent process
plot_field(outdf, "w_spamtree")