This vignette visualizes classification results of neural nets using tools from the classmap package. We will fit neural nets by the nnet package, which allows the training of simple neural networks with a single hidden layer.
library(nnet)
library(classmap)
As a first small example, we consider the Iris data. We start by training a neural network with one hidden layer containing 3 neurons on the full iris data.
set.seed(123)
nn.iris <- nnet(Species ~ ., data = iris, size = 3)
## # weights:  27
## initial  value 207.903843 
## iter  10 value 69.852587
## iter  20 value 8.126663
## iter  30 value 5.901328
## iter  40 value 5.299180
## iter  50 value 4.942638
## iter  60 value 4.747845
## iter  70 value 4.379577
## iter  80 value 4.221971
## iter  90 value 3.738670
## iter 100 value 3.572490
## final  value 3.572490 
## stopped after 100 iterations
names(nn.iris)
##  [1] "n"             "nunits"        "nconn"         "conn"         
##  [5] "nsunits"       "decay"         "entropy"       "softmax"      
##  [9] "censored"      "value"         "wts"           "convergence"  
## [13] "fitted.values" "residuals"     "lev"           "call"         
## [17] "terms"         "coefnames"     "xlevels"
head(nn.iris$fitted.values) # matrix of posterior probabilities.
##   setosa   versicolor    virginica
## 1      1 6.016990e-17 1.968299e-78
## 2      1 1.091170e-16 1.603084e-78
## 3      1 6.068970e-17 1.967292e-78
## 4      1 1.211741e-16 1.701564e-78
## 5      1 6.005968e-17 1.968701e-78
## 6      1 6.186116e-17 2.008031e-78
nn.iris$wts # the "weights" (coefficients, some are negative).
##  [1]    8.461321    1.361977  -12.051401   16.284091  -36.812648 -659.229209
##  [7]  -55.372159  -69.510090  197.535633  143.586681    2.651707    1.568175
## [13]    6.055571  -13.504329   -9.530820   23.554323  -28.893963   -6.098134
## [19]   49.173270   75.460324  120.079027 -189.946588  -40.085310  -97.126609
## [25]  -90.777198  195.965473   -9.070470
summary(nn.iris) 
## a 4-3-3 network with 27 weights
## options were - softmax modelling 
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1 
##    8.46    1.36  -12.05   16.28  -36.81 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2 
## -659.23  -55.37  -69.51  197.54  143.59 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3 
##    2.65    1.57    6.06  -13.50   -9.53 
##   b->o1  h1->o1  h2->o1  h3->o1 
##   23.55  -28.89   -6.10   49.17 
##   b->o2  h1->o2  h2->o2  h3->o2 
##   75.46  120.08 -189.95  -40.09 
##   b->o3  h1->o3  h2->o3  h3->o3 
##  -97.13  -90.78  195.97   -9.07
In the above output, the coefficients are listed with their arrows: i1,…,i4 are the 4 _i_nput variables. h1, h2, h3 are the 3 _h_idden nodes. o1, o2, o3 are the _o_utput nodes, one for each class (before the softmax function is applied). b->h1, b->h2, b->h3 are the intercepts to the hidden layer. b->o1, b->o2, b->o3 are the intercepts to the output layer.
We can use this to reconstruct the model from the coefficients. We first extract the coefficients from each layer:
train_x <- as.matrix(iris[, -5]) # original input: 4 variables.
# extract "weights" (coefficients) from each layer:
bias1  <- nn.iris$wts[c(1, 6, 11)] # intercepts to hidden layer
betas1 <- cbind(nn.iris$wts[2:5], nn.iris$wts[7:10], 
                nn.iris$wts[12:15]) # slopes to hidden layer
bias2  <- nn.iris$wts[c(16, 20, 24)] # intercepts to output layer
betas2 <- cbind(nn.iris$wts[17:19], nn.iris$wts[21:23], 
                nn.iris$wts[25:27]) # slopes to output layer.
Now we use the weights and the sigmoid function (for activation in the hidden layer) to reconstruct the representations of the data in the hidden layer (H), as well as in the final layer before softmax (X). X is the set of G-dimensional points v_i given by (6) in the paper. This X will be used for the farness in the class maps.
sigmoid <- function(x) 1 / (1 + exp(-x)) 
H <- t(t(train_x %*% betas1) + bias1) # = hidden layer
X <- t(t(sigmoid(H) %*% betas2) + bias2) # = layer before softmax.
pairs(X, col = unclass(iris$Species)) # G=3 classes ==> 3 dimensions.
plot of chunk unnamed-chunk-5
To obtain the posterior probabilities, we can apply the softmax function to the last layer. We verify that this yields the same result as the probabilities returned by nnet().
softmax <- function(x) exp(x) / sum(exp(x)) 
outprobs <- t(apply(X, 1, softmax))
trainprobs <- nn.iris$fitted.values # this is after softmax
range(trainprobs - outprobs) # near 0, so we have a match.
## [1] -3.418358e-06  3.418358e-06
Now we prepare the output for visualizations.
y <- iris[, 5]
vcrtrain <- vcr.neural.train(X, y, trainprobs)
## 
## There is at least one class with (near-) singular covariance matrix, 
## so farness will be computed from PCA.
names(vcrtrain)
##  [1] "X"         "yint"      "y"         "levels"    "predint"   "pred"     
##  [7] "altint"    "altlab"    "PAC"       "figparams" "fig"       "farness"  
## [13] "ofarness"
vcrtrain$predint[c(1:10, 51:60, 101:110)] # prediction as integer
##   1   2   3   4   5   6   7   8   9  10  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2 
## 101 102 103 104 105 106 107 108 109 110 
##   3   3   3   3   3   3   3   3   3   3
vcrtrain$pred[c(1:10, 51:60, 101:110)]    # prediction as label
##  [1] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
##  [6] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
## [11] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [16] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [21] "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
## [26] "virginica"  "virginica"  "virginica"  "virginica"  "virginica"
vcrtrain$altint[c(1:10, 51:60, 101:110)]  # alternative label as integer
##  [1] 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 2 2 1
vcrtrain$altlab[c(1:10, 51:60, 101:110)]  # alternative label
##  [1] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
##  [6] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
## [11] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
## [16] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
## [21] "setosa"     "setosa"     "setosa"     "versicolor" "setosa"    
## [26] "versicolor" "setosa"     "versicolor" "versicolor" "setosa"
# Probability of Alternative Class (PAC) of each object:
vcrtrain$PAC[1:3] 
## [1] 6.01699e-17 1.09117e-16 6.06897e-17
#
summary(vcrtrain$PAC)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0124  0.0000  0.9258
# f(i, g) is the distance from case i to class g:
vcrtrain$fig[1:5, ] # for the first 5 objects:
##            [,1]      [,2]      [,3]
## [1,] 0.20737485 0.9395651 0.9876833
## [2,] 0.85045110 0.9395364 0.9876831
## [3,] 0.08450745 0.9395643 0.9876833
## [4,] 0.86189533 0.9395163 0.9876827
## [5,] 0.23301865 0.9395652 0.9876833
# The farness of an object i is the f(i, g) to its own class: 
vcrtrain$farness[1:5]
## [1] 0.20737485 0.85045110 0.08450745 0.86189533 0.23301865
#
summary(vcrtrain$farness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0368  0.2432  0.2599  0.4864  0.8557  0.9680
# The "overall farness" of an object is defined as the 
# lowest f(i, g) it has to any class g (including its own):
summary(vcrtrain$ofarness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0368  0.2432  0.2599  0.4848  0.8557  0.9389
confmat.vcr(vcrtrain)
## 
## Confusion matrix:
##             predicted
## given        setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          0        50
## 
## The accuracy is 99.33%.
The vcrtrain object can now be used to create the visualizations.
cols <- c("red", "darkgreen", "blue")
# stacked mosaic plot:
stplot <- stackedplot(vcrtrain, classCols = cols,
                     main = "Stacked plot of nnet on iris data") 
stplot
plot of chunk unnamed-chunk-8
# silhouette plot:
silplot(vcrtrain, classCols = cols, main =
          "Silhouette plot of nnet on iris data") 
##  classNumber classLabel classSize classAveSi
##            1     setosa        50       1.00
##            2 versicolor        50       0.96
##            3  virginica        50       0.96
plot of chunk unnamed-chunk-8
# class maps:
classmap(vcrtrain, "setosa", classCols = cols)
plot of chunk unnamed-chunk-8
# Very tight class (low PAC, no high farness).
classmap(vcrtrain, "versicolor", classCols = cols)
plot of chunk unnamed-chunk-8
# Not so tight, one point is predicted as virginica.
classmap(vcrtrain, "virginica", classCols = cols)
plot of chunk unnamed-chunk-8
# Also tight.
To illustrate the use of new data we create a “fake” dataset which is a subset of the training data, where not all classes occur, and ynew has NA’s.
test_x <- train_x[1:100, ]
ynew <- y[1:100]
ynew[c(1:10, 51:60)] <- NA
pairs(train_x, col = as.numeric(y) + 1, pch = 19) # 3 colors
plot of chunk unnamed-chunk-9
pairs(test_x, col = as.numeric(ynew) + 1, pch = 19) # only red and green
plot of chunk unnamed-chunk-9
We now calculate the predictions of this test data, and reconstruct the representation of the test data in the hidden and final layer.
predprobs <- predict(nn.iris, test_x, type = "raw")
range(predprobs - trainprobs[1:100, ]) # perfect match
## [1] 0 0
# Reconstruct this prediction:
Hnew <- t(t(test_x %*% betas1) + bias1) # hidden layer
Xnew <- t(t(sigmoid(Hnew) %*% betas2) + bias2) # layer before softmax
probsnew <- t(apply(Xnew, 1, softmax))
range(probsnew - predprobs) # ~0, so we have a match
## [1] -1.110223e-15  9.714451e-16
Now prepare for visualization:
vcrtest <- vcr.neural.newdata(Xnew, ynew, probsnew, vcrtrain)
plot(vcrtest$predint, vcrtrain$predint[1:100]); abline(0, 1) # identical, OK
plot of chunk unnamed-chunk-11
plot(vcrtest$altint, vcrtrain$altint[1:100]); abline(0, 1) # identical when not NA, OK
plot of chunk unnamed-chunk-11
plot(vcrtest$PAC, vcrtrain$PAC[1:100]); abline(0, 1) # OK
plot of chunk unnamed-chunk-11
vcrtest$farness # length 100, with NA's where ynew is NA
##   [1]         NA         NA         NA         NA         NA         NA
##   [7]         NA         NA         NA         NA 0.22279874 0.72023725
##  [13] 0.96321287 0.53001183 0.25870493 0.25729970 0.25553649 0.18077135
##  [19] 0.26115206 0.20655762 0.89671342 0.20830291 0.25856230 0.95779043
##  [25] 0.96190111 0.96171838 0.86977892 0.05423371 0.13796909 0.89553878
##  [31] 0.93788017 0.61105777 0.25501513 0.25848322 0.88010290 0.22171244
##  [37] 0.24876810 0.16330724 0.52374495 0.22309847 0.23270883 0.95775904
##  [43] 0.09137918 0.92827252 0.93314093 0.78883032 0.16967184 0.41060696
##  [49] 0.22014483 0.03679866         NA         NA         NA         NA
##  [55]         NA         NA         NA         NA         NA         NA
##  [61] 0.25978151 0.88984549 0.25993220 0.24565279 0.88991058 0.82758963
##  [67] 0.88294537 0.25991897 0.25718649 0.25830565 0.88104884 0.82941910
##  [73] 0.92115278 0.25992966 0.09838746 0.73693813 0.25961446 0.83092147
##  [79] 0.85324862 0.17433168 0.25708124 0.25956760 0.27683565 0.92785948
##  [85] 0.88458022 0.88992431 0.71927806 0.25990656 0.87408023 0.30086342
##  [91] 0.25986061 0.05544259 0.22655780 0.10915362 0.06261839 0.08496130
##  [97] 0.67823299 0.06594331 0.88983510 0.73933757
plot(vcrtest$farness, vcrtrain$farness[1:100]); abline(0, 1) # identical where not NA
plot of chunk unnamed-chunk-11
plot(vcrtest$fig, vcrtrain$fig[1:100, ]); abline(0, 1) # same, OK
plot of chunk unnamed-chunk-11
vcrtest$ofarness # 100, withOUT NA's: ofarness even exists for cases with missing ynew, for which farness cannot exist.
##   [1] 0.20737485 0.85045110 0.08450745 0.86189533 0.23301865 0.52757739
##   [7] 0.29437232 0.21839328 0.88896177 0.93850072 0.22279874 0.72023725
##  [13] 0.93891234 0.53001183 0.25870493 0.25729970 0.25553649 0.18077135
##  [19] 0.26115206 0.20655762 0.89671342 0.20830291 0.25856230 0.93859265
##  [25] 0.93834530 0.93888981 0.86977892 0.05423371 0.13796909 0.89553878
##  [31] 0.93788017 0.61105777 0.25501513 0.25848322 0.88010290 0.22171244
##  [37] 0.24876810 0.16330724 0.52374495 0.22309847 0.23270883 0.93857185
##  [43] 0.09137918 0.92827252 0.93314093 0.78883032 0.16967184 0.41060696
##  [49] 0.22014483 0.03679866 0.10278070 0.88802994 0.17433818 0.20547488
##  [55] 0.22641177 0.25516339 0.88960053 0.28551006 0.25902041 0.88957763
##  [61] 0.25978151 0.88984549 0.25993220 0.24565279 0.88991058 0.82758963
##  [67] 0.88294537 0.25991897 0.25718649 0.25830565 0.88104884 0.82941910
##  [73] 0.92115278 0.25992966 0.09838746 0.73693813 0.25961446 0.83092147
##  [79] 0.85324862 0.17433168 0.25708124 0.25956760 0.27683565 0.85571604
##  [85] 0.88458022 0.88992431 0.71927806 0.25990656 0.87408023 0.30086342
##  [91] 0.25986061 0.05544259 0.22655780 0.10915362 0.06261839 0.08496130
##  [97] 0.67823299 0.06594331 0.88983510 0.73933757
plot(vcrtest$ofarness, vcrtrain$ofarness[1:100]); abline(0, 1) # same, OK
plot of chunk unnamed-chunk-11
confmat.vcr(vcrtest) # as expected:
## 
## Confusion matrix:
##             predicted
## given        setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         39         1
## 
## The accuracy is 98.75%.
Now we can visualize the classification on the test data, and compare the resulting plots with their counterparts on the training data.
stplot # to compare with:
plot of chunk unnamed-chunk-12
stackedplot(vcrtest, classCols = cols, separSize = 1.5,
            minSize = 1, main = "Stacked plot of nnet on subset of iris data")
## 
## Not all classes occur in these data. The classes to plot are:
## [1] 1 2
plot of chunk unnamed-chunk-12
silplot(vcrtest, classCols = cols, main = "Silhouette plot of nnet on subset of iris data")
##  classNumber classLabel classSize classAveSi
##            1     setosa        40       1.00
##            2 versicolor        40       0.95
plot of chunk unnamed-chunk-12
classmap(vcrtrain, 1, classCols = cols)
plot of chunk unnamed-chunk-12
classmap(vcrtest, 1, classCols = cols) # same, but fewer points
plot of chunk unnamed-chunk-12
#
classmap(vcrtrain, 2, classCols = cols)
plot of chunk unnamed-chunk-12
classmap(vcrtest, 2, classCols = cols) # same, but fewer points
plot of chunk unnamed-chunk-12
#
# classmap(vcrtrain, 3, classCols = cols)
# classmap(vcrtest, 3, classCols = cols)
# # Class number 3 with label virginica has no objects to visualize.
As a second example, we consider the floral buds data.
data("data_floralbuds")
We start by training a neural network with one hidden layer consisting of 2 neurons.
set.seed(123)
nn.buds <- nnet(y ~ ., data = data_floralbuds, size = 2)
## # weights:  26
## initial  value 1041.109691 
## iter  10 value 536.818801
## iter  20 value 459.928280
## iter  30 value 277.176449
## iter  40 value 245.861545
## iter  50 value 142.979121
## iter  60 value 90.662364
## iter  70 value 79.756728
## iter  80 value 75.207876
## iter  90 value 74.102871
## iter 100 value 73.179742
## final  value 73.179742 
## stopped after 100 iterations
names(nn.buds)
##  [1] "n"             "nunits"        "nconn"         "conn"         
##  [5] "nsunits"       "decay"         "entropy"       "softmax"      
##  [9] "censored"      "value"         "wts"           "convergence"  
## [13] "fitted.values" "residuals"     "lev"           "call"         
## [17] "terms"         "coefnames"     "xlevels"
head(nn.buds$fitted.values) # matrix of posterior probabilities of each class
##         branch       bud       scales      support
## 1 1.058835e-07 0.9999986 7.774815e-26 1.276449e-06
## 2 4.339590e-07 0.9999954 6.576856e-24 4.199437e-06
## 3 6.458451e-07 0.9999923 2.847148e-23 7.039262e-06
## 4 7.031223e-07 0.9999923 3.364849e-23 6.949520e-06
## 5 3.429871e-07 0.9999959 3.466130e-24 3.744776e-06
## 6 1.847493e-07 0.9999976 5.016766e-25 2.246819e-06
nn.buds$wts # the "weights" (coefficients, some negative)
##  [1]  -0.2624278 -35.7878683 -21.6885293  51.5692666  -0.8608722  20.4026110
##  [7] -26.4567237   2.5410791  -8.3654734  18.6048565  11.7291332 -13.5435723
## [13]  36.6413025 -27.1683942  22.3796811   6.0538747 -22.1938387 -16.6302472
## [19]  22.5568173  16.5677260  33.2761417 -33.1952834 -35.9826056 -38.9852649
## [25]   5.3913482  42.3781984
summary(nn.buds) # A hidden layer with 2 neurons:
## a 6-2-4 network with 26 weights
## options were - softmax modelling 
##  b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 
##  -0.26 -35.79 -21.69  51.57  -0.86  20.40 -26.46 
##  b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 
##   2.54  -8.37  18.60  11.73 -13.54  36.64 -27.17 
##  b->o1 h1->o1 h2->o1 
##  22.38   6.05 -22.19 
##  b->o2 h1->o2 h2->o2 
## -16.63  22.56  16.57 
##  b->o3 h1->o3 h2->o3 
##  33.28 -33.20 -35.98 
##  b->o4 h1->o4 h2->o4 
## -38.99   5.39  42.38
In the output above, the coefficients are listed with their arrows: i1, …, i6 are the 6 input variables h1, h2 are the 2 hidden nodes o1, .., o4 are the output nodes, one for each class (before softmax) b->h1 and b->h2 are both intercepts to the hidden layer b_o1, …, b->o4 are the intercepts to the output layer
We first extract these weights from the output of nnet():
train_x <- as.matrix(data_floralbuds[, -7])
# extract weights for each layer:
bias1  <- nn.buds$wts[c(1, 8)]
betas1 <- cbind(nn.buds$wts[2:7], nn.buds$wts[9:14])
bias2  <- nn.buds$wts[c(15, 18, 21, 24)]
betas2 <- cbind(nn.buds$wts[16:17], nn.buds$wts[19:20], 
                nn.buds$wts[22:23], nn.buds$wts[25:26])
We now use these weights and the sigmoid activation function to reconstruct the data in the hidden and final layer (before applying the softmax function). We verify that, after applying softmax, we obtain the same posterior probabilities as those in the output of nnet().
H <- t(t(train_x %*% betas1) + bias1) # hidden layer
X <- t(t(sigmoid(H) %*% betas2) + bias2) # layer before softmax
outprobs <- t(apply(X, 1, softmax))
trainprobs <- nn.buds$fitted.values # posterior probabilities
range(trainprobs - outprobs) # ~0
## [1] -2.126469e-10  2.126470e-10
# OK, we have reconstructed the posterior probabilities
pairs(X, col = data_floralbuds$y)
plot of chunk unnamed-chunk-16
Now prepare for visualization:
y <- data_floralbuds$y
vcrtrain <- vcr.neural.train(X, y, trainprobs)
## 
## There is at least one class with (near-) singular covariance matrix, 
## so farness will be computed from PCA.
cols <- c("saddlebrown", "orange", "olivedrab4", "royalblue3")
stackedplot(vcrtrain, classCols = cols, main =
              "Stacked plot of nnet on floral buds data")
plot of chunk unnamed-chunk-18
# Silhouette plot:
silplot(vcrtrain, classCols = cols, 
        main = "Silhouette plot of nnet on floral buds data")
##  classNumber classLabel classSize classAveSi
##            1     branch        49       0.45
##            2        bud       363       0.96
##            3     scales        94       0.86
##            4    support        44       0.57
plot of chunk unnamed-chunk-18
# Quasi residual plot:
PAC <- vcrtrain$PAC
feat <- rowSums(train_x); xlab <- "rowSums(X)"
# pdf("Floralbuds_quasi_residual_plot.pdf", width = 5, height = 4.8)
qresplot(PAC, feat, xlab = xlab, plotErrorBars = TRUE, fac = 2, main = "Floral buds: quasi residual plot")
plot of chunk unnamed-chunk-18
# images with higher sum are easier to classify
# dev.off()
cor.test(feat, PAC, method = "spearman") 
## 
## 	Spearman's rank correlation rho
## 
## data:  feat and PAC
## S = 34339314, p-value = 1.68e-08
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2383866
# rho = -0.238, p-value < 2e-8 
# ==> decreasing trend is significant
# Class maps:
classmap(vcrtrain, "branch", classCols = cols)
plot of chunk unnamed-chunk-18
classmap(vcrtrain, "bud", classCols = cols)
plot of chunk unnamed-chunk-18
classmap(vcrtrain, "scales", classCols = cols)
plot of chunk unnamed-chunk-18
classmap(vcrtrain, "support", classCols = cols)
plot of chunk unnamed-chunk-18