First of all, thank you for using healthyR.ai. If you
encounter issues or want to make a feature request, please visit https://github.com/spsanderson/healthyR.ai/issues
library(healthyR.ai)In this should example we will showcase the
pca_your_recipe() function. This function takes only a few
arguments. The arguments are currently .data which is the
full data set that gets passed internally to the
recipes::bake() function, .recipe_object which
is a recipe you have already made and want to pass to the function in
order to perform the pca, and finally .threshold which is
the fraction of the variance that should be captured by the
components.
To start this walk through we will first load in a few libraries.
suppressPackageStartupMessages(library(timetk))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(healthyR.data))
suppressPackageStartupMessages(library(rsample))
suppressPackageStartupMessages(library(recipes))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(plotly))Now that we have out libraries we can go ahead and get our data set ready.
data_tbl <- healthyR_data %>%
    select(visit_end_date_time) %>%
    summarise_by_time(
        .date_var = visit_end_date_time,
        .by       = "month",
        value     = n()
    ) %>%
    set_names("date_col","value") %>%
    filter_by_time(
        .date_var = date_col,
        .start_date = "2013",
        .end_date = "2020"
    )
head(data_tbl)
#> # A tibble: 6 x 2
#>   date_col            value
#>   <dttm>              <int>
#> 1 2013-01-01 00:00:00  2082
#> 2 2013-02-01 00:00:00  1719
#> 3 2013-03-01 00:00:00  1796
#> 4 2013-04-01 00:00:00  1865
#> 5 2013-05-01 00:00:00  2028
#> 6 2013-06-01 00:00:00  1813The data set is simple and by itself would not be at all useful for a
pca analysis since there is only one predictor, being time. In order to
facilitate the use of the function and this example, we will create a
splits object and a recipe object.
splits <- initial_split(data = data_tbl, prop = 0.8)
splits
#> <Analysis/Assess/Total>
#> <76/19/95>
head(training(splits))
#> # A tibble: 6 x 2
#>   date_col            value
#>   <dttm>              <int>
#> 1 2013-11-01 00:00:00  1669
#> 2 2018-10-01 00:00:00  1645
#> 3 2020-04-01 00:00:00   648
#> 4 2017-06-01 00:00:00  1661
#> 5 2020-02-01 00:00:00  1363
#> 6 2014-04-01 00:00:00  1805rec_obj <- recipe(value ~ ., training(splits)) %>%
    step_timeseries_signature(date_col) %>%
    step_rm(matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)"))
rec_obj
#> Recipe
#> 
#> Inputs:
#> 
#>       role #variables
#>    outcome          1
#>  predictor          1
#> 
#> Operations:
#> 
#> Timeseries signature features from date_col
#> Variables removed matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)")
get_juiced_data(rec_obj) %>% glimpse()
#> Rows: 76
#> Columns: 20
#> $ date_col           <dttm> 2013-11-01, 2018-10-01, 2020-04-01, 2017-06-01, 20~
#> $ value              <int> 1669, 1645, 648, 1661, 1363, 1805, 1756, 1443, 1616~
#> $ date_col_index.num <dbl> 1383264000, 1538352000, 1585699200, 1496275200, 158~
#> $ date_col_year      <int> 2013, 2018, 2020, 2017, 2020, 2014, 2018, 2019, 201~
#> $ date_col_half      <int> 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, ~
#> $ date_col_quarter   <int> 4, 4, 2, 2, 1, 2, 1, 2, 2, 2, 4, 1, 1, 3, 3, 4, 1, ~
#> $ date_col_month     <int> 11, 10, 4, 6, 2, 4, 1, 4, 4, 6, 12, 1, 3, 9, 9, 10,~
#> $ date_col_month.lbl <ord> November, October, April, June, February, April, Ja~
#> $ date_col_day       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
#> $ date_col_wday      <int> 6, 2, 4, 5, 7, 3, 2, 2, 1, 7, 5, 6, 4, 1, 3, 5, 3, ~
#> $ date_col_wday.lbl  <ord> Friday, Monday, Wednesday, Thursday, Saturday, Tues~
#> $ date_col_mday      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
#> $ date_col_qday      <int> 32, 1, 1, 62, 32, 1, 1, 1, 1, 62, 62, 1, 60, 63, 63~
#> $ date_col_yday      <int> 305, 274, 92, 152, 32, 91, 1, 91, 91, 152, 336, 1, ~
#> $ date_col_mweek     <int> 5, 6, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, ~
#> $ date_col_week      <int> 44, 40, 14, 22, 5, 13, 1, 13, 13, 22, 48, 1, 9, 35,~
#> $ date_col_week2     <int> 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, ~
#> $ date_col_week3     <int> 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 0, 1, 0, 2, 2, 1, 1, ~
#> $ date_col_week4     <int> 0, 0, 2, 2, 1, 1, 1, 1, 1, 2, 0, 1, 1, 3, 3, 0, 1, ~
#> $ date_col_mday7     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~Now that we have out initial recipe we can use the
pca_your_recipe() function.
pca_list <- pca_your_recipe(
  .recipe_object = rec_obj,
  .data          = data_tbl,
  .threshold     = 0.8,
  .top_n         = 5
)The function returns a list object and does so
insvisible so you must assign the output to a variable, you
can then access the items of the list in the usual manner.
The following items are included in the output of the function:
Lets start going down the list of items.
This is the portion you will want to output to a variable as this is the recipe object itself that you will use further down the line of your work.
pca_rec_obj <- pca_list$pca_transform
pca_rec_obj
#> Recipe
#> 
#> Inputs:
#> 
#>       role #variables
#>    outcome          1
#>  predictor          1
#> 
#> Operations:
#> 
#> Timeseries signature features from date_col
#> Variables removed matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)")
#> Centering for recipes::all_numeric()
#> Scaling for recipes::all_numeric()
#> Sparse, unbalanced variable filter on recipes::all_numeric()
#> PCA extraction with recipes::all_numeric_predictors()pca_list$variable_loadings
#> # A tibble: 169 x 4
#>    terms                 value component id       
#>    <chr>                 <dbl> <chr>     <chr>    
#>  1 date_col_index.num  0.0608  PC1       pca_gLUIr
#>  2 date_col_year       0.00759 PC1       pca_gLUIr
#>  3 date_col_half       0.390   PC1       pca_gLUIr
#>  4 date_col_quarter    0.431   PC1       pca_gLUIr
#>  5 date_col_month      0.435   PC1       pca_gLUIr
#>  6 date_col_wday      -0.0214  PC1       pca_gLUIr
#>  7 date_col_qday       0.0763  PC1       pca_gLUIr
#>  8 date_col_yday       0.435   PC1       pca_gLUIr
#>  9 date_col_mweek      0.00515 PC1       pca_gLUIr
#> 10 date_col_week       0.435   PC1       pca_gLUIr
#> # ... with 159 more rowspca_list$variable_variance
#> # A tibble: 52 x 4
#>    terms       value component id       
#>    <chr>       <dbl>     <int> <chr>    
#>  1 variance 5.22             1 pca_gLUIr
#>  2 variance 2.03             2 pca_gLUIr
#>  3 variance 1.61             3 pca_gLUIr
#>  4 variance 1.32             4 pca_gLUIr
#>  5 variance 1.08             5 pca_gLUIr
#>  6 variance 0.640            6 pca_gLUIr
#>  7 variance 0.546            7 pca_gLUIr
#>  8 variance 0.481            8 pca_gLUIr
#>  9 variance 0.0602           9 pca_gLUIr
#> 10 variance 0.000233        10 pca_gLUIr
#> # ... with 42 more rowspca_list$pca_estimates
#> Recipe
#> 
#> Inputs:
#> 
#>       role #variables
#>    outcome          1
#>  predictor          1
#> 
#> Training data contained 76 data points and no missing data.
#> 
#> Operations:
#> 
#> Timeseries signature features from date_col [trained]
#> Variables removed date_col_year.iso, date_col_month.xts, date_col_hour, d... [trained]
#> Centering for value, date_col_index.num, date_col_year, date_... [trained]
#> Scaling for value, date_col_index.num, date_col_year, date_... [trained]
#> Sparse, unbalanced variable filter removed date_col_day, date_col_mday, date_col_m... [trained]
#> PCA extraction with date_col_index.num, date_col_year, date_col_half... [trained]pca_list$pca_juiced_estimates %>% glimpse()
#> Rows: 76
#> Columns: 9
#> $ date_col           <dttm> 2013-11-01, 2018-10-01, 2020-04-01, 2017-06-01, 20~
#> $ value              <dbl> 0.44890088, 0.36356527, -3.18141831, 0.42045568, -0~
#> $ date_col_month.lbl <ord> November, October, April, June, February, April, Ja~
#> $ date_col_wday.lbl  <ord> Friday, Monday, Wednesday, Thursday, Saturday, Tues~
#> $ PC1                <dbl> 3.0048659, 2.6711853, -1.1208810, -0.3904603, -2.64~
#> $ PC2                <dbl> 2.14445746, -1.48480099, -2.07361735, -0.02091156, ~
#> $ PC3                <dbl> -0.31104849, 0.93838537, 0.17481424, -0.93879863, -~
#> $ PC4                <dbl> 1.1220731, 1.9003632, 0.6652634, -0.9766017, -0.349~
#> $ PC5                <dbl> 0.96683140, -1.05753874, 1.20238552, -0.09816840, 2~
pca_list$pca_baked_data %>% glimpse()
#> Rows: 95
#> Columns: 9
#> $ date_col           <dttm> 2013-01-01, 2013-02-01, 2013-03-01, 2013-04-01, 20~
#> $ value              <dbl> 1.9173846, 0.6266834, 0.9004685, 1.1458084, 1.72537~
#> $ date_col_month.lbl <ord> January, February, March, April, May, June, July, A~
#> $ date_col_wday.lbl  <ord> Tuesday, Friday, Friday, Monday, Wednesday, Saturda~
#> $ PC1                <dbl> -3.3217791, -2.8359454, -2.5497083, -1.8621054, -1.~
#> $ PC2                <dbl> 1.4497125, 2.1686724, 2.2851493, 1.3753605, 2.01505~
#> $ PC3                <dbl> 1.58886820, 0.17395373, -1.36018139, 1.75929130, -0~
#> $ PC4                <dbl> 1.6798572, 0.4216129, -0.6578122, 1.6029163, 0.2607~
#> $ PC5                <dbl> -1.03154681, 1.08976349, -0.63617131, -1.38733867, ~pca_list$pca_rotation_df %>% glimpse()
#> Rows: 13
#> Columns: 13
#> $ PC1  <dbl> 0.060835406, 0.007590762, 0.390410819, 0.431227019, 0.434603098, ~
#> $ PC2  <dbl> -0.678466229, -0.686078247, 0.038529676, 0.002866248, 0.030013375~
#> $ PC3  <dbl> -0.08274604, -0.08192223, 0.20613456, 0.03688136, -0.01194547, -0~
#> $ PC4  <dbl> -0.12993044, -0.12143624, -0.05892464, 0.08698725, -0.07564986, -~
#> $ PC5  <dbl> 0.10120705, 0.10508701, 0.15173407, 0.03570152, -0.02847367, 0.67~
#> $ PC6  <dbl> 0.0027142403, 0.0030052246, -0.1646211815, -0.0140590550, -0.0005~
#> $ PC7  <dbl> -4.758711e-05, -6.241247e-04, -2.168829e-01, -1.371495e-01, 3.929~
#> $ PC8  <dbl> -0.0002136496, 0.0067651895, -0.2099447717, -0.0564867247, -0.054~
#> $ PC9  <dbl> -0.003970374, 0.023498485, 0.811686968, -0.277245187, -0.22187808~
#> $ PC10 <dbl> -0.0125884035, 0.0108413786, -0.0065685540, -0.2957476529, -0.374~
#> $ PC11 <dbl> -2.597840e-02, 2.646478e-02, -1.661370e-03, -1.259419e-01, 7.0202~
#> $ PC12 <dbl> -0.0024575345, 0.0029867166, 0.0033582604, 0.7750265564, -0.34380~
#> $ PC13 <dbl> 7.079183e-01, -7.037780e-01, -9.719988e-05, -2.785589e-02, -3.143~pca_list$pca_variance_df %>% glimpse()
#> Rows: 13
#> Columns: 6
#> $ PC              <chr> "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8"~
#> $ var_explained   <dbl> 4.011694e-01, 1.564429e-01, 1.242129e-01, 1.018863e-01~
#> $ var_pct_txt     <chr> "40.12%", "15.64%", "12.42%", "10.19%", "8.34%", "4.92~
#> $ cum_var_pct     <dbl> 0.4011694, 0.5576124, 0.6818252, 0.7837116, 0.8671034,~
#> $ cum_var_pct_txt <chr> "40.12%", "55.76%", "68.18%", "78.37%", "86.71%", "91.~
#> $ ou_threshold    <fct> Under, Under, Under, Under, Over, Over, Over, Over, Ov~pca_list$pca_variance_scree_pltpca_list$pca_loadings_plt
pca_list$pca_top_n_loadings_plt