Load packages and data

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6     v purrr   0.3.4
## v tibble  3.1.7     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.1
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 1.0.0 --
## v broom        1.0.1     v rsample      1.1.0
## v dials        1.0.0     v tune         1.0.0
## v infer        1.0.3     v workflows    1.1.0
## v modeldata    1.0.1     v workflowsets 1.0.0
## v parsnip      1.0.1     v yardstick    1.1.0
## v recipes      1.0.1     
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## * Search for functions across packages at https://www.tidymodels.org/find/
# library(dsbox)  # the gss16 data also ships with the dsbox package
gss16 = read.table("data/gss16.txt", header = TRUE)

Scientific research

Ex 1

gss16 = gss16 %>%
  mutate(
    advfront = na_if(advfront, "Dont know"),
    advfront = forcats::fct_recode(
      advfront,
      agree = "Strongly agree",
      agree = "Agree",
      "not agree" = "Disagree",
      "not agree" = "Strongly disagree"
    )
  )
gss16 %>% count(advfront)
##    advfront    n
## 1     agree 1159
## 2 not agree  199
## 3      <NA> 1509

So, 1159 respondents agree that scientific research advancing the frontiers of knowledge should be supported, 199 do not agree, and 1509 are missing (they either answered "Dont know", which we recoded to NA, or did not answer the question at all).
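
Since the missing group is so large, it can be clearer to report proportions among respondents who actually answered; a quick sketch:

gss16 %>%
  filter(!is.na(advfront)) %>%   # keep respondents with a recorded answer
  count(advfront) %>%
  mutate(prop = n / sum(n))      # share agreeing vs. not agreeing among answerers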

Ex 2

gss16 = gss16 %>%
  mutate(
    polviews = str_extract(polviews, "[Ll]iberal|[Cc]onservative|Moderate"),
    polviews = forcats::fct_recode(
      polviews,
      Conservative = "conservative",
      Liberal = "liberal"
    ),
    polviews = forcats::fct_relevel(polviews, "Conservative", "Moderate", "Liberal")
  )
gss16 %>% count(polviews)
##       polviews    n
## 1 Conservative  928
## 2     Moderate 1032
## 3      Liberal  796
## 4         <NA>  111

Now we have collapsed the political views variable down to three categories, releveled to run Conservative, Moderate, Liberal; 111 responses remain missing.
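
With both variables recoded, a quick, purely exploratory cross-tabulation shows how agreement varies across the three political views:

gss16 %>%
  drop_na(advfront, polviews) %>%   # keep rows where both are recorded
  count(polviews, advfront) %>%
  group_by(polviews) %>%
  mutate(prop = n / sum(n)) %>%     # share agreeing within each political view
  ungroup()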

Ex 3

gss16_advfront = gss16 %>%
  select(advfront, educ, polviews, wrkstat) %>%
  drop_na()
dim(gss16_advfront)
## [1] 1305    4

The dataset gss16_advfront, containing the 1305 complete cases on these four variables, will be used for the rest of the analysis.
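
For reference, the number of respondents dropped by drop_na() can be checked against the original data:

nrow(gss16) - nrow(gss16_advfront)   # rows removed for missingness in these four variables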

Ex 4

set.seed(2341)
gss16_split = gss16_advfront %>% 
  initial_split(strata = advfront, prop=3/4)
gss16_train = training(gss16_split)
gss16_test  = testing(gss16_split)
dim(gss16_train)
## [1] 978   4
dim(gss16_test)
## [1] 327   4

The training and testing sets contain roughly 75% and 25% of the rows, respectively, and because we stratified on advfront the outcome distribution should be similar in both.
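
A quick check that the stratification worked as intended:

gss16_train %>% count(advfront) %>% mutate(prop = n / sum(n))
gss16_test  %>% count(advfront) %>% mutate(prop = n / sum(n))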

Ex 5

gss16_rec = recipe(advfront ~ polviews + wrkstat + educ, data = gss16_train) %>%
  step_other(wrkstat, threshold = 0.10, other = "Other") %>%
  step_dummy(all_nominal(), -all_outcomes())

This recipe defines two preprocessing steps: step_other() pools infrequent wrkstat levels (those covering less than 10% of cases) into an "Other" category, and step_dummy() turns the nominal predictors into indicator variables. Note that the recipe is defined using only the training data. This is the only way to keep the testing data out of any adjustments to the model that we might make before settling on the final model.
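
To preview what the preprocessed predictors will look like, the recipe can be prepped and baked on the training data (a sanity check only; fit_resamples() below applies the recipe automatically within each fold):

gss16_rec %>%
  prep(training = gss16_train) %>%   # estimate the step parameters from the training data
  bake(new_data = NULL) %>%          # apply the steps to the training data
  glimpse()                          # inspect the pooled and dummy columns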

Ex 6

gss16_spec = logistic_reg() %>%
  set_engine("glm")

This specifies a logistic regression model fit with the glm engine (i.e., stats::glm() under the hood).
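
For logistic_reg() the classification mode is the default, so an equivalent, more explicit version of the same specification would be:

logistic_reg() %>%
  set_mode("classification") %>%   # already the default mode for logistic_reg()
  set_engine("glm")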

Ex 7

gss16_wflow_1 = workflow() %>%
  add_model(gss16_spec) %>%
  add_recipe(gss16_rec)

The workflow bundles the preprocessing recipe and the model specification into a single object; the data itself is supplied later, when the workflow is fit or resampled.
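
As a quick check that the pieces fit together, the workflow can be fit once on the full training set and the coefficients inspected (this fit is for inspection only; the resampled fits below are what we evaluate):

gss16_fit_train = gss16_wflow_1 %>%
  fit(data = gss16_train)       # estimate the recipe and model on the training data

gss16_fit_train %>%
  extract_fit_parsnip() %>%     # pull out the underlying glm fit
  tidy()                        # coefficient table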

Ex 8

set.seed(1222)
gss16_folds = vfold_cv(gss16_train, v = 5)

gss16_fit_rs_1 = gss16_wflow_1 %>%
  fit_resamples(gss16_folds)

collect_metrics(gss16_fit_rs_1, summarize = FALSE)
## # A tibble: 10 x 5
##    id    .metric  .estimator .estimate .config             
##    <chr> <chr>    <chr>          <dbl> <chr>               
##  1 Fold1 accuracy binary         0.852 Preprocessor1_Model1
##  2 Fold1 roc_auc  binary         0.549 Preprocessor1_Model1
##  3 Fold2 accuracy binary         0.852 Preprocessor1_Model1
##  4 Fold2 roc_auc  binary         0.556 Preprocessor1_Model1
##  5 Fold3 accuracy binary         0.821 Preprocessor1_Model1
##  6 Fold3 roc_auc  binary         0.477 Preprocessor1_Model1
##  7 Fold4 accuracy binary         0.872 Preprocessor1_Model1
##  8 Fold4 roc_auc  binary         0.640 Preprocessor1_Model1
##  9 Fold5 accuracy binary         0.877 Preprocessor1_Model1
## 10 Fold5 roc_auc  binary         0.643 Preprocessor1_Model1
collect_metrics(gss16_fit_rs_1)
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.855     5 0.00977 Preprocessor1_Model1
## 2 roc_auc  binary     0.573     5 0.0312  Preprocessor1_Model1

The average accuracy across the five cross-validation folds is 0.855, and the average area under the ROC curve is 0.573. The accuracy is fairly consistent across folds, but it is close to what we would likely get by always predicting the majority class ("agree", roughly 85% of respondents), and an AUC only modestly above 0.5 suggests the model has limited ability to discriminate between the two classes. The AUC is also noticeably more variable across folds (0.477 to 0.643).
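
If we settle on this model, the natural last step is a single evaluation on the held-out test set; a sketch using tune's last_fit(), which refits on the training data and scores the testing data in one call:

gss16_final = gss16_wflow_1 %>%
  last_fit(gss16_split)         # refit on the training set, evaluate once on the test set

collect_metrics(gss16_final)    # test-set accuracy and roc_auc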

Rubric

34 points