Examining PISA 2018 Data Set with Statistical Learning Approach

R Statistics Supervised Machine Learning

In this post, I will develop two statistical learning models, namely Linear Regression and Polynomial Regression, and apply them to Thai student data from the Programme for International Student Assessment (PISA) 2018 data set to examine the impact of classroom competition and cooperation on students’ academic performance.

(14 min read)

Tarid Wongvorachan (University of Alberta), https://www.ualberta.ca
2022-02-27

Introduction

Image from https://marketbusinessnews.com/financial-glossary/statistical-learning. No copyright infringement is intended

Setting up R environment

Show code
setwd("D:/Program/Private_project/DistillSite/_posts/2022-02-27-statlearning")

library(foreign) #To read SPSS data
library(psych) #For descriptive stat
library(tidyverse) #for data manipulation
library(DataExplorer) #for exploratory data analysis plots
library(ggcorrplot) #for correlation matrix
library(tidymodels) #for model building
library(kableExtra) #for kable table
library(visdat) #for overall missingness visualization
Show code
#Import the data set
PISA_TH <- read_csv("PISA2018TH.csv", col_names = TRUE)

PISA_Subsetted <-  PISA_TH %>% 
  select(FEMALE = ST004D01T, READING_SCORE = PV1READ, CLASSCOMP = PERCOMP,
         CLASSCOOP = PERCOOP, ATTCOMP = COMPETE)

#ST004D01T codes 1 = female and 2 = male; recode so FEMALE = 1 for female and 0 for male
PISA_Subsetted$FEMALE <- recode_factor(PISA_Subsetted$FEMALE, 
                                       "1" = "1", "2" = "0")
kbl(head(PISA_Subsetted)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = TRUE, position = "left")
FEMALE READING_SCORE CLASSCOMP CLASSCOOP ATTCOMP
1 480.756 1.5630 1.6762 0.4352
1 502.610 NA NA NA
1 476.744 0.0866 -0.5358 0.4352
1 489.858 0.6912 0.6012 -0.2661
0 536.178 1.2903 1.6762 0.4352
0 566.755 0.2020 0.6012 0.3709
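
As a quick sanity check on the recoding, dplyr's count() tallies the resulting factor levels (a minimal sketch):

#FEMALE should now contain only the levels "1" (female) and "0" (male)
PISA_Subsetted %>% count(FEMALE)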

Missing Data Handling

Show code
visdat::vis_miss(PISA_Subsetted, sort_miss = TRUE, cluster = TRUE)
Show code
naniar::gg_miss_upset(PISA_Subsetted, nsets = 5, nintersects = 10)

Show code
PISA_Subsetted <- na.omit(PISA_Subsetted)
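
Listwise deletion with na.omit() is simple but drops every case with any missing value. For reference, per-variable missingness can be tabulated before that call; a minimal sketch, assuming the naniar package already used above:

#run before na.omit() to see the count and percentage of missing values per variable
naniar::miss_var_summary(PISA_Subsetted)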

Exploratory Data Analysis and Assumption Check

Show code
plot_intro(PISA_Subsetted, title = "Types of Variables")+
  theme(plot.title = element_text(hjust = 0.5))

Show code
plot_str(PISA_Subsetted)
Show code
describe(PISA_Subsetted)
              vars    n   mean    sd median trimmed   mad    min    max  range  skew kurtosis   se
FEMALE*          1 8158   1.46  0.50   1.00    1.45  0.00   1.00   2.00   1.00  0.18    -1.97 0.01
READING_SCORE    2 8158 411.26 88.42 402.43  408.00 91.53 146.33 720.09 573.76  0.32    -0.37 0.98
CLASSCOMP        3 8158   0.20  0.87   0.20    0.19  0.73  -1.99   2.04   4.03  0.02    -0.04 0.01
CLASSCOOP        4 8158   0.20  0.92   0.60    0.22  1.07  -2.14   1.68   3.82 -0.39    -0.47 0.01
ATTCOMP          5 8158   0.03  0.79   0.20    0.00  0.65  -2.35   2.01   4.35  0.13     1.11 0.01
Show code
plot_histogram(PISA_Subsetted)
Show code
set.seed(123)
plot_qq(PISA_Subsetted, sampled_rows = 1000L)

Show code
plot_correlation(PISA_Subsetted, type = "continuous", title = "Correlation Matrix") +
  theme(plot.title = element_text(hjust = 0.5))

Data Preprocessing

Show code
set.seed(456)

# Put 3/4 of the data into the training set 
data_split <- initial_split(PISA_Subsetted, prop = 3/4)

# Create data frames for the two sets:
train_data <- training(data_split) #3/4
test_data  <- testing(data_split)  #1/4
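
A quick check that the split behaves as intended (a minimal sketch):

#proportions should be roughly 0.75 and 0.25
nrow(train_data) / nrow(PISA_Subsetted)
nrow(test_data) / nrow(PISA_Subsetted)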

Linear Regression Model

Show code
lm_mod <- 
  linear_reg() %>% 
  set_mode("regression") %>%
  set_engine("lm")

lm_mod
Linear Regression Model Specification (regression)

Computational engine: lm 
Show code
PISA_linear_recipe <- 
  recipe(READING_SCORE ~ ., data = train_data) %>%
  step_rm(FEMALE)

PISA_linear_recipe
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          4

Operations:

Variables removed FEMALE
Show code
PISA_workflow <- 
  workflow() %>% 
  add_model(lm_mod) %>% 
  add_recipe(PISA_linear_recipe)

PISA_workflow
== Workflow ==========================================================
Preprocessor: Recipe
Model: linear_reg()

-- Preprocessor ------------------------------------------------------
1 Recipe Step

* step_rm()

-- Model -------------------------------------------------------------
Linear Regression Model Specification (regression)

Computational engine: lm 

Fitting the Linear Regression Model to the Training and Testing Sets

Show code
set.seed(678)

PISA_linear_fit <- 
  PISA_workflow %>% 
  fit(data = train_data)
Show code
PISA_linear_fit %>% 
  extract_fit_parsnip() %>% 
  broom::tidy()
# A tibble: 4 x 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)   406.        1.15    355.   0       
2 CLASSCOMP       3.77      1.33      2.84 4.50e- 3
3 CLASSCOOP      19.7       1.23     16.0  1.91e-56
4 ATTCOMP         3.08      1.45      2.12 3.38e- 2
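
Beyond the coefficient table, broom::glance() pulls model-level statistics (R-squared, F-statistic, AIC) from the same fitted object; a minimal sketch:

#model-level fit statistics for the trained linear model
PISA_linear_fit %>% 
  extract_fit_parsnip() %>% 
  broom::glance()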
Show code
PISA_linear_train_pred <- 
  predict(PISA_linear_fit, train_data) %>% 
  bind_cols(train_data %>% select(READING_SCORE)) 

PISA_linear_train_pred
# A tibble: 6,118 x 2
   .pred READING_SCORE
   <dbl>         <dbl>
 1  427.          471.
 2  421.          239.
 3  451.          546.
 4  405.          248.
 5  407.          457.
 6  398.          491.
 7  389.          308.
 8  421.          320.
 9  394.          473.
10  404.          507.
# ... with 6,108 more rows
Show code
#yardstick does not ship an MSE metric, so we define a vectorized one
mse_vec <- function(truth, estimate, na_rm = TRUE, ...) {
  
  #core computation: mean of the squared residuals
  mse_impl <- function(truth, estimate) {
    mean((truth - estimate) ^ 2)
  }
  
  #metric_vec_template() handles NA removal and input validation
  metric_vec_template(
    metric_impl = mse_impl,
    truth = truth, 
    estimate = estimate,
    na_rm = na_rm,
    cls = "numeric",
    ...
  )
  
}
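
The vector version above can also be wrapped as a data-frame metric so it is callable like rmse(data, truth, estimate). This is a minimal sketch following yardstick's custom-metric vignette from the same API generation as metric_vec_template(); the generic and method names are mine:

#generic that dispatches on the class of its first argument
mse <- function(data, ...) {
  UseMethod("mse")
}

#register the optimization direction: smaller MSE is better
mse <- new_numeric_metric(mse, direction = "minimize")

#data frame method: metric_summarizer() handles tidy evaluation of column names
mse.data.frame <- function(data, truth, estimate, na_rm = TRUE, ...) {
  metric_summarizer(
    metric_nm = "mse",
    metric_fn = mse_vec,
    data = data,
    truth = !!enquo(truth),
    estimate = !!enquo(estimate),
    na_rm = na_rm,
    ...
  )
}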
Show code
mse_vec(
  truth = PISA_linear_train_pred$READING_SCORE, 
  estimate = PISA_linear_train_pred$.pred
)
[1] 7389.577
Show code
rmse(PISA_linear_train_pred, 
     truth = READING_SCORE,
     estimate = .pred)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        86.0
Show code
rsq(PISA_linear_train_pred, 
     truth = READING_SCORE,
     estimate = .pred)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard      0.0499
Show code
PISA_linear_test_pred <- 
  predict(PISA_linear_fit, test_data) %>% 
  bind_cols(test_data %>% select(READING_SCORE)) 

PISA_linear_test_pred
# A tibble: 2,040 x 2
   .pred READING_SCORE
   <dbl>         <dbl>
 1  398.          477.
 2  427.          542.
 3  415.          519.
 4  422.          597.
 5  421.          467.
 6  434.          573.
 7  397.          522.
 8  425.          436.
 9  423.          440.
10  399.          543.
# ... with 2,030 more rows
Show code
mse_vec(
  truth = PISA_linear_test_pred$READING_SCORE, 
  estimate = PISA_linear_test_pred$.pred
)
[1] 7472.221
Show code
rmse(PISA_linear_test_pred, 
     truth = READING_SCORE,
     estimate = .pred)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        86.4
Show code
rsq(PISA_linear_test_pred, 
     truth = READING_SCORE,
     estimate = .pred)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard      0.0584

Polynomial Regression Model

Image from https://towardsdatascience.com/polynomial-regression-an-alternative-for-neural-networks-c4bd30fa6cf6. No copyright infringement is intended
Show code
PISA_poly_recipe <- 
  recipe(READING_SCORE ~ CLASSCOMP + CLASSCOOP + ATTCOMP, data = train_data) %>%
  step_poly(CLASSCOMP, degree = 3) %>%
  step_poly(CLASSCOOP, degree = 3)

PISA_poly_recipe
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          3

Operations:

Orthogonal polynomials on CLASSCOMP
Orthogonal polynomials on CLASSCOOP
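
To see exactly what these steps produce, the recipe can be prepped on the training set and the processed data inspected; a minimal sketch with recipes' prep() and bake():

#each step_poly() replaces its column with degree-many orthogonal
#polynomial features, e.g., CLASSCOMP_poly_1 to CLASSCOMP_poly_3
PISA_poly_recipe %>% 
  prep(training = train_data) %>% 
  bake(new_data = NULL) %>% 
  glimpse()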
Show code
PISA_poly_wf <- workflow() %>%
  add_model(lm_mod) %>%
  add_recipe(PISA_poly_recipe)

PISA_poly_wf
== Workflow ==========================================================
Preprocessor: Recipe
Model: linear_reg()

-- Preprocessor ------------------------------------------------------
2 Recipe Steps

* step_poly()
* step_poly()

-- Model -------------------------------------------------------------
Linear Regression Model Specification (regression)

Computational engine: lm 
Show code
set.seed(999)

PISA_poly_fit <- 
  PISA_poly_wf %>%
  fit(data = train_data)

PISA_poly_fit
== Workflow [trained] ================================================
Preprocessor: Recipe
Model: linear_reg()

-- Preprocessor ------------------------------------------------------
2 Recipe Steps

* step_poly()
* step_poly()

-- Model -------------------------------------------------------------

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
     (Intercept)           ATTCOMP  CLASSCOMP_poly_1  
         411.053             2.729           271.369  
CLASSCOMP_poly_2  CLASSCOMP_poly_3  CLASSCOOP_poly_1  
         295.526            19.960          1400.633  
CLASSCOOP_poly_2  CLASSCOOP_poly_3  
         -55.306           304.765  
Show code
PISA_poly_fit %>% 
  extract_fit_parsnip() %>% 
  broom::tidy()
# A tibble: 8 x 5
  term             estimate std.error statistic  p.value
  <chr>               <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)        411.        1.10   374.    0       
2 ATTCOMP              2.73      1.45     1.88  6.00e- 2
3 CLASSCOMP_poly_1   271.       90.4      3.00  2.69e- 3
4 CLASSCOMP_poly_2   296.       88.4      3.34  8.31e- 4
5 CLASSCOMP_poly_3    20.0      87.2      0.229 8.19e- 1
6 CLASSCOOP_poly_1  1401.       88.7     15.8   4.97e-55
7 CLASSCOOP_poly_2   -55.3      88.6     -0.625 5.32e- 1
8 CLASSCOOP_poly_3   305.       87.2      3.50  4.75e- 4
Show code
PISA_poly_pred <- 
  predict(PISA_poly_fit, test_data) %>% 
  bind_cols(test_data %>% select(READING_SCORE)) 

kbl(head(PISA_poly_pred)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = TRUE, position = "left")  
.pred READING_SCORE
398.7327 476.744
426.4006 542.238
410.2957 518.744
417.5398 597.441
415.7663 466.964
431.5449 572.923
Show code
mse_vec(
  truth = PISA_poly_pred$READING_SCORE, 
  estimate = PISA_poly_pred$.pred)
[1] 7470.233
Show code
rmse(PISA_poly_pred, 
     truth = READING_SCORE,
     estimate = .pred)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        86.4
Show code
rsq(PISA_poly_pred, 
     truth = READING_SCORE,
     estimate = .pred)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard      0.0584

Model Tuning with Cross-Validation

Show code
PISA_poly_tuned_recipe <- 
  recipe(READING_SCORE ~ CLASSCOMP + CLASSCOOP + ATTCOMP, data = train_data) %>%
  step_poly(CLASSCOMP, CLASSCOOP, degree = tune())

PISA_poly_wf <- workflow() %>%
  add_model(lm_mod) %>%
  add_recipe(PISA_poly_tuned_recipe)

PISA_poly_wf
== Workflow ==========================================================
Preprocessor: Recipe
Model: linear_reg()

-- Preprocessor ------------------------------------------------------
1 Recipe Step

* step_poly()

-- Model -------------------------------------------------------------
Linear Regression Model Specification (regression)

Computational engine: lm 
Show code
PISA_folds <- vfold_cv(train_data, v = 10)

degree_grid <- grid_regular(degree(range = c(1, 10)), levels = 10)

tune_resample <- tune_grid(
  object = PISA_poly_wf, 
  resamples = PISA_folds, 
  grid = degree_grid)

autoplot(tune_resample)

Model Tuning Results with 10-Fold Cross-Validation
Show code
show_best(tune_resample, metric = "rmse")
# A tibble: 5 x 7
  degree .metric .estimator  mean     n std_err .config              
   <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     10 rmse    standard    85.4    10   0.852 Preprocessor10_Model1
2      6 rmse    standard    85.5    10   0.845 Preprocessor06_Model1
3      9 rmse    standard    85.5    10   0.860 Preprocessor09_Model1
4      8 rmse    standard    85.5    10   0.857 Preprocessor08_Model1
5      7 rmse    standard    85.5    10   0.843 Preprocessor07_Model1
Show code
select_by_one_std_err(tune_resample, degree, metric = "rsq")
# A tibble: 1 x 9
  degree .metric .estimator   mean     n std_err .config  .best .bound
   <dbl> <chr>   <chr>       <dbl> <int>   <dbl> <chr>    <dbl>  <dbl>
1      6 rsq     standard   0.0616    10 0.00730 Prepro~ 0.0647 0.0563
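
select_by_one_std_err() applies the one-standard-error rule: among candidates within one standard error of the best mean metric, it picks the simplest (here, lowest-degree) model. The unpenalized optimum can be pulled with select_best() for comparison; a minimal sketch:

#the numerically best degree, ignoring the simplicity preference
select_best(tune_resample, metric = "rsq")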

Final Model

Show code
best_degree <- select_by_one_std_err(tune_resample, degree, metric = "rsq")

final_wf <- finalize_workflow(PISA_poly_wf, best_degree)

final_fit <- fit(final_wf, train_data)

final_fit
== Workflow [trained] ================================================
Preprocessor: Recipe
Model: linear_reg()

-- Preprocessor ------------------------------------------------------
1 Recipe Step

* step_poly()

-- Model -------------------------------------------------------------

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
     (Intercept)           ATTCOMP  CLASSCOMP_poly_1  
         411.050             2.836           279.070  
CLASSCOMP_poly_2  CLASSCOMP_poly_3  CLASSCOMP_poly_4  
         300.125            19.260          -167.827  
CLASSCOMP_poly_5  CLASSCOMP_poly_6  CLASSCOOP_poly_1  
        -197.978          -477.187          1381.076  
CLASSCOOP_poly_2  CLASSCOOP_poly_3  CLASSCOOP_poly_4  
         -56.933           281.590          -251.840  
CLASSCOOP_poly_5  CLASSCOOP_poly_6  
        -130.564           389.358  
Show code
PISA_poly_test_pred <- 
  predict(final_fit, test_data) %>% 
  bind_cols(test_data %>% select(READING_SCORE))

mse_vec(
  truth = PISA_poly_test_pred$READING_SCORE, 
  estimate = PISA_poly_test_pred$.pred)
[1] 7382.151
Show code
rmse(PISA_poly_test_pred, 
     truth = READING_SCORE,
     estimate = .pred)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard        85.9
Show code
rsq(PISA_poly_test_pred, 
     truth = READING_SCORE,
     estimate = .pred)
# A tibble: 1 x 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rsq     standard      0.0697
Show code
tabl <- "
| Model         | R-Square      | MSE   | RMSE |
|---------------|:-------------:|------:|-----:|
| Linear Reg    | 0.0584        | 7472.2| 86.4 |
| Sextic Reg    | 0.0697        | 7382.1| 85.9 |
"
cat(tabl)
Model R-Square MSE RMSE
Linear Reg 0.0584 7472.2 86.4
Sextic Reg 0.0697 7382.1 85.9
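
As an alternative to calling predict() on test_data by hand, tidymodels provides last_fit(), which fits the finalized workflow on the training portion of the original split and evaluates it on the test portion in one step; a minimal sketch:

#fit on the 75% training portion, evaluate on the 25% test portion
final_last_fit <- last_fit(final_wf, split = data_split)

#rmse and rsq on the held-out test set
collect_metrics(final_last_fit)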

Final Remarks and Conclusion

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Wongvorachan (2022, Feb. 27). Tarid Wongvorachan: Examining PISA 2018 Data Set with Statistical Learning Approach. Retrieved from https://taridwong.github.io/posts/2022-02-27-statlearning/

BibTeX citation

@misc{wongvorachan2022examining,
  author = {Wongvorachan, Tarid},
  title = {Tarid Wongvorachan: Examining PISA 2018 Data Set with Statistical Learning Approach},
  url = {https://taridwong.github.io/posts/2022-02-27-statlearning/},
  year = {2022}
}