Evaluating Synthetic Data with A Psychometric Approach

R Psychometric

This blog post explores the use of psychometric methods—such as item analysis, independent t-tests, and item response theory (IRT)—to evaluate the quality of synthetic survey data.

(13 min read)

Tarid Wongvorachan (University of Alberta)https://www.ualberta.ca
2024-11-17

Introduction

Show code

Preparing the Dataset

Show code
data_org <- read.csv("df.csv", header = TRUE)
data_org <- na.omit(data_org)

Generating Synthetic Data

Show code
synth.obj <- syn(data_org, method = "cart", seed = 12345)
Show code
data_synth <- synth.obj$syn

Comparing Original and Synthetic Data

Visualizing Differences

Show code
mycols <- c("darkmagenta", "turquoise")
object <- compare(synth.obj, data_org, nrow = 3, ncol = 4, cols = mycols)
object$plots

Utility Assessment

Show code
util_gen <- utility.gen(synth.obj, data_org)
Show code
util_gen

Utility score calculated by method: cart

Call:
utility.gen.synds(object = synth.obj, data = data_org)

Null utilities simulated from a permutation test with 50 replications.

Selected utility measures
    pMSE   S_pMSE 
0.063602 1.691113 

Statistical Testing

Show code
# Initialize an empty data frame to store the t-test results
t_test_results <- data.frame(
  Item = 1:12,
  T_Statistic = numeric(12),
  P_Value = numeric(12),
  Significant = logical(12)
)

# Perform independent sample t-test for each item
for (i in 1:12) {
  # Extract responses for the current item from both original and synthesized datasets
  org_responses <- data_org[, i]
  synth_responses <- data_synth[, i]
  
  # Perform the independent t-test
  t_test <- t.test(org_responses, synth_responses)
  
  # Store the results in the data frame
  t_test_results$T_Statistic[i] <- t_test$statistic
  t_test_results$P_Value[i] <- t_test$p.value
  
  # Check if the p-value is less than 0.05 (significant at the 5% level)
  t_test_results$Significant[i] <- t_test$p.value < 0.05
}
Show code
# View the t-test results
print(t_test_results)
   Item T_Statistic   P_Value Significant
1     1  1.24725294 0.2125793       FALSE
2     2  0.37685436 0.7063567       FALSE
3     3  0.03819008 0.9695433       FALSE
4     4  1.04233137 0.2974951       FALSE
5     5  0.75686676 0.4492967       FALSE
6     6  0.24844901 0.8038351       FALSE
7     7 -0.63765775 0.5238332       FALSE
8     8  0.08628668 0.9312547       FALSE
9     9  0.56532140 0.5719741       FALSE
10   10  0.74877731 0.4541563       FALSE
11   11  0.46332527 0.6432256       FALSE
12   12  0.65988433 0.5094704       FALSE

Visualizing T-Test Results

Show code
ggplot(t_test_results, aes(x = factor(Item), y = T_Statistic, fill = Significant)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.7) +
  labs(title = "Independent Sample T-Test: Item-wise Comparison at p < .05", 
       x = "Item", y = "T-Statistic") +
  scale_fill_manual(values = c("TRUE" = "turquoise", "FALSE" = "darkmagenta")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = ifelse(Significant, "*", "")), 
            vjust = -0.5, color = "black", size = 5) +
  annotate("text", x = 2, y = max(t_test_results$T_Statistic) + 2, 
           label = "Negative t-statistic: Mean of original < Synthesized\nPositive t-statistic: Mean of original > Synthesized", 
           size = 4, hjust = 0, color = "black", fontface = "italic")

Psychometric Properties Comparison

Show code
# Perform item analysis on original data
org_item_analysis <- itemAnalysis(data_org, itemReport=TRUE, NA.Delete=TRUE, pBisFlag = T,  bisFlag = T, flagStyle = c("X",""))

# Perform item analysis on synthesized data
synth_item_analysis <- itemAnalysis(data_synth, itemReport=TRUE, NA.Delete=TRUE, pBisFlag = T,  bisFlag = T, flagStyle = c("X",""))

Visualizing Item-Level Properties

Show code
#Plot Side-by-Side Bar Charts for Item Means and Discrimination

# Extract item mean and discrimination for original and synthetic data
item_comparison <- data.frame(
  Item = seq_along(org_item_analysis$itemReport$itemMean),
  Original_Mean = org_item_analysis$itemReport$itemMean,
  Synthesized_Mean = synth_item_analysis$itemReport$itemMean,
  Original_Discrimination = org_item_analysis$itemReport$pBis,
  Synthesized_Discrimination = synth_item_analysis$itemReport$pBis
)

# Reshape the data for ggplot
item_means_long <- item_comparison %>%
  select(Item, Original_Mean, Synthesized_Mean) %>%
  tidyr::pivot_longer(cols = c(Original_Mean, Synthesized_Mean), names_to = "Dataset", values_to = "Mean")

item_discrimination_long <- item_comparison %>%
  select(Item, Original_Discrimination, Synthesized_Discrimination) %>%
  tidyr::pivot_longer(cols = c(Original_Discrimination, Synthesized_Discrimination), names_to = "Dataset", values_to = "Discrimination")
Show code
# Plot item means
ggplot(item_means_long, aes(x = as.factor(Item), y = Mean, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Item Means Comparison", x = "Item", y = "Mean Score") +
  theme_minimal() +
  scale_fill_manual(values = c("Original_Mean" = "darkmagenta", "Synthesized_Mean" = "turquoise")) + 
  theme(plot.title = element_text(hjust = 0.5))

Show code
# Plot item discrimination
ggplot(item_discrimination_long, aes(x = as.factor(Item), y = Discrimination, fill = Dataset)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Item Discrimination Comparison \n The red line represents minimal acceptable value", x = "Item", y = "Discrimination Index") +
  theme_minimal() +
  scale_fill_manual(values = c("Original_Discrimination" = "darkmagenta", "Synthesized_Discrimination" = "turquoise")) + 
  geom_hline(yintercept = 0.2, color = "red", linetype = "solid") + 
  theme(plot.title = element_text(hjust = 0.5))

Reliability Analysis

Show code
# Reliability Comparison Data Frame
alpha_comparison <- data.frame(
  Dataset = c("Original", "Synthesized"),
  Alpha = c(org_item_analysis$alpha, synth_item_analysis$alpha)
)

# Bar Plot with Values Displayed Above Bars
ggplot(alpha_comparison, aes(x = Dataset, y = Alpha, fill = Dataset)) +
  geom_bar(stat = "identity", width = 0.5) +
  geom_text(aes(label = round(Alpha, 3)), vjust = -0.3, color = "black", size = 5) + # Display values above bars
  labs(title = "Reliability (Coefficient Alpha) Comparison",
       y = "Alpha Reliability Coefficient") +
  theme_minimal() +
  scale_fill_manual(values = c("Original" = "darkmagenta", "Synthesized" = "turquoise")) +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title

Item Response Theory Test Information Function

Show code
# Fit a Graded Response Model (GRM) for original data
model_org <- mirt(data_org, model = 1, itemtype = "graded")

# Fit a Graded Response Model (GRM) for synthesized data
model_synth <- mirt(data_synth, model = 1, itemtype = "graded")

Estimated IRT Model of the Original Dataset

Show code
model_org

Call:
mirt(data = data_org, model = 1, itemtype = "graded")

Full-information item factor analysis with 1 factor(s).
Converged within 1e-04 tolerance after 31 EM iterations.
mirt version: 1.43 
M-step optimizer: BFGS 
EM acceleration: Ramsay 
Number of rectangular quadrature: 61
Latent density type: Gaussian 

Log-likelihood = -9210.903
Estimated parameters: 84 
AIC = 18589.81
BIC = 18949.52; SABIC = 18682.87
G2 (1e+10) = 11699.78, p = 1
RMSEA = 0, CFI = NaN, TLI = NaN
Show code
summary(model_org)
       F1     h2
Q1  0.726 0.5269
Q2  0.673 0.4536
Q3  0.178 0.0318
Q4  0.756 0.5723
Q5  0.250 0.0627
Q6  0.785 0.6155
Q7  0.133 0.0177
Q8  0.741 0.5497
Q9  0.743 0.5513
Q10 0.706 0.4990
Q11 0.224 0.0500
Q12 0.654 0.4283

SS loadings:  4.359 
Proportion Var:  0.363 

Factor correlations: 

   F1
F1  1

EStimated IRT Model of the Synthetic Dataset

Show code
model_synth

Call:
mirt(data = data_synth, model = 1, itemtype = "graded")

Full-information item factor analysis with 1 factor(s).
Converged within 1e-04 tolerance after 22 EM iterations.
mirt version: 1.43 
M-step optimizer: BFGS 
EM acceleration: Ramsay 
Number of rectangular quadrature: 61
Latent density type: Gaussian 

Log-likelihood = -9220.621
Estimated parameters: 83 
AIC = 18607.24
BIC = 18962.67; SABIC = 18699.2
G2 (1e+10) = 11721.99, p = 1
RMSEA = 0, CFI = NaN, TLI = NaN
Show code
summary(model_synth)
       F1     h2
Q1  0.751 0.5643
Q2  0.705 0.4973
Q3  0.171 0.0294
Q4  0.811 0.6577
Q5  0.342 0.1169
Q6  0.789 0.6224
Q7  0.153 0.0233
Q8  0.732 0.5361
Q9  0.759 0.5758
Q10 0.716 0.5126
Q11 0.141 0.0199
Q12 0.632 0.3999

SS loadings:  4.556 
Proportion Var:  0.38 

Factor correlations: 

   F1
F1  1

Test Information Function Comparison

Show code
# Define a range of theta values (latent trait values) to calculate test information
theta_values <- seq(-3, 3, by = 0.1)  # Adjust the range and step size as needed

# Extract the Test Information Function (TIF) for original data
TIF_org <- testinfo(model_org, Theta = theta_values)

# Extract the Test Information Function (TIF) for synthesized data
TIF_synth <- testinfo(model_synth, Theta = theta_values)


# Plot Test Information Function for original and synthesized data
plot(theta_values, TIF_org, type = "l", col = "darkmagenta", lwd = 2, 
     xlim = c(-3, 3), ylim = c(0, max(TIF_org, TIF_synth)), 
     xlab = "Theta", ylab = "Test Information", 
     main = "Test Information Function (TIF) Comparison")

# Add the TIF for synthesized data to the plot
lines(theta_values, TIF_synth, col = "turquoise", lwd = 2)

# Add a legend
legend("topright", legend = c("Original Data", "Synthesized Data"), 
       col = c("darkmagenta", "turquoise"), lwd = 2)

Concluding Remark and Practical Implications

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Wongvorachan (2024, Nov. 17). Tarid Wongvorachan: Evaluating Synthetic Data with A Psychometric Approach. Retrieved from https://taridwong.github.io/posts/2024-11-15-synth/

BibTeX citation

@misc{wongvorachan2024evaluating,
  author = {Wongvorachan, Tarid},
  title = {Tarid Wongvorachan: Evaluating Synthetic Data with A Psychometric Approach},
  url = {https://taridwong.github.io/posts/2024-11-15-synth/},
  year = {2024}
}