Exploring Nature’s 2023 postdoc survey with predictive modeling and NLP

Python Supervised Machine Learning Unsupervised Machine Learning

In this post, I explore factors that influence postdoctoral researchers’ perspectives on careers in scientific research, using predictive modeling and natural language processing (NLP).

(16 min read)

Tarid Wongvorachan (University of Alberta) https://www.ualberta.ca
2023-12-15

Introduction

Show code
import pandas as pd
import numpy as np
import sweetviz
import seaborn as sns
import matplotlib.pyplot as plt

#Read the data set
df = pd.read_csv("postdoc_processed.csv", encoding = "ISO-8859-1") #For prediction task

df_text = df[['HOWEMPLOYED', 'EMPLOYSTATUS', 'WHYMOVE', 'WHYEXPCT',
              'WHYINCREASE', 'WHYDECREASE', 'WHYSECONDJOB', 'DISCRIMCAT',
              'DISCRIMPER', 'DISCRIMOBSCAT', 'CHALLENGE', 'LACKEDSKILLS',
              'REFLECT', 'CONTINENT', 'COUNTRY', 'AGE',
              'GENDER', 'RACE']].copy() #Copy to avoid SettingWithCopyWarning on later edits
                     
df_reflect = df_text.dropna(subset=['REFLECT']) #For NLP

Data exploration

Show code
#Read the data set
df = pd.read_csv("postdoc_prediction_cleaned.csv", encoding = "ISO-8859-1")

# Use the analyze function to perform EDA
analyze_df = sweetviz.analyze(df, target_feat = "REGRET")

# Render and show the results of EDA
analyze_df.show_html("analyze.html")
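
Since REGRET is the classification target, it also helps to confirm its class balance before modeling. The quick check below is a minimal sketch, not part of the original sweetviz report.

Show code
# Minimal sketch: inspect the class balance of the REGRET target
print(df['REGRET'].value_counts())
print(df['REGRET'].value_counts(normalize=True).round(3))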

Predicting postdoc regret

The target variable REGRET captures whether respondents would still recommend a research career to their younger self (the negative and positive cases examined below).

Feature selection

Show code
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import (train_test_split, RandomizedSearchCV,
                                     StratifiedKFold, cross_val_score)
from sklearn.metrics import accuracy_score
Show code
X = df.drop('REGRET', axis=1)
y = df['REGRET']

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=123)

# Initialize the RFECV
rfecv_model = RFECV(estimator=rf_classifier, 
                    step=1, cv=5, 
                    scoring='roc_auc', 
                    min_features_to_select=20)

rfecv = rfecv_model.fit(X_train, y_train)

print('Optimal number of features:', rfecv.n_features_)
print('Best features by RFECV with random forest:', X_train.columns[rfecv.support_])
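
To see how cross-validated performance changes with the number of retained features, the scores stored by RFECV can be plotted. This is a minimal sketch, assuming scikit-learn >= 1.0, where RFECV exposes cv_results_.

Show code
# Minimal sketch: mean cross-validated ROC AUC by number of selected features
# (assumes scikit-learn >= 1.0, which stores scores in cv_results_)
mean_scores = rfecv.cv_results_['mean_test_score']
n_features_range = range(rfecv.min_features_to_select,
                         rfecv.min_features_to_select + len(mean_scores))
plt.plot(n_features_range, mean_scores)
plt.xlabel('Number of features selected')
plt.ylabel('Mean cross-validated ROC AUC')
plt.title('RFECV Feature Selection')
plt.show()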

Show code
# Keep only the features retained by RFECV
selected_features = ['POSTDOCNATIVE', 'POSTDOCMOVE', 'FIRSTPOSTDOC', 'PREVPOSTDOC',
       'POSTDOCEXPCT', 'CAREERSECTOR', 'SALARY', 'SALARYCHANGE', 'PAIDVACAY',
       'PAIDSICK', 'INSURANCE', 'PENSION', 'PARENTLEAVE', 'CHILDCARE',
       'BECOMEPARENT', 'DEBT', 'SAVING', 'SAVINGPROBLEM', 'WKHOURS',
       'OVERTIME', 'OVERNIGHT', 'WEEKENDWK', 'POSTDOCVACANCY', 'SATCHANGE',
       'SATSALARY', 'SATWELLNESS', 'SATFUNDING', 'SATTIME', 'SATCAREER',
       'SATTRAINING', 'SATJOBSCURITY', 'SATOPT', 'SATSUPERVISE', 'SATORG',
       'SATDECISION', 'SATWLB', 'SATWKHOURS', 'SATINTEREST', 'SATACCOM',
       'SATRELATION', 'SATINDE', 'SATRECOG', 'SATSAFETY', 'SATDIVERSE',
       'SUPERVISETIME', 'DISCRIMEXP', 'DISCRIMOBS', 'NEEDMHHELP',
       'AGREEMHSUPP', 'AGREEDIRECTSUPP', 'AGREEWKSUPP', 'AGREEWLB',
       'AGREELNGWK', 'MHLEAVESCI', 'BLIVGENDEREQ', 'BLIVETHEQ', 'BLIVSAFETY',
       'BLIVDIGNITY', 'JOBPRSPCT', 'PRSPCTCHNGE', 'POSTDOCLEAVE', 'MINORITY',
       'LTHEALTHPROB']

X_train_reduced = X_train[selected_features]
X_test_reduced = X_test[selected_features]

Hyperparameter Tuning

Show code
# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomizedSearchCV with the classifier, hyperparameter grid, and cross-validation
random_search = RandomizedSearchCV(
    rf_classifier,
    param_distributions=param_grid,
    n_iter=10,  # Adjust the number of iterations as needed
    scoring='accuracy',
    cv=5,
    random_state=42
)

# Fit the randomized search on the selected features
random_search.fit(X_train_reduced, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

Model evaluation

Show code
# Initialize the RandomForestClassifier
rf_classifier_tuned = RandomForestClassifier(n_estimators=100,
                                             min_samples_split=5,
                                             min_samples_leaf=2,
                                             max_depth=20,
                                             random_state=123)

rf_classifier_tuned.fit(X_train_reduced, y_train)
rfc_tuned_predict = rf_classifier_tuned.predict(X_test_reduced)
rfc_tuned_cv_score = cross_val_score(rf_classifier_tuned, X_test_reduced, y_test, cv=5, scoring='roc_auc')
Show code
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import metrics

#%% Evaluate the tuned RF

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_tuned_predict))
print('\n')

print("=== Classification Report ===")
print(classification_report(y_test, rfc_tuned_predict))
print('\n')

print("=== All AUC Scores ===")
print(rfc_tuned_cv_score)
print('\n')

print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_tuned_cv_score.mean())


#Accuracy of the tuned RF: test data
print("Accuracy score of the test set for tuned RF:", rf_classifier_tuned.score(X_test_reduced, y_test))

#Root mean square error
mse_rfc_tuned = mean_squared_error(y_test, rfc_tuned_predict)
rmse_rfc_tuned = sqrt(mse_rfc_tuned)


print('RMSE of tuned RF = ', rmse_rfc_tuned)

Show code
# Define ROC metrics for the tuned RF
y_pred_proba_tuned_rf = rf_classifier_tuned.predict_proba(X_test_reduced)[:, 1]
fpr_tuned_rf, tpr_tuned_rf, _ = metrics.roc_curve(y_test, y_pred_proba_tuned_rf)

auc_tuned_rf = metrics.roc_auc_score(y_test, y_pred_proba_tuned_rf)

#create ROC curve
plt.plot(fpr_tuned_rf, tpr_tuned_rf, label="Tuned AUC for RF="+str(auc_tuned_rf.round(3)))

plt.legend(loc="lower right")

plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

# displaying the title
plt.title("ROC of Tuned RF")

plt.show()

Explaining the classification results with XAI

Show code
#%% Feature importance report of the tuned RF

# Create a pd.Series of feature importances
importances_rf = pd.Series(rf_classifier_tuned.feature_importances_, index = X_test_reduced.columns)

# Sort importances_rf
sorted_importance_rf = importances_rf.sort_values()

#Horizontal bar plot
plt.figure(figsize=(8, 20))

sorted_importance_rf.plot(kind='barh', color='lightgreen');
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()

Show code
import dalex as dx

# Create an explainer with the dalex package
exp = dx.Explainer(rf_classifier_tuned, X_test_reduced, y_test)

# Partial dependence profiles for four influential variables
RF_mprofile = exp.model_profile(variables = ["MHLEAVESCI", "JOBPRSPCT", "SATCAREER", "SATCHANGE"], type = "partial")

# Plot the partial dependence profiles
RF_mprofile.plot()

Negative case: Would not recommend a research career to their younger self

Show code
# Break Down explanation for a negative-case observation
rf_pparts = exp.predict_parts(new_observation = X_test_reduced.iloc[202], type = "break_down")

# Plot the Break Down explanation
rf_pparts.plot()

Show code
# SHAP explanation for the same observation
rf_pparts_shap = exp.predict_parts(new_observation = X_test_reduced.iloc[202], type = "shap")

# Plot the SHAP values
rf_pparts_shap.plot()

Show code
# lime explanation
rf_pparts_lime = exp.predict_surrogate(new_observation = X_test_reduced.iloc[202])

# plot LIME
rf_pparts_lime.show_in_notebook()

Positive case: Would still recommend a research career to their younger self

Show code
# Break Down explanation for a positive-case observation
rf_pparts = exp.predict_parts(new_observation = X_test_reduced.iloc[6], type = "break_down")

# Plot the Break Down explanation
rf_pparts.plot()

Show code
# SHAP explanation for the same observation
rf_pparts_shap = exp.predict_parts(new_observation = X_test_reduced.iloc[6], type = "shap")

# Plot the SHAP values
rf_pparts_shap.plot()

Show code
# lime explanation
rf_pparts_lime = exp.predict_surrogate(new_observation = X_test_reduced.iloc[6])

# plot LIME
rf_pparts_lime.show_in_notebook()

Examining written responses with NLP

Exploring textual data

Show code
df_text['REFLECT']

Wordcloud

Show code
from wordcloud import WordCloud, STOPWORDS

# Custom stopwords
custom_stopwords = ["post doc", "doc", "nan", "don", "phd", "postdoc"]

# Concatenate all text data in the 'REFLECT' column
text_data = ' '.join(df_reflect['REFLECT'])

# Remove English stopwords and custom stopwords
stopwords = set(STOPWORDS)
stopwords.update(custom_stopwords)

# Create a WordCloud object with specified stopwords
wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=stopwords).generate(text_data)

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Turn off axis numbers and ticks
plt.show()

Common words

Show code
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

def plot_10_most_common_words(count_data, tfidf_vectorizer):
    words = tfidf_vectorizer.get_feature_names_out()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts += t.toarray()[0]

    count_dict = zip(words, total_counts)
    count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words))

    plt.bar(x_pos, counts, align='center')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('Words')
    plt.ylabel('Counts')
    plt.title('10 Most Common Words')
    plt.show()

# Custom stopwords specific to this data set
custom_stopwords = set(["post doc", "doc", "nan", "don", "phd", "postdoc"])

# Combine standard English stopwords with custom stopwords
stopwords = set(text.ENGLISH_STOP_WORDS).union(custom_stopwords)

# Handle NaN values by filling them with an empty string
df_text['REFLECT'] = df_text['REFLECT'].fillna('')

# Initialize the TF-IDF vectorizer with the combined stop word list
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stopwords))

# Fit and transform the written responses
count_data = tfidf_vectorizer.fit_transform(df_text['REFLECT'])

# Visualize the 10 most common words
plot_10_most_common_words(count_data, tfidf_vectorizer)

Topic modeling

Negative case: Would not recommend a research career to their younger self

Show code
from sklearn.feature_extraction import text
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

Show code
# Custom stopwords, including domain-specific terms
custom_stopwords = set(["post doc", "doc", "nan", "don", "phd", "postdoc"])

# Combine standard English stopwords with custom stopwords
stopwords = set(text.ENGLISH_STOP_WORDS).union(custom_stopwords)

tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words=list(stopwords),
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                ngram_range=(1, 2),
                                max_df=0.5,
                                min_df=10)

dtm_tf = tf_vectorizer.fit_transform(df_text['REFLECT'].values.astype('U'))

tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(df_text['REFLECT'].values.astype('U'))

# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tf.fit(dtm_tf)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tfidf.fit(dtm_tfidf)

pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

Show code
for i, topic in enumerate(lda_tf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tf_vectorizer.get_feature_names_out()[idx] for idx in topic.argsort()[-10:]])
    print('\n')

Positive case: Would still recommend a research career to their younger self
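
The original post shows no code under this heading. As a minimal sketch (not the author's code), the same LDA pipeline could be refit on only the positive responses. This assumes REGRET is coded 0 for respondents who would still recommend a research career, and that the two data files share the same row index; both assumptions should be checked against the survey codebook before use.

Show code
# Minimal sketch (assumptions): refit LDA on positive responses only.
# Assumes REGRET == 0 means "would still recommend a research career"
# and that df and df_reflect share a common row index.
positive_idx = df.loc[df['REGRET'] == 0].index
df_positive = df_reflect.loc[df_reflect.index.intersection(positive_idx)]

# Reuse the same vectorizer settings on the positive subset
tf_vectorizer_pos = CountVectorizer(**tf_vectorizer.get_params())
dtm_tf_pos = tf_vectorizer_pos.fit_transform(df_positive['REFLECT'].values.astype('U'))

lda_tf_pos = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tf_pos.fit(dtm_tf_pos)

pyLDAvis.sklearn.prepare(lda_tf_pos, dtm_tf_pos, tf_vectorizer_pos)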

Concluding remarks

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Wongvorachan (2023, Dec. 15). Tarid Wongvorachan: Exploring Nature's 2023 postdoc survey with predictive modeling and NLP. Retrieved from https://taridwong.github.io/posts/2023-12-10-naturepostdoc/

BibTeX citation

@misc{wongvorachan2023exploring,
  author = {Wongvorachan, Tarid},
  title = {Tarid Wongvorachan: Exploring Nature's 2023 postdoc survey with predictive modeling and NLP},
  url = {https://taridwong.github.io/posts/2023-12-10-naturepostdoc/},
  year = {2023}
}