Exploring COVID-19 data from Twitter with topic modeling

Python · Natural Language Processing · Unsupervised Machine Learning · COVID-19

This entry explores Twitter data from Alberta’s Chief Medical Officer of Health with word clouds and topic modeling to gain insight into the characteristics of public health messaging during the COVID-19 pandemic.

(7 min read)

Tarid Wongvorachan (University of Alberta) https://www.ualberta.ca
2021-11-23

COVID-19 situation in Alberta, Canada

Text mining and word cloud fundamentals
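The post works from a pre-collected file, text-query-tweets.csv, holding 538 tweets in four columns. For context, here is a minimal sketch of how such a file can be assembled with the snscrape package; the query string, date range, and column names are my own assumptions, not taken from the original post.

import pandas as pd
import snscrape.modules.twitter as sntwitter

# Scrape tweets matching a query; the handle and date range are assumptions
query = "from:CMOH_Alberta since:2021-01-01 until:2021-11-01"
rows = [[t.date, t.id, t.content, t.username]
        for t in sntwitter.TwitterSearchScraper(query).get_items()]

# Save in the same four-column layout the post reads back in below
pd.DataFrame(rows, columns=["Datetime", "Tweet Id", "Text", "Username"]) \
  .to_csv("text-query-tweets.csv", index=False)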

#Import necessary modules

import numpy as np #for numpy array
import pandas as pd #for data reading and processing
import matplotlib.pyplot as plt #for plotting
import re #for Regex text cleaning
from wordcloud import WordCloud, STOPWORDS #for word clouds
from nltk.stem import WordNetLemmatizer #to reduce text to base form (needs the NLTK 'wordnet' data)
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA #for topic modeling
import warnings

warnings.filterwarnings("ignore") #suppress the warning that Python kindly gave me

tweets_df = pd.read_csv("text-query-tweets.csv")

tweets_df.shape

(538, 4)

# Print the first rows of tweets
print(tweets_df.Text.head(5))
0    We all have the ability to take small actions ...
1    As we head into Halloween weekend, I encourage...
2    Sadly, 9 new deaths related to COVID-19 were a...
3    Over the past 24 hours, we ID’d 603 new cases ...
4    Here is a summary of the latest #COVID19AB num...
Name: Text, dtype: object

Let’s clean the text first


# Remove URLs first, while the "://" that marks them is still intact
tweets_df['Text_processed'] = tweets_df['Text'].map(lambda x: re.sub(r"http\S+", "", x))

# Remove all numbers from the text
tweets_df['Text_processed'] = tweets_df['Text_processed'].map(lambda x: re.sub(r'[0-9]+', '', x))

# Remove punctuation other than sentence-level marks (, . ! ?)
tweets_df['Text_processed'] = tweets_df['Text_processed'].map(lambda x: re.sub(r'[^\w\s\,\.!?]', '', x))

# Convert the tweets to lowercase
tweets_df['Text_processed'] = tweets_df['Text_processed'].map(lambda x: x.lower())

# Print the processed text of the first rows
print(tweets_df['Text_processed'].head())
0    we all have the ability to take small actions ...
1    as we head into halloween weekend, i encourage...
2    sadly,  new deaths related to covid were also ...
3    over the past  hours, we idd  new cases amp co...
4    here is a summary of the latest covidab number...
Name: Text_processed, dtype: object
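To make each regex’s effect concrete, here is the same pipeline applied to a single made-up tweet; the example string is hypothetical, not drawn from the dataset.

sample = "Over the past 24 hours, we ID'd 603 new cases &amp; 9 deaths. https://t.co/abc123"

sample = re.sub(r"http\S+", "", sample)        # strip the URL
sample = re.sub(r'[0-9]+', '', sample)         # drop the numbers
sample = re.sub(r'[^\w\s\,\.!?]', '', sample)  # drop punctuation except , . ! ?
sample = sample.lower()                        # lowercase everything

print(sample)
# over the past  hours, we idd  new cases amp  deaths.

Note how the apostrophe in "ID'd" and the "&" in the HTML entity "&amp;" are stripped, which is exactly why fragments like "idd" and "amp" show up in the processed tweets above.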

So this is what’s happening over time


# Convert the Datetime column to a true datetime type
tweets_df['Datetime'] = pd.to_datetime(tweets_df['Datetime'])

# Extract the month from each datetime
tweets_df['Month'] = tweets_df['Datetime'].dt.month

# Group the tweets by month
groups = tweets_df.groupby('Month')

# Determine the size of each group
counts = groups.size()

# Visualize the monthly counts as a line plot
counts.plot()

# Vertical reference lines for key dates
plt.axvline(x = 7.0, color = 'forestgreen', label = 'The reopening date', linestyle='--')
plt.axvline(x = 8.0, color = 'firebrick', label = 'Wave 4 started', linestyle='--')
plt.legend(bbox_to_anchor = (1.0, 1), loc = 'upper right')

plt.title("Tweet count across months")
plt.ylabel("Tweet count")
plt.xlabel("Month")
plt.show()

Let’s see the big picture with a word cloud


text_all = " ".join(tweet for tweet in tweets_df.Text_processed)
print("There are {} characters in the combination of all tweets".format(len(text_all)))

There are 114362 characters in the combination of all tweets

# Lemmatize all words to reduce them to their base form
# (lemmatization has to run word by word, not character by character)
lemmatizer = WordNetLemmatizer()
text_all = " ".join([lemmatizer.lemmatize(word) for word in text_all.split()])

# Create Stopword list:
stopwords_cloud = set(STOPWORDS)
stopwords_cloud.update(["https://", "(/)", "Online:", 
                        "Twitter:", "Join", "us", "virtually",
                        "pm", ":", "https", "t", "d", "co", "amp", "will"])
                      
#Generate a word cloud image
wordcloud_tweet = WordCloud(stopwords=stopwords_cloud, background_color="white",random_state=7).generate(text_all)

#Display the generated image:
#the matplotlib way:
plt.figure(figsize=[10,10])
plt.imshow(wordcloud_tweet, interpolation='bilinear')
plt.axis("off")
plt.show()
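One caveat worth noting: WordNetLemmatizer treats every word as a noun unless told otherwise, so verb forms such as "reported" survive unchanged. A possible refinement, not used in this post, is to tag each word’s part of speech first. The sketch below assumes the NLTK 'averaged_perceptron_tagger' data has been downloaded.

import nltk
from nltk.corpus import wordnet

def to_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag onto the four WordNet POS classes
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

# Tag each token, then lemmatize it with the matching POS
tagged = nltk.pos_tag(text_all.split())
text_all_pos = " ".join(lemmatizer.lemmatize(word, to_wordnet_pos(tag))
                        for word, tag in tagged)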

Common word bar plot and text preprocessing for topic modeling


# Helper function to plot the ten highest-weighted words

def plot_10_most_common_words(count_data, tfidf_vectorizer):
    words = tfidf_vectorizer.get_feature_names()
    # Because the vectorizer is TF-IDF, these totals are summed
    # TF-IDF weights rather than raw frequencies
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts += t.toarray()[0]

    count_dict = zip(words, total_counts)
    count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words))

    plt.bar(x_pos, counts, align='center')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.title('10 most common words')
    plt.show()

#Make your own list of stop words
my_additional_stop_words = ("https://", "(/)", "➡Online:", 
                        "➡Twitter:", "Join", "us", "virtually",
                        "pm", ":", "https", "t", "d", "co", "amp", "today", "new", "covid",
                        "covidab", "hours", "completed")
                        
stop_words_lda = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)    

# Initialize the TF-IDF vectorizer with the extended stop word list
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_lda)

# Fit and transform the processed tweets
count_data = tfidf_vectorizer.fit_transform(tweets_df['Text_processed'])

# Visualise the 10 most common words
plot_10_most_common_words(count_data, tfidf_vectorizer)
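A side note before fitting the model: scikit-learn’s LatentDirichletAllocation is derived for integer term counts, so the canonical input is a CountVectorizer matrix. Feeding it TF-IDF weights, as this post does, runs fine mechanically but departs from the model’s assumptions. A minimal sketch of the count-based alternative, reusing the stop word list above:

from sklearn.feature_extraction.text import CountVectorizer

# LDA's generative model assumes word counts, so CountVectorizer is
# the more conventional choice of input here
count_vectorizer = CountVectorizer(stop_words=list(stop_words_lda))
count_matrix = count_vectorizer.fit_transform(tweets_df['Text_processed'])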

Finally, let’s see potential topics from Dr. Hinshaw’s tweets


# Helper function to print out the topics
def print_topics(model, tfidf_vectorizer, n_top_words):
    words = tfidf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
                        
# How many topics and words per topic we want to see
number_topics = 8
number_words = 5

# Create and fit the LDA model
lda = LDA(n_components=number_topics, random_state = 1)
lda.fit(count_data)

LatentDirichletAllocation(n_components=8, random_state=1)

# Print the topics found by the LDA model
print_topics(lda, tfidf_vectorizer, number_words)

Topic #0:
twitter online join video transcript

Topic #1:
possible information protection protect soon

Topic #2:
vaccines protect vaccine dose book

Topic #3:
cases tests partially unvaccinated idd

Topic #4:
reported deaths sadly condolences alberta

Topic #5:
oct age steps important dr

Topic #6:
matter pandemic report continue health

Topic #7:
ahs participating prevent book available
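To take this one step further, each tweet can be assigned its dominant topic, which makes it possible to see, for instance, how the mix of topics shifts month by month. A minimal sketch using the fitted model above; the column name Dominant_topic is my own, not from the original post.

# Posterior topic distribution for each tweet: one row per tweet,
# one column per topic, with each row summing to 1
doc_topic = lda.transform(count_data)

# The dominant topic is the column with the highest probability
tweets_df['Dominant_topic'] = doc_topic.argmax(axis=1)

# How many tweets fall under each topic
print(tweets_df['Dominant_topic'].value_counts().sort_index())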

Wrapping up here. What can we conclude?

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".

Citation

For attribution, please cite this work as

Wongvorachan (2021, Nov. 23). Tarid Wongvorachan: Exploring COVID-19 data from Twitter with topic modeling. Retrieved from https://taridwong.github.io/posts/2021-11-18-exploring-covid-19-data-from-twitter-with-word-clouds/

BibTeX citation

@misc{wongvorachan2021exploring,
  author = {Wongvorachan, Tarid},
  title = {Tarid Wongvorachan: Exploring COVID-19 data from Twitter with topic modeling},
  url = {https://taridwong.github.io/posts/2021-11-18-exploring-covid-19-data-from-twitter-with-word-clouds/},
  year = {2021}
}