Lab03 Quanteda Text Analysis

Author

Jim

Published

October 25, 2022

1 Text Modeling and Analysis using Quanteda

This document demonstrates how to perform text modeling and analysis using the quanteda package. We use data from various sources, including US presidential inaugural addresses and tweets about the Biden-Xi summit in November 2021.

1.1 Installation of Required Packages

# Set the CRAN mirror
options(repos = c(CRAN = "https://cran.rstudio.com/"))

# Install only the packages that are not yet available, so re-running this
# document does not download and reinstall every package each time.
required_pkgs <- c(
  "quanteda", "quanteda.textmodels", "quanteda.textplots",
  "quanteda.textstats", "readr", "ggplot2"
)
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}


The downloaded binary packages are in
    /var/folders/m3/k788kw6103zdvc0bwpc1lzd00000gn/T//RtmpVw4lGQ/downloaded_packages

library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(readr)
library(ggplot2)

1.2 Importing Twitter Data

# Read the summit tweet CSV from its GitHub URL into a data frame
summit <- read_csv(
  file = "https://raw.githubusercontent.com/datageneration/datamethods/master/textanalytics/summit_11162021.csv"
)

Rows: 14520 Columns: 90
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (50): screen_name, text, source, reply_to_screen_name, hashtags, symbol...
dbl  (26): user_id, status_id, display_text_width, reply_to_status_id, reply...
lgl  (10): is_quote, is_retweet, quote_count, reply_count, ext_media_type, q...
dttm  (4): created_at, quoted_created_at, retweet_created_at, account_create...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Pull out the tweet text column, tokenize it, and build the
# document-feature matrix used by the later sections
sum_twt <- summit[["text"]]
toks <- tokens(sum_twt)
sumtwtdfm <- dfm(toks)

1.3 Latent Semantic Analysis (LSA)

Latent Semantic Analysis (LSA) is a technique used to reduce the dimensionality of text data. Here, we apply LSA to the tweet data.

# Fit the LSA model on the tweet dfm; nd = 10 is the function's default
# number of dimensions, spelled out here for clarity
sum_lsa <- textmodel_lsa(sumtwtdfm, nd = 10)

# List the components stored in the fitted model
summary(sum_lsa)

                Length    Class     Mode   
sk                     10 -none-    numeric
docs               145200 -none-    numeric
features           159930 -none-    numeric
matrix_low_rank 232218360 -none-    numeric
data            232218360 dgCMatrix S4

1.4 Analyzing Hashtags

We can extract hashtags from the tweet data, and then analyze the most frequent ones.

# Tokenize the tweets without punctuation, then keep only the features
# that start with "#" (glob pattern "#*")
tag_toks <- tokens(sum_twt, remove_punct = TRUE)
tag_dfm <- dfm_select(dfm(tag_toks), pattern = "#*")

# Names of the 50 most frequent hashtags; show the first 10
toptag <- names(topfeatures(tag_dfm, n = 50))
head(toptag, n = 10)

 [1] "#china"       "#biden"       "#xijinping"   "#joebiden"    "#america"    
 [6] "#americans"   "#coronavirus" "#fentanyl"    "#xi"          "#us"

1.5 Plotting a Network of Hashtags

We visualize the relationship between the most frequent hashtags using a network plot.

# Feature co-occurrence matrix built from the hashtag-only dfm
tag_fcm <- fcm(tag_dfm)

# Restrict the matrix to the top hashtags collected in `toptag`
topgat_fcm <- fcm_select(tag_fcm, pattern = toptag)

# Draw the co-occurrence network
textplot_network(
  topgat_fcm,
  min_freq = 50,
  edge_alpha = 0.8,
  edge_size = 5
)

1.6 Wordcloud from US Presidential Inaugural Addresses

The following code generates a word cloud based on US presidential inaugural addresses from 1789 to 1826.

# Build a document-feature matrix for the early inaugural addresses
# (Year <= 1826). Punctuation is dropped at tokenization time with
# remove_punct = TRUE: dfm_remove() matches patterns as "glob" by default,
# so the POSIX class "[[:punct:]]" was never interpreted as a regular
# expression and punctuation marks stayed in the matrix.
dfm_inaug <- corpus_subset(data_corpus_inaugural, Year <= 1826) %>%
  tokens(remove_punct = TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) %>%
  # Keep only features that occur at least 10 times across all documents
  dfm_trim(min_termfreq = 10, verbose = FALSE)

# Fix the RNG seed before plotting so the cloud can be reproduced
set.seed(100)
textplot_wordcloud(dfm_inaug)

1.7 Comparison Wordcloud for Recent Presidents

We can compare the word usage of different presidents in their inaugural speeches using a word cloud.

# Tokenize the Trump, Obama, and Bush addresses without punctuation
recent_toks <- tokens(
  corpus_subset(
    data_corpus_inaugural,
    President %in% c("Trump", "Obama", "Bush")
  ),
  remove_punct = TRUE
)

# Drop English stopwords, pool the speeches of each president into one
# document, and keep features with a total frequency of at least 5
recent_dfm <- dfm(tokens_remove(recent_toks, stopwords("english")))
recent_dfm <- dfm_trim(
  dfm_group(recent_dfm, groups = President),
  min_termfreq = 5,
  verbose = FALSE
)

# Comparison cloud: one segment per president
textplot_wordcloud(recent_dfm, comparison = TRUE)

1.8 Keyword in Context (KWIC) Analysis

We can use Keyword in Context (KWIC) analysis to see how specific terms, such as “american”, “people”, and “communist”, are used in speeches after 1949.

# Keep the inaugural addresses given after 1949 and tokenize them
data_corpus_inaugural_subset <- corpus_subset(
  data_corpus_inaugural,
  Year > 1949
)
kwic_tokens <- tokens(data_corpus_inaugural_subset)

# Lexical dispersion (x-ray) plot for the keyword "american"
kwic_american <- kwic(kwic_tokens, pattern = "american")
textplot_xray(kwic_american)

# Keyword-in-context matches for each of the three terms
kwic_american <- kwic(kwic_tokens, pattern = "american")
kwic_people <- kwic(kwic_tokens, pattern = "people")
kwic_communist <- kwic(kwic_tokens, pattern = "communist")

# Combined x-ray plot, one panel column per keyword
g <- textplot_xray(kwic_american, kwic_people, kwic_communist)

# Colour the marks by keyword and hide the legend
keyword_colors <- c("blue", "red", "green")
g +
  aes(color = keyword) +
  scale_color_manual(values = keyword_colors) +
  theme(legend.position = "none")

1.9 Frequency of Terms

Here, we analyze the frequency of the term “american” across different presidents’ speeches.

# Count every feature in the post-1949 speeches, grouped by president
post1949_dfm <- dfm(tokens(data_corpus_inaugural_subset))
freq_grouped <- textstat_frequency(
  post1949_dfm,
  groups = data_corpus_inaugural_subset$President
)

# Keep only the rows for the feature "american"
freq_american <- subset(freq_grouped, feature %in% "american")

# One point per president; the y axis is fixed to 0-14 in steps of 2
ggplot(freq_american, aes(x = group, y = frequency)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 14), breaks = seq(0, 14, by = 2)) +
  labs(x = NULL, y = "Frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

1.10 Relative Frequency of Terms

The following code calculates the relative frequency of the term “american” across different presidents.

# Convert counts to within-document proportions, scaled by 100 (percent)
post1949_counts <- dfm(tokens(data_corpus_inaugural_subset))
dfm_rel_freq <- dfm_weight(post1949_counts, scheme = "prop") * 100

# NOTE(review): the weights are computed per speech before grouping, so for
# presidents with several speeches the grouped value is presumably the sum
# of the per-speech percentages -- confirm this is the intended measure.
rel_freq <- textstat_frequency(
  dfm_rel_freq,
  groups = dfm_rel_freq$President
)

# Keep only the rows for the feature "american"
rel_freq_american <- subset(rel_freq, feature %in% "american")

# One point per president; the y axis is fixed to 0-0.7 in steps of 0.1
ggplot(rel_freq_american, aes(x = group, y = frequency)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 0.7), breaks = seq(0, 0.7, by = 0.1)) +
  labs(x = NULL, y = "Relative frequency") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

1.11 Keyness Analysis for Obama and Trump Speeches

We compare the key terms in the inaugural speeches of Obama and Trump using keyness analysis.

# Restrict the corpus to the Obama and Trump inaugural addresses
pres_corpus <- corpus_subset(
  data_corpus_inaugural,
  President %in% c("Obama", "Trump")
)

# Tokenize without punctuation, drop English stopwords, and pool the
# tokens of each president into a single document before building the dfm
pres_toks <- tokens_remove(
  tokens(pres_corpus, remove_punct = TRUE),
  stopwords("english")
)
pres_dfm <- dfm(tokens_group(pres_toks, groups = President))

# Keyness of each feature, with the "Trump" document as the target
result_keyness <- textstat_keyness(pres_dfm, target = "Trump")

# Plot estimated word keyness
textplot_keyness(result_keyness)

1.12 Wordscores Model

We can estimate word scores from reference documents and use them to predict document positions with a Wordscores model.

# Load the Irish budget corpus from quanteda.textmodels and build its dfm
data(data_corpus_irishbudget2010, package = "quanteda.textmodels")
ie_dfm <- dfm(tokens(data_corpus_irishbudget2010))

# Reference scores: documents 5 and 6 are anchored at +1 and -1,
# the other twelve documents are left unscored (NA)
refscores <- rep(NA_real_, 14)
refscores[5:6] <- c(1, -1)

# Fit the Wordscores model with a smoothing value of 1
ws <- textmodel_wordscores(ie_dfm, y = refscores, smooth = 1)

# Plot estimated word positions, highlighting four words in red
textplot_scale1d(
  ws,
  highlighted = c("minister", "have", "our", "budget"),
  highlighted_color = "red"
)

# Get predictions (with standard errors) and plot document positions,
# grouped by the "party" document variable
pred <- predict(ws, se.fit = TRUE)
textplot_scale1d(
  pred,
  margin = "documents",
  groups = docvars(data_corpus_irishbudget2010, field = "party")
)