# 1. preprocessing======

# **1.1. read in data====
# Read every file found in data/spike_1/ with read_csv(), drop the TITLE and
# DATELINE columns from each one, and row-bind the results into one data frame.
spike1_raw <- list.files(here::here("data/spike_1/"), full.names = TRUE) %>%
  map_dfr(~ read_csv(.x) %>% select(-TITLE, -DATELINE))
## **1.2. select distinct headlines====
# Keep one row per headline (compared case-insensitively via headline_low) and
# give every remaining row a character id ("1", "2", ...) as the first column.
spike1_unique <- spike1_raw %>%
  mutate(headline_low = str_to_lower(HEADLINE)) %>%
  distinct(headline_low, .keep_all = TRUE) %>%
  # row_number() rather than rownames(): tibbles carry no meaningful row names.
  mutate(headline_id = as.character(row_number())) %>%
  select(headline_id, everything())
# looks like there are still duplicates, wow.

## **1.3. parse date column properly====
# Take the first 17 characters of DATE, trim whitespace, and convert to Date.
spike1_unique <- spike1_unique %>%
  mutate(
    date_parsed = str_sub(DATE, start = 1L, end = 17L) %>%
      str_trim() %>%
      # NOTE(review): the original parsing step was lost from this file. This
      # assumes DATE starts with e.g. "October 31, 2016" and that the session
      # locale has English month names -- confirm against the raw data.
      as.Date(format = "%B %d, %Y")
  )

spike1_unique %>% select(date_parsed) %>% str() # parsed correctly
#> Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame':    3384 obs. of  1 variable:
#>  $ date_parsed: Date, format: "2016-10-31" "2016-11-01" ...
## **1.4. create a rather standardised publication column====

# List the distinct raw publication names.
spike1_unique %>%
  select(PUBLICATION) %>%
  distinct() # 21 publications
#> # A tibble: 21 x 1
#>  1 The Independent (United Kingdom)
#>  3 MailOnline                      
#>  4 Express Online                  
#>  5 The Guardian                    
#>  6 The Daily Telegraph (London)    
#>  7 The Express                     
#>  8 i-Independent Print Ltd         
#> 10 The Observer (London)           
#> # … with 11 more rows
# Collapse the raw PUBLICATION names into one label per newspaper.
# Patterns are matched case-insensitively and tested in order, so the first
# matching row wins; a publication matching none of them gets NA.
spike1_unique <- spike1_unique %>%
  mutate(publication_simplified = case_when(
    str_detect(PUBLICATION, regex("mail", ignore_case = TRUE)) ~ "Daily Mail",
    str_detect(PUBLICATION, regex("mirror", ignore_case = TRUE)) ~ "Daily Mirror",
    str_detect(PUBLICATION, regex("people", ignore_case = TRUE)) ~ "People", # exclude
    str_detect(PUBLICATION, regex("star", ignore_case = TRUE)) ~ "Daily Star", # exclude
    str_detect(PUBLICATION, regex("express", ignore_case = TRUE)) ~ "The Express",
    str_detect(PUBLICATION, regex("independent", ignore_case = TRUE)) ~ "The Independent",
    str_detect(PUBLICATION, regex("telegraph", ignore_case = TRUE)) ~ "Daily Telegraph",
    str_detect(PUBLICATION, regex("guardian", ignore_case = TRUE)) ~ "The Guardian",
    str_detect(PUBLICATION, regex("observer", ignore_case = TRUE)) ~ "The Observer",
    # " sun " with surrounding spaces, so that "Sunday ..." titles do not match.
    str_detect(PUBLICATION, regex(" sun ", ignore_case = TRUE)) ~ "The Sun",
    str_detect(PUBLICATION, regex("times", ignore_case = TRUE)) ~ "Times"
  ))
# List the distinct simplified publication labels.
spike1_unique %>%
  select(publication_simplified) %>%
  distinct() # 11 publications, cool.
#> # A tibble: 11 x 1
#>  1 The Independent       
#>  2 Daily Mirror          
#>  3 Daily Mail            
#>  4 The Express           
#>  5 The Guardian          
#>  6 Daily Telegraph       
#>  7 The Observer          
#>  8 Times                 
#>  9 The Sun               
#> 10 People                
#> 11 Daily Star            
# **1.5. add publication type and politics==== 
# Derive two descriptive columns from publication_simplified:
#   publication_politics - political leaning assigned to each newspaper
#   publication_format   - tabloid / broadsheet / online
# Patterns are matched case-insensitively; unmatched rows get NA.
spike1_unique <- spike1_unique %>%
  mutate(publication_politics = case_when(
    str_detect(publication_simplified, regex("mail", ignore_case = TRUE)) ~ "right",
    str_detect(publication_simplified, regex("mirror", ignore_case = TRUE)) ~ "centre-left",
    str_detect(publication_simplified, regex("people", ignore_case = TRUE)) ~ "centre-left", # exclude
    str_detect(publication_simplified, regex("star", ignore_case = TRUE)) ~ "non-political", # exclude
    str_detect(publication_simplified, regex("express", ignore_case = TRUE)) ~ "right",
    str_detect(publication_simplified, regex("independent", ignore_case = TRUE)) ~ "centre",
    str_detect(publication_simplified, regex("telegraph", ignore_case = TRUE)) ~ "centre-right",
    str_detect(publication_simplified, regex("guardian", ignore_case = TRUE)) ~ "centre-left",
    str_detect(publication_simplified, regex("observer", ignore_case = TRUE)) ~ "centre-left",
    str_detect(publication_simplified, regex("the sun", ignore_case = TRUE)) ~ "right",
    str_detect(publication_simplified, regex("times", ignore_case = TRUE)) ~ "centre-right"
  )) %>%
  mutate(publication_format = case_when(
    str_detect(publication_simplified, regex("mail", ignore_case = TRUE)) ~ "tabloid",
    str_detect(publication_simplified, regex("mirror", ignore_case = TRUE)) ~ "tabloid",
    str_detect(publication_simplified, regex("people", ignore_case = TRUE)) ~ "tabloid",
    str_detect(publication_simplified, regex("star", ignore_case = TRUE)) ~ "tabloid",
    str_detect(publication_simplified, regex("express", ignore_case = TRUE)) ~ "tabloid",
    str_detect(publication_simplified, regex("independent", ignore_case = TRUE)) ~ "online",
    str_detect(publication_simplified, regex("telegraph", ignore_case = TRUE)) ~ "broadsheet",
    str_detect(publication_simplified, regex("guardian", ignore_case = TRUE)) ~ "broadsheet", # i guess it's tabloid sized now
    str_detect(publication_simplified, regex("observer", ignore_case = TRUE)) ~ "broadsheet",
    str_detect(publication_simplified, regex("the sun", ignore_case = TRUE)) ~ "tabloid",
    str_detect(publication_simplified, regex("times", ignore_case = TRUE)) ~ "broadsheet"
  ))

# **1.6. take a look at some summaries====
# Headline counts per publication.
spike1_unique %>% group_by(publication_simplified) %>% summarise(n = n())
#> # A tibble: 11 x 2
#>    publication_simplified     n
#>    <chr>                  <int>
#>  1 Daily Mail               391
#>  2 Daily Mirror             240
#>  3 Daily Star                18
#>  4 Daily Telegraph          554
#>  5 People                     7
#>  6 The Express              710
#>  7 The Guardian             298
#>  8 The Independent          790
#>  9 The Observer              32
#> 10 The Sun                  123
#> 11 Times                    221

# Headline counts per political leaning.
spike1_unique %>% group_by(publication_politics) %>% summarise(n = n())
#> # A tibble: 5 x 2
#>   publication_politics     n
#>   <chr>                <int>
#> 1 centre                 790
#> 2 centre-left            577
#> 3 centre-right           775
#> 4 non-political           18
#> 5 right                 1224

# Headline counts per publication format.
spike1_unique %>% group_by(publication_format) %>% summarise(n = n())
#> # A tibble: 3 x 2
#>   publication_format     n
#>   <chr>              <int>
#> 1 broadsheet          1105
#> 2 online               790
#> 3 tabloid             1489

# Cross-check: every publication with its assigned format and politics.
spike1_unique %>%
  group_by(publication_simplified, publication_format, publication_politics) %>%
  summarise(n = n())
#> # A tibble: 11 x 4
#> # Groups:   publication_simplified, publication_format [11]
#>    publication_simplified publication_format publication_politics     n
#>    <chr>                  <chr>              <chr>                <int>
#>  1 Daily Mail             tabloid            right                  391
#>  2 Daily Mirror           tabloid            centre-left            240
#>  3 Daily Star             tabloid            non-political           18
#>  4 Daily Telegraph        broadsheet         centre-right           554
#>  5 People                 tabloid            centre-left              7
#>  6 The Express            tabloid            right                  710
#>  7 The Guardian           broadsheet         centre-left            298
#>  8 The Independent        online             centre                 790
#>  9 The Observer           broadsheet         centre-left             32
#> 10 The Sun                tabloid            right                  123
#> 11 Times                  broadsheet         centre-right           221
# Build a document-term matrix from headlines_corpus and fit a 10-topic LDA
# model with a fixed seed.
# NOTE(review): headlines_corpus is not created anywhere in the visible part of
# this script -- confirm it is defined before this point.
headlines_dtm <- DocumentTermMatrix(headlines_corpus)
headlines_lda <- LDA(headlines_dtm, k = 10, control = list(seed = 1234))

headlines_lda %>% summary()
#>  Length   Class    Mode 
#>       1 LDA_VEM      S4 

# One row per topic/term pair with its beta value.
headlines_topics <- tidy(headlines_lda, matrix = "beta")
#> # A tibble: 113,530 x 3
#>    topic term        beta
#>    <int> <chr>      <dbl>
#>  1     1 !':   0.0000299 
#>  2     2 !':   0.00000318
#>  3     3 !':   0.0000380 
#>  4     4 !':   0.0000518 
#>  5     5 !':   0.0000267 
#>  6     6 !':   0.0000581 
#>  7     7 !':   0.0000116 
#>  8     8 !':   0.0000370 
#>  9     9 !':   0.00000474
#> 10    10 !':   0.00000661
#> # … with 113,520 more rows
# Keep the 15 highest-beta terms of every topic, sorted by topic and then by
# descending beta.
headlines_top_terms <- headlines_topics %>%
  group_by(topic) %>%
  top_n(15, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

#> # A tibble: 150 x 3
#>    topic term           beta
#>    <int> <chr>         <dbl>
#>  1     1 brexit      0.0263 
#>  2     1 vote        0.0129 
#>  3     1 "\"brexit"  0.00962
#>  4     1 court       0.00897
#>  5     1 "brexit\"," 0.00784
#>  6     1 brexit;     0.00708
#>  7     1 trump       0.00660
#>  8     1 mps         0.00615
#>  9     1 brexit,     0.00581
#> 10     1 article     0.00537
#> # … with 140 more rows
# Bar chart of the top terms, one facet per topic, bars ordered by beta.
headlines_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  # NOTE(review): the chain originally ended in a dangling `+`; the final layer
  # was lost. coord_flip() (terms on the vertical axis) is the presumed
  # missing layer -- confirm.
  coord_flip()

# Enter textmineR

# Document-term matrix for textmineR: one document per headline_id, built from
# the lower-cased headlines with an n-gram window of c(1, 3). Stopwords come
# from tm's "english" and "SMART" lists; punctuation and numbers are kept.
headlines_textminer_dtm <- CreateDtm(
  doc_vec = spike1_unique[["headline_low"]],
  doc_names = spike1_unique[["headline_id"]],
  ngram_window = c(1, 3),
  stopword_vec = c(
    tm::stopwords("english"), # stopwords from tm
    tm::stopwords("SMART") # this is the default value
  ),
  lower = TRUE,
  remove_punctuation = FALSE,
  remove_numbers = FALSE,
  verbose = TRUE,
  cpus = 4
)

# Fit an LDA model with `x` topics on headlines_textminer_dtm and score it.
#
# x: number of topics (k) to fit.
#
# Returns a tibble with columns k_value (= x), r2 (from CalcTopicModelR2) and
# coherence (from CalcProbCoherence, computed with M = 5). Downstream code
# averages coherence per k_value, so coherence presumably holds one value per
# topic, with k_value and r2 repeated on every row.
#
# Reads headlines_textminer_dtm from the enclosing environment.
optimise_k_func <- function(x) {
  # Local name deliberately differs from the global headlines_textminer_model
  # fitted later in the script.
  model <- FitLdaModel(
    dtm = headlines_textminer_dtm,
    k = x,
    iterations = 200, # i recommend a larger value, 500 or more
    alpha = 0.1, # this is the default value
    beta = 0.05, # this is the default value
    cpus = 4
  )

  r2 <- CalcTopicModelR2(
    dtm = headlines_textminer_dtm,
    phi = model$phi,
    theta = model$theta,
    cpus = 4
  )

  coherence <- CalcProbCoherence(
    phi = model$phi,
    dtm = headlines_textminer_dtm,
    M = 5
  )

  tibble(k_value = x, r2 = r2, coherence = coherence)
}

# Score models for k = 5, 10, ..., 100 and inspect the mean coherence per k.
k_values <- seq(5, 100, 5)
#> Loading required package: future
optimise_k_results <- future_map_dfr(k_values, optimise_k_func)
optimise_k_results %>%
  group_by(k_value, r2) %>%
  summarise(coherence = mean(coherence)) %>%
  View()

# Final model: k = 40, with more iterations (1000) than the 200 used in the
# k search.
headlines_textminer_model <- FitLdaModel(
  dtm = headlines_textminer_dtm,
  k = 40,
  iterations = 1000,
  alpha = 0.1, # this is the default value
  beta = 0.05, # this is the default value
  cpus = 4
)

# Likelihood of the fitted model.
headlines_textminer_model$ll <- CalcLikelihood(
  dtm = headlines_textminer_dtm,
  phi = headlines_textminer_model$phi,
  theta = headlines_textminer_model$theta,
  cpus = 2
)
#> [1] -675879.9

# Probabilistic coherence, computed with M = 5.
headlines_textminer_model$coherence <- CalcProbCoherence(
  phi = headlines_textminer_model$phi,
  dtm = headlines_textminer_dtm,
  M = 5
)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#> 0.007229 0.050837 0.073718 0.120547 0.181022 0.470649 
headlines_textminer_model$coherence %>% hist(main = "Histogram of probabilistic coherence")

# Top 15 terms per topic.
headlines_textminer_model$top_terms <- GetTopTerms(phi = headlines_textminer_model$phi, M = 15)
head(headlines_textminer_model$top_terms, 15) %>% View()

# Prevalence: each topic's share of the total theta mass, as a percentage.
headlines_textminer_model$prevalence <-
  colSums(headlines_textminer_model$theta) / sum(headlines_textminer_model$theta) * 100

# textmineR has a naive topic labeling tool based on probable bigrams
headlines_textminer_model$labels <- LabelTopics(
  assignments = headlines_textminer_model$theta > 0.05,
  dtm = headlines_textminer_dtm,
  M = 1
)

#> t_1 "brexit_boost"    
#> t_2 "eu_referendum"   
#> t_3 "block_brexit"    
#> t_4 "donald_trump"    
#> t_5 "article_50"      
#> t_6 "northern_ireland"
# One summary row per topic: id, label, coherence, prevalence and the topic's
# top terms collapsed into a single comma-separated string.
headlines_textminer_model$summary <- data.frame(
  topic = rownames(headlines_textminer_model$phi),
  label = headlines_textminer_model$labels,
  coherence = round(headlines_textminer_model$coherence, 3),
  prevalence = round(headlines_textminer_model$prevalence, 3),
  # top_terms is processed column by column (MARGIN = 2).
  top_terms = apply(headlines_textminer_model$top_terms, 2, function(x) {
    paste(x, collapse = ", ")
  }),
  stringsAsFactors = FALSE
)

# Save rows 1:40 of the topic summary, ordered by decreasing prevalence.
# it seems k=80 is the best performing model but too many topics. k=40 is not
# too bad as well
headlines_textminer_model$summary %>%
  .[order(.$prevalence, decreasing = TRUE), ] %>%
  .[1:40, ] %>%
  write_csv(here::here("data/topic_models_spike1.csv"))

headlines_textminer_model$summary[ order(headlines_textminer_model$summary$prevalence, decreasing = TRUE) , ][ 1:40 , ] %>% 
  as_tibble() %>% 

