Medical Sociology on Wheels

Comments on Health Beliefs, Risk, Mitigation, and Sociological Research Methods, a Disabled Perspective


Natural Language Processing (NLP) and Latent Dirichlet Allocation (LDA) of Tweets (Example)

attach packages

library(devtools)
## Loading required package: usethis
library(textclean)
library(textdata)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5.9000     ✓ purrr   0.3.4     
## ✓ tibble  3.1.5          ✓ dplyr   1.0.7     
## ✓ tidyr   1.1.4          ✓ stringr 1.4.0.9000
## ✓ readr   2.0.2          ✓ forcats 0.5.1
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::complete() masks RCurl::complete()
## x dplyr::filter()   masks stats::filter()
## x dplyr::lag()      masks stats::lag()
library(tidytext)
library(widyr)
library(hms)
library(lubridate) 
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:hms':
## 
##     hms
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## The following objects are masked from 'package:quanteda':
## 
##     meta, meta<-
## 
## Attaching package: 'tm'
## The following object is masked from 'package:quanteda':
## 
##     stopwords
library(wordcloud)
## Loading required package: RColorBrewer
library(rle)
library(broom)
library(tokenizers)
library(tidyselect)
library(textshape)
## 
## Attaching package: 'textshape'
## The following object is masked from 'package:lubridate':
## 
##     duration
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:purrr':
## 
##     flatten
## The following object is masked from 'package:tibble':
## 
##     column_to_rownames
library(topicmodels)
library(spacyr)
library(seededlda)
## 
## Attaching package: 'seededlda'
## The following objects are masked from 'package:topicmodels':
## 
##     terms, topics
## The following object is masked from 'package:stats':
## 
##     terms
library(stm)
## stm v1.3.6 successfully loaded. See ?stm for help. 
##  Papers, resources, and other materials at structuraltopicmodel.com
library(academictwitteR)
library(lda)
library(lexicon)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(mclust)
##     __  ___________    __  _____________
##    /  |/  / ____/ /   / / / / ___/_  __/
##   / /|_/ / /   / /   / / / /\__ \ / /   
##  / /  / / /___/ /___/ /_/ /___/ // /    
## /_/  /_/\____/_____/\____//____//_/    version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:purrr':
## 
##     map
library(mgsub)
## 
## Attaching package: 'mgsub'
## The following object is masked from 'package:textclean':
## 
##     mgsub
library(NLP)
library(readr)
library(rematch2)
library(remotes)
## 
## Attaching package: 'remotes'
## The following objects are masked from 'package:devtools':
## 
##     dev_package_deps, install_bioc, install_bitbucket, install_cran, install_deps,
##     install_dev, install_git, install_github, install_gitlab, install_local,
##     install_svn, install_url, install_version, update_packages
## The following object is masked from 'package:usethis':
## 
##     git_credentials
library(reshape2)
library(rsparse)
## Setting OpenMP threads number to 1
## Can be adjusted by setting `options("rsparse_omp_threads" = N_THREADS)`
library(SnowballC)
library(stringi)
library(stringr)
library(termco)
## 
## Attaching package: 'termco'
## The following object is masked from 'package:dplyr':
## 
##     group_cols
library(tau)
## 
## Attaching package: 'tau'
## The following object is masked from 'package:readr':
## 
##     tokenize
library(textrecipes)
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:devtools':
## 
##     check
## The following object is masked from 'package:stats':
## 
##     step
library(textshape)
library(tidymodels)
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ── Attaching packages ───────────────────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ dials        0.0.10     ✓ tune         0.1.6 
## ✓ infer        1.0.0      ✓ workflows    0.2.4 
## ✓ modeldata    0.1.1      ✓ workflowsets 0.1.0 
## ✓ parsnip      0.1.7      ✓ yardstick    0.0.8 
## ✓ rsample      0.1.0
## ── Conflicts ──────────────────────────────────────────────────────── tidymodels_conflicts() ──
## x NLP::annotate()                 masks ggplot2::annotate()
## x recipes::check()                masks devtools::check()
## x textshape::column_to_rownames() masks tibble::column_to_rownames()
## x textshape::combine()            masks dplyr::combine()
## x tidyr::complete()               masks RCurl::complete()
## x scales::discard()               masks purrr::discard()
## x magrittr::extract()             masks tidyr::extract()
## x dplyr::filter()                 masks stats::filter()
## x recipes::fixed()                masks stringr::fixed()
## x textshape::flatten()            masks purrr::flatten()
## x parsnip::get_dependency()       masks spacyr::get_dependency()
## x termco::group_cols()            masks dplyr::group_cols()
## x dplyr::lag()                    masks stats::lag()
## x mclust::map()                   masks purrr::map()
## x magrittr::set_names()           masks purrr::set_names()
## x yardstick::spec()               masks readr::spec()
## x recipes::step()                 masks stats::step()
## x parsnip::translate()            masks tau::translate()
## x dials::weight()                 masks termco::weight()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(udpipe)
library(widyr)
library(discrim)
## 
## Attaching package: 'discrim'
## The following object is masked from 'package:dials':
## 
##     smoothness
library(dictionarytools)


library(forcats)
library(ggplot2)
library(generics)
## 
## Attaching package: 'generics'
## The following object is masked from 'package:tune':
## 
##     tune_args
## The following objects are masked from 'package:infer':
## 
##     calculate, generate, hypothesize, specify, visualize
## The following object is masked from 'package:dials':
## 
##     prune
## The following object is masked from 'package:termco':
## 
##     evaluate
## The following object is masked from 'package:lubridate':
## 
##     as.difftime
## The following object is masked from 'package:dplyr':
## 
##     explain
## The following objects are masked from 'package:base':
## 
##     as.difftime, as.factor, as.ordered, intersect, is.element, setdiff, setequal,
##     union
library(ISOcodes)
library(lazyeval)
## 
## Attaching package: 'lazyeval'
## The following object is masked from 'package:parsnip':
## 
##     make_call
## The following objects are masked from 'package:purrr':
## 
##     is_atomic, is_formula
library(norm)
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:lazyeval':
## 
##     as_name, call_modify, call_standardise, expr_label, expr_text, f_env, f_env<-,
##     f_label, f_lhs, f_lhs<-, f_rhs, f_rhs<-, f_text, is_atomic, is_call, is_formula,
##     is_lang, is_pairlist, missing_arg
## The following object is masked from 'package:magrittr':
## 
##     set_names
## The following objects are masked from 'package:textshape':
## 
##     as_list, flatten
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, list_along, modify, prepend, splice
library(sourcetools)
## 
## Attaching package: 'sourcetools'
## The following object is masked from 'package:tau':
## 
##     tokenize
## The following objects are masked from 'package:readr':
## 
##     read_lines, tokenize
library(sys)
library(data.table)
## data.table 1.14.2 using 1 threads (see ?getDTthreads).  Latest news: r-datatable.com
## **********
## This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
## This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
## **********
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:rlang':
## 
##     :=
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week, yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
library(assertthat)
## 
## Attaching package: 'assertthat'
## The following object is masked from 'package:rlang':
## 
##     has_name
## The following object is masked from 'package:tibble':
## 
##     has_name
library(brio)
## 
## Attaching package: 'brio'
## The following object is masked from 'package:sourcetools':
## 
##     read_lines
## The following objects are masked from 'package:readr':
## 
##     read_file, read_file_raw, read_lines, write_file, write_lines
## The following objects are masked from 'package:base':
## 
##     readLines, writeLines
library(class)


require(dplyr)

Currently I have users separated by expertise type
(medical, public health, disabled, non-expert).
There are some repeat users in the three expert dfs;
use dplyr to eliminate the repeats (distinct).
Before deduplication, the N's for the expert groups are:
disabled N=360
public health N=899
medical N=1570

user_disabCI <- user_disabCI %>%
  distinct()
user_pubH <- user_pubH %>%
  distinct()
user_medic <- user_medic %>%
  distinct()

after the distinct command, the N's for the expert groups are:
disabled N=62
public health N=316
medical N=500

Now I want to use a left_join to bind the expertise info into the main baby df
(m2m_baby_df3)
start with disabled group

m2m_BigBaby_df <- m2m_baby_df3 %>%
  left_join(user_disabCI)
## Joining, by = "author_id"

now left join to the big baby df
add public health

m2m_BigBaby_df <- m2m_BigBaby_df %>%
  left_join(user_pubH, by=c("author_id"))

now left join to big baby df again
add medicine

m2m_BigBaby_df <- m2m_BigBaby_df %>%
  left_join(user_medic, by=c("author_id"))

note: adding the disabled group added two cases to the big baby df, unclear why; a left_join adds rows when the right-hand table still has duplicate keys, so that is worth checking (sketch below)
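A minimal diagnostic sketch, assuming the join key is author_id as above: count author_ids that still appear more than once after distinct() (distinct() only drops rows that are identical across all columns).

check_dupes <- user_disabCI %>%
  count(author_id) %>%   # rows per author_id
  filter(n > 1)          # any n > 1 would explain the extra join rows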

read in vader dictionary (sentiments for social media)
set vader dictionary to quanteda dictionary
try corpustools to do sentiment analysis narrowed to context (kwic)

library(SentimentAnalysis)
## 
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:sourcetools':
## 
##     read
## The following object is masked from 'package:base':
## 
##     write
GI_dict <- dictionary(DictionaryGI)

library(corpustools)
## 
## Attaching package: 'corpustools'
## The following object is masked from 'package:tidytext':
## 
##     get_stopwords
m2mTcorp <- create_tcorpus(Qcorp_tweet)
# code the tokens with the GI dictionary categories, then collapse those
# categories into a numeric sentiment score (+1 / -1)
m2mTcorp$code_dictionary(GI_dict, column = 'lsd15')
m2mTcorp$set('sentiment', 1, subset = lsd15 %in% c('positive','neg_negative'))
m2mTcorp$set('sentiment', -1, subset = lsd15 %in% c('negative','neg_positive'))

browse_texts(m2mTcorp, scale='sentiment')

this produced a list of the top 50 features with positive words highlighted in green and negative words in red;
saved as a PDF

next, limit the t corpus to words within 3 words of “mask”

mask_TcorpBaby <- subset_query(m2mTcorp, "mask", window=3)

last step before sentiment analysis: transform the t corpus back to a dfm

dfm_maskTcorp <- get_dfm(m2mTcorp, 
                         feature="token",
                         context_level = c("document")) %>% 
  dfm_trim(min_docfreq=5) %>% 
  dfm_remove(stopwords("english")) %>% 
  dfm_tolower %>% 
  dfm_remove("[^a-z]", valuetype="regex")
## Error in `[<-.data.frame`(`*tmp*`, field, value = list(structure(list(: replacement element 1 is a matrix/data frame of 1 row, need 35179

To get past this error, I am going to try creating the t corpus from the
df instead of the Q corpus. Use BigBaby; this should pull the meta, including the expert categories.
Also set the doc column to the tweet id so that 1 tweet = 1 doc, not 1 user = 1 doc
(if I use user, there will be repeat doc ids in the doc column).

m2m_BigBaby_clean <- m2m_BigBaby_df %>%
  distinct(id, .keep_all = TRUE)
m2mTcorpFix <- create_tcorpus(m2m_BigBaby_clean, 
                              doc_column = 'id',
                              text_columns = 'stripped_text')
m2mTcorpFix
## tCorpus containing 497423 tokens
## grouped by documents (n = 19732)
## contains:
##   - 3 columns in $tokens:    doc_id, token_id, token
##   - 48 columns in $meta:     doc_id, author_id, profile_image_url, username, url, location, verified, name, protected, public_metrics_user.followers_count, public_metrics_user.following_count, public_metrics_user.tweet_count, public_metrics_user.listed_count, created_at_user, entities_user.url.urls, entities_user.description.hashtags, entities_user.description.mentions, entities_user.description.urls, entities_user.description.cashtags, pinned_tweet_id, description, withheld.country_codes, possibly_sensitive, entities_tweet.mentions, entities_tweet.urls, entities_tweet.annotations, entities_tweet.hashtags, entities_tweet.cashtags, source, in_reply_to_user_id, text, conversation_id, geo.place_id, geo.coordinates.coordinates, geo.coordinates.type, created_at_tweet, public_metrics_tweet.retweet_count, public_metrics_tweet.reply_count, public_metrics_tweet.like_count, public_metrics_tweet.quote_count, referenced_tweets, lang, attachments.media_keys, attachments.poll_ids, strip_ubio, words.x, words.y, words

get more info

head(m2mTcorpFix$tokens)
##                 doc_id token_id   token
## 1: 1233904988767866880        1  thanks
## 2: 1233904988767866880        2     for
## 3: 1233904988767866880        3 wearing
## 4: 1233904988767866880        4    that
## 5: 1233904988767866880        5    mask
## 6: 1233904988767866880        6       i

look at meta data for fixed T corp

head(m2mTcorpFix$meta)
##                 doc_id          author_id
## 1: 1233904988767866880           17423868
## 2: 1233905558245101568 925722636247883776
## 3: 1233905677187219456 830524466744655874
## 4: 1233906675624660992 731596455358566400
## 5: 1233908180897468416          423836621
## 6: 1233908728983744512         1587102235
##                                                               profile_image_url      username
## 1: https://pbs.twimg.com/profile_images/1314301326566789120/re2_QnPs_normal.jpg  JoeDiStefano
## 2: https://pbs.twimg.com/profile_images/1050087895787290627/nBfyBBEm_normal.jpg HiThisIsTerry
## 3: https://pbs.twimg.com/profile_images/1424700818662559749/EwTh6ZyK_normal.jpg     YoRomello
## 4: https://pbs.twimg.com/profile_images/1413002140662964230/Dvtb41Pj_normal.jpg    TheRevSven
## 5: https://pbs.twimg.com/profile_images/1386509488325238784/CPxYC3DY_normal.jpg      bmoluvzu
## 6: https://pbs.twimg.com/profile_images/1323518851582078977/twuMb6VC_normal.jpg     ValClumsy
##                        url                    location verified                    name
## 1: https://t.co/YYh4VgyBb5               New York City    FALSE            JoeDiStefano
## 2:                                                <NA>    FALSE                 Terry C
## 3:                                        Brooklyn, NY    FALSE                    Rome
## 4: https://t.co/rOKJphoB8a                 Savanna, OK    FALSE                     Rev
## 5: https://t.co/RsXvJJukeu                  Dallas, TX    FALSE Licensed Esthetician 💫
## 6: https://t.co/eOUsgq1ivj San Bernardino, California     FALSE               Valeria🍑
##    protected public_metrics_user.followers_count public_metrics_user.following_count
## 1:     FALSE                                3928                                3729
## 2:     FALSE                                 427                                1818
## 3:     FALSE                                 454                                 817
## 4:     FALSE                                1729                                1279
## 5:     FALSE                                 553                                 433
## 6:     FALSE                                 333                                 277
##    public_metrics_user.tweet_count public_metrics_user.listed_count          created_at_user
## 1:                           20025                                0 2008-11-16T16:01:46.000Z
## 2:                           10807                                0 2017-11-01T13:54:10.000Z
## 3:                           17424                                1 2017-02-11T21:10:38.000Z
## 4:                           25945                               13 2016-05-14T21:26:02.000Z
## 5:                           72101                                4 2011-11-29T00:30:13.000Z
## 6:                           23701                                3 2013-07-11T23:41:24.000Z
##    entities_user.url.urls entities_user.description.hashtags
## 1:      <data.frame[1x5]>                          <list[0]>
## 2:              <list[0]>                          <list[0]>
## 3:              <list[0]>                          <list[0]>
## 4:      <data.frame[1x5]>                          <list[0]>
## 5:      <data.frame[1x5]>                          <list[0]>
## 6:      <data.frame[1x5]>                          <list[0]>
##    entities_user.description.mentions entities_user.description.urls
## 1:                          <list[0]>                      <list[0]>
## 2:                          <list[0]>                      <list[0]>
## 3:                          <list[0]>                      <list[0]>
## 4:                  <data.frame[1x3]>                      <list[0]>
## 5:                  <data.frame[1x3]>                      <list[0]>
## 6:                          <list[0]>                      <list[0]>
##    entities_user.description.cashtags     pinned_tweet_id
## 1:                          <list[0]> 1290689717680439297
## 2:                          <list[0]>                <NA>
## 3:                          <list[0]> 1383871080004128770
## 4:                          <list[0]> 1410620364128989186
## 5:                          <list[0]> 1308493736813514753
## 6:                          <list[0]> 1402011359429087259
##                                                                                                                                        description
## 1:                                                                                                                        Hungry, Q List Celebrity
## 2:                                                                                                        She's an emotional support dog...I swear
## 3:                                                                                   bitch, hold on… let me take off my glasses. I can’t hear you.
## 4:               Community Manager for @lootcrate\nIndie Fanatic. Sometimes Content Creator.\n Email: reverendsven.ttv@gmail.com\nViews Are My Own
## 5: Lashes so long when I bat my eyes you get whip lash. Angel Quartz Beauty 💖 🌸Certified MUA🌸 License Esthetician✨👼 Insta: @angelquartzbeauty
## 6:                                                                                                                          27 ♒️🇲🇽 IG📸:valclumsy
##    withheld.country_codes possibly_sensitive entities_tweet.mentions entities_tweet.urls
## 1:                                     FALSE               <list[0]>           <list[0]>
## 2:                                     FALSE       <data.frame[3x4]>           <list[0]>
## 3:                                     FALSE               <list[0]>   <data.frame[1x5]>
## 4:                                     FALSE               <list[0]>           <list[0]>
## 5:                                     FALSE               <list[0]>           <list[0]>
## 6:                                     FALSE               <list[0]>           <list[0]>
##    entities_tweet.annotations entities_tweet.hashtags entities_tweet.cashtags
## 1:                  <list[0]>               <list[0]>               <list[0]>
## 2:                  <list[0]>               <list[0]>               <list[0]>
## 3:                  <list[0]>               <list[0]>               <list[0]>
## 4:          <data.frame[1x5]>               <list[0]>               <list[0]>
## 5:                  <list[0]>               <list[0]>               <list[0]>
## 6:          <data.frame[1x5]>               <list[0]>               <list[0]>
##                 source in_reply_to_user_id
## 1: Twitter for Android                <NA>
## 2:  Twitter for iPhone  822491301484576772
## 3:  Twitter for iPhone                <NA>
## 4: Twitter for Android                <NA>
## 5: Twitter for Android                <NA>
## 6:  Twitter for iPhone                <NA>
##                                                                                                                                                                                                                                text
## 1:                                                                                                                                                                Thanks for wearing that mask, I'd hate to catch your racism . . .
## 2:                                                                                                                     @tburages @tedlieu @realDonaldTrump Projection is an interesting tactic often used to mask ones own behavior
## 3: Niggas really think it’s cool to smell like backwoods, weed, and shakedatass. Nah bruh, fix yo self up and buy some cologne. \n\nAnd don’t try to mask yo stank with cologne either. Take a shower bruh. https://t.co/q6VgE0XQ84
## 4:                                                                                                               I got foam, elastic, a foam head, heat, paint... the new Zeeboh Mask is coming and that means more Zeeboh content.
## 5:                                                                                                                                                       It's so boring when people dont get jelly masks now I'm like bleh reg mask
## 6:                                                                                                                                          I bought a wristlet off of Poshmark and the seller shipped me a note and a face mask 💞
##        conversation_id     geo.place_id geo.coordinates.coordinates geo.coordinates.type
## 1: 1233904988767866880 00c39537733fa112                   <list[0]>                 <NA>
## 2: 1233597848752517121 0a0de7bd49ef942d                   <list[0]>                 <NA>
## 3: 1233905677187219456 24a53a1880093fa9                   <list[0]>                 <NA>
## 4: 1233906675624660992 d1bb2686669067d6                   <list[0]>                 <NA>
## 5: 1233908180897468416 352cf4e7314da0b4                   <list[0]>                 <NA>
## 6: 1233908728983744512 36ac79e68ace76e4                   <list[0]>                 <NA>
##            created_at_tweet public_metrics_tweet.retweet_count
## 1: 2020-03-01T00:00:48.000Z                                  0
## 2: 2020-03-01T00:03:04.000Z                                  0
## 3: 2020-03-01T00:03:32.000Z                                  0
## 4: 2020-03-01T00:07:31.000Z                                  0
## 5: 2020-03-01T00:13:29.000Z                                  0
## 6: 2020-03-01T00:15:40.000Z                                  0
##    public_metrics_tweet.reply_count public_metrics_tweet.like_count
## 1:                                1                               4
## 2:                                0                               0
## 3:                                0                               1
## 4:                                0                               2
## 5:                                0                               0
## 6:                                0                               0
##    public_metrics_tweet.quote_count referenced_tweets lang attachments.media_keys
## 1:                                0         <list[0]>   en              <list[0]>
## 2:                                0 <data.frame[1x2]>   en              <list[0]>
## 3:                                0 <data.frame[1x2]>   en              <list[0]>
## 4:                                0         <list[0]>   en              <list[0]>
## 5:                                0         <list[0]>   en              <list[0]>
## 6:                                0         <list[0]>   en              <list[0]>
##    attachments.poll_ids
## 1:                     
## 2:                     
## 3:                     
## 4:                     
## 5:                     
## 6:                     
##                                                                                                                           strip_ubio
## 1:                                                                                                           hungry q list celebrity
## 2:                                                                                            she s an emotional support dog i swear
## 3:                                                                         bitch hold on let me take off my glasses i cant hear you 
## 4:         community manager for lootcrate indie fanatic sometimes content creator email reverendsven ttv gmail com views are my own
## 5: lashes so long when i bat my eyes you get whip lash angel quartz beauty certified mua license esthetician insta angelquartzbeauty
## 6:                                                                                                                   27 ig valclumsy
##    words.x words.y words
## 1:    <NA>    <NA>  <NA>
## 2:    <NA>    <NA>  <NA>
## 3:    <NA>    <NA>  <NA>
## 4:    <NA>    <NA>  <NA>
## 5:    <NA>    <NA>  <NA>
## 6:    <NA>    <NA>  <NA>

pre-process the tokens and build the LDA (Latent Dirichlet Allocation) topic model;
this should give a new col with the topic assignment

m2mTcorpFix$preprocess(use_stemming = T, remove_stopwords=T, min_docfreq = 1000)
m2mLDA = m2mTcorpFix$lda_fit('feature', create_feature = 'topic', K = 5, alpha = 0.001)
## 30 rows in the dtm are empty. These have been deleted
## 30 columns in the dtm are empty. These have been deleted

look at new tokens, first 15 tokens

head(m2mTcorpFix$tokens, 15)
##                  doc_id token_id           token feature topic
##  1: 1233904988767866880        1          thanks    <NA>    NA
##  2: 1233904988767866880        2             for    <NA>    NA
##  3: 1233904988767866880        3         wearing    wear     1
##  4: 1233904988767866880        4            that    <NA>    NA
##  5: 1233904988767866880        5            mask    mask     1
##  6: 1233904988767866880        6               i    <NA>    NA
##  7: 1233904988767866880        7               d    <NA>    NA
##  8: 1233904988767866880        8            hate    <NA>    NA
##  9: 1233904988767866880        9              to    <NA>    NA
## 10: 1233904988767866880       10           catch    <NA>    NA
## 11: 1233904988767866880       11            your    <NA>    NA
## 12: 1233904988767866880       12          racism    <NA>    NA
## 13: 1233905558245101568        1        tburages    <NA>    NA
## 14: 1233905558245101568        2         tedlieu    <NA>    NA
## 15: 1233905558245101568        3 realdonaldtrump    <NA>    NA

browse topics

m2m_topics = browse_texts(m2mTcorpFix, category='topic', view=T)

loaded as HTML
only the first 500 texts are shown

work on dictionaries to set up for network analysis

library(qdapDictionaries)
library(qdapRegex)
## 
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:rlang':
## 
##     %|%
## The following object is masked from 'package:generics':
## 
##     explain
## The following object is masked from 'package:termco':
## 
##     as_count
## The following object is masked from 'package:dplyr':
## 
##     explain
## The following object is masked from 'package:ggplot2':
## 
##     %+%
library(quanteda.sentiment)
## 
## Attaching package: 'quanteda.sentiment'
## The following object is masked from 'package:quanteda':
## 
##     data_dictionary_LSD2015
library(sentiment)
## Loading required package: rjson
## Loading required package: plyr
## ---------------------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ---------------------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
## The following object is masked from 'package:purrr':
## 
##     compact
library(syuzhet)
## 
## Attaching package: 'syuzhet'
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:spacyr':
## 
##     get_tokens
dict_nrc <- quanteda.sentiment::data_dictionary_NRC
dict_lsd <- quanteda.sentiment::data_dictionary_LSD2015
dict_afinn <- quanteda.sentiment::data_dictionary_AFINN
dict_polar <- quanteda.sentiment::data_dictionary_geninqposneg

now try hits

dict_nrc <- dictionary(dict_nrc)
dict_lsd <- dictionary(dict_lsd)
dict_afinn <- dictionary(dict_afinn)
dict_polar <- dictionary(dict_polar)
nrc_hits = search_dictionary(m2mTcorpFix, dict_nrc)
lsd_hits = search_dictionary(m2mTcorpFix, dict_lsd)
afinn_hits = search_dictionary(m2mTcorpFix, dict_afinn)
polar_hits = search_dictionary(m2mTcorpFix, dict_polar)

use igraph and ggplot2 to graph the semantic network
separate 1 network per sentiment type

net 1 = NRC conditional probability of co-occurrence

library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:class':
## 
##     knn
## The following object is masked from 'package:rlang':
## 
##     is_named
## The following objects are masked from 'package:generics':
## 
##     components, union
## The following objects are masked from 'package:dials':
## 
##     degree, neighbors
## The following object is masked from 'package:textshape':
## 
##     ends
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
m2mNRC_net <- semnet(nrc_hits, measure = 'con_prob')

get adjacency for nrc net list

adjNRC <- igraph::get.adjacency(m2mNRC_net, attr = 'weight')

plot nrc

plot(adjNRC)


not sure what to make of this plot
the example I used plotted “hits” so let’s try that even though it did not
seem to make sense

plot(nrc_hits)


it worked, try to plot the conditional probability net

plot(m2mNRC_net)


shows the same net; actually seems less helpful than the first plot
redo the first plot for all of the other sentiment dictionaries
lsd hits

plot(lsd_hits)

afinn hits ***** DID NOT WORK

plot(afinn_hits)

polarity hits

plot(polar_hits)


now have two nets confirming that tweet sentiment toward masks is more positive than negative,
still close, but definitely more positive

print adjacency

adjNRC
## 10 x 10 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 10 column names 'anger', 'anticipation', 'disgust' ... ]]
##                                                                                               
## anger        .         0.3923227 0.1554977 0.2092822 0.1489915 0.3591412 0.4101063 0.013662980
## anticipation 0.2636642 .         0.1612010 0.2098819 0.1700918 0.3626294 0.4439586 0.011514357
## disgust      0.2866853 0.4422231 .         0.2355058 0.1539384 0.4626150 0.4474210 0.010795682
## fear         0.3056699 0.4561292 0.1865695 .         0.1542604 0.4016471 0.4551790 0.010136205
## joy          0.2574963 0.4374063 0.1443028 0.1825337 .         0.3365817 0.4419040 0.017616192
## negative     0.2766917 0.4157059 0.1933166 0.2118630 0.1500418 .         0.4215539 0.009857978
## positive     0.2711500 0.4367651 0.1604531 0.2060510 0.1690565 0.3617723 .         0.011471179
## sadness      0.3519553 0.4413408 0.1508380 0.1787709 0.2625698 0.3296089 0.4469274 .          
## surprise     0.2800718 0.4829443 0.3429084 0.2396768 0.1597846 0.4560144 0.4245961 0.007181329
## trust        0.2748049 0.4704571 0.1900780 0.2619844 0.1705686 0.4007804 0.4777035 0.011705686
##                                  
## anger        0.06766428 0.1069182
## anticipation 0.07841423 0.1230141
## disgust      0.15273890 0.1363455
## fear         0.08457396 0.1488755
## joy          0.06671664 0.1146927
## negative     0.08487886 0.1201337
## positive     0.06782334 0.1228850
## sadness      0.04469274 0.1173184
## surprise     .          0.1929982
## trust        0.11984392 .

plot semnet

plot_semnet(m2mNRC_net)


try to subset by topic
I already know I have 5 topics

m2mTcorpFix_s1 = m2mTcorpFix$subset(topic==1)
m2mTcorpFix_s2 = m2mTcorpFix$subset(topic==2)
m2mTcorpFix_s3 = m2mTcorpFix$subset(topic==3)
m2mTcorpFix_s4 = m2mTcorpFix$subset(topic==4)
m2mTcorpFix_s5 = m2mTcorpFix$subset(topic==5)

hits per topic

T1nrc_hits = search_dictionary(m2mTcorpFix_s1, dict_nrc)
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names = check.names, :
## Item 1 has 0 rows but longest item has 2; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names = check.names, :
## Item 3 has 0 rows but longest item has 2; filled with NA
## Warning in as.data.table.list(x, keep.rownames = keep.rownames, check.names = check.names, :
## Item 4 has 0 rows but longest item has 2; filled with NA
## (the same three warnings repeat verbatim for each search_dictionary() call below; omitted)
T1lsd_hits = search_dictionary(m2mTcorpFix_s1, dict_lsd)
T1afinn_hits = search_dictionary(m2mTcorpFix_s1, dict_afinn)
T1polar_hits = search_dictionary(m2mTcorpFix_s1, dict_polar)
T2nrc_hits = search_dictionary(m2mTcorpFix_s2, dict_nrc)
T2lsd_hits = search_dictionary(m2mTcorpFix_s2, dict_lsd)
T2afinn_hits = search_dictionary(m2mTcorpFix_s2, dict_afinn)
T2polar_hits = search_dictionary(m2mTcorpFix_s2, dict_polar)
T3nrc_hits = search_dictionary(m2mTcorpFix_s3, dict_nrc)
T3lsd_hits = search_dictionary(m2mTcorpFix_s3, dict_lsd)
T3afinn_hits = search_dictionary(m2mTcorpFix_s3, dict_afinn)
T3polar_hits = search_dictionary(m2mTcorpFix_s3, dict_polar)
T4nrc_hits = search_dictionary(m2mTcorpFix_s4, dict_nrc)
T4lsd_hits = search_dictionary(m2mTcorpFix_s4, dict_lsd)
T4afinn_hits = search_dictionary(m2mTcorpFix_s4, dict_afinn)
T4polar_hits = search_dictionary(m2mTcorpFix_s4, dict_polar)
T5nrc_hits = search_dictionary(m2mTcorpFix_s5, dict_nrc)
T5lsd_hits = search_dictionary(m2mTcorpFix_s5, dict_lsd)
T5afinn_hits = search_dictionary(m2mTcorpFix_s5, dict_afinn)
T5polar_hits = search_dictionary(m2mTcorpFix_s5, dict_polar)

remove all of the topic-subset objects, this was not a good way to go

rm(m2mTcorpFix_s1, m2mTcorpFix_s2, m2mTcorpFix_s3, m2mTcorpFix_s4, m2mTcorpFix_s5)
rm(T1nrc_hits, T1lsd_hits, T1afinn_hits, T1polar_hits,
   T2nrc_hits, T2lsd_hits, T2afinn_hits, T2polar_hits,
   T3nrc_hits, T3lsd_hits, T3afinn_hits, T3polar_hits,
   T4nrc_hits, T4lsd_hits, T4afinn_hits, T4polar_hits,
   T5nrc_hits, T5lsd_hits, T5afinn_hits, T5polar_hits)

NOW transform the t corpus back to a dfm
use the existing LDA, set the topic output from the LDA as a new col for the new dfm
determined doc freq using BigBaby clean N=19732, 1% of the N is approx 200

dfm_maskTcorp <- get_dfm(m2mTcorpFix, 
                         feature="token",
                         context_level = c("document")) 

dfm is still not working

switch gears, tidy the Gibbs LDA

m2m_topics <- tidy(m2mLDA, matrix = "beta")
m2m_topics
## # A tibble: 105 × 3
##    topic term    beta
##    <int> <chr>  <dbl>
##  1     1 amp   0.0285
##  2     2 amp   0.0259
##  3     3 amp   0.0114
##  4     4 amp   0.0484
##  5     5 amp   0.0352
##  6     1 can   0.0288
##  7     2 can   0.0263
##  8     3 can   0.0378
##  9     4 can   0.0257
## 10     5 can   0.0318
## # … with 95 more rows
m2m_topics_topT <- m2m_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 5) %>% 
  ungroup() %>%
  arrange(topic, -beta)

m2m_topics_topT %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()


ok, now I know the n in the above code was the number of terms; set a higher n

m2m_topics_topT <- m2m_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 20) %>% 
  ungroup() %>%
  arrange(topic, -beta)

m2m_topics_topT %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()


try more words, see if context is better

m2m_topics_topT <- m2m_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 50) %>% 
  ungroup() %>%
  arrange(topic, -beta)

m2m_topics_topT %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()


do some comparison between topics to see diff in beta for words per topic

beta_wide <- m2m_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  pivot_wider(names_from = topic, values_from = beta) %>% 
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_wide
## # A tibble: 21 × 7
##    term  topic1 topic2 topic3 topic4 topic5 log_ratio
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>     <dbl>
##  1 amp   0.0285 0.0259 0.0114 0.0484 0.0352   -0.136 
##  2 can   0.0288 0.0263 0.0378 0.0257 0.0318   -0.130 
##  3 dont  0.0253 0.0258 0.0241 0.0238 0.0157    0.0329
##  4 face  0.0552 0.0533 0.0550 0.0502 0.0589   -0.0509
##  5 get   0.0369 0.0388 0.0320 0.0472 0.0334    0.0742
##  6 glove 0.0300 0.0342 0.0354 0.0286 0.0372    0.187 
##  7 go    0.0286 0.0358 0.0282 0.0346 0.0340    0.320 
##  8 im    0.0300 0.0321 0.0179 0.0279 0.0310    0.0953
##  9 just  0.0327 0.0354 0.0364 0.0360 0.0325    0.116 
## 10 like  0.0230 0.0353 0.0278 0.0297 0.0281    0.615 
## # … with 11 more rows
plot(beta_wide)


works but not what I was going for plot-wise
no code with sample plot so winging it here
try ggplot of beta wide

beta_wide %>%
  ggplot(aes(log_ratio,term)) +
  geom_col() +
  geom_line()
## geom_path: Each group consists of only one observation. Do you need to adjust the group
## aesthetic?
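A sketch of a cleaner take on the same idea: drop geom_line(), which needs multiple points per group, and order the terms by log ratio so the bars read as a ranking.

beta_wide %>%
  mutate(term = reorder(term, log_ratio)) %>%  # order terms by log ratio
  ggplot(aes(log_ratio, term)) +
  geom_col()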


try fewer words

m2m_topics_topT <- m2m_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>% 
  ungroup() %>%
  arrange(topic, -beta)

m2m_topics_topT %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

try building a t corpus from the topics tibble

m2mTopicsCorp <- create_tcorpus(m2m_topics, 
                              doc_column = 'topic',
                              text_columns = 'term')
## Error in create_tcorpus.character(text, doc_id = doc_id, meta = x[, !colnames(x) %in% : doc_id should not contain duplicate values
m2mTopicsCorp
## Error in eval(expr, envir, enclos): object 'm2mTopicsCorp' not found
nrc_hits2 = search_dictionary(m2mTopicsCorp, dict_nrc)
## Error in class(x)[1] %in% c("tCorpus"): object 'm2mTopicsCorp' not found
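A hedged sketch of one way past the duplicate-id error, assuming a unique id per topic-term pair is acceptable (dplyr:: prefix because plyr masks mutate):

m2m_topics_uid <- m2m_topics %>%
  dplyr::mutate(doc_uid = paste(topic, term, sep = "_"))  # unique doc id per row
m2mTopicsCorp <- create_tcorpus(m2m_topics_uid,
                                doc_column = 'doc_uid',
                                text_columns = 'term')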

ending
notes for follow-up:
* topic info is stored in the fixed t corpus; this means the hits account for the topics somewhere, need to figure out where/how (see the sketch below)
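One place to look, as a sketch: lda_fit() wrote a 'topic' column into $tokens (visible in the head() output above), so per-document topic counts can be tabulated directly (dplyr:: prefixes because plyr masks count):

doc_topics <- m2mTcorpFix$tokens %>%
  dplyr::filter(!is.na(topic)) %>%  # keep only tokens with a topic assignment
  dplyr::count(doc_id, topic)       # topic token counts per document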

filter by topic

nrc_pos <- get_sentiments("nrc") %>% 
  filter(sentiment == "positive")

m2m_topic1 <- m2m_topics_topT %>%
  filter(topic == 1) %>%
  inner_join(nrc_pos)
## Error: `by` must be supplied when `x` and `y` have no common variables.
## ℹ use by = character()` to perform a cross-join.
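The join fails because m2m_topics_topT names the column "term" while the NRC tibble names it "word"; a minimal fix is to name the keys explicitly:

m2m_topic1 <- m2m_topics_topT %>%
  filter(topic == 1) %>%
  inner_join(nrc_pos, by = c("term" = "word"))  # match term to NRC word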
m2m_topic2 <- m2m_topics_topT %>%
  filter(topic == 2)

m2m_topic3 <- m2m_topics_topT %>%
  filter(topic == 3)

m2m_topic4 <- m2m_topics_topT %>%
  filter(topic == 4)

m2m_topic5 <- m2m_topics_topT %>%
  filter(topic == 5)

**The documents per topic are what I want next, to link this with the 1 tweet = 1 case
level

code for swap function fix if needed

swap <- function(x, i) { x[i, ] <- x[i, c(2,1)]; return (x) }  # swap the two columns within rows i (assumes a two-column x)

code for the vader dictionary call and reformat to quanteda; the quanteda step
using the dictionary() command still needs to be fixed

url <- "https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt"
vader <- read_delim(url, col_names=c("word","sentiment", "details"),  col_types="cdc",  delim="\t")

head(vader)
## # A tibble: 6 × 4
##   word     sentiment details X4                                      
##   <chr>        <dbl> <chr>   <chr>                                   
## 1 $:            -1.5 0.80623 [-1, -1, -1, -1, -3, -1, -3, -1, -2, -1]
## 2 %)            -0.4 1.0198  [-1, 0, -1, 0, 0, -2, -1, 2, -1, 0]     
## 3 %-)           -1.5 1.43178 [-2, 0, -2, -2, -1, 2, -2, -3, -2, -3]  
## 4 &-:           -0.4 1.42829 [-3, -1, 0, 0, -1, -1, -1, 2, -1, 2]    
## 5 &:            -0.7 0.64031 [0, -1, -1, -1, 1, -1, -1, -1, -1, -1]  
## 6 ( '}{' )       1.6 0.66332 [1, 2, 2, 1, 1, 2, 2, 1, 3, 1]
vader_dict <- dictionary(list(vader))
## Error in paste(unlist(dict, recursive = TRUE), collapse = " "): result would exceed 2^31-1 bytes
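A minimal sketch of that quanteda step, assuming it is enough to split the VADER words on the sign of their score (quanteda dictionaries hold word lists, not numeric weights):

vader_dict <- dictionary(list(
  positive = vader$word[vader$sentiment > 0],  # positively scored words
  negative = vader$word[vader$sentiment < 0]   # negatively scored words
))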