R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(SocialMediaLab)
## Warning: package 'SocialMediaLab' was built under R version 3.4.3
library(magrittr)

library(tm)
## Loading required package: NLP
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.4.3
library(stringr)
# Extract: candidate NSF campaign videos (view and comment counts as noted at extraction time);
# only the 2006 long version has enough comments to analyse, so it is the one collected below.
# https://www.youtube.com/watch?v=xK1Qf0MTIRU  # 2009: 1,645 views, 1 comment
#   "Very helpful. I feel much more aware of how to recognise a stroke. Thanks!"
# https://www.youtube.com/watch?v=RBaMgsSKzCc  # 2010: 7,946 views, 0 comments
# https://www.youtube.com/watch?v=MmoGeCXovJ8  # 2011: 4,755 views, 0 comments
#   "Everyone needs to know this."
# https://www.youtube.com/watch?v=27pbdKLOHNU  # 2013: 479 views, 0 comments
# https://www.youtube.com/watch?v=xA-P5voEik8  # 2015: 60 views, 0 comments
# https://www.youtube.com/watch?v=YHzz2cXBlGk  # 2006: Stroke Heroes, long version
videoIDs <- c("YHzz2cXBlGk")  # 123 comments, 406,253 views as at 2/2/18

# Collection step (requires a YouTube Data API key in `apiKey`); the call below
# collects the comments with SocialMediaLab and writes them to file:
# g_youtube_actor <- Authenticate("youtube", apiKey = apiKey) %>%
#   Collect(videoIDs = videoIDs, writeToFile = TRUE) %>%
#   Create("Actor")

# Read the previously collected SocialMediaLab output instead of re-collecting
df <- read.csv("Feb_01_1_49_59 PM_2018_AEDT_YoutubeData.csv", stringsAsFactors = FALSE)

# Drop rows with an empty Comment field
toRemove <- which(df$Comment == "")

if (length(toRemove) > 0) {
  df <- df[-toRemove, ]
}
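
As an optional sanity check, confirm that no empty comments remain and note how many rows are left:

dim(df)                  # rows and columns remaining after the filter
sum(df$Comment == "")    # should be 0 once empty comments are removed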

# Build a corpus from the comment text and normalise it
keywords <- df$Comment
keywords <- iconv(keywords, to = "utf-8")
myCorpus <- VCorpus(VectorSource(keywords))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"), lazy = TRUE)
myCorpus <- tm_map(myCorpus, stripWhitespace, lazy = TRUE)

# Document-term matrix keeping terms of 3-20 characters, then dropping terms
# absent from more than 95% of documents
dtm <- DocumentTermMatrix(myCorpus, control = list(wordLengths = c(3, 20)))
dtm <- removeSparseTerms(dtm, 0.95)

# Term-document matrix with the same word-length bounds (wordLengths replaces the
# older minWordLength/maxWordLength control options)
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(3, 20)))

inspect(dtm[1:5,5:10])
## <<DocumentTermMatrix (documents: 5, terms: 6)>>
## Non-/sparse entries: 3/27
## Sparsity           : 90%
## Maximal term length: 5
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs just know like love song still
##    1    0    0    0    1    0     0
##    2    0    0    0    0    0     0
##    3    0    0    0    1    0     0
##    4    0    0    0    0    0     0
##    5    2    0    0    0    0     0
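
A quick way to see which terms survive the length and sparsity filters is tm's findFreqTerms(); the frequency threshold of 5 below is illustrative:

# Terms occurring at least 5 times across the retained comments
findFreqTerms(dtm, lowfreq = 5)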

Matrix conversion: the term-document matrix is converted to an ordinary matrix so word frequencies can be tallied, written to file, and plotted.

# Convert to a matrix and sum term counts across all comments
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

write.csv(m, file = "youtube_strokeheroes.csv")  # write the term matrix to the working directory
head(d, 10)
##          word freq
## stroke stroke   22
## like     like   14
## video   video   13
## song     song   10
## just     just    8
## fast     fast    7
## first   first    7
## good     good    7
## know     know    7
## love     love    7
# Bar chart of the 20 most frequent words
barplot(d[1:20, ]$freq, las = 2, names.arg = d[1:20, ]$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")

# Word cloud of the most frequent terms
library(wordcloud)
## Loading required package: RColorBrewer
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
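
To keep the cloud as a standalone image outside the knitted document, it can be written to a file with a base graphics device; the file name and size below are illustrative:

png("stroke_heroes_wordcloud.png", width = 800, height = 800)  # illustrative name and size
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
dev.off()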