R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Work from the project folder so the relative input path below resolves.
# NOTE(review): a hard-coded setwd() makes the script non-portable; some
# pubmed.mineR helpers may also write result files into the working
# directory, so confirm that before replacing this with a project-relative
# path.
setwd("~/DataMining/Atherosclerosis/")
library(pubmed.mineR)
library(lsa)
## Loading required package: SnowballC
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(DT)
## Warning: package 'DT' was built under R version 3.4.3

# PubMed export to analyse; fail early with a clear message if it is missing
# instead of erroring from inside the parser.
abstracts_file <- "pubmed_athero_result.txt"
if (!file.exists(abstracts_file)) {
  stop("PubMed result file not found: ", abstracts_file, call. = FALSE)
}
abstracts <- readabs(abstracts_file)
# presumably the PubMed search terms behind the export:
# atherosclerosis + inflammation + plaque rupture
pmids <- abstracts@PMID
# Gene table and word table derived from the abstracts; the gene table is
# summarised below and its first column is reused later as a term list.
gene <- gene_atomization(abstracts)
words <- word_atomizations(abstracts)
summary(gene)
##   Gene_symbol
##  ABCA1  : 1  
##  ABCG1  : 1  
##  ACE    : 1  
##  ACE2   : 1  
##  ADAM10 : 1  
##  ADAM17 : 1  
##  (Other):63  
##                                                           Genes 
##  absent in melanoma 2                                      : 1  
##  ADAM metallopeptidase domain 10                           : 1  
##  ADAM metallopeptidase domain 17                           : 1  
##  amyloid beta (A4) precursor protein                       : 1  
##  angiotensin I converting enzyme (peptidyl-dipeptidase A) 1: 1  
##  angiotensin I converting enzyme (peptidyl-dipeptidase A) 2: 1  
##  (Other)                                                   :63  
##       Freq   
##  1      :21  
##  2      :12  
##  4      : 8  
##  3      : 7  
##  6      : 6  
##  7      : 4  
##  (Other):11
# Abstract texts; only referenced by the commented-out SentenceToken() call.
abstract.bodies <- abstracts@Abstract
# SentenceToken(abstract.bodies[1])
Sathero <- searchabsL(abstracts)
Con <- Find_conclusion(abstracts) # conclusion of abstracts

# Curated list of disease-related terms to count in the abstracts.
tdm_wordsA <- c(
  "inflammation", "atherosclerosis", "carotid", "rupture", "estrogen",
  "testosterone", "statin", "IMT", "shear stress", "plaque", "systolic",
  "vulnerable", "calcification", "ROS", "annexin", "apoptosis",
  "endothelium", "EPC", "thrombotic", "macrophage", "collagen",
  "fibroblast", "foam cell", "smooth muscle", "monocyte", "chemokine",
  "antibody", "homocysteine", "insulin", "mitochondrial", "angiogenesis"
) # "T cell"

# Gene symbols are taken from the first column of the gene table. Coerce to
# character explicitly: if that column is a factor, c() below would splice in
# the integer level codes instead of the symbols.
tdm_wordsG <- as.character(gene[, 1])

# Merge the two term lists (curated words first, then gene symbols).
tdm_wordsAG <- c(tdm_wordsA, tdm_wordsG)

# Create the term-document matrix for the merged term list.
tdmAG <- tdm_for_lsa(Sathero, tdm_wordsAG)

# Sum each term's row and rank terms by total count for the word cloud.
m <- as.matrix(tdmAG)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# NOTE(review): the single-letter gene symbol "T" tops this ranking, which
# looks like spurious matching rather than real gene mentions - consider
# excluding very short symbols from tdm_wordsG.
head(d, 10)
##                            word freq
## T                             T 3522
## plaque                   plaque 2761
## atherosclerosis atherosclerosis 1054
## rupture                 rupture  939
## inflammation       inflammation  935
## macrophage           macrophage  506
## vulnerable           vulnerable  287
## carotid                 carotid  262
## CRP                         CRP  219
## monocyte               monocyte  183

Including Plots

You can also embed plots, for example:

# Seed the RNG so any random choices made while drawing are reproducible.
set.seed(1234)
# Draw the cloud from the ranked term table: at most 200 terms, every term
# with at least one hit is eligible, and terms are plotted in frequency
# order rather than at random.
wordcloud(
  words = d[["word"]],
  freq = d[["freq"]],
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)