# Laboratory of Microbial Genomics and Big Data, Kangwon National University (강원대학교 미생물유전체빅데이터 연구실)

# install.packages("RISmed")
# install.packages("tm")
library(RISmed)
require(tm)

# Set the working directory.
# The path below only exists on the original author's machine. Switch to it
# when it is present; otherwise stay in the current working directory and say
# so, instead of stopping the script with a "cannot change working directory"
# error. Relative output paths are resolved against whichever directory ends
# up active. local() keeps the helper variable out of the global environment.
invisible(local({
  targetDir <- "C:/Users/ebkim/Desktop"
  if (dir.exists(targetDir)) {
    setwd(targetDir)
  } else {
    message("Working directory not found: ", targetDir,
            " - staying in ", getwd())
  }
}))

# Reference: http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know

# Input text, read line by line into a character vector.
# NOTE: the names `file` and `text` also exist as base/graphics functions;
# calls such as file(...) still resolve to the functions, but keep this in mind.
# file <- "BiologicalText.txt"                    # Use this for a local copy instead.
file <- "http://itanimal.ipdisk.co.kr/web_laboratory/research_r_data/BiologicalText.txt"
text <- readLines(con = file)
# text <- readLines(file.choose())                # Or pick the target file interactively.


# Build a corpus from the character vector (1 line = 1 document)
corpText <- Corpus(VectorSource(text))

# Print the corpus to check what was loaded
inspect(corpText)

# Text processing - blank out special symbols (/, @, |).
# toSpace wraps gsub() as a content transformer so tm_map() can apply it;
# every match of `pattern` is replaced by a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Apply toSpace once per symbol, feeding each result into the next call.
# "\\|" is escaped because a bare "|" means alternation in a regex.
corpText <- Reduce(
  function(corpus, symbol) tm_map(corpus, toSpace, symbol),
  c("/", "@", "\\|"),
  corpText
)
inspect(corpText)

# Normalization steps. inspect() is called after every step so the effect of
# that step on the corpus is printed.

# Step 1: convert to lower case
corpText <- tm_map(corpText, content_transformer(tolower))
inspect(corpText)

# Step 2: remove numbers
corpText <- tm_map(corpText, removeNumbers)
inspect(corpText)

# Step 3: remove common English stopwords, then print the list that was used
corpText <- tm_map(corpText, removeWords, stopwords("english"))
stopwords("english")
inspect(corpText)

# Step 4: remove user-defined stopwords (edit this vector as needed)
corpText <- tm_map(corpText, removeWords, c("xxx", "blabla"))
inspect(corpText)

# Step 5: remove punctuation
corpText <- tm_map(corpText, removePunctuation)
inspect(corpText)

# Step 6: collapse extra whitespace
corpText <- tm_map(corpText, stripWhitespace)
inspect(corpText)

# Step 7: stem words (strip endings such as -ing, -ed, -er, -ful, ...)
# NOTE(review): presumably needs the SnowballC package to be installed - confirm.
corpText <- tm_map(corpText, stemDocument)
inspect(corpText)





# Word frequencies via a Term-Document Matrix (terms in rows, documents in columns)
tdm <- TermDocumentMatrix(corpText)
mat <- as.matrix(tdm)                # Plain matrix so rowSums()/colSums() apply
head(mat)
rowSums(mat)                         # Printed: total per term
colSums(mat)                         # Printed: total per document

# Per-term totals, most frequent first
wordsSorted <- sort(rowSums(mat), decreasing = TRUE)
head(wordsSorted, 20)

# Two-column table: the term (taken from the vector names) and its total
dfWordFreq <- data.frame(Word = names(wordsSorted), Freq = wordsSorted)
head(dfWordFreq, n = 20)

# Word frequencies via a Document-Term Matrix (documents in rows, terms in columns).
# Overwrites `mat` and `dfWordFreq` from any earlier step.

# library() stops immediately with a clear error when slam is not installed;
# require() would only return FALSE and let col_sums() fail later.
library(slam)

dtm <- DocumentTermMatrix(corpText)
mat <- as.matrix(dtm)                      # Plain matrix, only used for the preview below
head(mat)

freq  <- col_sums(dtm)                     # Per-term totals, computed on dtm directly (not on mat)
words <- colnames(dtm)                     # The terms, in the same column order as freq
dfWordFreq <- data.frame(Word = words, Freq = freq)

# Reorder the rows so the most frequent terms come first
freqOrder  <- order(dfWordFreq[, "Freq"], decreasing = TRUE)
dfWordFreq <- dfWordFreq[freqOrder, ]
head(dfWordFreq, 20)


#install.packages("SnowballC")
#install.packages("RColorBrewer")
#install.packages("wordcloud")
library(RColorBrewer)
library(wordcloud)

# Draw a word cloud from the current dfWordFreq table (columns Word and Freq)
colorsSelected <- brewer.pal(8, "Dark2")   # Colours for the word cloud
colorsSelected                             # Print the selected colour codes
wordcloud(
  words        = dfWordFreq[["Word"]],
  freq         = dfWordFreq[["Freq"]],
  min.freq     = 1,
  max.words    = 150,
  random.order = FALSE,
  rot.per      = 0.35,
  colors       = colorsSelected
)
# For more options, type "help(wordcloud)" in R.
# rot.per: proportion words with 90 degree rotation
# There is another library package for wordcloud. Refer to the following link.
# https://cran.r-project.org/web/packages/wordcloud2/vignettes/wordcloud.html


# Frequency table, rebuilt from a fresh Term-Document Matrix.
# This overwrites tdm, mat, wordsSorted and dfWordFreq.
tdm <- TermDocumentMatrix(corpText)
mat <- as.matrix(tdm)                # Plain matrix so rowSums()/colSums() apply
head(mat)

rowSums(mat)                         # Printed: total per term
colSums(mat)                         # Printed: total per document

# Per-term totals in decreasing order
wordsSorted <- sort(rowSums(mat), decreasing = TRUE)
head(wordsSorted, 20)

# Table of terms (from the vector names) and their totals
dfWordFreq <- data.frame(Word = names(wordsSorted), Freq = wordsSorted)
head(dfWordFreq, n = 20)

# Save the keyword-frequency table as a tab-separated text file.
# The relative path is resolved against the current working directory;
# row names are written as the first field of every data row.
write.table(
  x         = data.frame(dfWordFreq),
  file      = "Text_Freq.txt",
  sep       = "\t",
  row.names = TRUE
)