Laboratory of Microbial Genomics and Big Data (강원대학교 미생물유전체빅데이터 연구실)

R: Text Mining - Text File - Corpus, Wordcloud - by Eun Bae Kim (08/22/2018)
 Visits : 497,894 ( Your IP 3.137.198.37 )
 

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
 100: 
 101: 
 102: 
 103: 
 104: 
 105: 
 106: 
 107: 
 108: 
 109: 
 110: 
 111: 
 112: 
 113: 
 114: 
# install.packages("RISmed")
# install.packages("tm")
library(RISmed)
require(tm)

# Set the working directory (relative paths such as the output file
# "Text_Freq.txt" written at the end of the script resolve against it).
# The path below is specific to the original author's machine, so it is only
# applied when it exists; otherwise the current working directory is kept
# instead of stopping the script with a "cannot change working directory" error.
workDir <- "C:/Users/ebkim/Desktop"
if (dir.exists(workDir)) {
  setwd(workDir)
} else {
  message("Directory '", workDir, "' not found; staying in ", getwd())
}

# Reference: http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know

# Input text: readLines() returns a character vector with one element per line.
# file <- "BiologicalText.txt"                    # For your local files.
file <- "http://itanimal.ipdisk.co.kr/web_laboratory/research_r_data/BiologicalText.txt"
text <- readLines(file)
# text <- readLines(file.choose())                # You may open a target text file.


# Build a corpus from the character vector (1 line = 1 document)
corpText <- Corpus(VectorSource(text))

# Show the raw corpus before any cleaning
inspect(corpText)

# Helper transformation: substitute every match of `pattern` with a single space
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Replace the special symbols /, @ and | with spaces
# (the pipe is escaped because the pattern is a regular expression)
for (symbolPattern in c("/", "@", "\\|")) {
  corpText <- tm_map(corpText, toSpace, symbolPattern)
}
inspect(corpText)

# Lower case
corpText <- tm_map(corpText, content_transformer(tolower))
inspect(corpText)

# Remove numbers
corpText <- tm_map(corpText, removeNumbers)
inspect(corpText)

# Remove common English stopwords, then display the list that was applied
englishStopwords <- stopwords("english")
corpText <- tm_map(corpText, removeWords, englishStopwords)
englishStopwords
inspect(corpText)

# Remove user-defined stopwords
userStopwords <- c("xxx", "blabla")
corpText <- tm_map(corpText, removeWords, userStopwords)
inspect(corpText)

# Remove punctuation
corpText <- tm_map(corpText, removePunctuation)
inspect(corpText)

# Collapse extra whitespace
corpText <- tm_map(corpText, stripWhitespace)
inspect(corpText)

# Text stemming (removing -ing, -ed, -er, -ful, ...)
corpText <- tm_map(corpText, stemDocument)
inspect(corpText)





# Word frequencies via a Term-Document Matrix (terms in rows, documents in columns)
tdm <- TermDocumentMatrix(corpText)
mat <- as.matrix(tdm)                # Dense matrix copy of the TDM
head(mat)

# Per-term totals (row sums) and per-document totals (column sums)
termTotals <- rowSums(mat)
termTotals
colSums(mat)

# Most frequent terms first
wordsSorted <- sort(termTotals, decreasing = TRUE)
head(wordsSorted, n = 20)

# Two-column table: the term and its total count
dfWordFreq <- data.frame(
  Word = names(wordsSorted),
  Freq = wordsSorted
)
head(dfWordFreq, 20)

# Sorting by using a Document-Term Matrix (documents in rows, terms in columns)
# library() is used instead of require(): require() only warns and returns FALSE
# when the package is missing, so the failure would surface later at col_sums().
library(slam)
dtm <- DocumentTermMatrix(corpText)
mat <- as.matrix(dtm)                      # Dense matrix copy, for inspection only
head(mat)
freq <- col_sums(dtm)                      # Get frequency for each word
words <- colnames(dtm)                     # Get all words to be processed
dfWordFreq <- data.frame(Word = words, Freq = freq)
freqOrder <- order(dfWordFreq[, "Freq"], decreasing = TRUE)
dfWordFreq <- dfWordFreq[freqOrder, ]      # Sorting: most frequent words first
head(dfWordFreq, 20)


#install.packages("SnowballC")
#install.packages("RColorBrewer")
#install.packages("wordcloud")
library(RColorBrewer)
library(wordcloud)

# Drawing a wordcloud
# Pick 8 colors from the "Dark2" ColorBrewer palette and display them
colorsSelected <- brewer.pal(8, "Dark2")
colorsSelected

# Plot the words from the frequency table using the selected colors
wordcloud(
  words = dfWordFreq$Word,
  freq = dfWordFreq$Freq,
  min.freq = 1,
  max.words = 150,
  random.order = FALSE,
  rot.per = 0.35,
  colors = colorsSelected
)
# For more options, type "help(wordcloud)" in R.
# rot.per: proportion of words drawn with a 90-degree rotation
# There is another library package for wordcloud. Refer to the following link.
# https://cran.r-project.org/web/packages/wordcloud2/vignettes/wordcloud.html


# Frequency Table
# Rebuild the term totals from a Term-Document Matrix of the cleaned corpus
tdm <- TermDocumentMatrix(corpText)
mat <- as.matrix(tdm)                # Dense matrix copy of the TDM
head(mat)

# Totals per term (rows) and per document (columns)
termTotals <- rowSums(mat)
termTotals
colSums(mat)

# Order terms from most to least frequent
wordsSorted <- sort(termTotals, decreasing = TRUE)
head(wordsSorted, n = 20)

# Table with one row per term: the term and its total count
dfWordFreq <- data.frame(
  Word = names(wordsSorted),
  Freq = wordsSorted
)
head(dfWordFreq, 20)

# Save the keyword frequency table as a tab-separated text file.
# row.names = FALSE: the row names of dfWordFreq merely repeat the Word column,
# and writing them would leave the header line one field shorter than the data
# lines (write.table adds no column name for the row-name column).
write.table(dfWordFreq, file = "Text_Freq.txt", sep = "\t", row.names = FALSE)



Kangwon National University