# install.packages("RISmed")
# install.packages("tm")
library(RISmed) # RISmed: retrieves PubMed records (optional here; see the sketch below readLines)
library(tm)     # tm: text-mining framework (corpus, transformations, term matrices)
# Set the working directory
setwd("C:/Users/ebkim/Desktop")
# Reference: http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know
# file = "BiologicalText.txt" # For your local files.
file = "http://itanimal.ipdisk.co.kr/web_laboratory/research_r_data/BiologicalText.txt"
text = readLines(file)
# text <- readLines(file.choose()) # You may open a target text file.
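# Optional sketch (not part of the original workflow): since RISmed is loaded above, the
# abstracts could also be pulled straight from PubMed instead of a text file. The query
# string and the retmax value below are assumptions; adjust them to your own search.
# pubmedSearch = EUtilsSummary("animal biotechnology", type = "esearch", db = "pubmed", retmax = 50)
# pubmedRecords = EUtilsGet(pubmedSearch)
# text = AbstractText(pubmedRecords) # character vector of abstracts, usable in place of readLines()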
# Text conversion to a corpus
corpText = Corpus(VectorSource(text))
# Inspect the corpus text --> 1 line = 1 document
inspect(corpText)
# Text processing - Replace special symbols (/, @, |) with spaces
toSpace = content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpText = tm_map(corpText, toSpace, "/")
corpText = tm_map(corpText, toSpace, "@")
corpText = tm_map(corpText, toSpace, "\\|")
inspect(corpText)
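# Optional sketch, following the same content_transformer() pattern as toSpace above:
# strip URLs and e-mail addresses before further cleaning. The regular expressions are
# assumptions and may need tuning for your own text.
# removeURL = content_transformer(function(x) gsub("http[[:alnum:][:punct:]]*", " ", x))
# removeMail = content_transformer(function(x) gsub("[[:alnum:]._-]+@[[:alnum:].-]+", " ", x))
# corpText = tm_map(corpText, removeURL)
# corpText = tm_map(corpText, removeMail)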
corpText = tm_map(corpText, content_transformer(tolower)) # Lower Case
inspect(corpText)
corpText = tm_map(corpText, removeNumbers) # Remove numbers
inspect(corpText)
corpText = tm_map(corpText, removeWords, stopwords("english")) # Remove common stopwords in English
stopwords("english")
inspect(corpText)
corpText = tm_map(corpText, removeWords, c("xxx", "blabla")) # Remove user-defined stopwords ("xxx" and "blabla" are placeholders)
inspect(corpText)
corpText = tm_map(corpText, removePunctuation) # Remove punctuation
inspect(corpText)
corpText = tm_map(corpText, stripWhitespace) # Remove extra spaces
inspect(corpText)
corpText = tm_map(corpText, stemDocument) # Text stemming: reduce words to their stems (strip -ing, -ed, -er, -ful, ...); requires the SnowballC package
inspect(corpText)
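# Optional sketch: stemming can leave hard-to-read stems (e.g. "studi", "biolog").
# tm's stemCompletion() can map stems back to full words; here the dictionary is rebuilt
# from the raw text, and the example stems are hypothetical placeholders.
# dictWords = unique(unlist(strsplit(tolower(removePunctuation(text)), "\\s+")))
# stemCompletion(c("studi", "biolog"), dictionary = dictWords, type = "prevalent")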
# Word frequencies via a Term-Document Matrix (terms as rows), sorted in decreasing order
tdm = TermDocumentMatrix(corpText)
mat = as.matrix(tdm) # Conversion to a matrix
head(mat)
rowSums(mat); colSums(mat) # total frequency per term; total word count per document
wordsSorted = sort(rowSums(mat), decreasing=TRUE)
head(wordsSorted, n = 20)
dfWordFreq = data.frame(Word = names(wordsSorted), Freq = wordsSorted)
head(dfWordFreq, 20)
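# A quick bar chart of the most frequent terms (base-R sketch added here for illustration;
# showing the top 10 is an arbitrary choice).
barplot(dfWordFreq$Freq[1:10],
        names.arg = as.character(dfWordFreq$Word[1:10]),
        las = 2, # draw the term labels vertically
        main = "Top 10 most frequent terms")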
# The same word frequencies via a Document-Term Matrix (documents as rows)
library(slam) # slam provides col_sums() for sparse matrices
dtm = DocumentTermMatrix(corpText)
mat = as.matrix(dtm)
head(mat)
freq = col_sums(dtm) # Get frequency for each word
words = colnames(dtm) # Get all words to be processed
dfWordFreq = data.frame(Word = words, Freq = freq)
freqOrder = order(dfWordFreq[, "Freq"], decreasing = TRUE)
dfWordFreq = dfWordFreq[freqOrder, ] # Sorting
head(dfWordFreq, 20)
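# Optional checks on the document-term matrix (sketch; the thresholds 5 and 0.95 are
# arbitrary assumptions, not values from the original analysis):
# findFreqTerms(dtm, lowfreq = 5)       # terms that appear at least 5 times in total
# removeSparseTerms(dtm, sparse = 0.95) # drop terms missing from more than 95% of documents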
#install.packages("SnowballC")
#install.packages("RColorBrewer")
#install.packages("wordcloud")
library(RColorBrewer)
library(wordcloud)
# Drawing a wordcloud
colorsSelected = brewer.pal(8, "Dark2"); colorsSelected # Select colors for wordcloud
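# Fixing the random seed keeps the word-cloud layout reproducible (optional; the seed value is arbitrary).
set.seed(1234)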
wordcloud(words = dfWordFreq$Word,
freq = dfWordFreq$Freq,
min.freq = 1,
max.words = 150,
random.order = FALSE,
rot.per = 0.35,
colors = colorsSelected
)
# For more options, type "help(wordcloud)" in R.
# rot.per: proportion of words drawn with a 90-degree rotation
# The wordcloud2 package offers an alternative way to draw word clouds. Refer to the following link.
# https://cran.r-project.org/web/packages/wordcloud2/vignettes/wordcloud.html
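# Minimal wordcloud2 sketch (assumes the wordcloud2 package is installed; the data frame is
# renamed to the word/freq columns that wordcloud2 expects):
# install.packages("wordcloud2")
# library(wordcloud2)
# wordcloud2(data.frame(word = dfWordFreq$Word, freq = dfWordFreq$Freq))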
# Frequency Table
# (dfWordFreq created above already holds the term/frequency table, sorted by frequency)
# Save the keyword frequency table as a tab-delimited text file
write.table(dfWordFreq, row.names = TRUE, file = "Text_Freq.txt", sep = "\t")
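# Optional sketch: the cleaned corpus text can be saved as well, one document per line.
# as.character() on each corpus document returns its processed text.
# writeLines(sapply(corpText, as.character), "Text_Clean.txt")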