1:
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12:
13:
14:
15:
16:
17:
18:
19:
20:
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93:
94:
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
|
# install.packages("RISmed")
# install.packages("tm")
library(RISmed)
require(tm)
# ---- Search configuration ----------------------------------------------
# All relative output paths below resolve against this directory.
setwd("C:/Users/ebkim/Desktop")

# PubMed search expression; adapt it to your own topic, for example:
# query <- "additive OR (lactic AND acid AND bacteria) OR additive"
query <- "probiotic AND cancer"

# First and last publication year, handed to the search as mindate / maxdate.
# NOTE(review): YYYY/MM/DD, YYYY/MM or plain YYYY are presumably all
# accepted - confirm against the E-utilities documentation.
iYear1 <- 2016
iYear2 <- 2018

# Output file names (not referenced in this part of the script; presumably
# used further down when results are saved - verify).
sSaveFile <- "pig_additive_2016_to_2018_Freq.txt"
sSaveAbstract <- "pig_additive_2016_to_2018_Abstract.txt"

# ---- Run the search ----------------------------------------------------
oSearch <- EUtilsSummary(
  query,
  type    = "esearch",
  db      = "pubmed",
  mindate = iYear1,
  maxdate = iYear2,
  retmax  = 99999999
)

# Number of documents matched by the query.
QueryCount(oSearch)

# Download the matching records from the NCBI web server (this can be slow).
oRec <- EUtilsGet(oSearch)
# Publication year information
# Tabulate the publication year of every fetched record and print the raw
# years, the per-year counts, and the counts as a data frame.
years <- YearPubmed(oRec)
years
yearsTable <- table(years)
yearsTable
yearsTable.Sorted <- sort(yearsTable, decreasing = TRUE)
yearsTableDf <- as.data.frame(yearsTable)
yearsTableDf

# Upper axis limit: never below the former fixed value of 200, but extended
# to the largest yearly count so tall bars do not run past the axis.
yearsAxisMax <- max(200, yearsTable)

# Three panels side by side: two colour variants of the chronological counts
# and one panel with the years ordered by decreasing count.
par(mfrow = c(1, 3))
barplot(yearsTable, ylim = c(0, yearsAxisMax),
        col = c('red', 'green', 'blue'))
barplot(yearsTable, ylim = c(0, yearsAxisMax),
        col = c('#ff9900', '#99ff99', '#33ccff'))
barplot(yearsTable.Sorted, ylim = c(0, yearsAxisMax),
        col = c('#ff9900', '#99ff99', '#33ccff'))
# Baseline; it is drawn on the most recent (third) panel only.
abline(h = 0, lty = 1, col = 'black')
# Journal information
# Count the records per journal (as reported by MedlineTA) and print the
# counts unsorted and sorted by decreasing frequency.
journals <- MedlineTA(oRec)
journalsTable <- table(journals)
journalsTable
journalsTable.Sorted <- sort(journalsTable, decreasing = TRUE)
journalsTable.Sorted

# Keep the ten most frequent journals. The index is bounded by the table
# length so that fewer than ten journals do not produce NA entries.
nTopJournals <- min(10, length(journalsTable.Sorted))
journalsTable.Sorted10 <- journalsTable.Sorted[seq_len(nTopJournals)]
journalsTable.Sorted10
journalsTable.Sorted10Df <- as.data.frame(journalsTable.Sorted10)
journalsTable.Sorted10Df

# Count-axis limit: never below the former fixed value of 16, but extended
# to the largest count so long bars do not run past the axis.
journalsAxisMax <- max(16, journalsTable.Sorted10)

par(mfrow = c(1, 2))

# Left panel: horizontal bars; wide left margin for the journal names.
par(mar = c(3, 12, 3, 3))
barplot(journalsTable.Sorted10, horiz = TRUE, xlim = c(0, journalsAxisMax),
        col = c('#ff9900', '#99ff99', '#33ccff'),
        cex.names = 0.8, names.arg = names(journalsTable.Sorted10), las = 1)
abline(v = 0, lty = 1, col = 'black')

# Right panel: vertical bars; tall bottom margin for the rotated names.
par(mar = c(12, 3, 3, 3))
barplot(journalsTable.Sorted10, horiz = FALSE, ylim = c(0, journalsAxisMax),
        col = c('#ff9900', '#99ff99', '#33ccff'),
        cex.names = 0.8, names.arg = names(journalsTable.Sorted10), las = 2)
abline(h = 0, lty = 1, col = 'black')
# Abstract handling
# Extract the abstract text of every record and preview the first ten.
# head() is used instead of [1:10] so that fewer than ten records do not
# yield padded NA entries.
abstracts <- AbstractText(oRec)
head(abstracts, 10)

# Save the abstract text, one vector element per line. Passing the path
# directly lets writeLines() open and close the file itself, so no
# connection is left open if the write fails.
writeLines(abstracts, "Abstract.txt")
# ---- Corpus construction -----------------------------------------------
# Wrap the abstracts in a tm corpus; the corpus is inspected after every
# step below so the effect of each transformation can be followed.
corpText <- Corpus(VectorSource(abstracts))
inspect(corpText)

# ---- Symbol clean-up ---------------------------------------------------
# toSpace replaces every match of a pattern with a single blank.
toSpace <- content_transformer(function(txt, sym) gsub(sym, " ", txt))
corpText <- tm_map(corpText, toSpace, "/")
corpText <- tm_map(corpText, toSpace, "@")
corpText <- tm_map(corpText, toSpace, "\\|")
inspect(corpText)

# ---- Normalisation -----------------------------------------------------
# Convert everything to lower case.
corpText <- tm_map(corpText, content_transformer(tolower))
inspect(corpText)

# Drop digits.
corpText <- tm_map(corpText, removeNumbers)
inspect(corpText)

# Drop the common English stop words; the list itself is printed for
# reference.
corpText <- tm_map(corpText, removeWords, stopwords("english"))
stopwords("english")
inspect(corpText)

# Drop user-defined stop words (extend this vector as needed).
corpText <- tm_map(corpText, removeWords, c("xxx", "blabla"))
inspect(corpText)

# Drop punctuation marks.
corpText <- tm_map(corpText, removePunctuation)
inspect(corpText)

# Collapse runs of whitespace.
corpText <- tm_map(corpText, stripWhitespace)
inspect(corpText)

# Stem the words (strips endings such as -ing, -ed, -er, -ful, ...).
corpText <- tm_map(corpText, stemDocument)
inspect(corpText)
|