Laboratory of Microbial Genomics and Big Data (강원대학교 미생물유전체빅데이터 연구실)

R: Text Mining - PubMed Medline - Downloading Abstracts - by Eun Bae Kim (08/22/2018)
 Visits : 497,881 ( Your IP 3.145.64.210 )
 

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
 100: 
 101: 
 102: 
 103: 
 104: 
 105: 
 106: 
 107: 
 108: 
# One-time installation (uncomment if the packages are not installed yet):
# install.packages("RISmed")
# install.packages("tm")

# library() stops immediately with a clear error when a package is missing;
# require() would only warn and return FALSE, so the failure would surface
# later as a confusing "could not find function" error.
library(RISmed)  # EUtilsSummary(), EUtilsGet(), and the record accessors used below
library(tm)      # Corpus(), tm_map(), and the text transformations used below

# Set the working directory.
# The path is specific to the original author's machine, so only switch to it
# when it actually exists; otherwise keep the current directory instead of
# aborting the whole script with "cannot change working directory".
sWorkDir <- "C:/Users/ebkim/Desktop"
if (dir.exists(sWorkDir)) {
  setwd(sWorkDir)
} else {
  message("Working directory '", sWorkDir, "' not found; staying in ", getwd())
}

# Make your own query for your purposes
# query <- "additive OR (lactic AND acid AND bacteria) OR additive"
query <- "probiotic AND cancer"

# Publication-year window used for the search.
iYear1 <- 2016
iYear2 <- 2018

# Output file names. The year part is derived from iYear1/iYear2 so the names
# stay in sync when the window is changed.
# NOTE(review): the "pig_additive" prefix does not match the current query
# ("probiotic AND cancer"); it is kept unchanged -- rename if desired.
sSaveFile     <- sprintf("pig_additive_%d_to_%d_Freq.txt", iYear1, iYear2)
sSaveAbstract <- sprintf("pig_additive_%d_to_%d_Abstract.txt", iYear1, iYear2)


# Run the PubMed search for `query`, bounded by mindate/maxdate.
# Dates may be given as YYYY/MM/DD, YYYY/MM or just YYYY.
# NOTE(review): retmax is set very high to request every hit; confirm that the
# NCBI server still accepts values this large.
oSearch <- EUtilsSummary(
  query,
  type    = "esearch",
  db      = "pubmed",
  mindate = iYear1,
  maxdate = iYear2,
  retmax  = 99999999
)


# Number of documents matched by the search
QueryCount(oSearch)

# Download the records described by oSearch.
# Fetching the documents from the NCBI web server can take a while.
oRec <- EUtilsGet(oSearch)


# Publication year information: count records per year and plot the counts.
years <- YearPubmed(oRec)
years
yearsTable <- table(years)
yearsTable
yearsTable.Sorted <- sort(yearsTable, decreasing = TRUE)
yearsTableDf <- as.data.frame(yearsTable)
yearsTableDf

# Upper y-axis limit: at least 200 (the original fixed value), but enlarged
# when a year has more records so that tall bars are not clipped.
yearsYMax <- max(200, yearsTable)

# Three panels side by side: two colour schemes in chronological order,
# then the same counts sorted by decreasing frequency.
par(mfrow = c(1, 3))
barplot(yearsTable, ylim = c(0, yearsYMax),
        col = c('red', 'green', 'blue'))
barplot(yearsTable, ylim = c(0, yearsYMax),
        col = c('#ff9900', '#99ff99', '#33ccff'))
barplot(yearsTable.Sorted, ylim = c(0, yearsYMax),
        col = c('#ff9900', '#99ff99', '#33ccff'))
# Baseline at zero (drawn on the last panel only, as before)
abline(h = 0, lty = 1, col = 'black')



# Journal information: count records per journal and plot the top 10.
journals <- MedlineTA(oRec)
journalsTable <- table(journals)
journalsTable
journalsTable.Sorted <- sort(journalsTable, decreasing = TRUE)
journalsTable.Sorted

# Keep at most the 10 most frequent journals. Indexing with 1:10 would pad the
# result with NA entries when fewer than 10 journals are present.
journalsTopN <- min(10, length(journalsTable.Sorted))
journalsTable.Sorted10 <- journalsTable.Sorted[seq_len(journalsTopN)]
journalsTable.Sorted10
journalsTable.Sorted10Df <- as.data.frame(journalsTable.Sorted10)
journalsTable.Sorted10Df

# Count-axis limit: at least 16 (the original fixed value), enlarged when the
# most frequent journal has more records so that its bar is not clipped.
journalsCountMax <- max(16, journalsTable.Sorted10)

par(mfrow = c(1, 2))

# Left panel: horizontal bars; wide left margin leaves room for journal names.
par(mar = c(3, 12, 3, 3))
barplot(journalsTable.Sorted10, horiz = TRUE, xlim = c(0, journalsCountMax),
        col = c('#ff9900', '#99ff99', '#33ccff'),
        cex.names = 0.8, names.arg = names(journalsTable.Sorted10), las = 1)
abline(v = 0, lty = 1, col = 'black')

# Right panel: vertical bars; tall bottom margin for the rotated names.
par(mar = c(12, 3, 3, 3))
barplot(journalsTable.Sorted10, horiz = FALSE, ylim = c(0, journalsCountMax),
        col = c('#ff9900', '#99ff99', '#33ccff'),
        cex.names = 0.8, names.arg = names(journalsTable.Sorted10), las = 2)
abline(h = 0, lty = 1, col = 'black')




# Abstract handling: extract the abstract text of every fetched record and
# preview up to the first 10 (head() does not pad with NA when fewer exist).
abstracts <- AbstractText(oRec)
head(abstracts, 10)

# Save the abstract text, one abstract per line.
# NOTE(review): sSaveAbstract is defined near the top of the script but this
# step writes to the fixed name "Abstract.txt"; the file name is kept as-is --
# confirm which one is intended.
fileConn <- file("Abstract.txt", open = "w")
tryCatch(
  writeLines(abstracts, fileConn),
  finally = close(fileConn)  # always release the connection, even on error
)


# Corpus handling: turn the abstracts into a tm corpus and clean it step by
# step. The corpus is inspected after every step so each effect can be seen.
corpText <- Corpus(VectorSource(abstracts))    # one element of `abstracts` per document
inspect(corpText)

# Step 1 - replace special symbols (/, @, |) with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
for (sSymbol in c("/", "@", "\\|")) {
  corpText <- tm_map(corpText, toSpace, sSymbol)
}
inspect(corpText)

# Step 2 - convert to lower case
corpText <- tm_map(corpText, content_transformer(tolower))
inspect(corpText)

# Step 3 - remove numbers
corpText <- tm_map(corpText, removeNumbers)
inspect(corpText)

# Step 4 - remove common English stopwords (the list is printed for reference)
corpText <- tm_map(corpText, removeWords, stopwords("english"))
stopwords("english")
inspect(corpText)

# Step 5 - remove user-defined stopwords
corpText <- tm_map(corpText, removeWords, c("xxx", "blabla"))
inspect(corpText)

# Step 6 - remove punctuation
corpText <- tm_map(corpText, removePunctuation)
inspect(corpText)

# Step 7 - collapse extra whitespace
corpText <- tm_map(corpText, stripWhitespace)
inspect(corpText)

# Step 8 - stem words (strip suffixes such as -ing, -ed, -er, -ful, ...)
corpText <- tm_map(corpText, stemDocument)
inspect(corpText)



Kangwon National University