Perl: GenBank Handling (File, Tabulate), glob (File Array) - by Eun Bae Kim (08/21/2018)
 

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
  
use strict;
use warnings;
use LWP::Simple;

# Download PubMed records (MEDLINE text format) for one search topic, one
# publication year at a time, using the NCBI E-utilities:
#   1. esearch with usehistory=y returns an XML summary that contains the
#      number of hits (<Count>) and a history-server handle (<WebEnv>).
#   2. efetch uses that WebEnv to retrieve the records themselves.
# Files written under $sDownFolder for every year YYYY:
#   PubMed_Year_YYYY.xml                    (esearch result)
#   PubMed_Year_YYYY_Abstract_Medline.txt   (efetch result)

my $iDate1 = 2015;                   # First publication year (inclusive)
my $iDate2 = 2018;                   # Last publication year (inclusive)
my $sTopic = "pig+AND+probiotic";    # Search term, already URL-encoded ("+" stands for a space)

my $sDownFolder = "Download_PubMed";
unless (-d $sDownFolder) {               # If the folder does not exist.
	# Built-in mkdir: no shell involved, and a failure is reported instead
	# of being silently ignored (nothing could be saved without the folder).
	mkdir($sDownFolder)
		or die "Cannot create folder '".$sDownFolder."': ".$!."\n";
}

# Reference: https://www.ncbi.nlm.nih.gov/books/NBK25501
# Reference: https://dataguide.nlm.nih.gov/eutilities/utilities.html#esearch
# https://www.ncbi.nlm.nih.gov/pubmed/29975997?report=medline&format=text
#     --> One document (medline format)

# The E-utilities are served over HTTPS, so request it directly.
# With LWP this needs the LWP::Protocol::https module to be installed.
my $sEutilsBase = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";

print "--------------------------------------------\n";
for my $iYear ($iDate1 .. $iDate2) {
	print "Year: ".$iYear."\n";
	my $sUrl = $sEutilsBase."/esearch.fcgi?".
	           "db=pubmed&term=".$sTopic."+".$iYear."[pdat]".    # pdat --> YYYY/MM/DD or YYYY/MM or YYYY
	           "&RetMax=999999999&usehistory=y";
	my $sSaveFile = $sDownFolder."/PubMed_Year_".$iYear.".xml";

	print "URL: ".$sUrl."\n";
	print "XML: ".$sSaveFile."\n";

	# getstore() returns the HTTP status code; check it so that a failed
	# search is not parsed as if it had succeeded.
	my $iStatus = getstore($sUrl, $sSaveFile);
	unless (is_success($iStatus)) {
		print "Search: Failed!! (HTTP status ".$iStatus.")\n";
		print "-----------------------------------\n";
		next;
	}

	# Read the XML file once and pick up both the number of documents and
	# the WebEnv ID. Only the first occurrence of each tag is used, and the
	# opening and closing tag must be on the same line.
	my $iDocuCnt   = 0;
	my $sWebEnv    = "";
	my $bHasCount  = 0;
	my $bHasWebEnv = 0;
	my $hIn;
	unless (open($hIn, "<", $sSaveFile)) {
		print "Cannot read ".$sSaveFile.": ".$!."\n";
		print "-----------------------------------\n";
		next;
	}
	while (my $sLine = <$hIn>) {
		if (!$bHasCount && $sLine =~ m{<Count>(.*?)</Count>}) {
			$iDocuCnt  = $1;
			$bHasCount = 1;
		}
		if (!$bHasWebEnv && $sLine =~ m{<WebEnv>(.*?)</WebEnv>}) {
			$sWebEnv    = $1;
			$bHasWebEnv = 1;
		}
		last if ($bHasCount && $bHasWebEnv);
	}
	close($hIn);

	print "The Number of Document: ".$iDocuCnt."\n" if ($bHasCount);
	print "WebEnv ID: ".$sWebEnv."\n"               if ($bHasWebEnv);

	# Without a WebEnv ID there is nothing stored on the server to fetch.
	unless ($bHasWebEnv) {
		print "Downloaded: Failed!!\n";
		print "-----------------------------------\n";
		next;
	}

	# Download a medline file containing publication records by using the WebEnv ID.
	# query_key=1 refers to the first (and only) query stored in this WebEnv.
	# NOTE(review): NCBI may cap retmax per efetch request; very large result
	# sets would then need paging with retstart -- confirm against the
	# E-utilities documentation.
	$sUrl = $sEutilsBase."/efetch.fcgi?".
	        "retmode=text&rettype=medline&retstart=0&retmax=999999999".
	        "&db=pubmed&query_key=1&WebEnv=".$sWebEnv;
	$sSaveFile = $sDownFolder."/PubMed_Year_".$iYear."_Abstract_Medline.txt";
	$iStatus = getstore($sUrl, $sSaveFile);

	print "URL: ".$sUrl."\n";
	# Judge success by the HTTP status as well as the file: the file alone
	# may be left over from an earlier run.
	if (is_success($iStatus) && -f $sSaveFile) {
		print "Downloaded:".$sSaveFile."\n";
	} else {
		print "Downloaded: Failed!!\n";
	}
	print "-----------------------------------\n";
}
print "Download completed~!!!\n";





# For more information, refer to the following link
# https://www.tutorialspoint.com/perl