Perl: URL Download, Genus/Species Counting - by Eun Bae Kim (08/21/2018)
 

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
  
use strict;
use LWP::Simple;

# Download each WGS file listed below into a local folder and report failures.
my $iCnt = 0;
# URLs for target files. qw() splits on whitespace, so no quotation marks are needed.
# NOTE(review): the first URL ends in ".gzz"; presumably a deliberately wrong URL
# to exercise the failure branch below -- confirm before changing it.
my @aUrls = qw (
	ftp://ftp.ncbi.nlm.nih.gov/sra/wgs_aux/AZ/FR/AZFR01/AZFR01.1.fsa_nt.gzz
	ftp://ftp.ncbi.nlm.nih.gov/sra/wgs_aux/AV/AI/AVAI01/AVAI01.1.fsa_nt.gz
	ftp://ftp.ncbi.nlm.nih.gov/sra/wgs_aux/AX/DQ/AXDQ01/AXDQ01.1.fsa_nt.gz
);

# Create the download folder with the built-in mkdir instead of shelling out:
# no external command is needed, an already existing folder is not an error,
# and a real failure (e.g. permissions) stops the script with a clear message.
my $sDownFolder = "Download_WGS";
unless (-d $sDownFolder) {
	mkdir($sDownFolder) or die "Cannot create folder ".$sDownFolder.": ".$!."\n";
}
print "--------------------------------------------\n";
foreach my $sCurUrl (@aUrls) {
	$iCnt++;
	# The local file keeps the name of the last path component of the URL.
	my $sLocalFile = $sDownFolder."/".funcFileNameFromPath($sCurUrl);
	print "Target ".$iCnt."\t".$sLocalFile."\n";

	# getstore() does not throw on a bad URL; it returns a status code
	# that has to be inspected with is_error().
	my $oRespond = getstore($sCurUrl, $sLocalFile);
	if (is_error($oRespond)) {
		print "    Downloading failed: ".$sCurUrl."\n";
		print "    Error Message: ".$oRespond."\n";
	}
}

# Return the file-name part of a "/"-separated path or URL.
#
# Parameter: $sFilePath - path or URL string.
# Returns:   everything after the last "/"; the whole string when it contains
#            no "/"; an empty string when the path ends with "/".
sub funcFileNameFromPath {
	my $sFilePath = shift;

	# rindex() finds the last slash directly, so the string does not have to
	# be reversed twice and the prematch variable $` is not needed.
	my $iLastSlash = rindex($sFilePath, "/");
	if ($iLastSlash < 0) {
		return $sFilePath;
	}
	return substr($sFilePath, $iLastSlash + 1);
}





# Prokaryotic Nomenclature - Database
# https://www.dsmz.de/support/bacterial-nomenclature-up-to-date-downloads.html
# https://www.dsmz.de/fileadmin/Bereiche/ChiefEditors/BacterialNomenclature/DSMZ_bactnames.xlsx

# Download the prokaryote name table and report whether it succeeded.
my $sUrl = "http://lab.gutdesigner.com/research_perl_data/ProkaryotesNames.txt";
my $sSaveFile = "ProkaryotesNames.txt";
# getstore() does not throw on a bad URL; it returns a status code, so keep it.
my $iNamesStatus = getstore($sUrl, $sSaveFile);

print "--------------------------------------------\n";
print "Target URL: ".$sUrl."\n";
if (is_error($iNamesStatus)) {
	# Checking only for the file's existence would report success for a
	# stale copy left by an earlier run, so the status is checked first.
	print "    Downloading failed: ".$sUrl."\n";
	print "    Error Message: ".$iNamesStatus."\n";
} elsif (-f $sSaveFile) {                       # If the file exists
	print "Downloaded File:".$sSaveFile."\n";
}
print "\n";


# Open the file and collect the distinct genus names.
# The genus is the first tab-separated field of each line; using it as a
# hash key removes duplicates.
my %hGenus_by_Genus = ();
{
	# Three-argument open with a lexical handle: the file name can never be
	# interpreted as a mode, and the handle is scoped to this block.
	open(my $fhNames, "<", $sSaveFile)
		or die "Cannot open ".$sSaveFile.": ".$!."\n";
	my $sSkippedLine = <$fhNames>;              # Read the first line and do not use it.
	while (my $sLine = <$fhNames>) {
		$sLine =~ s/\n//g;
		$sLine =~ s/\r//g;

		my @aLine = split("\t", $sLine);
		my $sCurGenus = $aLine[0];
		# A blank line or an empty first field would otherwise be stored
		# under the key "" and counted as one extra genus.
		next if (!defined($sCurGenus) || $sCurGenus eq "");
		$hGenus_by_Genus{$sCurGenus} = $sCurGenus;
	}
	close($fhNames);
}

# Show the first 20 genus names and the total count twice:
# first in hash order, then sorted lexically.
my @aGenusNames = keys(%hGenus_by_Genus);
my $iGenusCnt = 0;
print "--------------------------------------------\n";
for my $sName (@aGenusNames) {
	$iGenusCnt++;
	next if ($iGenusCnt > 20);                  # Keep counting, but print only 20 rows.
	printf("%-2s %-20s\n", $iGenusCnt, $sName);
}
print "-----------------------------------\n";
print "The number of Genera: ".$iGenusCnt."\n";

print "--------------------------------------------\n";
@aGenusNames = sort @aGenusNames;               # Default sort order is lexical (string comparison).
$iGenusCnt = 0;
for my $sName (@aGenusNames) {
	$iGenusCnt++;
	next if ($iGenusCnt > 20);
	printf("%-2s %-20s\n", $iGenusCnt, $sName);
}
print "-----------------------------------\n";
print "The number of Genera: ".$iGenusCnt."\n";



# Open the file and collect the distinct binomial names ("Genus species").
# The first two tab-separated fields of each line are joined with a space;
# using the result as a hash key removes duplicates.
my %hBiName_by_BiName = ();
{
	# Three-argument open with a lexical handle: the file name can never be
	# interpreted as a mode, and the handle is scoped to this block.
	open(my $fhNames, "<", $sSaveFile)
		or die "Cannot open ".$sSaveFile.": ".$!."\n";
	my $sSkippedLine = <$fhNames>;              # Read the first line and do not use it.
	while (my $sLine = <$fhNames>) {
		$sLine =~ s/\n//g;
		$sLine =~ s/\r//g;

		my @aLine = split("\t", $sLine);
		my $sCurGenus = $aLine[0];
		my $sCurSpec  = $aLine[1];
		# Skip rows without a species field (genus-only rows, blank lines);
		# the undefined case is checked explicitly instead of relying on
		# undef comparing equal to "".
		next if (!defined($sCurSpec) || $sCurSpec eq "");
		# A row with an empty genus cannot form a valid binomial name.
		next if (!defined($sCurGenus) || $sCurGenus eq "");

		my $sCurBiName = $sCurGenus." ".$sCurSpec;
		$hBiName_by_BiName{$sCurBiName} = $sCurBiName;
	}
	close($fhNames);
}

# Show the first 20 binomial names in lexical order, then the total count.
print "--------------------------------------------\n";
my $iBiNameCnt = 0;
my @aBiNames = keys(%hBiName_by_BiName);
@aBiNames = sort {$a cmp $b} @aBiNames;  # Sorting lexically
foreach my $sCurBiName (@aBiNames) {
	$iBiNameCnt++;
	if ($iBiNameCnt <= 20) {
		printf("%-2s %-20s\n", $iBiNameCnt, $sCurBiName);
	}
}
print "-----------------------------------\n";
# This total counts binomial (species) names, so the label says "Species";
# it previously repeated the genus label from the section above.
print "The number of Species: ".$iBiNameCnt."\n";


# For more information, refer to the following link
# https://www.tutorialspoint.com/perl