Perl: Regular Expression (Basics and Applications) - by Eun Bae Kim (08/03/2018)
 

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
  
use strict;

my $sSeq = "ATGCGTACGTGCGTAGATCGCGCTGATTGG";

# Check by using "if ..."
print "--------------------------------------------\n";
if ($sSeq=~/ACG/) {
	print "Original Sequence :".$sSeq."\n";       # Pre-Existing Variable
	print "Before a match    :".$`."\n";          # Pre-Existing Variable
	print "The match         :".$&."\n";          # Pre-Existing Variable
	print "After a match     :".$'."\n";          # Pre-Existing Variable
}



# Check the first/last position
print "--------------------------------------------\n";
if ($sSeq=~/^ATG/) {                              # Fist Position
	print "Original Sequence :".$sSeq."\n";       # Pre-Existing Variable
	print "Before a match    :".$`."\n";          # Pre-Existing Variable
	print "The match         :".$&."\n";          # Pre-Existing Variable
	print "After a match     :".$'."\n";          # Pre-Existing Variable
}
print "---------------------------------\n";
if ($sSeq=~/TGG$/) {                              # Last Position
	print "Original Sequence :".$sSeq."\n";       # Pre-Existing Variable
	print "Before a match    :".$`."\n";          # Pre-Existing Variable
	print "The match         :".$&."\n";          # Pre-Existing Variable
	print "After a match     :".$'."\n";          # Pre-Existing Variable
}



# Check one of characters / quantity of characters
my $sTemp1 = "1ab2cd3ef4gh5ij6gh789gh1ab2cd444gh";
my $sTemp2 = "A rat showed me an apple.";
my $sX     = "cry";
print "--------------------------------------------\n";
if ($sTemp1=~/[1-9]gh/) {                         # One of 1-9
	print "String : ".$sTemp1."\n";
	print "Concept: [1-9]gh\n";
	print "Match  : ".$&."\n";
}
print "---------------------------------\n";
if ($sTemp1=~/[1-9]{2,5}gh/) {                    # 2~5 characters of 1-9
	print "String : ".$sTemp1."\n";
	print "Concept: [1-9]{2,5}gh\n";
	print "Match  : ".$&."\n";
}
print "---------------------------------\n";
if ($sTemp1=~/[1-9]+gh/) {                        # ?: 0 or 1, *: 0 or more, +: 1 or more
	print "String : ".$sTemp1."\n";
	print "Concept: [1-9]+gh\n";
	print "Match  : ".$&."\n";
}
print "---------------------------------\n";
if ($sTemp1=~/4[1-9]+gh/) {                        # ?: 0 or 1, *: 0 or more, +: 1 or more
	print "String : ".$sTemp1."\n";
	print "Concept: 4[1-9]+gh\n";
	print "Match  : ".$&."\n";
}
print "---------------------------------\n";
if ($sTemp2=~/[$sX]at/) {                         # one of 'c', 'r', and 'y'
	print "String : ".$sTemp2."\n";
	print "Concept: \\s[n-t]at\n";
	print "Match  : ".$&."\n";
}
print "---------------------------------\n";
if ($sTemp2=~/[\s1-9][n-t]at/) {                  # \s: space, one of \s and 1-9, one of 'n' - 't'
	print "String : ".$sTemp2."\n";
	print "Concept: [\\s1-9][n-t]at\n";
	print "Match  : ".$&."\n";
}
print "---------------------------------\n";
if ($sTemp2=~/[brand][panda][python][play]/) {    # 'rat' or 'apple'
	print "String: ".$sTemp2."\n";
	print "Concept: [brand][panda][python][play]\n";
	print "Match: ".$&."\n";
}



# Reverse Complement
print "--------------------------------------------\n";
print "Original: ".$sSeq."\n";
print "Reverse : ".reverse($sSeq)."\n";
print "Rev.Comp: ".funcRevComp($sSeq)."\n";

sub funcRevComp {
	my $sSeq = reverse uc($_[0]);
	$sSeq =~ tr/ATGCN/TACGN/;                     # A -> T, T ->A, G -> C, C -> G, N -> N
	return $sSeq;	
}



# Substitution
my $sStr = "My name is EBKim. EBKim is my thesis advisor.";
print "--------------------------------------------\n";
print "Original             : ".$sStr."\n";
$sStr=~s/EBKim/JBPark/;                           # Single substitution
print "After 1 Substitution : ".$sStr."\n";
print "---------------------------------\n";
$sStr = "My name is EBKim. EBKim is my thesis advisor.";
print "Original             : ".$sStr."\n";
$sStr=~s/EBKim/JBPark/g;                          # Multiple substitution by 'g'
print "After 1 Substitution : ".$sStr."\n";

sub funcRevComp {
	my $sSeq = reverse uc($_[0]);
	$sSeq =~ tr/ATGCN/TACGN/;                     # A -> T, T ->A, G -> C, C -> G, N -> N
	return $sSeq;	
}



# GC Content
my @aResult = funcGcContent($sSeq);		# Get an array result from a function
print "--------------------------------------------\n";
print "Seq    : ".$sSeq."\n";
print "G Cnt  : ".$aResult[0]."\n";
print "C Cnt  : ".$aResult[1]."\n";
print "Length : ".$aResult[2]."\n";
print "GC(%)  : ".$aResult[3]."\n";

sub funcGcContent {
	my $sDnaSeq = uc(shift);            # or use "shift" function. 1st element of Array "@_"

	my $iGCnt = $sDnaSeq =~ tr/G/G/;    # Counting the number of "G"
	my $iCCnt = $sDnaSeq =~ tr/C/C/;    # Counting the number of "C"

	my $iGcContent = ($iGCnt+$iCCnt)*100 / length($sDnaSeq);
	$iGcContent = sprintf("%.2f", $iGcContent);

	return ($iGCnt, $iCCnt, length($sDnaSeq), $iGcContent);     # Return an Array
}




# For more information, refer to the following link
# https://www.tutorialspoint.com/perl
# http://www.perl.or.kr/perl_iyagi/regexp