Perl: GenBank Handling 3 - Multiple Exons - By Eun Bae Kim and SEKim (03/29/2024)
 

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
use strict;
use Bio::SeqIO;
use Bio::SeqFeature::Generic;
use Data::Dumper;

my $sInputFile  = "00000030_GenBank3.gbk";
#my $sInputFile  = "Lp_WCFS1_dnaA_CDS.gbk";

my $oGbk = Bio::SeqIO->new(-file => $sInputFile, -format => 'genbank');

while (my $oSeq = $oGbk->next_seq()){
    # 전체 길이: 27604 bp
    print length($oSeq->seq())."\n";
    print $oSeq->seq()."\n"; # Forward 서열임
    print "=====\n";
    <STDIN>;

    foreach my $oCurFeat($oSeq->get_SeqFeatures()){
        if ($oCurFeat->primary_tag eq "CDS"){
            # 17749..27094 ==> 9,346 bp
            my $sSeq = $oSeq->subseq($oCurFeat->start, $oCurFeat->end);
            print length($sSeq)." bp: ".$oCurFeat->start."..".$oCurFeat->end."\n";
            print $sSeq."\n"; # Forward 서열임
            print "=====\n";
            <STDIN>;

            print $oCurFeat->location()."\n";  # 현재 Feature의 위치에 대한 객체임
            print "=====\n";
            <STDIN>;

            print Dumper($oCurFeat->location())."\n";     #location의 hash 정보 출력
            print "=====\n";
            <STDIN>;

            print $oCurFeat->start."..".$oCurFeat->end."\n";
            print $oCurFeat->location()->start."..".$oCurFeat->location()->end."\n";
            print $oCurFeat->location()->seq_id()."\n";   #accession number
            print $oCurFeat->location()->strand()."\n";   #location의 hash 정보 출력
            print "=====\n";
            <STDIN>;

            # spliced_seq()은 아래와 같은 정보를 처리한 서열에 대한 객체를 반환함.
            # 따라서 상위 서열인 $oCurFeat의 서열을 기준으로 보면,
            # $oCurFeat->spliced_seq()->seq()의 서열은 
            # complement와 join이 완료된 것이므로 RevComp 방향이 된다.
            # complement(join(<17749..17907,18261..18380,18748..18925,
            # 19055..19200,19380..19563,19702..19850,20297..20365,
            # 20799..20942,21005..21100,21226..21388,21510..21624,
            # 22233..22383,22649..22818,22985..23112,23328..23470,
            # 23640..23793,24109..24250,24382..24484,25021..25161,
            # 25291..25329,25772..25928,26349..26521,26999..>27094))
            print Dumper($oCurFeat->spliced_seq())."\n";   #spliced_seq의 hash 정보 출력
            
            # $oSeq와 RevComp 방향 관계인 서열임 (즉, complement과 join 처리가 되었으므로)
            my $sSplicedSeq = $oCurFeat->spliced_seq()->seq();
            print "Forward Seq: ".$sSplicedSeq."\n";
            print "-\n";
            # $oSeq와 동일 방향 관계인 서열임 (즉, complement과 join 처리가 된 것을 다시 revcom했으므로)
            print "RevComp Seq: ".$oCurFeat->spliced_seq()->revcom()->seq()."\n";
            print "=====\n";
            <STDIN>;

            my $sJoinedSeq = "";
            # join 묶기 전의 개별 exon들에 대하여 출력
            for my $oCurSubLocation ( $oCurFeat->location()->sub_Location() ) {
                print $oCurFeat->strand."\n";
                print $oCurFeat->location()->strand."\n";
                print $oCurSubLocation->strand."\n";
                
                my $sCurSubSeq = $oSeq->subseq($oCurSubLocation->start, 
                                               $oCurSubLocation->end);
                $sJoinedSeq = $sJoinedSeq.$sCurSubSeq;
                # strand orientation 확인하여 출력 필요
                if ($oCurFeat->strand == 1) {
                    # $oSeq와 동일 방향 관계인 서열 출력
                    print $oCurSubLocation->start."..".$oCurSubLocation->end."\n";
                    print $sCurSubSeq."\n";
                } elsif ($oCurFeat->strand == -1) {
                    # $oSeq와 RevComp 방향 관계인 서열 출력
                    my $iLeng = $oCurSubLocation->end - $oCurSubLocation->start + 1;
                    print $iLeng."\n";
                    print $oCurSubLocation->end."..".$oCurSubLocation->start."\n";
                    print funcRevComp($sCurSubSeq)."\n";
                }
                print "\n=====\n";<STDIN>;
            }
            if ($oCurFeat->strand == 1) {
                print $sJoinedSeq."\n";
            } elsif ($oCurFeat->strand == -1) {
                print funcRevComp($sJoinedSeq)."\n";
            }
            print "\n=====\n";
        }
    }
}
print "--------------------------------------------\n";
print "Finished!!\n";


sub funcRevComp {
    my $sSeq = reverse uc($_[0]);
    $sSeq =~ tr/ATGCN/TACGN/;   # A -> T, T ->A, G -> C, C -> G, N -> N
    return $sSeq;    
}