File: Fasta.t

package info (click to toggle)
bioperl 1.6.924-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 50,776 kB
  • ctags: 11,412
  • sloc: perl: 175,865; xml: 27,565; lisp: 2,034; sh: 1,958; makefile: 19
file content (396 lines) | stat: -rw-r--r-- 14,288 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
BEGIN {
    use lib '.';
    use Bio::Root::Test;

    test_begin( -tests => 107,
                -requires_modules => [qw(Bio::DB::Fasta Bio::SeqIO)] );
}
use strict;
use warnings;
use Bio::Root::Root;
use File::Copy;
my $DEBUG = test_debug();


# Test Bio::DB::Fasta, but also the underlying module, Bio::DB::IndexedBase

my $test_dir  = setup_temp_dir('dbfa');
my $test_file = test_input_file('dbfa', 'mixed_alphabet.fasta');
my $test_files = [
    test_input_file('dbfa', 'mixed_alphabet.fasta'),
    test_input_file('dbfa', '6.fa')
];


{
    # Test basic functionalities
    ok my $db = Bio::DB::Fasta->new($test_dir, -reindex => 1), 'Index a directory';
    is $db->glob, '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
    isa_ok $db, 'Bio::DB::Fasta';
    is $db->length('CEESC13F'), 389;
    is $db->seq('CEESC13F:1,10'), 'cttgcttgaa';
    is $db->seq('CEESC13F:1-10'), 'cttgcttgaa';
    is $db->seq('CEESC13F:1..10'), 'cttgcttgaa';
    is $db->seq('CEESC13F:1..10/1'), 'cttgcttgaa';
    is $db->seq('CEESC13F:1..10/+1'), 'cttgcttgaa';
    is $db->seq('CEESC13F:1..10/-1'), 'ttcaagcaag';
    is $db->seq('CEESC13F/1'), 'cttgcttgaaaaatttatataaatatttaagagaagaaaaataaataatcgcatctaatgacgtctgtccttgtatccctggtttccattgactggtgcactttcctgtctttgaggacatggacaatattcggcatcagttcctggctctccctcctctcctggtgctccagcagaaccgttctctccattatctcccttgtctccacgtggtccacgctctcctggtgctcctggaataccttgagctccctcgtgccgaattcctgcagcccgggggatccactagttctagagcggccgccaccgcggtgggagctccagcttttgttncctttagtgagggttaatttcgagcttggcgtaatcatggtcatagctgtttcctg';
    is $db->seq('CEESC13F/-1'), 'caggaaacagctatgaccatgattacgccaagctcgaaattaaccctcactaaaggnaacaaaagctggagctcccaccgcggtggcggccgctctagaactagtggatcccccgggctgcaggaattcggcacgagggagctcaaggtattccaggagcaccaggagagcgtggaccacgtggagacaagggagataatggagagaacggttctgctggagcaccaggagaggagggagagccaggaactgatgccgaatattgtccatgtcctcaaagacaggaaagtgcaccagtcaatggaaaccagggatacaaggacagacgtcattagatgcgattatttatttttcttctcttaaatatttatataaatttttcaagcaag';
    is $db->seq('AW057119', 1, 10), 'tcatgttggc';
    is $db->seq('AW057119', 1, 10, 1), 'tcatgttggc';
    is $db->seq('AW057119', 1, 10, -1), 'gccaacatga';
    is $db->seq('AW057119', 10, 1), 'gccaacatga';
    is $db->seq('AW057119', 10, 1, -1), 'tcatgttggc';
    is $db->header('AW057119'), 'AW057119 test description';
    is $db->seq('foobarbaz'), undef;
    is $db->get_Seq_by_id('foobarbaz'), undef;
    is $db->file('AW057119'), '1.fa';
    is $db->file('AW057410'), '3.fa';
    is $db->file('CEESC13F'), '6.fa';

    # Bio::DB::RandomAccessI and Bio::DB::SeqI methods
    ok my $primary_seq = $db->get_Seq_by_id('AW057119');
    ok $primary_seq = $db->get_Seq_by_acc('AW057119');
    ok $primary_seq = $db->get_Seq_by_version('AW057119');
    ok $primary_seq = $db->get_Seq_by_primary_id('AW057119');
    isa_ok $primary_seq, 'Bio::PrimarySeq::Fasta';
    isa_ok $primary_seq, 'Bio::PrimarySeqI';

    # Bio::PrimarySeqI methods
    is $primary_seq->id, 'AW057119';
    is $primary_seq->display_id, 'AW057119';
    like $primary_seq->primary_id, qr/^Bio::PrimarySeq::Fasta=HASH/;
    is $primary_seq->alphabet, 'dna';
    is $primary_seq->accession_number, 'unknown';
    is $primary_seq->is_circular, undef;
    is $primary_seq->subseq(11, 20), 'ttctcggggt';
    is $primary_seq->description, 'test description', 'bug 3126';
    is $primary_seq->seq, 'tcatgttggcttctcggggtttttatggattaatacattttccaaacgattctttgcgccttctgtggtgccgccttctccgaaggaactgacgaaaaatgacgtggatttgctgacaaatccaggcgaggaatatttggacggattgatgaaatggcacggcgacgagcgacccgtgttcaaaagagaggacatttatcgttggtcggatagttttccagaatatcggctaagaatgatttgtctgaaagacacgacaagggtcattgcagtcggtcaatattgttactttgatgctctgaaagaaaggagagcagccattgttcttcttaggattgggatggacggatcctgaatatcgtaatcgggcagttatggagcttcaagcttcgatggcgctggaggagagggatcggtatccgactgccaacgcggcatcgcatccaaataagttcatgaaacgattttggcacatattcaacggcctcaaagagcacgaggacaaaggtcacaaggctgccgctgtttcatacaagagcttctacgacctcanagacatgatcattcctgaaaatctggatgtcagtggtattactgtaaatgatgcacgaaaggtgccacaaagagatataatcaactacgatcaaacatttcatccatatcatcgagaaatggttataatttctcacatgtatgacaatgatgggtttggaaaagtgcgtatgatgaggatggaaatgtacttggaattgtctagcgatgtctttanaccaacaagactgcacattagtcaattatgcagatagcc';
    ok my $trunc = $primary_seq->trunc(11,20);
    isa_ok $trunc, 'Bio::PrimarySeq::Fasta';
    isa_ok $trunc, 'Bio::PrimarySeqI';
    is $trunc->length, 10;
    is $trunc->seq, 'ttctcggggt';
    ok my $rev = $trunc->revcom;
    isa_ok $rev, 'Bio::PrimarySeq::Fasta';
    isa_ok $rev, 'Bio::PrimarySeqI';
    is $rev->seq, 'accccgagaa';
    is $rev->length, 10;
}


{
    # Re-open an existing index.
    # Doing this test properly involves unloading and reloading Bio::DB::Fasta.

    SKIP: {
        test_skip(-tests => 1, -requires_modules => [qw(Class::Unload)]);
        use_ok('Class::Unload');
        Class::Unload->unload( 'Bio::DB::Fasta' );
        Class::Unload->unload( 'Bio::DB::IndexedBase' );
        require Bio::DB::Fasta;
    }

    ok my $db = Bio::DB::Fasta->new($test_dir), 'Re-open an existing index';
    is $db->seq('AW057119', 1, 10), 'tcatgttggc';
}


{
    # Test tied hash access
    my %h;
    ok tie(%h, 'Bio::DB::Fasta', $test_dir), 'Tied hash access';
    ok exists $h{'AW057146'};
    is $h{'AW057146:1,10'} , 'aatgtgtaca'; # in file 1.fa
    is $h{'AW057146:10,1'} , 'tgtacacatt'; # reverse complement
    is $h{'AW057443:11,20'}, 'gaaccgtcag'; # in file 4.fa
}


{
    # Test writing the Bio::PrimarySeq::Fasta objects with SeqIO
    ok my $db = Bio::DB::Fasta->new($test_dir, -reindex => 1), 'Writing with SeqIO';
    my $out = Bio::SeqIO->new(
        -format => 'genbank',
        -file   => '>'.test_output_file()
    );
    my $primary_seq = Bio::Seq->new(-primary_seq => $db->get_Seq_by_acc('AW057119'));
    eval {
        $out->write_seq($primary_seq)
    };
    is $@, '';

    $out = Bio::SeqIO->new(-format => 'embl', -file  => '>'.test_output_file());
    eval {
        $out->write_seq($primary_seq)
    };
    is $@, '';
}


{
    # Test alphabet and reverse-complement RNA
    ok my $db = Bio::DB::Fasta->new( $test_file, -reindex => 1), 'Index a single file';
    is $db->alphabet('gi|352962132|ref|NG_030353.1|'), 'dna';
    is $db->alphabet('gi|352962148|ref|NM_001251825.1|'), 'rna';
    is $db->alphabet('gi|194473622|ref|NP_001123975.1|'), 'protein';
    is $db->alphabet('gi|61679760|pdb|1Y4P|B'), 'protein';
    is $db->alphabet('123'), '';
    is $db->seq('gi|352962148|ref|NM_001251825.1|', 20, 29,  1), 'GUCAGCGUCC';
    is $db->seq('gi|352962148|ref|NM_001251825.1|', 20, 29, -1), 'GGACGCUGAC';

    # Test empty sequence
    is $db->seq('123'), '';

    is $db->file('gi|352962132|ref|NG_030353.1|'), 'mixed_alphabet.fasta';
}


{
    # Test stream
    ok my $db = Bio::DB::Fasta->new( $test_file, -reindex => 1);
    ok my $stream = $db->get_PrimarySeq_stream;
    isa_ok $stream, 'Bio::DB::Indexed::Stream';
    my $count = 0;
    while (my $seq = $stream->next_seq) {
        $count++;
    }
    is $count, 5;

    # ActivePerl will not allow deletion if the tie-hash is still active
    $db->DESTROY;
    # Strawberry Perl temporary file
    unlink "$test_file.index" if -e "$test_file.index";
    # ActivePerl temporary files
    unlink "$test_file.index.dir" if -e "$test_file.index.dir";
    unlink "$test_file.index.pag" if -e "$test_file.index.pag";
}


{
    # Concurrent databases (bug #3390)
    ok my $db1 = Bio::DB::Fasta->new( test_input_file('dbfa', '1.fa') );
    ok my $db3 = Bio::DB::Fasta->new( test_input_file('dbfa', '3.fa') );
    ok my $db4 = Bio::DB::Fasta->new( $test_dir );
    ok my $db2 = Bio::DB::Fasta->new( test_input_file('dbfa', '2.fa') );
    is $db4->file('AW057231'), '1.fa';
    is $db2->file('AW057302'), '2.fa';
    is $db4->file('AW057119'), '1.fa';
    is $db3->file('AW057336'), '3.fa';
    is $db1->file('AW057231'), '1.fa';
    is $db4->file('AW057410'), '3.fa';

    # ActivePerl will not allow deletion if the tie-hash is still active
    $db1->DESTROY;
    $db2->DESTROY;
    $db3->DESTROY;
    # Strawberry Perl temporary file
    unlink $db1->index_name if -e $db1->index_name;
    unlink $db2->index_name if -e $db2->index_name;
    unlink $db3->index_name if -e $db3->index_name;
    # ActivePerl temporary files
    unlink $db1->index_name().'.dir' if -e $db1->index_name().'.dir';
    unlink $db2->index_name().'.dir' if -e $db2->index_name().'.dir';
    unlink $db3->index_name().'.dir' if -e $db3->index_name().'.dir';
    unlink $db1->index_name().'.pag' if -e $db1->index_name().'.pag';
    unlink $db2->index_name().'.pag' if -e $db2->index_name().'.pag';
    unlink $db3->index_name().'.pag' if -e $db3->index_name().'.pag';
}


{
    # Test an arbitrary index filename and cleaning
    my $name = 'arbitrary.idx';
    ok my $db = Bio::DB::Fasta->new( $test_file,
        -reindex => 1, -index_name => $name, -clean => 1,
    );
    is $db->index_name, $name;

    # Tied-hash in Strawberry Perl produce $name,
    # while in ActivePerl produce "$name.dir" and "$name.pag"
    if (-e "$name.pag") {
        ok -f "$name.pag";
        # ActivePerl will not allow deletion if the tie-hash is still active
        $db->DESTROY;
        unlink "$name.dir" if -e "$name.dir";
        unlink "$name.pag" if -e "$name.pag";
        ok ! -f "$name.pag";
    }
    else {
        ok -f $name;
        # ActivePerl will not allow deletion if the tie-hash is still active
        $db->DESTROY;
        unlink $name if -e $name;
        ok ! -f $name;
    }
    undef $db;
}


{
    # Test makeid
    ok my $db = Bio::DB::Fasta->new( $test_file,
        -reindex => 1, -clean => 1, -makeid => \&extract_gi,
    ), 'Make single ID';
    is_deeply [sort $db->get_all_primary_ids], ['', 194473622, 352962132, 352962148, 61679760];
    is $db->get_Seq_by_id('gi|352962148|ref|NM_001251825.1|'), undef;
    isa_ok $db->get_Seq_by_id(194473622), 'Bio::PrimarySeqI';
}


{
    # Test makeid that generates several IDs, bug #3389
    ok my $db = Bio::DB::Fasta->new( $test_file,
        -reindex => 1, -clean => 1, -makeid => \&extract_gi_and_ref,
    ), 'Make multiple IDs, bug #3389';
    is_deeply [sort $db->get_all_primary_ids], ['', 194473622, 352962132, 352962148, 61679760, 'NG_030353.1',  'NM_001251825.1', 'NP_001123975.1'];
    is $db->get_Seq_by_id('gi|352962148|ref|NM_001251825.1|'), undef;
    isa_ok $db->get_Seq_by_id('NG_030353.1'), 'Bio::PrimarySeqI';
}


{
    # Test opening set of files and test IDs
    ok my $db = Bio::DB::Fasta->new( $test_files, -reindex => 1), 'Index a set of files';
    ok $db->ids;
    ok $db->get_all_ids;
    my @ids = sort $db->get_all_primary_ids();
    is_deeply \@ids, [ qw(
        123
        CEESC12R
        CEESC13F
        CEESC13R
        CEESC14F
        CEESC14R
        CEESC15F
        CEESC15R
        CEESC15RB
        CEESC16F
        CEESC17F
        CEESC17RB
        CEESC18F
        CEESC18R
        CEESC19F
        CEESC19R
        CEESC20F
        CEESC21F
        CEESC21R
        CEESC22F
        CEESC23F
        CEESC24F
        CEESC25F
        CEESC26F
        CEESC27F
        CEESC28F
        CEESC29F
        CEESC30F
        CEESC32F
        CEESC33F
        CEESC33R
        CEESC34F
        CEESC35R
        CEESC36F
        CEESC37F
        CEESC39F
        CEESC40R
        CEESC41F
        gi|194473622|ref|NP_001123975.1|
        gi|352962132|ref|NG_030353.1|
        gi|352962148|ref|NM_001251825.1|
        gi|61679760|pdb|1Y4P|B
    )];
    like $db->index_name, qr/^fileset_.+\.index$/;

    my $index = $db->index_name;
    # ActivePerl will not allow deletion if the tie-hash is still active
    $db->DESTROY;
    # Strawberry Perl temporary file
    unlink $index if -e $index;
    # ActivePerl temporary files
    unlink "$index.dir" if -e "$index.dir";
    unlink "$index.pag" if -e "$index.pag";
}


{
    # Squash warnings locally
    local $SIG{__WARN__} = sub {};

    # Issue 3172
    my $test_dir = setup_temp_dir('bad_dbfa');
    throws_ok {my $db = Bio::DB::Fasta->new($test_dir, -reindex => 1)}
        qr/FASTA header doesn't match/;

    # Issue 3237
    # Empty lines within a sequence is bad...
    throws_ok {my $db = Bio::DB::Fasta->new(test_input_file('badfasta.fa'), -reindex => 1)}
        qr/Blank lines can only precede header lines/;
}


{
    # Issue 3237 again
    # but empty lines preceding headers are okay, but let's check the seqs just in case
    my $db;
    lives_ok {$db = Bio::DB::Fasta->new(test_input_file('spaced_fasta.fa'), -reindex => 1)};
    is length($db->seq('CEESC39F')), 375, 'length is correct in sequences past spaces';
    is length($db->seq('CEESC13F')), 389;

    is $db->subseq('CEESC39F', 51, 60)  , 'acatatganc', 'subseq is correct';
    is $db->subseq('CEESC13F', 146, 155), 'ggctctccct', 'subseq is correct';

    # Remove temporary test file
    my $outfile = test_input_file('spaced_fasta.fa').'.index';

    # ActivePerl will not allow deletion if the tie-hash is still active
    $db->DESTROY;
    # Strawberry Perl temporary file
    unlink $outfile if -e $outfile;
    # ActivePerl temporary files
    unlink "$outfile.dir" if -e "$outfile.dir";
    unlink "$outfile.pag" if -e "$outfile.pag";
}

exit;


sub extract_gi {
    # Extract GI from RefSeq
    my $header = shift;
    my ($id) = ($header =~ /gi\|(\d+)/m);
    return $id || '';
}


sub extract_gi_and_ref {
    # Extract GI and from RefSeq
    my $header = shift;
    my ($gi)  = ($header =~ /gi\|(\d+)/m);
    $gi ||= '';
    my ($ref) = ($header =~ /ref\|([^|]+)/m);
    $ref ||= '';
    return $gi, $ref;
}


sub setup_temp_dir {
    # this obfuscation is to deal with lockfiles by GDBM_File which can
    # only be created on local filesystems apparently so will cause test
    # to block and then fail when the testdir is on an NFS mounted system

    my $data_dir = shift;

    my $io = Bio::Root::IO->new();
    my $tempdir = test_output_dir();
    my $test_dir = $io->catfile($tempdir, $data_dir);
    mkdir($test_dir); # make the directory
    my $indir = test_input_file($data_dir);
    opendir(my $INDIR,$indir) || die("cannot open dir $indir");
    # effectively do a cp -r but only copy the files that are in there, no subdirs
    for my $file ( map { $io->catfile($indir,$_) } readdir($INDIR) ) {
        next unless (-f $file );
        copy($file, $test_dir);
    }
    closedir($INDIR);
    return $test_dir
}