File: clustal2fasta.pl

package info (click to toggle)
fasta3 36.3.8i.14-Nov-2020-3
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 7,016 kB
  • sloc: ansic: 77,269; perl: 10,677; python: 2,461; sh: 428; csh: 86; sql: 55; makefile: 40
file content (101 lines) | stat: -rwxr-xr-x 2,403 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/perl

################################################################
# copyright (c) 2014,2015 by William R. Pearson and The Rector &
# Visitors of the University of Virginia
################################################################
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under this License is distributed on an "AS
# IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied.  See the License for the specific language
# governing permissions and limitations under the License.
################################################################

################################################################
# clustal2fasta.pl 
################################################################
# clustal2fasta.pl takes a standard clustal format alignment file
# and produces the corresponding FASTA file.
#
################################################################

use warnings;
use strict;
use Pod::Usage;
use Getopt::Long;

# Command-line flags: -h / -? asks for the short help, --help for the
# help that includes the description.
my ($shelp, $help) = (0, 0);

# GetOptions returns false after it has reported an unrecognised or
# malformed option; stop with the usage message instead of carrying on
# as if the command line had been valid.
GetOptions(
    "h|?" => \$shelp,
    "help" => \$help,
    ) or pod2usage(2);

pod2usage(1) if $shelp;
pod2usage(exitstatus => 0, verbose => 2) if $help;

# Input has to come from somewhere: a file or pipe attached to STDIN, or
# file names left on the command line.  With none of these, show the
# usage rather than waiting on a terminal.
unless (-f STDIN || -p STDIN || @ARGV) {
 pod2usage(1);
}

# @seq_ids keeps the sequence identifiers in order of first appearance;
# %msa maps each identifier to its aligned residues, concatenated across
# the interleaved blocks of the alignment.
my @seq_ids = ();
my %msa = ();

# The first line of the input is consumed here and is not parsed as a
# sequence line.
my $title = <>;

while (my $line = <>) {
  chomp $line;
  next unless ($line);
  next if ($line =~ m/^[\s:\*\+\.]+$/);   # skip conservation line

  # A sequence line is an identifier starting in the first column,
  # whitespace, then the aligned residues; anything after the residues
  # is ignored.  Lines that do not fit this shape (indented lines, or an
  # identifier with no residues) are skipped so they cannot create a
  # record with an empty name or an undefined sequence.
  next unless ($line =~ m/^(\S+)\s+(\S+)/);
  my ($seq_id, $align) = ($1, $2);

  # Remember the identifier the first time it is seen, then append this
  # block's residues to whatever has been collected so far.
  push @seq_ids, $seq_id unless (defined($msa{$seq_id}));
  $msa{$seq_id} .= $align;
}

# Write one FASTA record per sequence, in order of first appearance in
# the alignment, with the residues wrapped at 60 characters per line.
for my $seq_id ( @seq_ids ) {
  my $fmt_seq = $msa{$seq_id};
  # {1,60} rather than {0,60}: a pattern that can match the empty string
  # matches one extra time at the end of the sequence under /g, which
  # would append a blank line after every record.
  $fmt_seq =~ s/(.{1,60})/$1\n/g;
  print ">$seq_id\n$fmt_seq";
}

__END__

=pod

=head1 NAME

 clustal2fasta.pl

=head1 SYNOPSIS

 clustal2fasta.pl clustal.msa

=head1 OPTIONS

 -h, -?  short help
 --help  include description


=head1 DESCRIPTION

C<clustal2fasta.pl> takes a Clustal format interleaved multiple
sequence alignment and produces the corresponding fasta format library.

=head1 AUTHOR

William R. Pearson, wrp@virginia.edu

=cut