File: make_multi_seq.pl

package info (click to toggle)
cd-hit 4.8.1-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 1,940 kB
  • sloc: perl: 7,934; cpp: 3,120; python: 771; makefile: 130; sh: 26; ansic: 6
file content (76 lines) | stat: -rwxr-xr-x 1,670 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/perl

#note you have to use "-d 0" in the cd-hit run
#note you better to use "-g 1" in the cd-hit run

# this script read the .clstr file, it generate a seperate fasta file
# for each cluster over certain size
# for example, if you run cd-hit -i db -o dbout -c 0.6 -n 4 -d 0 -g 1
# then you will have a dbout.clstr
# now run this script:
# ./make_multi_seq.pl db dbout.clstr multi-seq 20
# 

my $fasta = shift;
my $clstr = shift;
my $out_dir = shift;
my $size_cutoff = shift;

die unless (-e $fasta);
die unless (-e $clstr);
die unless ($out_dir);
$size_cutoff = 1 unless ($size_cutoff);

if (not -e $out_dir) {my $cmd = `mkdir $out_dir`;}


open(TMP, $clstr) || die;
my %id2cid=();
my $cid = "";
my @ids = ();
my $no = 0;
while($ll = <TMP>) {
  if ($ll =~ /^>/) {
    if ($no >= $size_cutoff) {
      foreach $i (@ids) { $id2cid{$i} = $cid;}
    }

    if ($ll =~ /^>Cluster (\d+)/) { $cid = $1; }
    else { die "Wrong format $ll"; }
    @ids = ();
    $no = 0;
  }
  else {
    if ($ll =~ /(aa|nt), >(.+)\.\.\./) { push(@ids, $2); $no++; }
    else { die "Wrong format $ll"; }
  }
}
close(TMP);
    if ($no >= $size_cutoff) {
      foreach $i (@ids) { $id2cid{$i} = $cid;}
    }


open(FASTA, $fasta) || die;
my $outfile_open = 0;
my $flag = 0;
while($ll=<FASTA>) {
  if ($ll=~ /^>(\S+)/) {
    my $id = $1;
    my $cid = $id2cid{$id};
    if (defined($cid)) {
      close(OUT) if ($outfile_open);
      open(OUT, ">> $out_dir/$cid") || die "can not open file to write $out_dir/$cid";
      $outfile_open = 1;
      $flag = 1;
    }
    else {
      $flag = 0;
    }
  }
  if ($flag) {print OUT $ll;}
}
close(FASTA);
      close(OUT) if ($outfile_open);