File: gmap_process.pl.in

package info (click to toggle)
gmap 2017-01-14-1
  • links: PTS, VCS
  • area: non-free
  • in suites: stretch
  • size: 32,596 kB
  • ctags: 10,947
  • sloc: ansic: 489,036; perl: 5,420; sh: 4,248; makefile: 799
file content (196 lines) | stat: -rw-r--r-- 4,875 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#! @PERL@
# $Id: gmap_process.pl.in,v 1.8 2010-07-21 22:02:18 twu Exp $

use warnings;

use IO::File;
use Getopt::Std;
undef($opt_c);			# coord file
undef($opt_g);			# gunzip each file
undef($opt_f);			# File with input FASTA source filenames
getopts("c:gf:");

if (!defined($coord_file = $opt_c)) {
  if (-e "coords.txt") {
    $coord_file = "coords.txt";
  } else {
    die "Must specify coordinate file (created by md_coords or fa_coords) with -c flag."
  }
}

read_coords($coord_file);

if (defined($opt_f)) {
    # Source files given in a file
    @sourcefiles = ();
    $SOURCES = new IO::File($opt_f) or die "Cannot open file $opt_f";
    while (defined($line = <$SOURCES>)) {
	chop $line;
	push @sourcefiles,$line;
    }
    close($SOURCES);
    process_fasta(\@sourcefiles);

} elsif ($#ARGV < 0) {
    # FASTA is piped via stdin
    @streams = ();
    push @streams,"<&STDIN";
    process_fasta(\@streams);

} else {
    # Source files given on command line
    process_fasta(\@ARGV);
}

check_processed();

exit;


sub read_coords {
  my ($coord_file) = @_;
  my $universal_coord = 0;

  $FP = new IO::File($coord_file) or die "Cannot open coord file $coord_file";
  print STDERR "Reading coordinates from file $coord_file\n";

  while (defined($line = <$FP>)) {
    if ($line =~ /^#/) {
	#if ($line =~ /Reference strain:\s*(\S+)/) {
	#$refstrain = $1;
	#}

    } else {
      $line =~ s/\r\n/\n/;
      chop $line;
      @fields = split /\s+/,$line;
      $contig = $fields[0];

      #if (!defined($strain{$contig} = $fields[2])) {
      #$strain{$contig} = "";
      #} elsif (defined($refstrain) && $strain{$contig} eq $refstrain) {
      #$strain{$contig} = "";
      #}

      $coords{$contig} = $fields[1];
      $universal_coord{$contig} = $universal_coord;

      ($chrstart,$chrend) = $coords{$contig} =~ /\S+:(\d+)\.\.(\d+)/;
      if (!defined($fields[2])) {
	  $universal_coord += $chrend - $chrstart + 1;
      } elsif ($fields[2] eq "circular") {
	  $circularp{$contig} = 1;
	  $universal_coord += $chrend - $chrstart + 1;
	  $universal_coord += $chrend - $chrstart + 1;
      } elsif ($fields[2] eq "primary") {
	  # Linear and primary
	  $universal_coord += $chrend - $chrstart + 1;
      } else {
	  # Altloc
	  $primary{$contig} = $fields[2];
	  $universal_coord += $chrend - $chrstart + 1;
      }
    }
  }
  close($FP);
  return;
}


sub find_contig_name {
    my ($contiginfo, $coords) = @_;

    if ($contiginfo !~ /\|/) {
	if (defined($ {$coords}{$contiginfo})) {
	    return $contiginfo;
	} elsif ($contiginfo =~ /(\S+)/ && defined($ {$coords}{$1})) {
	    return $1;
	} else {
	    # Failed
	    return $contiginfo;
	}
    } else {
	# Old code for FASTA files that contained pipe delimiters
	@parts = split /\|/,$contiginfo;
	foreach $part (@parts) {
	    if (defined($ {$coords}{$part})) {
		return $part;
	    } elsif ($part =~ /(\S+)/ && defined($ {$coords}{$1})) {
		return $1;
	    }
	}
	# Failed
	return $contiginfo;
    }
}

sub process_fasta {
  my ($argv) = @_;
  my $printp = 0;

  foreach $arg (@ {$argv}) {
      if (defined($opt_g)) {
	  $FP = new IO::File("gunzip -c \"$arg\" |") or die "Cannot open file $arg";
      } else {
	  $FP = new IO::File($arg) or die "Cannot open file $arg";
      }

      while (defined($line = <$FP>)) {
	  $line =~ s/\r\n/\n/;
	  chomp $line;
	  if ($line !~ /\S/) {
	      # Skip blank lines
	  } elsif ($line =~ /^>(\S+)/) {

	      # $contig = find_contig_name($1,\%coords);
	      $contig = $1;

	      if (!defined($coords{$contig})) {
		  print STDERR "No coordinates defined for contig $contig.  Skipping.\n";
		  $printp = 0;
		  
	      } else {
		  
		  # ($chr,$chrpos1,$chrpos2) = $coords{$contig} =~ /(\S+):(\d+)\D+(\d+)/;
		  # If $chrpos2 < $chrpos1, then contig needs to be reverse complement.
		  # However, gmapindex knows how to handle this
		  
		  $processedp{$contig} = 1;
		  #printf (">%s\t%s\t%s\n",$contig,$coords{$contig},$strain{$contig});
		  printf (">%s\t%s\t%u",$contig,$coords{$contig},$universal_coord{$contig});
		  if (defined($circularp{$contig})) {
		      print "\tcircular";
		  } elsif (defined($primary{$contig})) {
		      print "\t" . $primary{$contig};
		  }
		  print "\n";
		  $printp = 1;
		  
	      }
	  } elsif ($printp == 1) {
	      print $line . "\n";
	  }
      }

      close($FP);
  }

  return;
}


sub check_processed {
  $nwarnings = 0;
  foreach $contig (keys %coords) {
    if (!defined($processedp{$contig})) {
      print STDERR "Warning!!  Contig $contig was listed in coords file, but not seen in a FASTA file\n";
      $nwarnings++;
    }
  }
  if ($nwarnings > 0) {
    print STDERR "Warning: A total of $nwarnings contigs were listed in coords file, but not seen in a FASTA file\n";
    print STDERR "Should provide the correct FASTA files, or comment out these contigs in the coords file $coord_file\n";
  }
  return;
}