File: fragment_4dnpairs.pl

package info (click to toggle)
python-pairix 0.3.8-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 179,892 kB
sloc: ansic: 5,553; python: 1,207; sh: 602; perl: 464; makefile: 64
file content (134 lines) | stat: -rwxr-xr-x 4,790 bytes
parent folder | download | duplicates (3)
#!/usr/bin/perl 
##########
#The MIT License (MIT)
#
# Copyright (c) 2015 Aiden Lab
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#  THE SOFTWARE.
##########

# Perl script to convert to fragment map from infile. The infile should be in
# DCIC format:
# # HEADER 
# readname chr1 pos1 chr2 pos2 str1 str2
#
# The script also requires a restriction site file, which lists on 
# each line, the sorted locations of the enzyme restriction sites.
#
# Usage:  fragment_4dnpairs.pl <infile> <outfile> [site file]\n";

use POSIX;
use Getopt::Long;

$no_replacement=1;
&GetOptions ( 'a|allow-replacement' => sub { $no_replacement=0; });

$site_file = "/opt/juicer/restriction_sites/hg19_DpnII.txt";
# Check arguments
if (scalar(@ARGV) == 2) {
  ($infile,$outfile) = @ARGV;
}
elsif (scalar(@ARGV) == 3) {
  ($infile,$outfile,$site_file) = @ARGV;
}
else {
  print "Usage: fragment_4dnpairs.pl [--allow-replacement] <infile> <outfile> [site file]\n";
  print " <infile>: file in intermediate format to calculate statistics on\n";
  print " <outfile>: output, results of fragment search\n";  
  print " [site file]: list of restriction sites, one line per chromosome (default DpnII hg19)\n";
  print " --allow-replacement : allows replacing existing frag1/frag2 columns. Default: abort if the columns already exist.\n";
  exit;
}
# Global variables for calculating statistics
my %chromosomes;
my %hindIII;

# read in restriction site file and store as multidimensional array
open FILE, $site_file or die $!;

while (<FILE>) {
  my @locs = split;
  my $key = shift(@locs);
  my $ref = \@locs;
  $chromosomes{$key} = $ref;
	if ($key == "14") {
		$chromosomes{$key."m"} = $ref;  
		$chromosomes{$key."p"} = $ref;
	}
}
close(FILE);

# read in infile and calculate statistics
open INFILE, $infile or die $!;
open OUTFILE,">", $outfile or die $!;
$"="\t";
$frag1_colindex=-1;
$frag2_colindex=-1;
while (<INFILE>) {
  chomp;
  if(/^#columns:\s*/) { 
      my @headers = split/\s+/,$_;
      if((/frag1/ || /frag2/) && $no_replacement) { die "frag columns already exist. Aborting..\n"; }
      elsif((/frag1/ || /frag2/) && !$no_replacement) { 
          for my $i (1..$#headers){
              if($headers[$i] eq 'frag1') { $frag1_colindex=$i-1; }
              elsif($headers[$i] eq 'frag2') { $frag2_colindex=$i-1; }
          }
          print STDERR "$frag1_colindex, $frag2_colindex\n";  # Soo
          if($frag1_colindex == -1) { $frag1_colindex=$#headers; print OUTFILE "$_ frag1\n"; next; }
          elsif($frag2_colindex == -1) { $frag2_colindex=$#headers; print OUTFILE "$_ frag2\n"; next; }
          else { print OUTFILE "$_\n"; next; }
      }
      else { $frag1_colindex=$#headers; $frag2_colindex=$#headers+1; print OUTFILE "$_ frag1 frag2\n"; next; }
  }
  elsif(/^#/) { print OUTFILE "$_\n"; next; }
  my @original_record = split;
  my @record = @original_record[5,1,2,6,3,4];

  # find upper index of position in sites array via binary search
  my $index1 = &bsearch($record[2],$chromosomes{$record[1]});
  my $index2 = &bsearch($record[5],$chromosomes{$record[4]});
  $original_record[$frag1_colindex] = $index1;
  $original_record[$frag2_colindex] = $index2;
  print OUTFILE "@original_record\n";
}
close(INFILE);
close(OUTFILE);

# Binary search, array passed by reference
# search array of integers a for given integer x
# return index where found or upper index if not found
sub bsearch {
  my ($x, $a) = @_;            # search for x in array a
  my ($l, $u) = (0, @$a - 1);  # lower, upper end of search interval
  my $i;                       # index of probe
  while ($l <= $u) {
    $i = int(($l + $u)/2);
    if ($a->[$i] < $x) {
      $l = $i+1;
    }
    elsif ($a->[$i] > $x) {
      $u = $i-1;
    } 
    else {
      return $i+1; # found
    }
  }
  return $l;         
}