File: compare_masking.pl

package info (click to toggle)
augustus 3.4.0%2Bdfsg2-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 758,480 kB
  • sloc: cpp: 65,451; perl: 21,436; python: 3,927; ansic: 1,240; makefile: 1,032; sh: 189; javascript: 32
file content (88 lines) | stat: -rwxr-xr-x 1,653 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/perl

# Katharina J. Hoff
# November 20th 2018


use strict;
use warnings;
use Getopt::Long;

# Help text; printed for --help and whenever the two input files are not given.
my $usage = << 'ENDUSAGE';
compare_masking	compare the repeat masking content of differently masked (same) assemblies

SYNOPSIS

compare_masking file1.fa file2.fa

	file1.fa  softmasked fasta file
	file2.fa  softmasked second fasta file

OPTIONS

    --help    output this help message

WARNING: This script keeps two assemblies in memory, i.e. it is not suitable for large genomes!

ENDUSAGE

my ($help);

GetOptions('help' => \$help);

# An explicit request for help is not an error, so report success.
if($help){
	print $usage;
	exit(0);
}

# Both fasta files are mandatory. Without this check the script would try to
# open an undefined file name and fail with a misleading message. Additional
# arguments are ignored, as before.
if(scalar(@ARGV) < 2){
	print STDERR $usage;
	exit(1);
}


# Read a (softmasked) fasta file into a hash.
#
# Parameters:
#   $path   - name of the fasta file to read
#   $seq_of - reference to the hash to fill; it is filled in place so that
#             the sequences do not have to be copied afterwards
#
# Every complete header line (including the leading '>') becomes a key, the
# value is the concatenation of all sequence lines that follow that header.
# A header that occurs more than once starts over with an empty sequence.
#
# Dies if no file name is given, if the file cannot be opened or closed, or
# if sequence data occurs before the first header line.
sub read_fasta {
	my ($path, $seq_of) = @_;
	defined($path) or die("Missing fasta file argument!\n");
	# The current header is kept per file, so a headerless start of one file
	# can never be attributed to a sequence of the other file.
	my $header;
	open(my $fh, "<", $path) or die("Could not open file $path: $!\n");
	while(my $line = <$fh>){
		chomp($line);
		if($line =~ m/^>/){
			$seq_of->{$line} = "";
			$header = $line;
		}elsif(defined($header)){
			$seq_of->{$header} .= $line;
		}elsif($line =~ m/\S/){
			# Blank lines ahead of the first header are harmless, real
			# sequence data without a header cannot be compared.
			die("Sequence data before the first header line in file $path!\n");
		}
	}
	close($fh) or die("Could not close file $path: $!\n");
	return;
}

# header line => softmasked sequence, for the first and the second assembly
my %masking1;
read_fasta($ARGV[0], \%masking1);

my %masking2;
read_fasta($ARGV[1], \%masking2);


# Totals over all sequences: number of positions that are softmasked
# (lowercase) in file 1 only, in file 2 only, and in both files.
my $only1 = 0;
my $only2 = 0;
my $both = 0;

# Pass 1: all sequences of file 1, compared position by position with the
# sequence that has the same header line in file 2.
while( my($header, $seq1) = each(%masking1)){
	my $seq2 = $masking2{$header};
	if(not defined($seq2)){
		warn("Sequence $header is missing in file 2, its masked positions are counted as File1, only!\n");
		$seq2 = "";
	}elsif(length($seq1) != length($seq2)){
		warn("Sequence $header differs in length between the files, surplus positions are compared against unmasked sequence!\n");
	}
	my $len1 = length($seq1);
	my $len2 = length($seq2);
	# Scan up to the longer length so that masked positions in the tail of
	# either sequence are not lost.
	my $longest = ($len1 > $len2) ? $len1 : $len2;
	# substr() inspects single characters in place; splitting the sequences
	# into per-character arrays would multiply the memory footprint.
	foreach my $pos (0 .. $longest - 1){
		my $masked1 = ($pos < $len1) && (substr($seq1, $pos, 1) =~ m/\p{Lowercase}/);
		my $masked2 = ($pos < $len2) && (substr($seq2, $pos, 1) =~ m/\p{Lowercase}/);
		if($masked1 and $masked2){
			$both++;
		}elsif($masked1){
			$only1++;
		}elsif($masked2){
			$only2++;
		}
	}
}

# Pass 2: sequences that occur in file 2 only. They have no counterpart in
# file 1, so each of their masked positions is masked in file 2 only.
while( my($header, $seq2) = each(%masking2)){
	next if(exists($masking1{$header}));
	warn("Sequence $header is missing in file 1, its masked positions are counted as File2, only!\n");
	$only2++ while($seq2 =~ m/\p{Lowercase}/g);
}

# Report the three totals on standard output, one line per category.
print(
	"Masked in both files: $both\n",
	"Masked in File1, only: $only1\n",
	"Masked in File2, only: $only2\n",
);