1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
|
#!/bin/awk -f
# Usage: not-acgt.awk
# Read a fasta input file and find regions consisting of MIN_RUN
# or more consecutive non-acgt characters in the first string.
# If there is more than one string, all strings after the first
# are ignored. Output is one line
# per region, with start position and end position on each line.
# Positions are inclusive, counting from 1 so that the first 10
# positions of the file are indicated as "1 10". The value of
# MIN_RUN can be set below.
# Initialisation, run once before any input is read.
#   MIN_RUN  minimum length of a non-acgt run for it to be reported.
#            Defaults to 5; it can be overridden without editing this
#            file, e.g.  awk -v MIN_RUN=10 -f not-acgt.awk input.fa
#   ct       length of the non-acgt run currently being scanned (0 = none)
#   pos      number of sequence characters consumed so far
#   start    1-based position at which the current run began
BEGIN {
    # A -v assignment is applied before BEGIN runs, so only fall back to
    # the default when the caller did not supply a value.
    if (MIN_RUN == "")
        MIN_RUN = 5;

    # Force a numeric value and refuse thresholds below 1: with a
    # threshold of 0 every acgt character would be reported as a run.
    MIN_RUN += 0;
    if (MIN_RUN < 1)
        MIN_RUN = 1;

    ct = pos = start = 0;
}
# Header lines (those beginning with ">").  line_ct counts the headers seen
# so far: the first one is skipped so that the sequence following it gets
# scanned; a second one marks the start of another sequence, so reading
# stops there.  exit still runs the END rule, which flushes any open run.
/^>/ {
    if (++line_ct > 1)
        exit;
    next;
}
# Sequence lines: walk the first field one character at a time, tracking
# runs of characters that are not a, c, g or t (either case).
#   - an acgt character closes any open run (Pr prints it if long enough);
#   - any other character opens a run (recording its start) or extends it.
# pos is advanced only after the character has been handled, so when Pr()
# is called pos is still the position of the last character of the run.
{
    seq = $1;

    # Files with CRLF line endings leave a trailing carriage return in the
    # field (default field splitting does not remove it).  Counting it as
    # a sequence character would shift every later position by one per
    # line and lengthen runs that cross a line end, so drop it first.
    sub(/\r$/, "", seq);

    n = length(seq);
    for (i = 1; i <= n; i++) {
        if (substr(seq, i, 1) ~ /[acgtACGT]/) {
            Pr();
        } else {
            if (ct == 0)
                start = pos + 1;    # the current character is at pos + 1
            ct++;
        }
        pos++;
    }
}
# End of input, or exit at a second header: report the run that was still
# open when scanning stopped, provided it is long enough.
END { Pr() }
# Pr: close the current non-acgt run.
# Reads the globals ct (run length), start and pos (run boundaries) and
# MIN_RUN (reporting threshold).  Prints "start pos" when the run has
# reached the threshold, then resets ct so that a new run can begin.
function Pr()
{
    if (ct >= MIN_RUN)
        printf("%8d %8d\n", start, pos);
    ct = 0;
}
|