File: not-acgt.awk

package info (click to toggle)
tigr-glimmer 3.02-3
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 13,952 kB
  • ctags: 2,530
  • sloc: cpp: 24,376; awk: 232; csh: 220; makefile: 155; sh: 36
file content (55 lines) | stat: -rwxr-xr-x 974 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/awk -f
# Usage:  not-acgt.awk
#   Read a fasta input file and find regions consisting of MIN_RUN
#   or more consecutive non-acgt characters in the first string.
#   If there is more than one string, all strings after the first
#   are ignored.  Output is one line
#   per region, with start position and end position on each line.
#   Positions are inclusive, counting from 1 so that the first 10
#   positions of the file are indicated as "1 10".  The value of
#   MIN_RUN can be set below.


# Set the run-length threshold and clear the scanner state before any
# input is read.
BEGIN {
  # Shortest run of non-acgt characters that is reported.
  MIN_RUN = 5;

  # start: 1-based position where the current non-acgt run began.
  start = 0;
  # pos:   number of sequence characters consumed so far.
  pos = 0;
  # ct:    length of the non-acgt run currently in progress.
  ct = 0;
}


# Fasta header line.  The first header is skipped; a second header means
# the first string is finished, so stop reading (exit still runs END).
/^>/ {
  if (++ line_ct != 1)
    exit;
  next;
}


# Sequence line: walk the first field one character at a time, keeping
# track of the current run of non-acgt characters.
#   pos    number of sequence characters consumed so far (all lines)
#   ct     length of the non-acgt run in progress (0 when not in a run)
#   start  1-based position of the first character of that run
{
  seq = $1;

  # A carriage return is not a default field separator, so on CRLF input
  # it would stay in the field and be counted as a non-acgt character,
  # shifting every later position by one per line.  Drop it.
  sub (/\r$/, "", seq);

  n = length (seq);
  for (i = 1; i <= n; i ++)
    {
      if (substr (seq, i, 1) ~ /[acgtACGT]/)
        {
          # An acgt character ends any run in progress.  pos has not been
          # advanced yet, so it still names the last character of the run.
          Pr();
        }
      else
        {
          if (ct == 0)
            start = pos + 1;
          ct ++;
        }
      pos ++;
    }
}


# End of input (or exit at a second header): report a run that is still
# open, i.e. one that extends to the last character read.
END { Pr(); }


# Close the current non-acgt run.  If it is at least MIN_RUN characters
# long, print its start position and the current value of pos as the
# end position.  The run counter ct is reset to 0 in every case.
function  Pr  ()
{
  if (ct < MIN_RUN)
    {
      # Too short to report; just forget the run.
      ct = 0;
      return;
    }

  printf "%8d %8d\n", start, pos;
  ct = 0;
}