File: maf-sort.sh

package info (click to toggle)
last-align 128-1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 1,656 kB
  • ctags: 1,820
  • sloc: cpp: 18,045; python: 836; ansic: 635; makefile: 93; sh: 65
file content (60 lines) | stat: -rwxr-xr-x 1,801 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#! /bin/sh

# Sort MAF-format alignments by sequence name, then start position,
# then end position, of the top sequence.  Also, merge identical
# alignments.  Comment lines starting with "#" are written at the top,
# in unchanged order.  If option "-d" is specified, then alignments
# that appear only once are omitted (like uniq -d).

# Maybe the sorting should consider the strand of the top sequence?

# XXX Preceding whitespace is considered part of the sequence name.  I
# want to use sort -b, but it seems to be broken in different ways for
# different versions of sort!

# XXX Alignments with differences in whitespace are considered
# non-identical.

# This script uses perl instead of specialized commands like uniq.
# The reason is that, on some systems (e.g. Mac OS X), uniq doesn't
# work with long lines.

# Make "sort" use a standard ordering:
# Assign and export in one step; the C locale gives byte-wise collation.
export LC_ALL=C

# Print the usage message on standard output.
usage() {
    cat <<EOF
Usage: $(basename "$0") [options] my-alignments.maf

Options:
  -h  show this help message and exit
  -d  only print duplicate alignments
EOF
}

# uniqOpt selects which member of each run of identical alignments is
# printed later on: 1 = the first one (every distinct alignment once),
# 2 = the second one (only alignments occurring at least twice, -d).
uniqOpt=1
while getopts hd opt
do
    case $opt in
        h)  usage
            exit
            ;;
        d)  uniqOpt=2
            ;;
        *)  # Invalid option: getopts has already printed a diagnostic.
            # Stop here instead of silently processing the input anyway.
            usage >&2
            exit 2
            ;;
    esac
done
# Drop the parsed options so that "$@" holds only the input file names.
shift $((OPTIND - 1))

tmpfile=${TMPDIR-/tmp}/maf-sort.$$

cat "$@" | tee $tmpfile | perl -ne 'print if /^#/'

perl -ne 'print unless /^#/' $tmpfile |  # remove comment lines
perl -pe 'y/ /!/  if /^a/'    |  # change spaces to '!'s in 'a' lines
perl -pe 's/\n/#/ if /\S/'    |  # join each alignment into one big line
sort -k2,2 -k3,3n -k4,4n      |  # sort the lines

# print only the first (or second) of each run of identical lines:
perl -ne '$c = 0 if $x ne $_; $x = $_; print if ++$c == '$uniqOpt |

perl -pe 's/#/\n/g'           |  # undo the line-joining
perl -pe 'y/!/ / if /^a/'        # change '!'s back to spaces in 'a' lines

rm $tmpfile