1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
|
#!/bin/bash
# This script parses a UCSC Assembly table into a Go source code file.
#
# This script will only work on the Assembly table.
#
# The prefix, e.g. chr, will be used to label the chromosomes (e.g. chr1, chr2 ... )
# By default, "chr" is used. The package will be used to name the generated package.
#
# To download data tables, see http://genome.ucsc.edu/cgi-bin/hgTables
#
# USE OF THIS SCRIPT WITHOUT A FILTER OR WITH NOFRAGS UNSET
# SHOULD IN MOST CASES BE SEEN AS COMPILER ABUSE.
# Positional arguments (all are positional, so pass "" to skip one and
# take its default):
#   $1 file    - UCSC assembly table; it is read through zcat, so it must be
#                in a format zcat can decompress
#   $2 prefix  - chromosome name prefix; empty selects "chr"
#   $3 species - species name, used only in the generated package comment
#   $4 package - name of the generated Go package
#   $5 filter  - grep pattern; table rows matching it are dropped;
#                empty selects "^$", which drops nothing but empty lines
#   $6 nofrags - any non-empty value suppresses the fragment declarations
file=$1
prefix=$2
species=$3
package=$4
filter=$5
nofrags=$6

if [ -z "$file" ]; then
	# Report on stderr and fail with a non-zero status: stdout is where the
	# generated Go source goes, and a bare `exit` would report success.
	echo "Please specify the UCSC assembly table file" >&2
	exit 1
fi

if [ -z "$prefix" ]; then
	prefix="chr"
fi

if [ -z "$filter" ]; then
	filter="^$"
fi

# label is the prefix with its first character upper-cased (e.g. chr -> Chr);
# it is used to build the Go identifiers of the generated variables.
label="$(tr '[:lower:]' '[:upper:]' <<< "${prefix:0:1}")${prefix:1}"
# keyed_rows streams the rows of the table in $file with a sort key prepended.
#
# Comment lines and rows matching $filter are dropped, tabs and runs of
# spaces are collapsed to single spaces, and the first "chr" on each line is
# removed. A copy of the second column (the chromosome name) is then
# prepended as field 1 and rewritten so that a general-numeric sort orders
# the rows: a leading X/Z becomes 1.1e10, Y/W becomes 1.2e10, M becomes
# 1.3e10, names starting with "Un" become 2e10, and a trailing l/L or r/R
# after a number (NNl / NNr) becomes NN / NN.5.
#
# After the key is prepended, the fields consumed below are:
#   $1 sort key, $3 chromosome name, $4 and $5 start/end on the chromosome,
#   $7 type, $8 fragment name, $9 and $10 start/end in the fragment,
#   $11 strand sign.
# NOTE(review): this presumably matches the UCSC assembly ("gold") table
# layout with a leading bin column - confirm against the table schema.
keyed_rows() {
	zcat < "$file" \
		| grep -v '^#' \
		| grep -v "$filter" \
		| sed -e 's/\t/ /g' -e 's/chr//' | tr -s ' ' \
		| awk '{print $2,$0}' \
		| sed -e 's/^[XZ]/1.1e10/' -e 's/^[YW]/1.2e10/' -e 's/^M/1.3e10/' -e's/^Un[^ ]\+/2e10/' \
		| sed -e 's/^\([1-9][0-9]*\)[lL]/\1/' -e 's/^\([1-9][0-9]*\)[rR]/\1.5/'
}

# chromosome_rows emits one row per chromosome, ordered by sort key.
# The first sort puts the row with the largest field 5 first within each
# key; the second sort keeps a single row per (key, name) pair, so that
# field 5 of the surviving row can serve as the chromosome length.
chromosome_rows() {
	keyed_rows \
		| sort -k1,1g -k5rn,5 \
		| sort -k1,1g -k3,3 -u
}

# fragment_rows emits every row, ordered by sort key and chromosome name.
fragment_rows() {
	keyed_rows | sort -k1,1g -k3,3
}

# Everything printed inside the subshell is Go source; it is piped through
# gofmt as a whole, so the layout of the emitted text does not matter.
(
	echo -e "// DO NOT EDIT. This file was autogenerated by parse.assembly\n"
	echo "// Package $package defines chromosome and assembly fragment intervals for the $package genome assembly for $species."
	echo -e "package $package\n"
	echo "import ("
	# The feat package is only referenced by the init function emitted for
	# fragments, so it is imported only when fragments are generated.
	if [ -z "$nofrags" ]; then
		echo -e "\t\"github.com/biogo/biogo/feat\""
	fi
	echo -e "\t\"github.com/biogo/biogo/feat/genome\"\n)\n"

	# chromosomes: one genome.Chromosome variable per chromosome ...
	echo 'var ('
	chromosome_rows \
		| awk -v prefix="$prefix" -v label="$label" '{print "\t"label$3" = genome.Chromosome{Chr: \""prefix$3"\", Desc: \"Chromosome\", Length:",$5"}"}'
	echo -e ')\n'

	# ... and a slice holding pointers to all of them.
	echo 'var Chromosomes = []*genome.Chromosome{'
	chromosome_rows \
		| awk -v label="$label" '{print "\t&"label$3","}'
	echo -e '}\n'

	# fragments
	if [ -z "$nofrags" ]; then
		# One genome.Fragment variable per row, named
		# <label><chromosome>_<fragment>_<start>. The trailing sed turns the
		# first "." left of the "=" into "_" so the name is a valid Go
		# identifier.
		echo 'var ('
		fragment_rows \
			| awk -v prefix="$prefix" -v label="$label" '{print "\t"label$3"_"$8"_"$4" = genome.Fragment{Frag: \""$8"\", Desc: \"Fragment\", Chr: &"label$3", ChrStart:",$4", ChrEnd: "$5", FragStart:",$9", FragEnd: "$10", Type: \x27"$7"\x27, Strand:",$11"1}"}' \
			| sed 's/\.\(.*=\)/_\1/'
		echo -e ')\n'

		# References to the variables declared above. The identifier must be
		# built from the same fields ($3, $8, $4) as the declaration; it used
		# to end in $46, an empty field, which produced undeclared names.
		echo 'var Fragments = []*genome.Fragment{'
		fragment_rows \
			| awk -v prefix="$prefix" -v label="$label" '{print "\t&"label$3"_"$8"_"$4","}' \
			| sed 's/\./_/'
		echo -e '}\n'

		# init: attach every fragment to its chromosome's feature list.
		# The here-document is quoted, so it is emitted verbatim.
		cat << 'END'
//line parse.assembly:99
func init() {
for _, b := range Fragments {
b.Chr.(*genome.Chromosome).Features = append(b.Chr.(*genome.Chromosome).Features, b)
}
for _, c := range Chromosomes {
fc := make([]feat.Feature, len(c.Features))
copy(fc, c.Features)
c.Features = fc
}
}
END
	fi
) | gofmt
|