1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
|
// read genome files
package ggd_utils
import (
"bytes"
"fmt"
"io"
"log"
"regexp"
"strconv"
"strings"
"github.com/shenwei356/xopen"
)
// GenomeFile has the Lengths and Orders of the chromosomes.
type GenomeFile struct {
Lengths map[string]int
Order map[string]int
path string
ReMap map[string]string
}
// Less checks if one chromosome occurs before the other.
func (g *GenomeFile) Less(a, b string) bool {
return g.Order[a] <= g.Order[b]
}
func readChromosomMappings(fname string) map[string]string {
if fname == "" {
return nil
}
rdr, err := xopen.Ropen(fname)
result := make(map[string]string)
if err != nil {
log.Fatalf("[gsort] unable to open chromosome maping file: %s", fname)
}
warned := false
for {
line, err := rdr.ReadString('\n')
if len(line) > 0 {
toks := strings.Split(strings.TrimSpace(line), "\t")
if len(toks) == 1 {
toks = append(toks, "[unknown]"+toks[0])
if !warned {
log.Printf("[gsort] warning unmappable chromosome: %s.", toks[0])
warned = true
}
}
result[toks[0]] = toks[1]
}
if err == io.EOF {
break
}
if err != nil {
log.Printf("[gsort] error reading chromosome maping file: %s. %s", fname, err)
}
}
return result
}
// ReadGenomeFile returns a GenomeFile struct
func ReadGenomeFile(path string, chromsomeMappings string) (*GenomeFile, error) {
rdr, err := xopen.Ropen(path)
if err != nil {
return nil, err
}
gf := &GenomeFile{path: path, Lengths: make(map[string]int, 50), Order: make(map[string]int, 50)}
defer rdr.Close()
gf.ReMap = readChromosomMappings(chromsomeMappings)
space := regexp.MustCompile("\\s+")
// allow us to bypass a header. found indicates when we have a usable line.
found := false
for {
line, err := rdr.ReadBytes('\n')
line = bytes.TrimSpace(line)
if len(line) > 0 {
if line[0] == '#' {
continue
}
}
if err == io.EOF {
break
}
if err != nil {
return nil, err
}
if len(line) == 0 {
continue
}
toks := space.Split(string(line), -1)
chrom := toks[0]
length, err := strconv.Atoi(toks[1])
if err != nil {
if !found {
continue
}
return nil, err
}
found = true
gf.Lengths[chrom] = length
gf.Order[chrom] = len(gf.Order)
}
if !found {
return nil, fmt.Errorf("no usable lengths found for %s\n", path)
}
return gf, nil
}
|