File: util.go

package info (click to toggle)
golang-github-shenwei356-bio 0.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 852 kB
  • sloc: perl: 114; sh: 58; makefile: 21
file content (99 lines) | stat: -rwxr-xr-x 2,266 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package fastx

import (
	"io"

	"github.com/shenwei356/bio/seq"
)

// GetSeqNames returns the names of all records in a fasta/q file, in the
// order they are read.
//
// It sets the package-level seq.ValidateSeq to false before reading and does
// not restore it afterwards.
//
// An error is returned when the reader cannot be created or when reading a
// record fails; in both cases the returned slice is nil. An input with no
// records yields an empty, non-nil slice.
func GetSeqNames(file string) ([]string, error) {
	names := []string{}
	seq.ValidateSeq = false
	reader, err := NewDefaultReader(file)
	if err != nil {
		// Report the failure to the caller instead of presenting it as an
		// empty, successful result.
		return nil, err
	}
	for {
		record, err := reader.Read()
		if err != nil {
			if err == io.EOF {
				break
			}
			return nil, err
		}
		// string(...) copies the name, so the stored value does not alias the
		// record returned by the reader.
		names = append(names, string(record.Name))
	}
	return names, nil
}

// GetSeqNumber returns the number of records in a FASTA/Q file.
//
// It sets the package-level seq.ValidateSeq to false before reading and does
// not restore it afterwards.
//
// An error is returned when the reader cannot be created or when reading a
// record fails; in both cases the returned count is 0.
func GetSeqNumber(file string) (int, error) {
	n := 0
	seq.ValidateSeq = false
	reader, err := NewDefaultReader(file)
	if err != nil {
		// Report the failure to the caller; returning (0, nil) would make an
		// unreadable file indistinguishable from one with no records.
		return 0, err
	}
	for {
		_, err := reader.Read()
		if err != nil {
			if err == io.EOF {
				break
			}
			return 0, err
		}
		n++
	}
	return n, nil
}

// GetSeqs returns all fastx records of a file.
// When alphabet is nil or seq.Unlimit, it will automatically detect the alphabet.
// When idRegexp is "", the default idRegexp ( ^([^\s]+)\s? ) will be used.
//
// bufferSize and chunkSize are passed unchanged to the reader's ChunkChan.
// If the reader cannot be created, or a chunk carries an error, the records
// collected so far are returned together with that error.
func GetSeqs(file string, alphabet *seq.Alphabet, bufferSize int, chunkSize int, idRegexp string) ([]*Record, error) {
	records := []*Record{}

	reader, err := NewReader(alphabet, file, idRegexp)
	if err != nil {
		return records, err
	}
	for chunk := range reader.ChunkChan(bufferSize, chunkSize) {
		// A read failure is delivered with the chunk itself. The outer err is
		// always nil once the reader has been created, so testing it here
		// would silently drop every read error.
		if chunk.Err != nil {
			return records, chunk.Err
		}

		records = append(records, chunk.Data...)
	}
	return records, nil
}

// GetSeqsMap loads every record of file via GetSeqs and indexes the records
// by name. All arguments are forwarded to GetSeqs unchanged.
//
// When several records share a name, the one read last replaces the earlier
// ones. If GetSeqs fails, an empty (non-nil) map is returned with its error.
func GetSeqsMap(file string, alphabet *seq.Alphabet, bufferSize int, chunkSize int, idRegexp string) (map[string]*Record, error) {
	byName := make(map[string]*Record)

	all, err := GetSeqs(file, alphabet, bufferSize, chunkSize, idRegexp)
	if err != nil {
		return byName, err
	}

	for i := range all {
		rec := all[i]
		byName[string(rec.Name)] = rec
	}
	return byName, nil
}

// GuessAlphabet opens file with the default reader, reads a single record and
// reports the reader's alphabet together with the reader's IsFastq flag.
//
// If the first read hits io.EOF, the reader's alphabet is still returned, the
// flag is false and the error is io.EOF. For any other failure (creating the
// reader or reading the record) the alphabet is nil, the flag is false and
// the underlying error is returned.
func GuessAlphabet(file string) (*seq.Alphabet, bool, error) {
	reader, err := NewDefaultReader(file)
	if err != nil {
		return nil, false, err
	}

	// Only the outcome of the read matters here; the record is discarded.
	switch _, err = reader.Read(); err {
	case nil:
		return reader.Alphabet(), reader.IsFastq, nil
	case io.EOF:
		return reader.Alphabet(), false, io.EOF
	default:
		return nil, false, err
	}
}