File: CHECK-entries.sh

package info (click to toggle)
resfinder-db 0.0+git20200408.0322c0d-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 3,520 kB
  • sloc: python: 75; sh: 36; makefile: 4
file content (63 lines) | stat: -rwxr-xr-x 2,385 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/bin/sh
#
# CHECK-entries.sh
# Marco van Zwetselaar <zwets@kcri.ac.tz>
#
#  Checks that each sequence in the fsa files has an entry in phenotypes.txt,
#  and that every entry in phenotypes has a corresponding sequence in an fsa.
#
#  Writes to stdout the lists of missing entries, as well as close matches
#  based on allele ID or accession.
#

# Echo all identifiers from the phenotypes.txt file, one per line.
phenotype_ids() {
    cut -f1 phenotypes.txt | tail -n +2 | sed -e 's/ *$//'
}

# Echo all sequence identifiers from the *.fsa, one per line.
sequence_ids() {
    fgrep -h '>' *.fsa | sed -e 's/>\([^ ]*\).*/\1/'
}

# Filter stdin for near matches of seqid $1.  A near match is when either
# the allele ID (without accession) or the accession matches.
near_matches() {
    # Horrid REGEX, but we need to escape special chars and split the SEQID
    local REGEX="$(echo "$1" |
        sed -e 's,\([][().+?|*]\),\\\1,g' \
            -e 's,^\([^_]*_[^_]*\)\(_.*\),^\1|\2$,')"
    grep -E "$REGEX" | tr '\n' ' '
}


printf "
===============================================================================
I. Entries in phenotypes.txt with trailing whitespace in their identifier (col 1).
   (This whitespace breaks simple key based lookups.)
-------------------------------------------------------------------------------\n"
cut -f1 phenotypes.txt | tail -n +2 | fgrep ' '


printf "
===============================================================================
II. Entries in phenotypes.txt with no matching sequence in an *.fsa file.
    Second column lists close matches (having identical alleleID or accession).
-------------------------------------------------------------------------------\n"
phenotype_ids | while read SEQID; do
    sequence_ids | fgrep -xq "${SEQID}" ||
        printf "$SEQID\t%s\n" "$(sequence_ids | near_matches "$SEQID")"
done


printf "
===============================================================================
III. Sequences in *.fsa that have no corresponding entry in phenotypes.txt.
     Second column lists close matches (having identical alleleID or accession).
-------------------------------------------------------------------------------\n"
sequence_ids | while read SEQID; do
    phenotype_ids | fgrep -xq "${SEQID}" ||
        printf "$SEQID\t%s\n" "$(phenotype_ids | near_matches "$SEQID")"
done

# vim: sts=4:sw=4:si:et:ai