File: mask_low_complexity.sh

package info (click to toggle)
kraken2 2.1.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,124 kB
  • sloc: cpp: 4,557; python: 2,014; perl: 1,281; sh: 515; makefile: 69
file content (43 lines) | stat: -rwxr-xr-x 1,209 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/bin/bash

# Copyright 2013-2023, Derrick Wood <dwood@cs.jhu.edu>
#
# This file is part of the Kraken 2 taxonomic sequence classification system.

# Masks low complexity sequences in the database using the dustmasker (nucl.)
# or segmasker (prot.) programs from NCBI.

set -u  # Protect against uninitialized vars.
set -e  # Stop on error
set -o pipefail  # Stop on failures in non-final pipeline commands

target="$1"

MASKER="k2mask"
if [ -n "$KRAKEN2_PROTEIN_DB" ]; then
  MASKER="segmasker"
fi

if ! which $MASKER > /dev/null; then
  echo "Unable to find $MASKER in path, can't mask low-complexity sequences"
  exit 1
fi

if [ -d $target ]; then
  for file in $(find $target '(' -name '*.fna' -o -name '*.faa' ')'); do
    if [ ! -e "$file.masked" ]; then
      $MASKER -in $file -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$file.tmp"
      mv "$file.tmp" $file
      touch "$file.masked"
    fi
  done
elif [ -f $target ]; then
  if [ ! -e "$target.masked" ]; then
    $MASKER -in $target -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$target.tmp"
    mv "$target.tmp" $target
    touch "$target.masked"
  fi
else
  echo "Target $target must be directory or regular file, aborting masking"
  exit 1
fi