File: remove_short_seqs.sh

package info (click to toggle)
cct 20170919%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 333,956 kB
  • sloc: xml: 837,386; perl: 12,630; sh: 1,602; makefile: 17
file content (104 lines) | stat: -rwxr-xr-x 2,728 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/bash -e

PROGNAME=`basename $0`

function usage {
    echo "usage: remove_short_seqs.sh [[-i input] [-l length] | [-h]]"
    echo "-i DIRECTORY the input directory of GenBank files with .gbk extensions"
    echo "-l INTEGER remove GenBank files that describe sequences shorter than this length"
    echo "
USAGE:
   remove_short_seqs.sh -i DIR -l INTEGER

DESCRIPTION:
   Removes GenBank files that are shorter than the specified length from the
   provided directory.

REQUIRED ARGUMENTS:
   -i, --input DIR
      Input directory of GenBank files with .gbk extensions.
   -l, --length INTEGER
      Remove GenBank files that describe sequences shorter than this length.

OPTIONAL ARGUMENTS:
   -h, --help
      Show this message

EXAMPLE:
   remove_short_seqs.sh -i my_project/comparison_genomes -l 100000
"
}

function error_exit {
        echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
        exit 1
}

function remove_trailing_slash {
    string="$1"
    new_string=`echo "$string" | perl -nl -e 's/\/+$//;' -e 'print $_'`
    echo $new_string
}

function get_sequence_length {
    file="$1"
    seq_length=`head -n 1 "$file" | perl -nl -e 'm/(\d+)\sbp/;' -e 'print $1'`
    echo $seq_length
}

while [ "$1" != "" ]; do
    case $1 in
        -i | --input )          shift
                                input=$1
                                ;;
        -l | --length )         shift
                                min_length=$1
                                ;;
        -h | --help )           usage
                                exit
                                ;;
        * )                     usage
                                exit 1
    esac
    shift
done

if [ -z $input ]; then
    error_exit "Please use '-i' to specify an input directory of GenBank files with .gbk extensions. Use '-h' for help."
fi

if [ -z $min_length ]; then
    error_exit "Please use '-l' to specify a minimum sequence length to keep. Use '-h' for help."
fi

cct_home=$CCT_HOME

input=`remove_trailing_slash "$input"`

# save and change IFS to avoid problems with filesnames with spaces
OLDIFS=$IFS
IFS=$'\n'
 
#find all GenBank files in the directory
files=($( find "$input" -maxdepth 1 -type f -name "*.gbk" ))

# restore IFS
IFS=$OLDIFS

length=${#files[@]}
for (( i=0; i<$length; i++ ));
do
    gbk_file=${files[$i]}
    seq_length=`get_sequence_length "$gbk_file"`

    if [ -z $seq_length ]; then
        error_exit "Unable to determine length of sequence '$gbk_file'"
    fi

    if [ "$seq_length" -lt "$min_length" ]; then
	echo "Removing file '$gbk_file' because sequence length is $seq_length"
	rm -f "$gbk_file"
    else
	echo "Keeping file '$gbk_file' because sequence length is $seq_length"
    fi
done