1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
|
#!/bin/bash
###########################################################
#################### DEFAULT VALUES #######################
###########################################################
version="1.2.6"
read_sets="" # FOR instance: "read_set1.fa.gz read_set2.fq.gz"
prefix="discoRes" # all temp and final files will be written will start with this prefix
k=31 # size of kmers
b=0 # smart branching approach: bubbles in which both paths are equaly branching are discarded, all others are accepted
c=4 # minimal coverage
d=1 # estimated number of error per read (used by kissreads only)
g=10000000 # estimated genome size. Used only to control kissnp2 memory usage. e.g. 3 billion (3000000000) uses 4Gb of RAM.
PATH_RS="./tools/" # path were executables kissnp2 and kissreads are. Leave blank if they are located in a directory located in the PATH environnement variable
remove=1 # by default: remove waste files
#######################################################################
#################### END HEADER #######################
#######################################################################
function help {
echo "run_discoSnp.sh, a pipelining kissnp2 and kissreads for calling SNPs from NGS reads without the need of a reference genome"
echo "Version "$version
echo "Usage: ./run_discoSnp.sh OPT"
echo -e "\tMANDATORY:"
echo -e "\t \t -r list of reads separated by space, surrounded by the '\"' character. Note that reads may be in fasta or fastq format, gzipped or not."
echo -e "\t \t Example: -r \"data_sample/reads_sequence1.fasta data_sample/reads_sequence2.fasta.gz\"."
echo -e "\tOPT:"
echo -e "\t\t -b value. "
echo -e "\t\t\t 0: forbid SNPs for wich any of the two paths is branching (high precision, lowers the recal in complex genomes). Default value"
echo -e "\t\t\t 1: (smart branching) forbid SNPs for wich the two paths are branching (e.g. the two paths can be created either with a 'A' or a 'C' at the same position"
echo -e "\t\t\t 2: No limitation on branching (lowers the precision, high recall)"
echo -e "\t\t -p prefix. All out files will start with this prefix. Example: -p my_prefix"
echo -e "\t\t -k value. Set the length of used kmers. Must fit the compiled value. Default=31. Example -k 31"
echo -e "\t\t -c value. Set the minimal coverage: Used by kissnp2 (don't use kmers with lower coverage) and kissreads (read coherency threshold). Default=4. Example -c 4"
echo -e "\t\t -d value. Set the number of authorized substitutions used while mapping reads on found SNPs (kissreads). Default=1. Example: -d 1"
echo -e "\t\t -g value. Estimated genome size. Used only to control kissnp2 memory usage. e.g. 3 billion (3000000000) uses 4Gb of RAM. Default=10 million. Example: -d 10000000"
echo -e "\t\t -w: conserve all the intermediate files (waste)"
echo -e "\t\t -h: Prints this message and exist"
echo "Any further question: read the readme file or contact us: pierre.peterlongo@inria.fr"
}
#######################################################################
#################### GET OPTIONS #######################
#######################################################################
while getopts ":r:p:k:c:d:g:b:hw" opt; do
case $opt in
w)
remove=0
;;
h)
help
exit
;;
r)
echo "use read set: $OPTARG" >&2
read_sets=$OPTARG
;;
b)
echo "use branching strategy: $OPTARG" >&2
b=$OPTARG
;;
p)
echo "use prefix=$OPTARG" >&2
prefix=$OPTARG
;;
k)
echo "use k=$OPTARG" >&2
k=$OPTARG
;;
c)
echo "use c=$OPTARG" >&2
c=$OPTARG
;;
d)
echo "use d=$OPTARG" >&2
d=$OPTARG
;;
g)
echo "use g=$OPTARG" >&2
g=$OPTARG
;;
\?)
echo "Invalid option: -$OPTARG" >&2
exit 1
;;
:)
echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
#######################################################################
#################### END GET OPTIONS #######################
#######################################################################
if [ -z "$read_sets" ]; then
echo "You must provide at least one read set (-r)"
help
exit
fi
#######################################################################
#################### OPTIONS SUMMARY #######################
#######################################################################
MY_PATH="`( cd \"$MY_PATH\" && pwd )`" # absolutized and normalized
if [ -z "$MY_PATH" ] ; then
# error; for some reason, the path is not accessible
# to the script (e.g. permissions re-evaled after suid)
exit 1 # fail
fi
echo -e "\tRunning discoSnp "$version", in directory "$MY_PATH" with following parameters:"
echo -e "\t\t read_sets="$read_sets
echo -e "\t\t prefix="$prefix
echo -e "\t\t c="$c
echo -e "\t\t k="$k
echo -e "\t\t b="$b
echo -e "\t\t d="$d
echo -e "\t\t g="$g
echo -e -n "\t starting date="
date
echo
#######################################################################
#################### END OPTIONS SUMMARY #######################
#######################################################################
######### CHECK THE k PARITY ##########
rest=$(( $k % 2 ))
if [ $rest -eq 0 ]
then
echo "k=$k is even number, to avoid palindromes, we set it to $(($k-1))"
k=$(($k-1))
fi
#######################################
#######################################################################
#################### KISSNP2 #######################
#######################################################################
echo -e "\t############################################################"
echo -e "\t#################### KISSNP2 MODULE #######################"
echo -e "\t############################################################"
$PATH_RS\kissnp2 $read_sets -T -k $k -c $c -g $g -o $prefix -l -b $b # see also options for kissnp with kissnp -h (filtration with max occurrence numbers (-C), filtration with respect to size of contig (-e), possibility to output only unitig (-t instead of -T), ...)
if [ $? -ne 0 ]
then
echo "there was a problem with kissnp2, command line: $PATH_RS\kissnp2 $read_sets -t -k $k -c $c -g $g -o $prefix"
exit
fi
#######################################################################
#################### KISSREADS #######################
#######################################################################
echo
echo -e "\t#############################################################"
echo -e "\t#################### KISSREADS MODULE #######################"
echo -e "\t#############################################################"
i=4 #avoid modidy this (or increase this if memory needed by kissread is too high. Min 1. Large i (7-10) decreases memory and increases time).
smallk=$(($k-$i-1)) # DON'T modify this.
$PATH_RS\kissreads $prefix\_k_$k\_c_$c.fa $read_sets -k $smallk -i $i -O $k -c $c -d $d -n -o $prefix\_k_$k\_c_$c\_coherent -u $prefix\_k_$k\_c_$c\_uncoherent #no need to see kissreads option in theory.
if [ $? -ne 0 ]
then
echo "there was a problem with kissnp2, command line: $PATH_RS\kissreads $prefix\_k_$k\_c_$c.fa $read_sets -k $smallk -i $i -O $k -c $c -d $d -n -o $prefix\_k_$k\_c_$c\_coherent -u $prefix\_k_$k\_c_$c\_uncoherent "
exit
fi
echo -e "\t###############################################################"
echo -e "\t#################### DISCOSNPs FINISHED #######################"
echo -e "\t###############################################################"
#######################################################################
#################### SORT AND FORMAT COHERENT RESULTS #################
#######################################################################
sort -rg $prefix\_k_$k\_c_$c\_coherent | cut -d " " -f 2 | tr ';' '\n' > $prefix\_k_$k\_c_$c\_coherent.fa
if [ $? -ne 0 ]
then
echo "there was a problem with the result sorting, command line: sort -rg $prefix\_k_$k\_c_$c\_coherent | cut -d " " -f 2 | tr ';' '\n' > $prefix\_k_$k\_c_$c\_coherent.fa"
exit
fi
#######################################################################
#################### REMOVE USELESS FILES #################
#######################################################################
if [ $remove -eq 1 ]
then
echo "I removed " $prefix\_k_$k\_c_$c."*" $prefix\_k_$k\_c_$c\_coherent $prefix\_k_$k\_c_$c\_uncoherent
rm -f $prefix\_k_$k\_c_$c.* $prefix\_k_$k\_c_$c\_coherent $prefix\_k_$k\_c_$c\_uncoherent
fi
echo -e -n "\t ending date="
date
echo -e "\t SNPs are stored in \""$prefix\_k_$k\_c_$c\_coherent.fa"\""
echo -e "\t Thanks for using discoSnp - http://colibread.inria.fr/discoSnp/"
|