File: run_discoSnp.sh

package info (click to toggle)
discosnp 1.2.6-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 2,024 kB
  • ctags: 2,430
  • sloc: cpp: 20,140; ansic: 3,300; asm: 271; sh: 185; makefile: 152; python: 104
file content (222 lines) | stat: -rwxr-xr-x 8,549 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#!/bin/bash

###########################################################
#################### DEFAULT VALUES #######################
###########################################################
version="1.2.6"
read_sets="" # FOR instance: "read_set1.fa.gz read_set2.fq.gz"
prefix="discoRes" # all temp and final files will be written will start with this prefix
k=31 # size of kmers
b=0 # smart branching approach: bubbles in which both paths are equaly branching are  discarded, all others are accepted
c=4 # minimal coverage
d=1 # estimated number of error per read (used by kissreads only)
g=10000000 # estimated genome size. Used only to control kissnp2 memory usage. e.g. 3 billion (3000000000) uses 4Gb of RAM.
PATH_RS="./tools/" # path were executables kissnp2 and kissreads are. Leave blank if they are located in a directory located in the PATH environnement variable
remove=1 # by default: remove waste files
#######################################################################
#################### END HEADER                 #######################
#######################################################################

function help {
echo "run_discoSnp.sh, a pipelining kissnp2 and kissreads for calling SNPs from NGS reads without the need of a reference genome"
echo "Version "$version
echo "Usage: ./run_discoSnp.sh OPT"
echo -e "\tMANDATORY:"
echo -e "\t \t -r list of reads separated by space, surrounded by the '\"' character. Note that reads may be in fasta or fastq format, gzipped or not."
echo -e "\t \t    Example: -r \"data_sample/reads_sequence1.fasta   data_sample/reads_sequence2.fasta.gz\"."
echo -e "\tOPT:"
echo -e "\t\t -b value. "
echo -e "\t\t\t 0: forbid SNPs for wich any of the two paths is branching (high precision, lowers the recal in complex genomes). Default value"
echo -e "\t\t\t 1: (smart branching) forbid SNPs for wich the two paths are branching (e.g. the two paths can be created either with a 'A' or a 'C' at the same position"
echo -e "\t\t\t 2: No limitation on branching (lowers the precision, high recall)"
echo -e "\t\t -p prefix. All out files will start with this prefix. Example: -p my_prefix"
echo -e "\t\t -k value. Set the length of used kmers. Must fit the compiled value. Default=31. Example -k 31"
echo -e "\t\t -c value. Set the minimal coverage: Used by kissnp2 (don't use kmers with lower coverage) and kissreads (read coherency threshold). Default=4. Example -c 4"
echo -e "\t\t -d value. Set the number of authorized substitutions used while mapping reads on found SNPs (kissreads). Default=1. Example: -d 1"
echo -e "\t\t -g value. Estimated genome size. Used only to control kissnp2 memory usage. e.g. 3 billion (3000000000) uses 4Gb of RAM. Default=10 million. Example: -d 10000000"
echo -e "\t\t -w: conserve all the intermediate files (waste)"
echo -e "\t\t -h: Prints this message and exist"
echo "Any further question: read the readme file or contact us: pierre.peterlongo@inria.fr"
}


#######################################################################
#################### GET OPTIONS                #######################
#######################################################################
while getopts ":r:p:k:c:d:g:b:hw" opt; do
case $opt in

	w)
	remove=0
	;;
	
h)
help
exit
;;

r)
echo "use read set: $OPTARG" >&2
read_sets=$OPTARG
;;

b)
echo "use branching strategy: $OPTARG" >&2
b=$OPTARG
;;

p)
echo "use prefix=$OPTARG" >&2
prefix=$OPTARG
;;

k)
echo "use k=$OPTARG" >&2
k=$OPTARG
;;

c)
echo "use c=$OPTARG" >&2
c=$OPTARG
;;

d)
echo "use d=$OPTARG" >&2
d=$OPTARG
;;

g)
echo "use g=$OPTARG" >&2
g=$OPTARG
;;

\?)
echo "Invalid option: -$OPTARG" >&2
exit 1
;;

:)
echo "Option -$OPTARG requires an argument." >&2
exit 1
;;
esac
done
#######################################################################
#################### END GET OPTIONS            #######################
#######################################################################

if [ -z "$read_sets" ]; then
	echo "You must provide at least one read set (-r)"
help
exit
fi
	


#######################################################################
#################### OPTIONS SUMMARY            #######################
#######################################################################
MY_PATH="`( cd \"$MY_PATH\" && pwd )`"  # absolutized and normalized
if [ -z "$MY_PATH" ] ; then
# error; for some reason, the path is not accessible
# to the script (e.g. permissions re-evaled after suid)
exit 1  # fail
fi
echo -e "\tRunning discoSnp "$version", in directory "$MY_PATH" with following parameters:"
echo -e "\t\t read_sets="$read_sets
echo -e "\t\t prefix="$prefix
echo -e "\t\t c="$c
echo -e "\t\t k="$k
echo -e "\t\t b="$b
echo -e "\t\t d="$d
echo -e "\t\t g="$g
echo -e -n "\t starting date="
date
echo
#######################################################################
#################### END OPTIONS SUMMARY        #######################
#######################################################################






######### CHECK THE k PARITY ##########
rest=$(( $k % 2 ))
if [ $rest -eq 0 ]
then
echo "k=$k is even number, to avoid palindromes, we set it to $(($k-1))"
k=$(($k-1))
fi
#######################################


#######################################################################
#################### KISSNP2                    #######################
#######################################################################

echo -e "\t############################################################"
echo -e "\t#################### KISSNP2 MODULE  #######################"
echo -e "\t############################################################"

$PATH_RS\kissnp2 $read_sets -T -k $k -c $c  -g $g -o $prefix -l -b $b # see also options for kissnp with kissnp -h (filtration with max occurrence numbers (-C), filtration with respect to size of contig (-e), possibility to output only unitig (-t instead of -T), ...)

if [ $? -ne 0 ]
then
    echo "there was a problem with kissnp2, command line: $PATH_RS\kissnp2 $read_sets -t -k $k -c $c  -g $g -o $prefix"
    exit
fi


#######################################################################
#################### KISSREADS                  #######################
#######################################################################
echo
echo -e "\t#############################################################"
echo -e "\t#################### KISSREADS MODULE #######################"
echo -e "\t#############################################################"

i=4 #avoid modidy this (or increase this if memory needed by kissread is too high. Min 1. Large i (7-10) decreases memory and increases time).
smallk=$(($k-$i-1)) # DON'T modify this.


$PATH_RS\kissreads $prefix\_k_$k\_c_$c.fa $read_sets -k $smallk -i $i -O $k -c $c -d $d -n  -o $prefix\_k_$k\_c_$c\_coherent -u $prefix\_k_$k\_c_$c\_uncoherent  #no need to see kissreads option in theory.
if [ $? -ne 0 ]
then
echo "there was a problem with kissnp2, command line: $PATH_RS\kissreads $prefix\_k_$k\_c_$c.fa $read_sets -k $smallk -i $i -O $k -c $c -d $d -n  -o $prefix\_k_$k\_c_$c\_coherent -u $prefix\_k_$k\_c_$c\_uncoherent "
exit
fi



echo -e "\t###############################################################"
echo -e "\t#################### DISCOSNPs FINISHED #######################"
echo -e "\t###############################################################"
#######################################################################
#################### SORT AND FORMAT COHERENT RESULTS #################
#######################################################################
sort -rg $prefix\_k_$k\_c_$c\_coherent | cut -d " " -f 2 | tr ';' '\n' > $prefix\_k_$k\_c_$c\_coherent.fa
if [ $? -ne 0 ]
then
echo "there was a problem with the result sorting, command line: sort -rg $prefix\_k_$k\_c_$c\_coherent | cut -d " " -f 2 | tr ';' '\n' > $prefix\_k_$k\_c_$c\_coherent.fa"
exit
fi


#######################################################################
#################### REMOVE USELESS FILES             #################
#######################################################################
if [ $remove -eq 1 ]
then
	echo "I removed " $prefix\_k_$k\_c_$c."*" $prefix\_k_$k\_c_$c\_coherent $prefix\_k_$k\_c_$c\_uncoherent
rm -f $prefix\_k_$k\_c_$c.* $prefix\_k_$k\_c_$c\_coherent $prefix\_k_$k\_c_$c\_uncoherent
fi

echo -e -n "\t ending date="
date
echo -e "\t SNPs are stored in \""$prefix\_k_$k\_c_$c\_coherent.fa"\""
echo -e "\t Thanks for using discoSnp - http://colibread.inria.fr/discoSnp/"