File: transcript-align.sh

package info (click to toggle)
libgoby-java 3.3.1%2Bdfsg2-11
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 58,108 kB
  • sloc: java: 78,105; cpp: 5,011; xml: 3,170; python: 2,108; sh: 1,575; ansic: 277; makefile: 114
file content (358 lines) | stat: -rw-r--r-- 12,668 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
#!/bin/bash -x

##########################################################################
##########################################################################
## Copyright (C) 2009-2010 Institute for Computational Biomedicine,
##                         Weill Medical College of Cornell University
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 3 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License
##  along with this program.  If not, see <http://www.gnu.org/licenses/>.
##########################################################################
##########################################################################

#
# This script will help you
# 1. Index a reference for a Transcript alignment
# 2. Align a reads file against that reference
#

#
# NOTE:
# The transcript-ids and gene-ids need to be embedded into the Fasta reference
# in the header/comment lines. It is assumed that the comment / header line for
# each sequence will be in the format ">TRANSCRIPT_ID IGNORED_DATA gene:GENE_ID".
# For example,
# ">ENST00000400845 cdna:known supercontig::GL000222.1:11792:16670:1 gene:ENSG00000215756"
# speciefies a transcript-id of "ENST00000400845" and a gene-id of "ENSG00000215756".
#

##
##  Edit the following items...
##
TAG=MeaningfulTag
COMPACT_READS_FILE_TO_ALIGN=your_reads_file.compact-reads
REFERENCE_FASTA_GZ=Homo_sapiens.GRCh37.57.cdna.all.fa.gz
VERSION=GRCh37.57
ORGANISM=homo_sapiens
JVM_FLAGS=-Xmx8g

#
# The aligner to use
#
ALIGNER=bwa
ALIGNER_OPTIONS=
QUALITY_FILTER_PARAMETERS="threshold=0.05"
AMBIGUITY_THRESHOLD=2

#
# When splitting the input READS file, how big to make each chunk. In bytes.
#
READS_SPLIT_CHUNK_SIZE=100000000

#
# Should be basespace or colorspace... depends on the reads you will be aligning
#
SPACE=basespace


############################################################
############################################################
##                                                        ##
##  You shouldn't have to change anything below here.     ##
##                                                        ##
############################################################
############################################################

function removeDir {
    if [ -d $1 ]; then
        rm -rf $1
    fi
}

function calculate_PAD_FORMAT {
    _NUMBER=$1
    _NUMBER_TO_PAD=0
    while [ ${_NUMBER} -gt 10 ]; do
        _NUMBER_TO_PAD=$(( _NUMBER_TO_PAD + 1 ))
        _NUMBER=$(( _NUMBER / 10 ))
    done
    if [ ${_NUMBER_TO_PAD} -eq 0 ]; then
        PAD_FORMAT=%d
    else
        PAD_FORMAT=%0$(( _NUMBER_TO_PAD + 1 ))d
    fi
}

function cleanup {
    # Cleanup failed previous attempts for this tag
    removeDir ${SPLIT_INDEX_DIR}
    removeDir ${SPLIT_ALIGN_DIR}
}

function init {
    echo "########################################################"
    echo "###  init"
    echo "########################################################"
    SPACE_PARAM=""
    if [ $SPACE = "colorspace" ]; then
        SPACE_PARAM="--color-space"
    fi
    
    INDEX_PREFIX=index
    CURRENT_DIR=`pwd`
    OUTPUT_BASE_DIR=${CURRENT_DIR}/reference-db
    SPLIT_WORK_DIR=${CURRENT_DIR}/work-${TAG}
    SPLIT_INDEX_DIR=${SPLIT_WORK_DIR}-index
    SPLIT_ALIGN_DIR=${SPLIT_WORK_DIR}-align
    REFERENCE_FASTA_GZ=${CURRENT_DIR}/${REFERENCE_FASTA_GZ}
    GOBY_JAR=${CURRENT_DIR}/goby.jar
    LOG4J_CONFIG=${CURRENT_DIR}/log4j.properties
    GOBY_CONFIG=${CURRENT_DIR}/goby.properties
    
    #
    # Rename version since this is a transcript index
    #
    VERSION=Transcript-${VERSION}
    
    #
    # Directory where we store the compact reference and toplevel-ids file.
    #
    INPUT_COMPACT_REF_DIR=${OUTPUT_BASE_DIR}/${VERSION}/${ORGANISM}/reference
    INDEXED_REF_DIR=${OUTPUT_BASE_DIR}/${VERSION}/${ORGANISM}/${SPACE}/${ALIGNER}

    # How many parts to split the READS file into
    READS_FILE_SIZE=$(stat -c%s "${COMPACT_READS_FILE_TO_ALIGN}")
    NUMBER_OF_ALIGN_PARTS=$((READS_FILE_SIZE / READS_SPLIT_CHUNK_SIZE))
    if [ ${READS_FILE_SIZE} -gt $(( NUMBER_OF_ALIGN_PARTS * READS_SPLIT_CHUNK_SIZE ))  ]; then
         NUMBER_OF_ALIGN_PARTS=$(( NUMBER_OF_ALIGN_PARTS + 1 ))
    fi

    PRE_RESULTS_DIR=${SPLIT_ALIGN_DIR}/pre-results
}

function indexReference {
    echo "########################################################"
    echo "###  indexReference"
    echo "########################################################"
    #
    # See if it is likely that the index has already been built. If so,
    # don't rebuild it.
    #
    if [ ! -f "${INPUT_COMPACT_REF_DIR}/gene-transcript-map.txt" ]; then
    
        removeDir ${INPUT_COMPACT_REF_DIR}
        removeDir ${INDEXED_REF_DIR}
    
        mkdir -p ${INPUT_COMPACT_REF_DIR}/
    
        #
        # Split the fasta by transcript
        #
        mkdir -p ${SPLIT_INDEX_DIR}
    
        java ${JVM_FLAGS} \
            -Dlog4j.configuration=file:${LOG4J_CONFIG} \
            -Dgoby.configuration=file:${GOBY_CONFIG} \
            -jar ${GOBY_JAR} --mode split-transcripts --input ${REFERENCE_FASTA_GZ} \
            --output ${SPLIT_INDEX_DIR}/${INDEX_PREFIX}
        RETURN_STATUS=$?
        if [ ! $RETURN_STATUS -eq 0 ];then
            echo "Split reference by transcript failed"
            exit
        fi
        mv ${SPLIT_INDEX_DIR}/${INDEX_PREFIX}.config ${INPUT_COMPACT_REF_DIR}/gene-transcript-map.txt
    
        cd ${SPLIT_INDEX_DIR}
        java ${JVM_FLAGS} \
            -Dlog4j.configuration=file:${LOG4J_CONFIG} \
            -Dgoby.configuration=file:${GOBY_CONFIG} \
            -jar ${GOBY_JAR} --mode fasta-to-compact -n 1 --include-identifiers *.fa.gz
        RETURN_STATUS=$?
        if [ ! $RETURN_STATUS -eq 0 ];then
            echo "Conversion of plit reference by transcript files to .compact-reads files failed."
            exit
        fi
        mv *.compact-reads ${INPUT_COMPACT_REF_DIR}
        rm *.fa.gz
        cd ${CURRENT_DIR}
    
        cp ${CURRENT_DIR}/goby.properties ${SPLIT_INDEX_DIR}
    
        for SUB_COMPACT_READS in ${INPUT_COMPACT_REF_DIR}/*.compact-reads
        do
            SUB_INDEX_PREFIX=`basename ${SUB_COMPACT_READS}`
            SUB_INDEX_PREFIX=${SUB_INDEX_PREFIX%\.*}T
            echo "Creating index ${ALIGNER} / ${SPACE} / ${SUB_INDEX_PREFIX} ..."
            cd ${SPLIT_INDEX_DIR}
            java ${JVM_FLAGS} \
                -Dlog4j.configuration=file:${LOG4J_CONFIG} \
                -Dgoby.configuration=file:${GOBY_CONFIG} \
                -jar ${GOBY_JAR} --mode align --index --reference ${SUB_COMPACT_READS} ${SPACE_PARAM} \
                --aligner ${ALIGNER} --database-name \
                ${INDEXED_REF_DIR}/${SUB_INDEX_PREFIX}
            RETURN_STATUS=$?
            if [ ! $RETURN_STATUS -eq 0 ];then
                echo "Indexing of transcript reference failed for ${SUB_COMPACT_READS}"
                exit
            fi
            cd ${CURRENT_DIR}
        done
    fi
}

#
# Split the reads
# Figure out how many transcript parts there were
#
function splitReads {
    echo "########################################################"
    echo "###  splitReads"
    echo "########################################################"
    NUMBER_OF_TRANSCRIPT_PARTS=`ls ${INPUT_COMPACT_REF_DIR}/${INDEX_PREFIX}.*.compact-reads | wc -l`

    #
    # Split the reads
    #  
    mkdir -p ${SPLIT_ALIGN_DIR}
    if [ ${NUMBER_OF_ALIGN_PARTS} -eq 1 ]; then
        cp ${COMPACT_READS_FILE_TO_ALIGN} ${SPLIT_ALIGN_DIR}/0.compact-reads
    else
        for ((i = 0; i < ${NUMBER_OF_ALIGN_PARTS}; i++));
        do
            READS_FILE=${SPLIT_ALIGN_DIR}/${i}.compact-reads
    
            START_POSITION=$(( i *  READS_SPLIT_CHUNK_SIZE ))
            END_POSITION=$(( START_POSITION +  READS_SPLIT_CHUNK_SIZE - 1 ))
    
            java ${JVM_FLAGS} \
                -Dlog4j.configuration=file:${LOG4J_CONFIG} \
                -Dgoby.configuration=file:${GOBY_CONFIG} \
                -jar ${GOBY_JAR} --mode reformat-compact-reads --output ${READS_FILE} \
                --start-position ${START_POSITION} --end-position ${END_POSITION} ${COMPACT_READS_FILE_TO_ALIGN}
            RETURN_STATUS=$?
            if [ ! $RETURN_STATUS -eq 0 ];then
                echo "Spliting the input reads file failed."
                exit
            fi
        done
    fi
}

#
# Align the split reads
#
function alignSplitReads {
    echo "########################################################"
    echo "###  alignSplitReads"
    echo "########################################################"
    calculate_PAD_FORMAT ${NUMBER_OF_TRANSCRIPT_PARTS}
    for ((READS_NUM = 0; READS_NUM < NUMBER_OF_ALIGN_PARTS; READS_NUM++));
    do
        READS_FILE=${SPLIT_ALIGN_DIR}/${READS_NUM}.compact-reads
        for ((TRANSCRIPT_NUMBER = 0; TRANSCRIPT_NUMBER < NUMBER_OF_TRANSCRIPT_PARTS; TRANSCRIPT_NUMBER++));
        do
            REF_INDEX_PREFIX=`printf ${INDEX_PREFIX}.${PAD_FORMAT} ${TRANSCRIPT_NUMBER}`
            REFERENCE_INDEX_FILE=${INPUT_COMPACT_REF_DIR}/${REF_INDEX_PREFIX}.compact-reads        
            java ${JVM_FLAGS} \
                -Dlog4j.configuration=file:${LOG4J_CONFIG} \
                -Dgoby.configuration=file:${GOBY_CONFIG} \
                -jar ${GOBY_JAR} \
                --mode align \
                --reference ${REFERENCE_INDEX_FILE} \
                --aligner ${ALIGNER} ${SPACE_PARAM} --search \
                --ambiguity-threshold ${AMBIGUITY_THRESHOLD} --quality-filter-parameters "${QUALITY_FILTER_PARAMETERS}" \
                --database-name ${REF_INDEX_PREFIX}T --database-directory ${INDEXED_REF_DIR} \
                ${ALIGNER_OPTIONS} --reads ${READS_FILE} --basename ${TAG}
             RETURN_STATUS=$?
             rm ${READS_NUM}.fasta ${READS_NUM}.fastq
             if [ ! $RETURN_STATUS -eq 0 ];then
                 echo "Alignment failed"
                 exit
             fi
    
             # Alignment completed for this part
             RESULT_DIR=${PRE_RESULTS_DIR}/transcripts-${READS_NUM}-${TRANSCRIPT_NUMBER}
             /bin/mkdir -p ${RESULT_DIR}
             /bin/mv *.entries *.header *.stats *.tmh ${RESULT_DIR}
        done
    done
}

#
# Merge the results
#
function merge {
    echo "########################################################"
    echo "###  merge"
    echo "########################################################"
    GENE_TRANSCRIPT_MAP_FILE=${INPUT_COMPACT_REF_DIR}/gene-transcript-map.txt
    for ((READS_NUM = 0; READS_NUM < NUMBER_OF_ALIGN_PARTS; READS_NUM++));
    do
        if [ ${NUMBER_OF_ALIGN_PARTS} -eq 1 ]; then
            OUTPUT_DIR=${CURRENT_DIR}/results-${TAG}
        else
            OUTPUT_DIR=${PRE_RESULTS_DIR}/${TAG}-${READS_NUM}
        fi
        mkdir -p ${OUTPUT_DIR}
        java ${JVM_FLAGS} \
            -Dlog4j.configuration=file:${LOG4J_CONFIG} \
            -Dgoby.configuration=file:${GOBY_CONFIG} \
            -jar ${GOBY_JAR} \
            --mode merge-compact-alignments \
            --gene-transcript-map-file ${GENE_TRANSCRIPT_MAP_FILE} \
            --output ${OUTPUT_PREFIX}/${TAG} \
            ${PRE_RESULTS_DIR}/transcripts-${READS_NUM}-*/*.entries
         RETURN_STATUS=$?
         if [ ! $RETURN_STATUS -eq 0 ];then
             echo "Merge failed"
             exit
         fi
    done
}

#
# Concat
#
function concat {
    echo "########################################################"
    echo "###  concat"
    echo "########################################################"
    if [ ${NUMBER_OF_ALIGN_PARTS} -gt 1 ];then
        RESULT_DIR=${CURRENT_DIR}/results-${TAG}
        mkdir -p ${RESULT_DIR}
        java ${JVM_FLAGS} \
            -Dlog4j.configuration=file:${LOG4J_CONFIG} \
            -Dgoby.configuration=file:${GOBY_CONFIG} \
            -jar ${GOBY_JAR} \
            --mode concatenate-alignments --adjust-query-indices false \
            --output ${RESULT_DIR}/${TAG} ${PRE_RESULTS_DIR}/${TAG}-*/*.entries
        RETURN_STATUS=$?
        if [ ! $RETURN_STATUS -eq 0 ];then
            echo "Concat failed"
            exit
        fi
    
    fi
}

#
# Execute the transcript alignment
#
cleanup
init
indexReference
splitReads
alignSplitReads
merge
concat
cleanup