File: compare-graphs.sh

package info (click to toggle)
vg 1.30.0+ds-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 267,848 kB
  • sloc: cpp: 446,974; ansic: 116,148; python: 22,805; cs: 17,888; javascript: 11,031; sh: 5,866; makefile: 4,039; java: 1,415; perl: 1,303; xml: 442; lisp: 242
file content (281 lines) | stat: -rwxr-xr-x 9,253 bytes parent folder | download
#!/usr/bin/env bash
# compare-graphs.sh: compare a set of graphs against each other using toil-vg mapeval on AWS

set -ex

# What toil-vg should we install?
TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@bf1006b4932ce48d1bd742691619808285582c4c"

# What Toil should we use?
TOIL_APPLIANCE_SELF=quay.io/ucsc_cgl/toil:3.16.0a1.dev2281-c7d77b028064a739e897f7b1eb158c902b530475

# What vg should we use?
# (Empty by default; the -v option replaces this with a "--vg_docker IMAGE"
# string. It is only ever expanded unquoted, so the empty array and the flat
# string both splice cleanly into the toil-vg command line.)
VG_DOCKER_OPTS=()

# How many nodes should we use at most?
MAX_NODES=6

# What's our unique run ID? Should be lower-case and start with a letter for maximum compatibility.
# See <https://gist.github.com/earthgecko/3089509>
# Read /dev/urandom directly (no useless cat) and take the first 32 kept
# characters; "head -c 32" replaces the old "fold -w 32 | head -n 1" pair.
RUN_ID="run$(LC_CTYPE=C tr -dc 'a-z0-9' < /dev/urandom | head -c 32)"

# What cluster should we use?
CLUSTER_NAME="${RUN_ID}"
MANAGE_CLUSTER=1

# Should we delete the job store when we exit?
# We do by default, and if the run finishes successfully.
# We don't if we're asked to keep it and Toil errors out.
REMOVE_JOBSTORE=1

# Should we delete the outstore at the end of the script, if we're deleting the
# jobstore?
REMOVE_OUTSTORE=1

# What input reads and position truth set should we use?
READ_STEM="comparison"

# Should we look for .bam reads (instead of .gam)?
USE_BAM_READS=0

# Should we look for .fq.gz reads (instead of .gam)?
USE_FQ_READS=0

# Should we add --ignore_quals
IGNORE_QUALS=""

# Should we restart?
RESTART_ARG=""

usage() {
    # Print the usage/help text to stderr and exit with status 1.
    # Constant printf format strings with %s placeholders are used so that
    # '%' or backslashes in $0 or the defaults cannot be misinterpreted
    # as format directives (ShellCheck SC2059).
    exec 1>&2
    printf 'Usage: %s [Options] OUTPUT_PATH KEYPAIR_NAME REGION_NAME GRAPH [GRAPH [GRAPH ...]] \n' "$0"
    printf 'Options:\n\n'
    printf '\t-p PACKAGE\tUse the given Python package specifier to install toil-vg.\n'
    printf '\t-t CONTAINER\tUse the given Toil container in the cluster (default: %s).\n' "${TOIL_APPLIANCE_SELF}"
    printf '\t-c CLUSTER\tUse the given existing Toil cluster.\n'
    printf '\t-v DOCKER\tUse the given Docker image specifier for vg.\n'
    printf '\t-r READS\tUse the given read set stem (default: %s).\n' "${READ_STEM}"
    printf '\t-b BAM-READS\tUse BAM input reads (in READ_STEM).\n'
    printf '\t-f FASTQ-READS\tUse .fq.gz input reads (in READ_STEM). Much faster than GAM/BAM.\n'
    printf '\t-R RUN_ID\tUse or restart the given run ID.\n'
    printf '\t-k \tKeep the out store and job store in case of error.\n'
    printf '\t-s \tRestart a failed run.\n'
    printf '\t-3 \tUse S3 instead of HTTP (much faster)\n'
    printf '\t-i \tIgnore base qualities (needed if not running on trained sim or bam data)\n'
    exit 1
}

# Parse command-line flags; see usage() for what each option means.
while getopts "hp:t:c:v:r:bfR:ks3i" flag; do
    case "${flag}" in
        p) TOIL_VG_PACKAGE="${OPTARG}" ;;
        t) TOIL_APPLIANCE_SELF="${OPTARG}" ;;
        c)
            # Reuse an existing cluster and don't tear it down afterwards.
            CLUSTER_NAME="${OPTARG}"
            MANAGE_CLUSTER=0
            ;;
        v)
            # Flat string on purpose: it is expanded unquoted later so it
            # splits into the two words "--vg_docker" and the image name.
            VG_DOCKER_OPTS="--vg_docker ${OPTARG}"
            ;;
        r) READ_STEM="${OPTARG}" ;;
        b) USE_BAM_READS=1 ;;
        f) USE_FQ_READS=1 ;;
        R)
            # This doesn't change the cluster name, which will still be the
            # old run ID if not manually set. That's probably fine.
            RUN_ID="${OPTARG}"
            ;;
        k) REMOVE_JOBSTORE=0 ;;
        s) RESTART_ARG="--restart" ;;
        3) USE_S3=1 ;;
        i)
            # NOTE(review): most toil-vg options in this script use
            # underscores (e.g. --vg_docker); confirm --ignore-quals is the
            # spelling toil-vg mapeval actually accepts.
            IGNORE_QUALS="--ignore-quals"
            ;;
        *) usage ;;
    esac
done

# Discard the parsed options; everything left over is positional.
shift $((OPTIND-1))

# Need at least OUTPUT_PATH, KEYPAIR_NAME, REGION_NAME, and one graph.
if [[ "$#" -lt "4" ]]; then
    usage
fi

# The first three positionals are fixed...
OUTPUT_PATH="${1}"
KEYPAIR_NAME="${2}"
REGION_NAME="${3}"
shift 3

# ...and every remaining argument names a graph to compare.
GRAPH_NAMES=("$@")

# Pick the input bucket/prefix that holds data for this region.
case "${REGION_NAME}" in
    B37|HS37D5)
        STORE_TAG="${REGION_NAME}"
        ;;
    *)
        STORE_TAG="bakeoff"
        ;;
esac

# Address the input store over S3 (fast) or plain HTTPS.
if [[ "${USE_S3}" -eq "1" ]]; then
    INPUT_STORE="s3://vg-data/${STORE_TAG}"
else
    INPUT_STORE="https://vg-data.s3.amazonaws.com/${STORE_TAG}"
fi

# Where do we save our results from the various jobs responsible for writing them?
OUTPUT_STORE="aws:us-west-2:vg-data/comparison-script/runs/${RUN_ID}"
OUTPUT_STORE_URL="s3://vg-data/comparison-script/runs/${RUN_ID}"

# Where do we store our jobs?
JOB_TREE="aws:us-west-2:${RUN_ID}"

# Put this in front of commands to do or not do them
PREFIX=""

echo "Running run ${RUN_ID} as ${KEYPAIR_NAME} to compare ${GRAPH_NAMES[*]} on ${REGION_NAME} into ${OUTPUT_PATH}"

function get_input_url() {
    # Print the full download URL for the given file name in the input store.
    local base_filename="${1}"
    printf '%s/%s\n' "${INPUT_STORE}" "${base_filename}"
}

function get_graph_url() {
    # Print the base URL for a graph: the input URL of "<graph>-<region>".
    local base_graphname="${1}"
    get_input_url "${base_graphname}-${REGION_NAME}"
}

# Make sure we don't leave the cluster running or data laying around on exit.
# Reads globals: REMOVE_JOBSTORE, REMOVE_OUTSTORE, MANAGE_CLUSTER, JOB_TREE,
# OUTPUT_STORE_URL, CLUSTER_NAME, and PREFIX (dry-run prefix, empty by default).
function clean_up() {
    # Best effort: keep tearing things down even if an individual step fails.
    set +e
    if [[ "${REMOVE_JOBSTORE}" == "1" ]]; then
        # Delete the Toil intermediates we could have used to restart the job
        $PREFIX toil clean "${JOB_TREE}"
        
        if [[ "${REMOVE_OUTSTORE}" == "1" ]]; then
            # Toil is happily done and we downloaded things OK
            # Delete the outputs
            $PREFIX aws s3 rm --recursive "${OUTPUT_STORE_URL}"
        fi
    fi
    if [[ "${MANAGE_CLUSTER}" == "1" ]]; then
        # Destroy the cluster (only when this script created it; -c skips this)
        $PREFIX toil destroy-cluster "${CLUSTER_NAME}" -z us-west-2a
    fi
}
# Run the cleanup on any exit, successful or not.
trap clean_up EXIT

# Convert just graph stems to full base urls.
# Use $( ) instead of backticks and quote the command substitution so a URL
# can never be word-split or glob-expanded on its way into the array.
GRAPH_URLS=()
for GRAPH_STEM in "${GRAPH_NAMES[@]}"; do
    GRAPH_URLS+=("$(get_graph_url "${GRAPH_STEM}")")
done

if [[ "${MANAGE_CLUSTER}" == "1" ]]; then
    # We own the cluster, so bring it up with the requested Toil appliance.
    TOIL_APPLIANCE_SELF="${TOIL_APPLIANCE_SELF}" $PREFIX toil launch-cluster "${CLUSTER_NAME}" --leaderNodeType=t2.medium -z us-west-2a "--keyPairName=${KEYPAIR_NAME}"
fi

# Run a command on the cluster leader over ssh.
# Extra toil ssh-cluster flags (e.g. --logOff) may precede the cluster name.
function leader_ssh() {
    $PREFIX toil ssh-cluster --insecure --zone=us-west-2a "$@"
}

# We need to manually install git to make pip + git work...
leader_ssh "${CLUSTER_NAME}" apt update
leader_ssh "${CLUSTER_NAME}" apt install git -y

# Ignore the old virtualenv if re-using a cluster

# For hot deployment to work, toil-vg needs to be in a virtualenv that can see the system Toil
leader_ssh "${CLUSTER_NAME}" virtualenv --system-site-packages venv

# Install toil-vg and its Python dependencies into the virtualenv, one pip
# invocation each (same as before, just without the copy-pasted ssh line).
for PIP_PKG in pyyaml aws numpy scipy scikit-learn "${TOIL_VG_PACKAGE}"; do
    leader_ssh "${CLUSTER_NAME}" venv/bin/pip install "${PIP_PKG}"
done

# We need the master's IP to make Mesos go
MASTER_IP="$(leader_ssh --logOff "${CLUSTER_NAME}" hostname -i)"

# Strip some garbage from MASTER_IP
MASTER_IP="${MASTER_IP//[$'\t\r\n ']}"

# Make sure we download the outstore whether we break now or not
set +e

# What truth/read set should we use?
READ_SET="${READ_STEM}-${REGION_NAME}"

# Build the reads/truth arguments for toil-vg mapeval.
# INPUT_OPTS is deliberately a flat string: it is expanded unquoted at the
# call site so it word-splits into separate arguments. The file-name
# arguments to get_input_url are quoted so a read stem containing IFS or
# glob characters cannot mangle the URL.
if [[ "${USE_BAM_READS}" -eq "1" ]]; then
    INPUT_OPTS="--bam_input_reads $(get_input_url "${READ_SET}.bam")"
elif [[ "${USE_FQ_READS}" -eq "1" ]]; then
    INPUT_OPTS="--truth $(get_input_url "${READ_SET}.pos") --fastq $(get_input_url "${READ_SET}.fq.gz")"
else
    INPUT_OPTS="--truth $(get_input_url "${READ_SET}.pos") --gam_input_reads $(get_input_url "${READ_SET}.gam")"
fi

# Kick off the actual toil-vg mapeval run on the cluster leader.
# NOTE: ${RESTART_ARG}, ${VG_DOCKER_OPTS}, ${INPUT_OPTS}, and ${IGNORE_QUALS}
# are intentionally left unquoted so that empty values disappear entirely and
# multi-word values split into separate arguments.
$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/toil-vg mapeval \
    --whole_genome_config \
    ${RESTART_ARG} \
    ${VG_DOCKER_OPTS} \
    ${INPUT_OPTS} \
    --fasta `get_input_url "${REGION_NAME}.fa"` \
    --index-bases "${GRAPH_URLS[@]}" \
    --gam-names "${GRAPH_NAMES[@]}" \
    --bwa \
    --multipath ${IGNORE_QUALS} \
    --mapeval-threshold 200 \
    --realTimeLogging --logInfo \
    "${JOB_TREE}" \
    "${OUTPUT_STORE}" \
    --batchSystem mesos --provisioner=aws "--mesosMaster=${MASTER_IP}:5050" \
    --nodeTypes=r3.8xlarge:0.70 --defaultPreemptable --maxNodes=${MAX_NODES} --retryCount=3
# Capture Toil's exit status immediately; it decides jobstore cleanup below.
TOIL_ERROR="$?"
    
# Make sure the output is public.
# NOTE(review): this syncs the out store onto itself to apply the ACL;
# "aws s3 sync" skips files it considers unchanged, so the ACL may not be
# re-applied everywhere — confirm, or consider "aws s3 cp --recursive".
$PREFIX toil ssh-cluster --insecure --zone=us-west-2a "${CLUSTER_NAME}" venv/bin/aws s3 sync --acl public-read "${OUTPUT_STORE_URL}" "${OUTPUT_STORE_URL}"

# Download the results locally.
mkdir -p "${OUTPUT_PATH}"
aws s3 sync "${OUTPUT_STORE_URL}" "${OUTPUT_PATH}"
DOWNLOAD_ERROR="$?"

if [[ "${TOIL_ERROR}" == "0" ]]; then
    # Toil completed successfully.
    # We will delete the job store (even if -k asked to keep it on error).
    REMOVE_JOBSTORE=1
fi

if [[ ! "${DOWNLOAD_ERROR}" == "0" ]]; then
    # Download failed
    # We will keep the out store
    # (we also keep it if Toil fails and we're keeping the jobstore)
    REMOVE_OUTSTORE=0
fi

# Cluster, tree, and output will get cleaned up by the exit trap