1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
|
#!/bin/bash
usage="\
Usage:
$(basename "$0") -P stacks_dir -B bam_file [-q min_MAPQ=20] -O out_dir
Extracts the coordinates of the RAD loci from the given BAM file into a
'locus_coordinates.tsv' table, then rewrites the 'catalog.fa.gz' and
'catalog.calls' files so that they include the genomic coordinates given in the
input BAM file.
"
version="_VERSION_"
# Check dependencies
# ==========
command -v python3 &>/dev/null || { python3; echo "Error: Python 3 is required." >&2; exit 1; }
command -v samtools &>/dev/null || { samtools; echo "Error: Samtools is required." >&2; exit 1; }
# Parse arguments
# ==========
# If there aren't any arguments, print the help.
[[ $# -gt 0 ]] || { echo -n "$usage"; exit 1; }
# Check that there aren't weird characters in the input.
for arg in "$@" ;do
if [[ "$arg" =~ [^[:alnum:]_.,/~+=:-] ]] ;then
echo "Error: illegal character in argument '$arg'." >&2
exit 1
fi
done
# Parse arguments.
min_mapq=20
while getopts 'P:B:O:q:hv' opt ;do
case "$opt" in
P) stacks_d="${OPTARG/%\//}" ;;
B) bam_f="${OPTARG/%\//}" ;;
O) out_d="${OPTARG/%\//}" ;;
q) min_mapq="$OPTARG" ;;
h) echo -n "$usage" >&2; exit 1 ;;
v) echo "$version" >&2; exit 1 ;;
*) echo "Error: Bad arguments (-h for help)." >&2; exit 1 ;;
esac
done
shift $((OPTIND-1))
if [[ $# -ne 0 ]] ;then
echo "Error: '$1' is seen as a positional argument (expected no positional arguments)." >&2
exit 1
fi
# Check them.
[[ "$stacks_d" ]] || { echo "Error: -P is required." >&2; exit 1; }
[[ "$bam_f" ]] || { echo "Error: -B is required." >&2; exit 1; }
[[ "$out_d" ]] || { echo "Error: -O is required." >&2; exit 1; }
[[ "$min_mapq" =~ ^[0-9]+$ ]] || { echo "Error: -q expects a positive integer (received '$min_mapq')." >&2; exit 1; }
[[ -e "$stacks_d" ]] || { ls -- "$stacks_d"; exit 1; }
[[ -e "$bam_f" ]] || { ls -- "$bam_f"; exit 1; }
fa="$stacks_d/catalog.fa.gz"
vcf="$stacks_d/catalog.calls"
[[ -e "$fa" ]] || { ls -- "$fa"; exit 1; }
[[ -e "$vcf" ]] || { ls -- "$vcf"; exit 1; }
o_coords="$out_d/locus_coordinates.tsv"
o_fa="$out_d/catalog.fa.gz"
o_vcf="$out_d/catalog.calls"
[[ ! -e "$o_coords" ]] || { echo "Error: Refusing to overwrite '$o_coords'." >&2; exit 1; }
[[ ! -e "$o_fa" ]] || { echo "Error: Refusing to overwrite '$o_fa'." >&2; exit 1; }
[[ ! -e "$o_vcf" ]] || { echo "Error: Refusing to overwrite '$o_vcf'." >&2; exit 1; }
# If something goes wrong, stop & print an error.
set -e -o pipefail
trap "echo 'Error: aborted.' >&2" EXIT
# Make sure we have write permissions.
mkdir -p "$out_d"
touch "$o_fa" "$o_vcf"
# Retrieve the highest ID in the catalog.
# ==========
echo "Finding the highest current locus ID..."
id=$(gzip -cd "$fa" | tail -n2 | head -n1 | awk '{print $1}')
if [[ ! "$id" =~ ^\>[0-9]+$ ]] ;then
echo "Error: Unexpected format in '$fa'; the second to last line was:" >&2
gzip -cd "$fa" | tail -n2 | head -n1
exit 1
fi
id=$(echo "$id" | cut -c2-)
echo "$id"
echo
# Write the old_id : new_id : pos file.
# ==========
echo "Extracting locus coordinates..."
samtools view -F 0x904 -q "$min_mapq" "$bam_f" |
# Convert SAM to `LOCUS_ID \t CHR \t POS \t STRAND`.
awk -F '\t' -v OFS='\t' '{
if (int( $2 % (2*16) / 16 )) {
# Negative strand.
pos = $4 - 1
gsub(/[0-9]+[ISH]/, "", $6)
gsub(/[A-Z]/, ",", $6)
n = split(substr($6, 1, length($6)-1), cig, ",")
for (i=1; i<=n; ++i) { pos += cig[i] }
print $1, $3, pos, "-"
} else {
print $1, $3, $4, "+"
}
}' |
# Sort by chromosome.
sort -k2,2 |
# Pull the reference chromosome order from the BAM header.
join -t $'\t' -12 -22 -o "1.1 1.2 2.1 1.3 1.4" \
- \
<(samtools view -H "$bam_f" |
grep '^@SQ' | grep -oE $'SN:[^\t]+' | cut -c4- |
awk '{printf NR"\t"$1"\n"}' |
sort -k2,2
) |
# Sort by chromosome index, bp & strand, then remove the index.
sort -k3,3n -k4,4n -k5,5r | cut -f 1,2,4,5 |
# Create the new IDs (we start at the power of 10 above the current max).
awk 'BEGIN{id = '"$id"' + 1} {printf id++ "\t" $1 "\t" $2 ":" $3 ":" $4 "\n"}' |
# Add a header.
{ echo -e 'id_new\tid_old\taln_pos'; cat; } \
> "$o_coords"
[[ $(wc -l < "$o_coords") -gt 1 ]] || { echo "Error: Extraction of coordinates failed; check BAM file." >&2; exit 1; }
echo "Wrote '$o_coords'."
echo
# Write the new FASTA file.
# ==========
echo "Rewriting locus sequences and information..."
gzip -cd "$fa" |
tr -d '\r' | paste -d '\r' - - | sed 's/^>//' |
awk -F' ' -v OFS=' ' '
BEGIN{
while(getline < "'"$o_coords"'") {
if (! /^id_new/) {
old2new[$2] = $1
old2pos[$2] = $3
}
}
} {
old_id = $1
if (old_id in old2new) {
$1 = old2new[old_id] " pos=" old2pos[old_id]
print
}
}' |
sort -k1,1n |
sed 's/^/>/' | tr '\r' '\n' |
gzip \
> "$o_fa"
echo "Wrote '$o_fa'."
echo
# Write the new VCF file.
# ==========
echo "Rewriting variants..."
{
# Copy the header.
{ gzip -cd "$vcf" 2>/dev/null || true; } | sed '/^#CHROM/ q' # (gzip unhappy about the closed pipe).
# Update & resort the records.
gzip -cd "$vcf" | grep -v '^#' |
awk -F'\t' -v OFS='\t' '
BEGIN{
while(getline < "'"$o_coords"'") {
if (! /^id_new/) {
old2new[$2] = $1
}
}
} {
old_id = $1
if (old_id in old2new) {
$1 = old2new[old_id]
print
}
}' |
sort -k1,1n -k2,2n
} | gzip \
> "$o_vcf"
echo "Wrote '$o_vcf'."
echo
trap - EXIT
echo "$(basename "$0") is done."
|