File: read_info_histograms.sh

package info (click to toggle)
filtlong 0.2.1-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,672 kB
  • sloc: cpp: 3,458; python: 996; sh: 119; makefile: 38
file content (48 lines) | stat: -rwxr-xr-x 1,543 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env bash

long_reads=$1
illumina_1=$2
illumina_2=$3

# Get the directory of this script.
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Get paths to other necessary files.
hist="$script_dir"/histogram.py
filtlong="$script_dir"/../bin/filtlong

# Prepare a tab-delimited file of read info.
if [ -n "$illumina_1" ] && [ -n "$illumina_2" ]; then
    illumina_args="-1 "$illumina_1" -2 "$illumina_2
else
    illumina_args=""
fi
$filtlong $illumina_args --min_mean_q 1 --verbose $long_reads 2>&1 >/dev/null | grep -P "length = \d+"  | sed 's/\s\+length = //' | sed 's/\s\+\w\+ quality = /\t/g' > temp_filtlong_read_info

printf "\n"
echo "READ SET SUMMARY"
echo "----------------"
printf "number of reads: %'d\n" $(cat temp_filtlong_read_info | wc -l)
total_bases=$(cut -f1 temp_filtlong_read_info | paste -sd+ - | bc)
printf "number of bases: %'d\n" $total_bases
target=$(($total_bases / 2))
n50=$(zless $long_reads | paste - - - - | cut -f2 | awk '{print length($0);}' | sort -nr | awk -v tar="$target" '{sum += $0; if (sum > tar) {print($0); exit;}}')
printf "N50 read length: %'d\n" $n50

printf "\n\n"
echo "READ LENGTHS"
echo "------------"
cat temp_filtlong_read_info | awk '{print $1, $1}' | $hist -a -m 0 -b 40

printf "\n\n"
echo "MEAN QUALITIES"
echo "--------------"
cat temp_filtlong_read_info | awk '{print $1, $2}' | $hist -a -b 40

printf "\n\n"
echo "WINDOW QUALITIES"
echo "----------------"
cat temp_filtlong_read_info | awk '{print $1, $3}' | $hist -a -b 40

printf "\n"
rm temp_filtlong_read_info