File: dnaclust-ref

package info (click to toggle)
dnaclust 3-7
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 720 kB
  • sloc: cpp: 3,630; sh: 516; makefile: 64
file content (98 lines) | stat: -rwxr-xr-x 2,976 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/bin/bash
# Default option values. The -r and -t command-line options overwrite them;
# they are later handed to dnaclust as its -s and -t arguments.
similarity=0.98
threads=1

# Print the usage text on STDOUT, hard-wrapped at 120 columns by fold(1).
# Takes no arguments. The heredoc delimiter is quoted so the text is emitted
# literally (no variable or command expansion).
print_help()
{
    fold -w 120 <<'EOF'
Usage: dnaclust-ref [OPTIONS...]
DNACLUST helper script to cluster sequences using a reference database.

  -c CENTERS         Fasta file of cluster centers/references.
  -d                 After clustering with reference database, perform de novo clustering.
  -r SIMILARITY      Set similarity between cluster center and cluster
                     sequences (default=0.98)
  -t THREADS         Set the number of threads to use (default=1)
  -i INPUT_FILE      Fasta file of sequences to be clustered.
  -l                 Pass --left-gaps-allowed to dnaclust
  -n                 Pass --no-overlap to dnaclust
  -v                 Print verbose messages to standard error
  -h                 Give this help list

The sequences to be clustered are read from INPUT_FILE. The clusters recruited by the reference database are written to INPUT_FILE.db.clusters; with -d, the de novo clusters of the remaining sequences are written to INPUT_FILE.denovo.clusters. Messages are written to STDERR.
EOF
}

# Parse the command-line options.
# Value options (-c, -i, -r, -t) store their argument; the switches
# (-d, -v, -l, -n) store 0, which later code treats simply as "non-empty".
# -h prints the help and exits 0; an unrecognised option prints it and exits 1.
while getopts "c:i:dr:t:vhln" option; do
  case "$option" in
    c)  cluster_centers="$OPTARG" ;;
    i)  input="$OPTARG" ;;
    r)  similarity="$OPTARG" ;;
    t)  threads="$OPTARG" ;;
    d)  de_novo_cluster=0 ;;
    v)  verbose=0 ;;
    l)  left_gaps_allowed=0 ;;
    n)  no_overlap=0 ;;
    h)
      print_help
      exit 0
      ;;
    \?)
      print_help
      exit 1
      ;;
  esac
done

# Write a timestamped (HH:MM:SS) diagnostic line to STDERR.
# Arguments: $1 - the message text.
# Does nothing unless $verbose is non-empty (the -v option stores 0 in it).
print_message()
{
    [[ -n "$verbose" ]] || return 0
    printf '%s %s\n' "$(date +%T)" "$1" >&2
}

# ---------------------------------------------------------------------------
# Main body.
#   1. Sort the reference centers (-c) with fastasort into a temp directory.
#   2. Run dnaclust on the input (-i) against the sorted references and keep
#      only output lines with more than one field in INPUT.db.clusters.
#   3. With -d, run dnaclust again on the sequences that were not recruited
#      and write the result to INPUT.denovo.clusters.
# Exit status: 0 on success, 1 as soon as any step fails.
# ---------------------------------------------------------------------------

# Report a fatal error on STDERR and abort with a non-zero status.
fail()
{
  echo "dnaclust-ref: $1" >&2
  exit 1
}

# Let a pipeline fail when any stage fails, so that a dnaclust error is not
# masked by the awk filter that follows it.
set -o pipefail

# Without -i the dnaclust command line below would be mis-assembled
# (the next option would be consumed as the input file name).
if [[ -z "$input" ]]; then
  fail "option -i INPUT_FILE is required (see -h)."
fi

# Optional switches forwarded to dnaclust; kept in an array so that each
# element stays a separate word without relying on word splitting.
parameters=()
if [[ -n "$left_gaps_allowed" ]]; then
  parameters+=(--left-gaps-allowed)
fi
if [[ -n "$no_overlap" ]]; then
  parameters+=(--no-overlap)
fi

# dnaclust itself is looked up next to this script; the fasta helpers are
# taken from a fixed directory.
dnaclust_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
print_message "$dnaclust_path"
helper_dir="/usr/lib/dnaclust"

# mktemp takes different options on Darwin than elsewhere.
if [[ "$(uname)" == "Darwin" ]]; then
  tempdir="$(mktemp -d -t .)"
else
  tempdir="$(mktemp -d -p .)"
fi
if [[ -z "$tempdir" || ! -d "$tempdir" ]]; then
  fail "could not create a temporary directory."
fi
# Single quotes: expand $tempdir when the trap fires, and quote it properly.
trap 'rm -rf -- "$tempdir"' EXIT

# Sort the reference database into the temp directory.
db_sorted="$(basename -- "${cluster_centers:-centers}" .fasta).sorted.fasta"
print_message "Reading and sorting the database sequences: $tempdir/${db_sorted}"
if [[ -n "$cluster_centers" ]]; then
  "$helper_dir/fastasort" < "$cluster_centers" > "$tempdir/$db_sorted" \
    || fail "sorting the cluster centers failed."
else
  # No -c given: as before, the centers are read from STDIN.
  "$helper_dir/fastasort" > "$tempdir/$db_sorted" \
    || fail "sorting the cluster centers failed."
fi

print_message "Recruiting from the sequences, using database."
# awk 'NF > 1' keeps only lines that have more than one field.
"$dnaclust_path/dnaclust" "${parameters[@]}" -s "$similarity" -t "$threads" \
  --no-k-mer-filter -i "$input" -p "$tempdir/$db_sorted" -r \
  | awk 'NF > 1' > "$input.db.clusters" \
  || fail "recruiting sequences against the database failed."
print_message "DB recruited sequences: $input.db.clusters"

if [[ -n "$de_novo_cluster" ]]; then
  print_message "Run DNACLUST on remaining sequences."
  unclustered_seq="$(basename -- "$input" .fasta).unclustered.fasta"

  # One identifier per line: every field of every recruited cluster.
  awk '{ for (i = 1; i <= NF; i++) print $i }' "$input.db.clusters" \
    > "$tempdir/clustered_seqs" \
    || fail "could not list the recruited sequences."

  # Select from the input everything except the identifiers listed above.
  "$helper_dir/fastaselect" --everything-except -f "$input" \
    < "$tempdir/clustered_seqs" > "$tempdir/$unclustered_seq" \
    || fail "could not extract the unclustered sequences."

  if [[ -s "$tempdir/$unclustered_seq" ]]; then
    "$dnaclust_path/dnaclust" "${parameters[@]}" -s "$similarity" -t "$threads" \
      --no-k-mer-filter -i "$tempdir/$unclustered_seq" > "$input.denovo.clusters" \
      || fail "de novo clustering failed."
    print_message "Writing de novo clusters to: $input.denovo.clusters"
  else
    # Nothing left to cluster: still create the (empty) output file.
    touch -- "$input.denovo.clusters"
  fi
fi

# Every step succeeded.
exit 0