File: from_phased_alleles_to_clusters.sh

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (80 lines) | stat: -rwxr-xr-x 2,896 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# FROM A FILE GENERATED USING OPTION -phasing FROM KISSREADS, GENERATE CLUSTERS OF VARIANTS WHO TRANSITIVELY CO-OCCUR IN READS

echo "This is \"from_phased_alleles_to_clusters.sh\". Generates clusters of variants who transitively co-occur in reads from a file generated using option -phasing from kissreads"
echo "Usage: from_phased_alleles_to_clusters.sh phased_alleles_file [minimal edge coverage support (default 0)]"

file=$1

if [ ! -f $file ]; then
    echo "In \"from_phased_alleles_to_clusters.sh\": file \"$file\" does not exist"
   exit 0
fi
filename=$(basename -- "$file")
path=$(dirname "${file}")

# FIND THE PATH CONTAINING THE SCRIPT: 
EDIR=$( python3 -c "import os.path; print(os.path.dirname(os.path.realpath(\"${BASH_SOURCE[0]}\")))" ) 
echo $EDIR

edge_coverage_threshold=0
if ! test -z "$2" 
then
       edge_coverage_threshold=$2
fi

# 1000547h;-2286435h; -1330792h;1152525l; => 1
# FORMAT PHASED ALLELE IDS INTO SIMPLER FORMAT FOR CONNECTED COMPONENT DETECTION
#cmd="cat ${file} | tr -d \"-\" | sed '1d' | cut -d \"=\" -f 1 | python3 ${EDIR}/from_path_to_edges.py | cut -f 1,2 | tr -d \"l\" | tr -d \"h\" | sort -u "
# 1/ suppress the '-' sign occurrences 
# 2/ suppres the first line 
# 3/ remove what exists after => (included) 
# 4/ transform n-uplets into couples: 
#   a;b;c; d;e
#  becomes (r means mapped by a unique gene and p means mapped by a pair of reads). Each line is ordered (smallest id first)
#   a b r
#   b c r
#   c d p
#   d e r
# 5/ remove r/p info
# 6/ remove h/l info
# 7/ remove duplicates

### Same command keeping only edges seen at leas edge_coverage_threshold times

cmd="cat ${file} | tr -d \"-\" | sed '1d' | cut -d \"=\" -f 1 | python3 ${EDIR}/from_path_to_edges.py | cut -f 1,2 | tr -d \"l\" | tr -d \"h\" | sort | uniq -c | awk '\$1>=${edge_coverage_threshold} {print \$2\" \"\$3}'"
###
echo $cmd "> edge_${filename}"
eval $cmd "> edge_${filename}"

if [ $? -ne 0 ]
then
    echo "In \"from_phased_alleles_to_clusters.sh\": there was a problem with file $file"
    exit 1
fi

# COMPUTE CONNECTED COMPONENTS
cmd="${EDIR}/../build/bin/quick_hierarchical_clustering edge_${filename}"
echo $cmd "> connected_components_${filename}"
eval $cmd > connected_components_${filename}
if [ $? -ne 0 ]
then
    echo "In \"from_phased_alleles_to_clusters.sh\": there was a problem while computing connected components of edge_${filename}"
    exit 1
fi

# REMOVE INTERMEDIATE FILE
# cmd="rm -f edge_${filename}"
# echo $cmd
# eval $cmd

# REPLACE THE CREATED FILE IN TIS ORIGINAL DIRECTORY
cmd="mv connected_components_${filename} $path/connected_components_${filename}"
echo $cmd
eval $cmd
if [ $? -ne 0 ]
then
    echo "In \"from_phased_alleles_to_clusters.sh\": there was a problem while moving connected components file "
    exit 1
fi

echo "Connected components (clusters of variants) from file $file are in $path/connected_components_${filename}"