File: gattaca.snake

package info (click to toggle)
spades 3.13.1+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (46 lines) | stat: -rw-r--r-- 2,173 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from scripts.common import contig_length
from scripts.converters import ProfileFormatter, BinningParser

class GattacaFormatter(ProfileFormatter):
    """Profile formatter that emits the tab-separated table fed to GATTACA.

    Rows are written to ``self.out``; this class never assigns that attribute
    itself (presumably the ProfileFormatter base class sets it up -- confirm).
    """

    def header(self, first_line):
        """Write the header row.

        The row consists of "contig", "length" and one "cov_mean_sample<N>"
        column (N counted from 1) for every element of ``first_line``.
        """
        columns = ["contig", "length"]
        for sample_no in range(1, len(first_line) + 1):
            columns.append("cov_mean_sample" + str(sample_no))
        print(*columns, sep="\t", file=self.out)

    def format(self, contig, profile):
        """Write one row: contig name, its length, then every value of ``profile``."""
        row = [contig, contig_length(contig)]
        row.extend(profile)
        print(*row, sep="\t", file=self.out)

# Convert the common profile table into GATTACA's input format.
rule gattaca_pre:
    input:
        "binning/profiles.tsv"
    output:
        "binning/gattaca/profiles.in"
    message:
        "Preparing GATTACA input"
    run:
        # GattacaFormatter reads the first path and writes the second one.
        formatter = GattacaFormatter()
        formatter.run(input[0], output[0])

# Keep only the contigs that are listed in the GATTACA profile table.
#
# The contig names are taken from the first column of the profile (its header
# row is skipped) and stored in a scratch file that is handed to
# contig_name_filter.py together with the input and output FASTA paths.
rule filter_contigs:
    input:   contigs="assembly/{frags}/all.fasta", profile="binning/gattaca/profiles.in"
    output:  contigs="assembly/{frags}/all_filtered.fasta"
    message: "Filter {wildcards.frags} contigs that have profile information"
    # The scratch file name includes the wildcard so that jobs for different
    # {frags} values running concurrently cannot overwrite each other's list.
    # "tail -n +2 | cut -f1" produces the same list as "cut -f1" followed by
    # "sed -i '1d'", without depending on GNU-only in-place sed.
    shell:   "mkdir -p tmp && "
             "tail -n +2 {input.profile} | cut -f1 > tmp/names_{wildcards.frags}.txt && "
             "{SCRIPTS}/contig_name_filter.py {input.contigs} tmp/names_{wildcards.frags}.txt {output.contigs}"

# Binning with GATTACA
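# NOTE: the rule below activates a conda environment named "py37" and runs
# python3, while the setup commands recorded here create a Python 2.7
# environment named "py27". Adapt the environment name, the Python version and
# the pysam pin accordingly before using them:
# conda create -n py27 python=2.7.9 numpy scipy scikit-learn anaconda
# conda install -c bioconda pysam=0.11.2.2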
# Cluster the filtered contigs with GATTACA inside the "py37" conda environment.
rule gattaca:
    input:   contigs="assembly/{}/all_filtered.fasta".format(FRAGS), profiles="binning/gattaca/profiles.in"
    output:  "binning/gattaca/binning.out"
    threads: THREADS
    log:     "binning/gattaca.log"
    message: "Running GATTACA clustering"
    # "set +u" / "set -u" bracket the activation so that unset variables inside
    # the activate script do not abort the job.
    # The final command removes the ".fai" file next to the contigs input
    # (presumably a FASTA index left behind by the run -- confirm). Its path is
    # derived from {input.contigs} so it follows FRAGS instead of assuming the
    # "splits" directory.
    shell:   "set +u; source activate py37; set -u\n"
             "python3 {SOFT}/gattaca/src/python/gattaca.py cluster --contigs {input.contigs}"
             " --coverage {input.profiles} --algorithm dirichlet --clusters {output} >{log} 2>&1\n"
             "rm -f {input.contigs}.fai"

# Turn GATTACA's raw output into the common binning table and split the contigs.
rule gattaca_post:
    input:
        contigs="assembly/{}/all.fasta".format(FRAGS),
        binning="binning/gattaca/binning.out"
    output:
        "binning/binning.tsv"  # binning/bins is also passed to split_bins.py but is not a declared output
    message:
        "Postprocessing GATTACA results"
    run:
        # Step 1: convert the GATTACA clustering output into binning.tsv.
        BinningParser().run(input.binning, output[0])
        # Step 2: hand the contigs, the binning table and the bins directory to split_bins.py.
        shell("{SCRIPTS}/split_bins.py {input.contigs} {output} binning/bins")