File: genome.py

package info (click to toggle)
idseq-bench 0.0~git20210602.27fb6dc-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 196 kB
  • sloc: python: 849; sh: 39; makefile: 3
file content (72 lines) | stat: -rw-r--r-- 3,112 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
from .util import remove_safely, check_call


class Genome:

    all = dict()
    by_accid = dict()
    downloads_dir = "downloads"

    def __init__(self, category, organism, lineage, versioned_accession_ids, genome_assembly_url=None):
        self.category = category
        self.organism = organism
        self.lineage = lineage
        assert ["subspecies", "species", "genus", "family"] == [l['level'] for l in lineage]
        self.subspecies_taxid, self.species_taxid, self.genus_taxid, self.family_taxid = [(l['tax_id'] or 0) for l in lineage]
        self.taxid = self.subspecies_taxid or self.species_taxid
        self.genome_assembly_URL = genome_assembly_url
        self.versioned_accession_ids = [Genome.ensure_versioned(vaccid) for vaccid in versioned_accession_ids]
        self.key = f"{category}__{organism}__{self.taxid}"
        self.filename = f"{Genome.downloads_dir}/{self.key}.fasta"
        # The size (number of bases) is filled in by fetch_all()
        self.size = None
        Genome.all[self.key] = self
        for vaccid in self.versioned_accession_ids:
            assert vaccid not in Genome.by_accid, f"Accession ID {vaccid} should not occur in multiple genomes."
            Genome.by_accid[vaccid] = self

    @staticmethod
    def ensure_versioned(vaccid):
        if "." not in vaccid:
            vaccid += ".1"
            print(f"INFO:  Changed {vaccid[:-2]} to {vaccid} in order to match ISS-generated read headers.")
        return vaccid

    @staticmethod
    def fetch_versioned_accession_id(vaccid):  # e.g., "NC_004325.2"
        output_file = f"{Genome.downloads_dir}/{vaccid}.fa"
        os.makedirs(Genome.downloads_dir, exist_ok=True)
        if os.path.isfile(output_file):
            print(f"{output_file} already exists, nice")
        else:
            try:
                command = f"cd {Genome.downloads_dir}; ncbi-acc-download --format fasta {vaccid} -e all"
                check_call(command)
            except:
                remove_safely(output_file)
                raise
        return output_file

    @staticmethod
    def fetch_all():
        for g in Genome.all.values():
            subdir = g.filename.rsplit("/", 1)[0]
            os.makedirs(subdir, exist_ok=True)
            remove_safely(g.filename)
            accession_fas = []
            for f in (g.versioned_accession_ids):
                af = Genome.fetch_versioned_accession_id(f)
                accession_fas.append(af)
            accession_fastas = " ".join(accession_fas)
            command = f"cat {accession_fastas} > {g.filename}"
            check_call(command)
            assert os.path.isfile(g.filename), f"Failed to download genome {g.filename}"
            remove_safely(f"{g.key}.size")
            command = f"grep -v '^>' {g.filename} | tr -d '\n' | wc > {g.key}.size"
            check_call(command)
            with open(f"{g.key}.size") as f:
                line = f.readline().rstrip()
                g.size = int(line.split()[2])
            remove_safely(f"{g.key}.size")
            print(f"Genome {g.key} size {g.size} bases.")