File: bcbio_nextgen_install.py

package info (click to toggle)
bcbio 1.2.9-4
  • links: PTS, VCS
  • area: contrib
  • in suites: sid
  • size: 36,792 kB
  • sloc: python: 45,766; sh: 209; makefile: 159; xml: 129
file content (304 lines) | stat: -rwxr-xr-x 14,026 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#!/usr/bin/python3
"""Automatically install required tools and data to run bcbio-nextgen pipelines.

This automates the steps required for installation and setup to make it easier to get started with
bcbio-nextgen. The defaults provide data files for human variant calling.

Requires: git, wget, bzip2, Python 3 or 2.7
"""
from __future__ import print_function
import argparse
import collections
import contextlib
import datetime
import os
import platform
import shutil
import subprocess
import sys
try:
    import urllib2 as urllib_request
except ImportError:
    import urllib.request as urllib_request

# Remote resources used by the installer, looked up by purpose.
REMOTES = dict(
    # conda requirements file for bcbio-nextgen
    requirements="https://raw.githubusercontent.com/bcbio/bcbio-nextgen/master/requirements-conda.txt",
    # source repository, used for pip installs of the development version
    gitrepo="https://github.com/bcbio/bcbio-nextgen.git",
    # template system configuration that gets written into the data directory
    system_config="https://raw.githubusercontent.com/bcbio/bcbio-nextgen/master/config/bcbio_system.yaml",
    # Miniconda installer; the %s placeholder is filled with "MacOSX" or "Linux"
    anaconda="https://repo.anaconda.com/miniconda/Miniconda3-py37_4.10.3-%s-x86_64.sh",
)


def main(args, sys_argv):
    """Run the full installation: isolated Python, bcbio-nextgen, then tools and data.

    args -- parsed command line options (namespace produced by the argument
            parser defined at the bottom of this file).
    sys_argv -- raw command line arguments without the program name; a cleaned
                copy is forwarded to the ``upgrade`` command of the installed
                ``bcbio_nextgen.py``.

    Any failing external command raises subprocess.CalledProcessError.
    """
    check_arguments(args)
    check_dependencies()
    with bcbio_tmpdir():
        setup_data_dir(args)
        print("Installing isolated base python installation")
        anaconda = install_anaconda_python(args)
        conda_bin = "mamba" if args.use_mamba else "conda"
        # %-formatting rather than an f-string: matches every other message in
        # this file and keeps the script parseable by Python 2.7.
        print("Installing %s" % conda_bin)
        anaconda = install_mamba(anaconda, args)
        print("Installing conda-build")
        subprocess.check_call([anaconda[conda_bin], "install", "--yes", "conda-build"])
        print("Installing bcbio-nextgen")
        bcbio = install_conda_pkgs(anaconda, args)
        bootstrap_bcbionextgen(anaconda, args)
    # Everything below runs from the original working directory again.
    print("Installing data and third party dependencies")
    system_config = write_system_config(REMOTES["system_config"], args.datadir, args.tooldir)
    setup_manifest(args.datadir)
    # Hand over to the freshly installed bcbio for tool and data installation.
    subprocess.check_call([bcbio, "upgrade"] + _clean_args(sys_argv, args))
    print("Finished: bcbio-nextgen, tools and data installed")
    print(" Genome data installed in:\n  %s" % args.datadir)
    if args.tooldir:
        print(" Tools installed in:\n  %s" % args.tooldir)
    print(" Ready to use system configuration at:\n  %s" % system_config)
    print(" Edit configuration file as needed to match your machine or cluster")


def _clean_args(sys_argv, args):
    """Build the argument list handed to the bcbio ``upgrade`` command.

    Starting from the installer's own command line this drops the positional
    data directory, the installer-only ``--minimize-disk`` flag and ``--mamba``.
    ``--nodata`` is removed when present; otherwise ``--data`` is appended.
    The input list is not modified.
    """
    installer_only = set(["--minimize-disk"])
    forwarded = []
    for arg in sys_argv:
        is_option = arg.startswith("-")
        # A non-option argument resolving to the data directory is the positional datadir
        if not is_option and args.datadir == os.path.abspath(os.path.expanduser(arg)):
            continue
        if arg in installer_only:
            continue
        forwarded.append(arg)
    try:
        forwarded.remove("--nodata")
    except ValueError:
        # Data installation was not disabled, so request it explicitly
        forwarded.append("--data")
    try:
        forwarded.remove("--mamba")
    except ValueError:
        pass
    return forwarded


def bootstrap_bcbionextgen(anaconda, args):
    """Install the development version of bcbio-nextgen from git when requested.

    Does nothing unless ``args.upgrade`` is "development". Otherwise runs the
    pip found at ``anaconda["pip"]`` against the git repository, pinned to
    ``args.revision`` unless that is "master".
    """
    if args.upgrade != "development":
        return
    if args.revision == "master":
        git_tag = ""
    else:
        git_tag = "@%s" % args.revision
    target = "git+%s%s#egg=bcbio-nextgen" % (REMOTES["gitrepo"], git_tag)
    subprocess.check_call([anaconda["pip"], "install", "--upgrade", "--no-deps", target])

def install_mamba(anaconda, args):
    """Install the selected package manager and record where its executable lives.

    Uses the conda at ``anaconda["conda"]`` to install "mamba" (when
    ``args.use_mamba`` is set) or "conda", then stores the executable path
    under that same key in ``anaconda``. The dictionary is updated in place
    and also returned.
    """
    conda_bin = "mamba" if args.use_mamba else "conda"
    executable = os.path.join(args.datadir, "anaconda", "bin", conda_bin)
    subprocess.check_call([anaconda["conda"], "install", "--yes", conda_bin])
    anaconda[conda_bin] = executable
    return anaconda

def install_conda_pkgs(anaconda, args):
    """Install bcbio-nextgen's dependencies and pinned requirements with conda/mamba.

    Downloads the requirements file into the current directory unless it is
    already there, installs the bcbio-nextgen dependencies and then the
    packages listed in that file. Returns the expected path of the
    ``bcbio_nextgen.py`` script inside the anaconda installation.
    """
    # Try to avoid user specific pkgs and envs directories
    # https://github.com/conda/conda/issues/6748
    env = os.environ.copy()
    env.update({"CONDA_PKGS_DIRS": os.path.join(anaconda["dir"], "pkgs"),
                "CONDA_ENVS_DIRS": os.path.join(anaconda["dir"], "envs")})
    conda_bin = anaconda["conda"]
    # Prefer mamba when it has been installed, otherwise fall back to conda
    mamba_bin = anaconda.get("mamba", conda_bin)
    requirements = os.path.basename(REMOTES["requirements"])
    if not os.path.exists(requirements):
        # NOTE(review): --no-check-certificate turns off TLS certificate
        # verification for this download; confirm this is still required.
        subprocess.check_call(["wget", "--no-check-certificate", REMOTES["requirements"]])
    if args.minimize_disk:
        subprocess.check_call([mamba_bin, "install", "--yes", "nomkl"], env=env)
    dependency_cmd = [mamba_bin, "install", "--yes", "--only-deps", "bcbio-nextgen"]
    subprocess.check_call(dependency_cmd, env=env)
    requirements_cmd = [conda_bin, "install", "--yes", "--file", requirements]
    subprocess.check_call(requirements_cmd, env=env)
    return os.path.join(anaconda["dir"], "bin", "bcbio_nextgen.py")


def _guess_distribution():
    """Return "macosx" when platform.mac_ver() reports a release, otherwise "linux"."""
    mac_release = platform.mac_ver()[0]
    return "macosx" if mac_release else "linux"


def install_anaconda_python(args):
    """Provide isolated installation of Anaconda python for running bcbio-nextgen.
    http://docs.continuum.io/anaconda/index.html

    Installs into ``<args.datadir>/anaconda`` unless a conda executable is
    already present there. Returns a dictionary with the paths of the "conda"
    and "pip" executables and the installation "dir".
    """
    anaconda_dir = os.path.join(args.datadir, "anaconda")
    bindir = os.path.join(anaconda_dir, "bin")
    conda = os.path.join(bindir, "conda")
    already_installed = os.path.exists(anaconda_dir) and os.path.exists(conda)
    if not already_installed:
        # A directory without a conda executable is unusable: start over
        if os.path.exists(anaconda_dir):
            shutil.rmtree(anaconda_dir)
        dist = args.distribution or _guess_distribution()
        os_label = "MacOSX" if dist.lower() == "macosx" else "Linux"
        url = REMOTES["anaconda"] % os_label
        installer = os.path.basename(url)
        # Reuse an installer already present in the working directory
        if not os.path.exists(installer):
            subprocess.check_call(['wget', '--progress=dot:giga', url])
        subprocess.check_call(['bash', installer, '-b', '-p', anaconda_dir])
        # conda-forge channel should have the highest priority
        # https://bioconda.github.io/user/install.html#set-up-channels
        condarc = os.path.join(anaconda_dir, '.condarc')
        for channel in ('bioconda', 'conda-forge'):
            subprocess.check_call([conda, 'config', '--add', 'channels', channel,
                                   '--file', condarc])
    return {"conda": conda,
            "pip": os.path.join(bindir, "pip"),
            "dir": anaconda_dir}


def setup_manifest(datadir):
    """Make sure a ``manifest`` directory exists inside the data directory.

    The directory is created empty; nothing is done when the path already exists.
    """
    manifest_dir = os.path.join(datadir, "manifest")
    if os.path.exists(manifest_dir):
        return
    os.makedirs(manifest_dir)


def write_system_config(base_url, datadir, tooldir):
    """Write a bcbio_system.yaml configuration file with tool information.

    base_url -- URL of the template configuration; its basename is used as
                the output file name.
    datadir -- data directory; the file is written to ``<datadir>/galaxy/``.
    tooldir -- tool installation directory or None. When set, ``dir:`` entries
               of programs in the ``resources`` section are pointed at
               ``<tooldir>/share/java/<original basename>``.

    An existing file is returned untouched when ``tooldir`` is None; otherwise
    it is first copied to a ``.bak<YYYYmmdd_HHMM>`` backup and then rewritten.
    Returns the path of the configuration file.
    """
    out_file = os.path.join(datadir, "galaxy", os.path.basename(base_url))
    out_dir = os.path.dirname(out_file)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(out_file):
        # if no tool directory and exists, do not overwrite
        if tooldir is None:
            return out_file
        # %m is the month; %M (minutes) belongs only in the time part
        stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
        shutil.copy(out_file, out_file + ".bak%s" % stamp)
    java_basedir = os.path.join(tooldir, "share", "java") if tooldir else None
    rewrite_ignore = ("log",)
    with contextlib.closing(urllib_request.urlopen(base_url)) as in_handle:
        with open(out_file, "w") as out_handle:
            in_resources = False
            in_prog = None
            for line in (raw.decode("utf-8") for raw in in_handle):
                # Slices instead of indexes so very short lines cannot raise IndexError
                if line[:1] != " ":
                    # Top level key: track whether we are inside "resources"
                    in_resources = line.startswith("resources")
                    in_prog = None
                elif (in_resources and line[:2] == "  " and line[2:3] != " "
                      and not line.strip().startswith(rewrite_ignore)):
                    # Two-space indented key inside resources: a program name
                    in_prog = line.split(":")[0].strip()
                # Update java directories to point to install directory, avoid special cases
                elif line.strip().startswith("dir:") and in_prog and in_prog not in ["log", "tmp"]:
                    final_dir = os.path.basename(line.split()[-1])
                    if tooldir:
                        line = "%s: %s\n" % (line.split(":")[0],
                                             os.path.join(java_basedir, final_dir))
                    in_prog = None
                # NOTE(review): unreachable as written -- a line starting with
                # "galaxy" has a non-space first character and is therefore
                # handled by the first branch. Kept to preserve current output.
                elif line.startswith("galaxy"):
                    line = "# %s" % line
                out_handle.write(line)
    return out_file


def setup_data_dir(args):
    """Create the data directory ``args.datadir`` (with parents) if it is missing.

    Uses os.makedirs, like the other directory helpers in this file, instead
    of spawning an external ``mkdir -p`` process. Raises OSError when the
    directory cannot be created.
    """
    if not os.path.exists(args.datadir):
        os.makedirs(args.datadir)


@contextlib.contextmanager
def bcbio_tmpdir():
    """Run the managed block inside a ``tmpbcbio-install`` working directory.

    The directory is created under the current working directory if needed,
    made the current directory for the duration of the block, and yielded.
    The original working directory is always restored, also when the block
    raises. The temporary directory is removed only after a successful block;
    on failure it is left in place, as before.
    """
    orig_dir = os.getcwd()
    work_dir = os.path.join(orig_dir, "tmpbcbio-install")
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
    os.chdir(work_dir)
    try:
        yield work_dir
    finally:
        # Never leave the process inside the work directory
        os.chdir(orig_dir)
    # Only reached when the block finished without an exception
    shutil.rmtree(work_dir)


def check_arguments(args):
    """Ensure arguments are consistent and correct.

    Raises argparse.ArgumentTypeError when extra tool categories (--toolplus)
    are requested without a tool directory (--tooldir).
    """
    if not args.toolplus:
        return
    if not args.tooldir:
        raise argparse.ArgumentTypeError("Cannot specify --toolplus without --tooldir")


def check_dependencies(deps=None):
    """Ensure required tools for installation are present.

    deps -- optional list of ``(command, description)`` pairs, where command
            is an argument list to execute. Defaults to the tools the
            installer needs: git, wget and bzip2.

    Raises OSError naming the missing tool when a command cannot be started
    or exits with status 127.
    """
    print("Checking required dependencies")
    if deps is None:
        deps = [(["git", "--version"], "Git (http://git-scm.com/)"),
                (["wget", "--version"], "wget"),
                (["bzip2", "-h"], "bzip2")]
    for dep, msg in deps:
        try:
            p = subprocess.Popen(dep, stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
            # communicate() returns (stdout, stderr); the exit status is on the process
            out, _ = p.communicate()
            code = p.returncode
        except OSError:
            out = "Executable not found"
            code = 127
        if code == 127:
            if not isinstance(out, str):
                # Captured output is bytes on Python 3; decode for a readable message
                out = out.decode("utf-8", "replace")
            raise OSError("bcbio-nextgen installer requires %s\n%s" % (msg, out))


def _check_toolplus(x):
    """Parse options for adding non-standard/commercial tools like GATK and MuTecT.

    Accepts either one of the standard categories or ``toolname=filename``
    where the file must exist. Returns a ``Tool(name, fname)`` named tuple;
    ``fname`` is None for standard categories and a normalized real path
    otherwise. Raises argparse.ArgumentTypeError for anything else.
    """
    Tool = collections.namedtuple("Tool", ["name", "fname"])
    if x in {"data", "dbnsfp", "ericscript"}:
        return Tool(x, None)
    pieces = x.split("=")
    if len(pieces) != 2:
        # Either no "=" at all or more than one
        raise argparse.ArgumentTypeError("Unexpected --toolplus argument. "
                                         "Expect toolname=filename.")
    name, fname = pieces
    fname = os.path.normpath(os.path.realpath(fname))
    if os.path.exists(fname):
        return Tool(name, fname)
    raise argparse.ArgumentTypeError("Unexpected --toolplus argument for %s. "
                                     "File does not exist: %s" % (name, fname))


if __name__ == "__main__":
    def _abs_path(x):
        """Expand ``~`` and return the absolute form of a user supplied path."""
        return os.path.abspath(os.path.expanduser(x))

    # Allowed values for the options that restrict their input
    datatarget_choices = ["variation", "rnaseq", "smallrna", "gemini", "vep", "dbnsfp",
                          "battenberg", "kraken", "ericscript", "gnomad"]
    genome_choices = ["BDGP6", "canFam3", "dm3", "galGal4", "GRCh37", "GRCz10",
                      "GRCz11", "hg19", "hg38", "hg38-noalt", "mm10", "mm9", "phix",
                      "pseudomonas_aeruginosa_ucbpp_pa14", "rn5", "rn6", "sacCer3",
                      "Sscrofa11.1", "TAIR10", "WBcel235", "xenTro3"]
    aligner_choices = ["bbmap", "bowtie", "bowtie2", "bwa", "hisat2", "minimap2",
                       "novoalign", "rtg", "snap", "star", "ucsc"]
    distribution_choices = ["ubuntu", "debian", "centos", "scientificlinux", "macosx"]

    parser = argparse.ArgumentParser(
        description="Automatic installation for bcbio-nextgen pipelines")

    # Locations
    parser.add_argument("datadir", type=_abs_path,
                        help="Directory to install genome data")
    parser.add_argument("--cores", default=1,
                        help="Number of cores to use if local indexing is necessary.")
    parser.add_argument("--tooldir", type=_abs_path, default=None,
                        help="Directory to install 3rd party software tools. "
                             "Leave unspecified for no tools")

    # What to install; each of these may be given several times
    parser.add_argument("--toolplus", action="append", default=[], type=_check_toolplus,
                        help="Specify additional tool categories to install")
    parser.add_argument("--datatarget", action="append", default=[],
                        choices=datatarget_choices,
                        help="Data to install. Allows customization or install of extra data.")
    parser.add_argument("--genomes", action="append", default=[],
                        choices=genome_choices,
                        help="Genomes to download")
    parser.add_argument("--aligners", action="append", default=[],
                        choices=aligner_choices,
                        help="Aligner indexes to download")

    # Switches
    parser.add_argument("--nodata", dest="install_data", action="store_false", default=True,
                        help="Do not install data dependencies")
    parser.add_argument("--mamba", dest="use_mamba", action="store_true", default=False,
                        help="Use mamba instead of conda")
    parser.add_argument("--isolate", dest="isolate", action="store_true", default=False,
                        help="Created an isolated installation without PATH updates")
    parser.add_argument("--minimize-disk", dest="minimize_disk", action="store_true",
                        default=False,
                        help="Try to minimize disk usage (no MKL extensions)")

    # Versions and platform
    parser.add_argument("-u", "--upgrade", choices=["stable", "development"],
                        default="stable",
                        help="Code version to install")
    parser.add_argument("--revision", default="master",
                        help="Specify a git commit hash or tag to install")
    parser.add_argument("--cloudbiolinux", default="master",
                        help="Specify a cloudbiolinux git commit hash or tag to install")
    parser.add_argument("--distribution", default="", choices=distribution_choices,
                        help="Operating system distribution")

    # Without any argument just show the usage information
    if len(sys.argv) != 1:
        main(parser.parse_args(), sys.argv[1:])
    else:
        parser.print_help()