File: run_FastQTL_threaded.py

package info (click to toggle)
fastqtl 2.184%2Bv7%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 17,428 kB
  • sloc: cpp: 3,102; python: 323; makefile: 82; sh: 19; javascript: 3
file content (122 lines) | stat: -rwxr-xr-x 5,984 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
# Author: Francois Aguet
import argparse
import os
import numpy as np
import subprocess
import gzip
import multiprocessing as mp
import contextlib
from datetime import datetime

@contextlib.contextmanager
def cd(cd_path):
    saved_path = os.getcwd()
    os.chdir(cd_path)
    yield
    os.chdir(saved_path)

def get_cmd(args, chunk):
    cmd = os.path.join(fastqtl_dir, 'bin', 'fastQTL')+' --vcf '+args.vcf+' --bed '+args.bed+' --window '+args.window\
        +' --maf-threshold '+args.maf_threshold+' --ma-sample-threshold '+args.ma_sample_threshold
    if args.covariates:
        cmd += ' --cov '+args.covariates
    if args.threshold:
        cmd += ' --threshold'+args.threshold
    if args.permute:
        cmd += ' --permute '+' '.join([str(p) for p in args.permute])
    if args.best_variant_only:
        cmd += ' --report-best-only'
    if args.seed:
        cmd += ' --seed '+args.seed
    if args.exclude_samples:
        cmd += ' --exclude-samples '+args.exclude_samples
    cmd += ' --chunk '+str(chunk)+' '+args.chunks\
        + ' --out '+args.prefix+'_chunk{0:03d}.txt.gz'.format(chunk)\
        + ' --log '+args.prefix+'_chunk{0:03d}.log'.format(chunk)
    return cmd

def perm_worker(inputs):
    args = inputs[0]
    chunk = inputs[1]
    cmd = get_cmd(args, chunk)
    print('Processing chunk '+str(chunk), flush=True)
    s = subprocess.check_call(cmd, shell=True, executable='/bin/bash', stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    print('Finished chunk '+str(chunk), flush=True)
    return s


parser = argparse.ArgumentParser(description='Run FastQTL')
parser.add_argument('vcf', help='Genotypes in VCF 4.1 format')
parser.add_argument('bed', help='Phenotypes in UCSC BED extended format')
parser.add_argument('prefix', help='Prefix for output file name')
parser.add_argument('--covariates', default='', help='Covariates')
parser.add_argument('--chunks', default='100', help='Number of chunks, minimum: #chromosomes')
parser.add_argument('--permute', default=None, type=str, nargs='+', help='Number of permutations, e.g. [1000, 10000] (adaptive). Default: None (run nominal pass)')
parser.add_argument('--best_variant_only', action='store_true')
parser.add_argument('--window', default='1e6', help='Cis-window size. Default values is 1Mb (1e6).')
parser.add_argument('--threshold', default='', help='Output only significant phenotype-variant pairs with a p-value below threshold (default 1)')
parser.add_argument('--maf_threshold', default='0.0', help='Include only genotypes with minor allele frequency >=maf_threshold (default 0)')
parser.add_argument('--ma_sample_threshold', default='0', help='Include only genotypes with >=ma_sample_threshold samples carrying the minor allele (default 0)')
parser.add_argument('--fdr', default=0.05, type=np.double)
parser.add_argument('--seed', default=None, help='Random number generator seed')
parser.add_argument('--exclude_samples', default=None, help='')
parser.add_argument('--qvalue_lambda', default=None, help='lambda parameter for pi0est in qvalue.')
parser.add_argument('-t', '--threads', default=8, type=int, help='Number of threads')
parser.add_argument('-o', '--output_dir', default='.', help='Output directory')
args = parser.parse_args()
fastqtl_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

with cd(args.output_dir):
    print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running FastQTL on {0:d} threads.'.format(args.threads), flush=True)
    with mp.Pool(processes=args.threads) as pool:
        pdata_res = [pool.map_async(perm_worker, ((args,k),)) for k in np.arange(1,int(args.chunks)+1)]
        pool.close()
        pool.join()
    for res in pdata_res:  # check exit status
        assert res.get()[0]==0
    print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Done.', flush=True)

    if args.permute:
        # merge chunks
        print('Merging chunks ... ', end='', flush=True)
        cmd = 'zcat '+args.prefix+'_chunk*.txt.gz | gzip -c -1 > '+args.prefix+'.txt.gz && rm '+args.prefix+'_chunk*.txt.gz'
        subprocess.check_call(cmd, shell=True, executable='/bin/bash')
        cmd = 'cat '+args.prefix+'_chunk*.log > '+args.prefix+'.egenes.log && rm '+args.prefix+'_chunk*.log'
        subprocess.check_call(cmd, shell=True, executable='/bin/bash')
        print('done.', flush=True)

        # calculate q-values (R script also adds header)
        print('Calculating q-values', flush=True)
        cmd = 'Rscript '+os.path.join('/usr/share/fastqtl', 'calculateSignificanceFastQTL.R')\
            +' '+args.prefix+'.txt.gz '+str(args.fdr)+' '+args.prefix+'.egenes.txt.gz'
        if args.qvalue_lambda is not None:
            cmd += ' --lambda '+args.qvalue_lambda
        subprocess.check_call(cmd, shell=True, executable='/bin/bash')
        os.remove(args.prefix+'.txt.gz')
    else:
        # merge chunks
        print('Merging chunks ... ', end='', flush=True)
        with gzip.open('header_chunk.txt.gz', 'wb') as f:  # order from analysisNominal.cpp
            f.write(('\t'.join([
                'gene_id',
                'variant_id',
                'tss_distance',
                'ma_samples',
                'ma_count',
                'maf',
                'pval_nominal',
                'slope',
                'slope_se',
            ])+'\n').encode('utf-8'))
        cmd = 'zcat header_chunk.txt.gz '+args.prefix+'_chunk*.txt.gz | gzip -c -1 > '+args.prefix+'.txt.gz && rm '+args.prefix+'_chunk*.txt.gz'
        subprocess.check_call(cmd, shell=True, executable='/bin/bash')
        os.remove('header_chunk.txt.gz')
        cmd = 'cat '+args.prefix+'_chunk*.log > '+args.prefix+'.allpairs.log && rm '+args.prefix+'_chunk*.log'
        subprocess.check_call(cmd, shell=True, executable='/bin/bash')
        print('done.', flush=True)

        os.rename(args.prefix+'.txt.gz', args.prefix+'.allpairs.txt.gz')