#start
import sys
import os
import glob
#config params:
#+) input=<directory_containing_per_sample_sums (format per file (tabs): chrm,start,end,sum)
#+) staging=<directory_to_put_intermediate_files>
#+) sample_ids_file=<file_with_studyID_sampleID_sampleID2_mappings>
#optional:
#+) existing_sums=<file_with_original_coordinates/annotations_in_SAME_ORDER_as_sums_files>
#+) pigz_threads=<#_of_threads_to_use_for_final_pasted_file_compression>
#+) prefix=<additional_type_prefix_for_files_paste>
# 'prefix' is optional (e.g. 'dn.' for APA sites, per the header comments);
# default it to the empty string so the config['prefix'] lookups below do
# not raise KeyError when it is not supplied.
if 'prefix' not in config:
    config['prefix'] = ""
# Final workflow target: one gzip-compressed matrix of all samples' sums.
FILES = ['%sall.samples.pasted.gz' % config['prefix']]
# Helper shell scripts live alongside this Snakefile.
main_script_path = os.path.join(workflow.basedir)
SCRIPTS = {'find': os.path.join(main_script_path, 'find_new_files.sh'),
           'group': os.path.join(main_script_path, 'group_sums.sh'),
           'paste': os.path.join(main_script_path, 'paste_sums.sh')}
# Initial coords/annotations (e.g. exons.bed.w_header.gz in Monorail);
# optional — empty string means "none".
if 'existing_sums' not in config:
    config['existing_sums'] = ""
# How many threads to allow pigz when compressing the final single file.
if 'pigz_threads' not in config:
    config['pigz_threads'] = 4
wildcard_constraints:
    # Low-order 2-character group names derived from study/run IDs.
    study_group_num="[0-9a-zA-Z]{2}",
    run_group_num="[0-9a-zA-Z]{2}",
    # prefix example: 'dn.' (for APA sites) or empty string
    type=config['prefix']+"all"
# Top-level pseudo-rule: requesting FILES drives the whole paste pipeline.
rule all:
    input:
        expand("{file}", file=FILES)
###exon SUM pasting rules
# Scan the input directory for per-sample sums files and emit the top-level
# groups manifest. NOTE(review): find_new_files.sh presumably also writes the
# per-(study,run)-group .manifest files consumed by the downstream rules'
# shell commands — confirm against the script.
rule find_sums:
    input:
        config['input'],
        config['sample_ids_file']
    output:
        config['staging'] + '/{type}.groups.manifest'
    params:
        staging=config['staging'],
        script_path=SCRIPTS['find'],
        type=lambda wildcards: wildcards.type
    shell:
        "{params.script_path} {input[0]} {input[1]} {params.staging} {params.type} .tsv"
# Collapse each (study_group, run_group) bucket of per-sample sums into one
# ".grouped" file. The shell command reads the per-group .manifest file in
# staging (not the declared input, which only orders this rule after
# find_sums); "convert_to_int" is passed through to group_sums.sh.
rule group_sums:
    input:
        config['staging'] + '/{type}.groups.manifest'
    output:
        config['staging'] + '/{type}.{study_group_num}.{run_group_num}.grouped'
    params:
        study_group_num=lambda wildcards: wildcards.study_group_num,
        run_group_num=lambda wildcards: wildcards.run_group_num,
        staging=config['staging'],
        script_path=SCRIPTS['group'],
        type=lambda wildcards: wildcards.type
    shell:
        "{params.script_path} {params.staging}/{params.type}.{params.study_group_num}.{params.run_group_num}.manifest {output} convert_to_int"
#do a rule instantiation per *run* low-order name grouping to do hierarchical pastes
# First level of the hierarchical paste: paste all samples within one
# (study_group, run_group) bucket. pigz_threads=0 for this intermediate
# output — presumably paste_sums.sh skips compression when 0; confirm
# against the script.
rule paste_sums_per_group:
    input:
        config['staging'] + '/{type}.{study_group_num}.{run_group_num}.grouped'
    output:
        config['staging'] + '/{type}.{study_group_num}.{run_group_num}.pasted'
    params:
        study_group_num=lambda wildcards: wildcards.study_group_num,
        run_group_num=lambda wildcards: wildcards.run_group_num,
        staging=config['staging'],
        script_path=SCRIPTS['paste'],
        type=lambda wildcards: wildcards.type,
        pigz_threads=0
    shell:
        "{params.script_path} {params.staging}/{params.type}.{params.study_group_num}.{params.run_group_num}.manifest {output} {params.pigz_threads}"
def get_pasted_sum_files(wildcards):
    """Input function for collect_pasted_sums.

    Reads the sample IDs manifest (tab-separated; column 0 = study ID,
    column 1 = sample ID) and returns the per-run-group '.pasted' file
    paths belonging to the study group named by
    ``wildcards.study_group_num``. Group names are the last two
    characters of the study/sample IDs.
    """
    study_loworder = wildcards.study_group_num
    with open(config['sample_ids_file'], "r") as fin:
        # splitlines() (vs. split('\n')[:-1]) keeps the last sample even
        # when the manifest has no trailing newline.
        lines = fin.read().splitlines()
    # Keep only rows in this study group (the previous version ignored
    # study_loworder and returned every group's files), and collapse the
    # one-row-per-sample manifest to unique group combinations while
    # preserving first-seen order.
    paths = [config['staging'] + "/%s.%s.%s.pasted" % (wildcards.type, f.split('\t')[0][-2:], f.split('\t')[1][-2:])
             for f in lines
             if f and f.split('\t')[0][-2:] == study_loworder]
    return list(dict.fromkeys(paths))
# Gather one study group's per-run-group pasted files into a list file
# (via an ls glob over staging) for the next paste level.
rule collect_pasted_sums:
    input:
        get_pasted_sum_files
    output:
        config['staging'] + '/{type}.{study_group_num}.pasted.files.list'
    params:
        study_group_num=lambda wildcards: wildcards.study_group_num,
        staging=config['staging'],
        type=lambda wildcards: wildcards.type
    shell:
        "ls {params.staging}/{params.type}.{params.study_group_num}.??.pasted > {output}"
# Second paste level: combine all run-group pastes of one study group.
# pigz_threads=0 (intermediate output, uncompressed); "dont_get_ids" is
# passed through to paste_sums.sh.
# NOTE(review): params.existing_sums is declared but never referenced in
# the shell command (paste_sums_final does pass it) — confirm whether it
# is intentionally omitted here.
rule paste_sums_per_study_group:
    input:
        config['staging'] + '/{type}.{study_group_num}.pasted.files.list'
    output:
        os.path.join(config['staging'], '{type}.{study_group_num}.pasted')
    params:
        study_group_num=lambda wildcards: wildcards.study_group_num,
        staging=config['staging'],
        script_path=SCRIPTS['paste'],
        existing_sums=config['existing_sums'],
        type=lambda wildcards: wildcards.type,
        pigz_threads=0
    shell:
        "{params.script_path} {input} {output} {params.pigz_threads} dont_get_ids"
def get_study_pasted_sum_files(wildcards):
    """Input function for collect_study_pasted_sums.

    Reads the sample IDs manifest (tab-separated; column 0 = study ID)
    and returns one per-study-group '.pasted' path for each distinct
    study group (last two characters of the study ID), preserving
    first-seen order.
    """
    with open(config['sample_ids_file'], "r") as fin:
        # splitlines() (vs. split('\n')[:-1]) keeps the last sample even
        # when the manifest has no trailing newline.
        lines = fin.read().splitlines()
    paths = [config['staging'] + "/%s.%s.pasted" % (wildcards.type, f.split('\t')[0][-2:])
             for f in lines if f]
    # The manifest has one row per sample; collapse to unique study groups
    # (the previous version returned one duplicate path per sample row).
    return list(dict.fromkeys(paths))
# Gather all study-group pasted files into a single list file (via an
# ls glob over staging) for the final paste.
rule collect_study_pasted_sums:
    input:
        get_study_pasted_sum_files
    output:
        config['staging'] + '/{type}.groups.pasted.files.list'
    params:
        staging=config['staging'],
        type=lambda wildcards: wildcards.type
    shell:
        "ls {params.staging}/{params.type}.??.pasted > {output}"
# Final paste level: combine all study-group pastes into the single
# gzip-compressed matrix in the working directory (matches FILES via the
# 'type' wildcard constraint). existing_sums, when set, supplies the
# original coordinates/annotations column(s); compression gets the
# configured number of pigz threads.
rule paste_sums_final:
    input:
        config['staging'] + '/{type}.groups.pasted.files.list'
    output:
        '{type}.samples.pasted.gz'
    params:
        staging=config['staging'],
        script_path=SCRIPTS['paste'],
        existing_sums=config['existing_sums'],
        type=lambda wildcards: wildcards.type,
        pigz_threads = config['pigz_threads']
    shell:
        "{params.script_path} {input} {output} {params.pigz_threads} dont_get_ids {params.existing_sums}"