#!/usr/bin/python3 # -*- coding: utf-8 -*- '''utils module for snakemake pipeline This module provides utility classes and functions that make it easy to handle nucliotide sequence data ''' from itertools import takewhile from collections import OrderedDict import sys import os.path import re import yaml import pprint ''' FastQ Illumina filenames FastQ with complex naming convention): _____ ____ openBIS ID contains underscores and "mismatches in index" field may or may not be present Example: __<6>__ ____<001>_.fastq.gz ([(openBIS_ID,3,'_'), flowcell_name, lane, (sample_name,['USER,ID'],'_'), sample_number] __<68928>__<6>__ ____<001>_.fastq.gz (?P)_(?P)_(?P)_(?P)_(?P)_(?P_(?P)_(?P)_(?P)_(?P).fastq.gz IlluminaStandard: ____ ''' # Exceptions # illumina_standard_filename = [ # ('sample_name', {}), ('sample_num', {'regex': 'S\d+'}), # ('lane', {'regex': 'L\d+'}), ('read', {'regex': 'R\d+'}), # ('running_num', {'regex': '\d+'})] class SequenceFile: filename = '' path = '' ext = '' ext2 = '' add_exts = [] def __init__(self, filename, path='', ext='', ext2='', add_exts=[]): self.filename = filename self.path = path self.ext = ext self.ext2 = ext2 self.add_exts = [] def __lt__(self, other): return (([self.filename, self.ext, self.ext2] + other) < ([other.filename, other.ext, other.ext2] + other.other)) def __str__(self): return ''.join([os.path.join(self.path, self.filename), self.ext, self.ext2] + self.add_exts) def __repr__(self): return ''.join([os.path.join(self.path, self.filename), self.ext, self.ext2] + self.add_exts) class Field(): name = '' regex = '' subf_num = 1 subf_sep = None optional = False def __init__(self, name, regex='', subf_num=1, subf_sep=None, optional=False): self.name = name self.regex = regex self.subf_num = subf_num self.subf_sep = subf_sep self.optional = optional def to_regex(self, field_sep): if self.subf_num < 1: name = self.name raise MalformedFieldError( 'Field %s has less than 1 entry' % (name)) elif self.subf_num == 1: field_regex = (r'[^%s]*' % field_sep if not self.regex else self.regex) re_pattern = r'(?P<%s>%s)' % (self.name, field_regex) else: sfs = (field_sep + (self.subf_sep if self.subf_sep is not None else '')) subf_sep = field_sep if self.subf_sep is None else self.subf_sep subf_regex = r'[^%s]*' % format(sfs) subf_regex = subf_sep.join([subf_regex]*self.subf_num) re_pattern = ( '(?P<%s>%s)' % (self.name, subf_regex)) return re_pattern class MalformedFieldError(Exception): '''Exception thrown if regex field is malformed ''' pass class AmbigiousPairedReadsError(Exception): ''' Exception thrown if read pairs can't be matched Paired Read data has at most two files that are grouped together If more are found, this esxception is raised. Example: S1_R1.fastq, S1_R2.fastq S1_R3.fastq > Exception S1_R1.fastq, S1_R2.fastq S1_R2.fastq > Exception S1_R1.fastq, S1_R2.fastq > OK ''' pass class UnknownExtensionError(Exception): '''Exception thrown if input sequence file has an unknown file extension If the pipeline only excepts nucleotide sequence files, like Fastas and Fastqs, inputting a mapping file or index will cause this exception to be thrown. ''' pass class IncongruentFieldValueError(Exception): '''Exception thrown if ther is a missmatch in a sample file grouping If a grouping of files, that belong to the same SampleId grouping, differ in a non variable field of the filename, this exception is raised ''' pass class SampleIDNotUniqueError(Exception): '''File exists more than once in sample file grouping This might be caused by a sample id that is not unique ''' pass class FormatMismatch(Exception): pass # Classes class MultiFileReads: def __init__(self): pass def get_files(self): pass def add_files(self): pass class IlluminaMFR(MultiFileReads): n_files = 0 uses_common_fields_as_id = False n_branches = 0 format_name = '' field_names = [] id_fields = [] id_string = '' id_sep = '' main_sep = '' var_fields = [] ignore_fields = [] regex_str = '' regex = None sample_dict = {} non_var_dict = {} common_fields_dict = {} default_field_values = {} def __init__(self, filename_rule, main_sep='_', id_sep='', id_fields=['sample_name'], var_fields=['lane', 'running_num'], format_name='new_format', default_field_values={'sample_num': 'S1', 'lane': 'L001', 'read': 'R1', 'running_num': '001'}): super().__init__() # Construct regex string self.regex_str = main_sep.join([ Field(name, **key_opts).to_regex(main_sep) for (name, key_opts) in filename_rule]) self.regex = re.compile(self.regex_str) self.field_names = [name for (name, _) in filename_rule] self.id_fields = id_fields self.id_sep = id_sep self.id_string = '' self.default_field_values = default_field_values self.var_fields = var_fields self.sample_dict = {} self.non_var_dict = {} self.format_name = format_name self.main_sep = main_sep self.n_files = 0 self.n_branches = 0 self.common_fields_dict = {} def __repr__(self): return 'IlluminaMFR<%s>' % ', '.join( ['format_name: %s' % self.format_name, '\nn_files: %i' % self.n_files, 'n_branches: %i' % self.n_branches, 'tree:\n%s' % pprint.pformat(self.sample_dict)]) def add_files(self, file_list, read_target=0): if isinstance(file_list, SequenceFile): file_list = [file_list] file_list = sorted(file_list) for file in file_list: try: field_dict = self.regex.match(file.filename).groupdict() except AttributeError: raise FormatMismatch( 'FormatMismatch: %s does not fit %s' % (file, self.format_name)) sample_dict = self.sample_dict read = None var_dict = {} if self.n_files == 0: self.common_fields_dict = dict((key, val) for (key, val) in field_dict.items() if key not in (self.var_fields + ['read'])) self.update_id_field() for field in self.field_names: f_value = field_dict[field] sample_key = field + ':' + f_value read_field = field == 'read' if read_field: read_id = f_value[-1] if not read_target or read_id == str(read_target): read = file.filename elif field in self.var_fields: if self.n_files == 0 or sample_key not in sample_dict: sample_dict[sample_key] = {} sample_dict = sample_dict[sample_key] var_dict[field] = f_value # print(var_dict) else: if self.n_files == 0: self.non_var_dict[sample_key] = 42 elif (not read_field and sample_key not in self.non_var_dict): raise IncongruentFieldValueError( 'File: %s\n of ID Group %s has' ' group missmatch in field: %s\n' ' with value %s' % ( file, self.id_string, field, f_value)) if read is None: read_id = 1 read = file.filename # if read is not None: # if 'read%s' % read_id in sample_dict: raise SampleIDNotUniqueError( 'Error: files \n%s\n%s\n' 'have the same variable arguments. ' % ( str(file), sample_dict['read'+str(read_id)])) var_dict['read%s' % read_id] = file if len(sample_dict) < 1: self.n_branches += 1 sample_dict.update(var_dict.copy()) self.n_files += 1 def update_id_field(self, id_sep=None, id_fields=None): if id_sep is not None: self.id_sep = id_sep if id_fields: self.id_fields = id_fields self.id_string = self.id_sep.join( [self.common_fields_dict[field] for field in self.id_fields]) if id_sep or id_fields: self.use_common_fields_as_id = False def set_format_name(self, new_name): self.format_name = new_name def get_files(self, undetermined=None): output = [] to_do = [self.sample_dict] while to_do: curr_node = to_do.pop() if not any(isinstance(next_node, dict) for next_node in curr_node.values()): output += [curr_node] else: to_do.extend([curr_node[n] for n in curr_node]) return output def use_common_fields_as_id(self): new_id_fields = [f for f in self.field_names if f not in (self.var_fields+['read'])] self.update_id_field(id_fields=new_id_fields) self.uses_common_fields_as_id = True class PacBioMFR(MultiFileReads): n_files = 0 n_branches = 0 format_name = '' field_names = [] id_fields = [] id_string = '' id_sep = '' main_sep = '' var_fields = [] ignore_fields = [] regex_str = '' regex = None sample_dict = {} non_var_dict = {} common_fields_dict = {} def __init__(self, filename_rule, main_sep='_', id_sep='', id_fields=['sample_name'], var_fields=['dummy'], format_name='new_format'): super().__init__() # Construct regex string self.regex_str = main_sep.join([ Field(name, **key_opts).to_regex(main_sep) for (name, key_opts) in filename_rule]) self.regex = re.compile(self.regex_str) self.field_names = [name for (name, _) in filename_rule] self.id_fields = id_fields self.id_sep = id_sep self.id_string = '' self.var_fields = var_fields self.sample_dict = {} self.non_var_dict = {} self.format_name = format_name self.main_sep = main_sep self.n_files = 0 self.n_branches = 0 self.common_fields_dict = {} def __repr__(self): return 'PacBioMFR<%s>' % ', '.join( ['format_name: %s' % self.format_name, '\nn_files: %i' % self.n_files, 'n_branches: %i' % self.n_branches, 'tree:\n%s' % pprint.pformat(self.sample_dict)]) def add_files(self, file_list, file_selector_target=''): if isinstance(file_list, SequenceFile): file_list = [file_list] file_list = sorted(file_list) for file in file_list: try: field_dict = self.regex.match(file.filename).groupdict() except AttributeError: raise FormatMismatch( 'FormatMismatch: %s does not fit %s' % (file, self.format_name)) sample_dict = self.sample_dict file_selector = None var_dict = {} if self.n_files == 0: self.common_fields_dict = dict((key, val) for (key, val) in field_dict.items() if key not in (self.var_fields + ['file_selector'])) self.update_id_field() for field in self.field_names: f_value = field_dict[field] sample_key = field + ':' + f_value file_selector_field = field == 'file_selector' if file_selector_field: file_selector_id = f_value if not file_selector_target or file_selector_id == str(file_selector_target): file_selector = file.filename elif field in self.var_fields: if self.n_files == 0 or sample_key not in sample_dict: sample_dict[sample_key] = {} sample_dict = sample_dict[sample_key] var_dict[field] = f_value else: if self.n_files == 0: self.non_var_dict[sample_key] = 42 elif (not file_selector_field and sample_key not in self.non_var_dict): raise IncongruentFieldValueError( 'File: %s\n of ID Group %s has' ' group missmatch in field: %s\n' ' with value %s' % ( file, self.id_string, field, f_value)) if 'file_selector_id' in sample_dict: raise SampleIDNotUniqueError( 'Error: files \n%s\n%s\n' 'have the same variable arguments. ' % ( file_selector, sample_dict[str(file_selector_id)])) if file_selector is not None: # var_dict[file_selector_id] = file if len(sample_dict) < 1: self.n_branches += 1 sample_dict.update(var_dict.copy()) self.n_files += 1 def update_id_field(self, id_sep=None, id_fields=None): if id_sep is not None: self.id_sep = id_sep if id_fields: self.id_fields = id_fields self.id_string = self.id_sep.join( [self.common_fields_dict[field] for field in self.id_fields]) def set_format_name(self, new_name): self.format_name = new_name def get_files(self, undetermined=None): output = [] to_do = [self.sample_dict] while to_do: curr_node = to_do.pop() if not any(isinstance(next_node, dict) for next_node in curr_node.values()): output += [curr_node] else: to_do.extend([curr_node[n] for n in curr_node]) return output class IonTorrentMFR(MultiFileReads): pass def test_fun(filename): '''testfunction used for debugging purposes imports first format of given filenames.yaml, builds and returns corresponding multi file read object ''' formats = get_formats_from_file(filename) basel = list(formats[0].values())[0] mfr = IlluminaMFR(basel['format'].items(), main_sep=basel['main_sep']) return mfr class SampleInfoPaired: ''' Information Container for Paired End Read Data Attributes: ID (str): first section of file base name shared by read pair READ1 (str): Absolute path of first half of read pairs READ2 (str): Absolute path to second half of read pairs exts (:obj:`list` of :obj:`str`): List of extensions used by read pair in correct order zip_exts (:obj:`list` of :obj:`str`): List of compression extensions used by pair (also in correct order) If they are empty strings, no compression extension is used ''' ID = '' READ1 = '' READ2 = '' exts = ['', ''] zip_exts = ['', ''] add_info = {} def __init__(self, r1, r2, id_str, exts=['.fastq', '.fastq'], zip_exts=['', ''], add_info={}): '''Initializer Reads in the attributes in the order read1, read2, ids, exts, zip, exts Kwargs: exts: default ['.fastq', '.fastq'] zip_exts: default ['', ''] ''' self.ID = id_str self.READ1 = r1 self.READ2 = r2 self.exts = exts self.zip_exts = zip_exts self.add_info = add_info def __str__(self): return 'SampleInfoPaired()' def __repr__(self): att = (self.ID, self.READ1, self.READ2, ','.join(self.exts), ','.join(self.zip_exts)) return ('SampleInfoPaired()' if not any(att) else '' % att) class SampleInfoSingle: ''' Information Container for Single End Read Data Attributes: ID (str): first section of read file base name READ1 (str): Absolute path to read file ext ( :obj:`str`): Extension used by read data file zip_ext (:obj:`str`): Compression extension used by read file If the string is empty, no compression extension is used ''' ID = '' READ1 = '' ext = '' zip_ext = '' add_info = {} def __init__(self, ids, r1, ext='.fastq', zip_ext='', add_info={}): '''Initializer Reads in the attributes in the order ids, read1, exts, zip, exts Kwargs: ext: default '.fastq' zip_ext: default '' ''' self.ID = ids self.READ1 = r1 self.ext = ext self.zip_ext = zip_ext self.add_info = add_info def __str__(self): return 'SampleInfoSingle()' def __repr__(self): att = (self.ID, self.READ1, self.ext, self.zip_ext) return ('SampleInfoSingle()' if not any(att) else '' % att) class PacBioSampleInfoRS_II: bax1 = '' bax2 = '' bax3 = '' metadata = '' bas = '' add_info = {} def __init__(self, ids, metadata='', bax1='', bax2='', bax3='', bas='', add_info={}): '''Initializer Reads in the attributes in the order ids, read1, exts, zip, exts Kwargs: ext: default '.fastq' zip_ext: default '' ''' self.ID = ids self.metadata = metadata self.bas = bas self.add_info = add_info self.bax1 = bax1 self.bax2 = bax2 self.bax3 = bax3 def __str__(self): return 'SampleInfoSingle()' class PacBioSampleInfoRS: ID = '' metadata = '' bas = '' add_info = {} def __init__(self, ids, metadata='', bas='', add_info={}): '''Initializer Reads in the attributes in the order ids, read1, exts, zip, exts Kwargs: ext: default '.fastq' zip_ext: default '' ''' self.ID = ids self.metadata = metadata self.bas = bas self.add_info = add_info def __str__(self): return 'SampleInfoSingle()' class ReferenceInfo: ''' Information Container for Genomic Reference Data Attributes: ID (str): reference file base name without extension REFERENCE (str): Absolute path to reference file ext ( :obj:`str`): Extension used by reference data file zip_ext (:obj:`str`): Compression extension used by reference file If the string is empty, no compression extension is used ''' ID = '' REFERENCE = '' ext = '' zip_ext = '' def __init__(self, id, reference, ext='.fna', zip_ext='.gz'): self.ID = id self.REFERENCE = reference self.ext = ext self.zip_ext = zip_ext def __str__(self): return 'ReferenceInfo()' def __repr__(self): att = (self.ID, self.REFERENCE, self.ext, self.zip_ext) return ('ReferenceInfo()' if not any(att) else '' % att) def eprint(*args, **kwargs): ''' print function that prints to stderr :return: returns nothing ''' print(*args, file=sys.stderr, **kwargs) def test_extension(filename, extension_list): ''' tests which extension a file uses Args: filename (:obj:`str`): name of file whose extension will get checked extension_list (:obj:`list` of :obj:`str`): list of extensions that the file will be checked against. should contain the dot and extensions that share a prefix should be sorted in ascending order Returns: (:obj:`str`): Extension used or '' if not found in extension_list ''' res = '' for ext in extension_list: if len(filename.split(ext)) == 2: res = ext break return res def parse_sample_info(sample_list, format_dict, use_common_fields_as_id=False, target_formats=['illumina_fastq']): '''Parses filenames and generates SampleInfoObjects Turns list of input files into read data containers. It finds pairs for paired end read data and determines sample ids, which compression extension is used (if any), and which nucleotide sequence file extension is used. It only accepts files that end in (otional:) Args: sample_list(:obj:`list` of :obj:`str`): list of filenames format_dict(:obj:`dict` nested and with various types): file naming conventions loaded from yaml file Kwargs: use_common_fields_as_id(:obj:`bool` default False): Ignores ID Fields specified in formats config and just uses all fields that are not marked as variable as ID target_formats(:obj:`list` of :obj:`str`): List of machine target_formats to check against Choose from: illumina_fastq, ion_torrent_bam, pacbio Returns: :obj:`dict` of :obj:`MFR_Collection` Returns dictionary of MFR_Collections found, with respective target_format as key value. Example: result["illumina_fastq"] returns a Illumina_MFR_Collection() It also provides a dictionary entry for discarded files result["discarded"] Raises: UnknownExtensionError: If sequence file extension unknown AmbigiousPairedReadsError: If paired data has to many matching files ''' collections = {'ion_torrent_bam': IonTorrent_MFR_Collection, 'illumina_fastq': Illumina_MFR_Collection, 'pacbio': PacBio_MFR_Collection} sample_list = sorted(sample_list) mfr_samples = {} # Collection of detected multifile samples mfrs_found = dict() discarded = list() # Accumulate sample information and build samples dictinary for sample in sample_list: found_format_or_is_leftover = False for target_format_type in target_formats: if found_format_or_is_leftover: break seq_exts = format_dict[target_format_type]['main_exts'] format_list = format_dict[target_format_type]['formats'] # Select which mfr type to try mfr_type = collections[target_format_type].mfr_type try: zip_exts = format_dict[target_format_type]['secondary_exts'] except KeyError: zip_exts = [] used_ext = test_extension(sample, seq_exts) if not used_ext: continue raise UnknownExtensionError( 'Extension not recognized\n%s' % sample) sample_string, zipped = sample.split(used_ext) if zipped and not test_extension(zipped, zip_exts): continue path = os.path.dirname(sample) sample = os.path.basename(sample_string) # Get first section of file name for ID seq_file = SequenceFile(sample, path=path, ext=used_ext, ext2=zipped, add_exts=[]) # Check if known format mfr = find_format(seq_file, format_list, mfr_type) if target_format_type not in mfrs_found: mfrs_found[target_format_type] = ( collections[target_format_type]()) if issubclass(type(mfr), MultiFileReads): if use_common_fields_as_id: mfr.use_common_fields_as_id() mfrs_found[target_format_type].add(mfr, seq_file) else: mfrs_found[target_format_type].leftovers.add(sample, path, zipped, used_ext) found_format_or_is_leftover = True if not found_format_or_is_leftover: discarded.append(sample) # Process samples and build for id_str in mfr_samples: mfr = mfr_samples[id_str] print('MultiFileSample: ', id_str, 'format:', mfr.format_name) print(mfr.get_files()) return mfrs_found, discarded class MFR_Collection: mfrs = {} mfr_type = MultiFileReads def __init__(self): self.mfrs = {} def add(self, mfr, seq_file): pass ### ---- Illumina FastQ Collections ------ ### class Illumina_MFR_Collection(MFR_Collection): mfrs = {} leftovers = None mfr_type = IlluminaMFR def __init__(self): super().__init__() self.leftovers = Leftovers() self.mfrs = {} def add(self, mfr, seq_file): if mfr.id_string not in self.mfrs: self.mfrs[mfr.id_string] = mfr else: self.mfrs[mfr.id_string].add_files(seq_file) def flatten_rename(self, newIDPrefix='S', start_index=1): ''' ''' result = {} index = start_index for mfr_id in sorted(self.mfrs.keys()): mfr = self.mfrs[mfr_id] for samp_dict in mfr.get_files(): read_num = len([x for x in samp_dict if x[:-1] == 'read']) if read_num == 1: # if 'read1' not in samp_dict: # print(samp_dict, '\n', mfr.format_name) file = samp_dict['read1'] sample = SampleInfoSingle(mfr.id_string, os.path.abspath(str(file)), ext=file.ext, zip_ext=file.ext2) elif read_num == 2: file1 = samp_dict['read1'] file2 = samp_dict['read2'] sample = SampleInfoPaired(os.path.abspath(str(file1)), os.path.abspath(str(file2)), mfr.id_string, [file1.ext, file2.ext], [file2.ext2, file2.ext2]) else: raise AmbigiousPairedReadsError( 'To many files map together:\n%s' % repr(mfr).replace('>,', '>,\n')) result['%s%i' % (newIDPrefix, index)] = sample index += 1 return result def flatten_naive(self): result = {} for mfr_id in sorted(self.mfrs.keys()): mfr = self.mfrs[mfr_id] for samp_dict in mfr.get_files(): id_values = [] for field in mfr.field_names: if field in mfr.var_fields: id_values.append(samp_dict[field]) if field in mfr.common_fields_dict: id_values.append(mfr.common_fields_dict[field]) id_string = mfr.main_sep.join(id_values) read_num = len([x for x in samp_dict if x[:-1] == 'read']) if read_num == 1: # if 'read1' not in samp_dict: # print(samp_dict, '\n', mfr.format_name) file = samp_dict['read1'] sample = SampleInfoSingle(id_string, os.path.abspath(str(file)), ext=file.ext, zip_ext=file.ext2) elif read_num == 2: file1 = samp_dict['read1'] file2 = samp_dict['read2'] sample = SampleInfoPaired(os.path.abspath(str(file1)), os.path.abspath(str(file2)), id_string, [file1.ext, file2.ext], [file2.ext2, file2.ext2]) else: raise AmbigiousPairedReadsError( 'To many files map together:\n%s' % repr(mfr).replace('>,', '>,\n')) result[id_string] = sample return result def get_samples(self): result = {} for mfr_id in sorted(self.mfrs.keys()): mfr = self.mfrs[mfr_id] container = [] for samp_dict in mfr.get_files(): read_num = len([x for x in samp_dict if x[:-1] == 'read']) add_info = dict((x, y) for x, y in samp_dict.items() if x[:-1] != 'read') add_info.update(mfr.common_fields_dict) add_info['format'] = mfr.format_name if read_num == 1: # if 'read1' not in samp_dict: # print(samp_dict, '\n', mfr.format_name) try: file = samp_dict['read1'] sample = SampleInfoSingle(mfr.id_string, os.path.abspath(str(file)), ext=file.ext, zip_ext=file.ext2, add_info=add_info) except KeyError as err: eprint('get_verbose_samples(): cannot find read1\n ' 'culprit: %s' % repr(mfr).replace('>,', '>,\n')) elif read_num == 2: file1 = samp_dict['read1'] file2 = samp_dict['read2'] sample = SampleInfoPaired(os.path.abspath(str(file1)), os.path.abspath(str(file2)), mfr.id_string, [file1.ext, file2.ext], [file2.ext2, file2.ext2], add_info=add_info) else: raise AmbigiousPairedReadsError( 'To many files map together:\n%s' % repr(mfr).replace('>,', '>,\n')) container.append(sample) result[mfr_id] = container return result class Leftovers: samples = {} delims = [] num_files = 0 def __init__(self, delims=['_', '.', '+']): self.samples = {} self.delims = delims self.num_files = 0 def add(self, sample, path, zipped, used_ext): for delim in self.delims: sample_delim_split = sample.split(delim) sample_ID = sample_delim_split[0] if len(sample_delim_split) > 1: break # Sample not seen before num = len(self.samples) if sample_ID not in self.samples: lcp = 0 self.samples[sample_ID] = ([sample], [path], lcp, num, [zipped], [used_ext]) num += 1 # Sample already seen else: (prev_sams, prev_paths, prev_lcp, old_num, zippeds, exts) = ( self.samples[sample_ID]) # Use longest common prefix of files to determine read type # Note: Not safe if reads are fractured between different flow cell # lanes or tiles... this will break lcp = len(longest_common_prefix(prev_sams[0], sample)) self.samples[sample_ID] = ( prev_sams+[sample], prev_paths+[path], lcp, old_num, zippeds+[zipped], exts+[used_ext]) def process_leftovers(self, rename=True, rename_start_index=1): results = dict() for id_str in self.samples: sams, paths, lcp, num, zippeds, exts = self.samples[id_str] num += rename_start_index identical = all(lcp == len(x) for x in sams) if len(sams) == 2 and not identical: #print(sams) pair = [x[lcp] for x in sams] index = pair.index('1') ord_ext = [exts[index]] ord_zippeds = [zippeds[index]] read1 = os.path.join(paths[index], sams[index] + exts[index] + zippeds[index] if zippeds else '') read1 = os.path.abspath(read1) index = pair.index('2') ord_ext += [exts[index]] ord_zippeds += [zippeds[index]] read2 = os.path.join(paths[index], sams[index] + exts[index] + zippeds[index] if zippeds else '') read2 = os.path.abspath(read2) final_id_string = ('S%i' % num) if rename else id_str results[final_id_string] = SampleInfoPaired( read1, read2, id_str, exts=ord_ext, zip_exts=ord_zippeds) elif len(sams) == 1 or identical: read1 = os.path.join(paths[0], sams[0] + exts[0] + zippeds[0] if zippeds else '') final_id_string = ('S%i' % num) if rename else id_str results[final_id_string] = SampleInfoSingle( id_str, read1, ext=exts[0], zip_ext=zippeds[0]) else: # Here goes missing logic to deal with flow cell lanes and co #print(sams) raise AmbigiousPairedReadsError( 'Error: Found %i Files for Sample. Expected 1 or 2\n' 'Files for id: %s\n%s\n Flow cell Logic is currently missing' '' % (len(sams), id_str, '\n'.join(sams))) return results ### ------- Ion-Torrent Bam Collection -------- ### class IonTorrent_MFR_Collection(MFR_Collection): mfrs = {} mfr_type = IonTorrentMFR leftovers = None def __init__(self): super().__init__() leftovers = Leftovers() ### ------- PacBio h5 and meta.xml Collection ------- ### class PacBio_MFR_Collection(MFR_Collection): mfrs = {} mfr_type = PacBioMFR leftovers = None def __init__(self): super().__init__() self.mfrs = {} def add(self, mfr, seq_file): if mfr.id_string not in self.mfrs: self.mfrs[mfr.id_string] = mfr else: self.mfrs[mfr.id_string].add_files(seq_file) def get_samples(self): results = {} for mfr_id in sorted(self.mfrs.keys()): mfr = self.mfrs[mfr_id] container = [] for samp_dict in mfr.get_files(): bax_num = len([x for x in samp_dict if x in '123']) file_dict = dict((x if x not in '123' else 'bax%s' % x, str(y)) for x, y in samp_dict.items() if x in ['bas', 'metadata'] or y.ext == '.bax' and x in '123') add_info = dict((x, str(y)) for x, y in samp_dict.items() if x not in ['1', '2', '3', 'bas', 'metadata']) add_info.update(mfr.common_fields_dict) add_info['format'] = mfr.format_name file_dict['add_info'] = add_info if bax_num == 0 and ('bas' in file_dict): # if 'read1' not in samp_dict: # print(samp_dict, '\n', mfr.format_name) try: file = samp_dict sample = PacBioSampleInfoRS(mfr.id_string, **file_dict) except KeyError as err: eprint('get_verbose_samples(): cannot find read1\n ' 'culprit: %s' % repr(mfr).replace('>,', '>,\n')) results[mfr_id] = sample elif bax_num == 3: sample = PacBioSampleInfoRS_II(mfr.id_string, **file_dict) results[mfr_id] = sample else: #eprint('To many/less files map together:\n%s' % # repr(mfr).replace('>,', '>,\n')) pass return results def parse_reference_info(reference_list): '''Parsing reference file names into reference info objects Turns list of input files into reference data containers. It determines reference ids, which compression extension is used (if any), and which nucleotide sequence file extension is used. It only accepts files that end in (otional:) Args: reference_list(:obj:`list` of :obj:`str`): list of filenames Returns: :obj:`list` of :obj:`ReferenceInfo` Raises: UnknownExtensionError: If sequence file extension unknown ''' results = [] zip_exts = ['.gz', '.xz', '.bz2', '.lzma', '.lzo', '.lz', '.rz'] seq_exts = ['.fasta', '.fastq', '.fas', '.fna', '.fnq', '.fa'] for ref, num in zip(reference_list, range(1, len(reference_list)+1)): used_ext = test_extension(ref, seq_exts) if not used_ext: continue # raise UnknownExtensionError( # 'Extension not recognized\n%s' % ref) ref_id, zipped = ref.split(used_ext) if zipped and not test_extension(zipped, zip_exts): continue ref_id = os.path.basename(ref_id) results.append(('G%i' % num, ReferenceInfo(ref_id, os.path.abspath(ref), used_ext, zipped))) return results def find_format(file, formats, mfr_type): '''Checks which naming scheme fits ''' out = file for format_raw in formats: format_name = list(format_raw.keys())[0] settings_dict = list(format_raw.values())[0] mfr = mfr_type(settings_dict['format'].items(), main_sep=settings_dict['main_sep'], format_name=format_name) try: mfr.add_files(out) id_sep = (settings_dict['id_sep'] if 'id_sep' in settings_dict else None) id_fields = (settings_dict['id_fields'] if 'id_fields' in settings_dict else None) if id_sep is not None or id_fields: mfr.update_id_field(id_fields=id_fields, id_sep=id_sep) except FormatMismatch: # eprint('%s\n is not format: %s' % (file, format_name)) # eprint(mfr.regex_str) continue return mfr return out def get_formats_from_file(format_yaml): '''Loads a Yaml with fastq naming standards Warning: Currently depends on the transient feature of insert order preserving dictionaries (only python3.6) ''' with open(format_yaml, 'r') as format_fh: return ordered_load(format_fh) def ordered_load(stream, Loader=yaml.Loader, object_pairs_hook=OrderedDict): '''Stolen function to get ordered dictionary from yaml https://stackoverflow.com/questions/5121931/ in-python-how-can-you-load-yaml-mappings-as-ordereddicts/21048064#21048064 https://stackoverflow.com/users/650222/coldfix ''' class OrderedLoader(Loader): pass def construct_mapping(loader, node): loader.flatten_mapping(node) return object_pairs_hook(loader.construct_pairs(node)) OrderedLoader.add_constructor( yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, construct_mapping) return yaml.safe_load(stream, OrderedLoader) def longest_common_prefix(str1, str2): '''longest common prefix of two strings ''' return [i[0] for i in takewhile(lambda x: (len(set(x)) == 1), zip(str1, str2))]