""" HHpred and Hidden Markov Model APIs. This package defines the abstractions for working with HHpred's HMMs and hit lists. L{ProfileHMM} is the most important object of this module. It describes a sequence profile hidden Markov model in the way HHpred sees this concept: - a profile is composed of a list of L{HMMLayer}s, which contain a number of L{State}s - these L{States} can be of different types: Match, Insertion Deletion, etc. - a profile contains a multiple alignment, from which it is derived - this multiple alignment is an A3M (condensed) Alignment, where the first sequence is a master sequence - the match states in all layers correspond to the residues of the master sequence L{ProfileHMM} objects provide list-like access to their layers: >>> hmm.layers[1] # first layer: layer at master residue=1 Every layer provides dictionary-like access to its states: >>> layer[States.Match] and every state provides dictionary-like access to its transitions to other states: >>> state = hmm.layers[1][States.match] >>> state.transitions[States.Insertion] # Match > Insertion >>> transition.predecessor # source state >>> transition.successor # target state Whether this transition points to a state at the same (i) or the next layer (i+1) depends on the semantics of the source and the target states. Building HMMs from scratch is supported through a number of C{append} methods at various places: >>> layer = HMMLayer(...) >>> layer.append(State(...)) >>> hmm.layers.append(layer) See L{HMMLayersCollection}, L{HMMLayer}, L{EmissionTable} and L{TransitionTable} for details. """ import sys import math import csb.core import csb.io import csb.bio.structure as structure import csb.bio.sequence as sequence from csb.core import Enum class UnobservableStateError(AttributeError): pass class StateNotFoundError(csb.core.ItemNotFoundError): pass class TransitionNotFoundError(StateNotFoundError): pass class LayerIndexError(csb.core.CollectionIndexError): pass class StateExistsError(KeyError): pass class TransitionExistsError(KeyError): pass class EmissionExistsError(KeyError): pass class HMMArgumentError(ValueError): pass class States(csb.core.enum): """ Enumeration of HMM state types """ Match='M'; Insertion='I'; Deletion='D'; Start='S'; End='E' class ScoreUnits(csb.core.enum): """ Enumeration of HMM emission and transition score units """ LogScales='LogScales'; Probability='Probability' BACKGROUND = [ 0.076627178753322270, 0.018866884241976509, 0.053996136712517316, 0.059788009880742142, 0.034939432842683173, 0.075415244982547675, 0.036829356494115069, 0.050485048600600511, 0.059581159080509941, 0.099925728794059046, 0.021959667190729986, 0.040107059298840765, 0.045310838527464106, 0.032644867589507229, 0.051296350550656143, 0.046617000834108295, 0.071051060827250878, 0.072644631719882335, 0.012473412286822654, 0.039418044025976547 ] """ Background amino acid probabilities """ RELATIVE_SA = { 'A': 0.02, 'B': 0.14, 'C': 0.33, 'D': 0.55, 'E': 1.00 } """ Relative solvent accessibility codes (upper bounds) """ class ProfileHMM(object): """ Describes a protein profile Hidden Markov Model. Optional parameters: @param units: defines the units of the transition and emission scores @type units: L{ScoreUnits} @param scale: the scaling factor used to convert emission/transition probabilities @type scale: float @param logbase: the base of the logarithm used for scaling the emission and transition probabilities @type logbase: float """ def __init__(self, units=ScoreUnits.LogScales, scale=-1000., logbase=2): self._name = None self._id = None self._family = None self._length = ProfileLength(0, 0) self._alignment = None self._consensus = None self._dssp = None self._dssp_solvent = None self._psipred = None self._effective_matches = None self._evd = EVDParameters(None, None) self._version = None self._pseudocounts = False self._emission_pseudocounts = False self._transition_pseudocounts = False self._layers = HMMLayersCollection() self._start = State(States.Start) self._start_insertion = None self._end = State(States.End) self._scale = scale self._logbase = logbase if units is None: self._score_units = ScoreUnits.LogScales else: self._score_units = units @property def name(self): """ Profile name (NAME) @rtype: str """ return self._name @name.setter def name(self, value): self._name = str(value) @property def id(self): """ Profile entry ID (FILE) @rtype: str """ return self._id @id.setter def id(self, value): self._id = str(value) @property def family(self): """ Alternative entry ID (FAM) @rtype: str """ return self._family @family.setter def family(self, value): self._family = str(value) @property def length(self): """ Profile length @rtype: L{ProfileLength} """ return self._length @length.setter def length(self, value): if not isinstance(value, ProfileLength): raise TypeError(value) self._length = value @property def alignment(self): """ Source multiple alignment @rtype: L{A3MAlignment} """ return self._alignment @alignment.setter def alignment(self, value): if not isinstance(value, sequence.A3MAlignment): raise TypeError(value) self._alignment = value @property def consensus(self): """ Consensus sequence @rtype: L{AbstractSequence} """ return self._consensus @consensus.setter def consensus(self, value): if not isinstance(value, sequence.AbstractSequence): raise TypeError(value) self._consensus = value @property def dssp(self): """ DSSP (calculated) secondary structure @rtype: L{SecondaryStructure} """ return self._dssp @dssp.setter def dssp(self, value): if not isinstance(value, structure.SecondaryStructure): raise TypeError(value) self._dssp = value @property def dssp_solvent(self): """ Solvent accessibility values @rtype: str """ return self._dssp_solvent @dssp_solvent.setter def dssp_solvent(self, value): self._dssp_solvent = str(value) @property def psipred(self): """ PSIPRED (predicted) secondary structure @rtype: L{SecondaryStructure} """ return self._psipred @psipred.setter def psipred(self, value): if not isinstance(value, structure.SecondaryStructure): raise TypeError(value) self._psipred = value @property def effective_matches(self): """ Number of effective matches (NEFF) """ return self._effective_matches @effective_matches.setter def effective_matches(self, value): self._effective_matches = value @property def evd(self): """ Extreme-value distribution parameters (EVD) @rtype: L{EVDParameters} """ return self._evd @evd.setter def evd(self, value): if not isinstance(value, EVDParameters): raise TypeError(value) self._evd = value @property def version(self): """ Format version number (HHsearch) @rtype: str """ return self._version @version.setter def version(self, value): self._version = str(value) @property def pseudocounts(self): """ @rtype: bool """ return self._pseudocounts @pseudocounts.setter def pseudocounts(self, value): self._pseudocounts = bool(value) @property def emission_pseudocounts(self): """ @rtype: bool """ return self._emission_pseudocounts @emission_pseudocounts.setter def emission_pseudocounts(self, value): self._emission_pseudocounts = bool(value) @property def transition_pseudocounts(self): """ @rtype: bool """ return self._transition_pseudocounts @transition_pseudocounts.setter def transition_pseudocounts(self, value): self._transition_pseudocounts = bool(value) @property def layers(self): """ List-like access to the HMM's layers @rtype: L{HMMLayersCollection} """ return self._layers @property def start(self): """ Start state (at the start layer) @rtype: L{State} """ return self._start @start.setter def start(self, value): if value is None or (isinstance(value, State) and value.type == States.Start): self._start = value else: raise TypeError(value) @property def start_insertion(self): """ Insertion state at the start layer @rtype: L{State} """ return self._start_insertion @start_insertion.setter def start_insertion(self, value): if value is None or (isinstance(value, State) and value.type == States.Insertion): self._start_insertion = value else: raise TypeError(value) @property def end(self): """ Final state (at the end layer) @rtype: L{State} """ return self._end @end.setter def end(self, value): if value is None or (isinstance(value, State) and value.type == States.End): self._end = value else: raise TypeError(value) @property def scale(self): """ Score scaling factor @rtype: float """ return self._scale @property def logbase(self): """ Base of the logarithm used for score scaling @rtype: float """ return self._logbase @property def score_units(self): """ Current score units @rtype: L{ScoreUnits} member """ return self._score_units @property def residues(self): """ List of representative residues, attached to each layer @rtype: collection of L{Residue} """ res = [layer.residue for layer in self.layers] return csb.core.ReadOnlyCollectionContainer( res, type=structure.Residue, start_index=1) @property def all_layers(self): """ A list of layers including start and start_insertion @rtype: list of L{HMMLayer} """ complete_layers = [] first_layer = HMMLayer(rank=0, residue=None) first_layer.append(self.start) if self.start_insertion: first_layer.append(self.start_insertion) complete_layers.append(first_layer) for layer in self.layers: complete_layers.append(layer) return complete_layers @property def has_structure(self): """ True if this profile contains structural data @rtype: bool """ has = False for layer in self.layers: if layer.residue.has_structure: return True return has def serialize(self, file_name): """ Serialize this HMM to a file. @param file_name: target file name @type file_name: str """ rec = sys.getrecursionlimit() sys.setrecursionlimit(10000) csb.io.Pickle.dump(self, open(file_name, 'wb')) sys.setrecursionlimit(rec) @staticmethod def deserialize(file_name): """ De-serialize an HMM from a file. @param file_name: source file name (pickle) @type file_name: str """ rec = sys.getrecursionlimit() sys.setrecursionlimit(10000) try: return csb.io.Pickle.load(open(file_name, 'rb')) finally: sys.setrecursionlimit(rec) def _convert(self, units, score, scale, logbase): if units == ScoreUnits.Probability: return logbase ** (score / scale) elif units == ScoreUnits.LogScales: if score == 0: #score = sys.float_info.min return None return math.log(score, logbase) * scale else: raise ValueError('Unknown target unit {0}'.format(units)) def to_hmm(self, output_file=None, convert_scores=False): """ Dump the profile in HHM format. @param output_file: the output file name @type output_file: str @param convert_scores: if True, forces automatic convertion to L{ScoreUnits}.LogScales, which is required by the output file format @type convert_scores: bool """ from csb.bio.io.hhpred import HHMFileBuilder if convert_scores: self.convert_scores(ScoreUnits.LogScales) temp = csb.io.MemoryStream() builder = HHMFileBuilder(temp) builder.add_hmm(self) data = temp.getvalue() temp.close() if not output_file: return data else: with csb.io.EntryWriter(output_file, close=False) as out: out.write(data) def segment(self, start, end): """ Extract a sub-segment of the profile. @param start: start layer of the segment (rank) @type start: int @param end: end layer of the segment (rank) @type end: int @return: a deepcopy of the extracted HMM segment @rtype: L{ProfileHMMSegment} """ return ProfileHMMSegment(self, start, end) def subregion(self, start, end): return ProfileHMMRegion(self, start, end) def add_emission_pseudocounts(self, *a, **k): """ See L{csb.bio.hmm.pseudocounts.PseudocountBuilder} """ from csb.bio.hmm.pseudocounts import PseudocountBuilder PseudocountBuilder(self).add_emission_pseudocounts(*a, **k) def add_transition_pseudocounts(self, *a, **k): """ See L{csb.bio.hmm.pseudocounts.PseudocountBuilder} """ from csb.bio.hmm.pseudocounts import PseudocountBuilder PseudocountBuilder(self).add_transition_pseudocounts(*a, **k) def structure(self, chain_id=None, accession=None): """ Extract the structural information from the HMM. @param accession: defines the accession number of the structure @type accession: str @param chain_id: defines explicitly the chain identifier @type chain_id: str @return: a shallow L{Structure} wrapper around the residues in the HMM. @rtype: L{Structure} """ struct = structure.Structure(accession or self.id) chain = self.chain(chain_id) struct.chains.append(chain) return struct def chain(self, chain_id=None): """ Extract the structural information from the HMM. @param chain_id: defines explicitly the chain identifier @type chain_id: str @return: a shallow L{Chain} wrapper around the residues in the HMM. @rtype: L{Chain} """ if chain_id is None: if self.id: chain_id = self.id.rstrip()[-1] else: chain_id = '_' chain = structure.Chain(chain_id, type=sequence.SequenceTypes.Protein, residues=self.residues) chain._torsion_computed = True return chain def emission_profile(self): """ Extract the emission scores of all match states in the profile. The metric of the emission scores returned depends on the current hmm.score_units setting - you may need to call hmm.convert_scores() to adjust the hmm to your particular needs. @return: a list of dictionaries; each dict key is a single amino acid @rtype: list """ profile = [] for layer in self.layers: emission = {} for aa in layer[States.Match].emission: emission[str(aa)] = layer[States.Match].emission[aa] or 0.0 profile.append(emission) return profile def convert_scores(self, units=ScoreUnits.Probability, method=None): """ Convert emission and transition scores to the specified units. @param units: the target units for the conversion (a member of L{ScoreUnits}). @type units: L{csb.core.EnumItem} @param method: if defined, implements the exact mathematical transformation that will be applied. It must be a function or lambda expression with the following signature:: def (target_units, score, scale, logbase) and it has to return the score converted to C{target_units}. If method performs a conversion from probabilities to scaled logs, you should also update C{hmm.scale} and C{hmm.logbase}. @type method: function, lambda """ if self._score_units == units: return if method is not None: convert = method else: convert = self._convert for layer in self.layers: for state_kind in layer: state = layer[state_kind] if not state.silent: for residue in state.emission: if state.emission[residue] is not None: state.emission.update(residue, convert( units, state.emission[residue], self.scale, self.logbase)) for residue in state.background: if state.background[residue] is not None: state.background.update(residue, convert( units, state.background[residue], self.scale, self.logbase)) for tran_kind in state.transitions: transition = state.transitions[tran_kind] transition.probability = convert(units, transition.probability, self.scale, self.logbase) # The Neff-s are interger numbers and should not be transformed # (except when writing the profile to a hhm file) if self.start_insertion: for t_it in self.start_insertion.transitions: transition = self.start_insertion.transitions[t_it] transition.probability = convert(units, transition.probability, self.scale, self.logbase) for residue in self.start_insertion.emission: state = self.start_insertion if state.emission[residue] is not None: state.emission.update(residue, convert(units, state.emission[residue], self.scale, self.logbase)) state.background.update(residue, convert(units, state.background[residue], self.scale, self.logbase)) for tran_kind in self.start.transitions: transition = self.start.transitions[tran_kind] transition.probability = convert(units, transition.probability, self.scale, self.logbase) self._score_units = units def emission_similarity(self, other): """ Compute the Log-sum-of-odds score between the emission tables of self and other (Soeding 2004). If no observable Match state is found at a given layer, the Insertion state is used instead. @note: This is not a full implementation of the formula since only emission vectors are involved in the computation and any transition probabilities are ignored. @param other: the subject HMM @type other: L{ProfileHMM} @return: emission log-sum-of-odds similarity between C{self} and C{other} @rtype: float @raise ValueError: when self and other differ in their length, when the score_units are not Probability, or when no observable states are present """ score = 1 if self.layers.length != other.layers.length or self.layers.length < 1: raise ValueError('Both HMMs must have the same nonzero number of layers') if self.score_units != ScoreUnits.Probability or \ other.score_units != ScoreUnits.Probability: raise ValueError('Scores must be converted to probabilities first.') for q_layer, s_layer in zip(self.layers, other.layers): try: if States.Match in q_layer and not q_layer[States.Match].silent: q_state = q_layer[States.Match] else: q_state = q_layer[States.Insertion] if States.Match in s_layer and not s_layer[States.Match].silent: s_state = s_layer[States.Match] else: s_state = s_layer[States.Insertion] except csb.core.ItemNotFoundError: raise ValueError('Query and subject must contain observable states ' 'at each layer') emission_dotproduct = 0 for aa in q_state.emission: q_emission = q_state.emission[aa] or sys.float_info.min s_emission = s_state.emission[aa] or sys.float_info.min emission_dotproduct += (q_emission * s_emission / q_state.background[aa]) score *= emission_dotproduct return math.log(score) def _assign_secstructure(self): """ Attach references from each profile layer to the relevant DSSP secondary structure element. """ assert self.dssp is not None for motif in self.dssp: for i in range(motif.start, motif.end + 1): self.layers[i].residue.secondary_structure = motif class ProfileHMMSegment(ProfileHMM): """ Represents a segment (fragment) of a ProfileHMM. @param hmm: source HMM @type hmm: ProfileHMM @param start: start layer of the segment (rank) @type start: int @param end: end layer of the segment (rank) @type end: int @raise ValueError: when start or end positions are out of range """ def __init__(self, hmm, start, end): if start < hmm.layers.start_index or start > hmm.layers.last_index: raise IndexError('Start position {0} is out of range'.format(start)) if end < hmm.layers.start_index or end > hmm.layers.last_index: raise IndexError('End position {0} is out of range'.format(end)) #hmm = csb.core.deepcopy(hmm) super(ProfileHMMSegment, self).__init__(units=hmm.score_units, scale=hmm.scale, logbase=hmm.logbase) self.id = hmm.id self.family = hmm.family self.name = hmm.name self.pseudocounts = hmm.pseudocounts self.evd = hmm.evd self.version = hmm.version self.source = hmm.id self._source_start = start self._source_end = end if hmm.alignment: self.alignment = hmm.alignment.hmm_subregion(start, end) self.consensus = hmm.consensus.subregion(start, end) layers = csb.core.deepcopy(hmm.layers[start : end + 1]) max_score = 1.0 if hmm.score_units != ScoreUnits.Probability: max_score = hmm._convert(hmm.score_units, max_score, hmm.scale, hmm.logbase) self._build_graph(layers, max_score) if hmm.dssp: self.dssp = hmm.dssp.subregion(start, end) self._assign_secstructure() if hmm.psipred: self.psipred = hmm.psipred.subregion(start, end) self.length.layers = self.layers.length self.length.matches = self.layers.length self.effective_matches = sum([(l.effective_matches or 0.0) for l in self.layers]) / self.layers.length @property def source_start(self): """ Start position of this segment in its source HMM @rtype: int """ return self._source_start @property def source_end(self): """ End position of this segment in its source HMM @rtype: int """ return self._source_end def _build_graph(self, source_layers, max_score): for rank, layer in enumerate(source_layers, start=1): for atom_kind in layer.residue.atoms: layer.residue.atoms[atom_kind].rank = rank layer.residue._rank = rank layer.rank = rank self.layers.append(layer) if rank == 1: for state_kind in layer: if state_kind in(States.Match, States.Deletion): start_tran = Transition(self.start, layer[state_kind], max_score) self.start.transitions.append(start_tran) elif rank == len(source_layers): for state_kind in layer: state = layer[state_kind] if not (States.End in state.transitions or States.Match in state.transitions): state.transitions.set({}) else: end_tran = Transition(state, self.end, max_score) state.transitions.set({States.End: end_tran}) # TODO: I->I ? class EmissionProfileSegment(ProfileHMMSegment): """ Represents a segment of the Match state emission probabilities of a L{ProfileHMM}. Contains only Match states, connected with equal transition probabilities of 100%. """ def _build_graph(self, source_layers): factory = StateFactory() for rank, source_layer in enumerate(source_layers, start=1): emission = source_layer[States.Match].emission background = source_layer[States.Match].background match = factory.create_match(emission, background) match.rank = rank layer = HMMLayer(rank, source_layer.residue) layer.append(match) self.layers.append(layer) if rank == 1: self.start.transitions.append(Transition(self.start, match, 1.0)) elif rank < len(source_layers): prev_match = self.layers[rank - 1][States.Match] prev_match.transitions.append(Transition(prev_match, match, 1.0)) elif rank == len(source_layers): match.transitions.append(Transition(match, self.end, 1.0)) else: assert False class ProfileHMMRegion(ProfileHMM): """ A shallow proxy referring to a sub-region of a given Profile HMM. @param hmm: source HMM @type hmm: L{ProfileHMM} @param start: start layer of the segment (rank) @type start: int @param end: end layer of the segment (rank) @type end: int @raise ValueError: when start or end positions are out of range """ def __init__(self, hmm, start, end): if start < hmm.layers.start_index or start > hmm.layers.last_index: raise IndexError('Start position {0} is out of range'.format(start)) if end < hmm.layers.start_index or end > hmm.layers.last_index: raise IndexError('End position {0} is out of range'.format(end)) if hmm.score_units != ScoreUnits.Probability: raise ValueError('Scores must be converted to probabilities first.') self._layers = HMMLayersCollection(hmm.layers[start : end + 1]) self._score_units = hmm.score_units self.id = hmm.id self.name = hmm.name self.family = hmm.family self._source_start = start self._source_end = end @property def source_start(self): """ Start position of this segment in its source HMM @rtype: int """ return self._source_start @property def source_end(self): """ End position of this segment in its source HMM @rtype: int """ return self._source_end class ProfileLength(object): def __init__(self, matches, layers): self.matches = matches self.layers = layers class EVDParameters(object): def __init__(self, lamda, mu): self.lamda = lamda self.mu = mu def __nonzero__(self): return self.__bool__() def __bool__(self): return (self.lamda is not None or self.mu is not None) class EmissionTable(csb.core.DictionaryContainer): """ Represents a lookup table of emission probabilities. Provides dictionary-like access: >>> state.emission[ProteinAlphabet.ALA] emission probability for ALA @param emission: an initialization dictionary of emission probabilities @type emission: dict @param restrict: a list of residue types allowed for this emission table. Defaults to the members of L{csb.bio.sequence.ProteinAlphabet} @type restrict: list """ def __init__(self, emission=None, restrict=Enum.members(sequence.ProteinAlphabet)): super(EmissionTable, self).__init__(emission, restrict) def append(self, residue, probability): """ Append a new emission probability to the table. @param residue: residue name (type) - a member of L{csb.bio.sequence.ProteinAlphabet} @type residue: L{csb.core.EnumItem} @param probability: emission score @type probability: float @raise EmissionExistsError: if residue is already defined """ if residue in self: raise EmissionExistsError('Residue {0} is already defined.'.format(residue)) super(EmissionTable, self).append(residue, probability) def set(self, table): """ Set the emission table using the dictionary provided in the argument. @param table: the new emission table @type table: dict """ super(EmissionTable, self)._set(table) def update(self, residue, probability): """ Update the emission C{probability} of a given emission C{residue}. @param residue: name (type) of the residue to be updated @type residue: L{csb.core.EnumItem} @param probability: new emission score @type probability: float """ super(EmissionTable, self)._update({residue: probability}) class TransitionTable(csb.core.DictionaryContainer): """ Represents a lookup table of transitions that are possible from within a given state. Provides dictionary-like access, where dictionary keys are target states. These are members of the L{States} enumeration, e.g.: >>> state.transitions[States.Match] transition info regarding transition from the current state to a Match state >>> state.transitions[States.Match].predecessor state >>> state.transitions[States.Match].successor the next match state @param transitions: an initialization dictionary of target L{State}:L{Transition} pairs @type transitions: dict @param restrict: a list of target states allowed for this transition table. Defaults to the L{States} enum members @type restrict: list """ def __init__(self, transitions=None, restrict=Enum.members(States)): super(TransitionTable, self).__init__(transitions, restrict) @property def _exception(self): return TransitionNotFoundError def append(self, transition): """ Append a new C{transition} to the table. @param transition: transition info @type transition: L{Transition} @raise TransitionExistsError: when a transition to the same target state already exists for the current state """ if transition.successor.type in self: msg = 'Transition to a {0} state is already defined.' raise TransitionExistsError(msg.format(transition.successor.type)) super(TransitionTable, self).append(transition.successor.type, transition) def set(self, table): """ Set the transition table using the dictionary provided in the argument. @param table: the new transition table @type table: dict """ super(TransitionTable, self)._set(table) def update(self, target_statekind, transition): """ Update the information of a transition, which points to a target state of the specified L{States} kind. @param target_statekind: the key of the transition to be updated @type target_statekind: L{csb.core.EnumItem} @param transition: new transition info object @type transition: L{Transition} @raise ValueError: if I{transition.successor.type} differs from C{target_statekind} """ if transition.successor.type != target_statekind: raise ValueError("Successor's type differs from the specified target state.") super(TransitionTable, self)._update({target_statekind: transition}) class HMMLayersCollection(csb.core.CollectionContainer): """ Provides consecutive, 1-based access to all of the layers in the profile. Each profile layer contains a catalog of available states at that index, e.g.: >>> profile.layers[i] the catalog at profile layer i >>> profile.layers[i][States.Deletion] the deletion state at index i @param layers: initialization list of L{HMMLayer}s @type layers: list """ def __init__(self, layers=None): super(HMMLayersCollection, self).__init__(layers, type=HMMLayer, start_index=1) @property def _exception(self): return LayerIndexError class HMMLayer(csb.core.DictionaryContainer): """ Provides a dictionary-like catalog of the available states at this layer. Lookup keys are members of the L{States} enumeration, e.g.: >>> profile.layers[i][States.Deletion] the deletion state at layer number i @param rank: layer's number @type rank: int @param residue: a representative L{ProteinResidue} that is associated with this layer @type residue: L{ProteinResidue} @param states: initialization dictionary of L{States}.Item:L{State} pairs @type states: dict """ def __init__(self, rank, residue, states=None): super(HMMLayer, self).__init__(states, restrict=Enum.members(States)) self._rank = int(rank) self._residue = None self._effective_matches = None self._effective_insertions = None self._effective_deletions = None self.residue = residue @property def _exception(self): return StateNotFoundError @property def rank(self): """ Layer's position @rtype: int """ return self._rank @rank.setter def rank(self, value): self._rank = int(value) @property def residue(self): """ Representative residue @rtype: L{Residue} """ return self._residue @residue.setter def residue(self, residue): if residue and residue.type == sequence.SequenceAlphabets.Protein.GAP: raise HMMArgumentError('HMM match states cannot be gaps') self._residue = residue @property def effective_matches(self): """ Number of effective matches at this layer @rtype: int """ return self._effective_matches @effective_matches.setter def effective_matches(self, value): self._effective_matches = value @property def effective_insertions(self): """ Number of effective insertions at this layer @rtype: int """ return self._effective_insertions @effective_insertions.setter def effective_insertions(self, value): self._effective_insertions = value @property def effective_deletions(self): """ Number of effective deletions at this layer @rtype: int """ return self._effective_deletions @effective_deletions.setter def effective_deletions(self, value): self._effective_deletions = value def append(self, state): """ Append a new C{state} to the catalog. @param state: the new state @type state: L{State} @raise StateExistsError: when a state of the same type is already defined """ if state.type in self: raise StateExistsError( 'State {0} is already defined at this position.'.format(state.type)) super(HMMLayer, self).append(state.type, state) def update(self, state_kind, state): """ Update the sate of the specified kind under the current layer. @param state_kind: state type (key) - a member of L{States} @type state_kind: L{csb.core.EnumItem} @param state: the new state info @type state: L{State} @raise ValueError: if state.type differs from state_kind """ if state.type != state_kind: raise ValueError("State's type differs from the specified state_kind") super(HMMLayer, self)._update({state_kind: state}) class State(object): """ Describes a Hidden Markov Model state. @param type: one of the L{States} enumeration values, e.g. States.Match @type type: L{csb.core.EnumItem} @param emit: a collection of emittable state names allowed for the state, e.g. the members of I{SequenceAlphabets.Protein}. If not defined, the state will be created as a silent (unobservable). @type emit: list @raise ValueError: if type is not a member of the States enum """ def __init__(self, type, emit=None): self._type = None self._rank = None self._transitions = TransitionTable() self._emission = None self._background = None self.type = type if emit is not None: self._emission = EmissionTable(restrict=emit) self._background = EmissionTable(restrict=emit) def __repr__(self): return "".format(self) @property def type(self): """ State type: one of the L{States} @rtype: enum item """ return self._type @type.setter def type(self, value): if value.enum is not States: raise TypeError(value) self._type = value @property def rank(self): return self._rank @rank.setter def rank(self, value): self._rank = int(value) @property def transitions(self): """ Lookup table with available transitions to other states @rtype: L{TransitionTable} """ return self._transitions @property def emission(self): """ Lookup table with available emission probabilities @rtype: L{EmissionTable} """ if self._emission is None: raise UnobservableStateError('Silent {0!r} state'.format(self.type)) return self._emission @property def background(self): """ Lookup table with background probabilities @rtype: L{EmissionTable} """ return self._background @property def silent(self): """ Whether this state can emit something @rtype: bool """ try: return self.emission is None except UnobservableStateError: return True class StateFactory(object): """ Simplifies the construction of protein profile HMM states. """ def __init__(self): self._aa = Enum.members(sequence.ProteinAlphabet) def create_match(self, emission, background): state = State(States.Match, emit=self._aa) state.emission.set(emission) state.background.set(background) return state def create_insertion(self, background): state = State(States.Insertion, emit=self._aa) state.emission.set(background) state.background.set(background) return state def create_deletion(self): return State(States.Deletion) class TransitionType(object): def __init__(self, source, target): self.source_state = source.type self.target_state = target.type def __repr__(self): return '{0}->{1}'.format(self.source_state, self.target_state) class Transition(object): """ Describes a Hidden Markov Model transition between two states. @param predecessor: source state @type predecessor: L{State} @param successor: target state @type successor: L{State} @param probability: transition score @type probability: float """ def __init__(self, predecessor, successor, probability): if not (isinstance(predecessor, State) or isinstance(successor, State)): raise TypeError('Predecessor and successor must be State instances.') self._predecessor = predecessor self._successor = successor self._probability = None self._type = TransitionType(predecessor, successor) self.probability = probability def __str__(self): return ''.format(self) @property def predecessor(self): """ Transition source state @rtype: L{State} """ return self._predecessor @property def successor(self): """ Transition target state @rtype: L{State} """ return self._successor @property def probability(self): """ Transition score @rtype: float """ return self._probability @probability.setter def probability(self, value): if not (value >=0): raise ValueError('Transition probability must be a positive number.') self._probability = float(value) @property def type(self): """ Struct, containing information about the source and target state types @rtype: L{TransitionType} """ return self._type class HHpredHitAlignment(sequence.SequenceAlignment): """ Represents a query-template alignment in an HHpred result. @param hit: relevant hit object @type param: L{HHpredHit} @param query: the query sequence in the alignment region, with gaps @type query: str @param subject: the subject sequence in the alignment region, with gaps @type subject: str """ GAP = sequence.ProteinAlphabet.GAP def __init__(self, hit, query, subject): if not isinstance(hit, HHpredHit): raise TypeError(hit) self._hit = hit q = sequence.Sequence('query', '', ''.join(query), type=sequence.SequenceTypes.Protein) s = sequence.Sequence(hit.id, '', ''.join(subject), type=sequence.SequenceTypes.Protein) super(HHpredHitAlignment, self).__init__((q, s)) @property def query(self): """ Query sequence (with gaps) @rtype: str """ return self.rows[1].sequence @property def subject(self): """ Subject sequence (with gaps) @rtype: str """ return self.rows[2].sequence @property def segments(self): """ Find all ungapped query-subject segments in the alignment. Return a generator over all ungapped alignment segments, represented by L{HHpredHit} objects @rtype: generator """ def make_segment(sstart, send, qstart, qend): seg = HHpredHit(self._hit.rank, self._hit.id, sstart, send, qstart, qend, self._hit.probability, self._hit.qlength) seg.slength = self._hit.slength seg.evalue = self._hit.evalue seg.pvalue = self._hit.pvalue seg.score = self._hit.score seg.ss_score = self._hit.ss_score seg.identity = self._hit.identity seg.similarity = self._hit.similarity seg.prob_sum = self._hit.prob_sum return seg in_segment = False qs = self._hit.qstart - 1 ss = self._hit.start - 1 qi, si = qs, ss qe, se = qs, ss for q, s in zip(self.query, self.subject): if q != HHpredHitAlignment.GAP: qi += 1 if s != HHpredHitAlignment.GAP: si += 1 if HHpredHitAlignment.GAP in (q, s): if in_segment: yield make_segment(ss, se, qs, qe) in_segment = False qs, ss = 0, 0 qe, se = 0, 0 else: if not in_segment: in_segment = True qs, ss = qi, si qe, se = qi, si if in_segment: yield make_segment(ss, se, qs, qe) def to_a3m(self): """ @return: a query-centric A3M alignment. @rtype: L{csb.bio.sequence.A3MAlignment} """ a3m = self.format(sequence.AlignmentFormats.A3M) return sequence.A3MAlignment.parse(a3m, strict=False) class HHpredHit(object): """ Represents a single HHsearch hit. @param rank: rank of the hit @type rank: int @param id: id of the hit @type id: str @param start: subject start @type start: int @param end: subject end @type end: int @param qstart: query start @type qstart: int @param qend: query end @type qend: int @param probability: probability of the hit @type probability: float @param qlength: length of the query @type qlength: int """ def __init__(self, rank, id, start, end, qstart, qend, probability, qlength): self._rank = None self._id = None self._start = None self._end = None self._qstart = None self._qend = None self._probability = None self._qlength = None self._alignment = None self._slength = None self._evalue = None self._pvalue = None self._score = None self._ss_score = None self._identity = None self._similarity = None self._prob_sum = None # managed properties self.rank = rank self.id = id self.start = start self.end = end self.qstart = qstart self.qend = qend self.probability = probability self.qlength = qlength def __str__(self): return "{0.id} {0.probability} {0.start}-{0.end}".format(self) def __repr__(self): return "".format(self) def __lt__(self, other): return self.rank < other.rank def equals(self, other): """ Return True if C{self} is completely identical to C{other} (same id, same start and end positions). @param other: right-hand-term @type other: HHpredHit @rtype: bool """ return (self.id == other.id and self.start == other.start and self.end == other.end) def surpasses(self, other): """ Return True if C{self} is a superior to C{other} in terms of length and probability. These criteria are applied in the following order: 1. Length (the longer hit is better) 2. Probability (if they have the same length, the one with the higher probability is better) 3. Address (if they have the same length and probability, the one with higher memory ID wins; for purely practical reasons) @param other: right-hand-term @type other: HHpredHit @rtype: bool """ if self.length > other.length: return True elif self.length == other.length: if self.probability > other.probability: return True elif self.probability == other.probability: if id(self) > id(other): return True return False def includes(self, other, tolerance=1): """ Return True if C{other} overlaps with C{self}, that means C{other} is fully or partially included in C{self} when aligned over the query. @param other: right-hand-term @type other: HHpredHit @param tolerance: allow partial overlaps for that number of residues at either end @type tolerance: int @rtype: bool """ if self.id == other.id: if other.start >= self.start: if (other.end - self.end) <= tolerance: return True elif other.end <= self.end: if (self.start - other.start) <= tolerance: return True return False def add_alignment(self, query, subject): """ Add query/subject alignment to the hit. @param query: the query sequence within the alignment region, with gaps @type query: str @param subject: the subject sequence within the alignment region, with gaps @type subject: str """ self._alignment = HHpredHitAlignment(self, query, subject) @property def rank(self): return self._rank @rank.setter def rank(self, value): try: value = int(value) except: raise TypeError('rank must be int, not {1}'.format(type(value))) self._rank = value @property def id(self): return self._id @id.setter def id(self, value): try: value = str(value) except: raise TypeError('id must be string, not {0}'.format(type(value))) self._id = value @property def start(self): return self._start @start.setter def start(self, value): try: value = int(value) except: raise TypeError('start must be int, not {0}'.format(type(value))) self._start = value @property def end(self): return self._end @end.setter def end(self, value): try: value = int(value) except: raise TypeError('end must be int, not {0}'.format(type(value))) self._end = value @property def qstart(self): return self._qstart @qstart.setter def qstart(self, value): try: value = int(value) except: raise TypeError('qstart must be int, not {0}'.format(type(value))) self._qstart = value @property def qend(self): return self._qend @qend.setter def qend(self, value): try: value = int(value) except: raise TypeError('qend must be int, not {0}'.format(type(value))) self._qend = value @property def qlength(self): return self._qlength @qlength.setter def qlength(self, value): try: value = int(value) except: raise TypeError('qlength must be int, not {0}'.format(type(value))) self._qlength = value @property def probability(self): return self._probability @probability.setter def probability(self, value): try: value = float(value) except: raise TypeError('probability must be float, not {0}'.format(type(value))) self._probability = value @property def alignment(self): return self._alignment @property def length(self): try: return self.end - self.start + 1 except: return 0 @property def slength(self): return self._slength @slength.setter def slength(self, value): self._slength = value @property def evalue(self): return self._evalue @evalue.setter def evalue(self, value): self._evalue = value @property def pvalue(self): return self._pvalue @pvalue.setter def pvalue(self, value): self._pvalue = value @property def score(self): return self._score @score.setter def score(self, value): self._score = value @property def ss_score(self): return self._ss_score @ss_score.setter def ss_score(self, value): self._ss_score = value @property def identity(self): return self._identity @identity.setter def identity(self, value): self._identity = value @property def similarity(self): return self._similarity @similarity.setter def similarity(self, value): self._similarity = value @property def prob_sum(self): return self._prob_sum @prob_sum.setter def prob_sum(self, value): self._prob_sum = value class HHpredHitList(object): """ Represents a collection of L{HHpredHit}s. """ def __init__(self, hits, query_name='', match_columns=-1, no_of_seqs='', neff=-1., searched_hmms=-1, date='', command=''): self._hits = list(hits) self._query_name = None self._match_columns = None self._no_of_seqs = None self._neff = None self._searched_hmms = None self._date = None self._command = None self.query_name = query_name self.match_columns = match_columns self.no_of_seqs = no_of_seqs self.neff = neff self.searched_hmms = searched_hmms self.date = date self.command = command @property def query_name(self): return self._query_name @query_name.setter def query_name(self, value): self._query_name = value @property def match_columns(self): return self._match_columns @match_columns.setter def match_columns(self, value): self._match_columns = value @property def no_of_seqs(self): return self._no_of_seqs @no_of_seqs.setter def no_of_seqs(self, value): self._no_of_seqs = value @property def neff(self): return self._neff @neff.setter def neff(self, value): self._neff = value @property def searched_hmms(self): return self._searched_hmms @searched_hmms.setter def searched_hmms(self, value): self._searched_hmms = value @property def date(self): return self._date @date.setter def date(self, value): self._date = value @property def command(self): return self._command @command.setter def command(self, value): self._command = value def __str__(self): return "HHpredHitList\n\tquery={0.query_name}\n\tmatch_columns={0.match_columns}\n\tno_of_seqs={0.no_of_seqs}\n\tneff={0.neff}\n\tsearched_hmms={0.searched_hmms}\n\tdate={0.date}\n\tcommand={0.command}".format(self) def __repr__(self): return "".format(len(self)) def __getitem__(self, index): return self._hits[index] def __iter__(self): return iter(self._hits) def __len__(self): return len(self._hits) def sort(self): self._hits.sort(key=lambda i: i.rank)