File: period.py

package info (click to toggle)
python-cogent 1.5.3-2
links: PTS, VCS
area: main
in suites: jessie, jessie-kfreebsd
size: 16,424 kB
ctags: 24,343
sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (224 lines) | stat: -rw-r--r-- 7,040 bytes
from __future__ import division
from random import shuffle, random, choice
import numpy

try:
    from math import factorial
except ImportError: # python version < 2.6
    from cogent.maths.stats.special import Gamma
    factorial = lambda x: Gamma(x+1)

from cogent.maths.stats.special import igam

__author__ = "Hua Ying, Julien Epps and Gavin Huttley"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Julien Epps", "Hua Ying", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Gavin Huttley"
__email__ = "Gavin.Huttley@anu.edu.au"
__status__ = "Production"

def chi_square(x, p, df=1):
    """returns the chisquare statistic and it's probability"""
    N = len(x)
    end = N
    sim = numpy.logical_not(numpy.logical_xor(x[0:end-p], x[p:end]))*1
    s = ((numpy.ones((N-p,), float)-sim)**2).sum()
    D = s/(N-p)
    p_val = 1 - igam(df/2.0, D/2)
    return D, p_val

def g_statistic(X, p=None, idx=None):
    """
    return g statistic and p value
    arguments: 
        X - the periodicity profile (e.g. DFT magnitudes, autocorrelation etc)
            X needs to contain only those period values being considered, 
            i.e. only periods in the range [llim, ulim]
    """
    # X should be real
    X = abs(numpy.array(X))
    if p is None: 
        power = X.max(0)
        idx = X.argmax(0)
    else:
        assert idx is not None
        power = X[idx]
    g_obs = power/X.sum()
    M = numpy.floor(1/g_obs)
    pmax = len(X)
    
    result = numpy.zeros((int(M+1),), float)
    pmax_fact = factorial(pmax)
    for index in xrange(1, min(pmax, int(M))+1):
        v = (-1)**(index-1)*pmax_fact/factorial(pmax-index)/factorial(index)
        v *= (1-index*g_obs)**(pmax-1)
        result[index] = v
    
    p_val = result.sum()
    return g_obs, p_val

def _seq_to_symbols(seq, motifs, motif_length, result=None):
    """return symbolic represetation of the sequence
    Arguments: 
        - seq: a sequence
        - motifs: a list of sequence motifs
        - motif_length: length of first motif
    """
    if result is None:
        result = numpy.zeros(len(seq), numpy.uint8)
    else:
        result.fill(0)
    
    if motif_length is None:
        motif_length = len(motifs[0])
    
    for i in xrange(len(seq) - motif_length + 1):
        if seq[i: i + motif_length] in motifs:
            result[i] = 1
    
    return result

try:
    from cogent.maths._period import seq_to_symbols
    #raise ImportError
except ImportError:
    seq_to_symbols = _seq_to_symbols

class SeqToSymbols(object):
    """class for converting all occurrences of motifs in passed sequence
    to 1/0 otherwise"""
    def __init__(self, motifs, length=None, motif_length=None):
        super(SeqToSymbols, self).__init__()
        if type(motifs) == str:
            motifs = [motifs]
        self.motifs = motifs
        self.length = length
        self.motif_length = motif_length or len(motifs[0])
        self.working = None
        if length is not None:
            self.setResultArray(length)
    
    def setResultArray(self, length):
        """sets a result array for length"""
        self.working = numpy.zeros(length, numpy.uint8)
        self.length = length
    
    def __call__(self, seq, result=None):
        if result is None and self.working is None:
            self.setResultArray(len(seq))
        elif self.working is not None:
            if len(seq) != self.working.shape[0]:
                self.setResultArray(len(seq))
        
        result = self.working
        result.fill(0)
        if type(seq) != str:
            seq = ''.join(seq)
        
        return seq_to_symbols(seq, self.motifs, self.motif_length, result)
    

def circular_indices(vector, start, length, num):
    """docstring for circular_indices"""
    if start > length:
        start = start-length
        
    if start+num < length:
        return vector[start: start+num]
    # get all till end, then from beginning
    return vector[start:] + vector[:start+num-length]

def sampled_places(block_size, length):
    """returns randomly sampled positions with block_size to make a new vector
    with length
    """
    # Main condition is to identify when a draw would run off end, we want to
    # draw from beginning
    num_seg, remainder = divmod(length, block_size)
    vector = range(length)
    result = []
    for seg_num in xrange(num_seg):
        i = choice(vector)
        result += circular_indices(vector, i, length, block_size)
    
    if remainder:
        result += circular_indices(vector, i+block_size, length, remainder)
    
    assert len(result) == length, len(result)
    return result

def blockwise_bootstrap(signal, calc, block_size, num_reps, seq_to_symbols=None, num_stats=None):
    """returns observed statistic and the probability from the bootstrap
    test of observing more `power' by chance than that estimated from the
    observed signal
    
    Arguments:
        - signal: a series, can be a sequence object
        - calc: function to calculate the period power, e.g. ipdft, hybrid,
          auto_corr or any other statistic.
        - block_size: size of contiguous values for resampling
        - num_reps: number of randomly generated permutations
        - seq_to_symbols: function to convert a sequence to 1/0. If not
          provided, the raw data is used.
        - num_stats: the number of statistics being evaluated for each
          interation. Default to 1.
    """
    signal_length = len(signal)
    
    if seq_to_symbols is not None:
        dtype='c'
    else:
        dtype=None # let numpy guess
    
    signal = numpy.array(list(signal), dtype=dtype)
    
    if seq_to_symbols is not None:
        symbolic = seq_to_symbols(signal)
        data = symbolic
    else:
        data = signal
    
    obs_stat = calc(data)
    if seq_to_symbols is not None:
        if sum(symbolic) == 0:
            p = [numpy.array([1.0, 1.0, 1.0]), 1.0][num_stats == 1]
            
            return obs_stat, p
    
    if num_stats is None:
        try:
            num_stats = calc.getNumStats()
        except AttributeError:
            num_stats = 1
    
    if num_stats == 1:
        count = 0
    else:
        count = numpy.zeros(num_stats)
    
    for rep in range(num_reps):
        # get sample positions
        sampled_indices = sampled_places(block_size, signal_length)
        new_signal = signal.take(sampled_indices)
        if seq_to_symbols is not None:
            symbolic = seq_to_symbols(new_signal)
            data = symbolic
        else:
            data = new_signal
        sim_stat = calc(data)
        # count if > than observed
        if num_stats > 1:
            count[sim_stat >= obs_stat] += 1
        elif sim_stat >= obs_stat:
            count += 1
        
    return obs_stat, count / num_reps


# def percrb4():
#     """Return SNR and CRB for periodicity estimates from symbolic signals"""
#     # TODO: complete the function according to Julien's percrb4.m
#     pass
#