File: hwe.py

package info (click to toggle)
freebayes 1.3.7-1~exp
links: PTS, VCS
area: main
in suites: experimental
size: 6,324 kB
sloc: cpp: 125,947; ansic: 4,539; sh: 1,084; python: 604; asm: 271; lisp: 75; perl: 27; makefile: 20
file content (74 lines) | stat: -rw-r--r-- 3,653 bytes
parent folder | download | duplicates (5)
#!/usr/bin/env python3

from dirichlet import multinomial, multinomialln, multinomial_coefficient, multinomial_coefficientln
import math
import operator

def fold(func, iterable, initial=None, reverse=False):
    x=initial
    if reverse:
        iterable=reversed(iterable)
    for e in iterable:
        x=func(x,e) if x is not None else e
    return x

def hwe_expectation(genotype, allele_counts):
    """@genotype is counts of A,B,C etc. alleles, e.g. (2,0) is AA and (1,1) is AB
    @allele_counts is the counts of the alleles in the population"""
    population_total_alleles = sum(allele_counts)
    ploidy = sum(genotype)
    genotype_coeff = multinomial_coefficient(ploidy, genotype)
    allele_frequencies = [count / float(population_total_alleles) for count in allele_counts]
    genotype_expected_frequency = genotype_coeff * fold(operator.mul, [math.pow(freq, p) for freq, p in zip(allele_frequencies, genotype)])
    return genotype_expected_frequency


def add_tuple(a, b):
    l = []
    for i,j in zip(a,b):
        l.append(i + j)
    return tuple(l)

# TODO handle NULL case, genotype has frequency 0
def hwe_sampling_probln(genotype, genotypes, ploidy):
    """@genotype:  counts of A,B,C etc. alleles, e.g. (2,0) is AA and (1,1) is AB
    @genotypes: counts of the genotypes in the population,
                     e.g. (((2,0),1), ((1,1),2), ((0,2),1)) would be 1xAA, 2xAB, 1xBB
    @return: the probability that there are exactly as many "genotype" in the
             population as suggested by the genotype counts, given HWE"""
    population_total_alleles = sum([sum(g[0]) * g[1] for g in genotypes])
    #print "population_total_alleles", population_total_alleles
    allele_counts = fold(add_tuple, [[a * g[1] for a in g[0]] for g in genotypes])
    #print "allele_counts", allele_counts
    genotype_counts = [g[1] for g in genotypes]
    #print "genotype_counts", genotype_counts
    population_total_genotypes = sum([gtc[1] for gtc in genotypes])
    #print "total genotypes:", population_total_genotypes
    #print "ploidy", ploidy
    # number of arrangements of the alleles in the sample
    arrangements_of_alleles_in_sample = multinomial_coefficientln(population_total_alleles, allele_counts)
    #print "arrangements_of_alleles_in_sample", math.exp(arrangements_of_alleles_in_sample)
    # number of arrangements which contain exactly count genotypes
    #arrangements_with_exactly_count_genotype = multinomial_coefficientln(population_total_genotypes, genotype_counts)
    #print math.exp(multinomial_coefficientln(ploidy, genotype))
    arrangements_with_exactly_count_genotype = \
            multinomial_coefficientln(ploidy, genotype) \
            + multinomial_coefficientln(population_total_genotypes, genotype_counts)
    #print "arrangements_with_exactly_count_genotype", math.exp(arrangements_with_exactly_count_genotype)
    return arrangements_with_exactly_count_genotype - arrangements_of_alleles_in_sample;

def inbreeding_coefficient(genotype, genotypes):
    population_total_alleles = sum([sum(g[0]) * g[1] for g in genotypes])
    allele_counts = fold(add_tuple, [[a * g[1] for a in g[0]] for g in genotypes])
    genotype_counts = [g[1] for g in genotypes]
    population_total_genotypes = sum([gtc[1] for gtc in genotypes])
    expected = hwe_expectation(genotype, allele_counts) * population_total_genotypes
    observed = 0
    for g in genotypes:
        if g[0] == genotype:
            observed = g[1]
            break
    if observed == 0:
        print("error, no observations of genotype, cannot calculate inbreeding coefficient")
        return 0
    return 1 - (observed / expected)