File: bit_ops.h

package info (click to toggle)
openmpi 5.0.8-4
links: PTS, VCS
area: main
in suites:
size: 201,684 kB
sloc: ansic: 613,078; makefile: 42,353; sh: 11,194; javascript: 9,244; f90: 7,052; java: 6,404; perl: 5,179; python: 1,859; lex: 740; fortran: 61; cpp: 20; tcl: 12
file content (161 lines) | stat: -rw-r--r-- 4,422 bytes
parent folder | download | duplicates (5)
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2011 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#ifndef OPAL_BIT_OPS_H
#define OPAL_BIT_OPS_H

#include "opal/prefetch.h"

/**
 * Calculates the highest bit in an integer
 *
 * @param value The integer value to examine
 * @param start Number of low-order bits to examine, i.e. bits
 *              0 .. start-1 are considered
 *
 * @returns pos Position of the highest set bit below "start", or -1
 *              if none of the examined bits is set.
 *
 * Look at the integer "value" starting at bit position "start - 1",
 * and move to the right (towards bit 0).  Return the index of the
 * highest bit that is set to 1.
 *
 * A "start" that is zero or negative examines no bits and yields -1;
 * a "start" larger than the number of bits in an int is treated as
 * the full width.  All masks are built with unsigned arithmetic so
 * that start == 8 * sizeof(int) (and start == 8 * sizeof(int) - 1)
 * do not shift a signed 1 into or past the sign bit, which is
 * undefined behavior in C.
 *
 * This is meant to be a fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 3 cycles instead
 * of 17 cycles (on average value, with start=32)
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_hibit(int value, int start)
{
    const int nbits = (int) (8 * sizeof(int));
    unsigned int bits = (unsigned int) value;

    /* Nothing to examine */
    if (start <= 0) {
        return -1;
    }
    /* There are no bits above the width of an int */
    if (start > nbits) {
        start = nbits;
    }

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Only look at the part that the caller wanted looking at.  When
     * start == nbits every bit is wanted; building the mask would
     * require a shift by the full width, so skip it. */
    if (start < nbits) {
        bits &= (1u << start) - 1u;
    }

    /* __builtin_clz is undefined for 0 */
    if (OPAL_UNLIKELY(0 == bits)) {
        return -1;
    }

    start = (nbits - 1) - __builtin_clz(bits);
#else
    --start;

    /* Walk a single-bit probe from position start down to bit 0;
     * start ends at -1 if no examined bit is set. */
    for (unsigned int mask = 1u << start; start >= 0; --start, mask >>= 1) {
        if (bits & mask) {
            break;
        }
    }
#endif

    return start;
}

/**
 * Returns the cube dimension of a given value.
 *
 * @param value The integer value to examine
 *
 * @returns cubedim The smallest cube dimension containing that value
 *
 * Look at the integer "value" and calculate the smallest power of two
 * dimension that contains that value, i.e. the smallest dim such that
 * 2^dim >= value.  Values less than or equal to 1 yield 0.
 *
 * The loop version counts with an unsigned variable: a signed counter
 * would overflow for value > 2^30 (undefined behavior, in practice an
 * endless loop), whereas the unsigned one reaches 2^31 and stops, so
 * both versions return 8 * sizeof(int) - 1 for such values.
 *
 * This is meant to be a fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 3 cycles instead of 50 cycles
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_cube_dim(int value)
{
    int dim;

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Also keeps __builtin_clz away from 0, for which it is undefined */
    if (OPAL_UNLIKELY(1 >= value)) {
        return 0;
    }
    dim = (int) (8 * sizeof(int)) - __builtin_clz((unsigned int) (value - 1));
#else
    unsigned int size;

    /* Guard first: the unsigned comparison below must never see a
     * negative value. */
    if (1 >= value) {
        return 0;
    }
    for (dim = 0, size = 1u; size < (unsigned int) value; ++dim, size <<= 1) /* empty */
        ;
#endif

    return dim;
}

/**
 * @brief Returns next power-of-two of the given value.
 *
 * @param value The integer value to return power of 2
 *
 * @returns The smallest power of two that is strictly greater than
 *          value; 1 if value is zero or negative.
 *
 * The shift is carried out on an unsigned integer.  For
 * value >= 2^(8 * sizeof(int) - 2) the mathematical result does not
 * fit into an int; in that case the unsigned result is converted back
 * to int, which is implementation-defined (a negative number on the
 * usual two's-complement compilers) instead of undefined behavior.
 *
 * This is meant to be a fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 4 cycles instead of 77
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_next_poweroftwo(int value)
{
    unsigned int power2;

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Negative values must be caught as well as 0: they have no leading
     * zeros, which would turn the shift below into a shift by the full
     * width of the type.  The loop version returns 1 for them, too. */
    if (OPAL_UNLIKELY(0 >= value)) {
        return 1;
    }
    /* value > 0 has at least one leading zero, so the shift count is
     * at most 8 * sizeof(int) - 1. */
    power2 = 1u << ((int) (8 * sizeof(int)) - __builtin_clz((unsigned int) value));
#else
    /* One doubling per significant bit of value */
    for (power2 = 1u; value > 0; value >>= 1, power2 <<= 1) /* empty */
        ;
#endif

    return (int) power2;
}

/**
 * @brief Returns next power-of-two of the given value (and the value itself if already
 * power-of-two).
 *
 * @param value The integer value to return power of 2
 *
 * @returns The smallest power of two that is greater than or equal to
 *          value; 1 if value is less than or equal to 1.
 *
 * All arithmetic is carried out on an unsigned integer: with a signed
 * counter the loop version overflows for value > 2^(8 * sizeof(int) - 2)
 * (undefined behavior, in practice an endless loop).  For such values
 * the mathematical result does not fit into an int; the unsigned
 * result is converted back to int, which is implementation-defined
 * (a negative number on the usual two's-complement compilers).
 *
 * This is meant to be a fast inline function.
 * Using __builtin_clz (count-leading-zeros) uses 4 cycles instead of 56
 * compared to the loop-version (on Intel Nehalem -- with icc-12.1.0 -O2).
 */
static inline int opal_next_poweroftwo_inclusive(int value)
{
    unsigned int power2;

#if OPAL_C_HAVE_BUILTIN_CLZ
    /* Also keeps __builtin_clz away from 0, for which it is undefined */
    if (OPAL_UNLIKELY(1 >= value)) {
        return 1;
    }
    /* value - 1 > 0 has at least one leading zero, so the shift count
     * is at most 8 * sizeof(int) - 1. */
    power2 = 1u << ((int) (8 * sizeof(int)) - __builtin_clz((unsigned int) (value - 1)));
#else
    /* Guard first: the unsigned comparison below must never see a
     * negative value. */
    if (1 >= value) {
        return 1;
    }
    for (power2 = 1u; power2 < (unsigned int) value; power2 <<= 1) /* empty */
        ;
#endif

    return (int) power2;
}

#endif /* OPAL_BIT_OPS_H */