/** 
 * @file rng.c 
 * @brief double precision SIMD-oriented Fast Mersenne Twister (rng)
 * based on IEEE 754 format.
 *
 * @author Mutsuo Saito (Hiroshima University)
 * @author Makoto Matsumoto (Hiroshima University)
 *
 * Copyright (C) 2007,2008 Mutsuo Saito, Makoto Matsumoto and Hiroshima
 * University. All rights reserved.
 *
 * The new BSD License is applied to this software, see LICENSE.txt
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#ifndef DSFMT_PARAMS_H
#define RNG_PARAMS_H

#include "rng.h"

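/*----------------------
  the parameters of RNG
  The commented-out definitions below only illustrate each parameter;
  the values actually used by this file are defined further down, in
  the RNG_PARAMS19937_H section.
  ----------------------*/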
/** the pick up position of the array.
#define RNG_POS1 122 
*/

/** the parameter of shift left as four 32-bit registers.
#define RNG_SL1 18
 */

/** the parameter of shift right as four 32-bit registers.
#define RNG_SR1 12
*/

/** A bitmask, used in the recursion.  These parameters are introduced
 * to break symmetry of SIMD.
#define RNG_MSK1 (uint64_t)0xdfffffefULL
#define RNG_MSK2 (uint64_t)0xddfecb7fULL
*/

/** These definitions are part of a 128-bit period certification vector.
#define RNG_PCV1	UINT64_C(0x00000001)
#define RNG_PCV2	UINT64_C(0x00000000)
*/

/* Selects the low 52 bits of a 64-bit state word (the fraction field of an
 * IEEE 754 double); applied to every state word by initial_mask(). */
#define RNG_LOW_MASK  UINT64_C(0x000FFFFFFFFFFFFF)
/* OR-ed into every masked state word by initial_mask(): sets the biased
 * exponent to 0x3FF so the word reads as a double in the range [1, 2). */
#define RNG_HIGH_CONST UINT64_C(0x3FF0000000000000)
/* Right-shift amount applied to the lung in the standard C and SSE2
 * versions of do_recursion(). */
#define RNG_SR	12

/* for sse2 */
#if defined(HAVE_SSE2)
  #define SSE2_SHUFF 0x1b
#elif defined(HAVE_ALTIVEC)
  #if defined(__APPLE__)  /* For OSX */
    #define ALTI_SR (vector unsigned char)(4)
    #define ALTI_SR_PERM \
        (vector unsigned char)(15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14)
    #define ALTI_SR_MSK \
        (vector unsigned int)(0x000fffffU,0xffffffffU,0x000fffffU,0xffffffffU)
    #define ALTI_PERM \
        (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3)
  #else
    #define ALTI_SR      {4}
    #define ALTI_SR_PERM {15,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14}
    #define ALTI_SR_MSK  {0x000fffffU,0xffffffffU,0x000fffffU,0xffffffffU}
    #define ALTI_PERM    {12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3}
  #endif
#endif

#ifndef DSFMT_PARAMS19937_H
#define RNG_PARAMS19937_H

/* #define RNG_N	191 */
/* #define RNG_MAXDEGREE	19992 */
#define RNG_POS1	117
#define RNG_SL1	19
#define RNG_MSK1	UINT64_C(0x000ffafffffffb3f)
#define RNG_MSK2	UINT64_C(0x000ffdfffc90fffd)
#define RNG_MSK32_1	0x000ffaffU
#define RNG_MSK32_2	0xfffffb3fU
#define RNG_MSK32_3	0x000ffdffU
#define RNG_MSK32_4	0xfc90fffdU
#define RNG_FIX1	UINT64_C(0x90014964b32f4329)
#define RNG_FIX2	UINT64_C(0x3b8d12ac548a7c7a)
#define RNG_PCV1	UINT64_C(0x3d84e1ac0dc82880)
#define RNG_PCV2	UINT64_C(0x0000000000000001)
#define RNG_IDSTR	"rng2-19937:117-19:ffafffffffb3f-ffdfffc90fffd"


/* PARAMETERS FOR ALTIVEC */
#if defined(__APPLE__)	/* For OSX */
    #define ALTI_SL1 	(vector unsigned int)(3, 3, 3, 3)
    #define ALTI_SL1_PERM \
	(vector unsigned char)(2,3,4,5,6,7,30,30,10,11,12,13,14,15,0,1)
    #define ALTI_SL1_MSK \
	(vector unsigned int)(0xffffffffU,0xfff80000U,0xffffffffU,0xfff80000U)
    #define ALTI_MSK	(vector unsigned int)(RNG_MSK32_1, \
			RNG_MSK32_2, RNG_MSK32_3, RNG_MSK32_4)
#else	/* For OTHER OSs(Linux?) */
    #define ALTI_SL1 	{3, 3, 3, 3}
    #define ALTI_SL1_PERM \
	{2,3,4,5,6,7,30,30,10,11,12,13,14,15,0,1}
    #define ALTI_SL1_MSK \
	{0xffffffffU,0xfff80000U,0xffffffffU,0xfff80000U}
    #define ALTI_MSK \
	{RNG_MSK32_1, RNG_MSK32_2, RNG_MSK32_3, RNG_MSK32_4}
#endif

#endif /* RNG_PARAMS19937_H */

#endif /* RNG_PARAMS_H */

/** rng internal state vector with external linkage.
 *  It is not referenced anywhere else in this file.
 *  NOTE(review): presumably used by convenience wrappers declared in
 *  rng.h -- confirm. */
rng_t rng_global_data;
/** Mersenne exponent this file was compiled with.  The init functions
 *  (rng_chk_init_gen_rand, rng_chk_init_by_array) compare it with the
 *  caller's value and terminate the program on mismatch. */
static const int rng_mexp = RNG_MEXP;

/*----------------
  STATIC FUNCTIONS
  ----------------*/
inline static uint32_t ini_func1(uint32_t x);
inline static uint32_t ini_func2(uint32_t x);
inline static void gen_rand_array_c1o2(rng_t * restrict rng, w128_t * restrict array,
				       int size);
inline static void gen_rand_array_c0o1(rng_t * restrict rng, w128_t * restrict array,
				       int size);
inline static void gen_rand_array_o0c1(rng_t * restrict rng, w128_t * restrict array,
				       int size);
inline static void gen_rand_array_o0o1(rng_t * restrict rng, w128_t * restrict array,
				       int size);
inline static int idxof(int i);
static void initial_mask(rng_t * restrict rng);
static void period_certification(rng_t * restrict rng);

#if defined(HAVE_SSE2)
#  include <emmintrin.h>
/** mask data for sse2 */
static __m128i sse2_param_mask;
/** 1 in 64bit for sse2 */
static __m128i sse2_int_one;
/** 2.0 double for sse2 */
static __m128d sse2_double_two;
/** -1.0 double for sse2 */
static __m128d sse2_double_m_one;

static void setup_const(void);
#endif

/**
 * Translates the index of a 32-bit word so that a 64-bit array can be
 * addressed as if it were a LITTLE ENDIAN array of 32-bit words.
 * When RNG_BIG_ENDIAN is defined the two 32-bit halves of every 64-bit
 * word are exchanged by flipping the lowest index bit; otherwise the
 * index is returned unchanged.
 * @param i logical index of a 32-bit word
 * @return index to use for the actual memory access
 */
inline static int idxof(int i) {
#if defined(RNG_BIG_ENDIAN)
    return i ^ 1;
#else
    return i;
#endif
}

/**
 * This function represents the recursion formula.
 * @param r output
 * @param a a 128-bit part of the internal state array
 * @param b a 128-bit part of the internal state array
 * @param lung a 128-bit part of the internal state array
 */
#if defined(HAVE_ALTIVEC)
/*
 * AltiVec implementation of the recursion formula.
 *
 * r and a are intentionally NOT restrict-qualified: rng_gen_rand_all()
 * updates the state in place and passes the same element as both r and a,
 * which would be undefined behavior if either pointer were restrict.
 * a is read completely before r is written, so the overlap is harmless.
 * lung is both read and updated.
 */
inline static void do_recursion(w128_t *r, w128_t *a, w128_t * restrict b,
                                w128_t * restrict lung) {
    const vector unsigned char sl1 = ALTI_SL1;
    const vector unsigned char sl1_perm = ALTI_SL1_PERM;
    const vector unsigned int sl1_msk = ALTI_SL1_MSK;
    const vector unsigned char sr1 = ALTI_SR;
    const vector unsigned char sr1_perm = ALTI_SR_PERM;
    const vector unsigned int sr1_msk = ALTI_SR_MSK;
    const vector unsigned char perm = ALTI_PERM;
    const vector unsigned int msk1 = ALTI_MSK;
    vector unsigned int w, x, y, z;

    /* Load both inputs before anything is stored. */
    z = a->s;
    w = lung->s;
    x = vec_perm(w, (vector unsigned int)perm, perm);
    /* Left shift of a, built from a permute, a bit shift and a mask. */
    y = vec_perm(z, sl1_perm, sl1_perm);
    y = vec_sll(y, sl1);
    y = vec_and(y, sl1_msk);
    /* New lung value. */
    w = vec_xor(x, b->s);
    w = vec_xor(w, y);
    /* Right shift of the new lung, built the same way. */
    x = vec_perm(w, (vector unsigned int)sr1_perm, sr1_perm);
    x = vec_srl(x, sr1);
    x = vec_and(x, sr1_msk);
    y = vec_and(w, msk1);
    z = vec_xor(z, y);
    r->s = vec_xor(z, x);
    lung->s = w;
}
#elif defined(HAVE_SSE2)
/**
 * Initializes the file-scope SSE2 constants (recursion mask, integer one,
 * 2.0 and -1.0).  Only the first call does any work; later calls return
 * immediately.
 */
static void setup_const(void) {
    static int pending = 1;

    if (pending) {
        sse2_param_mask = _mm_set_epi32(RNG_MSK32_3, RNG_MSK32_4,
                                        RNG_MSK32_1, RNG_MSK32_2);
        sse2_int_one = _mm_set_epi32(0, 1, 0, 1);
        sse2_double_two = _mm_set_pd(2.0, 2.0);
        sse2_double_m_one = _mm_set_pd(-1.0, -1.0);
        pending = 0;
    }
}

/**
 * This function represents the recursion formula (SSE2 version).
 *
 * r and a are intentionally NOT restrict-qualified: rng_gen_rand_all()
 * updates the state in place and passes the same element as both r and a,
 * which would be undefined behavior if either pointer were restrict.
 * a is read completely before r is written, so the overlap is harmless.
 *
 * @param r output 128-bit (may be the same element as a)
 * @param a a 128-bit part of the internal state array
 * @param b a 128-bit part of the internal state array
 * @param u the 128-bit lung (I/O): read at entry, replaced at exit
 */
inline static void do_recursion(w128_t *r, w128_t *a, w128_t *b,
                                w128_t * restrict u) {
    __m128i v, w, x, y, z;

    x = a->si;
    z = _mm_slli_epi64(x, RNG_SL1);
    y = _mm_shuffle_epi32(u->si, SSE2_SHUFF);
    z = _mm_xor_si128(z, b->si);
    y = _mm_xor_si128(y, z);    /* y is the new lung */

    v = _mm_srli_epi64(y, RNG_SR);
    w = _mm_and_si128(y, sse2_param_mask);
    v = _mm_xor_si128(v, x);
    v = _mm_xor_si128(v, w);
    r->si = v;
    u->si = y;
}
#else /* standard C */
/**
 * This function represents the recursion formula (standard C version).
 *
 * r and a are intentionally NOT restrict-qualified: rng_gen_rand_all()
 * updates the state in place and passes the same element as both r and a,
 * which would be undefined behavior if either pointer were restrict.
 * Both words of a are copied to locals before r is written, so the
 * overlap is harmless.
 *
 * @param r output 128-bit (may be the same element as a)
 * @param a a 128-bit part of the internal state array
 * @param b a 128-bit part of the internal state array
 * @param lung a 128-bit part of the internal state array (I/O)
 */
inline static void do_recursion(w128_t *r, w128_t *a, w128_t * restrict b,
                                w128_t * restrict lung) {
    uint64_t t0, t1, L0, L1;

    t0 = a->u[0];
    t1 = a->u[1];
    L0 = lung->u[0];
    L1 = lung->u[1];
    /* New lung: shifted a, the old lung with its 64-bit words exchanged
     * and their 32-bit halves swapped, and b. */
    lung->u[0] = (t0 << RNG_SL1) ^ (L1 >> 32) ^ (L1 << 32) ^ b->u[0];
    lung->u[1] = (t1 << RNG_SL1) ^ (L0 >> 32) ^ (L0 << 32) ^ b->u[1];
    /* Output: shifted and masked new lung combined with a. */
    r->u[0] = (lung->u[0] >> RNG_SR) ^ (lung->u[0] & RNG_MSK1) ^ t0;
    r->u[1] = (lung->u[1] >> RNG_SR) ^ (lung->u[1] & RNG_MSK2) ^ t1;
}
#endif

#if defined(HAVE_SSE2)
/**
 * Maps both doubles of a 128-bit block from the range [1, 2) to the
 * range [0, 1) by adding -1.0 to each of them (SSE2 version).
 * @param w 128-bit structure holding two doubles (I/O)
 */
inline static void convert_c0o1(w128_t * restrict w) {
    const __m128d shifted = _mm_add_pd(w->sd, sse2_double_m_one);

    w->sd = shifted;
}

/**
 * Maps both doubles of a 128-bit block from the range [1, 2) to the
 * range (0, 1] by subtracting each of them from 2.0 (SSE2 version).
 * @param w 128-bit structure holding two doubles (I/O)
 */
inline static void convert_o0c1(w128_t * restrict w) {
    const __m128d mirrored = _mm_sub_pd(sse2_double_two, w->sd);

    w->sd = mirrored;
}

/**
 * Maps both doubles of a 128-bit block from the range [1, 2) to the
 * range (0, 1) (SSE2 version).  The lowest bit of each 64-bit word is
 * set first, so neither value is exactly 1.0 when -1.0 is added.
 * @param w 128-bit structure holding two doubles (I/O)
 */
inline static void convert_o0o1(w128_t * restrict w) {
    const __m128i low_bit_set = _mm_or_si128(w->si, sse2_int_one);

    w->si = low_bit_set;
    w->sd = _mm_add_pd(w->sd, sse2_double_m_one);
}
#else /* standard C and altivec */
/**
 * Maps both doubles of a 128-bit block from the range [1, 2) to the
 * range [0, 1) by subtracting 1.0 from each of them.
 * @param w 128-bit structure holding two doubles (I/O)
 */
inline static void convert_c0o1(w128_t * restrict w) {
    for (int lane = 0; lane < 2; lane++) {
        w->d[lane] -= 1.0;
    }
}

/**
 * Maps both doubles of a 128-bit block from the range [1, 2) to the
 * range (0, 1] by subtracting each of them from 2.0.
 * @param w 128-bit structure holding two doubles (I/O)
 */
inline static void convert_o0c1(w128_t * restrict w) {
    for (int lane = 0; lane < 2; lane++) {
        w->d[lane] = 2.0 - w->d[lane];
    }
}

/**
 * Maps both doubles of a 128-bit block from the range [1, 2) to the
 * range (0, 1).  The lowest bit of each 64-bit word is set first, so
 * neither value is exactly 1.0 when 1.0 is subtracted.
 * @param w 128-bit structure holding two doubles (I/O)
 */
inline static void convert_o0o1(w128_t * restrict w) {
    for (int lane = 0; lane < 2; lane++) {
        /* the two lanes are independent, so each is finished in turn */
        w->u[lane] |= 1;
        w->d[lane] -= 1.0;
    }
}
#endif

/**
 * Fills the user-specified array with 128-bit blocks of generator
 * output, i.e. pairs of IEEE 754 doubles in the range [1, 2), and
 * leaves the last RNG_N generated blocks plus the lung in the state.
 * The index arithmetic relies on size being at least RNG_N; the public
 * fill_array functions assert the corresponding bound.
 * @param rng rng state vector.
 * @param array an 128-bit array to be filled by pseudorandom numbers.
 * @param size number of 128-bit pseudorandom numbers to be generated.
 */
inline static void gen_rand_array_c1o2(rng_t *restrict rng, w128_t * restrict array,
                                       int size) {
    w128_t * const state = rng->status;
    w128_t lung = state[RNG_N];
    int pos;
    int keep;

    /* Both inputs still come from the old state. */
    do_recursion(&array[0], &state[0], &state[RNG_POS1], &lung);
    for (pos = 1; pos < RNG_N - RNG_POS1; pos++) {
        do_recursion(&array[pos], &state[pos], &state[pos + RNG_POS1],
                     &lung);
    }
    /* The second input has wrapped around into the freshly written array. */
    for (; pos < RNG_N; pos++) {
        do_recursion(&array[pos], &state[pos],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
    }
    /* From here on both inputs are earlier outputs. */
    for (; pos < size - RNG_N; pos++) {
        do_recursion(&array[pos], &array[pos - RNG_N],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
    }
    /* When size < 2 * RNG_N the head of the next state already exists. */
    for (keep = 0; keep < 2 * RNG_N - size; keep++) {
        state[keep] = array[keep + size - RNG_N];
    }
    /* Generate the tail, saving every block as part of the next state. */
    for (; pos < size; pos++, keep++) {
        do_recursion(&array[pos], &array[pos - RNG_N],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
        state[keep] = array[pos];
    }
    state[RNG_N] = lung;
}

/**
 * Fills the user-specified array with 128-bit blocks holding pairs of
 * IEEE 754 doubles in the range [0, 1), and leaves the last RNG_N raw
 * (unconverted) blocks plus the lung in the state.
 * A block is converted with convert_c0o1() only after its last use as a
 * recursion input.
 * The index arithmetic relies on size being at least RNG_N; the public
 * fill_array functions assert the corresponding bound.
 * @param rng rng state vector.
 * @param array an 128-bit array to be filled by pseudorandom numbers.
 * @param size number of 128-bit pseudorandom numbers to be generated.
 */
inline static void gen_rand_array_c0o1(rng_t * restrict rng, w128_t * restrict array,
                                       int size) {
    w128_t * const state = rng->status;
    w128_t lung = state[RNG_N];
    int pos;
    int keep;

    /* Both inputs still come from the old state. */
    do_recursion(&array[0], &state[0], &state[RNG_POS1], &lung);
    for (pos = 1; pos < RNG_N - RNG_POS1; pos++) {
        do_recursion(&array[pos], &state[pos], &state[pos + RNG_POS1],
                     &lung);
    }
    /* The second input has wrapped around into the freshly written array. */
    for (; pos < RNG_N; pos++) {
        do_recursion(&array[pos], &state[pos],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
    }
    /* Both inputs are earlier outputs; the first one is finished after use. */
    for (; pos < size - RNG_N; pos++) {
        do_recursion(&array[pos], &array[pos - RNG_N],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
        convert_c0o1(&array[pos - RNG_N]);
    }
    /* When size < 2 * RNG_N the head of the next state already exists. */
    for (keep = 0; keep < 2 * RNG_N - size; keep++) {
        state[keep] = array[keep + size - RNG_N];
    }
    /* Generate the tail; the raw block is saved before any conversion. */
    for (; pos < size; pos++, keep++) {
        do_recursion(&array[pos], &array[pos - RNG_N],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
        state[keep] = array[pos];
        convert_c0o1(&array[pos - RNG_N]);
    }
    /* The last RNG_N blocks were never used as first input: convert now. */
    for (pos = size - RNG_N; pos < size; pos++) {
        convert_c0o1(&array[pos]);
    }
    state[RNG_N] = lung;
}

/**
 * Fills the user-specified array with 128-bit blocks holding pairs of
 * IEEE 754 doubles in the range (0, 1), and leaves the last RNG_N raw
 * (unconverted) blocks plus the lung in the state.
 * A block is converted with convert_o0o1() only after its last use as a
 * recursion input.
 * The index arithmetic relies on size being at least RNG_N; the public
 * fill_array functions assert the corresponding bound.
 * @param rng rng state vector.
 * @param array an 128-bit array to be filled by pseudorandom numbers.
 * @param size number of 128-bit pseudorandom numbers to be generated.
 */
inline static void gen_rand_array_o0o1(rng_t * restrict rng, w128_t * restrict array,
                                       int size) {
    w128_t * const state = rng->status;
    w128_t lung = state[RNG_N];
    int pos;
    int keep;

    /* Both inputs still come from the old state. */
    do_recursion(&array[0], &state[0], &state[RNG_POS1], &lung);
    for (pos = 1; pos < RNG_N - RNG_POS1; pos++) {
        do_recursion(&array[pos], &state[pos], &state[pos + RNG_POS1],
                     &lung);
    }
    /* The second input has wrapped around into the freshly written array. */
    for (; pos < RNG_N; pos++) {
        do_recursion(&array[pos], &state[pos],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
    }
    /* Both inputs are earlier outputs; the first one is finished after use. */
    for (; pos < size - RNG_N; pos++) {
        do_recursion(&array[pos], &array[pos - RNG_N],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
        convert_o0o1(&array[pos - RNG_N]);
    }
    /* When size < 2 * RNG_N the head of the next state already exists. */
    for (keep = 0; keep < 2 * RNG_N - size; keep++) {
        state[keep] = array[keep + size - RNG_N];
    }
    /* Generate the tail; the raw block is saved before any conversion. */
    for (; pos < size; pos++, keep++) {
        do_recursion(&array[pos], &array[pos - RNG_N],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
        state[keep] = array[pos];
        convert_o0o1(&array[pos - RNG_N]);
    }
    /* The last RNG_N blocks were never used as first input: convert now. */
    for (pos = size - RNG_N; pos < size; pos++) {
        convert_o0o1(&array[pos]);
    }
    state[RNG_N] = lung;
}

/**
 * Fills the user-specified array with 128-bit blocks holding pairs of
 * IEEE 754 doubles in the range (0, 1], and leaves the last RNG_N raw
 * (unconverted) blocks plus the lung in the state.
 * A block is converted with convert_o0c1() only after its last use as a
 * recursion input.
 * The index arithmetic relies on size being at least RNG_N; the public
 * fill_array functions assert the corresponding bound.
 * @param rng rng state vector.
 * @param array an 128-bit array to be filled by pseudorandom numbers.
 * @param size number of 128-bit pseudorandom numbers to be generated.
 */
inline static void gen_rand_array_o0c1(rng_t *restrict rng, w128_t * restrict array,
                                       int size) {
    w128_t * const state = rng->status;
    w128_t lung = state[RNG_N];
    int pos;
    int keep;

    /* Both inputs still come from the old state. */
    do_recursion(&array[0], &state[0], &state[RNG_POS1], &lung);
    for (pos = 1; pos < RNG_N - RNG_POS1; pos++) {
        do_recursion(&array[pos], &state[pos], &state[pos + RNG_POS1],
                     &lung);
    }
    /* The second input has wrapped around into the freshly written array. */
    for (; pos < RNG_N; pos++) {
        do_recursion(&array[pos], &state[pos],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
    }
    /* Both inputs are earlier outputs; the first one is finished after use. */
    for (; pos < size - RNG_N; pos++) {
        do_recursion(&array[pos], &array[pos - RNG_N],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
        convert_o0c1(&array[pos - RNG_N]);
    }
    /* When size < 2 * RNG_N the head of the next state already exists. */
    for (keep = 0; keep < 2 * RNG_N - size; keep++) {
        state[keep] = array[keep + size - RNG_N];
    }
    /* Generate the tail; the raw block is saved before any conversion. */
    for (; pos < size; pos++, keep++) {
        do_recursion(&array[pos], &array[pos - RNG_N],
                     &array[pos + RNG_POS1 - RNG_N], &lung);
        state[keep] = array[pos];
        convert_o0c1(&array[pos - RNG_N]);
    }
    /* The last RNG_N blocks were never used as first input: convert now. */
    for (pos = size - RNG_N; pos < size; pos++) {
        convert_o0c1(&array[pos]);
    }
    state[RNG_N] = lung;
}

/**
 * First mixing function used by the array-seeded initialization
 * (rng_chk_init_by_array): folds the top five bits of x into its low
 * bits and multiplies by 1664525, modulo 2^32.
 * @param x 32-bit integer
 * @return 32-bit integer
 */
static uint32_t ini_func1(uint32_t x) {
    const uint32_t folded = x ^ (x >> 27);

    return folded * (uint32_t)1664525UL;
}

/**
 * Second mixing function used by the array-seeded initialization
 * (rng_chk_init_by_array): folds the top five bits of x into its low
 * bits and multiplies by 1566083941, modulo 2^32.
 * @param x 32-bit integer
 * @return 32-bit integer
 */
static uint32_t ini_func2(uint32_t x) {
    const uint32_t folded = x ^ (x >> 27);

    return folded * (uint32_t)1566083941UL;
}

/**
 * Forces the first RNG_N * 2 64-bit words of the state into the IEEE 754
 * bit pattern of a double in [1, 2): the low 52 bits are kept and
 * RNG_HIGH_CONST is OR-ed in.  The lung (status[RNG_N]) is not touched.
 * @param rng rng state vector.
 */
static void initial_mask(rng_t * restrict rng) {
    uint64_t * restrict word = &rng->status[0].u[0];
    uint64_t * const stop = word + RNG_N * 2;

    while (word != stop) {
        *word = (*word & RNG_LOW_MASK) | RNG_HIGH_CONST;
        word++;
    }
}

/**
 * Period certification.  The lung is XOR-ed with (RNG_FIX1, RNG_FIX2),
 * masked with the certification vector (RNG_PCV1, RNG_PCV2), and the
 * parity of the result is computed.  If the parity is odd the state is
 * accepted unchanged; otherwise one bit of the lung, chosen from the
 * certification vector, is flipped.
 * @param rng rng state vector.
 */
static void period_certification(rng_t * restrict rng) {
    const uint64_t pcv[2] = {RNG_PCV1, RNG_PCV2};
    w128_t * const lung = &rng->status[RNG_N];
    uint64_t parity;
    int shift;

    parity = ((lung->u[0] ^ RNG_FIX1) & pcv[0])
           ^ ((lung->u[1] ^ RNG_FIX2) & pcv[1]);
    /* fold the 64 bits down so that bit 0 holds the overall parity */
    for (shift = 32; shift > 0; shift >>= 1) {
        parity ^= parity >> shift;
    }
    /* check OK */
    if ((parity & 1) == 1) {
        return;
    }
    /* check NG, and modification */
#if (RNG_PCV2 & 1) == 1
    lung->u[1] ^= 1;
#else
    /* flip the lowest set bit of the vector, trying pcv[1] before pcv[0] */
    for (int word = 1; word >= 0; word--) {
        uint64_t bit = 1;

        for (int n = 0; n < 64; n++) {
            if ((bit & pcv[word]) != 0) {
                lung->u[word] ^= bit;
                return;
            }
            bit = bit << 1;
        }
    }
#endif
}

/*----------------
  PUBLIC FUNCTIONS
  ----------------*/
/**
 * Returns the identification string of this generator (RNG_IDSTR),
 * which encodes the Mersenne exponent and the parameter set.
 * @return id string.
 */
const char *rng_get_idstring(void) {
    const char *id = RNG_IDSTR;

    return id;
}

/**
 * Returns the smallest array size (RNG_N64, counted in doubles) that
 * the \b fill_array functions accept.
 * @return minimum size of array used for fill_array functions.
 */
int rng_get_min_array_size(void) {
    const int min_size = RNG_N64;

    return min_size;
}

/**
 * Regenerates the whole internal state array in place: each of the
 * RNG_N state blocks is replaced by the next block of the sequence
 * (the same element serves as output and first input of the
 * recursion), and the lung is updated.
 * @param rng rng state vector.
 */
void rng_gen_rand_all(rng_t * restrict rng) {
    w128_t * const state = rng->status;
    w128_t lung = state[RNG_N];
    int pos;

    do_recursion(&state[0], &state[0], &state[RNG_POS1], &lung);
    /* second input lies ahead in the not yet updated part of the state */
    for (pos = 1; pos < RNG_N - RNG_POS1; pos++) {
        do_recursion(&state[pos], &state[pos], &state[pos + RNG_POS1],
                     &lung);
    }
    /* second input has wrapped around to the already updated part */
    for (; pos < RNG_N; pos++) {
        do_recursion(&state[pos], &state[pos],
                     &state[pos + RNG_POS1 - RNG_N], &lung);
    }
    state[RNG_N] = lung;
}

/**
 * Generates \b size double precision pseudorandom numbers in the range
 * [1, 2) into array[] with a single call.
 *
 * The generator must have been initialized with one of the init
 * functions before the first call.  According to the original
 * documentation this function cannot be mixed with the genrand_xxx
 * functions without re-initialization.
 *
 * @param rng rng state vector.
 * @param array destination array.  In the SIMD versions the pointer must
 * be 16-byte aligned because it is accessed as an array of 128-bit
 * blocks; in the standard C version the pointer is arbitrary.
 * @param size the number of doubles to generate.  It must be a multiple
 * of 2 and at least RNG_N64, the value returned by
 * rng_get_min_array_size(); both conditions are checked with assert().
 *
 * @note \b memalign or \b posix_memalign is available to get aligned
 * memory. Mac OSX doesn't have these functions, but \b malloc of OSX
 * returns the pointer to the aligned memory block.
 */
void rng_fill_array_close1_open2(rng_t * restrict rng, double array[restrict], int size) {
    assert(size % 2 == 0);
    assert(size >= RNG_N64);

    /* two doubles per 128-bit block */
    const int blocks = size / 2;
    gen_rand_array_c1o2(rng, (w128_t *)array, blocks);
}

/**
 * Generates \b size double precision pseudorandom numbers in the range
 * (0, 1] into array[] with a single call.  Apart from the range it
 * behaves like rng_fill_array_close1_open2().
 *
 * @param rng rng state vector.
 * @param array destination array.
 * @param size the number of doubles to generate; must be a multiple of
 * 2 and at least RNG_N64 (checked with assert()).
 */
void rng_fill_array_open_close(rng_t * restrict rng, double array[restrict], int size) {
    assert(size % 2 == 0);
    assert(size >= RNG_N64);

    /* two doubles per 128-bit block */
    const int blocks = size / 2;
    gen_rand_array_o0c1(rng, (w128_t *)array, blocks);
}

/**
 * Generates \b size double precision pseudorandom numbers in the range
 * [0, 1) into array[] with a single call.  Apart from the range it
 * behaves like rng_fill_array_close1_open2().
 *
 * @param rng rng state vector.
 * @param array destination array.
 * @param size the number of doubles to generate; must be a multiple of
 * 2 and at least RNG_N64 (checked with assert()).
 */
void rng_fill_array_close_open(rng_t * restrict rng, double array[restrict], int size) {
    assert(size % 2 == 0);
    assert(size >= RNG_N64);

    /* two doubles per 128-bit block */
    const int blocks = size / 2;
    gen_rand_array_c0o1(rng, (w128_t *)array, blocks);
}

/**
 * Generates \b size double precision pseudorandom numbers in the range
 * (0, 1) into array[] with a single call.  Apart from the range it
 * behaves like rng_fill_array_close1_open2().
 *
 * @param rng rng state vector.
 * @param array destination array.
 * @param size the number of doubles to generate; must be a multiple of
 * 2 and at least RNG_N64 (checked with assert()).
 */
void rng_fill_array_open_open(rng_t * restrict rng, double array[restrict], int size) {
    assert(size % 2 == 0);
    assert(size >= RNG_N64);

    /* two doubles per 128-bit block */
    const int blocks = size / 2;
    gen_rand_array_o0o1(rng, (w128_t *)array, blocks);
}

#if defined(__INTEL_COMPILER)
#  pragma warning(disable:981)
#endif
/**
 * Initializes the internal state array from a single 32-bit seed.
 * All (RNG_N + 1) * 4 32-bit words of the state, lung included, are
 * filled by a linear recurrence on the previous word; the state is then
 * masked into IEEE 754 form and period-certified.
 * The program is terminated with exit(1) when \b mexp differs from the
 * exponent this file was compiled with.
 * @param rng rng state vector.
 * @param seed a 32-bit integer used as the seed.
 * @param mexp caller's Mersenne exponent
 */
void rng_chk_init_gen_rand(rng_t * restrict rng, uint32_t seed, int mexp) {
    uint32_t * restrict words;
    uint32_t prev;
    int n;

    /* make sure caller program is compiled with the same MEXP */
    if (mexp != rng_mexp) {
        fprintf(stderr, "RNG_MEXP doesn't match with rng.c\n");
        exit(1);
    }

    words = &rng->status[0].u32[0];
    words[idxof(0)] = seed;
    for (n = 1; n < (RNG_N + 1) * 4; n++) {
        prev = words[idxof(n - 1)];
        words[idxof(n)] = 1812433253UL * (prev ^ (prev >> 30)) + n;
    }

    initial_mask(rng);
    period_certification(rng);
    rng->idx = RNG_N64;
#if defined(HAVE_SSE2)
    setup_const();
#endif
}

/**
 * Initializes the internal state array from an array of 32-bit integers
 * used as the seeds.  The state (lung included) is first filled with the
 * byte 0x8b, then stirred in three passes: one consuming the key words,
 * one continuing without key material, and a final pass over every state
 * word using the second mixing function.  The state is then masked into
 * IEEE 754 form and period-certified.
 * The program is terminated with exit(1) when \b mexp differs from the
 * exponent this file was compiled with.
 * @param rng rng state vector.
 * @param init_key the array of 32-bit integers, used as a seed.
 * @param key_length the length of init_key.
 * @param mexp caller's Mersenne exponent
 */
void rng_chk_init_by_array(rng_t * restrict rng, uint32_t init_key[restrict],
                           int key_length, int mexp) {
    const int size = (RNG_N + 1) * 4;   /* 32-bit words, lung included */
    uint32_t * restrict psfmt32;
    uint32_t r;
    int i, j, count;
    int lag, mid;
    int at_mid, at_lag, at_prev;

    /* make sure caller program is compiled with the same MEXP */
    if (mexp != rng_mexp) {
        fprintf(stderr, "RNG_MEXP doesn't match with rng.c\n");
        exit(1);
    }

    lag = (size >= 623) ? 11
        : (size >= 68) ? 7
        : (size >= 39) ? 5
        : 3;
    mid = (size - lag) / 2;

    memset(rng->status, 0x8b, sizeof(rng->status));
    psfmt32 = &rng->status[0].u32[0];

    /* every state word and every key word is visited at least once */
    count = (key_length + 1 > size) ? key_length + 1 : size;

    /* step 0: mix in the key length */
    at_mid = idxof(mid % size);
    at_lag = idxof((mid + lag) % size);
    at_prev = idxof((size - 1) % size);
    r = ini_func1(psfmt32[idxof(0)] ^ psfmt32[at_mid] ^ psfmt32[at_prev]);
    psfmt32[at_mid] += r;
    r += key_length;
    psfmt32[at_lag] += r;
    psfmt32[idxof(0)] = r;
    count--;

    /* pass 1: mix in the key words */
    for (i = 1, j = 0; (j < count) && (j < key_length); j++) {
        at_mid = idxof((i + mid) % size);
        at_lag = idxof((i + mid + lag) % size);
        at_prev = idxof((i + size - 1) % size);
        r = ini_func1(psfmt32[idxof(i)] ^ psfmt32[at_mid]
                      ^ psfmt32[at_prev]);
        psfmt32[at_mid] += r;
        r += init_key[j] + i;
        psfmt32[at_lag] += r;
        psfmt32[idxof(i)] = r;
        i = (i + 1) % size;
    }
    /* pass 2: keep stirring when the key is shorter than the state */
    for (; j < count; j++) {
        at_mid = idxof((i + mid) % size);
        at_lag = idxof((i + mid + lag) % size);
        at_prev = idxof((i + size - 1) % size);
        r = ini_func1(psfmt32[idxof(i)] ^ psfmt32[at_mid]
                      ^ psfmt32[at_prev]);
        psfmt32[at_mid] += r;
        r += i;
        psfmt32[at_lag] += r;
        psfmt32[idxof(i)] = r;
        i = (i + 1) % size;
    }
    /* pass 3: one full round over the state with the second mixer */
    for (j = 0; j < size; j++) {
        at_mid = idxof((i + mid) % size);
        at_lag = idxof((i + mid + lag) % size);
        at_prev = idxof((i + size - 1) % size);
        r = ini_func2(psfmt32[idxof(i)] + psfmt32[at_mid]
                      + psfmt32[at_prev]);
        psfmt32[at_mid] ^= r;
        r -= i;
        psfmt32[at_lag] ^= r;
        psfmt32[idxof(i)] = r;
        i = (i + 1) % size;
    }

    initial_mask(rng);
    period_certification(rng);
    rng->idx = RNG_N64;
#if defined(HAVE_SSE2)
    setup_const();
#endif
}
#if defined(__INTEL_COMPILER)
#  pragma warning(default:981)
#endif
