#include <stdio.h>
#include <stdlib.h>
#include <omp.h> 
#include <stdbool.h>

#include "cycletimer.h"

/* Use of OMP to sum a large array of numbers */

/* Number of elements in the global input array `data`; also the length
 * handed to every summation function by run_test. */
#define DSIZE (1000L*1000L)


/* Timed repetitions per measurement in run_test: NRUN for ordinary
 * functions, NSRUN for those flagged as slow. */
#define NRUN 100
#define NSRUN 1

/* Input values, filled in by init_data */
double data[DSIZE];

/* Dimension (rows and columns) of the accum scratch matrix.  accum is
 * indexed by OpenMP thread number, so TMAX bounds the number of threads
 * the accumulator-based variants can support. */
#define TMAX 32


/* Per-thread accumulator slots.  Elements of row 0 (accum[0][t]) are
 * adjacent in memory; elements of column 0 (accum[t][0]) are TMAX doubles
 * apart.
 * NOTE(review): presumably volatile so each += really goes to memory --
 * confirm. */
volatile double accum[TMAX][TMAX];

/* Common signature of all summation variants: (array, element count) -> sum */
typedef double (*sum_fun_t)(double*, int);

/* Fill every element of the global `data` array with either +1.0 or -1.0,
 * selected by the low-order bit of successive random() results. */
static void init_data() {
    long idx = 0;
    while (idx < DSIZE) {
        /* Low bit 0 maps to -1.0, low bit 1 maps to +1.0 */
        long bit = random() & 0x1;
        data[idx] = bit * 2.0 - 1.0;
        idx++;
    }
}

/*
 * Baseline: sequential sum of d[0] .. d[n-1], added in index order.
 * Returns 0.0 when n <= 0.
 */
double sum_array(double *d, int n) {
    double total = 0.0;
    int k = 0;
    while (k < n) {
        total += d[k];
        k++;
    }
    return total;
}

/*
 * Use OMP synchronization: the loop is split statically across threads and
 * every single addition into the shared total is wrapped in a critical
 * section.
 * d: values to sum; n: number of elements.  Returns the sum.
 */
double sum_array_omp_critical(double *d, int n) {
    double total = 0.0;
#pragma omp parallel for schedule(static)
    for (int k = 0; k < n; k++) {
        /* Load the element before entering the critical section */
        const double term = d[k];
#pragma omp critical
        {
            total += term;
        }
    }
    return total;
}

/*
 * Use GCC atomic operation: each element is added to a shared integer
 * total with the __sync_fetch_and_add builtin.
 *
 * The builtin only handles integral operands, so every element is
 * converted to long before it is added (any fractional part is dropped).
 * That is exact for the +1/-1 test data.
 * d: values to sum; n: number of elements.  Returns the total as a double.
 */
double sum_array_omp_atomic(double *d, int n) {
    long total = 0;
#pragma omp parallel for schedule(static)
    for (int k = 0; k < n; k++) {
        const long term = d[k];
        __sync_fetch_and_add(&total, term);
    }
    return (double) total;
}

/*
 * Use OMP reduction: the reduction(+) clause gives each thread a private
 * copy of the total and combines the copies when the loop finishes.
 * d: values to sum; n: number of elements.  Returns the sum.
 */
double sum_array_omp_reduce(double *d, int n) {
    double total = 0.0;
#pragma omp parallel for schedule(static) reduction (+:total)
    for (int k = 0; k < n; k++)
        total += d[k];
    return total;
}

/*
 * Use local accumulator: row of accum.
 *
 * Each thread adds its share of d[0..n-1] into its own slot accum[0][myt];
 * the slots are then added up sequentially.  The slots are neighbouring
 * elements of a single row of accum, i.e. adjacent in memory.
 *
 * The team size is capped at the number of slots in a row, so a machine
 * offering more threads than slots cannot index outside the row.
 *
 * d: values to sum; n: number of elements.  Returns the sum.
 */
double sum_array_omp_row(double *d, int n) {
    int i, t;
    /* Number of per-thread slots available in one row of accum */
    const int nslot = (int) (sizeof accum[0] / sizeof accum[0][0]);
    int nthread = omp_get_max_threads();
    double sum = 0.0;
    if (nthread > nslot)
        nthread = nslot;
    for (t = 0; t < nthread; t++)
        accum[0][t] = 0.0;
    /* num_threads keeps every omp_get_thread_num() result below nthread */
#pragma omp parallel for schedule(static) num_threads(nthread)
    for (i = 0; i < n ; i++) {
        double val = d[i];
        int myt = omp_get_thread_num();
        accum[0][myt] += val;
    }
    for (t = 0; t < nthread; t++)
        sum += accum[0][t];
    return sum;
}

/*
 * Use local accumulator: column of accum.
 *
 * Each thread adds its share of d[0..n-1] into its own slot accum[myt][0];
 * the slots are then added up sequentially.  Consecutive slots sit in
 * different rows of accum, so they are a full row apart in memory.
 *
 * The team size is capped at the number of rows, so a machine offering
 * more threads than rows cannot index outside the array.
 *
 * d: values to sum; n: number of elements.  Returns the sum.
 */
double sum_array_omp_col(double *d, int n) {
    int i, t;
    /* Number of per-thread slots available in one column of accum */
    const int nslot = (int) (sizeof accum / sizeof accum[0]);
    double sum = 0.0;
    int nthread = omp_get_max_threads();
    if (nthread > nslot)
        nthread = nslot;
    for (t = 0; t < nthread; t++)
        accum[t][0] = 0.0;
    /* num_threads keeps every omp_get_thread_num() result below nthread */
#pragma omp parallel for schedule(static) num_threads(nthread)
    for (i = 0; i < n ; i++) {
        double val = d[i];
        int myt = omp_get_thread_num();
        accum[myt][0] += val;
    }
    for (t = 0; t < nthread; t++)
        sum += accum[t][0];
    return sum;
}

/*
 * Manage loop: split the index range by hand inside a parallel region.
 *
 * Every thread sums one contiguous chunk of d into a private partial sum
 * and stores it in its own slot of laccum; the slots are added up
 * sequentially afterwards.
 *
 * The chunk boundaries are derived from the size of the team that is
 * actually running (omp_get_num_threads), not from the requested maximum,
 * so the whole array is covered even when the runtime supplies fewer
 * threads than asked for.  Slots of threads that never ran stay at 0.0.
 *
 * d: values to sum; n: number of elements.  Returns the sum.
 */
double sum_array_omp_thread(double *d, int n) {
    double sum = 0.0;
    int t;
    int nthread = omp_get_max_threads();
    if (nthread < 1)
        nthread = 1;    /* keep the array length below positive */
    double laccum[nthread];
    for (t = 0; t < nthread; t++)
        laccum[t] = 0.0;
    /* num_threads keeps the team, and thus every myt, within laccum */
#pragma omp parallel num_threads(nthread)
    {
        int team = omp_get_num_threads();
        int myt = omp_get_thread_num();
        /* Wide arithmetic: myt * n may not fit in an int.  For the last
         * thread (myt + 1 == team) iend evaluates to exactly n. */
        long long istart = (long long) myt * n / team;
        long long iend = (long long) (myt + 1) * n / team;
        long long i;
        double psum = 0.0;
        for (i = istart; i < iend ; i++) {
            double val = d[i];
            psum += val;
        }
        laccum[myt] = psum;
    }
    for (t = 0; t < nthread; t++)
        sum += laccum[t];
    return sum;
}

/* Sink that run_test adds the result of every timed call into.
 * NOTE(review): presumably volatile so the compiler cannot discard the
 * timed calls as unused -- confirm. */
volatile double save = 0.0;


/*
 * Check one summation function against the sequential baseline, then time it.
 *
 * f:       function under test
 * fname:   label used in the error message
 * slow:    when true, time NSRUN repetitions instead of NRUN
 * nthread: thread count passed to omp_set_num_threads before f is called
 *
 * Returns performance in GFLOPS (elements summed per second, times 1e-9).
 * If f's result differs from the baseline sum, an error is written to
 * stderr and 0.0 is returned without timing anything.
 */
static double run_test(sum_fun_t f, char *fname, bool slow, int nthread) {
    /* Reference value from the sequential implementation */
    const double expected = sum_array(data, DSIZE);

    /* Correctness check with the requested thread count */
    omp_set_num_threads(nthread);
    const double observed = f(data, DSIZE);
    if (observed != expected) {
        fprintf(stderr, "Error.  %d threads.  Function %s gave sum = %.1f.  True sum = %.1f\n", nthread, fname, observed, expected);
        return 0.0;
    }

    /* Timed section; results go into `save` */
    const int reps = slow ? NSRUN : NRUN;
    const double t0 = currentSeconds();
    for (int k = 0; k < reps; k++)
        save += f(data, DSIZE);
    const double elapsed = currentSeconds() - t0;

    const double additions = (double) DSIZE * reps;
    return additions/elapsed * 1e-9;
}
 

/*
 * Table of summation variants to benchmark.
 *   name: label printed at the start of the result row
 *   fun:  implementation to run
 *   slow: true selects the shorter NSRUN repetition count in run_test
 * The entry whose fun is NULL terminates the table.
 * The sequential baseline (sum_array) is currently left out of the table.
 */
struct {
    char *name;
    sum_fun_t fun;
    bool slow;
} functions[] = {
    { .name = "OMP critical",      .fun = sum_array_omp_critical, .slow = true  },
    { .name = "GCC atomic",        .fun = sum_array_omp_atomic,   .slow = true  },
    { .name = "OMP rowwise accum", .fun = sum_array_omp_row,      .slow = false },
    { .name = "OMP colwise accum", .fun = sum_array_omp_col,      .slow = false },
    { .name = "OMP threading",     .fun = sum_array_omp_thread,   .slow = false },
    { .name = "OMP reduce",        .fun = sum_array_omp_reduce,   .slow = false },
    { .name = "",                  .fun = NULL,                   .slow = false }
};

/*
 * Print a tab-separated table: a header row with the thread counts
 * 1 .. omp_get_max_threads(), then one row per entry of `functions` with
 * the run_test result for each thread count.
 */
int main(int argc, char *argv[]) {
    const int max_threads = omp_get_max_threads();
    init_data();

    /* Header row */
    printf("Function");
    for (int nt = 1; nt <= max_threads; nt++)
        printf("\t%d", nt);
    printf("\n");

    /* One result row per table entry, up to the NULL terminator */
    for (int idx = 0; functions[idx].fun != NULL; idx++) {
        printf("%s", functions[idx].name);
        for (int nt = 1; nt <= max_threads; nt++) {
            double gflops = run_test(functions[idx].fun, functions[idx].name, functions[idx].slow, nt);
            printf("\t%.3f", gflops);
        }
        printf("\n");
    }
    return 0;
}
        
