/*
* Copyright (c) 1992 Carnegie Mellon University 
*                    SCAL project: Guy Blelloch, Siddhartha Chatterjee,
*                                  Jonathan Hardwick, Jay Sipelstein,
*                                  Marco Zagha
* All Rights Reserved.
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* The SCAL project requests users of this software to return to 
*
*  Guy Blelloch				guy.blelloch@cs.cmu.edu
*  School of Computer Science
*  Carnegie Mellon University
*  5000 Forbes Ave.
*  Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie Mellon
* the rights to redistribute these changes.
*/

#include "defins.h"
#include <cvl.h>

/* -----------------Unsegmented Scans----------------------------------*/

/* Unsegmented scan template with an explicit starting value.
 *	d = destination vector
 *	s = source vector
 *	in = starting value of the running result (usually the identity)
 *	len = length of the vectors
 *	scratch = accepted but not referenced by the generated body
 *
 * Each pass stores the current running result into the next slot of d and
 * then combines the matching element of s into the running result, so d
 * holds the results "before" each element.  The function returns the
 * running result left after the last pass, i.e. `in` combined with every
 * processed element of s.
 *
 * Implementation notes:
 *  - d and s are copied into typed local pointers so that no arithmetic
 *    is done on vec_p values.
 *  - d may be the same vector as s: the source element is copied into a
 *    temporary before the destination slot is written.
 *  - _func only ever receives two plain variables, never an expression
 *    with side effects.
 *  - unroll, make_no_scratch and make_inplace are defined outside this
 *    file; unroll is assumed to execute its statement list len times --
 *    TODO confirm against defins.h.
 */
#define scan(_name, _func, _type)				\
    _type _name(d, s, in, len, scratch)				\
    vec_p d, s, scratch;					\
    _type in;							\
    int len;							\
    {								\
        register _type *rd = (_type *)s;			\
        register _type *wr = (_type *)d;			\
        register _type acc = in;				\
        register _type elem;					\
        unroll(len, elem = *rd; *wr = acc; acc = _func(acc, elem); rd++; wr++;)\
        return acc;						\
    }								\
    make_no_scratch(_name)					\
    make_inplace(_name,INPLACE_TRUE)

/* Instantiations of the scan template.  Arguments are: generated function
 * name, combining operation, element type. */
scan(add_suz, plus, int)	/* add scans: int and double */
scan(add_sud, plus, double)

scan(min_suz, min, int)		/* min scans: int and double */
scan(min_sud, min, double)

scan(max_suz, max, int)		/* max scans: int and double */
scan(max_sud, max, double)

scan(and_sub, and, cvl_bool)	/* logical and scan */
scan(and_suz, band, int)	/* bitwise and scan */

scan(or_sub, or, cvl_bool)		/* logical or scan */
scan(or_suz, bor, int)		/* bitwise or scan */

/* ----------------- Unsegmented Simple Scans --------------------------*/

/* Unsegmented simple scan template (starting value fixed at compile time).
 *	d = destination vector
 *	s = source vector
 *	len = length of d and s
 *	scratch = accepted but not referenced by the generated body
 *
 * d and s should be vectors of the same size and type.  The running
 * result starts at _init; each pass stores the current running result
 * into the next slot of d and then combines the matching element of s
 * into it.  Nothing is returned.
 *
 * The source element is copied into a temporary before the destination
 * slot is written, so d may be the same vector as s.  Loop variables are
 * declared register.  unroll is defined outside this file and is assumed
 * to execute its statement list len times -- TODO confirm.
 */
#define simpscan(_name, _func, _type, _init)			\
    void _name(d, s, len, scratch)				\
    vec_p d, s, scratch;					\
    int len;							\
    {								\
        register _type *rd = (_type *)s;			\
        register _type *wr = (_type *)d;			\
        register _type acc = _init;				\
        register _type elem;					\
        unroll(len, elem = *rd; *wr = acc; acc = _func(acc, elem); rd++; wr++;)\
    }								\
    make_no_scratch(_name)					\
    make_inplace(_name,INPLACE_TRUE)

/* Instantiations of the simpscan template.  Arguments are: generated
 * function name, combining operation, element type, starting value. */
simpscan(add_nuz, plus, int, 0)	/* add scans */
simpscan(add_nud, plus, double, (double) 0.0)

simpscan(min_nuz, min, int, MAX_INT)	/* min scans start from the largest value */
simpscan(min_nud, min, double, MAX_DOUBLE)

simpscan(max_nuz, max, int, MIN_INT)	/* max scans start from the smallest value */
simpscan(max_nud, max, double, MIN_DOUBLE)

simpscan(and_nub, and, cvl_bool, 1)	/* logical and scan */
simpscan(and_nuz, band, int, ~0)	/* bitwise and scan, starts with all bits set */

simpscan(or_nub, or, cvl_bool, 0)	/* logical or scan */
simpscan(or_nuz, bor, int, 0)		/* bitwise or scan */



/* ----------------- Segmented Simple Scans --------------------------*/

/* Segmented simple scan template.
 *	d = destination vector
 *	s = source vector
 *	sd = segment descriptor of source, read as m int segment lengths
 *	n = number of elements in whole vector (not referenced by the body)
 *	m = number of segments
 *	scratch = accepted but not referenced by the generated body
 *
 * d and s should be vectors of the same size and type.  Segments are
 * processed front to back.  At the start of every segment the running
 * result is reset to _init; within the segment each destination element
 * receives the running result as it stood before the matching source
 * element was combined into it.
 *
 * The source element is copied into a temporary before the destination
 * slot is written, so d may be the same vector as s.  Inner-loop
 * variables are declared register.
 */
#define simpsegscan(_name, _funct, _type, _init)			\
    void _name (d, s, sd, n, m, scratch)				\
    vec_p d, s, sd, scratch;						\
    int n, m;								\
    {									\
        int *seg_len = (int *)sd;					\
        register _type *rd = (_type *)s;				\
        register _type *seg_stop = (_type *)s;				\
        register _type *wr = (_type *)d;				\
        register _type acc;						\
        register _type elem;						\
        int seg;							\
									\
        for (seg = 0; seg < m; seg++) {					\
            seg_stop += seg_len[seg];					\
            acc = (_type) _init;					\
            for (; rd < seg_stop; rd++, wr++) {				\
                elem = *rd;						\
                *wr = acc;						\
                acc = _funct(acc, elem);				\
            }								\
        }								\
    }									\
    make_no_seg_scratch(_name)						\
    make_inplace(_name,INPLACE_TRUE)

/* Instantiations of the simpsegscan template.  Arguments are: generated
 * function name, combining operation, element type, per-segment starting
 * value. */
simpsegscan(add_nez, plus, int, 0)	/* add scans */
simpsegscan(add_ned, plus, double, (double) 0.0)

simpsegscan(min_nez, min, int, MAX_INT)	/* min scans */
simpsegscan(min_ned, min, double, MAX_DOUBLE)

simpsegscan(max_nez, max, int, MIN_INT)	/* max scans */
simpsegscan(max_ned, max, double, MIN_DOUBLE)

simpsegscan(and_neb, and, cvl_bool, 1)	/* logical and scan */
simpsegscan(and_nez, band, int, ~0)	/* bitwise and scan */

simpsegscan(or_neb, or, cvl_bool, 0)	/* logical or scan */
simpsegscan(or_nez, bor, int, 0)	/* bitwise or scan */


/* ----------------Segmented Scans------------------------------------*/

/* Segmented scan template with per-segment starting values.
 *	d1 = destination vector
 *	d2 = vector receiving the final running result of each segment
 *	s = source vector
 *	in = vector of starting values, one per segment
 *	sd = segment descriptor of source, read as m int segment lengths
 *	n = number of elements in whole vector (not referenced by the body)
 *	m = number of segments
 *	scratch = accepted but not referenced by the generated body
 *
 * d1 and s should be vectors of the same size and type.  in and d2 are
 * read/written as one element of that same type per segment.
 *
 * For every segment, in order: the running result is loaded from in,
 * each destination element receives the running result as it stood
 * before the matching source element was combined into it, and the
 * running result left at the end of the segment is stored into d2.
 *
 * The source element is copied into a temporary before the destination
 * slot is written, so d1 may be the same vector as s.  The starting value
 * of a segment is read before that segment's slot of d2 is written.
 */
#define segscan(_name, _funct, _type)					\
    void _name (d1, d2, s, in, sd, n, m, scratch)			\
    vec_p d1, d2, s, in, sd, scratch;					\
    int n, m;								\
    {									\
        int *seg_len = (int *)sd;					\
        _type *seed = (_type *)in;					\
        _type *total = (_type *)d2;					\
        register _type *rd = (_type *)s;				\
        register _type *seg_stop = (_type *)s;				\
        register _type *wr = (_type *)d1;				\
        register _type acc;						\
        register _type elem;						\
        int seg;							\
									\
        for (seg = 0; seg < m; seg++) {					\
            seg_stop += seg_len[seg];					\
            acc = seed[seg];						\
            for (; rd < seg_stop; rd++, wr++) {				\
                elem = *rd;						\
                *wr = acc;						\
                acc = _funct(acc, elem);				\
            }								\
            total[seg] = acc;						\
        }								\
    }									\
    make_no_seg_scratch(_name)						\
    make_inplace(_name,INPLACE_TRUE)

/* Instantiations of the segscan template.  Arguments are: generated
 * function name, combining operation, element type. */
segscan(add_sez, plus, int)		/* add scans */
segscan(add_sed, plus, double)

segscan(min_sez, min, int)		/* min scans */
segscan(min_sed, min, double)

segscan(max_sez, max, int)		/* max scans */
segscan(max_sed, max, double)

segscan(and_seb, and, cvl_bool)		/* logical and scan */
segscan(and_sez, band, int)		/* bitwise and scan */

segscan(or_seb, or, cvl_bool)		/* logical or scan */
segscan(or_sez, bor, int)		/* bitwise or scan */


/* --------------------Reduce Functions--------------------------------*/
/* Unsegmented reduce template.
 *	s = source vector
 *	len = length of s
 *	scratch = accepted but not referenced by the generated body
 *
 * Starts from _identity, combines the elements of s into the running
 * result one after another with _funct, and returns the running result.
 * s is only read.
 *
 * unroll is defined outside this file and is assumed to execute its
 * statement list len times -- TODO confirm.
 */
#define reduce(_name, _funct, _type, _identity)         \
    _type _name(s, len, scratch)			\
    vec_p s, scratch;					\
    int len;						\
    {							\
      _type *rd = (_type *)s;				\
      _type acc = _identity;				\
      _type elem;					\
      unroll(len, elem = *rd; rd++; acc = _funct(acc, elem);)	\
      return acc;					\
    }							\
    make_no_scratch(_name)				\
    make_inplace(_name,INPLACE_TRUE)

/* Instantiations of the reduce template.  Arguments are: generated
 * function name, combining operation, element type, identity value. */
reduce(add_ruz, plus, int, 0)			/* add reduces */
reduce(add_rud, plus, double, (double) 0.0)

reduce(min_ruz, min, int, MAX_INT)		/* min reduces */
reduce(min_rud, min, double, MAX_DOUBLE)

reduce(max_ruz, max, int, MIN_INT)		/* max reduces */
reduce(max_rud, max, double, MIN_DOUBLE)

reduce(and_rub, and, cvl_bool, TRUE)		/* logical and reduce */
/* K&R (2nd ed) says that this is OK
 * NOTE(review): presumably refers to using ~0 as the all-ones identity
 * below -- confirm. */
reduce(and_ruz, band, int, (~0))		/* bitwise and reduce */

reduce(or_rub, or, cvl_bool, FALSE)		/* logical or reduce */
reduce(or_ruz, bor, int, 0)			/* bitwise or reduce */

/* ------------------Segmented Reduces ---------------------------------*/
/* Segmented reduce template.
 *	d = destination vector, receives one result per segment
 *	s = source vector
 *	sd = segment descriptor of source, read as m int segment lengths
 *	n = number of elements in whole vector (not referenced by the body)
 *	m = number of segments
 *	scratch = accepted but not referenced by the generated body
 *
 * For every segment, in order: the running result is reset to _identity,
 * every element of the segment is combined into it with _funct, and the
 * running result is stored into the next slot of d.  A segment of length
 * zero therefore produces _identity.
 *
 * d and s are copied into typed local pointers so that no arithmetic is
 * done on vec_p values; inner-loop variables are declared register.
 */
#define segreduce(_name, _funct, _type, _identity)	\
    void _name (d, s, sd, n, m, scratch)		\
    vec_p d, s, sd, scratch;				\
    int n, m; 						\
    {							\
        int *seg_len = (int *)sd;			\
        _type *result = (_type *)d;			\
        register _type *rd = (_type *)s;		\
        register _type *seg_stop = (_type *)s;		\
        register _type acc;				\
        int seg;					\
							\
        for (seg = 0; seg < m; seg++) {			\
            seg_stop += seg_len[seg];			\
            acc = _identity;				\
            for (; rd < seg_stop; rd++) {		\
                acc = _funct(acc, *rd);			\
            }						\
            result[seg] = acc;				\
        }						\
    }							\
    make_no_seg_scratch(_name)				\
    make_inplace(_name,INPLACE_TRUE)


/* Instantiations of the segreduce template.  Arguments are: generated
 * function name, combining operation, element type, identity value. */
segreduce(add_rez, plus, int, 0)		/* add reduces */
segreduce(add_red, plus, double, (double) 0.0)

segreduce(min_rez, min, int, MAX_INT)		/* min reduces */
segreduce(min_red, min, double, MAX_DOUBLE)

segreduce(max_rez, max, int, MIN_INT)		/* max reduces */
segreduce(max_red, max, double, MIN_DOUBLE)

segreduce(and_reb, and, cvl_bool, TRUE)		/* logical and reduce */
segreduce(and_rez, band, int, ~0)		/* bitwise and reduce */

segreduce(or_reb, or, cvl_bool, FALSE)		/* logical or reduce */
segreduce(or_rez, bor, int, 0)			/* bitwise or reduce */

/* -------------------Extract-------------------------------------*/

/* Extract template: return element i of vector V.
 *	V = source vector, read as an array of _type
 *	i = index of the element to return (no range check is made)
 *	len, scratch = accepted but not referenced by the generated body
 */
#define make_extract(_name, _type)			\
    _type _name (V, i, len, scratch)			\
	vec_p V, scratch;				\
	int i, len;					\
	{						\
	    _type *elems = (_type *)V;			\
	    return *(elems + i);			\
	}						\
	make_no_scratch(_name)				\
	make_inplace(_name,INPLACE_TRUE)

/* Extract functions for int, cvl_bool and double vectors. */
make_extract(ext_vuz, int)
make_extract(ext_vub, cvl_bool)
make_extract(ext_vud, double)

/* Segmented extract template: pick one element out of every segment.
 *	d = destination vector (unsegmented), one element per segment
 *	s = source vector (segmented, same type as d)
 *	i = index vector (unsegmented), one int per segment
 *	sd, n, m = segment descriptor for s; sd is read as m int segment
 *	           lengths, n is not referenced by the body
 *	scratch = accepted but not referenced by the generated body
 *
 * For segment k, d[k] receives the element found i[k] positions past the
 * start of segment k in s.  No range check is made on the indices.
 */
#define make_seg_ext(_name, _type)			\
    void _name (d, s, i, sd, n, m, scratch)		\
    vec_p d, s, i, sd, scratch;				\
    int n, m;						\
    {							\
        int *pick = (int *)i;				\
        int *seg_len = (int *)sd;			\
        _type *out = (_type *)d;			\
        _type *seg_base = (_type *)s;			\
        int seg;					\
							\
        for (seg = 0; seg < m; seg++) {			\
            out[seg] = seg_base[pick[seg]];		\
            seg_base += seg_len[seg];			\
        }						\
    }							\
    make_no_seg_scratch(_name)				\
    make_inplace(_name,INPLACE_TRUE)

/* Segmented extract functions for int, cvl_bool and double vectors. */
make_seg_ext(ext_vez, int)
make_seg_ext(ext_veb, cvl_bool)
make_seg_ext(ext_ved, double)

/* ------------------Replace-------------------------------------*/

/* Replace template: overwrite element i of vector V.
 *	V = vector to modify, treated as an array of _type
 *	i = index of the element to overwrite (no range check is made)
 *	val = new value; it is passed through _funct before being stored
 *	len, scratch = accepted but not referenced by the generated body
 */
#define make_replace(_name, _type, _funct)		\
	void _name(V, i, val, len, scratch)		\
	vec_p V, scratch;				\
	int i, len;					\
	_type val;					\
	{						\
	    _type *elems = (_type *)V;			\
	    *(elems + i) = _funct(val);			\
	}						\
  make_no_scratch(_name)				\
  make_inplace(_name,INPLACE_TRUE)

/* Replace functions for int, cvl_bool and double vectors.  The third
 * argument is applied to the new value before it is stored.
 * NOTE(review): ident and notnot are defined elsewhere; notnot presumably
 * normalizes the value to a canonical boolean -- confirm. */
make_replace(rep_vuz, int, ident)
make_replace(rep_vub, cvl_bool, notnot)
make_replace(rep_vud, double, ident)

/* Segmented replace template: overwrite one element in every segment.
 *	d = destination vector  (segmented)
 *	s = index vector	(unsegmented, one int per segment of d)
 *	v = value vector	(unsegmented, one element per segment of d)
 *	sd, n, m = segment descriptor for d; sd is read as m int segment
 *	           lengths, n is not referenced by the body
 *	scratch = accepted but not referenced by the generated body
 *
 * For segment k, the element s[k] positions past the start of segment k
 * in d is overwritten with v[k].  No range check is made on the indices.
 */
#define make_seg_replace(_name, _type)		\
    void _name(d, s, v, sd, n, m, scratch)	\
    vec_p d, s, v, sd, scratch;			\
    int n,m;					\
    {						\
        int *slot = (int *)s;			\
        int *seg_len = (int *)sd;		\
        _type *new_val = (_type *)v;		\
        _type *seg_base = (_type *)d;		\
        int seg;				\
						\
        for (seg = 0; seg < m; seg++) {		\
            seg_base[slot[seg]] = new_val[seg];	\
            seg_base += seg_len[seg];		\
        }					\
    }						\
    make_no_seg_scratch(_name)			\
    make_inplace(_name,INPLACE_TRUE)

/* Segmented replace functions for int, cvl_bool and double vectors. */
make_seg_replace(rep_vez, int)
make_seg_replace(rep_veb, cvl_bool)
make_seg_replace(rep_ved, double)

/* ----------------Distribute-----------------------------------*/

/* Distribute template: store the scalar v into successive elements of d.
 *	d = destination vector, written as an array of _type
 *	v = value to store
 *	len = number of elements to fill
 *	scratch = accepted but not referenced by the generated body
 *
 * unroll is defined outside this file and is assumed to execute its
 * statement list len times -- TODO confirm.
 */
#define make_distribute(_name, _type)		\
    void _name(d, v, len, scratch)		\
    vec_p d, scratch;				\
    _type v;					\
    int len;					\
    {						\
        register _type *wr = (_type *)d;	\
        unroll(len, *wr = v; wr++;)		\
    }						\
    make_no_scratch(_name)			\
    make_inplace(_name,INPLACE_TRUE)

/* Distribute functions for int, cvl_bool and double vectors. */
make_distribute(dis_vuz, int)
make_distribute(dis_vub, cvl_bool)
make_distribute(dis_vud, double)

/* Segmented distribute template: fill every segment with its own value.
 *  d = destination vector (segmented)
 *  v = value vector (unsegmented), same type as d, one element per segment
 *  sd, n, m = segment descriptor for d; sd is read as m int segment
 *             lengths, n is not referenced by the body
 *  scratch = accepted but not referenced by the generated body
 *
 * For segment k, v[k] is read once and then stored into every element of
 * segment k of d.
 */
#define make_seg_distribute(_name, _type)	\
    void _name(d, v, sd, n, m, scratch) 	\
    vec_p d, v, sd, scratch;			\
    int n, m;					\
    {						\
        int *seg_len = (int *)sd;		\
        _type *values = (_type *)v;		\
        _type *wr = (_type *)d;			\
        _type *seg_stop = (_type *)d;		\
        _type fill;				\
        int seg;				\
						\
        for (seg = 0; seg < m; seg++) {		\
            fill = values[seg];			\
            seg_stop += seg_len[seg];		\
            for (; wr < seg_stop; wr++) {	\
                *wr = fill;			\
            }					\
        }					\
    }						\
    make_no_seg_scratch(_name)			\
    make_inplace(_name,INPLACE_TRUE)

/* Segmented distribute functions for int, cvl_bool and double vectors. */
make_seg_distribute(dis_vez, int)
make_seg_distribute(dis_veb, cvl_bool)
make_seg_distribute(dis_ved, double)


/* --------------Permute---------------------------------------*/

/* Simple permute template: scatter s into d.
 *	d = destination vector
 *	s = source vector, same type as d
 *	i = index vector, read as int
 *	len = length of vectors
 *	scratch = accepted but not referenced by the generated body
 *
 * Element k of s is stored into d[i[k]].  No range check is made on the
 * indices.  The template is registered with INPLACE_FALSE.
 *
 * unroll is defined outside this file and is assumed to execute its
 * statement list len times -- TODO confirm.
 */
#define make_smpper(_name, _type)			\
    void _name(d, s, i, len, scratch)			\
    vec_p d, s, i, scratch;				\
    int len;						\
    {							\
        register int *slot = (int *)i;			\
        register _type *rd = (_type *)s;		\
        register _type *out = (_type *)d;		\
        unroll(len, out[*slot] = *rd; slot++; rd++;)	\
    }							\
    make_no_scratch(_name)				\
    make_inplace(_name,INPLACE_FALSE)

/* Simple permute functions for int, cvl_bool and double vectors. */
make_smpper(smp_puz, int)
make_smpper(smp_pub, cvl_bool)
make_smpper(smp_pud, double)

/* Segmented simple permute template: scatter within each segment.
 *  d = destination vector (segmented)
 *  s = source vector (segmented), same type as d
 *  i = index vector (segmented), read as int
 *  sd, n, m = segment descriptor; sd is read as m int segment lengths,
 *             n is not referenced by the body
 *  scratch = accepted but not referenced by the generated body
 *
 * Source elements and indices are consumed in lockstep.  Each index is
 * taken relative to the start of the current segment of d; the segment
 * start in d advances by the segment length once the segment is done.
 * No range check is made on the indices.  The template is registered
 * with INPLACE_FALSE.
 */
#define make_seg_smpper(_name, _type)			\
    void _name(d, s, i, sd, n, m, scratch)		\
    vec_p d, s, i, sd, scratch;				\
    int n, m;						\
    {							\
        int *seg_len = (int *)sd;			\
        int *slot = (int *)i;				\
        _type *rd = (_type *)s;				\
        _type *seg_stop = (_type *)s;			\
        _type *seg_base = (_type *)d;			\
        int seg;					\
							\
        for (seg = 0; seg < m; seg++) {			\
            seg_stop += seg_len[seg];			\
            for (; rd < seg_stop; rd++, slot++) {	\
                seg_base[*slot] = *rd;			\
            }						\
            seg_base += seg_len[seg];			\
        }						\
    }							\
    make_no_seg_scratch(_name)				\
    make_inplace(_name,INPLACE_FALSE)

/* Segmented simple permute functions for int, cvl_bool and double vectors. */
make_seg_smpper(smp_pez, int)
make_seg_smpper(smp_peb, cvl_bool)
make_seg_smpper(smp_ped, double)

/*----------------------Back Permute-------------------*/
/* Back permute template: gather from s into d.
 *	d = destination vector
 *	s = source vector, same type as d
 *	i = index vector, read as int
 *	s_len = length of s (not referenced by the body)
 *	d_len = length of d and i
 *	scratch = accepted but not referenced by the generated body
 *
 * Element k of d receives s[i[k]].  No range check is made on the
 * indices.  The template is registered with INPLACE_FALSE.
 *
 * unroll is defined outside this file and is assumed to execute its
 * statement list d_len times -- TODO confirm.
 */
#define make_bckper(_name, _type)			\
	void _name(d, s, i, s_len, d_len, scratch)	\
	vec_p d, s, i, scratch;				\
	int s_len, d_len;				\
	{						\
	    register int *pick = (int *)i;		\
	    register _type *from = (_type *)s;		\
	    register _type *wr = (_type *)d;		\
	    unroll(d_len, *wr = from[*pick]; wr++; pick++;) 	\
	}						\
	make_no2_scratch(_name)				\
	make_inplace(_name,INPLACE_FALSE)

/* Back permute functions for int, cvl_bool and double vectors. */
make_bckper(bck_puz, int)
make_bckper(bck_pub, cvl_bool)
make_bckper(bck_pud, double)

/* Segmented back permute template: gather within each segment.
 *  d = destination vector (segmented)
 *  s = source vector (segmented), same type as d
 *  i = index vector (compatible with d), read as int
 *  sd_s, n_s, m_s = source segment descriptor
 *  sd_d, n_d, m_d = dest segment descriptor
 *  scratch = accepted but not referenced by the generated body
 *
 * sd_s and sd_d are read as int segment lengths.  The loop runs over the
 * m_d destination segments; n_s, m_s and n_d are not referenced by the
 * body.  Destination elements and indices are consumed in lockstep.  Each
 * index is taken relative to the start of the current segment of s; the
 * segment start in s advances by the source segment length once the
 * destination segment is done.  No range check is made on the indices.
 * The template is registered with INPLACE_FALSE.
 */
#define make_seg_bckper(_name, _type)		\
    void _name(d, s, i, sd_s, n_s, m_s, sd_d, n_d, m_d, scratch)	\
    vec_p d, s, i, sd_s, sd_d, scratch;		\
    int n_s, m_s, n_d, m_d;			\
    {						\
        int *dst_len = (int *)sd_d;		\
        int *src_len = (int *)sd_s;		\
        int *pick = (int *)i;			\
        int *pick_stop = (int *)i;		\
        _type *seg_base = (_type *)s;		\
        _type *wr = (_type *)d;			\
        int seg;				\
						\
        for (seg = 0; seg < m_d; seg++) {	\
            pick_stop += dst_len[seg];		\
            for (; pick < pick_stop; pick++, wr++) {	\
                *wr = seg_base[*pick];		\
            }					\
            seg_base += src_len[seg];		\
        }					\
    }						\
    make_no_seg2_scratch(_name)			\
    make_inplace(_name,INPLACE_FALSE)

make_seg_bckper(bck_pez, int)
make_seg_bckper(bck_peb, cvl_bool)
make_seg_bckper(bck_ped, double)
