/* Fast transpose routines for distributed two-dimensional arrays,
   based on the CMU direct deposit communication model.

   Transposes arrays with distributed rows in C, distributed columns in 
   Fortran on T3D systems up to 1024 nodes, configured as tori up to 16x8x8.
   
          P:  Number of processors; must be a power of two.
          N:  Size of the matrix area to transpose; must be a power of two
              and greater than or equal to P.
          NM: Row/column size of the data structure; must be an integer.

   Alpha Release of routines used internally for performance evaluations.

   Parallel Systems Group of Carnegie Mellon University, June 1, 1995
   Author: Thomas M. Stricker
   Bug reports to: tomstr@cs.cmu.edu
 */

/* Complex number represented as a pair of doubles.
 * r and i presumably hold the real and imaginary parts, in that order.
 */
typedef struct {
    double r;
    double i;
} doublecomplex;

int tp_lb();
/* int tp_lb(x)
 *    int x;
 *
 * Auxiliary function: computes the integer binary logarithm of x.
 * Returns the logarithm base 2, or -1 if the argument is not a power
 * of two.
 * (Iterative, unoptimized.)
 *
 * Note: declared without a parameter list, so the compiler does not
 * check the argument at call sites; the intended signature is the one
 * shown at the top of this comment.
 */

int tp_transpose_init();
/* int tp_transpose_init()
 *
 * Initialization call; must be called at the start of the program
 * to initialize the communication schedules.
 *
 */

int tp_transpose_mode();
/* int tp_transpose_mode(mode)
 *     int mode;
 *
 * Optional call, not needed for normal operation.
 * Forces a particular communication schedule, selected by one of the
 * mode constants below:
 */
#define tp_auto 0          /* Default; picks best problem size (presumably: best schedule for the problem size) */
#define tp_plain 1         /* Uses a plain schedule without synchronization */
#define tp_controlled 2    /* Uses congestion-controlled optimal schedule */
#define tp_random 3        /* Uses a random schedule */
 
int tp_transpose_complex();
/* int tp_transpose_complex(b,a,LN,NM)
 *     doublecomplex b[],a[];
 *     int LN,NM;
 *
 * The matrix size is specified as a binary logarithm, such that N = 2^LN.
 * The lower array dimension is specified as the integer NM.
 *
 * for 0 <= i,j < N:
 *   b[i][j] = a[j][i]
 *
 * Transposes the content of an N by N matrix area stored in an a[][NM]
 * data structure of complex numbers (each represented as a pair of 64-bit
 * double floats). b[][] and a[][] are assumed to be disjoint; the routine
 * does not work in place.
 *
 * Unlike N, the array dimension NM does not have to be a power of two
 * and is often chosen to be relatively prime for better cache alignment.
 *
 * Returns 0 for success, -1 for parameter errors.
 *
 * Note: declared without a parameter list, so the compiler does not
 * check the arguments at call sites; the intended signature is the one
 * shown at the top of this comment.
 */

int tp_transpose_double();
/* int tp_transpose_double(b,a,LN,NM)
 *     double b[],a[];
 *     int LN,NM;
 *
 * Same parameters, function and restrictions as the complex routine
 * tp_transpose_complex, but operating on arrays of double.
 *
 */
 

















