/////////////////////////////////////////////////////////////////////////////////////
//                                                                      
//                  I N T E L   P R O P R I E T A R Y                   
//                                                                      
//     COPYRIGHT (c)  2001 BY  INTEL  CORPORATION.  ALL RIGHTS          
//     RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//     BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//     RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//     LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//     MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//     THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                      
//                        INTEL  CORPORATION                            
//                                                                     
//                     2200 MISSION COLLEGE BLVD                        
//                                                                      
//               SANTA  CLARA,  CALIFORNIA  95052-8119                  
//                                                                      
/////////////////////////////////////////////////////////////////////////////////////
//
// 		File Name: scheduler_packet.uc
// 
// 		Purpose:	  DRR scheduler for Egress Sausalito
//
/////////////////////////////////////////////////////////////////////////////////////
//
// 		History:
//
// 		Date			Comment										By
//		---------------------------------------------------------------------------
//
//		03/17/02		Created										Uday Naik
//
/////////////////////////////////////////////////////////////////////////////////////

#ifndef __SCHEDULER_EGRESS_DRR_UC__
#define __SCHEDULER_EGRESS_DRR_UC__

/////////////////////////////////////////////////////////////////////////////////////
// 
// This scheduler runs on the egress Sausalito. It is a frame or packet based 
// scheduler. It implements WRR scheduling on the ports and DRR on the queues
// within a port. Currently 16 ports and 6 classes per port are supported
//
// 
// The following are the issues with implementing DRR under the IXP2400 environment
// 
//      1)  The packet size is not available for a queue when it is being scheduled.
//          Once the dequeue is issued, the packet size is received N beats later 
//          where each beat is 88 cycles.
//
//      2)  The bit vector with information on which queues have data may not be
//          current. This could mean that a dequeue is issued on a queue that has
//          no data
//
//      3)  The scheduler schedules a packet every beat (88 cycles). This means that
//          for large packets, the scheduler is running faster than the transmit
//
//      4)  The exact packet size is not available. The packet size is in multiples 
//			of CHUNK_SIZE which is MTU/128. The queue credit increments are also 
//			given in CHUNK_SIZE units.
//
//      5)  During a DRR round, a queue may go empty or come alive. While a queue is
//          empty it should not receive credit. But queues that frequently go empty
//          should not affect the bandwidth allocation.
//
// The above issues are worked around with modifications to DRR as follows
//    
//      1) We use a scheme of negative credits. The criteria for a queue to be 
//		   eligible to send is that it has data, flow control is off on the port and 
//		   the credits for the queue are positive. A packet is transmitted from a 
//		   queue if it meets the above criteria. Once the packet length is received, 
//		   (N beats later), the packet length is decremented from the current credit 
//		   of the queue. When the current credit of the queue goes negative, it can 
//		   no longer transmit. When all the queues on a port go negative, one DRR 
//		   round is over. Each queue gets another round of credit at this point. 
//		   To ensure that all the queues are schedulable with one round of credit, 
//		   we need to keep the minimum quantum for a queue as (N *  MTU)/CHUNK_SIZE.
//
//      2) If a dequeue is issued on a queue that has no data, then the QM returns 
//		   the packet size as 0. This is treated as a special case. The scheduler 
//		   will run slightly faster than the QM to allow it to make up for lost slots.
//
//      3) If the queue between TX and QM gets full due to large packets or because 
//		   the scheduler is running slightly faster, then the QM will not dequeue the
//		   packet and instead return a 0 for the packet size.
//
//      4) The algorithm will round robin among ports first and queues next, i.e. if 
//         queue i of port j is scheduled, the next queue scheduled will be queue k 
//		   of port j + 1. When the scheduler comes back to port j, the next queue 
//		   scheduled in port j will be queue i+1. This increases the probability that 
//		   the packet length is back by the time the queue is scheduled again.
//  
//      5) While a queue is empty or flow control is on, its credit remains 
//		   untouched. If a queue comes alive in the middle of a round, it is allowed 
//		   to participate right away with the available credit. 
//         
//  
/////////////////////////////////////////////////////////////////////////////////////
// 
// Absolute register
// 
// PortEmptyVector	: Port Empty Bit Vector  : 1 word (NUMBER_OF_PORTS bits are used)
//                    If bit is set then port has data
//
/////////////////////////////////////////////////////////////////////////////////////

.reg @port_empty_vector			; Absolute GPR shared across threads; per the note above, a set bit means the port has data

////////////////////////////////////////////////////////////////////////////////////

// include stdmac.uc in IXPblocks Portable library

#include <stdmac.uc>

// include localmem.uc in IXPblocks Portable library for read/write local memory

#include <localmem.uc>

// include file system.h for system constants

#include "dl_system.h"

// header file with constants for algorithm

#include "scheduler_packet.h"

// include the queue manager message handling code 

#include "scheduler_qm.uc"


/////////////////////////////////////////////////////////////////////////////////////
// 
// _scheduler_get_credit_increment()
//
// Description:
// 	
//		Get the DRR credit quantum for a given queue. This should be read from SRAM  
//		control block. In simulation we will simply use a fixed number equal to the minimum 
//		credit increment
//
// Outputs: 
//
//		out_credit_increment:	DRR credit quantum for a specific queue
//
// Inputs:  
//
//		in_queue_id:			queue_id
//
// Size: 
//
//		1 instruction
// 
//
/////////////////////////////////////////////////////////////////////////////////////
	
#macro _scheduler_get_credit_increment(out_credit_increment, in_queue_id)

// Returns the DRR credit quantum for queue in_queue_id in out_credit_increment.
//
// With USE_IMPORT_VAR defined, the quantum is read from the SRAM control block at
// SCHED_WEIGHT_CREDIT_BASE and the context is swapped out until the read is done.
// Otherwise the fixed value MIN_CREDIT_INCREMENT is returned and in_queue_id is
// not used.

#ifdef USE_IMPORT_VAR
.begin
  .reg sram_block_base, offset, $credit0, queue_offset
  .sig sram_sig


  // Base address of the weight/credit control block in SRAM

  immed32(sram_block_base, SCHED_WEIGHT_CREDIT_BASE)

  // The first NUMBER_OF_PORTS longwords of the block hold the per-port weights,
  // so the per-queue entries start NUMBER_OF_PORTS * 4 bytes into the block

  alu[queue_offset,--,b, NUMBER_OF_PORTS, <<2]

  // Byte offset of this queue's entry within the per-queue area (4 bytes each)

  alu[offset, --,b, in_queue_id, <<2]

  // Final byte offset of the queue's entry from the start of the block

  alu[offset, offset, +, queue_offset]

  // Read one longword and swap out until the data has arrived

  sram[read, $credit0, sram_block_base, offset, 1], ctx_swap[sram_sig]

  // $credit0 now holds the credit quantum for the queue

  alu[out_credit_increment, --,b, $credit0]


.end

#else

// In simulation mode every queue gets the same fixed credit increment

#define_eval	MIN_CREDIT_INCREMENT  ((MTU * 8) / (1 << BITS_FOR_PACKET_LENGTH))

  // Write through the macro's output parameter so the macro works no matter what
  // the caller's register is called

  immed32(out_credit_increment, MIN_CREDIT_INCREMENT)

#endif

#endm

/////////////////////////////////////////////////////////////////////////////////////
// 
// _scheduler_get_port_weight()
//
// Description:
// 	
//		 Get the WRR weight for a given port. This should be read from a SRAM control 
//		 block when run in hardware and is a fixed value in simulation.
//
// Outputs: 
//
//		weight:					WRR weight in number of packets
//
// Inputs:  
//
//		port:					port_number			
//
// Size: 
//
//		1 instruction
// 
//
/////////////////////////////////////////////////////////////////////////////////////
	
#macro _scheduler_get_port_weight(out_weight, in_port)

// Returns the WRR weight for port in_port in out_weight.
//
// With USE_IMPORT_VAR defined, the weight is read from the SRAM control block at
// SCHED_WEIGHT_CREDIT_BASE (one longword per port, indexed by port number) and the
// context is swapped out until the read is done. Otherwise every port gets a
// weight of 1 and in_port is not used.

#ifdef USE_IMPORT_VAR
.begin
  .reg sram_block_base, offset, $credit0
  .sig sram_sig


  // Base address of the weight/credit control block in SRAM

  immed32(sram_block_base, SCHED_WEIGHT_CREDIT_BASE)

  // Byte offset of this port's weight: 4 bytes per port

  alu[offset, --,b, in_port, <<2]

  // Read one longword and swap out until the data has arrived

  sram[read, $credit0, sram_block_base, offset, 1], ctx_swap[sram_sig]

  // $credit0 now holds the weight for the port

  alu[out_weight, --,b, $credit0]


.end

#else

   // In simulation we will set this up as 1 for every port

  immed[out_weight, 1]

#endif

#endm

/////////////////////////////////////////////////////////////////////////////////////
// 
// _scheduler_init_scratch_ring[]
//
// Description:
// 	
//		 Initialize the scratch ring between the scheduler and the QM
//
// Outputs: 
//								None
//
// Inputs:  
//								None				
//
// Constants
//
//		RBASE: 					Base address of scratch ring. Should be 4 byte 
//								aligned
//
// 		RSIZE: 					Size of scratch ring in words. Valid values are 
//								128, 256, 512, 1024
//
// 		RING:  					Ring number (0..15)
//
// Size:     
//								10 instruction
// 
//
/////////////////////////////////////////////////////////////////////////////////////

#macro	_scheduler_init_scratch_ring(RBASE, RSIZE, RING)

.begin

	// One completion signal per CSR write, one write-transfer register per
	// value written, plus a GPR to stage the ring base address.

	.sig 	base_written, head_written, tail_written
	.reg	$ring_base_val, $ring_head_val, $ring_tail_val, base_addr

	// Evaluate the arguments up front so that callers may put spaces between
	// the parameters, e.g. init[a, b, c], and still get clean token pasting.

	#define_eval RN		RING
	#define_eval RS		RSIZE
	#define_eval RB		RBASE

	// Ring base register value: base address combined with the size encoding
	// RING_SIZE_<RSIZE> placed in bits [31:30].

	immed[base_addr, RB]
	alu_shf[$ring_base_val, base_addr, or, RING_SIZE_/**/RS, <<30]

	// Head and tail both start at 0, i.e. the ring starts out empty.

	immed[$ring_head_val, 0x0]
	immed[$ring_tail_val, 0x0]

	// Program base (with size), head and tail of scratch ring <RING>.
	//
	// At most 4 commands can be queued to an external unit (sram, dram, cap,
	// ...) across all MEs before the issuing ME stalls; staying under that
	// limit is the programmer's responsibility. This is the only thread and ME
	// issuing commands at this point, so three back-to-back writes are safe.

	cap[write, $ring_base_val, SCRATCH_RING_BASE_/**/RN], sig_done[base_written]
	cap[write, $ring_head_val, SCRATCH_RING_HEAD_/**/RN], sig_done[head_written]
	cap[write, $ring_tail_val, SCRATCH_RING_TAIL_/**/RN], sig_done[tail_written]

	// Swap out until all three writes have completed.

	ctx_arb[base_written, head_written, tail_written]

#undef RN
#undef RS
#undef RB

.end

#endm

/////////////////////////////////////////////////////////////////////////////////////
// 
// _scheduler_init_rings()
//
// Description:
// 	
//		 Initialize the scratch ring and NN rings
//
// Outputs: 
//								None
//
// Inputs:  
//								None				
//
// Constants
//								None
//
// Size: 
//								12 instruction
// 
//
/////////////////////////////////////////////////////////////////////////////////////

#macro _scheduler_init_rings()

.begin

	.reg 	ctx_enables_value

	// The scratch ring to the QM is only set up here for unit tests.

#ifdef UNIT_TEST

	_scheduler_init_scratch_ring(RING_BASE, RING_SIZE, RING_ID)

#endif

	// Program the CTX_ENABLES local CSR for next-neighbor ring use:
	//
	//   bit  20      NN_MODE       = 0 : next neighbor registers are written
	//                                    by the previous ME
	//   bits [19:18] NN_RING_EMPTY = 0 : NN_EMPTY asserts when
	//                                    NN_PUT == NN_GET (default)
	//   bits [15:8]                    : enables for contexts 0 to 7 (all set)

	immed32(ctx_enables_value, 0xFF00)
	local_csr_wr[CTX_ENABLES, ctx_enables_value]

	// Start with both next-neighbor ring indices at 0.

	local_csr_wr[nn_put, 0]
	local_csr_wr[nn_get, 0]

.end

#endm

/////////////////////////////////////////////////////////////////////////////////////
// 
// scheduler_init()
//
// Description:
// 	
//		 Initialize the DRR scheduler
//
//
// Outputs:				
//
//		port_credit_vector		Current WRR credit vector for port
//		port_init_credit_vector Initial WRR credit vector for port
//		ring					Scratch ring for deq requests
//		port_mask				Mask used to round robin among ports			
//		minus_two				A constant 0xfffffffe for computing next port mask	
// 
//
/////////////////////////////////////////////////////////////////////////////////////

#macro scheduler_init(port_credit_vector, port_init_credit_vector, ring, port_mask,\
					 minus_two)

// One-time set-up for the scheduler thread. Loads the caller's registers with
// their initial values, clears @port_empty_vector, builds the per-port and
// per-queue tables in local memory through LM handle 0, and finally initializes
// the rings.
//
// Local memory layout produced here, starting at the address set below:
//
//   per port (8 longwords, written/skipped in this order):
//       schedule vector, queue mask, queue empty vector, packets scheduled,
//       current port weight, port weight quantum, 2 longwords stepped over
//   then per queue (2 longwords): credit increment, current credit
//
// NOTE(review): schedule() adds PORT_LM_BASE_OFFSET to the port offset when
// PORT_LM_BASE_NOT_ZERO is defined, while this macro always starts at offset 0 -
// confirm the two agree for builds that define PORT_LM_BASE_NOT_ZERO.

.begin
	
	.reg	i
	.reg 	credit_increment
	.reg 	weight
	.reg	q_mask

	// ((1 << number_of_queues_per_port) - 1)

	immed32(q_mask, QUEUE_MASK)

	// This is a constant 0xfffffffe which is used to compute the next port mask. 
	// This is stored in a register since it takes two instructions to compute

	immed32(minus_two, 0xfffffffe)

	// Both the working and the initial port credit vectors start from the same
	// constant

	immed32(port_init_credit_vector, PORT_INITIAL_CREDIT_VECTOR)
	immed32(port_credit_vector, PORT_INITIAL_CREDIT_VECTOR)

	// Id of the scratch ring to talk to the Queue Manager, shifted left by 2

	alu_shf[ring, --, b, RING_ID, <<2]

	// This port mask is used to control the round robin scheduling among ports
	// Initialize it to all 1's

	alu[port_mask, --, ~B, 0]

	// Initialize the port empty vector to 0. All ports are initially empty

	immed[@port_empty_vector,0x0] 

	// Set the base address for ports in local memory. The original author notes
	// a 3 cycle latency before the handle is usable; the nops and the loop
	// set-up below fill that gap

	localmem_set_address(0, 0, LM_HANDLE_0)

	nop
	nop

	// For each port set up the configuration

	immed[i, 0]

	.while (i < NUMBER_OF_PORTS)		

		// Set schedule vector and queue mask to q_mask. Every queue has enough 
		// credits

		alu[*l$index0++, --, B, q_mask] ; write schedule vector
		alu[*l$index0++, --, B, q_mask] ; write queue mask

		// Clear the queue empty vector for this port

		alu[*l$index0++, --, B, 0] 			; QueueEmpty Vector 

		// Set the packets scheduled to be  0

		alu[*l$index0++, --, B, 0] 			; packets scheduled 

		// Fetch the WRR weight for port i (may swap the context out when the
		// weight is read from SRAM)

		_scheduler_get_port_weight(weight, i)		

		alu[*l$index0++, --, B, weight] 	; current port weight for WRR among ports
		alu[*l$index0++, --, B, weight]		; port weight quantum for WRR
		
		// Two reads with post-increment and no destination: they only step the
		// local memory pointer over the last 2 longwords of the port entry

		alu[--, --, B, *l$index0++]
		alu[--, --, B, *l$index0++]

		// increment i

		alu[i, i, +, 1]

	.endw
	
	// initialize the loop variable
	
	immed[i, 0] 

	// For each queue set up the current credit and quantum. The local memory
	// pointer carries on from the end of the port entries

	.while (i < (NUMBER_OF_PORTS * NUMBER_OF_QUEUES_PER_PORT))
		
		_scheduler_get_credit_increment(credit_increment, i)
		
		// Set the credit increment and the current credit

		alu[*l$index0++, --, B, credit_increment] ; credit increment
		alu[*l$index0++, --, B, credit_increment] ; current credit 

		// increment i

		alu[i, i, +, 1]

	.endw

	// Set up the registers for next neighbor and scratch rings

	_scheduler_init_rings()

.end

#endm

/////////////////////////////////////////////////////////////////////////////////////
// 
// schedule[]
//
// Description:
// 	
//		Schedule a packet. When we exit this macro, we will always have scheduled a 
//		packet.  If no data is available on any port, we will loop inside this same 
//		macro. This will ensure that every time we execute this macro we will 
//		execute at least 37 instructions. This is important for reuse of the xfer 
//		register
//
//
// Outputs/inputs:				
//
//		credit_vector			Current WRR credit vector for port
//
//	
// Inputs:  
//		
//		init_credit_vector 		Initial WRR credit vector for port
//		ring					Scratch ring for deq requests
//		port_mask				Mask used to round robin among ports			
//		minus_two				A constant 0xfffffffe used to compute next port mask
//		deq_message				xfer register for deq message
//		deq_signal				signal for scratch put
//		wait_signal				signal to ctx arb on 				
//
// Constants
//								None
//
// Size: 
//								55 cycles
// 
//
/////////////////////////////////////////////////////////////////////////////////////


#macro schedule(out_credit_vector, in_init_credit_vector, in_ring, in_port_mask, \
				in_minus_two, in_deq_message, DEQ_SIGNAL, WAIT_SIGNAL)

// Pick one port (round robin with WRR credit) and one queue on that port, put a
// dequeue request for it on the scratch ring and wait on WAIT_SIGNAL. The macro
// only falls out at SCHEDULE_EXIT# after a request has been sent; every other
// path swaps out and starts again from SCHEDULE_PORT#.
//
// Besides out_credit_vector, the macro also rewrites in_port_mask (next port
// mask) and in_deq_message (the request word), and updates the selected port's
// entry in local memory through LM handle 0.

.begin

	.reg 	port_number
	.reg 	queue_number
	.reg 	temp
	.reg 	temp2
	.reg 	port_lm_offset
    .reg 	xfer_byte_offset
	.reg 	port_credit_quantum
	.reg    sched_vector
	.reg 	packets_in_flight
#ifdef PORT_LM_BASE_NOT_ZERO
    .reg    port_lm_base
#endif

SCHEDULE_PORT#:

	// check if any port has data 

	alu[--, @port_empty_vector, -, 0]

	// if not swap out 

	beq[SWAP_OUT#]
	
	// AND the empty vector and credit vector: ports with both data and credit

	alu[temp, @port_empty_vector, AND, out_credit_vector]

	// if at least one port has data and credit, go pick it

	bne [DO_FFS#]

	// No port has both: reset the credit vector to the init vector

	alu[out_credit_vector, --, B, in_init_credit_vector]

	// AND the empty vector and new credit vector again

	alu[temp, @port_empty_vector, AND, out_credit_vector]


DO_FFS#:

	// AND the round robin port mask in

	alu[temp2, temp, AND, in_port_mask]

	// find the next port with data to send and with credit 

	ffs[port_number, temp2]

	// a bit was found under the mask: use that port

	bne[ADJUST_PORT_CREDIT#] 

	// If we get here then mask needs to be reset to all 1's. So we simply compute
	// port number from temp & mask which is the same as temp since mask is all 1's.
	// mask is recomputed to the correct value later using the port number. 

	// At this point, temp HAS to have some bit set. So we wont check for 0.

	ffs[port_number, temp]


ADJUST_PORT_CREDIT#:

	// Compute the offset into local memory for this port and set the local memory 
	// handle. Each port data structure is 32 bytes. This operation has a 3 cycle 
	// latency before the local memory is usable 

	alu_shf[port_lm_offset, --, B, port_number, <<5] 
#ifdef PORT_LM_BASE_NOT_ZERO
	immed[port_lm_base, PORT_LM_BASE_OFFSET]
	alu[port_lm_offset, port_lm_offset, +, port_lm_base]
#endif
	localmem_set_address(0, port_lm_offset , LM_HANDLE_0)
		
	// While the local memory handle settles, compute the transfer register
	// offset for this port's packets transmitted count. Also in bytes
	// (4 bytes per port)

	alu_shf[xfer_byte_offset, --, B, port_number, <<2]

	// Set the indirect read for xfer register holding packet transmit count. This
	// has a 2 cycle latency

	local_csr_wr[T_INDEX, xfer_byte_offset]

	// 3 cycle latency

	nop

	// Decrement the credit. If the port does not get selected because of flow
	// control reasons then it loses this turn and no port is scheduled for the
	// beat

	alu[*l$index0[PORT_CURRENT_CREDIT_INDEX], *l$index0[PORT_CURRENT_CREDIT_INDEX],\
		 -, 1]

	// credit still positive: nothing more to do for the port credit

	bgt[CHECK_IN_FLIGHT_COUNT#] 

	// Credit is used up. Set port number for indirect operation

	alu[--, port_number, OR, 0]
	
	// clear this port's bit in the port credit vector. 

	alu_shf[out_credit_vector, out_credit_vector, AND~, 1, <<indirect]

	// AND with the port empty vector 

	alu[--, out_credit_vector, AND, @port_empty_vector]
	
	// If some port still has data and credit the round is not over. The two
	// deferred instructions below run whether or not the branch is taken

	bne[CHECK_IN_FLIGHT_COUNT#] , defer [2]
		
	// defer 1 - read in the port credit quantum 

	alu[port_credit_quantum, --, B, *l$index0[PORT_WEIGHT_INDEX]]
		
	// defer 2 - increment the credit for the port by quantum

	alu[*l$index0[PORT_CURRENT_CREDIT_INDEX], *l$index0[PORT_CURRENT_CREDIT_INDEX], \
			+, port_credit_quantum]

	// Round is over: set the credit vector to init vector

	alu[out_credit_vector, --, B, in_init_credit_vector]


CHECK_IN_FLIGHT_COUNT#:

	// Compute number of packets in flight: packets scheduled for the port minus
	// the count read through the indexed transfer register set up above

	alu[packets_in_flight, *l$index0[PORT_PACKETS_SCHEDULED_INDEX], -, *$$index]

	// If packets in flight has reached MAX_IN_FLIGHT (MAX_IN_FLIGHT minus
	// packets in flight is zero or negative), swap out.

	alu[--, MAX_IN_FLIGHT, -, packets_in_flight]
	

	ble[SWAP_OUT#] , defer [2]

	// defer 1 and 2 - compute the next port mask: in_minus_two shifted left by
	// port_number. These run on both paths, so the mask moves past this port
	// even when we swap out

	alu[--, port_number, OR, 0]
	alu_shf[in_port_mask, --, B, in_minus_two, <<indirect]	

SCHEDULE_QUEUE#:

	// If we get here we have scheduled a port. Now we schedule a queue on the port

	// Read the port parameters. Assume that the local memory offset has already 
	// been set up.
			
	alu[temp, --, B, *l$index0[PORT_QUEUE_EMPTY_VECTOR_INDEX]] 

	// AND the schedule and empty vectors. Empty vector must be nonzero or
	// we would not have selected the port

	alu[temp,  temp, AND, *l$index0[PORT_SCHEDULE_VECTOR_INDEX]]

	// if this is nonzero some queue is eligible; otherwise reset the schedule
	// vector below and pick straight from the empty vector

	bne [DO_FFS_FOR_QUEUE#]

	// reset the schedule vector to all 1

	alu[*l$index0[PORT_SCHEDULE_VECTOR_INDEX], --, ~B, 0]

	// set temp back to the port's queue empty vector 

	alu[temp, --, B, *l$index0[PORT_QUEUE_EMPTY_VECTOR_INDEX]] 


DO_FFS_FOR_QUEUE#:
		
	// AND the round robin queue mask in

	alu[temp2, *l$index0[PORT_CURRENT_QUEUE_MASK_INDEX], AND, temp]

	// find the eligible queue

	ffs[queue_number, temp2]

	// a bit was found under the mask: use that queue

	bne[CHECK_RING_FULL#] 

	// If we get here, then ffs failed, so no eligible queue is left under the
	// queue mask. Recompute ffs straight from temp, ignoring the mask; the mask
	// is rewritten from the chosen queue number before we exit. The original
	// author notes that a queue is always found at this point

	ffs[queue_number, temp]


CHECK_RING_FULL#:

	// check if the ring is full; if not, go send. The two deferred instructions
	// build the message on every pass through this label

	br_!inp_state[RING_FULL_VALUE, SEND_DEQUEUE#] , defer[2] 

	// defer 1 - combine port and queue: port_number shifted left by
	// NUMBER_OF_BITS_FOR_QUEUE, ORed with queue_number

	alu_shf[temp, queue_number, OR, port_number,  <<NUMBER_OF_BITS_FOR_QUEUE]

	// defer 2 - set bit 31 and copy into the transfer register

	alu_shf[in_deq_message, temp, OR, 1, <<31]

	// Ring is full: give up the processor, then check again

	ctx_arb[voluntary] , br[CHECK_RING_FULL#]

SEND_DEQUEUE#:
	
	// write the data on the scratch ring. DEQ_SIGNAL is raised on completion

	scratch[put, in_deq_message, 0, in_ring, 1]  , sig_done[DEQ_SIGNAL]

	// calculate the new queue mask: in_minus_two shifted left by queue_number

	alu[--, queue_number, OR, 0]
	alu_shf[temp, --, B, in_minus_two, <<indirect]

	// If we get here then we scheduled a queue. We wait on WAIT_SIGNAL and then
	// exit the macro; the two deferred instructions finish the bookkeeping

	ctx_arb[WAIT_SIGNAL] , br [SCHEDULE_EXIT#] , defer[2]
		
	// defer 1 - write out the new queue mask

	alu[*l$index0[PORT_CURRENT_QUEUE_MASK_INDEX], --, B, temp]

	// defer 2 - Increment packets scheduled for this port

	alu[*l$index0[PORT_PACKETS_SCHEDULED_INDEX], *l$index0[PORT_PACKETS_SCHEDULED_INDEX], +, 1]
	

SWAP_OUT#:

	// If we get here, we did not send a packet for some reason or the other. So
	// we swap out and go back to the top of the loop 

	ctx_arb[voluntary] , br [SCHEDULE_PORT#] 
	

SCHEDULE_EXIT#:

	// Exit this macro 

.end

#endm

#ifndef SCHEDULER_DISPATCH_LOOP_EXCLUDED
/////////////////////////////////////////////////////////////////////////////////////
//
// main() : Code execution starts here
//
/////////////////////////////////////////////////////////////////////////////////////

// Signal number used by thread 0 to tell the scheduler thread that the
// transfer registers have been initialized. prev_thread_signal is pinned to
// this number with .addr below.

#define INTER_THREAD_SIGNAL 	8

.sig volatile	prev_thread_signal
.addr 			prev_thread_signal		INTER_THREAD_SIGNAL

// If thread number is 0, go and initialize the reflector write transfer registers

br=ctx[0, INIT_REFLECTOR_WRITE#]

// If thread number is 1, go to the scheduler routine. 

br=ctx[ 1, SCHEDULER_THREAD#]

// If thread number is 2, go to the Queue Manager Message Handler routine

br=ctx[ 2, QM_HANDLER_THREAD#]

// All other threads should simply abort

ctx_arb[kill]

/////////////////////////////////////////////////////////////////////////////////////	

// The xfer registers 0..15 will be used as a target for a reflector write by 
// the transmit microengine. They will contain the count of the packets transmitted 
// and will be accessed in absolute indexed mode. Since xfer registers 0..15 belong 
// to thread 0 in relative mode, this thread will initialize them to 0 by reading in 
// 0's into these registers initially. It will signal the scheduler thread once done

/////////////////////////////////////////////////////////////////////////////////////

INIT_REFLECTOR_WRITE#:

// Thread 0 only: zero the 16 transfer registers that hold the per-port packets
// transmitted counts by reading zeros into them from SRAM, then signal the
// scheduler thread and terminate.

.begin

	.reg visible $$txd_p0, $$txd_p1, $$txd_p2, $$txd_p3, $$txd_p4, $$txd_p5, $$txd_p6, \
				 $$txd_p7, $$txd_p8, $$txd_p9, $$txd_p10, $$txd_p11, $$txd_p12, \
				 $$txd_p13, $$txd_p14,$$txd_p15
	
	.xfer_order	 $$txd_p0, $$txd_p1, $$txd_p2, $$txd_p3, $$txd_p4, $$txd_p5, $$txd_p6, \
				 $$txd_p7, $$txd_p8, $$txd_p9, $$txd_p10, $$txd_p11, $$txd_p12, \
				 $$txd_p13, $$txd_p14, $$txd_p15

	.sig 		 read_signal1, read_signal2
	.reg 		 offset

	// NOTE(review): this value is overwritten by the immed32 just below;
	// presumably the instruction is only here to reference the register
	// block - confirm before removing it

	alu[offset, --, B, &$$txd_p0]

	// set up offset to point to a location in SRAM where the zeros are stored.
	// The two reads below fetch 16 consecutive longwords (64 bytes) from there

	immed32[offset, SRAM_ZERO_BLOCK]

	// read in the zeros, 8 longwords per read: the first read fills
	// $$txd_p0..p7, the second (32 bytes further on) fills $$txd_p8..p15

	sram[read, $$txd_p0, offset, 0,  8], sig_done[read_signal1]
	sram[read, $$txd_p8, offset, 32, 8], sig_done[read_signal2]

	// wait for the IO to complete. It is safe to let other threads run now

	ctx_arb[read_signal1, read_signal2]

	// signal the scheduler thread. The signal number is taken from
	// INTER_THREAD_SIGNAL so that it always matches the .addr binding of
	// prev_thread_signal, which the scheduler thread waits on.
	// NOTE(review): 0x80 presumably selects "next context" in SAME_ME_SIGNAL -
	// confirm against the CSR definition

	#define_eval INTER_THREAD_SIGNAL_REG_VALUE (0x80 | (INTER_THREAD_SIGNAL << 3))

	local_csr_wr[SAME_ME_SIGNAL, INTER_THREAD_SIGNAL_REG_VALUE]

	// give the CSR write time to take effect before this thread is killed

	nop
	nop

	ctx_arb[kill]

.end

/////////////////////////////////////////////////////////////////////////////////////

SCHEDULER_THREAD#:

.begin
	
	// Scheduler thread (thread 1)
	//
	// Four dequeue transfer registers and signals are used in rotation. Each
	// schedule[] call sends its request with one register/signal pair and then
	// waits on the signal of the pair that the NEXT call is going to reuse, so
	// a register is never rewritten before its previous scratch put is done.

	.reg	port_credit_vector
	.reg	port_init_credit_vector
	.reg 	port_mask
	.reg 	minus_two
	.reg    ring
	.reg	$deq1, $deq2, $deq3, $deq4
	.sig	deq_signal1, deq_signal2, deq_signal3, deq_signal4

	// Wait for a signal from thread 0

	ctx_arb[prev_thread_signal]

	// One-time initialization of registers, local memory and rings

	scheduler_init[port_credit_vector, port_init_credit_vector, ring, port_mask, \
				   minus_two]

SCHEDULE#:
	
	// Start-up: the first three calls have no earlier put to wait for on the
	// next register, so they just swap out voluntarily

	schedule[port_credit_vector, port_init_credit_vector, ring, port_mask, \
			 minus_two, $deq1, deq_signal1, voluntary]

	schedule[port_credit_vector, port_init_credit_vector, ring, port_mask, \
			 minus_two, $deq2, deq_signal2, voluntary]

	schedule[port_credit_vector, port_init_credit_vector, ring, port_mask, \
			 minus_two, $deq3, deq_signal3, voluntary]

	// The fourth call waits for the first put, since $deq1 is reused next

	schedule[port_credit_vector, port_init_credit_vector, ring, port_mask, \
			 minus_two, $deq4, deq_signal4, deq_signal1]


SCHEDULE_LOOP#:

	// Steady state, repeated forever: send with pair N, wait on pair N+1

	schedule[port_credit_vector, port_init_credit_vector, ring, port_mask, \
			 minus_two, $deq1, deq_signal1, deq_signal2]

	schedule[port_credit_vector, port_init_credit_vector, ring, port_mask, \
	 		 minus_two, $deq2, deq_signal2, deq_signal3]

	schedule[port_credit_vector, port_init_credit_vector, ring, port_mask, \
			 minus_two, $deq3, deq_signal3, deq_signal4]

	schedule[port_credit_vector, port_init_credit_vector, ring, port_mask, \
			 minus_two, $deq4, deq_signal4, deq_signal1]

	br [SCHEDULE_LOOP#]

.end

/////////////////////////////////////////////////////////////////////////////////////

QM_HANDLER_THREAD#:

	// Thread 2: execute in an infinite loop handling QM messages. The handler
	// macro is expected to come from scheduler_qm.uc, included above, and is
	// not expected to return - nothing follows it in this file

	_scheduler_qm_message_handler[]

#endif

/////////////////////////////////////////////////////////////////////////////////////

#endif   // __SCHEDULER_EGRESS_DRR_UC__

/////////////////////////////////////////////////////////////////////////////////////