/////////////////////////////////////////////////////////////////////////////////////
//                                                                      
//                  I N T E L   P R O P R I E T A R Y                   
//                                                                      
//     COPYRIGHT (c)  2001 BY  INTEL  CORPORATION.  ALL RIGHTS          
//     RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//     BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//     RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//     LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//     MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//     THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                      
//                        INTEL  CORPORATION                            
//                                                                     
//                     2200 MISSION COLLEGE BLVD                        
//                                                                      
//               SANTA  CLARA,  CALIFORNIA  95052-8119                  
//                                                                      
/////////////////////////////////////////////////////////////////////////////////////
//
// Macros for Ingress QM message handling 
//
/////////////////////////////////////////////////////////////////////////////////////
//
//
// The QM sends messages on a NN ring. Each message is two words 
//
// First word is the enqueue message
//
//						--	Bit 30 	   : Enqueue Transition
//						--  Bits 9..0  : Enqueue Queue Id 
//
// Second word is the dequeue message
//
//						-- Bit 30		: Dequeue Transition
//						-- Bit 29		: Invalid Dequeue 
//						-- Bits 9..0	: Dequeue queue Id 
//
// For each beat, in the worst case there may be both an enqueue transition
// AND a dequeue transition.
//
//
/////////////////////////////////////////////////////////////////////////////////////

#ifndef __CSIX_SCHEDULER_QM_UC__
#define __CSIX_SCHEDULER_QM_UC__

/////////////////////////////////////////////////////////////////////////////////////

// Bit number for detecting enqueue/dequeue transition messages 

#define QM_TRANSITION_BIT           	30

// Bit number for invalid dequeues

#define QM_INVALID_DEQUEUE_BIT			29

/////////////////////////////////////////////////////////////////////////////////////
//
// Get the queue group number from the Queue Manager Message, bits 5..9
//
// Outputs: group_number	(0..31)
// Inputs:  message
//
// Size: 1 instruction
//
/////////////////////////////////////////////////////////////////////////////////////

#macro _scheduler_qm_get_group_number(group_number, message)

	alu_shf[group_number, 0x1f, AND, message, >>5]

#endm

/////////////////////////////////////////////////////////////////////////////////////
//
// Get the queue number from the Queue Manager Message, bits 0..4
//
// Outputs: queue number (0..31)
// Inputs: message
//
// Size: 1 instruction
//
/////////////////////////////////////////////////////////////////////////////////////

#macro _scheduler_qm_get_queue_number(queue_number, message)

	alu[queue_number, 0x1f, AND, message]

#endm

/////////////////////////////////////////////////////////////////////////////////////
//  
//  Get the local memory offset for the queue group number in the Queue Manager 
//  message.
// 
// 
//  Outputs:	group_lm_offset
//  Inputs:     message
//              group_lm_mask   (mask applied to message >> 1)
//
//  Size:       1 instruction
//
/////////////////////////////////////////////////////////////////////////////////////

#macro _scheduler_qm_get_group_lm_offset(group_lm_offset, message, group_lm_mask)

 	// The group number is bits 5..9. Each entry is 16 bytes (left shift of 4). 
	// Combining the two gives us a right shift of 1 and a mask of 0x1f0 for 
	// zeroing out bits 0..3

	alu_shf[group_lm_offset, group_lm_mask, AND, message, >>1]


#endm

/////////////////////////////////////////////////////////////////////////////////////
// 
//  Get the local memory offset for a specific queue data structure
//
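//  Outputs:	queue_lm_offset
//  Inputs:     message
//              queue_number          (accepted but not referenced by the macro body)
//              queue_lm_base         (local memory base added to the offset)
//              queue_lm_offset_mask  (mask applied to message << 1)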
//
//  Size:       2 instructions
//
/////////////////////////////////////////////////////////////////////////////////////

#macro _scheduler_qm_get_queue_lm_offset(queue_lm_offset, message, queue_number, \
										 queue_lm_base,queue_lm_offset_mask)

	// compute the queue local memory offset. We use the queue id (0..1023) and
	// multiply it by size of each entry (2 bytes) to compute the local memory 
	// offset. queue id is in first 10 bits. We left shift by 1 for 2 byte multiply
	// and that puts the lm offset in the first 11 bits

	alu_shf[queue_lm_offset, queue_lm_offset_mask, AND, message, <<1]

	// Add the local memory base

	alu[queue_lm_offset, queue_lm_offset, +, queue_lm_base] 	

#endm


/////////////////////////////////////////////////////////////////////////////////////
//
// Handle an enq transition message
//
// Inputs: 	message:		Enq Transition message
//			group_lm_mask	mask for local memory structure per group
//
//
// Size:	10 instructions 
//
/////////////////////////////////////////////////////////////////////////////////////


#macro _scheduler_qm_handle_enqueue_transition(message, group_lm_mask)
.begin

	.reg 	group_lm_offset
	.reg	queue_number
	.reg	queue_bitmask
	.reg	group_number
	.reg	group_bitmask

	// compute queue group local memory offset

	_scheduler_qm_get_group_lm_offset(group_lm_offset, message, group_lm_mask)

	// Set up port offset index into local memory. 3 cycle latency for this to take
	// effect. 

	localmem_set_address(0, group_lm_offset, LM_HANDLE_0)

	// compute the queue number (0..31) within a group

	_scheduler_qm_get_queue_number(queue_number, message)

	// compute 1 << queue_number

	alu[--, queue_number, OR, 0]
	alu_shf[queue_bitmask, -- , B, 1, <<indirect]

	// compute the queue group number from 0..31

	_scheduler_qm_get_group_number(group_number, message)

	// compute 1 << group number. Cant use global in al_shf instruction

	alu[--, group_number, OR, 0]
	alu_shf[group_bitmask, -- , B, 1, <<indirect]

	// Set the bit for this queue in the queue empty vector.for the group 

	alu[*l$index0[LM_GROUP_EMPTY_VECTOR_INDEX], \
		*l$index0[LM_GROUP_EMPTY_VECTOR_INDEX] , OR , queue_bitmask]

	// Also if the bit for this group was 0 in the parent vector, we need to set it.
	// Rather than check for the bit being 0, we will simply set the bit	
	
	alu[ @root_empty_vector, @root_empty_vector, OR, group_bitmask]

.end		
#endm

/////////////////////////////////////////////////////////////////////////////////////
//
// Handle Dequeue Transition
//
// Inputs:
// 
//		message:			Dequeue Transition Message 
//		group_lm_mask:		Group local memory mask 
//		
// Constants:
//
//		EXIT_LABEL:			Label branched to after the voluntary context swap
//
// 13 instructions in worst case
//
/////////////////////////////////////////////////////////////////////////////////////

#macro _scheduler_qm_handle_dequeue_transition(message, group_lm_mask, EXIT_LABEL]

.begin

	.reg group_lm_offset
	.reg queue_number
	.reg queue_bitmask
	.reg group_number
	.reg group_bitmask

	// compute queue group local memory offset

	_scheduler_qm_get_group_lm_offset(group_lm_offset, message, group_lm_mask)

	// Set up group offset index into local memory. 3 cycle latency for this to take
	// effect. 

	localmem_set_address(0, group_lm_offset, LM_HANDLE_0)

	// compute the queue number (0..31) within a group

	_scheduler_qm_get_queue_number(queue_number, message)

	// compute 1 << queueNumber

	alu[--, queue_number, OR, 0]
	alu_shf[queue_bitmask, -- , B, 1, <<indirect]

	// clear the bit in the empty vector

	alu[*l$index0[LM_GROUP_EMPTY_VECTOR_INDEX], \
		*l$index0[LM_GROUP_EMPTY_VECTOR_INDEX], AND~, queue_bitmask]

	// if the bit vector is now zero then clear the bit in the parent
	// otherwise exit using the exit label

	bne[SWAP_OUT#] 
	
	// compute the queue group number from 0..31

	_scheduler_qm_get_group_number(group_number, message)

	// Use the defer slots to compute ~(1 << groupNumber)

	alu[--, group_number, OR, 0]
	alu_shf[group_bitmask, --, B, 1, <<indirect]

	ctx_arb[voluntary] , br [EXIT_LABEL] , defer[1]

	// Now clear the bit in the global empty vector 

	alu[@root_empty_vector, @root_empty_vector, AND~, group_bitmask];  defer 1



SWAP_OUT#:

	ctx_arb[voluntary] , br[EXIT_LABEL]

.end
#endm

///////////////////////////////////////////////////////////////////////////////////// 
//
// 	drain_nn_ring: Remove any junk data that may be present in the nn ring.
//
//	Problem: When using Core Components (xscale) the application fails to work
//	when the system is "powered ON" for the first time. (it works fine for subsequent
//	"reset").
//
//	This is caused by initialisation sequence in CC where there is considerable delay
//	between loading Microcode in "each" microengine. For some reason this causes the 
//	next neighbour ring between QM and scheduler to be non-empty (junk data) causing 
//	scheduler to read this data. Depending on this junk data, scheduler may go thro'
//	"invalid dequeue" path where it decrements "packets_scheduled", which was initialised
// 	to 0, which makes it negative. This leads to packets_in_flight crossing its threshold 
//	preventing scheduler from transmitting any packets.
//
//	Until the problem is resolved in CC (or resource manager) the workaround is to 
//	simply drain the nn ring of any junk data.
//
//	Note: This problem doesn't arise when downloading this app using Workbench or when
//	running in simulation mode.
/////////////////////////////////////////////////////////////////////////////////////

#macro drain_nn_ring[]
.begin

#ifdef	USE_IMPORT_VAR

	//	This fix is only when using Core Components on IXDP2400.

loop#:

	br_inp_state[NN_EMPTY, end#]

	alu[--, --, B, *n$index++]			; drain nn one LW at a time
	ctx_arb[voluntary]					; this helps drain the nn completely.

	br[loop#]

end#:

#endif	//	USE_IMPORT_VAR

.end
#endm


		
///////////////////////////////////////////////////////////////////////////////////// 
//
// QM Message Handler. This loops infinitely running in a thread of its own
//
// Instruction estimate: 
//
//						 Worst case is 28 cycles through the loop
//						
// 
/////////////////////////////////////////////////////////////////////////////////////

#macro _scheduler_qm_message_handler()

.begin
			
	// local variables for this thread 

	.reg  	message					// QM  message read from ring
	.reg	group_lm_mask			// Mask to compute local memory offset for group

	// Set this up in a register so we dont waste cycles computing it 

	immed32[group_lm_mask, 0x1f0]

	drain_nn_ring[]					// Workaround: See the macro for details

QM_SWAP_OUT#:

	// Swap out. The first time the thread runs it will swap out immediately

	ctx_arb[voluntary]

	// First Read the QM message from a scratch or NN ring. In this case we 
	// use a NN ring

CHECK_NN_EMPTY#:

	// Check if the ring is empty. If it is not, then branch to swapping out

	br_inp_state[NN_EMPTY, QM_SWAP_OUT#]

	// Read the message

	alu[message, --, B, *n$index++] 

	// Check if it is an enqueue transition.   

	br_bclr[message, QM_TRANSITION_BIT, CHECK_DEQUEUE_TRANSITION#]

	// Handle the enqueue transition 

	_scheduler_qm_handle_enqueue_transition(message, group_lm_mask)

CHECK_DEQUEUE_TRANSITION#:

	// Read the message

	alu[message, --, B, *n$index++] 

	// Check if it is a dequeue transition.   

	br_bclr[message, QM_TRANSITION_BIT, CHECK_INVALID_DEQUEUE#]

	// This will branch to QM_SWAP_OUT#. If we get here an invalid
	// dequeue cannot occur since dequeue transition was detected

	_scheduler_qm_handle_dequeue_transition(message, group_lm_mask, CHECK_NN_EMPTY#)

CHECK_INVALID_DEQUEUE#:

	// Check if it is an invalid dequeue. 

	br_bclr[message, QM_INVALID_DEQUEUE_BIT, QM_SWAP_OUT#]

	// Swap out and go to checking if ring is empty

	ctx_arb[voluntary] , br [CHECK_NN_EMPTY#] , defer[2]

		// If we get here then an invalid dequeue occured. decrement packets 
		// scheduled

		alu[@packets_scheduled, @packets_scheduled, -, 1]

		// decrement packets in flight

		alu[@packets_in_flight, @packets_in_flight, -, 1]

.end
#endm
	
/////////////////////////////////////////////////////////////////////////////////////
	
#endif 	// __CSIX_SCHEDULER_QM_UC__

/////////////////////////////////////////////////////////////////////////////////////