/////////////////////////////////////////////////////////////////////////////////////////
//                                                                      
//                  I N T E L   P R O P R I E T A R Y                   
//                                                                      
//     COPYRIGHT (c)  2001 BY  INTEL  CORPORATION.  ALL RIGHTS          
//     RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//     BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//     RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//     LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//     MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//     THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                      
//                        INTEL  CORPORATION                            
//                                                                     
//                     2200 MISSION COLLEGE BLVD                        
//                                                                      
//               SANTA  CLARA,  CALIFORNIA  95052-8119 
//
/////////////////////////////////////////////////////////////////////////////////////////
// 		
//		Change History
// 		--------------
//
// Date			Description											Whom
// ---------------------------------------------------------------------------------
//
// 12/26/02    	Port Scheduler Microengine of Egress Scheduler 		urn/kt              
//                                                                  
/**********************************************************************************/


#ifndef __PORT_SCHEDULER_UC__
#define __PORT_SCHEDULER_UC__

/**********************************************************************************/

#include "scheduler_packet.h"
#include "stdmac.uc"
#include "xbuf.uc"

// Location in local memory of the Port Scheduler ME where per 
// port data structures are stored 
#define_eval    PORT_STRUCS_LM_BASE        	0
// Bit of list_active_flag that is set while the ports-with-packets list is empty
#define_eval	LIST_INACTIVE_BIT			31

	// NOTE(review): port_structs_base is not referenced in the visible part of
	// this file -- confirm it is still needed
	.reg	port_structs_base 
	// list_active_flag: LIST_INACTIVE_BIT set => ports-with-packets list is empty
	// curr_port_offset / prev_port_offset: local memory offsets of the port
	// structure currently being served and of the list entry that links to it
	.reg	list_active_flag curr_port_offset prev_port_offset
	// zero: constant 0, used as the base address operand of the scratch ring puts
	// new_round_req_valid: 0x80000000, the VALID bit of a scratch ring request
	.reg	zero new_round_req_valid
	// curr_round_mask = 0x0000ffff and next_round_mask = 0xffff0000 (loaded in
	// thread_0_init); they select the low / high half of a packed 32-bit word
	.reg	curr_round_mask next_round_mask
	.reg	sig_mask						//all signals to wait on after ctx_arb
	.reg	req_pending						//reg with REQ_PENDING_BIT set
	.reg	got_response					//reg with GOT_RESPONSE_BIT set
	.reg	port_active						//reg with PORT_ACTIVE bit set
	.reg	got_response_port_active		//reg with GOT_RESPONSE_BIT 
											//and PORT_ACTIVE bit set
	
	.reg	$scratch_ring_request			//write transfer register holding the next
											//dequeue round request sent on the scratch
											//ring when the packets left in the current
											//round fall to ROUND_LOW_WATER_MARK or below

	.reg	$init_scratch_ring_request		//write transfer register holding the first
											//dequeue round request for a port, sent from
											//process_enqueue_message
	// Address operand used by the scratch[put] instructions in this file
	#define_eval _RING_ADDR (RING_ID * 4)	
		
	// One DRAM read transfer register per port. create_dequeue_message reads the
	// register of the current port through *$$index and subtracts it from the
	// port's scheduled-packet count to obtain the packets in flight.
	.reg visible global $$txd_p0 $$txd_p1 $$txd_p2 $$txd_p3 \
						$$txd_p4 $$txd_p5 $$txd_p6 $$txd_p7 \
						$$txd_p8 $$txd_p9 $$txd_p10 $$txd_p11 \
						$$txd_p12 $$txd_p13 $$txd_p14 $$txd_p15

	// Keep the 16 registers contiguous and in port order
	.xfer_order			$$txd_p0 $$txd_p1 $$txd_p2 $$txd_p3 \
						$$txd_p4 $$txd_p5 $$txd_p6 $$txd_p7 \
						$$txd_p8 $$txd_p9 $$txd_p10 $$txd_p11 \
						$$txd_p12 $$txd_p13 $$txd_p14 $$txd_p15
#ifdef _DEBUG_COUNTERS_
	//number of enqueue messages received by PORT_SCHEDULER on the nn-ring
	.reg @port_sched_enq_in_ctr
	//number of packets announced for dequeue to PORT_SCHEDULER on the nn-ring
	.reg @port_sched_deq_in_ctr
	//number of packets PORT_SCHEDULER sent to QUEUE MANAGER on nn-ring for enqueue
	.reg @port_sched_enq_out_ctr
	//number of packets PORT_SCHEDULER sent to QUEUE MANAGER on nn-ring for dequeue
	.reg @port_sched_deq_out_ctr
	//number of polls that found the nn-ring between PORT_SCHEDULER and QUEUE MANAGER full
	.reg @port_sched_nn_ring_full
	//the last enqueue number and dequeue number sent to Queue Manager
	.reg @port_sched_last_deq_enq_num_out
	//the last SOP sent to Queue Manager
	.reg @port_sched_last_sop_out
	//the last EOP sent to Queue Manager
	.reg @port_sched_last_eop_out
#endif

	//signalled when the scratch put of $scratch_ring_request completes
	.sig	volatile next_deq_round_req_done

	//signalled when the scratch put of $init_scratch_ring_request completes
	.sig	volatile init_deq_round_req_done

///////////////////////////////////////////////////////////////////////////////
// port_scheduler_init
//	 	Description: 
//			Initialization entry point of the Port Scheduler microengine.
//			Expands to thread_0_init() and nothing else.
//
//	 	Outputs: 	none
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////

#macro port_scheduler_init()

		thread_0_init()

#endm

///////////////////////////////////////////////////////////////////////////////
// thread_0_init
//	 	Description: 
//			One-time start-up work for the Port Scheduler microengine:
//			  - clears the debug counters (only with _DEBUG_COUNTERS_)
//			  - sets CTX_ENABLES[19:18] and resets the NN ring get/put pointers
//			  - loads the constant and state registers used by the scheduler
//			  - zeroes the per-port transmit-count transfer registers and the
//			    per-port structures in local memory
//			  - swaps out until system_init_sig arrives
//
//	 	Outputs: 	none (the file-level registers are initialized)
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////

#macro thread_0_init()
.begin

	.reg ctx_en_val nn_msg_size_bits

#ifdef _DEBUG_COUNTERS_

	; start every debug counter and "last message" snapshot from zero
	immed[@port_sched_enq_in_ctr, 0]
	immed[@port_sched_enq_out_ctr, 0]
	immed[@port_sched_deq_in_ctr, 0]
	immed[@port_sched_deq_out_ctr, 0]
	immed[@port_sched_nn_ring_full, 0]
	immed[@port_sched_last_sop_out, 0]
	immed[@port_sched_last_eop_out, 0]
	immed[@port_sched_last_deq_enq_num_out, 0]
#endif

	; Read-modify-write of CTX_ENABLES. The local_csr_rd / immed pair must stay
	; back to back: it is the read idiom that leaves the CSR value in ctx_en_val.
	local_csr_rd[ctx_enables]
	immed[ctx_en_val, 0]

	; Bits [19:18] = 11: per the original author, this declares that a message
	; on the NN-ring is 4 longwords long
	immed32[nn_msg_size_bits, 0x000c0000]
	alu[ctx_en_val, ctx_en_val, OR, nn_msg_size_bits]
	local_csr_wr[ctx_enables, ctx_en_val]

	; NN ring starts out empty
	local_csr_wr[nn_get, 0]
	local_csr_wr[nn_put, 0]	

	; scheduler state: no signal pending, no port selected, list marked empty
	immed[zero, 0]
	immed[sig_mask, 0]
	immed[prev_port_offset, 0]
	immed[curr_port_offset, 0]
	alu_shf[list_active_flag, --, B, 0x1, <<LIST_INACTIVE_BIT]

	; masks and the VALID bit of a scratch ring request
	move(next_round_mask, 0xffff0000)
	move(curr_round_mask, 0xffff)
	move(new_round_req_valid, 0x80000000)

	; one register per status bit, plus the combination that is stored when a
	; response arrives for a port that is already active
	alu_shf[port_active, --, B, 1, <<PORT_ACTIVE_BIT]
	alu_shf[got_response, --, B, 1, <<GOT_RESPONSE_BIT]
	alu_shf[req_pending, --, B, 1, <<REQ_PENDING_BIT]
	alu_shf[got_response_port_active, port_active, OR, got_response]

	; zero the $$txd_p* transfer registers, then the port structures in LM
	init_packets_transmitted_xfer_regs()
	
	init_per_port_data_structs_in_lm()

	; Wait here for the signal that tells every block that system
	; initialization is complete
	ctx_arb[system_init_sig]

.end
#endm
///////////////////////////////////////////////////////////////////////////////
// init_per_port_data_structs_in_lm
//	 	Description: 
//			Walks ports 0 .. NUMBER_OF_PORTS-1 and resets each port structure
//			in local memory: both halves of the credit word are loaded with
//			PORT_CREDIT_QUANTUM, every other word is cleared.
//			Uses ACTIVE_LM_ADDR_0 and leaves it pointing at the last port.
//
//	 	Outputs: 	none
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////

#macro init_per_port_data_structs_in_lm()

.begin

		.reg port_idx port_lm_addr
		
		immed[port_idx, 0]

		.while (port_idx < NUMBER_OF_PORTS)
			; structure address = port index << PORT_STRUCTURE_SIZE
			alu_shf[port_lm_addr, --, B, port_idx, <<PORT_STRUCTURE_SIZE]
			local_csr_wr[active_lm_addr_0, port_lm_addr]
			; ACTIVE_LM_ADDR_0 is usable only 3 cycles after the CSR write
			nop
			nop
			nop

			; credit word: quantum in the upper half, current credit in the lower
			immed_w1[lm0_port_credit_quantum_and_current, PORT_CREDIT_QUANTUM]
			immed_w0[lm0_port_credit_quantum_and_current, PORT_CREDIT_QUANTUM]

			; no packets, no rounds, no status, no link
			alu[lm0_port_next_and_curr_rnd_pkts_cnt, --, B, 0]
			alu[lm0_port_next_and_curr_deq_rounds, --, B, 0]
			alu[lm0_port_req_status_link_next,  --, B, 0]
			alu[lm0_port_pkts_scheduled	, --, B, 0]
			alu[lm0_port_pkts_enqueued, --, B, 0]
			
			alu[port_idx, port_idx, +, 1]
		.endw  

.end

#endm

///////////////////////////////////////////////////////////////////////////////
// init_packets_transmitted_xfer_regs
//	 	Description: 
//			Zeroes the 16 read transfer registers $$txd_p[0..15]. According to
//			the original author these are later updated by Packet Tx through
//			reflect writes; create_dequeue_message reads them to compute the
//			packets in flight of a port.
//			The registers are loaded by an I/O read, so the macro writes zeros
//			from a temporary write buffer to SRAM_ZERO_BLOCK and reads them back,
//			8 longwords at a time.
//
//	 	Outputs: 	none ($$txd_p[0..15] are zero on exit)
//
//		Inputs:		none
//
//		NOTE(review): create_dequeue_message documents T_INDEX as using bits
//		[8:2]; here &$sr_write[0] is written unshifted. Confirm the buffer is
//		allocated at register 0 or that the assembler scales the value.
//		
///////////////////////////////////////////////////////////////////////////////
#macro init_packets_transmitted_xfer_regs()
.begin
	.reg xfer_idx zero_blk_addr

	.sig	volatile zero_xfer_done

	 xbuf_alloc($sr_write, 16, write)
	
	; tell the assembler the buffer is written (it is filled through *$index)
	.set $sr_write[0] $sr_write[1] $sr_write[2] $sr_write[3] \
		 $sr_write[4] $sr_write[5] $sr_write[6] $sr_write[7] \
		 $sr_write[8] $sr_write[9] $sr_write[10] $sr_write[11] \
		 $sr_write[12] $sr_write[13] $sr_write[14] $sr_write[15]

		; point T_INDEX at the first register of the write buffer
		local_csr_wr[T_INDEX, &$sr_write[0]]
		nop
		
		move(zero_blk_addr, SRAM_ZERO_BLOCK)

		; fill all 16 write transfer registers with zero
		immed[xfer_idx, 0]
	
		.while (xfer_idx < 16)
			immed[*$index++, 0]
			alu[xfer_idx, xfer_idx, +, 1]
		.endw

		; ports 0-7: write 8 zero longwords, then read them into $$txd_p[0..7]
		sram[write, $sr_write[0], zero_blk_addr, 0, 8], sig_done[zero_xfer_done]
		ctx_arb[zero_xfer_done]
		sram[read, $$txd_p[0], zero_blk_addr, 0, 8], sig_done[zero_xfer_done]
		ctx_arb[zero_xfer_done]

		; ports 8-15: same again through the upper half of both buffers
		sram[write, $sr_write[8], zero_blk_addr, 0, 8], sig_done[zero_xfer_done]
		ctx_arb[zero_xfer_done]		
		sram[read, $$txd_p[8], zero_blk_addr, 0, 8], sig_done[zero_xfer_done]
		ctx_arb[zero_xfer_done]

		xbuf_free($sr_write)

		.io_completed zero_xfer_done
.end
#endm
///////////////////////////////////////////////////////////////////////////////
// check_nn_ring_status
//	 	Description: 
//			Busy-wait until the NN_FULL input state is deasserted, i.e. until
//			the nn-ring has room. The loop polls without a context swap.
//			With _DEBUG_COUNTERS_ defined, @port_sched_nn_ring_full is
//			incremented once for every poll that finds the ring full.
//
//	 	Outputs: 	none
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////
#macro check_nn_ring_status()
port_sched_nn_ring_full#:

#ifdef _DEBUG_COUNTERS_

	; leave the loop as soon as the ring is not full
	br_!inp_state[NN_FULL, port_sched_nn_ring_not_full#]
	; ring is full: count this poll and test again
	alu[@port_sched_nn_ring_full, @port_sched_nn_ring_full, +, 1]
	br[port_sched_nn_ring_full#]

port_sched_nn_ring_not_full#:

#else
	; spin on this single instruction while the ring is full
	br_inp_state[NN_FULL, port_sched_nn_ring_full#]
#endif

#endm
///////////////////////////////////////////////////////////////////////////////
// write_complete_message_to_nn_ring
//	 	Description: 
//			Write one complete 3-longword message (enqueue/dequeue numbers,
//			SOP, EOP) to the nn-ring, then arm the wakeup events with sig_mask,
//			swap the context out and branch to start_port_schedule#.
//			Control does NOT fall through to the code that follows the macro.
//			The caller must make sure the nn-ring is not full.
//
//	 	Outputs: 	none
//
//		Inputs:		in_enq_port_round: port and round where a packet to be enqueued
//					in_deq_port_round: port and round where packets will be dequeued
//					in_sop_handle: SOP of packet being enqueued
//					in_eop_handle: EOP of packet being enqueued
//		
///////////////////////////////////////////////////////////////////////////////
#macro write_complete_message_to_nn_ring(in_enq_port_round, in_deq_port_round, \
						in_sop_handle, in_eop_handle)

.begin

#ifdef _DEBUG_COUNTERS_
	.begin
		.reg port_sched_tmp

		;a zero enqueue number means "no enqueue in this message": do not count it
		alu[--, --, B, in_enq_port_round]
		beq[no_enq_out#]
		alu[@port_sched_enq_out_ctr, @port_sched_enq_out_ctr, +, 1]

		;save the last message on nn-ring to GPR. In case the pipeline
		;hangs, this info is useful in order to find out what may cause the hang
		alu[@port_sched_last_sop_out, --, B, in_sop_handle]
		alu[@port_sched_last_eop_out, --, B, in_eop_handle]
	
	no_enq_out#:
		;a zero dequeue number means "no dequeue in this message": do not count it
		alu[--, --, B, in_deq_port_round]
		beq[no_deq_out#]

		alu[@port_sched_deq_out_ctr, @port_sched_deq_out_ctr, +, 1]

		;rebuild the first longword of the message: enqueue number in the
		;lower half, dequeue number in the upper half
		alu[port_sched_tmp, --, B, in_enq_port_round]
		ld_field[port_sched_tmp, 1100, in_deq_port_round, <<16]
		;save the last message on nn-ring to GPR. In case the pipeline
		;hangs, this info is useful in order to find out what may cause the hang
		alu[@port_sched_last_deq_enq_num_out, --, B, port_sched_tmp]
	no_deq_out#:

	.end
#endif

	;longword 0: enqueue / dequeue numbers
	write_enq_deq_num_to_nn_ring(in_enq_port_round, in_deq_port_round)

	;wake up on the signals collected in sig_mask
	local_csr_wr[active_ctx_wakeup_events, sig_mask]

	;wait for signal from scratch which should be done by now.
	;The two macros below each expand to a single instruction and fill the
	;two defer slots of this ctx_arb, so longwords 1 (SOP) and 2 (EOP) are
	;written before the context swaps out.
	ctx_arb[--], defer[2], br[start_port_schedule#]

	write_sop_to_nn_ring(in_sop_handle)

	write_eop_to_nn_ring(in_eop_handle)


.end

#endm
///////////////////////////////////////////////////////////////////////////////
// chk_status_and_write_to_nn_ring
//	 	Description: 
//			Wait until the nn-ring is not full (check_nn_ring_status), then
//			write the complete message (write_complete_message_to_nn_ring).
//			Like that macro, this one ends with a context swap and a branch to
//			start_port_schedule#; code placed after it is not reached by
//			fall-through.
//
//	 	Outputs: 	none
//
//		Inputs:		same four arguments as write_complete_message_to_nn_ring
///////////////////////////////////////////////////////////////////////////////
#macro chk_status_and_write_to_nn_ring(in_enq_port_round, in_deq_port_round, \
						in_sop_handle, in_eop_handle)

	check_nn_ring_status()

	write_complete_message_to_nn_ring(in_enq_port_round, in_deq_port_round, \
						in_sop_handle, in_eop_handle)

#endm

///////////////////////////////////////////////////////////////////////////////
// write_enq_deq_num_to_nn_ring
//	 	Description: 
//			This macro writes only 1 longword to the nn-ring: the enqueue
//			port/round in bits [15:0] and the dequeue port/round in bits [31:16].
//			Expands to exactly one instruction.
//			Assumption: nn-ring is not full
//
//	 	Outputs: 	none
//
//		Inputs:		in_enq_port_round: port and round where a packet to be enqueued
//										[31:16] MUST BE 0
//										[15:12] enqueue port [11:0] enqueue round
//					in_deq_port_round: port and round where packets will be dequeued
//										[15:12] dequeue port [11:0] dequeue round
///////////////////////////////////////////////////////////////////////////////

#macro write_enq_deq_num_to_nn_ring(in_enq_port_round, in_deq_port_round)

.begin

	;port and round are packed into 16 bits each because the
	;Queue Manager expects 16-bit queue numbers.
	;The OR only works if in_enq_port_round[31:16] is zero.
	alu_shf[*n$index++, in_enq_port_round, OR, in_deq_port_round, <<16]	

.end
#endm
///////////////////////////////////////////////////////////////////////////////
// write_sop_to_nn_ring
//	 	Description: 
//			This macro writes only 1 longword (SOP) to the nn-ring.
//			It expands to exactly one instruction;
//			write_complete_message_to_nn_ring relies on that because it places
//			this macro in a defer slot.
//											 
//			Assumption: nn-ring is not full
//
//	 	Outputs: 	none
//
//		Inputs:		sop: SOP handle to write
///////////////////////////////////////////////////////////////////////////////
#macro write_sop_to_nn_ring(sop)

.begin

	;copy the handle to the next nn-ring entry and advance the ring index
	alu[*n$index++, --, B, sop]
.end
#endm
///////////////////////////////////////////////////////////////////////////////
// write_eop_to_nn_ring
//	 	Description: 
//			This macro writes only 1 longword (EOP) to the nn-ring.
//			It expands to exactly one instruction;
//			write_complete_message_to_nn_ring relies on that because it places
//			this macro in a defer slot.
//											 
//			Assumption: nn-ring is not full
//
//	 	Outputs: 	none
//
//		Inputs:		eop: EOP handle to write
///////////////////////////////////////////////////////////////////////////////

#macro write_eop_to_nn_ring(eop)

.begin

;NOTE(review): this label is not referenced in the visible part of the file;
;its name suggests a marker for performance measurement -- confirm
port_sched_measure_perf#:
	;The Queue Manager expects a 3-longword message, so the EOP longword is
	;always written; create_dequeue_message passes 0 (a NULL EOP)
	alu[*n$index++, --, B, eop]	
.end
#endm

///////////////////////////////////////////////////////////////////////////////
// read_enq_msg_on_nn_ring
//	 	Description: 
//			Read the next longword from the nn-ring: the enqueue port and round.
//			Bit 31 set marks the enqueue message as invalid. With
//			_DEBUG_COUNTERS_ defined, valid messages are counted in
//			@port_sched_enq_in_ctr.
//
//	 	Outputs: 	out_enq_port_round: longword read from the ring
//
//		Inputs:		none
///////////////////////////////////////////////////////////////////////////////
#macro 	read_enq_msg_on_nn_ring(out_enq_port_round)

	alu[out_enq_port_round , --, B, *n$index++]

#ifdef _DEBUG_COUNTERS_
		;count only valid enqueue messages (bit 31 clear)
		br_bset[out_enq_port_round, 31, no_enq_in#]
		alu[@port_sched_enq_in_ctr, @port_sched_enq_in_ctr, +, 1]
	no_enq_in#:
#endif

#endm
///////////////////////////////////////////////////////////////////////////////
// read_sop_on_nn_ring
//	 	Description: 
//			Read the next longword from the nn-ring: the SOP handle of the
//			packet being enqueued.
//
//	 	Outputs: 	out_sop: longword read from the ring
//
//		Inputs:		none
///////////////////////////////////////////////////////////////////////////////
#macro 	read_sop_on_nn_ring(out_sop)

	alu[out_sop , --, B, *n$index++]

#endm
///////////////////////////////////////////////////////////////////////////////
// read_next_deq_msg_on_nn_ring
//	 	Description: 
//			Read the next longword from the nn-ring: the port and round of the
//			next dequeue round response. Callers treat bit 31 set as "invalid".
//
//	 	Outputs: 	out_next_deq_port_round: longword read from the ring
//
//		Inputs:		none
///////////////////////////////////////////////////////////////////////////////
#macro 	read_next_deq_msg_on_nn_ring(out_next_deq_port_round)

	alu_shf[out_next_deq_port_round, --, B, *n$index++]

#endm
///////////////////////////////////////////////////////////////////////////////
// read_next_deq_pkts_count_on_nn_ring
//	 	Description: 
//			Read the next longword from the nn-ring: the packet count of the
//			next dequeue round. The value is returned shifted left by 16, i.e.
//			already positioned in the upper ("next round") half of a word.
//			With _DEBUG_COUNTERS_ defined, the count is added to
//			@port_sched_deq_in_ctr.
//
//	 	Outputs: 	out_next_deq_round_pkts_count: packet count in bits [31:16]
//
//		Inputs:		none
///////////////////////////////////////////////////////////////////////////////
#macro 	read_next_deq_pkts_count_on_nn_ring(out_next_deq_round_pkts_count)

	alu_shf[out_next_deq_round_pkts_count, --, B, *n$index++, <<16]

#ifdef _DEBUG_COUNTERS_
	.begin
		.reg debug_temp
		;undo the shift to get the plain packet count for the counter
		alu_shf[debug_temp, --, B, out_next_deq_round_pkts_count, >>16]
		alu[@port_sched_deq_in_ctr, @port_sched_deq_in_ctr, +, debug_temp]
	.end
#endif

#endm
///////////////////////////////////////////////////////////////////////////////
// process_enqueue_message
//	 	Description: Upon receiving a message to enqueue a packet on a port
//			- Read three longwords from the nn-ring: the enqueue port/round,
//			  the SOP handle and the next dequeue port/round.
//			- Preload ACTIVE_LM_ADDR_0 with the enqueue port structure and
//			  ACTIVE_LM_ADDR_1 with the next dequeue port structure (the latter
//			  is consumed by process_next_dequeue_round_response).
//			- If the enqueue message is valid (bit 31 clear), increment the
//			  packets-enqueued count of the port.
//			- If the status byte of the port is 0 (no request pending, port not
//			  in the ports-with-packets list), send the initial next dequeue
//			  round request on the scratch ring and mark the request as pending.
//			
//	 	Outputs: 	none as macro arguments. The registers enq_port_round,
//					enq_port_offset, sop_handle, next_deq_port_round and
//					next_deq_port_offset (declared outside this macro) are
//					written; sig_mask gains the bit of init_deq_round_req_done
//					when a request is sent.
//
//		Inputs:		none
///////////////////////////////////////////////////////////////////////////////
#macro process_enqueue_message()

.begin

	read_enq_msg_on_nn_ring(enq_port_round)
	
	;Convert the port number in the message into the local memory offset of the
	;port structure. Per the original author's note the port field starts at
	;bit 12 and a structure is 32 bytes (shift left 5), which gives a net right
	;shift of 7 (PORT_STRUCTURE_SHIFT); the bits below the structure size are
	;then masked off. TODO confirm against the definitions of
	;PORT_STRUCTURE_SHIFT and INVERTED_PORT_STRUCTURE_MASK.
	;Since the base of the structures is 0 in local memory, the offset is the
	;same as the address of the structure.
	alu_shf[enq_port_offset, INVERTED_PORT_STRUCTURE_MASK, ~AND, \
				enq_port_round, >>PORT_STRUCTURE_SHIFT]
	
	;need to wait 3 cycles before active_lm_addr_0 can be used
	local_csr_wr[active_lm_addr_0, enq_port_offset]

	read_sop_on_nn_ring(sop_handle)

	read_next_deq_msg_on_nn_ring(next_deq_port_round)

	;calculate port offset for next dequeue port structure in advance
	;to fill the latency of the above write to ACTIVE_LM_ADDR_0 CSR
	alu_shf[next_deq_port_offset, INVERTED_PORT_STRUCTURE_MASK, ~AND, \
			next_deq_port_round, >>PORT_STRUCTURE_SHIFT]

	;preload ACTIVE_LM_ADDR_1 for use in process_next_dequeue_round_response()
	;Done here to save 3-cycle latency before active_lm_addr_1 can be used	
	local_csr_wr[active_lm_addr_1, next_deq_port_offset]

	;an enqueue message with bit 31 set is invalid: nothing to account for
	br_bset[enq_port_round, 31, end_process_enqueue_message#]

	;valid enqueue: one more packet is waiting on this port.
	;(Neither this branch nor the one below uses defer slots, so this
	;instruction runs only for valid messages.)
	alu[lm0_port_pkts_enqueued, lm0_port_pkts_enqueued, +, 1]

	;Byte 3 of the status/link word holds the status bits. A non-zero byte
	;means a request is already pending or the port is already in the
	;port_with_packets list, so the initial request must not be sent again.
	br!=byte[lm0_port_req_status_link_next, 3, 0, \
				end_process_enqueue_message#]


	;prepare the message with port number and bit 31 = 1 (for scratch ring message,
	;bit 31=1 means VALID)
	alu_shf[$init_scratch_ring_request, new_round_req_valid, or, enq_port_offset, \
				>>PORT_STRUCTURE_SIZE]
		
	;busy-wait until the scratch ring has room
enqueue_check_ring_full#:
	br_inp_state[RING_FULL, enqueue_check_ring_full#]
		
	scratch[put, $init_scratch_ring_request, zero, _RING_ADDR, 1], \
			sig_done[init_deq_round_req_done]	

	;wait for the completion of this put at the next context swap
	alu_shf[sig_mask, sig_mask, OR, 1, <<(&init_deq_round_req_done)]

	;replace the status byte with "request pending"
	ld_field[lm0_port_req_status_link_next, 1000 , req_pending]

end_process_enqueue_message#:

.end

#endm

///////////////////////////////////////////////////////////////////////////////
// process_next_dequeue_round_response
//	 	Description: Upon receiving a message for dequeue packets on a port
//			- Get the number of packets to be dequeued
//			- If dequeueing is happening on a port, save this new packets
//			number and round number into the port data as the next value
//			for dequeue.
//			- If dequeue is not happening, save the round number and packets count
//			into port structure, add the port to the active ports list and
//			start dequeueing from that round.
//
//			Preconditions set up by process_enqueue_message:
//			next_deq_port_round / next_deq_port_offset are loaded and
//			ACTIVE_LM_ADDR_1 points at the next dequeue port structure.
//			On exit ACTIVE_LM_ADDR_0 has been loaded with curr_port_offset and
//			xfer_byte_offset holds the T_INDEX value for create_dequeue_message.
//		
//			
//	 	Outputs: 	none as macro arguments
//
//		Inputs:		none
///////////////////////////////////////////////////////////////////////////////
#macro process_next_dequeue_round_response()

.begin

	.reg process_deq_temp


	//check if the new dequeue round response is invalid
	//This is done first because next_deq_port_round is often invalid
	br_bset[next_deq_port_round, 31, end_process_next_dequeue_round_response#], defer[3]

	;The next 3 instructions are in defer slot because they
	;are to be done whether BR_BSET takes place or not

	;preload active_lm_addr_0 to save latency in case of invalid
	;next_deq_port_round.
	;need 3 cycles before active_lm_addr_0 can be used	
	local_csr_wr[active_lm_addr_0, curr_port_offset]

	;Current port offset is port number << PORT_STRUCTURE_SIZE. 
	;Shift right by (PORT_STRUCTURE_SIZE-2) to set up the T_INDEX[8:2].
	alu_shf[xfer_byte_offset, --, B, curr_port_offset, >>PORT_OFFSET_TO_T_INDEX]

	;The nn-ring read is the third defer slot, so the packet count longword is
	;consumed from the ring even when the response is invalid. With
	;_DEBUG_COUNTERS_ the macro expands to more instructions; only its first
	;one is in the defer slot, the counter update runs on the valid path only.
	read_next_deq_pkts_count_on_nn_ring(next_deq_round_pkts_count)


	//At this point, the case is valid next_deq_port_round.


	//At this point active_lm_addr_1 MUST BE READY with value next_deq_port_offset
	
	;if BOTH the current dequeue round and the next dequeue round has zero packets,
	;add the new packet count to current round packets count and add the port to
	;the active port list
	alu[process_deq_temp, lm1_port_next_and_curr_rnd_pkts_cnt, -, 0]
	beq[add_to_active_ports_list#] 

	;if current dequeue round has packets, the port should have been added
	;to active list already.	
	;save the packets count of the dequeue round into next dequeue round pkts count
	;(upper two bytes; next_deq_round_pkts_count is already shifted left by 16)
	ld_field[lm1_port_next_and_curr_rnd_pkts_cnt, 1100, next_deq_round_pkts_count]	
	
	;save next dequeue round number into next dequeue round (upper half)
	alu_shf[lm1_port_next_and_curr_deq_rounds, lm1_port_next_and_curr_deq_rounds, \
			OR, next_deq_port_round, <<16]

	;set the got response bit, reset the request pending bit.
	;GOT_RESPONSE bit = 1 means for this port the next round pkts count and 
	;next round to dequeue have valid data.
	;set PORT_ACTIVE bit since port has been added to the ports_with_packets list
	ld_field[lm1_port_req_status_link_next,  1000, got_response_port_active]

	br[end_process_next_dequeue_round_response#]

add_to_active_ports_list#:
	;point ACTIVE_LM_ADDR_0 at the previous list entry; it is used below, after
	;the 3-cycle latency, to link the new port into the list
	local_csr_wr[active_lm_addr_0, prev_port_offset]

	;save the packets count from the response into current pkts count
	;(lower half; the word is zero at this point)
	alu_shf[lm1_port_next_and_curr_rnd_pkts_cnt, lm1_port_next_and_curr_rnd_pkts_cnt, \
			OR, next_deq_round_pkts_count, >>16]	

	;save round number from the response into current round in port structure
	ld_field[lm1_port_next_and_curr_deq_rounds, 0011, next_deq_port_round]

	;since the dequeue round packets count changes from zero to non-zero, 
	;this response makes the port eligible for dequeueing. 
	;Add the port to the list of ports with data

	;reset the request pending bit
	;set PORT_ACTIVE bit since port has been added to the ports_with_packets list
	ld_field[lm1_port_req_status_link_next,  1000, port_active]

	;first check if there are any ports with packets
	br_bset[list_active_flag, LIST_INACTIVE_BIT, first_active_port#]

	;there are ports in the active ports list. Insert the structure of this 
	;new port in between previous port and current port in the port list so that
	;it will be dequeued last in the next time around

	;set next port of previous entry to the new entry. 
	;ACTIVE_LM_ADDR_0 is set to prev entry in active ports list at this point
	ld_field[lm0_port_req_status_link_next, 0011, next_deq_port_offset]	

	;set the next pointer in the new entry to the current port.
	;This means the new entry is inserted in between the current port entry
	;and the previous port entry 
	ld_field[lm1_port_req_status_link_next, 0011, curr_port_offset]

	;preload ACTIVE_LM_ADDR_0 for use in create_deq_message. This is
	;to avoid 3-cycle latency
	local_csr_wr[active_lm_addr_0, curr_port_offset]

	;the new port is now the entry that precedes the current port
	alu[prev_port_offset, --, B, next_deq_port_offset]
	
	br[end_process_next_dequeue_round_response#]


first_active_port#:
	;the list was empty: the new port becomes both current and previous entry
	move(curr_port_offset, next_deq_port_offset)	
	
	;preload ACTIVE_LM_ADDR_0 for use in create_deq_message. This is
	;to avoid 3-cycle latency
	local_csr_wr[active_lm_addr_0, curr_port_offset]

	move(prev_port_offset, next_deq_port_offset)

	;reset the LIST_INACTIVE_BIT (bit 31)
	immed[list_active_flag, 0]

	;NOTE(review): xfer_byte_offset was computed above from the old
	;curr_port_offset and is not recomputed on this path -- confirm that
	;create_dequeue_message gets the intended T_INDEX value here

end_process_next_dequeue_round_response#:

.end

#endm

///////////////////////////////////////////////////////////////////////////////
// create_dequeue_message
//	 	Description: 
//			This macro looks at the link list of ports with packets, the port 
//			credit, packets in flight, etc. to decide whether to send a dequeue
//			message to the Queue Manager. 
//			If the number of packets in the current dequeue round falls to or
//			below ROUND_LOW_WATER_MARK, the macro also sends a new dequeue round
//			request to the Class Scheduler on scratch ring
//
//			Preconditions: ACTIVE_LM_ADDR_0 is loaded with curr_port_offset and
//			xfer_byte_offset holds the T_INDEX value of the current port (both
//			prepared by process_next_dequeue_round_response).
//
//			Every path leaves the macro through a branch (start_port_schedule#
//			or write_message#, both defined outside this macro) or through
//			chk_status_and_write_to_nn_ring, which itself ends with a branch to
//			start_port_schedule#. Nothing falls through the end of the macro.
//			
//	 	Outputs: 	none as macro arguments (deq_port_round and the list state
//					registers are updated)
//
//		Inputs:		none
///////////////////////////////////////////////////////////////////////////////

#macro create_dequeue_message()

.begin

	.reg	packets_in_flight 
	;NOTE(review): curr_round and next_pkts_count are declared but never used
	;in this macro
	.reg	curr_round curr_pkts_count next_pkts_count
	.reg	temp next_port

	;check if the NN ring to the QM is full. If it is, this means the queue
	;can not keep up with all the enqueue and dequeue messages. Skip sending
	;dequeue message until the ring is not full
	;(this check is currently disabled)
	//br_inp_state[NN_FULL, nn_ring_full_send_invalid_deq_msg#]

	;nothing to schedule while the ports-with-packets list is empty
	br_bset[list_active_flag, LIST_INACTIVE_BIT, no_more_port_with_packets#]

	;3-cycle latency
	local_csr_wr[T_INDEX, xfer_byte_offset]

	//At this point ACTIVE_LM_ADDR_0 MUST BE LOADED AND READY with curr_port_offset

	;packets left in the current dequeue round (lower half of the word)
	alu[curr_pkts_count, lm0_port_next_and_curr_rnd_pkts_cnt, AND, curr_round_mask]

check_low_water_mark#:	

	;more than ROUND_LOW_WATER_MARK packets left: no new request needed yet
	alu[--, curr_pkts_count, -, ROUND_LOW_WATER_MARK]
	bgt[check_packets_in_flight#], defer[2]

	;The next 2 instructions are in the defer slots of the BGT and execute
	;whether or not the branch is taken.

	;at this point, the valid bit in enqueue message is no longer needed.
	;Clear it up so that the valid enqueue port and round number is in 
	;the lower 16 bits to prepare for output message 
	alu[enq_port_round, enq_port_round, AND, curr_round_mask]
	
	;T_INDEX points to a transfer register designated to this port where
	;Tx ME wrote the number of packets transmitted from the port
	alu[packets_in_flight, lm0_port_pkts_scheduled, -, *$$index]
	
	;check if request pending bit == 1. If it is, that means this
	;ME already sent a next dequeue round request. 
	;Also, check if the got response bit == 1 meaning the request response
	;already came back and next round data is already updated. In either case,
	;no need to send next dequeue round request again
	br!=byte[lm0_port_req_status_link_next, 3, PORT_ACTIVE_VAL, \
				check_packets_in_flight#]

	;check if more packets have been enqueued for this port than
	;are ready for dequeue. If this is the case, a next dequeue round
	;request can be sent.
	alu[--, lm0_port_pkts_enqueued, -, curr_pkts_count]
	ble[check_packets_in_flight#]

	;At this point, it's certain that all previous requests have gotten response.
	;This means all previous Scratch put MUST HAVE BEEN DONE
	.io_completed next_deq_round_req_done

	;prepare the request: port number with bit 31 = 1 (VALID)
	alu_shf[$scratch_ring_request, new_round_req_valid, or, curr_port_offset, \
		>>PORT_STRUCTURE_SIZE]
	
	;busy-wait until the scratch ring has room
check_ring_full#:
	br_inp_state[RING_FULL, check_ring_full#]

	scratch[put, $scratch_ring_request, zero, _RING_ADDR, 1], \
			sig_done[next_deq_round_req_done]	
	
	;wait for the completion of this put at the next context swap
	alu_shf[sig_mask, sig_mask, OR, 1, <<(&next_deq_round_req_done)]
		
	;set the request pending bit 
	alu_shf[lm0_port_req_status_link_next, \
			lm0_port_req_status_link_next, OR, 1, <<REQ_PENDING_BIT]

check_packets_in_flight#:

	;do not schedule another packet for this port while MAX_IN_FLIGHT or more
	;of its packets are still waiting to be transmitted
	alu[--, MAX_IN_FLIGHT, -, packets_in_flight]

	ble[high_packets_in_flight#]	
	
	
	;ready to create message


	;decrement credit
	alu[lm0_port_credit_quantum_and_current, lm0_port_credit_quantum_and_current, -, 1]

	;check if new credit (lower half of the credit word) is still > 0	
	alu[--, lm0_port_credit_quantum_and_current, AND, curr_round_mask]
	bgt[port_has_credit#], defer[3]

	;the next 3 instructions are in the defer slots and are done whether the
	;branch (BGT) takes place or not
	;increment packets scheduled for dequeue 
	alu[lm0_port_pkts_scheduled, lm0_port_pkts_scheduled, +, 1]	

	;decrement packets enqueued
	alu[lm0_port_pkts_enqueued, lm0_port_pkts_enqueued, -, 1]
	
	;get current dequeue round
	alu[deq_port_round, lm0_port_next_and_curr_deq_rounds, AND, curr_round_mask]

	;at this point, the case is port ran out of credit
	;loading new credit and advance to next port

	;if this is the only port in the list, reload the credit and continue
	;with the same port
	alu[--, curr_port_offset, -, prev_port_offset]
	beq[check_pkts_count#], defer[3]

	;The next 3 instructions are in the defer slots of the BEQ.

	;get lm0_port_credit_quantum_and_current into GPR to shift and load back to
	;local memory (assembler doesn't allow instructions with both source and dest 
	;in local memory). This reloads the current credit (lower half) from the
	;quantum (upper half).
	alu[temp, --, B, lm0_port_credit_quantum_and_current]
	ld_field[lm0_port_credit_quantum_and_current, 0011, temp, >>16]

	;decrement packets available for dequeueing in current dequeue round.
	;The condition codes of this instruction are tested by the BEQ below and,
	;when the BEQ above was taken, by the BEQ at check_pkts_count#.
	alu[lm0_port_next_and_curr_rnd_pkts_cnt, lm0_port_next_and_curr_rnd_pkts_cnt, -, 1]
	
	;if this port has no more packets, remove it from the list and
	;advance to the next port (the list has more than one port here)
	beq[non_empty_list#]
		
	;at this point, the case is the port has more packets and list has more ports.
	;Just advance to next port
	alu[prev_port_offset, --, B, curr_port_offset]	
	ld_field_w_clr[curr_port_offset, 0011, lm0_port_req_status_link_next]

	;writes the message and branches to start_port_schedule#; does not return
	chk_status_and_write_to_nn_ring(enq_port_round, deq_port_round, sop_handle, 0)

//	br[start_port_schedule#]

port_has_credit#:

	;decrement packets available for dequeueing in current dequeue round
	alu[lm0_port_next_and_curr_rnd_pkts_cnt, lm0_port_next_and_curr_rnd_pkts_cnt, -, 1]

check_pkts_count#:
	;if both pkts count in current and next dequeue count are zero, remove entry
	;after sending this last packet
	;(tests the condition codes of the decrement executed just before)
	beq[remove_port_from_list#]

	;busy-wait until the nn-ring has room
create_deq_msg_nn_ring_full_2#:
	br_inp_state[NN_FULL, create_deq_msg_nn_ring_full_2#]

	;Since the BEQ above didn't take place, not both pkts count in current and 
	;next dequeue count are zero
	;Check if current pkts count is 0, if true it means next packets count > 0
	alu[--, lm0_port_next_and_curr_rnd_pkts_cnt, AND, curr_round_mask] 
	beq[current_rnd_has_no_pkts#]
	
	;writes the message and branches to start_port_schedule#; does not return
	chk_status_and_write_to_nn_ring(enq_port_round, deq_port_round, sop_handle, 0)
		
	/*, defer[1]
	
	write_enq_deq_num_to_nn_ring(enq_port_round, deq_port_round)

	write_sop_to_nn_ring(sop_handle)

	write_eop_to_nn_ring()

	br[start_port_schedule#]
	*/

current_rnd_has_no_pkts#:

	;current packets count became 0. Move next dequeue 
	;round pkts count to current dequeue round pkts count
	alu_shf[lm0_port_next_and_curr_rnd_pkts_cnt, --, B, \
			lm0_port_next_and_curr_rnd_pkts_cnt, >>16]

	;next dequeue round becomes current dequeue round
	alu_shf[lm0_port_next_and_curr_deq_rounds, --, B, \
			lm0_port_next_and_curr_deq_rounds, >>16]

	;reset GOT_RESPONSE bit	because data in next round packets count and
	;next dequeue round became 0 (this port ready to accept new next-round data)
	alu[lm0_port_req_status_link_next, \
		lm0_port_req_status_link_next, AND~, got_response]

	;writes the message and branches to start_port_schedule#; does not return
	chk_status_and_write_to_nn_ring(enq_port_round, deq_port_round, sop_handle, 0)

/*
	write_sop_to_nn_ring(sop_handle)

	write_eop_to_nn_ring()

	br[start_port_schedule#]
*/

high_packets_in_flight#:
	;get the next port in advance	
	ld_field_w_clr[next_port, 0011, lm0_port_req_status_link_next]

	local_csr_wr[active_lm_addr_1, next_port]

	;if only one port in the list, just go to the end
	alu[--, curr_port_offset, -, prev_port_offset]	
	beq[no_more_port_with_packets#]

	;to meet latency for ACTIVE_LM_ADDR_1
	nop

	;if the next port has no packet, no need to advance the current port offset
	alu[--, lm1_port_next_and_curr_rnd_pkts_cnt, -, 0]
	beq[no_more_port_with_packets#]

	alu[prev_port_offset, --, B, curr_port_offset]
		
	alu[curr_port_offset, --, B, next_port]

no_more_port_with_packets#:
nn_ring_full_send_invalid_deq_msg#:


	;no dequeue this time. If the enqueue message is invalid as well there is
	;nothing to send at all.
	;NOTE(review): on the paths that arrive here from high_packets_in_flight#,
	;enq_port_round has already been ANDed with curr_round_mask (0xffff), so
	;bit 31 is clear and this branch can only be taken on the list-empty path
	;-- confirm that this is intended
	br_bset[enq_port_round, 31, start_port_schedule#]

	;output an invalid dequeue message (queue number 0 is invalid for QUEUE MANAGER)
	alu[deq_port_round, --, B, 0]

	br[write_message#]
				
remove_port_from_list#:

	alu[--, curr_port_offset, -, prev_port_offset]

	;if curr_port_offset and prev_port_offset are the same, the list
	;only has 1 port. So it becomes empty after removing this port
	bne[non_empty_list#]

	;reset PORT_ACTIVE bit	
	alu[lm0_port_req_status_link_next, \
		lm0_port_req_status_link_next, AND~, port_active]

	;At this point the list is verified empty. Set the empty bit.
	alu_shf[list_active_flag, --, B, 0x1, <<LIST_INACTIVE_BIT]	
		
	br[write_message#]	

non_empty_list#:
	;unlink the current port: the previous entry will point at the port that
	;follows the current one
	local_csr_wr[active_lm_addr_1, prev_port_offset]
		
	ld_field_w_clr[next_port, 0011, lm0_port_req_status_link_next]
		
	alu[curr_port_offset, --, B, next_port]	

	;reset PORT_ACTIVE bit of the port being removed
	;(ACTIVE_LM_ADDR_0 still points at it)
	alu[lm0_port_req_status_link_next, \
		lm0_port_req_status_link_next, AND~, port_active]	
		
	;advance to the next port.
	;active_lm_addr1 should be loaded with prev_port at this point
	ld_field[lm1_port_req_status_link_next, 0011, next_port]

	br[write_message#]

.end

#endm


///////////////////////////////////////////////////////////////////////////////
// port_scheduler
/*
Macro:	

Pseudo code:

start_port_schedule#:

		if (NN_EMPTY) 
			branch to no_count_msg#
		endif

	Part 1: Process Enqueue Message

		Read enqueue port and round from nn-ring

		Read SOP of enqueuing packet from nn-ring

		If (valid enqueue message)
		
			Calculate location of the Port Data Structure of the enqueue port
			in local memory

			Increment Packet Enqueued Counter in Port Data Structure

			If port is not in ports_with_packets list (Port_Active == 0)

				Send a next dequeue round request to CLASS_SCHEDULER via Scratch ring.
			
				Set the Request_Pending bit

			Endif
	
		Endif

	Part 2: Process Dequeue Message

		read next dequeue port and round  on nn-ring 
	
		read packets count of next dequeue port and round on nn-ring
	
	
		If (valid next_dequeue_port_and_round)

			Calculate location of the Port Data Structure of the next dequeue port
			in local memory

			If (the port has zero packet in its current round)
					
				Add packets count to the Current Dequeue Round
			
				Add port in the active ports list as the last entry.

				Reset Request_Pending_Bit.

				Set Port Active Bit to indicate that port is in ports_with_packets list.

			Else
		
				If port has packets, it must have been added to the 
				port_with_packets list already. Just add round number 
				to Next Dequeue Round and packets count to the 
				Next Dequeue Round Packets Count in Port Data Structure.

				Reset Request_Pending_Bit.

				Set Port Active Bit to indicate that port is in port_with_packets list.
			
				Set Got_Response_Bit to indicate that Next Dequeue Round / Packets Count 
				has valid data.  
							
			Endif
				
		Endif

	Part 3: Create a dequeue message

		If (ports_with_packets list is empty)

			branch to no_port_with_packets#

		Endif				


		Get the Port Data Structure pointed to by the pointer at the head 
		of the ports_with_packets list
	
		If ( (current packets count < LOW_WATER_MARK) && (req_pending_bit == 0) 
				&& (got_response_bit == 0) )

			Send a next dequeue round request to CLASS_SCHEDULER via Scratch ring.
			
			Set the Request_Pending bit
							
		Endif

			
		If (port packet in flight > MAX_PACKET_IN_FLIGHT)
				
			br[high_packets_in_flight#]

		Endif

		Decrement port credit.

		If port credit does not become zero
		
			br[port_has_credit#]
		
		Else
			
			If list only has one port
				Reload credit branch to port_has_credit#
			Else (list has other ports)
				Reload credit
				Send this last dequeue message
				Advance to the next port
			Endif	
		Endif

port_has_credit#:
		
		Decrement Enqueued Packets Count

		Increment Packets Scheduled For Dequeue

		Decrement Current Round Packets Count
		
		If (Current Round Packets Count becomes 0) && (Next Round Packets Count == 0)

			br[remove_port_from_list#]

		Else if (Current Round Packets Count becomes 0) && (Next Round Packets Count != 0)

			Move Next Round data into Current Round data and make Next Round 
			data become 0

			Reset the got_response_bit to indicate that the Next Round data is 
			no longer valid.
		Endif

		write to nn-ring:
			dequeue and enqueue port and round
			sop
			eop = 0 (just a null eop since this is not needed but Queue Manager
					expects a 3-longword message)
		
		br[start_port_schedule#]

high_packets_in_flight#:

	If list has more than one port
		Advance to the next port
	Endif
	
	If valid enqueue message
		dequeue port and round = 0 (invalid)
		br[write_message#]
	Else
		br[start_port_schedule#]
	Endif

remove_port_from_list#:

	If list has more port
		br[non_empty_list#]
	Endif
	
	Reset active_port bit
	Set list_inactive bit (to indicate the list is empty)
	br[write_message#]					

non_empty_list#:
	
	Reset active_port bit
	Advance to next port
	br[write_message#]					
		
no_more_port_with_packets#:

	if invalid enq_port_round
		br[start_port_schedule#]
    else
		dequeue port and round = 0 (invalid)
		br[write_message#]
	endif

write_message#:
				
		write to nn-ring:
			dequeue and enqueue port and round
			sop
			eop = 0 (just a null eop since this is not needed but Queue Manager
					expects a 3-longword message)
		
		br[start_port_schedule#]

no_count_msg#:
			
		enqueue message = invalid
			
		if 	(active_ports list is NOT empty)
			br[create_dequeue_msg#]
		endif

*/
/////////////////////////////////////////////////////////////////////////////////
#macro port_schedule()

// port_schedule: one pass of the port scheduling loop. Takes no arguments.
//
// Control flow, as written below:
//   start_port_schedule# - clears sig_mask; if the next-neighbor ring input
//                          state is NN_EMPTY, jumps to no_count_msg#.
//                          Otherwise expands process_enqueue_message() and
//                          process_next_dequeue_round_response().
//   create_dequeue_msg#  - expands create_dequeue_message().
//   write_message#       - expands chk_status_and_write_to_nn_ring().
//   no_count_msg#        - zeroes enq_port_round and sop_handle, preloads
//                          ACTIVE_LM_ADDR_0 and xfer_byte_offset from
//                          curr_port_offset, then goes to create_dequeue_msg#
//                          if the list is active, or swaps the context out.
//
// sig_mask, curr_port_offset and list_active_flag are not declared here; they
// must already be in scope where this macro is expanded.
//
// Other code in this file branches to start_port_schedule# and write_message#,
// so these label names must stay as they are.

.begin
	// NOTE(review): these registers are presumably referenced by name from the
	// macros expanded below - confirm before renaming or removing any of them.
	.reg 	sop_handle 
	.reg	enq_port_round enq_port_offset
	.reg	next_deq_port_round next_deq_port_offset
	.reg	deq_port_round
	.reg 	next_deq_round_pkts_count
	.reg 	xfer_byte_offset				//value to put in T_INDEX to access the transfer
											//register associated with the port being dequeued

start_port_schedule#:

	.io_completed next_deq_round_req_done init_deq_round_req_done
	immed[sig_mask, 0]
				
	;nothing on the NN ring: skip message processing
	br_inp_state[NN_EMPTY, no_count_msg#]

	process_enqueue_message()

	;process next_dequeue_port_and_round message
	process_next_dequeue_round_response()

	;create dequeue message to send to QUEUE MANAGER. Dequeue port and
	;round come from the  active ports list (list of ports with packets)
create_dequeue_msg#:
	
	create_dequeue_message()

write_message#:

	chk_status_and_write_to_nn_ring(enq_port_round, deq_port_round, sop_handle, 0)
	// NOTE(review): the explicit branch back to start_port_schedule# below is
	// commented out, so unless chk_status_and_write_to_nn_ring() itself branches
	// away, execution continues into no_count_msg#. Confirm this is intended.
//	br[start_port_schedule#]


no_count_msg#:

	;For QUEUE MANAGER, queue number 0 is invalid
	alu_shf[enq_port_round, --, B, 0]
	
	alu[sop_handle, --, B, 0]

	;preload ACTIVE_LM_ADDR_0 for use in create_deq_message. This is
	;to avoid 3-cycle latency
	local_csr_wr[active_lm_addr_0, curr_port_offset]

	;Current port offset is port number << PORT_STRUCTURE_SIZE. 
	;Shift right to set up the T_INDEX[8:2]. Placed after the CSR write so
	;that it fills part of the CSR write latency.
	;(xfer_byte_offset used to be computed twice, once before and once after
	;the CSR write; only this later value was ever visible, so the earlier
	;dead computation has been dropped.)
	// NOTE(review): PORT_OFFSET_TO_T_INDEX is presumably (PORT_STRUCTURE_SIZE - 2)
	// - confirm against its definition.
	alu_shf[xfer_byte_offset, --, B, curr_port_offset, >>PORT_OFFSET_TO_T_INDEX]

	;if active ports list is not empty, then try to create dequeue message
	br_bclr[list_active_flag, LIST_INACTIVE_BIT, create_dequeue_msg#]

	;if empty port list then just swap out. The wakeup-events write below sits
	;in the defer slot of ctx_arb.
	ctx_arb[--] , defer[1], br[start_port_schedule#]
	local_csr_wr[active_ctx_wakeup_events, sig_mask]

.end

#endm


/*************************************************************************************/

// Microengine entry point.
// Context 0 runs port_scheduler_init() once and then expands port_schedule()
// inside an endless .while loop; every other context kills itself right away.
main#:

.begin
	// Run with thread 0 only
	br=ctx[ 0, run_port_scheduler#]

	// for all other threads, just abort
	ctx_arb[kill]			

run_port_scheduler#:						 
						 
	// One-time setup; port_scheduler_init is defined outside this section.
	port_scheduler_init()

	// Schedule forever: the loop condition is the constant 1.
	.while (1)
		port_schedule()
	.endw

	// Placed after the endless loop only to silence assembler warning 5133
	// (per the original note on the line below).
	nop; to avoid warnning 5133 
	
.end

/////////////////////////////////////////////////////////////////////////////////////////
	#undef _RING_ADDR

#endif // __PORT_SCHEDULER_UC__