#ifndef _ATM_TM_SCHEDULER_UTIL_UC_
#define _ATM_TM_SCHEDULER_UTIL_UC_

/*******************************************************************************
                             Intel Proprietary

 Copyright (c) 1998-2002 By Intel Corporation.  All rights reserved.
 No part of this program or publication may be reproduced, transmited,
 transcribed, stored in a retrieval system, or translated into any language
 or computer language in any form or by any means, electronic, mechanical,
 magnetic, optical, chemical, manual, or otherwise, without the prior
 written permission of:
                         Intel Corporation
                         2200 Mission College Blvd.
                         Santa Clara, CA  95052-8119
*******************************************************************************/

/*
 *      File Name: atm_tm_scheduler_util.uc                                         
 *                                                                   
 *      Description: This file contains the actual scheduler and write-out
 *					 microcode macros. 
 *
 *      History: ver 1.0
 *
 */

#include <xbuf.uc>
#include <stdmac.uc>
#include <sram.uc>
#include <dram.uc>
#include <dl_system.h>
#include <atm_tm.h>


//------------------------------------------------------------------
// atm_tm_scheduler_util(in_port_no)
//
//    Description: Schedules from the time queues and writes into the
//					time queues.
//
//    Parameters:
//		Inputs: 
//			in_port_no - 0...8
//		Outputs:
//			None.
//
//    Flow (as visible in this file):
//		schedule#  -> pick a source to dequeue from (HBR, RT, NRT, UBR,
//		              UBR with priority) 
//		write_out# -> consume one message from the shaper (if any) and
//		              return through rtn_reg
//		deq# / scratch_put# -> send the dequeue request to the QM ring
//		end#       -> wait on sig_mask and branch to end_sched#
//
//    NOTE(review): in the visible code in_port_no is referenced only on
//    the HBR path (tx_cont#). The port-entry lookup and the UBR-priority
//    path use `port`, which is not declared in this macro - confirm where
//    `port` comes from and that it matches in_port_no.
//------------------------------------------------------------------

#macro atm_tm_scheduler_util(in_port_no)
.begin

	// Scratch registers for SRAM/LM address arithmetic
	.reg tq_base tq_offset tql_base tql_offset lm_offset temp temp2
	// VCQ entry being built for the dequeue request
	.reg vcq_reg
	// sig_mask: signals to wait for; rtn_reg: address write_out# returns to
	.reg sig_mask rtn_reg
	// Flag: set when reads for a new TQ have been issued (see check_for_load_new_tq#)
	.reg load_newtq
	.reg tq_len working_tqlen
	// LM addresses filled in by _get_port_entry_static
	.reg ubrwpri_addr
	.reg portinfo_addr
	.reg dq_lm_offset		//DQ cache address offset for served port (with base = 0x0)
	// Current TQ; written back to *l$index0[14] at write_out#
	.reg curtq
	
///////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED
	.reg rt_lm_mask hbr_count
#endif
///////////////////////////////////////////////////////////////////////
	.reg qlen
	// $vcq_wo: copy of the first shaper message word, passed to _write_lbr
	.reg $vcq_wo
	// $vcq: TQ entry read from SRAM / dequeue request written to the QM ring
   	.reg $vcq 
	.reg $txcount $tqlen $newtqlen0 $newtqlen1 $newtqlen2

#if defined(FIRST_SCHEDULER_ME) || defined(SECOND_SCHEDULER_ME)
	// Two-longword message fetched from the shaper scratch ring
	.reg $wo_msg[2]

	.sig scratch_read_sig
#endif

//Start the scheduling process by clearing the signal masks (sig_mask)
//and setting the load_newtq variable to zero. 
//Loading new TQs from SRAM happen every AGGREGATION slots and load_newtq is
//the flag variable to detect if SRAM read has been issued to load new TQ.
schedule#:
.set $newtqlen0 $newtqlen1 $newtqlen2 $txcount
	// sig_mask starts with only the next_thread_sig bit set
	alu[sig_mask, --, B, 1, <<&next_thread_sig]
	alu[load_newtq, --, B, 0]

	; Reading message from shaper
	// The get is only issued here; scratch_read_sig is waited for in write_out#.
	// load_newtq was just zeroed and is used as the zero offset operand.
#ifdef FIRST_SCHEDULER_ME
	scratch[get, $wo_msg[0], RING_TO_SCHEDULER_EVEN, load_newtq,  2], sig_done[scratch_read_sig]
#endif
#ifdef SECOND_SCHEDULER_ME
	scratch[get, $wo_msg[0], RING_TO_SCHEDULER_ODD, load_newtq, 2], sig_done[scratch_read_sig]
#endif

	; load port info to LM on position lm_offset in LM
	_get_port_entry_static(port, @dq_lm_base, @ubrwpri_lm_base, portinfo_addr, \
							ubrwpri_addr, curtq)

; ------------------------- DQs servicing loop ------------------------
//check if AGGREGATION number of slots have passed
//If so we need to load new LBR time queue pointer to the departure rings in LM
//provided that the time queues have data in them.
	; get RT_for_the_LBR_TQ to compare with AGGREGATION
	// RT_for_the_LBR_TQ is byte 2 of *l$index0[1]. When it equals
	// AGGREGATION_MASK the new-TQ reads are issued first; that path
	// branches back to continue#.
	br=byte[*l$index0[1], 2, AGGREGATION_MASK, issue_read_newtq_and_txcount#]

continue#:

/////////////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED

	//Set the LM base to the current HBR TQ pointer
	//Increment the pointer to point to the next entry in the TQ
	//Take care of wraparounds in the process of incrementing
	alu[lm_offset, _hbrtq_lm_base, +, @_rt_hbr]
	local_csr_wr[ACTIVE_LM_ADDR_1, lm_offset]

#endif	//HBR_EXCLUDED
///////////////////////////////////////////////////////////////////////


#ifdef FLOW_CONTROL
//Check the number of packets that have been transmitted for
//flow control purposes
	; check if port is blocked by flow control. If so, no cell needs to be scheduled
	// Bit 0 of *l$index0[1] is the "flow control on" flag.
	br_bset[*l$index0[1], 0, FC_on#]
#endif

	//Priority queueing	 

///////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED

	//Priority1: HBR VC
	//Read the HBR VCQ from the current slot in the HBR TQ.
	//Check if the VCQ has data by looking up the LW that 
	//contains the HBR VCQ length in LM.
	//If length > 0, go to hbr_tx#
	alu[temp, 0x3, and, @_rt_hbr]		;offset in HBR TQ longword
	// temp = byte offset * 8, i.e. the bit position of the slot in the longword
	alu[temp, --, b, temp, <<3]
	alu[--, temp, and, 0]				;for indirect reference only
	// Extract one byte (the HBR VCQ number) from the current HBR TQ longword,
	// shifted right by the amount prepared above.
	ld_field_w_clr[vcq_reg, 0001, *l$index1[0], >>indirect]
	// Each VCQ length entry is 2 bytes wide: LM offset = vcq_reg * 2
	alu[temp, --, b, vcq_reg, <<1]
	alu[lm_offset, _vcqlen_lm_base, +, temp]
	local_csr_wr[ACTIVE_LM_ADDR_1, lm_offset]  

#endif	//HBR_EXCLUDED
///////////////////////////////////////////////////////////////////////

	; increment RT_for_the_LBR_TQ
	// @one16 adds 1 to the field that starts at bit 16 (byte 2) of the word.
	alu[*l$index0[1], @one16, +, *l$index0[1]]

///////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED

	// Advance the HBR TQ read pointer and wrap it with RTLM_MASK
	alu[@_rt_hbr, @_rt_hbr, +, 1]
	alu[@_rt_hbr, @_rt_hbr, AND, RTLM_MASK]
	// temp = 16 when bit 0 of vcq_reg is set, else 0: selects which
	// halfword of the length longword belongs to this VCQ.
	alu[temp, 0x10, AND, vcq_reg, <<4]
	alu[--, temp, and, 0]				;for indirect reference only
	// hbr_count = 16-bit length of the HBR VCQ; non-zero -> transmit from it
	ld_field_w_clr[hbr_count, 0011, *l$index1[0], >>indirect], load_cc
	bne[hbr_tx#]

#endif	//HBR_EXCLUDED
////////////////////////////////////////////////////////////////////////

.reg tq_offset_for_port

	; Calculate offset from the beginning of RTQ table for given port using TQ_offset
#define_eval INDIRECT_SHIFT 	(16 - TQ_SIZE_SHIFT)	; we need tq_offset_for_port to be in bytes
#define_eval INDIRECT_MASK		((1 << TQ_SIZE_SHIFT)-1)	; mask lower bits

	// Take the upper halfword of *l$index0[2], scale it by the TQ size and
	// clear the low TQ_SIZE_SHIFT bits that leak in from the lower halfword.
	alu[tq_offset_for_port, INDIRECT_MASK, ~AND, *l$index0[2], >>INDIRECT_SHIFT] ; to get TQ offset in bytes

#undef INDIRECT_MASK
#undef INDIRECT_SHIFT
	
	; setting LM1 to point beginning of DQ in LM(4 LWs RT, 4LWs NRT, 4LWs UBR)
	local_csr_wr[ACTIVE_LM_ADDR_1, dq_lm_offset]
	// NOTE(review): the three tests below look at a single byte of each
	// departure-queue length, while the lengths are decremented as wider
	// fields (see rt_tx#/nrt_tx#/ubr_tx#). Confirm a length can never be a
	// non-zero multiple of 256.
	; Priority1: RT TQ
	//Check if the real time departure queue length is greater than 0
	//If so go to rt_tx#
    br!=byte[*l$index0[6], 0, 0, rt_tx#]

	; Priority2: NRT TQ
    //Check if the non real time departure queue length is greater than 0	
	//If so go to nrt_tx#
    br!=byte[*l$index0[7], 2, 0, nrt_tx#]

	; Priority3: UBR TQ
    //Check if the ubr departure queue length is greater than 0	
	//If so go to ubr_tx#
    br!=byte[*l$index0[7], 0, 0, ubr_tx#]

	; Priority4: UBR_PRI TQs
	//Dequeue from UBR priority queues
	_ubr_pri_deq(temp, portinfo_addr, ubrwpri_addr)
	alu[vcq_reg, temp, or, port, <<QM_REQ_PORT_OFFSET]	;add port number	
	alu[$vcq, vcq_reg, or, 1, <<31]
	// Bit 31 of temp clear: run write_out# and come back to scratch_put# to
	// send $vcq. Bit 31 set: skip the ring put and return to
	// check_for_load_new_tq# instead (presumably "nothing to dequeue" -
	// confirm against _ubr_pri_deq).
	load_addr[rtn_reg, scratch_put#]
	br_bclr[temp, 31, write_out#]

    load_addr[rtn_reg, check_for_load_new_tq#]
	br[write_out#]

// ******************* Schedule from real-time TQ *******************
//Check if the current time queue length > 0.
//If not we need to load a new time queue from the DQ ring
//
// LM fields used here (index0 = port info, index1 = departure queue):
//   *l$index0[3]  byte 2   : RT DQ consumer index
//   *l$index0[5]  upper 16 : RTQnum (time queue currently being served)
//   *l$index0[6]  low part : DQRTlen (entries left in the RT departure queue)
//   *l$index0[8]           : RTQlen (length of the time queue being served)
//   *l$index0[11]          : Working_RTQlen (entries already served from it)
// On exit an SRAM read of one TQ entry into $vcq is in flight, its signal
// is added to sig_mask, and write_out# is entered with rtn_reg = deq#.
rt_tx#:
.begin 
.reg rtdq_lm_consumer
.reg rtqlen working_rtqlen rtqnum
.reg tq_base_for_port

	alu[working_rtqlen, --, B, *l$index0[11]]
	; compare RTQlen with Working_RTQlen
	alu[--, *l$index0[8], -, working_rtqlen] ; RTQlen - Working_RTQlen
	// Nothing left in the current TQ: fetch the next one from the DQ ring
    ble[new_rt_tx#]

//Transmit from the rt TQ
//Compute the SRAM address to read the tq entry
//Issue SRAM read and jump to "write out"
cur_rt_tx#:

	; Add RTQ base to offset
	alu[tq_base_for_port, tq_offset_for_port, +, @_rtq_sram_base]

	; Calculate offset to get currently serviced TQ
#define_eval INDIRECT_SHIFT 		(16 - TQ_SIZE_SHIFT)
#define_eval INDIRECT_MASK	((1 << TQ_SIZE_SHIFT) - 1)

	alu[tq_offset, INDIRECT_MASK, ~AND, *l$index0[5], >>INDIRECT_SHIFT]	;get offset in bytes (RTQnum)

#undef INDIRECT_MASK
#undef INDIRECT_SHIFT

	alu[tq_base, tq_base_for_port, +, tq_offset]

// Calculate offset from beginning of TQ element
	; calc. offset to get time-slot 
	alu[tq_offset, --, B, working_rtqlen, <<2] ; each element is 4 bytes (LW) wide
	sram[read, $vcq, tq_base, tq_offset, 1], sig_done[sram_read_dn_sched]


rt_update_vars#:
	; decrement DQRTlen
	alu[*l$index0[6], *l$index0[6], -, 1]
	; increment Working_RTQlen
	alu[*l$index0[11], 1, +, working_rtqlen]

rt_finish_tx#:
	// Wait for the SRAM read as well; deq# then sets bit 31 of the entry
	alu[sig_mask, sig_mask, or, 1, <<&sram_read_dn_sched]	
	load_addr[rtn_reg, deq#]
	br[write_out#]	

//Load the new time queue that needs to be serviced from the DQ ring
//Update the scheduler port state accordingly with the new 
//rt time queue length and number
new_rt_tx#:
	; use Rtdq_lm_consumer to calculate offset in DQ
#define_eval INDIRECT_SHIFT 	(16 - 2)
#define_eval INDIRECT_MASK		(RTDQ_SIZE_MASK << 2)

	// consumer index (byte 2 of the word) * 4 = byte offset inside the RT DQ
	alu[rtdq_lm_consumer, INDIRECT_MASK, AND, *l$index0[3], >>INDIRECT_SHIFT]	

#undef INDIRECT_MASK
#undef INDIRECT_SHIFT

	; calculate LM offset from it to get RT DQ
	alu[lm_offset, dq_lm_offset, +, rtdq_lm_consumer]
	// NOTE(review): the next three instructions do not touch *l$index1;
	// they presumably also cover the latency of this CSR write - keep them
	// between the write and the first *l$index1 access.
 	local_csr_wr[ACTIVE_LM_ADDR_1, lm_offset]
	; increment Rtdq_lm_consumer by 1 LW (4 bytes)
	alu[rtdq_lm_consumer, rtdq_lm_consumer, +, 4]
	// back to an entry index, wrapped to the ring size
	alu[rtdq_lm_consumer, RTDQ_SIZE_MASK, AND, rtdq_lm_consumer, >>2]
	ld_field[*l$index0[3], 0100, rtdq_lm_consumer, <<16]	; Writing consumer to LM

	; load new RTQnum for servicing from DQ
	// DQ entry layout as used here: low 16 bits = TQ number, high 16 bits = TQ length
	ld_field_w_clr[rtqnum, 0011, *l$index1[0]]
	ld_field[*l$index0[5], 1100, rtqnum, <<16]

	; write TQlen to RTQlen
	alu[rtqlen, mask_upper16, AND, *l$index1[0], >>16]
	alu[*l$index0[8], --, B, rtqlen]
	; zero Working_RTQlen
	// Only the register copy is cleared here; rt_update_vars# stores
	// working_rtqlen + 1 back to LM.
	alu[working_rtqlen, --, B, 0]
    br[cur_rt_tx#]
.end
// ******************* End of schedule from real-time TQ *******************


// ******************* Schedule from non real-time TQ *******************
//Check if the current time queue length > 0.
//If not we need to load a new time queue from the DQ ring
//
// LM fields used here (index0 = port info, index1 = departure queue):
//   *l$index0[3]  byte 0   : NRT DQ consumer index
//   *l$index0[5]  low part : NRTQnum (time queue currently being served)
//   *l$index0[7]  upper 16 : DQNRTlen (entries left in the NRT departure queue)
//   *l$index0[9]           : NRTQlen (length of the time queue being served)
//   *l$index0[12]          : Working_NRTQlen (entries already served from it)
// On exit an SRAM read of one TQ entry into $vcq is in flight, its signal
// is added to sig_mask, and write_out# is entered with rtn_reg = deq#.
nrt_tx#:
.begin 
.reg nrtdq_lm_consumer
.reg nrtqlen working_nrtqlen nrtqnum
.reg tq_base_for_port

	alu[working_nrtqlen, --, B, *l$index0[12]]
	; compare NRTQlen with Working_NRTQlen
	alu[--, *l$index0[9], -, working_nrtqlen] ; NRTQlen - Working_NRTQlen
	// Nothing left in the current TQ: fetch the next one from the DQ ring
    ble[new_nrt_tx#]

//Transmit from the nrt TQ
//Compute the SRAM address to read the tq entry
//Issue SRAM read and jump to "write out"
cur_nrt_tx#:

	; Add NRTQ base to offset
	alu[tq_base_for_port, tq_offset_for_port, +, @_nrtq_sram_base]

	; Calculate offset to get currently serviced TQ
#define_eval INDIRECT_SHIFT 		(TQ_SIZE_SHIFT)

	alu[tq_offset, mask_nrtq_num, AND, *l$index0[5], <<INDIRECT_SHIFT]	;get offset in bytes (NRTQnum)

#undef INDIRECT_SHIFT

	alu[tq_base, tq_base_for_port, +, tq_offset]

// Calculate offset from beginning of TQ element
	; calc. offset to get time-slot 
	alu[tq_offset, --, B, working_nrtqlen, <<2] ; each element is 4 bytes (LW) wide
	sram[read, $vcq, tq_base, tq_offset, 1], sig_done[sram_read_dn_sched]


nrt_update_vars#:
	; decrement DQNRTlen
	// DQNRTlen lives in the upper halfword, hence @one16 rather than 1
	alu[*l$index0[7], *l$index0[7], -, @one16]
	; increment Working_NRTQlen
	alu[*l$index0[12], 1, +, working_nrtqlen]

nrt_finish_tx#:
	// Wait for the SRAM read as well; deq# then sets bit 31 of the entry
	alu[sig_mask, sig_mask, or, 1, <<&sram_read_dn_sched]	
	load_addr[rtn_reg, deq#]
	br[write_out#]	

//Load the new time queue that needs to be serviced from the DQ ring
//Update the scheduler port state accordingly with the new 
//nrt time queue length and number
new_nrt_tx#:
	; use Nrtdq_lm_consumer to calculate offset in DQ
#define_eval INDIRECT_SHIFT 	(0 + 2)
#define_eval INDIRECT_MASK		(NRTDQ_SIZE_MASK << 2)

	// consumer index (byte 0 of the word) * 4 = byte offset inside the NRT DQ
	alu[nrtdq_lm_consumer, INDIRECT_MASK, AND, *l$index0[3], <<INDIRECT_SHIFT]	

#undef INDIRECT_MASK
#undef INDIRECT_SHIFT

	; calculate LM offset from it to get NRT DQ
	alu[lm_offset, dq_lm_offset, +, nrtdq_lm_consumer]
	alu[lm_offset, 16, +, lm_offset]	; NRT DQ is + 16 bytes from RTDQ
	// NOTE(review): the next three instructions do not touch *l$index1;
	// they presumably also cover the latency of this CSR write - keep them
	// between the write and the first *l$index1 access.
 	local_csr_wr[ACTIVE_LM_ADDR_1, lm_offset]
	; increment Nrtdq_lm_consumer by 1 LW (4 bytes)
	alu[nrtdq_lm_consumer, nrtdq_lm_consumer, +, 4]
	// back to an entry index, wrapped to the ring size
	alu[nrtdq_lm_consumer, NRTDQ_SIZE_MASK, AND, nrtdq_lm_consumer, >>2]
	ld_field[*l$index0[3], 0001, nrtdq_lm_consumer]	; Writing consumer to LM

	; load new NRTQnum for servicing from DQ
	// DQ entry layout as used here: low 16 bits = TQ number, high 16 bits = TQ length
	ld_field_w_clr[nrtqnum, 0011, *l$index1[0]]
	ld_field[*l$index0[5], 0011, nrtqnum]

	; write TQlen to NRTQlen
	alu[nrtqlen, mask_upper16, AND, *l$index1[0], >>16]
	alu[*l$index0[9], --, B, nrtqlen]
	; zero Working_NRTQlen
	// Only the register copy is cleared here; nrt_update_vars# stores
	// working_nrtqlen + 1 back to LM.
	alu[working_nrtqlen, --, B, 0]
    br[cur_nrt_tx#]
.end

// ******************* End of schedule from non real-time TQ *******************


// ******************* Schedule from UBR TQ *******************
//Check if the current time queue length > 0.
//If not we need to load a new time queue from the DQ ring
//
// LM fields used here (index0 = port info, index1 = departure queue):
//   *l$index0[2]  byte 0   : UBR DQ consumer index
//   *l$index0[6]  upper 16 : UBRTQnum (time queue currently being served)
//   *l$index0[7]  low part : DQUBRTlen (entries left in the UBR departure queue)
//   *l$index0[10]          : UBRTQlen (length of the time queue being served)
//   *l$index0[13]          : Working_UBRTQlen (entries already served from it)
// On exit an SRAM read of one TQ entry into $vcq is in flight, its signal
// is added to sig_mask, and write_out# is entered with rtn_reg = deq#.
ubr_tx#:
.begin 
.reg ubrtdq_lm_consumer
.reg ubrtqlen working_ubrtqlen ubrtqnum
.reg tq_base_for_port

	alu[working_ubrtqlen, --, B, *l$index0[13]]
	; compare UBRTQlen with Working_UBRTQlen
	alu[--, *l$index0[10], -, working_ubrtqlen] ; UBRTQlen - Working_UBRTQlen
	// Nothing left in the current TQ: fetch the next one from the DQ ring
    ble[new_ubrt_tx#]

//Transmit from the ubr TQ
//Compute the SRAM address to read the tq entry
//Issue SRAM read and jump to "write out"
cur_ubrt_tx#:

	; Add UBRTQ base to offset
	alu[tq_base_for_port, tq_offset_for_port, +, @_ubrtq_sram_base]

	; Calculate offset to get currently serviced TQ
#define_eval INDIRECT_SHIFT 		(16 - TQ_SIZE_SHIFT)
#define_eval INDIRECT_MASK	((1 << TQ_SIZE_SHIFT) - 1)

	alu[tq_offset, INDIRECT_MASK, ~AND, *l$index0[6], >>INDIRECT_SHIFT]	;get offset in bytes (UBRTQnum)

#undef INDIRECT_MASK
#undef INDIRECT_SHIFT

	alu[tq_base, tq_base_for_port, +, tq_offset]

// Calculate offset from beginning of TQ element
	; calc. offset to get time-slot 
	alu[tq_offset, --, B, working_ubrtqlen, <<2] ; each element is 4 bytes (LW) wide
	sram[read, $vcq, tq_base, tq_offset, 1], sig_done[sram_read_dn_sched]


ubrt_update_vars#:
	; decrement DQUBRTlen
	alu[*l$index0[7], *l$index0[7], -, 1]
	; increment Working_UBRTQlen
	alu[*l$index0[13], 1, +, working_ubrtqlen]

ubrt_finish_tx#:
	// Wait for the SRAM read as well; deq# then sets bit 31 of the entry
	alu[sig_mask, sig_mask, or, 1, <<&sram_read_dn_sched]	
	load_addr[rtn_reg, deq#]
	br[write_out#]	

//Load the new time queue that needs to be serviced from the DQ ring
//Update the scheduler port state accordingly with the new 
//ubr time queue length and number
new_ubrt_tx#:
	; use Ubrtdq_lm_consumer to calculate offset in DQ
#define_eval INDIRECT_SHIFT 	(2)
#define_eval INDIRECT_MASK		(UBRDQ_SIZE_MASK << 2)

	// consumer index (byte 0 of the word) * 4 = byte offset inside the UBR DQ
	alu[ubrtdq_lm_consumer, INDIRECT_MASK, AND, *l$index0[2], <<INDIRECT_SHIFT]	

#undef INDIRECT_MASK
#undef INDIRECT_SHIFT

	; calculate LM offset from it to get UBR DQ
	alu[lm_offset, dq_lm_offset, +, ubrtdq_lm_consumer]
	alu[lm_offset, 32, +, lm_offset]		; offset from the beginning of given DQ table
	// NOTE(review): the next three instructions do not touch *l$index1;
	// they presumably also cover the latency of this CSR write - keep them
	// between the write and the first *l$index1 access.
 	local_csr_wr[ACTIVE_LM_ADDR_1, lm_offset]
	; increment Ubrtdq_lm_consumer by 1 LW (4 bytes)
	alu[ubrtdq_lm_consumer, ubrtdq_lm_consumer, +, 4]
	// back to an entry index, wrapped to the ring size
	alu[ubrtdq_lm_consumer, UBRDQ_SIZE_MASK, AND, ubrtdq_lm_consumer, >>2]
	ld_field[*l$index0[2], 0001, ubrtdq_lm_consumer]	; Writing consumer to LM

	; load new UBRTQnum for servicing from DQ
	// DQ entry layout as used here: low 16 bits = TQ number, high 16 bits = TQ length
	ld_field_w_clr[ubrtqnum, 0011, *l$index1[0]]
	ld_field[*l$index0[6], 1100, ubrtqnum, <<16]

	; write TQlen to UBRTQlen
	alu[ubrtqlen, mask_upper16, AND, *l$index1[0], >>16]
	alu[*l$index0[10], --, B, ubrtqlen]
	; zero Working_UBRTQlen
	// Only the register copy is cleared here; ubrt_update_vars# stores
	// working_ubrtqlen + 1 back to LM.
	alu[working_ubrtqlen, --, B, 0]
    br[cur_ubrt_tx#]
.end

// ******************* End of schedule from UBR TQ *******************

///////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED

//Schedule from the High bit rate time queue
//VCQ entry is already known, since the length for this VCQ 
//was previously checked. 
//Just decrement the length of this VCQ by 1 and jump to "write out" 
//
// On entry: vcq_reg = HBR VCQ number, hbr_count = its 16-bit length,
// *l$index1[0] = LM longword holding that length (two VCQs per longword).
hbr_tx#:
	alu[hbr_count, hbr_count, -, 1]
	// Bit 0 of the VCQ number selects the halfword: even -> low, odd -> high
	alu[--, vcq_reg, and, 0x1]
	bne[hi_word#]
	ld_field[*l$index1[0], 0011, hbr_count]
	br[tx_cont#]
hi_word#:
	ld_field[*l$index1[0], 1100, hbr_count, <<16]
tx_cont#:
	// Build the dequeue request: VCQ number | port number | bit 31
	alu[vcq_reg, vcq_reg, or, in_port_no, <<QM_REQ_PORT_OFFSET]
	alu[$vcq, vcq_reg, or, 1, <<31]
	// $vcq is already complete, so write_out# returns straight to scratch_put#
	load_addr[rtn_reg, scratch_put#]	
	br[write_out#]

#endif	//HBR_EXCLUDED
///////////////////////////////////////////////////////////////////////

//The actual cell dequeue message is sent out to the QM from here
//This is executed after the thread returns from finishing the functions
//of "write out"
deq#:
	// Entry used by the RT/NRT/UBR paths: $vcq holds the TQ entry read from
	// SRAM; set bit 31 on it before it is put on the ring.
	alu[$vcq, $vcq, or, 1, <<31]

scratch_put#:

#ifdef DEBUG
	// Debug trace: {timestamp, 0xba marker, *l$index0[0], *l$index0[1]}
	// written to SRAM at @debug_base, which then advances by 16 bytes.
	local_csr_rd[TIMESTAMP_LOW]
	immed[debug_tmp, 0]
	alu[$debug0, --, B, debug_tmp]
	alu[$debug1, --, B, 0xba]
	alu[$debug2, --, B, *l$index0[0]]
	alu[$debug3, --, B, *l$index0[1]]
	alu[debug_tmp, --, B, @debug_base]
	sram[write, $debug0, debug_tmp, 0, 4], sig_done[debug_done]
	.io_completed debug_done
	alu[@debug_base, @debug_base, +, 16]
#endif

	// Ring full: go round via retry# and test again
    br_inp_state[ring_full, retry#]

	// Put the one-longword dequeue request on ring qmring_num and add the
	// completion signal to the set waited for at end#.
 	scratch[put, $vcq, 0, qmring_num, 1], sig_done[scratch_put_dn]
	alu[sig_mask, sig_mask, or, 1, <<&scratch_put_dn]

#ifdef FLOW_CONTROL
	; increment Schcount
	// Schcount is the field starting at bit 24 of *l$index0[1], hence @one24
	alu[*l$index0[1], *l$index0[1], +, @one24]
#endif
	
//check if the new tq has to be loaded into the Departure queue in LM.
//This is done by polling the load_newtq flag variable
//If so jump to "load new tq"
check_for_load_new_tq#:
	br=byte[load_newtq, 0, 1, load_new_tq#]

//end of the scheduler functions.
end#:
	// Sleep until every signal collected in sig_mask has arrived, then leave
   	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask]	
   	ctx_arb[--], br[end_sched#]
; -------------------------------------------------------------------------------

#ifdef FLOW_CONTROL
//If Flow Control, there is nothing to schedule for this 
//cell tx slot. Jump to write out.
FC_on#:
	; increment RT_for_the_LBR_TQ
	alu[*l$index0[1], @one16, +, *l$index0[1]]
	// No dequeue request this slot: write_out# returns past scratch_put#
	load_addr[rtn_reg, check_for_load_new_tq#]
	br[write_out#]

#endif
//retry to see if the scratch ring has space to accommodate
//the current deq request,
// NOTE(review): this loops back without yielding the context, and with DEBUG
// defined every retry repeats the debug trace write above.
retry#:
	br[scratch_put#]

;--------------------------------------------------------------------------------


// Reached from the top of the servicing loop when RT_for_the_LBR_TQ equals
// AGGREGATION_MASK; always rejoins the loop at continue#.
issue_read_newtq_and_txcount#:

//This macro issues a read for new rt, nrt & ubr time queue
//to be loaded into the departure rings and txcount 
//for flow control purposes.

	_issue_read_newtq_and_txcount(load_newtq, $newtqlen0, $newtqlen1, $newtqlen2, \
		sig_mask, $txcount, curtq)

///////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED
//Macro functionality: Check if it is time yet to poll for a 
//new HBR TQ in SRAM. A new HBR TQ will be available in SRAM whenever 
//a HBR VC arrives or leaves the system. If it is time, then, check if 
//a new HBR TQ is present in SRAM by examining a particular location 
//in SRAM. If new HBR TQ is present, load the TQ into LM, in
//place of the old one.

	_check_and_load_new_hbr_tq()
#endif
///////////////////////////////////////////////////////////////////////

//branch to where you left off.
	br[continue#]


// Reached from check_for_load_new_tq# when byte 0 of load_newtq equals 1;
// finishes the pass by branching to end#.
load_new_tq#:
//happens deterministically once every AGGREGATION slots
//Put the <TQnum, TQlen> into respective software rings 
//for real and non real time  and ubr traffic, if TQlen > 0.
//Increase the lengths of the DQ by the current TQ length.
//Also reset the flag variable load_newtq.

	_load_tq_into_dqrings(load_newtq, $newtqlen0, $newtqlen1, $newtqlen2, portinfo_addr, curtq)


#ifdef FLOW_CONTROL
//This macro Checks for flow control to be asserted or de-asserted
//on a port.

	_flow_control($txcount)

#endif

//Branch to the end of the scheduler.
	br[end#]


;--------------------------- WriteOut loop --------------------------------------
//The writeout functionality begins here.
//
// Entered from the scheduling paths with rtn_reg holding the address to
// resume at. Takes at most one two-longword message from the shaper
// (scratch ring on the scheduler MEs, next-neighbour ring otherwise) and
// dispatches on it; with no message it goes to no_write#.
write_out#:
.begin
.reg msg0		; First LW of the message from shaper
.reg msg1		; Second LW of the message from shaper
.reg code		; Class of Service
.reg tqnum		; Time Queue number
.reg cell_count	; Cell count (for UBRwPRI and HBR)
.reg ubrwpri_cache
.reg ubrwpri_addr_wo
.reg portinfo_addr_wo

	alu[*l$index0[14], --, B, curtq] ; update CurTQ (in case there is issue_load_new_tq.. running in background)
	; signal next thread
	local_csr_wr[SAME_ME_SIGNAL, next_thread_sig_csr_val]
#if defined(FIRST_SCHEDULER_ME) || defined(SECOND_SCHEDULER_ME)
	// Wait for the scratch get issued at schedule#. A zero first longword
	// is treated as "no message".
	ctx_arb[scratch_read_sig]
	alu[msg0, --, B, $wo_msg[0]]
	beq[no_write#]
	alu[$vcq_wo, --, b, msg0]
	alu[msg1, --, B, $wo_msg[1]]	

#else
	; if NN is empty, there is nothing to writeout
	br_inp_state[nn_empty, no_write#]
	; read NN
	alu[msg0, --, b, *n$index++]
	alu[$vcq_wo, --, b, msg0]
	alu[msg1, --, B, *n$index++]	

#endif
///////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED

//Check if the 	VC is HBR or LBR	
	// Bit 31 of the second message word set -> handled at hbr_ubr_wo#
	br_bset[msg1, 31, hbr_ubr_wo#]

#endif	//HBR_EXCLUDED
///////////////////////////////////////////////////////////////////////

	; load PortInfo for given port
	_get_port_entry_static_wo(msg0, @ubrwpri_lm_base, ubrwpri_addr_wo, portinfo_addr_wo)


#ifdef DEBUG
	// Debug trace: {timestamp, msg1, *l$index0[0], *l$index0[1]} written to
	// SRAM at @debug_base, which then advances by 16 bytes.
	local_csr_rd[TIMESTAMP_LOW]
	immed[debug_tmp, 0]
	alu[$debug0, --, B, debug_tmp]
	alu[$debug1, --, B, msg1]
	alu[$debug2, --, B, *l$index0[0]]
	alu[$debug3, --, B, *l$index0[1]]
	alu[debug_tmp, --, B, @debug_base]
	sram[write, $debug0, debug_tmp, 0, 4], sig_done[debug_done]
	.io_completed debug_done
	alu[@debug_base, @debug_base, +, 16]
#endif
//Writeout for low bit rate traffic
//Compute the time queue number from the timeslot 
//If the traffic is non-real time branch to nrt_write#
lbr_wo#:

#define_eval INDIRECT_SHIFT 	(28 - 2)	; we need code to be multiplied by 4
#define_eval INDIRECT_MASK		(0x7 << 2)	; mask lower bits

	// code = (3-bit class field at bits 28..30 of msg1) * 4, used as the
	// instruction offset into the jump table at check_code#.
	alu[code, INDIRECT_MASK, and, msg1, >>INDIRECT_SHIFT]

#undef INDIRECT_MASK
#undef INDIRECT_SHIFT

//Check if the cell belongs to real time or non real time VCQ
//by checking the CODE value.
//
// code is the class value times 4, so the jump lands on the first
// instruction of a table entry only if every entry is exactly four
// instructions long. Each non-UBR entry is: tqnum, one xxTQnum load chosen
// by #if, tqnumA, br.
// NOTE(review): if none of the streq() alternatives matches for a class,
// that entry shrinks to three instructions and the entries after it would
// be misaligned - confirm the CLASS_* definitions always select one.
// NOTE(review): the 3-bit class field allows value 7, for which there is
// no target; it would land on prio_write#. Confirm 7 is never sent.
check_code#:
.reg rtqnum nrtqnum ubrtqnum    //xxTQnum used in writeout
.reg tqnumA		; tqnum register in GPR A - placed here as an optimization

	//Check for :
	//UBR code = 	000		GCRA calculation not needed
	//CBR code = 	001		needed GCRA calculate
	//rtVBR code = 	010		GCRA calculated twice
	//nrtVBR code = 011		GCRA calculated twice
	//UBR w/PCR =	100		needed GCRA calculate
	//UBR w/MDCR =	101		needed GCRA calculate
	//GFR =			110		needed GFR_GCRA calculate
	jump[code, jump_table#],  targets[_UBR#, _CBR#, _rtVBR#, _nrtVBR#, \
							    	  _UBRwPCR#, _UBRwMDCR#, _GFR#] 

jump_table#:

// NOTE(review): in the UBR entry the ld_field_w_clr directly follows the
// ACTIVE_LM_ADDR_0 write. It reads byte 2 of *l$index0[4], the same field
// _ubr_prio_end# writes back after restoring portinfo_addr_wo, so it
// presumably still sees the port-info LM address because the CSR write has
// not taken effect yet. Do not separate or reorder these two instructions
// without confirming.
_UBR#:	   	alu[tqnum, @mask_upper13, AND, msg1]	;get number of TQ to be serviced
        	; Setting LM index for UBRwPRI table 
        	local_csr_wr[ACTIVE_LM_ADDR_0, ubrwpri_addr_wo] 
        	ld_field_w_clr[ubrwpri_cache, 0001, *l$index0[4], >>16]
            br[prio_write#]

_CBR#: 		alu[tqnum, @mask_upper13, AND, msg1]	;get number of TQ to be serviced
        #if   (streq('CLASS_CBR', 'RTQUEUE'))
        	alu[rtqnum, mask_upper16, AND, *l$index0[5], >>16] ; get RTQnum
        #elif (streq('CLASS_CBR', 'NRTQUEUE'))
        	alu[nrtqnum, mask_upper16, AND, *l$index0[5]] ; get NRTQnum
        #elif (streq('CLASS_CBR', 'UBRQUEUE'))
        	alu[ubrtqnum, mask_upper16, AND, *l$index0[6], >>16] ; get UBRTQnum
        #endif
            alu[tqnumA, @mask_upper13, AND, msg1]	;get number of TQ to be serviced (use in GPR A)
            br[CLASS_CBR]

_rtVBR#: 	alu[tqnum, @mask_upper13, AND, msg1]	;get number of TQ to be serviced
        #if   (streq('CLASS_RTVBR', 'RTQUEUE'))
        	alu[rtqnum, mask_upper16, AND, *l$index0[5], >>16] ; get RTQnum
        #elif (streq('CLASS_RTVBR', 'NRTQUEUE'))
        	alu[nrtqnum, mask_upper16, AND, *l$index0[5]] ; get NRTQnum
        #elif (streq('CLASS_RTVBR', 'UBRQUEUE'))
        	alu[ubrtqnum, mask_upper16, AND, *l$index0[6], >>16] ; get UBRTQnum
        #endif
            alu[tqnumA, @mask_upper13, AND, msg1]	;get number of TQ to be serviced (use in GPR A)
            br[CLASS_RTVBR]

_nrtVBR#: 	alu[tqnum, @mask_upper13, AND, msg1]	;get number of TQ to be serviced
        #if   (streq('CLASS_NRTVBR', 'RTQUEUE'))
        	alu[rtqnum, mask_upper16, AND, *l$index0[5], >>16] ; get RTQnum
        #elif (streq('CLASS_NRTVBR', 'NRTQUEUE'))
        	alu[nrtqnum, mask_upper16, AND, *l$index0[5]] ; get NRTQnum
        #elif (streq('CLASS_NRTVBR', 'UBRQUEUE'))
        	alu[ubrtqnum, mask_upper16, AND, *l$index0[6], >>16] ; get UBRTQnum
        #endif
            alu[tqnumA, @mask_upper13, AND, msg1]	;get number of TQ to be serviced (use in GPR A)
            br[CLASS_NRTVBR]

_UBRwPCR#: 	alu[tqnum, @mask_upper13, AND, msg1]	;get number of TQ to be serviced
        #if   (streq('CLASS_UBR_PCR', 'RTQUEUE'))
        	alu[rtqnum, mask_upper16, AND, *l$index0[5], >>16] ; get RTQnum
        #elif (streq('CLASS_UBR_PCR', 'NRTQUEUE'))
        	alu[nrtqnum, mask_upper16, AND, *l$index0[5]] ; get NRTQnum
        #elif (streq('CLASS_UBR_PCR', 'UBRQUEUE'))
        	alu[ubrtqnum, mask_upper16, AND, *l$index0[6], >>16] ; get UBRTQnum
        #endif
            alu[tqnumA, @mask_upper13, AND, msg1]	;get number of TQ to be serviced (use in GPR A)
            br[CLASS_UBR_PCR]

_UBRwMDCR#: alu[tqnum, @mask_upper13, AND, msg1]	;get number of TQ to be serviced
        #if   (streq('CLASS_UBR_MDCR', 'RTQUEUE'))
        	alu[rtqnum, mask_upper16, AND, *l$index0[5], >>16] ; get RTQnum
        #elif (streq('CLASS_UBR_MDCR', 'NRTQUEUE'))
        	alu[nrtqnum, mask_upper16, AND, *l$index0[5]] ; get NRTQnum
        #elif (streq('CLASS_UBR_MDCR', 'UBRQUEUE'))
        	alu[ubrtqnum, mask_upper16, AND, *l$index0[6], >>16] ; get UBRTQnum
        #endif
            alu[tqnumA, @mask_upper13, AND, msg1]	;get number of TQ to be serviced (use in GPR A)
            br[CLASS_UBR_MDCR]

_GFR#:		alu[tqnum, @mask_upper13, AND, msg1]	;get number of TQ to be serviced
        #if   (streq('CLASS_GFR', 'RTQUEUE'))
        	alu[rtqnum, mask_upper16, AND, *l$index0[5], >>16] ; get RTQnum
        #elif (streq('CLASS_GFR', 'NRTQUEUE'))
        	alu[nrtqnum, mask_upper16, AND, *l$index0[5]] ; get NRTQnum
        #elif (streq('CLASS_GFR', 'UBRQUEUE'))
        	alu[ubrtqnum, mask_upper16, AND, *l$index0[6], >>16] ; get UBRTQnum
        #endif
            alu[tqnumA, @mask_upper13, AND, msg1]	;get number of TQ to be serviced (use in GPR A)
            br[CLASS_GFR]

// ******************* UBR writeout *******************
// Adds the cell count carried in the message to the per-priority entry of
// the UBRwPRI table in LM (index0 was pointed at that table by the _UBR#
// jump-table entry), marks the priority as non-empty in ubrwpri_cache,
// stores the cache byte back into the port info and returns via rtn_reg.
prio_write#:
	;extract cell_count
	// shift left then right by the same amount clears the bits above the
	// cell-count field
	alu[cell_count, --, b, msg1, <<QM_REQ_CELL_COUNT_CLR_SHIFT]
	alu[cell_count, --, b, cell_count, >>QM_REQ_CELL_COUNT_CLR_SHIFT]	
	;extract priority
	// 3-bit priority sits directly above the cell-count field; value 0..7
	// selects _prio_1#.._prio_8#
	alu[temp, 0x7, and, msg1, >>QM_REQ_CELL_COUNT_LEN]
	jump[temp, prio_jump_table#],	targets[_prio_1#, _prio_2#, _prio_3#, \
											_prio_4#, _prio_5#, _prio_6#, \
											_prio_7#, _prio_8#]
// One instruction per entry: entry N is at prio_jump_table# + N
prio_jump_table#:
_prio_1#: br[_ubr_prio_1#]
_prio_2#: br[_ubr_prio_2#]
_prio_3#: br[_ubr_prio_3#]
_prio_4#: br[_ubr_prio_4#]
_prio_5#: br[_ubr_prio_5#]
_prio_6#: br[_ubr_prio_6#]
_prio_7#: br[_ubr_prio_7#]
_prio_8#: br[_ubr_prio_8#]

// The eight handlers below are identical except for the LM word
// (*l$index0[N-1]) and the cache bit (N-1) they touch. In each one:
//   - the old LM word is added to cell_count, and the left shift by
//     VCQ_NUMBER_OF_BITS drops whatever was in its top bits;
//   - dbl_shf then rebuilds the word with the low VCQ_NUMBER_OF_BITS bits
//     of msg0 on top and the summed count below (assuming dbl_shf shifts
//     the concatenation msg0:cell_count right - confirm against the ISA
//     reference).
_ubr_prio_1#:
	;increment cell count
	alu[cell_count, cell_count, +, *l$index0[0]]	
	alu[cell_count, --, b, cell_count, <<VCQ_NUMBER_OF_BITS]
	dbl_shf[temp, msg0, cell_count, >>VCQ_NUMBER_OF_BITS]
	alu[*l$index0[0], --, b, temp]		;store cell count
	alu[ubrwpri_cache, ubrwpri_cache, or, 1]				;set bit in the UBRwPRI cache byte
	br[_ubr_prio_end#]
_ubr_prio_2#:
	;increment cell count
	alu[cell_count, cell_count, +, *l$index0[1]]	
	alu[cell_count, --, b, cell_count, <<VCQ_NUMBER_OF_BITS]
	dbl_shf[temp, msg0, cell_count, >>VCQ_NUMBER_OF_BITS]
	alu[*l$index0[1], --, b, temp]		;store cell count
	alu[ubrwpri_cache, ubrwpri_cache, or, 1, <<1]		;set bit in the UBRwPRI cache byte
	br[_ubr_prio_end#]
_ubr_prio_3#:
	;increment cell count
	alu[cell_count, cell_count, +, *l$index0[2]]	
	alu[cell_count, --, b, cell_count, <<VCQ_NUMBER_OF_BITS]
	dbl_shf[temp, msg0, cell_count, >>VCQ_NUMBER_OF_BITS]
	alu[*l$index0[2], --, b, temp]		;store cell count
	alu[ubrwpri_cache, ubrwpri_cache, or, 1, <<2]		;set bit in the UBRwPRI cache byte
	br[_ubr_prio_end#]
_ubr_prio_4#:
	;increment cell count
	alu[cell_count, cell_count, +, *l$index0[3]]	
	alu[cell_count, --, b, cell_count, <<VCQ_NUMBER_OF_BITS]
	dbl_shf[temp, msg0, cell_count, >>VCQ_NUMBER_OF_BITS]
	alu[*l$index0[3], --, b, temp]		;store cell count
	alu[ubrwpri_cache, ubrwpri_cache, or, 1, <<3]		;set bit in the UBRwPRI cache byte
	br[_ubr_prio_end#]
_ubr_prio_5#:
	;increment cell count
	alu[cell_count, cell_count, +, *l$index0[4]]	
	alu[cell_count, --, b, cell_count, <<VCQ_NUMBER_OF_BITS]
	dbl_shf[temp, msg0, cell_count, >>VCQ_NUMBER_OF_BITS]
	alu[*l$index0[4], --, b, temp]		;store cell count
	alu[ubrwpri_cache, ubrwpri_cache, or, 1, <<4]		;set bit in the UBRwPRI cache byte
	br[_ubr_prio_end#]
_ubr_prio_6#:
	;increment cell count
	alu[cell_count, cell_count, +, *l$index0[5]]	
	alu[cell_count, --, b, cell_count, <<VCQ_NUMBER_OF_BITS]
	dbl_shf[temp, msg0, cell_count, >>VCQ_NUMBER_OF_BITS]
	alu[*l$index0[5], --, b, temp]		;store cell count
	alu[ubrwpri_cache, ubrwpri_cache, or, 1, <<5]		;set bit in the UBRwPRI cache byte
	br[_ubr_prio_end#]
_ubr_prio_7#:
	;increment cell count
	alu[cell_count, cell_count, +, *l$index0[6]]	
	alu[cell_count, --, b, cell_count, <<VCQ_NUMBER_OF_BITS]
	dbl_shf[temp, msg0, cell_count, >>VCQ_NUMBER_OF_BITS]
	alu[*l$index0[6], --, b, temp]		;store cell count
	alu[ubrwpri_cache, ubrwpri_cache, or, 1, <<6]		;set bit in the UBRwPRI cache byte
	br[_ubr_prio_end#]
_ubr_prio_8#:
	;increment cell count
	alu[cell_count, cell_count, +, *l$index0[7]]	
	alu[cell_count, --, b, cell_count, <<VCQ_NUMBER_OF_BITS]
	dbl_shf[temp, msg0, cell_count, >>VCQ_NUMBER_OF_BITS]
	alu[*l$index0[7], --, b, temp]		;store cell count
	alu[ubrwpri_cache, ubrwpri_cache, or, 1, <<7]		;set bit in the UBRwPRI cache byte
	br[_ubr_prio_end#]

_ubr_prio_end#:
	// Point LM index0 back at the port info so the cache byte can be stored.
	// NOTE(review): the wakeup-events write, the sig_mask reset and the nop
	// separate this CSR write from the *l$index0 store below; they
	// presumably cover its latency, so keep all three in place.
	local_csr_wr[ACTIVE_LM_ADDR_0, portinfo_addr_wo] 
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask]	
	// sig_mask has been handed to the CSR; reset it for the next pass
	alu[sig_mask, --, B, 1, <<&next_thread_sig]
	nop
	// byte 0 of ubrwpri_cache -> byte 2 of port-info word 4
	ld_field[*l$index0[4], 0100, ubrwpri_cache, <<16]
	ctx_arb[--]
	.io_completed scratch_read_dn sram_read_dn_sched \
	sram_read_dn_newtq0 sram_read_dn_newtq1 sram_read_dn_newtq2
	local_csr_wr[SAME_ME_SIGNAL, next_thread_sig_csr_val]
	rtn[rtn_reg]
// ******************* End of UBR writeout *******************

// ******************* RTQ writeout *******************
//Write out for Real time (rtVBR) low bit rate traffic.
//Atomic increment the length of the timequeue in SRAM
//Sleep until the length of the time queue returns.
//
// Expects tqnum, tqnumA and rtqnum as set up by the check_code# table and
// LM index0 pointing at the port info of the port being written.
// There is no branch after _write_lbr; it receives rtn_reg, so it
// presumably returns through it - confirm in the macro definition.
rt_write#:
.begin
.reg tql_offset_for_port tql_base_for_port

	; get TQlen_offset for given port
#define_eval INDIRECT_SHIFT 	(16 - TQLEN_ENTRY_SIZE_SHIFT)	; we need tq_offset_for_port to be in bytes
	alu[tql_offset_for_port, mask_tqlofs, AND, *l$index0[2], >>INDIRECT_SHIFT]	; TQlen offset in bytes
#undef INDIRECT_SHIFT

	; calculate offset from the beginning of TQ length table for that port
	alu[tql_base_for_port, _rtqlen_sram_base, +, tql_offset_for_port]

	// *l$index0[14] is the CurTQ value stored at write_out# entry
	_check_shift(tqnum, tqnumA, rtqnum, *l$index0[14])

	; Place request in TQ
	_write_lbr(sig_mask, tqnum, tql_base_for_port, @_rtq_sram_base, $vcq_wo, \
			rtn_reg, sram_read_dn0_wo, sram_write_dn0, TQ_SIZE_SHIFT, TQ_MAX_LEN, TQLEN_ENTRY_SIZE_SHIFT)

.end	
// ******************* End of RTQ writeout *******************


// ******************* NRTQ writeout *******************
//Write out for Non Real time (nrtVBR) low bit rate traffic.
//Atomic increment the length of the timequeue in SRAM
//Sleep until the length of the time queue returns.
//
// Expects tqnum, tqnumA and nrtqnum as set up by the check_code# table and
// LM index0 pointing at the port info of the port being written.
// There is no branch after _write_lbr; it receives rtn_reg, so it
// presumably returns through it - confirm in the macro definition.
nrt_write#:
.begin
.reg tql_offset_for_port tql_base_for_port

	; get TQlen_offset for given port
#define_eval INDIRECT_SHIFT 	(16 - TQLEN_ENTRY_SIZE_SHIFT)	; we need tq_offset_for_port to be in bytes
	alu[tql_offset_for_port, mask_tqlofs, AND, *l$index0[2], >>INDIRECT_SHIFT]	; TQlen offset in bytes
#undef INDIRECT_SHIFT

	; calculate offset from the beginning of TQ length table for that port
	alu[tql_base_for_port, _nrtqlen_sram_base, +, tql_offset_for_port]

	// *l$index0[14] is the CurTQ value stored at write_out# entry
	_check_shift(tqnum, tqnumA, nrtqnum, *l$index0[14])

	; Place request in TQ
	_write_lbr(sig_mask, tqnum, tql_base_for_port, @_nrtq_sram_base, $vcq_wo, \
			rtn_reg, sram_read_dn0_wo, sram_write_dn0, TQ_SIZE_SHIFT, TQ_MAX_LEN, TQLEN_ENTRY_SIZE_SHIFT)

.end	
// ******************* End of NRTQ writeout *******************


// ******************* UBRTQ writeout *******************
//Write out for UBR w/PCR & w/MDCR low bit rate traffic.
//Atomic increment the length of the timequeue in SRAM
//Sleep until the length of the time queue returns.
//
// Expects tqnum, tqnumA and ubrtqnum as set up by the check_code# table and
// LM index0 pointing at the port info of the port being written.
// There is no branch after _write_lbr; it receives rtn_reg, so it
// presumably returns through it - confirm in the macro definition.
ubr_write#:
.begin
.reg tql_offset_for_port tql_base_for_port

	; get TQlen_offset for given port
#define_eval INDIRECT_SHIFT 	(16 - TQLEN_ENTRY_SIZE_SHIFT)	; we need tq_offset_for_port to be in bytes
	alu[tql_offset_for_port, mask_tqlofs, AND, *l$index0[2], >>INDIRECT_SHIFT]	; TQlen offset in bytes
#undef INDIRECT_SHIFT

	; calculate offset from the beginning of TQ length table for that port
	alu[tql_base_for_port, _ubrtqlen_sram_base, +, tql_offset_for_port]

	// *l$index0[14] is the CurTQ value stored at write_out# entry
	_check_shift(tqnum, tqnumA, ubrtqnum, *l$index0[14])

	; Place request in TQ
	_write_lbr(sig_mask, tqnum, tql_base_for_port, @_ubrtq_sram_base, $vcq_wo, \
			rtn_reg, sram_read_dn0_wo, sram_write_dn0, TQ_SIZE_SHIFT, TQ_MAX_LEN, TQLEN_ENTRY_SIZE_SHIFT)

.end	

// ******************* End of UBRTQ writeout *******************

//Nothing to write out. Just do the signaling and go back to scheduler
//in the end.
no_write#:
	// Making threads to run synchronously
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask]	
	ctx_arb[--]

	.io_completed scratch_read_dn sram_read_dn_sched \
	sram_read_dn_newtq0 sram_read_dn_newtq1 sram_read_dn_newtq2
	local_csr_wr[SAME_ME_SIGNAL, next_thread_sig_csr_val]
	alu[sig_mask, --, B, 1, <<&next_thread_sig]
	rtn[rtn_reg]

///////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED

//Writeout for High Bit Rate
//Bump up the cell count (in LM) for the VCQ by the number of cells
//that are enqueued for the VCQ.
hbr_ubr_wo#:	
	alu[temp, --, b, msg0, <<VCQ_NUMBER_OF_BITS_CLR_SHIFT]
#define_eval HBR_SHIFT (VCQ_NUMBER_OF_BITS_CLR_SHIFT -1)
	alu[temp, --, b, temp, >>HBR_SHIFT]
	alu[lm_offset, _vcqlen_lm_base, +, temp]
	local_csr_wr[ACTIVE_LM_ADDR_1, lm_offset]  
	;extract cell_count
	alu[cell_count, --, b, msg1, <<QM_REQ_CELL_COUNT_CLR_SHIFT]
	alu[cell_count, --, b, cell_count, >>QM_REQ_CELL_COUNT_CLR_SHIFT]
	;increment and store cell_count
	alu[--, msg0, and, 0x1]
	bne[hi_word_wo#]
	alu[*l$index1[0], cell_count, +, *l$index1[0]]
	br[wo_cont#]
hi_word_wo#:
	ld_field_w_clr[hbr_count, 0011, *l$index1[0], >>16]	;extract stored cell count
	alu[hbr_count, cell_count, +, hbr_count]
	ld_field[*l$index1[0], 1100, hbr_count, <<16]

wo_cont#:
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask]
	ctx_arb[--]
	alu[sig_mask, --, b, 0]
	.io_completed scratch_read_dn sram_read_dn_sched \
	sram_read_dn_newtq0 sram_read_dn_newtq1 sram_read_dn_newtq2
	local_csr_wr[SAME_ME_SIGNAL, next_thread_sig_csr_val]
	alu[sig_mask, --, B, 1, <<&next_thread_sig]
	rtn[rtn_reg]

#endif	//HBR_EXCLUDED
///////////////////////////////////////////////////////////////////////
.end
;--------------------------- The end of WriteOut loop --------------------------------
end_sched#:
	.io_completed scratch_put_dn sram_write_dn0
.end 
#endm
//------------------------------------------------------------------
// _issue_read_newtq_and_txcount()
//
//    Description: This macro issues a read for new rt, nrt & ubr time queue
//					to be loaded into the departure rings and txcount 
//					for flow control purposes.
//
//    Parameters: None.
//		  Inputs:
//			flag_var:  Flag variable to indicate that a time queue read
//					   has been issued
//			rtqlen, nrtqlen ubrqlen:  Real time/Non Real/UBR time queue lengths 
//					          that needs to be read from sram
//			signal_mask: the global signal mask
//			xmitcount:  the transmit count that needs to be read from sram.
// 		 Outputs:
//			None.
//------------------------------------------------------------------

#macro _issue_read_newtq_and_txcount(flag_var, rtqlen, nrtqlen, ubrqlen, \
				signal_mask, xmitcount, curtq)
.begin

// Summary:
//   1. Sets flag_var to 1.
//   2. Checks the RT, NRT and UBR departure-queue rings (producer/consumer
//      indices held in *l$index0[3] and *l$index0[2]); if advancing any
//      producer by one would make it equal to its consumer, the macro jumps to
//      lm_ringfull#, sets flag_var back to 0 and issues no reads.
//   3. Otherwise computes curtq = (*l$index0[14] + 1) AND *l$index0[4] and
//      issues three sram test_and_clr operations (one per TQ length table),
//      plus, under FLOW_CONTROL, a one-word scratch read into xmitcount.
//   4. ORs the bits of every signal it is waiting on into signal_mask.
// The macro does not wait for the reads; it only issues them.
//
// NOTE(review): tql_offset is written below but is not declared in this macro,
// so it has to be visible from the scope in which the macro is expanded.
// NOTE(review): max_tq_mask and tq_ofs are declared but never referenced here;
// tmp is only used as a throw-away destination for the ring-full compares.
.reg tmp producer consumer
.reg max_tq_mask tq_ofs rt_base, nrt_base ubrt_base 
.reg tql_offset_for_port

//set load_newtq flag variable to one, indicating that the
//SRAM read for loading new tq will be issued shortly.
	alu[flag_var, --, b, 1]

//check if the departure queue rings are full. If yes,
//you cannot load the new tq into the departure queue.
	; get rt_producer (bits 31:24 of *l$index0[3]) and rt_consumer (bits 23:16)
	alu[producer, RTDQ_SIZE_MASK, AND, *l$index0[3], >>24]
	alu[consumer, RTDQ_SIZE_MASK, AND, *l$index0[3], >>16]
	alu[producer, 0x1, +, producer]
	;mask it so the incremented index wraps inside the ring
	alu[producer, RTDQ_SIZE_MASK, AND, producer]
	; ring is full when (producer+1) masked by RTDQ_SIZE_MASK == consumer
	alu[tmp, consumer, -, producer]
	beq[lm_ringfull#]

	; get nrt_producer (bits 15:8 of *l$index0[3]) and nrt_consumer (bits 7:0)
	alu[producer, NRTDQ_SIZE_MASK, AND, *l$index0[3], >>8]
	alu[consumer, NRTDQ_SIZE_MASK, AND, *l$index0[3]]
	alu[producer, 0x1, +, producer]
	;mask it so the incremented index wraps inside the ring
	alu[producer, NRTDQ_SIZE_MASK, AND, producer]
	; ring is full when (producer+1) masked by NRTDQ_SIZE_MASK == consumer
	alu[tmp, consumer, -, producer]
	beq[lm_ringfull#]

	; get ubr_producer (bits 15:8 of *l$index0[2]) and ubr_consumer (bits 7:0)
	alu[producer, UBRDQ_SIZE_MASK, AND, *l$index0[2], >>8]
	alu[consumer, UBRDQ_SIZE_MASK, AND, *l$index0[2]]
	alu[producer, 0x1, +, producer]
	;mask it so the incremented index wraps inside the ring
	alu[producer, UBRDQ_SIZE_MASK, AND, producer]
	; ring is full when (producer+1) masked by UBRDQ_SIZE_MASK == consumer
	alu[tmp, consumer, -, producer]
	beq[lm_ringfull#]

	; next time queue number = stored CurTQ (*l$index0[14]) + 1
	alu[curtq, *l$index0[14], +, 1]
	alu[curtq, curtq, AND, *l$index0[4]]	; maintain wrap-around

// go_on# and len_read# are not branch targets inside this macro.
go_on#:
// Issue reads for the lengths of the RT, NRT and UBR time queues of curtq from SRAM
len_read#:
	alu[tql_offset, --, b, curtq, <<TQLEN_ENTRY_SIZE_SHIFT]	; we need TQlen offset in bytes
	; set clear masks to 0xffffffff (test_and_clr clears the bits set in the mask)
	alu[rtqlen, --, ~b, 0]
	alu[nrtqlen, --, ~b, 0]
	alu[ubrqlen, --, ~b, 0]
	; get offset to TQlen table for this port
#define_eval INDIRECT_SHIFT 	(16 - TQLEN_ENTRY_SIZE_SHIFT)	; we need tql_offset_for_port to be in bytes
#define_eval INDIRECT_MASK		((1 << TQLEN_ENTRY_SIZE_SHIFT)-1)	; mask lower 2 bits

	; (*l$index0[2] >> INDIRECT_SHIFT) with the low TQLEN_ENTRY_SIZE_SHIFT bits cleared
	alu[tql_offset_for_port, INDIRECT_MASK, ~AND, *l$index0[2], >>INDIRECT_SHIFT] ; to get TQ offset in bytes

#undef INDIRECT_MASK
#undef INDIRECT_SHIFT

	; per-class TQ length table base for this port
	alu[rt_base, _rtqlen_sram_base, +, tql_offset_for_port]
	alu[nrt_base, _nrtqlen_sram_base, +, tql_offset_for_port]
	alu[ubrt_base, _ubrtqlen_sram_base, +, tql_offset_for_port]

	; read-and-clear the three lengths; completion is reported through the sig_done signals
	sram[test_and_clr, rtqlen, rt_base, tql_offset], sig_done[sram_read_dn_newtq0]
	sram[test_and_clr, nrtqlen, nrt_base, tql_offset], sig_done[sram_read_dn_newtq1]
	sram[test_and_clr, ubrqlen, ubrt_base, tql_offset], sig_done[sram_read_dn_newtq2]


#ifdef FLOW_CONTROL
//read the number of cells transmitted on this port from the scratch location 
//and get back to the scheduler loop. This value will be used for flow control
//ops later on
txcount_read#:
	scratch[read, xmitcount, 0, _cells_sent_scratch, 1], sig_done[scratch_read_dn]
	alu[signal_mask, signal_mask, or, 1, <<&scratch_read_dn]	
#endif

	; each test_and_clr contributes two bits to the wakeup mask: the signal and the one after it
	alu[signal_mask, signal_mask, or, 1, <<&sram_read_dn_newtq0]
   	alu[signal_mask, signal_mask, or, 1, <<(&sram_read_dn_newtq0+1)]
	alu[signal_mask, signal_mask, or, 1, <<&sram_read_dn_newtq1]
	alu[signal_mask, signal_mask, or, 1, <<(&sram_read_dn_newtq1+1)]
	alu[signal_mask, signal_mask, or, 1, <<&sram_read_dn_newtq2]
	alu[signal_mask, signal_mask, or, 1, <<(&sram_read_dn_newtq2+1)]
	br[end_macro#]

//At least one of the DQ rings is full. Can't load new time queues.
//Hence set the flag to zero and go back to where you left off
lm_ringfull#:
	alu[flag_var, --, b, 0]
	; set RT_for_the_LBR_TQ to zero (byte 2 of *l$index0[1]).
	ld_field[*l$index0[1], 0100, 0] 

end_macro#:
.end
#endm



//------------------------------------------------------------------
// _load_tq_into_dqrings()
//
//    Description: Move the time queue pointer and length into the real time,
//					non-real time and UBR departure queues in the local
//					memory of the scheduler
//
//    Parameters: None.
//		  Inputs:
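//				flag_var: flag variable that indicates whether departure queues
//				have to be loaded or not; this macro clears it to zero
//				rtqlen, nrtqlen, ubrqlen: the real time, non-real time and UBR
//				time queue lengths read from SRAM
//				in_serv_port_lm_base: LM address written to ACTIVE_LM_ADDR_0
//				while the macro runs (the previous value is restored on exit)
//				in_cut_tq: time queue number stored in each departure queue entry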
//			
// 		 Outputs:
//			None.
//------------------------------------------------------------------

#macro _load_tq_into_dqrings(flag_var, rtqlen, nrtqlen, ubrqlen, in_serv_port_lm_base, in_cut_tq)
.begin

// Summary: for each of the three classes (RT, NRT, UBR) whose length is
// non-zero, appends one entry (in_cut_tq | length << 16) at the producer
// position of that class' departure-queue ring in local memory, advances the
// producer index and adds the length to the class' running 16-bit total.
// Lengths of TQ_MAX_LEN or more are clamped to TQ_MAX_LEN first.
// ACTIVE_LM_ADDR_0 is pointed at in_serv_port_lm_base for the duration of the
// macro and restored afterwards; ACTIVE_LM_ADDR_1 is left pointing at the last
// ring slot written.
//
// NOTE(review): tq_len and dq_lm_offset are used but not declared here; they
// must be visible from the scope in which the macro is expanded.
// NOTE(review): the local register temp is declared but never referenced.
// NOTE(review): the 0x3c slot mask limits the byte offset to 16 four-byte
// entries per ring - confirm against the *DQ_SIZE_MASK values.
.reg lmem_ofs temp old_lm
.reg rtdq_lm_producer nrtdq_lm_producer ubrtdq_lm_producer
.reg dq_len
	; set lm base to serviced port
	; (local_csr_rd + immed pair: old_lm receives the current ACTIVE_LM_ADDR_0)
	local_csr_rd[ACTIVE_LM_ADDR_0]
	immed[old_lm, 0]
	local_csr_wr[ACTIVE_LM_ADDR_0, in_serv_port_lm_base]
	; three instructions pass before *l$index0 is used with the new base
	nop
	nop
	nop

	; set RT_for_the_LBR_TQ to zero (byte 2 of *l$index0[1]).
	ld_field[*l$index0[1], 0100, 0] 

	alu[flag_var, --, b, 0]

//Do the processing for real time time queue
//If the time queue length is zero, no need to move into
//Departure queue. Just move on to process the non real time 
//time queue
	rt_ring#:  
		alu[tq_len, --, b, rtqlen]
	  	beq[nrt_ring#]
		alu[--, tq_len, -, TQ_MAX_LEN]	
		bge[rt_ring_full#]	; clamp tq_len to TQ_MAX_LEN
	rt_ring_continue#:
		; loading rtdq_producer (bits 31:24 of *l$index0[3]) as a byte offset
		alu[rtdq_lm_producer, 0x3c, AND, *l$index0[3], >>22]	// >>24 and <<2 to get address in bytes
		alu[lmem_ofs, dq_lm_offset, +, rtdq_lm_producer]	; dq_lm_offset - DQ table base address for given port
		local_csr_wr[ACTIVE_LM_ADDR_1, lmem_ofs]
		; advance the producer by one entry, wrap with RTDQ_SIZE_MASK and store it back
		alu[rtdq_lm_producer, rtdq_lm_producer, +, 4]
		alu[rtdq_lm_producer, RTDQ_SIZE_MASK, AND, rtdq_lm_producer, >>2]
		ld_field[*l$index0[3], 1000, rtdq_lm_producer, <<24]

//write the time queue pointer and the time queue length
//into the departure queue
	rt_write_ring#:
		alu[*l$index1[0], in_cut_tq, or, tq_len, <<16]
		; increment DQRTlen (low 16 bits of *l$index0[6]) by the length of the current TQ
		alu[dq_len, mask_upper16, AND, *l$index0[6]]
		alu[dq_len, tq_len, +, dq_len]
		ld_field[*l$index0[6], 0011, dq_len]

//Do the processing for non real time time queue
//If the time queue length is zero, no need to move into
//Departure queue.
	nrt_ring#:
		alu[tq_len, --, b, nrtqlen]
		beq[ubr_ring#]
		alu[--, tq_len, -, TQ_MAX_LEN]	
		bge[nrt_ring_full#]	; clamp tq_len to TQ_MAX_LEN
	nrt_ring_continue#:
		; loading nrtdq_producer (bits 15:8 of *l$index0[3]) as a byte offset
		alu[nrtdq_lm_producer, 0x3c, AND, *l$index0[3], >>6]	// >>8 and <<2 to get address in bytes
		alu[lmem_ofs, dq_lm_offset, +, nrtdq_lm_producer]	; dq_lm_offset - DQ table base address for given port
		alu[lmem_ofs, NRTDQ_LM_OFFSET, +, lmem_ofs]				; offset from RTdq
		local_csr_wr[ACTIVE_LM_ADDR_1, lmem_ofs]
		; advance the producer by one entry, wrap with NRTDQ_SIZE_MASK and store it back
		alu[nrtdq_lm_producer, nrtdq_lm_producer, +, 4]
		alu[nrtdq_lm_producer, NRTDQ_SIZE_MASK, AND, nrtdq_lm_producer, >>2]
		ld_field[*l$index0[3], 0010, nrtdq_lm_producer, <<8]

//write the time queue pointer and the time queue length
//into the departure queue
	nrt_write_ring#:
		alu[*l$index1[0], in_cut_tq, or, tq_len, <<16]
		; increment DQNRTlen (high 16 bits of *l$index0[7]) by the length of the current TQ
		alu[dq_len, --, B, *l$index0[7], >>16]
		alu[dq_len, tq_len, +, dq_len]
		ld_field[*l$index0[7], 1100, dq_len, <<16]

//Do the processing for ubr time queue
//If the time queue length is zero, no need to move into
//Departure queue. 
	ubr_ring#:    
		alu[tq_len, --, b, ubrqlen]  
		beq[end_macro#]
		alu[--, tq_len, -, TQ_MAX_LEN]	
		bge[ubrt_ring_full#]	; clamp tq_len to TQ_MAX_LEN
	ubrt_ring_continue#:
		; loading ubrdq_producer (bits 15:8 of *l$index0[2]) as a byte offset
		alu[ubrtdq_lm_producer, 0x3c, AND, *l$index0[2], >>6]	// >>8 and <<2 to get address in bytes
		alu[lmem_ofs, dq_lm_offset, +, ubrtdq_lm_producer]	; dq_lm_offset - DQ table base address for given port
		alu[lmem_ofs, UBRDQ_LM_OFFSET, +, lmem_ofs]				; offset from RTdq
		local_csr_wr[ACTIVE_LM_ADDR_1, lmem_ofs]
		; advance the producer by one entry, wrap with UBRDQ_SIZE_MASK and store it back
		alu[ubrtdq_lm_producer, ubrtdq_lm_producer, +, 4]
		alu[ubrtdq_lm_producer, UBRDQ_SIZE_MASK, AND, ubrtdq_lm_producer, >>2]
		ld_field[*l$index0[2], 0010, ubrtdq_lm_producer, <<8]


//write the time queue pointer and the time queue length
//into the departure queue
	ubr_write_ring#:	
		alu[*l$index1[0], in_cut_tq, or, tq_len, <<16]
		; increment DQUBRTlen (low 16 bits of *l$index0[7]) by the length of the current TQ
		alu[dq_len, mask_upper16, AND, *l$index0[7]]
		alu[dq_len, tq_len, +, dq_len]
		ld_field[*l$index0[7], 0011, dq_len]
		br[end_macro#]

; Out-of-line clamps: the stored length was TQ_MAX_LEN or more, so use TQ_MAX_LEN.
rt_ring_full#:
		alu[tq_len, --, B, TQ_MAX_LEN]
		br[rt_ring_continue#]

nrt_ring_full#:
		alu[tq_len, --, B, TQ_MAX_LEN]
		br[nrt_ring_continue#]

ubrt_ring_full#:
		alu[tq_len, --, B, TQ_MAX_LEN]
		br[ubrt_ring_continue#]

end_macro#:
	; restore the LM index0 base saved on entry
	local_csr_wr[ACTIVE_LM_ADDR_0, old_lm]
	nop
	nop
	nop
.end
#endm


//------------------------------------------------------------------
// _flow_control()
//
//    Description: Checks for flow control to be asserted or de-asserted
//					on a port.
//
//    Parameters: None.
//		  Inputs:
//			xmitcount: The transmit count read from scratch memory
// 		 Outputs:
//			None.
//------------------------------------------------------------------

#macro _flow_control(xmitcount)
.begin

.reg _fc_backlog		; Schcount - xmitcount, corrected for counter wrap

// Flow control decision for the port addressed by LM index0:
//   backlog = Schcount - xmitcount, where Schcount is the top byte of
//   *l$index0[1]. A negative backlog is treated as a wrap of the counter and
//   256 is added to it.
//   backlog >= FC_DELTA  -> set bit 0 of *l$index0[1] (flow control on)
//   backlog <  FC_DELTA  -> clear bit 0 of *l$index0[1] (flow control off)
// NOTE(review): temp is not declared in this macro; it comes from the scope in
// which the macro is expanded.

	; Schcount lives in bits 31:24 of PortInfo word 1
	alu[temp, 0xff, AND, *l$index0[1], >>24]
	; backlog = Schcount - xmitcount
	alu[_fc_backlog, temp, -, xmitcount]
	bge[_fc_compare#]
	; negative result: add 256, done in two steps (255, then 1)
	alu[_fc_backlog, _fc_backlog, +, 255]
	alu[_fc_backlog, _fc_backlog, +, 1]

_fc_compare#:
	alu[--, _fc_backlog, -, FC_DELTA]
	blt[_fc_deassert#]
	; backlog reached FC_DELTA: assert the FC bit
	alu[*l$index0[1], *l$index0[1], OR, 1] 
	br[_fc_done#]

_fc_deassert#:
	; backlog is below FC_DELTA: drop the FC bit
	alu[*l$index0[1], *l$index0[1], AND~, 1] 

_fc_done#:
.end
#endm


///////////////////////////////////////////////////////////////////////
#ifndef HBR_EXCLUDED

//------------------------------------------------------------------
// _check_and_load_new_hbr_tq()
//
//    Description: 	Check if it is time yet to poll for a new HBR TQ in SRAM
//					A new HBR TQ will be available in SRAM whenever a HBR VC arrives
//					or leaves the system.IF it is time, then, check if a new HBR TQ
//					is present in SRAM by examining a particular location in SRAM.
//					If new HBR TQ is present, load the TQ into LM, in
//					place of the old one.
//
//    Parameters: None.
//		  Inputs:
//			None.
// 		 Outputs:
//			None.
//------------------------------------------------------------------

#macro _check_and_load_new_hbr_tq()
.begin
.reg _hbr_tq_sram_indicator_base 
.sig sram_read_dn_indicator

// Summary: advances the absolute register @reload_hbrtq_timer by AGGREGATION.
// While @reload_hbrtq_timer_max is still greater than the timer, nothing else
// happens. Once the timer has reached the maximum it is reset to zero and the
// SRAM word at HBR_TQ_SRAM_INDICATOR is read and cleared atomically; if the
// value read back is non-zero, _read_hbr_tq_sram() is invoked.

	alu[@reload_hbrtq_timer, @reload_hbrtq_timer, +, AGGREGATION]
	; exit while (max - timer) > 0
	alu[--, @reload_hbrtq_timer_max, -, @reload_hbrtq_timer]
	bgt[end_macro#]

	; polling interval elapsed: restart the timer
	alu[@reload_hbrtq_timer, --, b, 0]

	xbuf_alloc($indicator, 1, read_write)

	//read SRAM to see if there is an indication from Xscale that the HBR TQ
	//has changed
	//If yes read it.
	//If not, exit
	; all-ones clear mask for test_and_clr; the previous SRAM value comes back in $indicator[0]
	alu[$indicator[0], --, ~b, 0]
	immed32(_hbr_tq_sram_indicator_base, HBR_TQ_SRAM_INDICATOR)
	sram[test_and_clr, $indicator[0], _hbr_tq_sram_indicator_base, 0], sig_done[sram_read_dn_indicator]
	; this context sleeps here until the indicator read completes
	ctx_arb[sram_read_dn_indicator]    

	alu[--, --, b, $indicator[0]]
	; NOTE(review): this branch skips the xbuf_free below - confirm that
	; xbuf_alloc/xbuf_free only open/close a lexical scope and emit no code.
	beq[end_macro#]

    xbuf_free($indicator)

	_read_hbr_tq_sram()

end_macro#:
.end
#endm

#endif
///////////////////////////////////////////////////////////////////////

//------------------------------------------------------------------
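// _get_port_entry_static(in_port_no, in_dq_lm_base, in_ubrwpri_lm_base,
//                        out_portinfo_lm_base, out_ubrwpri_lm_base, out_curtq)
//
//    Description: 	Static variant: the per port tables are addressed in LM
//					directly from the port number (no CAM lookup and no SRAM
//					read is performed by this macro).
//
//    Parameters: 
//		  Inputs: in_port_no - 0...15
//				  in_dq_lm_base - added to the PortInfo LM address to get the DQ table address
//				  in_ubrwpri_lm_base - LM base of the UBRwPRI tables (32 bytes per port)
// 		 Outputs: out_portinfo_lm_base - base address for PortInfo in LM
//				  out_ubrwpri_lm_base - base address of the port's UBRwPRI table in LM
//				  out_curtq - copy of *l$index0[14] for the selected port
//					Sets LM index0/index1 base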
//------------------------------------------------------------------
#macro _get_port_entry_static(in_port_no, in_dq_lm_base, in_ubrwpri_lm_base, out_portinfo_lm_base, out_ubrwpri_lm_base, out_curtq)

.begin
// Summary: computes the local-memory addresses for one port and programs both
// LM index registers:
//   ACTIVE_LM_ADDR_0 <- PORTINFO_LM_BASE + (in_port_no << PORTINFO_ENTRY_SIZE_LOG2)
//   ACTIVE_LM_ADDR_1 <- in_dq_lm_base + that PortInfo address
// and returns the PortInfo address, the UBRwPRI table address
// (in_ubrwpri_lm_base + in_port_no * 32) and *l$index0[14] of the new port.
// NOTE(review): dq_lm_offset is written here but not declared in this macro;
// it must be visible from the scope in which the macro is expanded.
// NOTE(review): tmp is only referenced when PORTINFO_LM_BASE > 0.
.reg lm_offset tmp

	; preparing LM base address for PortInfo table
    #if (PORTINFO_LM_BASE > 0)

	    move(tmp, PORTINFO_LM_BASE)
    	alu[lm_offset, --, B, in_port_no, <<PORTINFO_ENTRY_SIZE_LOG2]
        alu[lm_offset, lm_offset, +, tmp]

    #else /* (PORTINFO_LM_BASE > 0) */

    	alu[lm_offset, --, B, in_port_no, <<PORTINFO_ENTRY_SIZE_LOG2]

    #endif /* (PORTINFO_LM_BASE > 0) */

	local_csr_wr[active_lm_addr_0, lm_offset]
	alu[out_portinfo_lm_base, --, B, lm_offset]

	; preparing LM base address for DQ table
	alu[dq_lm_offset, in_dq_lm_base, +, lm_offset]
	local_csr_wr[active_lm_addr_1, dq_lm_offset]
	alu[lm_offset, --, B, in_port_no, <<5]	; UbrWpri table size is 32 bytes
	alu[out_ubrwpri_lm_base, in_ubrwpri_lm_base, +, lm_offset]
	; first use of *l$index0 after the active_lm_addr_0 write above (five instructions later)
	alu[out_curtq, --, B, *l$index0[14]]	; we will need this when there is issue_load_new_tq.. running

.end
#endm

//------------------------------------------------------------------
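// _get_port_entry_static_wo(in_msg0, in_ubrwpri_lm_base, out_ubrwpri_lm_base, out_lm_offset)
//
//    Description: 	Write-out variant of _get_port_entry_static: derives the
//					PortInfo and UBRwPRI LM addresses from the port field of
//					in_msg0 (no CAM lookup and no SRAM access is performed
//					by this macro).
//
//    Parameters: 
//		  Inputs: in_msg0 - message word that carries the port number
//				  in_ubrwpri_lm_base - LM base of the UBRwPRI tables
// 		 Outputs: out_lm_offset - PortInfo LM address for the port
//				  out_ubrwpri_lm_base - UBRwPRI table LM address for the port
//				  Sets LM index0 base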
//------------------------------------------------------------------
#macro _get_port_entry_static_wo(in_msg0, in_ubrwpri_lm_base, out_ubrwpri_lm_base, out_lm_offset)
.begin
// Summary: extracts the port field from in_msg0 twice, once scaled by the
// PortInfo entry size and once scaled by 32, to produce
//   out_lm_offset       = PortInfo LM address (also written to ACTIVE_LM_ADDR_0)
//   out_ubrwpri_lm_base = in_ubrwpri_lm_base + port * 32
// NOTE(review): presumably the port number starts at bit 19 of in_msg0 (both
// shifts are written as 19 - log2(entry size)) - confirm against the message
// format. mask_for_portinfo is not defined in this macro.
// NOTE(review): tmp is only referenced when PORTINFO_LM_BASE > 0.
.reg lm_offset tmp

    #define_eval INDIRECT_SHIFT 	(19 - PORTINFO_ENTRY_SIZE_LOG2)	; we need port to be multiplied by PORTINFO_ENTRY_SIZE_LOG2

       	alu[out_lm_offset, mask_for_portinfo, AND, in_msg0, >>INDIRECT_SHIFT]

    #undef INDIRECT_SHIFT

	; preparing LM base address for PortInfo table
	; (nothing is added when PORTINFO_LM_BASE is 0)
    #if (PORTINFO_LM_BASE > 0)

	    move(tmp, PORTINFO_LM_BASE)
        alu[out_lm_offset, out_lm_offset, +, tmp]

    #endif /* (PORTINFO_LM_BASE > 0) */


	local_csr_wr[active_lm_addr_0, out_lm_offset]

	; Preparing UBRwPRI address base
	; The trailing comment on the next INDIRECT_SHIFT line is stale: here the
	; port number is scaled by 32 (1 << 5), the UbrWpri table size, not by the
	; PortInfo entry size.
    #define_eval INDIRECT_SHIFT 	(19 - 5)	; we need port to be multiplied by PORTINFO_ENTRY_SIZE_LOG2
    #define_eval INDIRECT_MASK		(0x7 << 5)	; for this mode port range is 0-7

       	alu[lm_offset, INDIRECT_MASK, AND, in_msg0, >>INDIRECT_SHIFT]	; UbrWpri table size is 32 bytes

    #undef INDIRECT_MASK
    #undef INDIRECT_SHIFT

	alu[out_ubrwpri_lm_base, in_ubrwpri_lm_base, +, lm_offset]
	; third instruction after the active_lm_addr_0 write above
	nop
.end
#endm


//------------------------------------------------------------------
// _check_shift()
//
//    Description: 	Adjusts TQnum get from shaper in the way that we aren't
//					trying to schedule for TQ that is cached in DQ ring
//
//    Parameters:
//		  Inputs: 
//				in_curtq - number of TQ we should be servicing
//				in_xxtqnum - number of currently serviced TQ
//		  Inputs/Outputs:
//				io_tqnum - number of TQ to which we should schedule
//------------------------------------------------------------------
#macro _check_shift(io_tqnum, in_tqnumA, in_xxtqnum, in_curtq)
.begin
// Summary: if io_tqnum falls on a time queue that is currently held in the
// departure ring (between in_xxtqnum and in_curtq inclusive, modulo the TQ
// count), replace it with (in_curtq + 1) AND *l$index0[4].
//
//   io_tqnum   - in/out: target time queue number; rewritten when a shift is needed
//   in_tqnumA  - same value as io_tqnum, used where io_tqnum has to be the
//                left-hand ALU operand (presumably a copy in the other GPR
//                bank - confirm with the caller). It is NOT updated here.
//   in_xxtqnum - number of the time queue currently being served
//   in_curtq   - current time queue number
//
// The body operates on the io_tqnum parameter. Earlier revisions referenced
// the caller's register tqnum directly, which only worked because every call
// site passes tqnum as the first argument.
//
// NOTE(review): RTDQ_SIZE is used as the ring depth for every traffic class
// that calls this macro - confirm that the NRT and UBR rings have the same size.
// NOTE(review): the depth checks below branch on "greater than" (bgt) while
// the original comment says ">="; the code is left unchanged.
.reg diff_1 	; CurTQ  - xxtqnum
.reg diff_2 	; (CurTQ - xxTQnum) corrected for wrap-around
.reg diff_3		; CurTQ - tqnum
.reg diff_4		; tqnum - CurTQ
.reg diff_5		; tqnum - xxTQnum
.reg diff_6		; xxTQnum - tqnum

.reg xxtqnumA	; in_xxtqnum register in GPR A
.reg maxtqmask  ; MaxTQmask

// if (xxTQnum <= tqnum <= CurTQ) then tqnum = CurTQ + 1
	alu[diff_1, in_curtq, -, in_xxtqnum]
	bmi[wrap#] ; jump if xxTQnum > CurTQ

	; If (CurTQ - xxTQnum) > RTDQ_SIZE then
	; there is no need to check if tqnum is between them
	alu[--, diff_1, -, RTDQ_SIZE]
	bgt[no_wrap#]

	alu[diff_3, in_curtq, -, io_tqnum] ; CurTQ - tqnum
    beq[shift_needed#]
	alu[diff_5, in_tqnumA, -, in_xxtqnum] ; tqnum - xxTQnum
    beq[shift_needed#]
	; either difference negative -> tqnum lies outside <xxTQnum, CurTQ>
	alu[--, diff_3, OR, diff_5]
	bmi[end#] ;jump if tqnum isn't in <xxTQnum, CurTQ> range

	alu[io_tqnum, in_curtq, +, 1] ; tqnum = CurTQ + 1
	; get MaxTQmask to mask incremented value
	alu[io_tqnum, io_tqnum, AND, *l$index0[4]]
	br[end#]

wrap#:
	; CurTQ has wrapped past zero while xxTQnum has not:
	; distance = (MaxTQmask + 1) + (CurTQ - xxTQnum)
    alu[maxtqmask, mask_upper16, AND, *l$index0[4]]
	alu[diff_2, 1, +, maxtqmask]
	alu[diff_2, diff_2, +, diff_1]
	alu[--, diff_2, -, RTDQ_SIZE]
	bgt[no_wrap#]

	move[xxtqnumA, in_xxtqnum]
	alu[diff_4, in_tqnumA, -, in_curtq] ; tqnum - CurTQ
    beq[shift_needed#]
	alu[diff_6, xxtqnumA, -, io_tqnum] ; xxTQnum - tqnum
    beq[shift_needed#]
	; both differences non-negative -> tqnum lies strictly between CurTQ and
	; xxTQnum, i.e. outside the wrapped range being served
	alu[--, diff_4, OR, diff_6]
	bpl[end#] ;jump if (tqnum > CurTQ) && (tqnum < xxTQnum)

	alu[io_tqnum, in_curtq, +, 1] ; tqnum = CurTQ + 1
	; get MaxTQmask to mask incremented value
	alu[io_tqnum, io_tqnum, AND, *l$index0[4]]
	br[end#]

no_wrap#:
	; the two queue numbers are further apart than the ring depth:
	; only a direct hit on CurTQ needs to be moved
	alu[--, io_tqnum, -, in_curtq]
	bne[end#]
shift_needed#:
	alu[io_tqnum, in_curtq, +, 1] ; tqnum = CurTQ + 1
	; get MaxTQmask to mask incremented value
	alu[io_tqnum, io_tqnum, AND, *l$index0[4]]

end#:
.end
#endm




//------------------------------------------------------------------
// _write_lbr()
//
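//    Description: 	Puts cell into TQ - for LBR VCs
//
//    Parameters:
//		  Inputs:
//				in_tql_base_for_port - SRAM base of the TQ length table for the port
//				in_tq_sram_base - SRAM base of the time queues of this class
//				in_vcq - write transfer register holding the VCQ entry to store
//				in_rtn_req - register holding the return address
//				in_rd_sig, in_wr_sig - signals for the length read and the entry write
//				IN_TQ_SHIFT, IN_TQMAX_LEN, IN_TQLEN_SHIFT - TQ geometry constants
//		  Inputs/Outputs:
//				io_sig_mask - wakeup signal mask
//				in_tqnum - number of TQ to schedule into; changed when that TQ is full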
//------------------------------------------------------------------
#macro _write_lbr(io_sig_mask, in_tqnum, in_tql_base_for_port, \
 	in_tq_sram_base, in_vcq, in_rtn_req, in_rd_sig, in_wr_sig, \
  	IN_TQ_SHIFT, IN_TQMAX_LEN, IN_TQLEN_SHIFT)

.begin
// Summary: appends one VCQ entry to time queue in_tqnum of the current port.
//   1. Atomically increments the TQ length in SRAM (test_and_incr); the
//      previous length comes back in $tqlen.
//   2. Sleeps on io_sig_mask plus in_rd_sig, then resets io_sig_mask to the
//      next_thread_sig bit only.
//   3. If the previous length is IN_TQMAX_LEN or more, picks another TQ
//      (in_tqnum + 1..4, taken from the low two bits of TIMESTAMP_LOW, masked
//      with *l$index0[4]) and repeats the increment until a non-full TQ is found.
//   4. Signals the next thread, writes in_vcq at
//      in_tq_sram_base + port offset + (in_tqnum << IN_TQ_SHIFT) + ($tqlen << 2),
//      adds in_wr_sig to io_sig_mask and returns through in_rtn_req.
//
// The body uses the io_sig_mask, in_tqnum, in_vcq and in_rtn_req parameters.
// Earlier revisions referenced the caller's sig_mask, tqnum, $vcq_wo and
// rtn_reg registers directly, which only worked because every call site
// passes exactly those names.
//
// NOTE(review): the length of a full TQ is still incremented by test_and_incr
// before the macro moves on to another TQ.
// NOTE(review): $tqlen, tql_offset and temp are used below but not declared
// in this macro, so they must be visible from the expanding scope. The local
// $tq_len and tmp registers are never referenced - $tq_len looks like a
// misspelling of $tqlen; confirm before removing either declaration.
.reg $tq_len	; length of TQ
.reg tq_offset	; offset within TQ on this port
.reg slot_offset ; offset from the beginning of TQ
.reg tq_base	; base addres for TQ given by tqnum
.reg tq_offset_for_port tq_base_for_port
.reg tmp

	; calculating offset from first TQlen element for that port
	alu[tql_offset, --, b, in_tqnum, <<IN_TQLEN_SHIFT]

	; increment TQlen
	sram[test_and_incr, $tqlen, in_tql_base_for_port, tql_offset], sig_done[in_rd_sig]

	; sleep on everything already pending plus the length read
	alu[io_sig_mask, io_sig_mask, or, 1, <<&in_rd_sig]
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, io_sig_mask]
	alu[io_sig_mask, --, B, 1, <<&next_thread_sig]	; clear signal mask
#define_eval INDIRECT_SHIFT 	(16 - IN_TQ_SHIFT)	; we need tq_offset_for_port to be in bytes
	alu[tq_offset_for_port, mask_tqofs, AND, *l$index0[2], >>INDIRECT_SHIFT] ; to get TQ offset in bytes
#undef INDIRECT_SHIFT

	ctx_arb[--]

//After wakeup all signals in the wakeup mask have been delivered
	.io_completed scratch_read_dn sram_read_dn_sched in_rd_sig \
	sram_read_dn_newtq0 sram_read_dn_newtq1 sram_read_dn_newtq2

	alu[tq_base_for_port, tq_offset_for_port, +, in_tq_sram_base]

prepare#:
	alu[--, $tqlen, -, IN_TQMAX_LEN]; check if TQ is full
	bge[find_another#]		; If this TQ is full let's find another

	; a slot was obtained: let the next thread proceed
	local_csr_wr[SAME_ME_SIGNAL, next_thread_sig_csr_val]
// Preparing pointers for this TQ write-out

	; Prepare address for writing VCQ
	alu[slot_offset, --, B, $tqlen, <<2]	; $tqlen is in <0,IN_TQMAX_LEN) here, shifted to get offset in bytes

	; Calculate TQ address from TQ number
	alu[tq_offset, --, B, in_tqnum, <<IN_TQ_SHIFT] ; to get TQ addres in bytes

	alu[tq_base, tq_offset, +, tq_base_for_port]

write_vcq#:
    sram[write, in_vcq, tq_base, slot_offset, 1], sig_done[in_wr_sig]
	; the caller has to wait for the write to complete
	alu[io_sig_mask, io_sig_mask, OR, 1, <<&in_wr_sig]
	rtn[in_rtn_req]

find_another#:
	; Current TQ is FULL so we need to try another one (loops back to prepare#)
	local_csr_rd[TIMESTAMP_LOW]
	immed[temp, 0]
	alu[temp, 0x3, AND, temp]	; Randomizing the choice of TQ
	alu[temp, temp, +, 1]
	alu[in_tqnum, in_tqnum, +, temp]

	; get MaxTQ to mask incremented value
	alu[in_tqnum, *l$index0[4], AND, in_tqnum]

	alu[tql_offset, --, b, in_tqnum, <<IN_TQLEN_SHIFT]

	; increment TQlen of the newly chosen TQ and wait for the result
	sram[test_and_incr, $tqlen, in_tql_base_for_port, tql_offset], ctx_swap[in_rd_sig]
	br[prepare#]

.end
#endm

//------------------------------------------------------------------
// _init_port_shaping_table_reg(out_portshaping_table)
//
//    Description: 	Load 8 portshaping elements to portschaping table register
//					Assuming that size of PortShaping Size is not larger than 
//					8 entries.
//
//    Parameters:
//		  Inputs: 
// 		 Outputs:
//------------------------------------------------------------------
#macro _init_port_shaping_table_reg(out_portshaping_table)
.begin
// Summary: reads MAX_ATM_PORTS words from SRAM at PORTSHAPING_SRAM_BASE and
// packs the low 4 bits of each word into out_portshaping_table, entry j at bit
// position 4*j. The MAX_ATM_PORTS-entry pattern is repeated
// MAX_STATIC_PORTS/MAX_ATM_PORTS times (outer loop i), so the register holds
// 4*MAX_ATM_PORTS*(MAX_STATIC_PORTS/MAX_ATM_PORTS) bits of table.
// Assembly fails with #error when MAX_ATM_PORTS > MAX_STATIC_PORTS.
// NOTE(review): ACTIVE_LM_ADDR_1 is set to PORTSHAPING_LM_BASE although this
// macro never accesses local memory; sram_address_offset is never referenced.
.sig init_sram_read_dn          //signal for reading portshaping @ init
.reg lm_address sram_address_offset
.reg portshaping_base portshaping_offset tmp

	immed32[portshaping_base, PORTSHAPING_SRAM_BASE]
	immed32[portshaping_offset,0]

	; Preparing LM base address
	immed32[lm_address, PORTSHAPING_LM_BASE]
	local_csr_wr[active_lm_addr_1, lm_address]

#if (MAX_ATM_PORTS > MAX_STATIC_PORTS)
	#error "MAX_ATM_PORTS is out of range. Correct range: 1..8"

#else // MAX_ATM_PORTS <= MAX_STATIC_PORTS

	; fetch all PortShaping entries; init_sram_read_dn is passed to sram_read
	; as both signal arguments
	xbuf_alloc($r_portshaping, MAX_ATM_PORTS,  read)
	sram_read($r_portshaping[0], portshaping_base, portshaping_offset, MAX_ATM_PORTS, init_sram_read_dn, init_sram_read_dn, ___)

	; i = repetition number, j = entry within one repetition (assembly-time loops)
	#define_eval i 0
	immed32[out_portshaping_table, 0]
	#while (i < (MAX_STATIC_PORTS/MAX_ATM_PORTS))
		
		#define_eval j 0
		#while (j < MAX_ATM_PORTS)

			; keep the low nibble of entry j
			alu[tmp, 0x0f, AND, $r_portshaping/**/[j]]

			; place it in nibble (j + MAX_ATM_PORTS * i) of the result
			alu[out_portshaping_table, out_portshaping_table, OR, tmp, <<(4 * (j + MAX_ATM_PORTS * i))]

		#define_eval j (j+1)
		#endloop
		#undef j

	#define_eval i (i+1)
	#endloop
	#undef i
	xbuf_free($r_portshaping)
#endif	// (MAX_ATM_PORTS > MAX_STATIC_PORTS)

.end
#endm


//------------------------------------------------------------------
// _init_16_port_shaping_entries()
//
//    Description: 	Load 16 entries of PortShaping from SRAM to LM
//					starting from entry no. 0
//					Assuming that the PortShaping size is a multiple of 16 entries
//					The context sleeps while each of the two 8-word SRAM reads completes
//
//    Parameters:
//		  Inputs:
// 		 Outputs:
//------------------------------------------------------------------
#macro _init_16_port_shaping_entries()
.begin
// Summary: copies 16 consecutive words from SRAM at PORTSHAPING_SRAM_BASE
// (offsets 0x00-0x3f) into local memory at PORTSHAPING_LM_BASE, as two
// 8-word reads. ACTIVE_LM_ADDR_1 is left pointing at PORTSHAPING_LM_BASE.
// NOTE(review): sram_address_offset and tmp are declared but never referenced.
.sig init_sram_read_dn          //signal for reading portshaping @ init
.reg lm_address sram_address_offset
.reg portshaping_base portshaping_offset tmp

	immed32[portshaping_base, PORTSHAPING_SRAM_BASE]
	immed32[portshaping_offset,0]

	; Preparing LM base address
	immed32[lm_address, PORTSHAPING_LM_BASE]
	local_csr_wr[active_lm_addr_1, lm_address]

	; first half: entries 0-7
	xbuf_alloc($r_portshaping, 8,  read)
	sram[read, $r_portshaping[0], portshaping_base, portshaping_offset, 8], sig_done[init_sram_read_dn]
	ctx_arb[init_sram_read_dn]
; Copy PortShaping from SRAM to LM.
	alu[*l$index1[0], --,b, $r_portshaping[0]]
	alu[*l$index1[1], --,b, $r_portshaping[1]]
	alu[*l$index1[2], --,b, $r_portshaping[2]]
	alu[*l$index1[3], --,b, $r_portshaping[3]]
	alu[*l$index1[4], --,b, $r_portshaping[4]]
	alu[*l$index1[5], --,b, $r_portshaping[5]]
	alu[*l$index1[6], --,b, $r_portshaping[6]]
	alu[*l$index1[7], --,b, $r_portshaping[7]]

	; second half: entries 8-15, 0x20 bytes (8 words) further into SRAM,
	; reusing the same transfer registers
	alu[portshaping_offset, 0x20, +, portshaping_offset]
	sram[read, $r_portshaping[0], portshaping_base, portshaping_offset, 8], sig_done[init_sram_read_dn]
	ctx_arb[init_sram_read_dn]

	alu[*l$index1[8], --,b, $r_portshaping[0]]
	alu[*l$index1[9], --,b, $r_portshaping[1]]
	alu[*l$index1[10], --,b, $r_portshaping[2]]
	alu[*l$index1[11], --,b, $r_portshaping[3]]
	alu[*l$index1[12], --,b, $r_portshaping[4]]
	alu[*l$index1[13], --,b, $r_portshaping[5]]
	alu[*l$index1[14], --,b, $r_portshaping[6]]
	alu[*l$index1[15], --,b, $r_portshaping[7]]
	xbuf_free($r_portshaping)
.end;
#endm

//------------------------------------------------------------------
// _init_8_port_shaping_entries()
//
//    Description: 	Load 8 entries of PortShaping from SRAM to LM
//					starting from entry no. 0
//					Assuming that size of PortShaping Size is a multiply of 8 entries
//
//    Parameters:
//		  Inputs:
// 		 Outputs: Sets LM index1 base
//------------------------------------------------------------------
#macro _init_8_port_shaping_entries()
.begin
.sig init_sram_read_dn          // raised when the PortShaping SRAM read has completed
.reg _ps_lm_base _ps_sram_base _ps_sram_ofs

	; Destination: LM index1 is aimed at the PortShaping area and stays there on exit
	immed32[_ps_lm_base, PORTSHAPING_LM_BASE]
	local_csr_wr[active_lm_addr_1, _ps_lm_base]

	; Source: the first 8 words of the PortShaping table in SRAM
	immed32[_ps_sram_base, PORTSHAPING_SRAM_BASE]
	immed32[_ps_sram_ofs,0]

	; Fetch the 8 entries in one burst and sleep until they have arrived
	xbuf_alloc($r_portshaping, 8,  read)
	sram[read, $r_portshaping[0], _ps_sram_base, _ps_sram_ofs, 8], sig_done[init_sram_read_dn]
	ctx_arb[init_sram_read_dn]

	; Word-for-word copy of the transfer registers into local memory
	alu[*l$index1[0], --,b, $r_portshaping[0]]
	alu[*l$index1[1], --,b, $r_portshaping[1]]
	alu[*l$index1[2], --,b, $r_portshaping[2]]
	alu[*l$index1[3], --,b, $r_portshaping[3]]
	alu[*l$index1[4], --,b, $r_portshaping[4]]
	alu[*l$index1[5], --,b, $r_portshaping[5]]
	alu[*l$index1[6], --,b, $r_portshaping[6]]
	alu[*l$index1[7], --,b, $r_portshaping[7]]
	xbuf_free($r_portshaping)
.end
#endm


//------------------------------------------------------------------
// _get_16_port_shaping_entries(in_start_entry, out_lm_offset)
//
//    Description: 	Load 16 entries of PortShaping from SRAM to LM
//					starting from entry no. in_start_entry
//					Assuming that size of PortShaping Size is a multiply of 16 entries
//
//    Parameters: 
//		  Inputs: in_start_entry  
// 		 Outputs: Sets LM index1 base, out_lm_offset
//------------------------------------------------------------------
#macro _get_16_port_shaping_entries(in_start_entry, out_lm_offset)
.begin
// Summary: copies 16 consecutive PortShaping words from SRAM, starting at
// word index in_start_entry (byte offset in_start_entry << 2 from
// PORTSHAPING_SRAM_BASE), into local memory at PORTSHAPING_LM_BASE.
// out_lm_offset receives PORTSHAPING_LM_BASE and ACTIVE_LM_ADDR_1 is left
// pointing there.
// NOTE(review): the signal get_sram_read_dn is not declared in this macro and
// must be visible from the expanding scope; sram_address_offset is declared
// but never referenced.
.reg lm_address sram_address_offset
.reg portshaping_base portshaping_offset tmp 
	
	immed32[portshaping_base, PORTSHAPING_SRAM_BASE]
	; entry index -> byte offset (4 bytes per entry)
	alu[tmp, --, B, in_start_entry]
	alu[portshaping_offset, --, B, tmp, <<2]

	; Preparing LM base address
	immed32[lm_address, PORTSHAPING_LM_BASE]
	local_csr_wr[active_lm_addr_1, lm_address]
	move[out_lm_offset, lm_address]

	; first half: entries in_start_entry .. in_start_entry+7
	xbuf_alloc($r_portshaping, 8,  read)
	sram[read, $r_portshaping[0], portshaping_base, portshaping_offset, 8], sig_done[get_sram_read_dn]
	ctx_arb[get_sram_read_dn]
; Copy PortShaping from SRAM to LM.
	alu[*l$index1[0], --,b, $r_portshaping[0]]
	alu[*l$index1[1], --,b, $r_portshaping[1]]
	alu[*l$index1[2], --,b, $r_portshaping[2]]
	alu[*l$index1[3], --,b, $r_portshaping[3]]
	alu[*l$index1[4], --,b, $r_portshaping[4]]
	alu[*l$index1[5], --,b, $r_portshaping[5]]
	alu[*l$index1[6], --,b, $r_portshaping[6]]
	alu[*l$index1[7], --,b, $r_portshaping[7]]

	; second half: the next 8 entries, 0x20 bytes further into SRAM,
	; reusing the same transfer registers
	alu[portshaping_offset, 0x20, +, portshaping_offset]
	sram[read, $r_portshaping[0], portshaping_base, portshaping_offset, 8], sig_done[get_sram_read_dn]
	ctx_arb[get_sram_read_dn]

	alu[*l$index1[8], --,b, $r_portshaping[0]]
	alu[*l$index1[9], --,b, $r_portshaping[1]]
	alu[*l$index1[10], --,b, $r_portshaping[2]]
	alu[*l$index1[11], --,b, $r_portshaping[3]]
	alu[*l$index1[12], --,b, $r_portshaping[4]]
	alu[*l$index1[13], --,b, $r_portshaping[5]]
	alu[*l$index1[14], --,b, $r_portshaping[6]]
	alu[*l$index1[15], --,b, $r_portshaping[7]]
	xbuf_free($r_portshaping)

.end;
#endm

//------------------------------------------------------------------
// _ubr_pri_deq(out_vcq)
//
//    Description: 	Strict priority dequeue algorithm
//
//    Parameters: 
//		  Inputs: 
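//				UBRwPRI cache byte in the local memory: bits 23:16 of *l$index0[4]
//					(LM index0 must point at the PortInfo entry on entry)
//				in_portinfo_lm_addr - LM address of the PortInfo entry
//				in_ubrwpri_lm_addr - LM address of the UBR w/priority table;
//					its eight entries are accessed as *l$index0[0-7]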
// 		 Outputs: out_vcq - VCQ number to dequeue from
//				  			31st bit set when all priority queue are empty
//------------------------------------------------------------------
#macro  _ubr_pri_deq(out_vcq, in_portinfo_lm_addr, in_ubrwpri_lm_addr)
.begin
// Summary: strict-priority dequeue over eight UBR priority entries.
//   - The 8-bit cache (bits 23:16 of *l$index0[4] in the PortInfo entry) has
//     one bit per priority; ffs picks the lowest set bit.
//   - If no bit is set, out_vcq is returned with only bit 31 set.
//   - Otherwise LM index0 is switched to in_ubrwpri_lm_addr, the selected
//     entry is decremented by one, and the entry's bits above
//     CELL_COUNT_NUMBER_OF_BITS are returned in out_vcq.
//   - If the decremented entry shifted left by VCQ_NUMBER_OF_BITS is zero,
//     the priority's cache bit is cleared.
//   - LM index0 is switched back to in_portinfo_lm_addr and the cache byte is
//     written back.
// NOTE(review): presumably each entry holds the VCQ number in its upper
// VCQ_NUMBER_OF_BITS bits and a cell count in the remaining low bits - confirm
// against the table layout.
.reg _entry _pos _cache

	alu[out_vcq, --, b, 1, <<31]	;all priority queues are empty
	ld_field_w_clr[_cache, 0001, *l$index0[4],>>16] ; it's UBRwPRI cache from PortInfo table
	ffs[_pos, _cache]				;find first priority with cells
	beq[_end#]
	local_csr_wr[ACTIVE_LM_ADDR_0, in_ubrwpri_lm_addr]	; 
	nop

	; dispatch on the priority number: one br per priority, indexed by _pos
	jump[_pos, prio_jump_table#],	targets[_pri_0#, _pri_1#, _pri_2#, _pri_3#, \
											_pri_4#, _pri_5#, _pri_6#, _pri_7#]
prio_jump_table#:
_pri_0#: br[_ubr_pri_0#]
_pri_1#: br[_ubr_pri_1#]
_pri_2#: br[_ubr_pri_2#]
_pri_3#: br[_ubr_pri_3#]
_pri_4#: br[_ubr_pri_4#]
_pri_5#: br[_ubr_pri_5#]
_pri_6#: br[_ubr_pri_6#]
_pri_7#: br[_ubr_pri_7#]


; Each handler below is identical except for the LM word and the cache bit it
; touches: decrement the entry, keep a copy in _entry, and clear the cache bit
; when nothing is left after shifting out the top VCQ_NUMBER_OF_BITS bits.
_ubr_pri_0#:
	alu[*l$index0[0], *l$index0[0], -, 1]			;decrement cell counter
	alu[_entry, --, b, *l$index0[0]]
	alu[--, --, b, _entry, <<VCQ_NUMBER_OF_BITS]
	bne[_ext_vc#]
	alu[_cache, _cache, and~, 0x1]					;clear bit in UBRwPRI cache
	br[_ext_vc#]
_ubr_pri_1#:
	alu[*l$index0[1], *l$index0[1], -, 1]			;decrement cell counter
	alu[_entry, --, b, *l$index0[1]]
	alu[--, --, b, _entry, <<VCQ_NUMBER_OF_BITS]
	bne[_ext_vc#]
	alu[_cache, _cache, and~, 0x2]					;clear bit in UBRwPRI cache
	br[_ext_vc#]
_ubr_pri_2#:
	alu[*l$index0[2], *l$index0[2], -, 1]			;decrement cell counter
	alu[_entry, --, b, *l$index0[2]]
	alu[--, --, b, _entry, <<VCQ_NUMBER_OF_BITS]
	bne[_ext_vc#]
	alu[_cache, _cache, and~, 0x4]					;clear bit in UBRwPRI cache
	br[_ext_vc#]
_ubr_pri_3#:
	alu[*l$index0[3], *l$index0[3], -, 1]			;decrement cell counter
	alu[_entry, --, b, *l$index0[3]]
	alu[--, --, b, _entry, <<VCQ_NUMBER_OF_BITS]
	bne[_ext_vc#]
	alu[_cache, _cache, and~, 0x8]					;clear bit in UBRwPRI cache
	br[_ext_vc#]
_ubr_pri_4#:
	alu[*l$index0[4], *l$index0[4], -, 1]			;decrement cell counter
	alu[_entry, --, b, *l$index0[4]]
	alu[--, --, b, _entry, <<VCQ_NUMBER_OF_BITS]
	bne[_ext_vc#]
	alu[_cache, _cache, and~, 0x10]					;clear bit in UBRwPRI cache
	br[_ext_vc#]
_ubr_pri_5#:
	alu[*l$index0[5], *l$index0[5], -, 1]			;decrement cell counter
	alu[_entry, --, b, *l$index0[5]]
	alu[--, --, b, _entry, <<VCQ_NUMBER_OF_BITS]
	bne[_ext_vc#]
	alu[_cache, _cache, and~, 0x20]					;clear bit in UBRwPRI cache
	br[_ext_vc#]
_ubr_pri_6#:
	alu[*l$index0[6], *l$index0[6], -, 1]			;decrement cell counter
	alu[_entry, --, b, *l$index0[6]]
	alu[--, --, b, _entry, <<VCQ_NUMBER_OF_BITS]
	bne[_ext_vc#]
	alu[_cache, _cache, and~, 0x40]					;clear bit in UBRwPRI cache
	br[_ext_vc#]
_ubr_pri_7#:
	alu[*l$index0[7], *l$index0[7], -, 1]			;decrement cell counter
	alu[_entry, --, b, *l$index0[7]]
	alu[--, --, b, _entry, <<VCQ_NUMBER_OF_BITS]
	bne[_ext_vc#]
	alu[_cache, _cache, and~, 0x80]					;clear bit in UBRwPRI cache
	; priority 7 falls through into the common exit
_ext_vc#:
	local_csr_wr[ACTIVE_LM_ADDR_0, in_portinfo_lm_addr]	; Set LM index to Portinfo table
	alu[out_vcq, --, b, _entry, >>CELL_COUNT_NUMBER_OF_BITS]	;vcq# 
	; the alu above and these two nops separate the CSR write from the LM access below
	nop
	nop
	ld_field[*l$index0[4], 0100, _cache, <<16] 	; Write back cache of UBRwPRI
_end#:
.end
#endm

#endif 		//_ATM_TM_SCHEDULER_UTIL_UC_
