/////////////////////////////////////////////////////////////////////////////////////////
//                                                                      
//                  I N T E L   P R O P R I E T A R Y                   
//                                                                      
//     COPYRIGHT (c)  2001 BY  INTEL  CORPORATION.  ALL RIGHTS          
//     RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//     BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//     RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//     LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//     MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//     THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                      
//                        INTEL  CORPORATION                            
//                                                                     
//                     2200 MISSION COLLEGE BLVD                        
//                                                                      
//               SANTA  CLARA,  CALIFORNIA  95052-8119 
//
/////////////////////////////////////////////////////////////////////////////////////////

/*
This ME is the first in a series of 3 MEs making context pipe to do the modified 
DRR scheduler function.

The basic scheme for this DRR implementation is defined below.

Every queue has some pre-assigned parameters that define its transfer rate. 
In the DRR scheme every active queue collects credit as time progress and 
uses these credits as packets arrive. If there are not enough credit when a 
packet arrives, that packet is scheduled to be transmitted in some later 
time when its queue collects enough credit. 
In this Scheduler, the way the DRR scheme is accomplished is by defining each round 
quanta that is assigned to each queue. For example, there are 2 active queues: 
queue X can transmit 10 bytes per round, queue Y can transmit 20 bytes per round,and
their last packets were schedule to be transmitted in the current round.
If queue X and Y each receive a 40-byte packet, then the packet in queue X will have to 
wait for 4 rounds and the one in queue Y will have to wait for 2 rounds from the current 
round.
Assuming that current round is round 0, so the SOP of the queue X's packet is placed in 
a fifo used by round 4 and the SOP of queue Y's packet will be placed in the fifo used 
by round 2. These rounds are defined seperately for every port. Depending upon the 
transmission of packets from different ports, packets are drained from these FIFOs and 
sent to the transmit functional block for transmit.

The entire DRR implementation is divided in 3 MEs: Class Scheduler, Count, and 
Port Scheduler.

Class Scheduler ME receives: 
	- 	Enqueue message  from Statistics ME thru next neighbor ring 
	- 	Next dequeue round request thru scratch ring (written by Port Scheduler ME).
		Next dequeue round request is sent every time the packets count of the round 
		being dequeued hit a low-water-mark.

	From the enqueue message the Class Scheduler ME calculates the round in which a 
	received packet can be dequeued. This depends on the packet size, the quantum 
	assigned to the packet's queue, and the credit the queue has when the packet 
	arrives. This data is stored in a per-queue structure in SRAM.
	
	After receiving a next dequeue round request for port X, Class Scheduler marks 
	the current enqueue round of port X unavailable for future enqueuing, and send 
	that current round number to the Count ME in the next dequeue round response.
	Then the next round become the current enqueue round and Class Scheduler 
	starts enqueueing packets in that round and onwards. 
	
	Queue structure:	
	+---------------------+---------------------------+-----------------------------+
	|			current_credit_used													|
	|																				| 	LW0
	+---------------------+---------------------------+-----------------------------+
	|			last_enq_round_and_quantum_shift									|
	|																				| 
	|[19:16]port [11:0] round: where the last packet for this queue was scheduled	|
	|[31:24]: n where 2^n is quantum credit per round								|	LW1
	+---------------------+---------------------------+-----------------------------+
	|			enqueued_packets_counter  											|	LW2
	+---------------------+---------------------------+-----------------------------+

	16 queue structures are cached in local memory and index by the CAM


	Interfaces of Class_scheduler block:

	 * Enqueue request from Statistics microengine to Class Scheduler microengine on NN ring 
 
 		packet_length				 : 32 bits	- LW0
 	 	port_number					 : 16 bits	- LW2 [31:16]
 	 	queue_number				 : 16 bits	- LW2 [15:0]
 	 	sop_handle					 : 32 bits  - LW1

	 * Enqueue request and new round response from Class Scheduler microengine to Count microengine
		sop_handle					 : bits 31:0		for enqueue request
															
		port_round_number			 : bits 31:0		for enqueue request
															[19:16] port number [11:0] round number
		
		port_next_dequeue_round		 : bits 31:0		for response to Next Dequeue Round
		 													Request from Port Scheduler microengine
		 													[19:16] port number [11:0] round number
		 													[31] invalid bit: 1 = invalid response
		 																	0 = valid response 
		 													[30:20] reserved (must be 0)
	 
	  If enqueue request or new round response is not valid then 
	  round number will be 0 (0 is INVALID_ROUND_NUM)
*/
///////////////////////////////////////////////////////////////////////////////

#ifndef __CLASS_SCHEDULER_UC__
#define __CLASS_SCHEDULER_UC__

#include 	"scheduler_packet.h"
#include 	"stdmac.uc"
#include 	"xbuf.uc"
#include 	"dl_buf.uc"

// Base offset in local memory of the cached per-queue data structures used
// by the Class Scheduler microengine.
// At most 16 queue structures are cached in local memory. Each structure
// occupies a 64-byte block, so the cache uses 16 * 64 = 0x400 bytes.
#define_eval    QUEUE_STRUCS_LM_BASE        0

// Base offset in local memory of the per-PORT enqueue base round entries.
// The enqueue base round is the earliest round a packet can be enqueued in:
// if the queue has enough credit, a new packet is enqueued in the enqueue
// base round; otherwise it is enqueued in a future round, where
// future round = enqueue base round + number of rounds to wait.
// The table starts right after the 0x400-byte queue structure cache.
#define	ENQ_BASE_ROUNDS_START		0x400

// prev_th_sig is the signal every thread waits on in its ctx_arb.
// next_th_signal is the value written to the SAME_ME_SIGNAL CSR to raise
// prev_th_sig in the next context.
.sig	volatile prev_th_sig
#define next_th_signal		((1 << NEXT_CONTEXT_BIT) | (&prev_th_sig << SIGNAL_NUMBER_FIELD))

// Completion signals for the SRAM write-back of the evicted queue structure,
// the SRAM read of queue data, and the scratch ring get.
// NOTE(review): read_deq_cntr_done is declared but not referenced anywhere
// in this file - confirm whether it is still needed.
.sig	volatile wrback_lru_done
.sig	volatile read_queue_data_done
.sig 	volatile get_round_request_done
.sig	volatile read_deq_cntr_done

#ifdef _DEBUG_COUNTERS_
	//number of enqueue messages read from the NN ring
	.reg @class_sched_enq_ctr
	//number of packets dropped due to round wrap-around
	.reg @class_sched_dropped_pkts
	//number of polls that found the NN ring between CLASS_SCHEDULER and COUNT full
	.reg @class_sched_nn_ring_full
#endif

// Constants loaded once in class_scheduler_init():
//   zero, reg_ffff_ffff          - the values 0 and 0xffffffff
//   port_round_mask              - PORT_ROUND_MASK_VAL, used to strip roll-over bits
//   @round_only_mask             - ROUND_ONLY_MASK_VAL
//   @compare_round_val           - TOTAL_ROUNDS - MAX_ROUNDS_FOR_PENDING_COUNTER_UPDATE
//   queue_structures_base        - SCHED_QUEUE_STRUCTURES_BASE
//   per_port_enq_base_rnd_start  - ENQ_BASE_ROUNDS_START
.reg zero reg_ffff_ffff
.reg port_round_mask @round_only_mask @compare_round_val
.reg queue_structures_base per_port_enq_base_rnd_start
	

///////////////////////////////////////////////////////////////////////////////
// init_local_csr
//	 	Description: 
//			One-time setup of the microengine-local state used by the
//			class_scheduler code:
//			  - ORs 0x80000 into the CTX_ENABLES CSR (NN ring empty threshold)
//			  - clears the CAM
//			  - resets the NN_GET and NN_PUT CSRs to 0
//
//	 	Outputs:	none
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////

#macro init_local_csr()
.begin

	.reg ctx_en nn_empty_threshold

	;Read-modify-write of CTX_ENABLES. The immed placed directly after
	;local_csr_rd is the instruction that picks up the CSR value.
	local_csr_rd[ctx_enables]
	immed[ctx_en, 0]

	;Bits [19:18] select the threshold at which NN_Empty is asserted.
	;Writing 1:0 there tells the hardware that one message on the
	;NN ring is 3 longwords long.
	move(nn_empty_threshold, 0x80000)

	alu[ctx_en, ctx_en, OR, nn_empty_threshold]
	local_csr_wr[ctx_enables, ctx_en]

	;Queue 0 is a valid queue number, but the queue structures live in
	;SRAM channel 2, so a queue structure address (the CAM tag) is never 0.
	;A lookup of tag 0x0 therefore never happens and a plain cam_clear is
	;enough; the CAM does not have to be preloaded with non-zero tags.
	cam_clear

	;Start with empty next-neighbor ring pointers
	local_csr_wr[nn_get, 0]
	local_csr_wr[nn_put, 0]

.end
#endm


///////////////////////////////////////////////////////////////////////////////
// init_lm_enq_base_rnd_per_port
//	 	Description: 
//			Fills the per-port enqueue base round table in local memory.
//			The entry of port p (at ENQ_BASE_ROUNDS_START + 4 * p) is set to
//			(p << 16) | INIT_ENQ_ROUND, for p = 0 .. NUM_PORTS - 1.
//
//	 	Outputs:	none
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////

#macro init_lm_enq_base_rnd_per_port()
.begin

	.reg port_idx, port_field
	.reg lm_entry_addr round_entry

	;Start at port 0, whose entry is the first one in the table
	immed[port_idx, 0x0]
	immed32[lm_entry_addr, ENQ_BASE_ROUNDS_START ]

	.while (port_idx < NUM_PORTS)

		;Point local memory index 1 at the entry of this port. The three
		;ALU instructions that follow separate the CSR write from the
		;local memory store (see the latency note in
		;activate_enq_base_round_for_next_dequeue_port).
		local_csr_wr[active_lm_addr_1, lm_entry_addr]
		alu[port_field, --, B, port_idx,<<16]		;port number goes in bits 16 and up
		alu[lm_entry_addr, lm_entry_addr, +, 0x4]	;next entry is one longword further
		alu[round_entry, port_field, OR, INIT_ENQ_ROUND]

		;Store the initial port/round value
		alu[lm_enq_base_round,--,B,round_entry]

		alu[port_idx, port_idx, +, 1]

	.endw

.end

#endm

///////////////////////////////////////////////////////////////////////////////
// class_scheduler_init
//	 	Description: 
//			Loads the constant registers used by the Class Scheduler code in
//			every thread, then:
//			  - thread 0 runs thread_0_init() (counters, local CSRs, local
//			    memory, wait for system initialization);
//			  - every other thread sleeps until it receives prev_th_sig.
//
//			Two assemble-time checks reject base addresses that are not
//			suitably aligned; the bases are later combined with shifted
//			port/queue numbers by OR (see calc_addrs), not by addition.
//			NOTE(review): the checks shift by (31 - SHIFT); confirm that this
//			tests the intended number of low-order bits.
//
//	 	Outputs:	none
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////

#macro class_scheduler_init()
.begin

	move(port_round_mask, PORT_ROUND_MASK_VAL)
	move(@round_only_mask, ROUND_ONLY_MASK_VAL)
	move(@compare_round_val, (TOTAL_ROUNDS - MAX_ROUNDS_FOR_PENDING_COUNTER_UPDATE))

	immed[zero, 0]
	move(reg_ffff_ffff, 0xffffffff)

	#if ((SCHED_QUEUE_STRUCTURES_BASE << (31 - QUEUE_STRUCTURES_PER_PORT_SHIFT)) != 0)
		#error "Queue structures base must be aligned at boundary of \
				1 << QUEUE_STRUCTURES_PER_PORT_SHIFT"
	#endif
	move(queue_structures_base, SCHED_QUEUE_STRUCTURES_BASE)

	;The message names ENQ_BASE_ROUND_SHIFT, the constant actually tested
	#if ((ENQ_BASE_ROUNDS_START << (31 - ENQ_BASE_ROUND_SHIFT)) != 0)
		#error "Enqueue base rounds structure base must be aligned at boundary of \
				1 << ENQ_BASE_ROUND_SHIFT"
	#endif
	move(per_port_enq_base_rnd_start, ENQ_BASE_ROUNDS_START)	
	
	.if (ctx() == 0)	
	
		;only thread 0 initializes the shared microengine state
		thread_0_init()
					
	.else
		;all other threads wait to be signalled by the previous thread
		ctx_arb[prev_th_sig]	
	.endif
.end
#endm

///////////////////////////////////////////////////////////////////////////////
// thread_0_init
//	 	Description: 
//			Work that must be performed exactly once per microengine and is
//			therefore executed by thread 0 only: clear the debug counters
//			(when _DEBUG_COUNTERS_ is defined), set up the local CSRs and
//			the CAM, fill the per-port enqueue base round table, and then
//			sleep until system_init_sig arrives.
//
//	 	Outputs:	none
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////
#macro thread_0_init()
.begin

#ifdef _DEBUG_COUNTERS_
	;all debug counters start from zero
	immed[@class_sched_enq_ctr, 0]
	immed[@class_sched_dropped_pkts, 0]
	immed[@class_sched_nn_ring_full, 0]
#endif

	init_local_csr()

	init_lm_enq_base_rnd_per_port()

	;Thread 0 does not start scheduling until system initialization
	;has been signalled.
	ctx_arb[system_init_sig]

.end
#endm
///////////////////////////////////////////////////////////////////////////////
// read_sratch_ring
//	 	Description: 
//			Issue a one-longword get on scratch ring RING_ADDR to fetch a
//			NEW ROUND REQUEST message from the COUNT or PORT SCHEDULER ME.
//			The macro only starts the operation; the result is valid once
//			get_round_request_done has been received.
//
//	 	Outputs: $next_dequeue_round_request - transfer register that
//											   receives the message
//
//		Inputs:	 none
//		
///////////////////////////////////////////////////////////////////////////////
#macro read_sratch_ring($next_dequeue_round_request)

	;address = zero + RING_ADDR, count = 1 longword
	scratch[get, $next_dequeue_round_request, zero, RING_ADDR , 1], sig_done[get_round_request_done]

#endm

///////////////////////////////////////////////////////////////////////////////
// read_nn_ring_msg
//	 	Description: 
//			Pop one 3-longword enqueue message from the Next Neighbor ring
//			(written by the previous ME) and split it into its fields.
//			The words are consumed in this order:
//			  word 0 - packet length
//			  word 1 - SOP handle
//			  word 2 - queue number in the low 16 bits, port number in the
//			           high 16 bits
//			The caller must have checked that the ring is not empty.
//
//	 	Outputs: 	packet_length
//					queue_number
//					port_number
//					sop_handle 
//
//		Inputs:		none
//		
///////////////////////////////////////////////////////////////////////////////

#macro read_nn_ring_msg(packet_length, queue_number, port_number, \
										sop_handle) 

	;word 0: packet length; advance to the next ring entry
	alu[packet_length,--,B,*n$index++]

	;word 1: SOP handle; advance to the next ring entry
	alu[sop_handle,--,B,*n$index++]

	;word 2, low two bytes: queue number. No post-increment here because
	;the same word is read once more for the port number.
	ld_field_w_clr[queue_number,0011,*n$index]

	;word 2, upper 16 bits: port number; now advance past the message
	alu_shf[port_number, --, B, *n$index++, >>16]

#ifdef _DEBUG_COUNTERS_

	;count every enqueue message taken from the ring
	alu[@class_sched_enq_ctr, @class_sched_enq_ctr, +, 1]

#endif

#endm

///////////////////////////////////////////////////////////////////////////////
// calc_addrs
//	 	Description: 
//			Calculate the addresses of the data structures that hold the
//			state of the queue and of the port being processed. Every address
//			is formed by OR-ing a base with a shifted index, so the bases
//			must have zeros in the bit positions the shifted index occupies.
//
//			port_number and queue_number are NOT macro parameters; they are
//			taken from the scope of the caller.
//
//	 	Outputs: 	enq_base_rnd_addr	 -  LOCAL MEMORY address of the enqueue
//											base round entry of this port
//											(the caller loads it into
//											active_lm_addr_1)
//					enq_struct_addr		 -	SRAM address of the data structure of
//											this queue
//					port_queue_data_base - 	SRAM base address of the queue data 
//										   	structures that belong to this port
//
//		Inputs:		port_number, queue_number (from the caller's scope)
//		
///////////////////////////////////////////////////////////////////////////////

#macro calc_addrs(enq_base_rnd_addr, enq_struct_addr, port_queue_data_base)

	;local memory address of lm_enq_base_round for this port
	alu_shf[enq_base_rnd_addr, per_port_enq_base_rnd_start, OR, port_number, \
			<<ENQ_BASE_ROUND_SHIFT]	

	;base of the queue structures of this port
	;(the line continuation was missing on this instruction)
	alu_shf[port_queue_data_base, queue_structures_base, OR, port_number, \
			<<QUEUE_STRUCTURES_PER_PORT_SHIFT]

	;address of the structure of this queue within the port's block
	alu_shf[enq_struct_addr, port_queue_data_base, OR, queue_number, \
			<<QUEUE_STRUCTURE_SHIFT]
	
#endm

///////////////////////////////////////////////////////////////////////////////
// write_to_nn_ring
//	 	Description: 
//			Write one 3-longword message to the next ME on the NN ring.
//			The macro busy-waits (without swapping out) while the ring
//			reports NN_full, then writes the three words in this order:
//			sop_handle, enqueue_port_and_round, next_dequeue_round.
//			With _DEBUG_COUNTERS_ defined, every poll that finds the ring
//			full increments @class_sched_nn_ring_full.
//
//	 	Outputs: 	none
//
//		Inputs:		sop_handle	of the packet to enqueue
//					enqueue_port_and_round - the port and the round number 
//											 where this packet is to be enqueued
//					
//					next_dequeue_round 		- the round that packets are to be 
//											dequeued from
///////////////////////////////////////////////////////////////////////////////

#macro write_to_nn_ring(sop_handle, enqueue_port_and_round, next_dequeue_round)

;top of the polling loop
class_sched_nn_ring_full#:

#ifdef _DEBUG_COUNTERS_

	;leave the loop as soon as the ring has room, otherwise count the
	;failed poll and try again
	br_!inp_state[NN_full, class_sched_nn_ring_not_full#]
	alu[@class_sched_nn_ring_full, @class_sched_nn_ring_full, +, 1]
	br[class_sched_nn_ring_full#]

class_sched_nn_ring_not_full#:
#else
	;spin while the ring is full
	br_inp_state[NN_full, class_sched_nn_ring_full#]
#endif

	;word 0 and word 1 of the message
	alu[*n$index++, --, B, sop_handle]
	alu[*n$index++, --, B, enqueue_port_and_round] 

;this label is not branched to from anywhere in this file
class_sched_measure_perf#:
	;word 2 of the message
	alu[*n$index++,--, B, next_dequeue_round] 

#endm

///////////////////////////////////////////////////////////////////////////////
// cam_lookup_and_get_queue_structures
//	 	Description: 
//			Perform a CAM lookup to see if the data structure at the specified 
//			address is cached in local memory. 
//
//			CAM miss: start an SRAM read of the whole structure (NUM_LWS
//			longwords), write the Least Recently Used cached structure back
//			to SRAM, retag the LRU CAM entry with the new address, swap out,
//			and on wake-up copy the first three longwords read into local
//			memory. Execution then falls out of the bottom of the macro.
//
//			CAM hit: start an SRAM read of the dequeue counter only, swap
//			out, and on wake-up branch to class_scheduler_phase_2#. That label
//			is NOT defined in this macro; the expanding code must provide it
//			(class_scheduler places it directly after this macro).
//
//			Both ctx_arb's also wait for prev_th_sig and for
//			get_round_request_done, the signal of the scratch ring get that
//			the caller issued before expanding this macro.
//
//	 	Outputs: 	lm_q_* local memory fields (miss case),
//					$xfer_in[DEQ_COUNTER_INDEX] - dequeue counter
//
//		Inputs:		enq_struct_addr	- SRAM address of the queue data structure.
//									  Also used as CAM lookup tag
//					$xfer_in		- SRAM transfer registers for read data
//					$xfer_out		- SRAM transfer registers for the LRU write back
///////////////////////////////////////////////////////////////////////////////
#macro cam_lookup_and_get_queue_structures(enq_struct_addr, $xfer_in, $xfer_out)

.begin
	.reg	cam_result		// result of CAM lookup
	.reg	cam_entry		// entry number from CAM lookup
	.reg	cam_tag			// tag in CAM for entry 

	// Look the address up; the lm_addr0[0] token also makes local memory
	// index 0 refer to the cached structure of the returned entry.
	cam_lookup[cam_result, enq_struct_addr], lm_addr0[0]
	
	//	Bit 7 of the lookup result is set on a hit
	br_bset[cam_result, 7, class_scheduler_cam_hit_phase_1#]
	
// (label is not branched to; the miss case is reached by falling through)
class_cheduler_cam_miss_phase_1#:
	// this is a CAM miss case
	// LRU 	queue structure => SRAM; SRAM queue structure => $Xfer

	// start reading the new structure right away so that the read overlaps
	// with the write-back preparation below
	sram[read, $xfer_in[0], enq_struct_addr, 0, NUM_LWS], sig_done[read_queue_data_done]
	;get CAM entry: 4-bit entry number taken from bits [6:3] of the result
	alu[cam_entry, 0xF,and, cam_result, >>3]
	; read CAM tag which is the queue address in SRAM
	cam_read_tag[cam_tag, cam_entry]							
	
 	// Move the modified part of the  LRU queue structure 
	// to transfer registers. l$index0 points to the beginning of 
	// the LRU queue structure
	alu[$xfer_out[0], --, B, lm_q_current_credit_used]
 	alu[$xfer_out[1], --, B, lm_q_last_enq_round_and_quantum_shift]
 	alu[$xfer_out[2], --, B, lm_q_enqueued_packets_count]

	//on the first miss, cam_tag is 0, this write goes to the SRAM_ZERO_BLOCK of
	//SRAM that is reserved for temporary data, no valid data will be overwritten.
	sram[write, $xfer_out[0], cam_tag, 0, QUEUE_STRUCTURE_LWS_TO_WRITE_BACK], \
		sig_done[wrback_lru_done]

	cam_write[cam_entry, enq_struct_addr, NORMAL_CAM_STATE]	; update CAM LRU entry

	// sleep until the read, the write-back, the scratch get and the previous
	// thread are all done, then continue with the miss-specific phase 2
	ctx_arb[prev_th_sig, read_queue_data_done, wrback_lru_done, get_round_request_done], \
			all, br[class_scheduler_cam_miss_phase_2#]

class_scheduler_cam_hit_phase_1#:

	;read dequeue_counter only; the rest of the structure is already cached
	sram[read, $xfer_in[/**/DEQ_COUNTER_INDEX], enq_struct_addr, \
		(DEQ_COUNTER_INDEX * 4), 1], sig_done[read_queue_data_done]


	;on wake-up jump past the miss-only copy below, to the caller's label
	ctx_arb[prev_th_sig, read_queue_data_done, get_round_request_done], \
			all, br[class_scheduler_phase_2#]

	;Phase 2
class_scheduler_cam_miss_phase_2#:

	;Move the data to local memory. Only the first 3 longwords of the queue
	;structure are copied; the dequeue counter is used straight from its
	;transfer register.
	;NOTE(review): the SRAM read above fetches NUM_LWS longwords - presumably
	;the remaining ones are reserved words plus the dequeue counter; confirm.

	alu[lm_q_current_credit_used, --, B, $xfer_in[0]]
 	alu[lm_q_last_enq_round_and_quantum_shift, --, B, $xfer_in[1]]
 	alu[lm_q_enqueued_packets_count, --, B, $xfer_in[2]]
	
.end

#endm

///////////////////////////////////////////////////////////////////////////////
// update_enq_base_round_data
//	 	Description: 
//			Advance the enqueue base round of the port currently selected
//			through local memory index 1. It is expanded after the old base
//			round has been sent out as the next round to dequeue from, so
//			that no packet is enqueued in a round once dequeueing from that
//			round has started.
//
//	 	Outputs: 	lm_enq_base_round (updated in place)
//
//		Inputs:		none
///////////////////////////////////////////////////////////////////////////////

#macro update_enq_base_round_data()
	;step to the next round
	alu[lm_enq_base_round, lm_enq_base_round, +, 1]
	;drop the bits that overflowed out of the round field
	alu[lm_enq_base_round, lm_enq_base_round, AND, port_round_mask]
#endm

///////////////////////////////////////////////////////////////////////////////
// calc_round_and_update_queue_structure
//	 	Description: 
//			Calculate the round the packet being processed can be enqueued
//			into, and update the cached queue structure (credit used in the
//			round, last scheduled round, enqueued packets counter).
//
//			The macro also uses these names from the scope of the caller:
//			packet_length, total_bytes_used, $xfer_in[DEQ_COUNTER_INDEX],
//			and the local memory aliases lm_q_* / lm_enq_base_round.
//
//			The cached credit is committed only after the wrap-around drop
//			check has passed, so a dropped packet leaves the queue structure
//			untouched.
//
//	 	Outputs: final_port_and_round - the port and the round where the packet
//										is to be enqueued
//
//		Inputs:	 drop_pkt_label		-  the label to branch to if the packet 
//										is to be dropped (only used when
//										DROP_PKTS_FOR_WRAP_AROUND_ROUNDS is
//										defined)
//
///////////////////////////////////////////////////////////////////////////////
#macro calc_round_and_update_queue_structure(final_port_and_round, drop_pkt_label)
.begin
	.reg quantum_mask quantum_shift last_enq_round
	.reg diff

	;split LW1 of the queue structure: the quantum shift sits above
	;QUANTUM_SHIFT_BITS_START, port and last round are in the low 3 bytes
	alu_shf[quantum_shift, --, B, lm_q_last_enq_round_and_quantum_shift, \
			>>QUANTUM_SHIFT_BITS_START]
	ld_field_w_clr[last_enq_round, 0111, lm_q_last_enq_round_and_quantum_shift]

	;create a mask to calculate left over bytes after each round	
	;(the instruction before an indirect shift supplies the shift amount)
	alu[--, quantum_shift, OR, 0]
	alu_shf[quantum_mask, --, B, reg_ffff_ffff, <<indirect]

	;distance, in rounds, from the enqueue base round to the last scheduled round
	alu[diff, last_enq_round, -, lm_enq_base_round]	
	alu[diff, diff, AND, @round_only_mask]			

	;Check if queue empty (enqueued packets counter == dequeued packets counter)
	alu[--,$xfer_in[/**/DEQ_COUNTER_INDEX/**/], -, lm_q_enqueued_packets_count]
	beq[queue_empty#]

	;When the dequeue counter is not equal to the enqueue counter, there are
	;2 possibilities: the queue is really not empty, or it is empty but the
	;SRAM write in the TX_HELPER ME that updates the dequeue counter is pending.

	;If the last scheduled round is ahead of the enqueue base round, there are
	;packets waiting to be dequeued (queue not empty). If it is behind, either
	;	- the last scheduled round wrapped around, or
	;	- the last scheduled round has been sent for dequeue but the dequeue
	;	  counter lags the enqueue counter. In that case the last scheduled
	;	  round is only a few rounds behind the base round, i.e. diff is at
	;	  least @compare_round_val.

    alu[--, diff, -, @compare_round_val]
	blt[queue_not_empty#]

queue_empty#:
	;queue empty: schedule relative to the current base round and start from
	;zero credit used, so only this packet's length counts
	alu[last_enq_round, --, B, lm_enq_base_round]
	;find round number
	alu[--, quantum_shift, OR, 0x0]
	alu_shf[final_port_and_round,--, B, packet_length, >>indirect]

	;the credit that has been used in the final round is total bytes used MOD
	;round quantum. This is the same as "AND" total_bytes_used with
	;(round_quantum - 1)	
	alu[lm_q_current_credit_used, packet_length, AND~, quantum_mask]

	br[update_queue_data#]

queue_not_empty#:

	alu[total_bytes_used, lm_q_current_credit_used, +, packet_length]
	
	;find round number
	alu[--, quantum_shift, OR, 0x0]
	alu_shf[final_port_and_round,--, B, total_bytes_used, >>indirect]

#ifdef DROP_PKTS_FOR_WRAP_AROUND_ROUNDS
	.begin

		;drop the packet if the round it would land in is too far ahead of
		;the enqueue base round
		.reg chk_final
		alu[chk_final, diff, +, final_port_and_round]
		alu[--, chk_final, -, @compare_round_val]
	
		#ifdef _DEBUG_COUNTERS_
			blt[no_drop#]
			alu[@class_sched_dropped_pkts, @class_sched_dropped_pkts, +, 1]
			br[drop_pkt_label]
		#else
			bge[drop_pkt_label]
		#endif

	.end 
no_drop#:
#endif

	;The packet is accepted: only now commit the credit used in the final
	;round (total bytes used MOD round quantum). Doing this before the drop
	;check would charge the queue for a packet that is never enqueued.
	alu[lm_q_current_credit_used, total_bytes_used, AND~, quantum_mask]

update_queue_data#:

	;add to last round schedule to get the port number and the final round number
	alu[final_port_and_round, final_port_and_round, +, last_enq_round]	

	; masking off the roll over bits
	alu[final_port_and_round, final_port_and_round, AND, port_round_mask]

	;Queue Manager treats queue number 0 as invalid, so a combined port/round
	;value of 0 is replaced by 1. The branch tests the result of the AND
	;above; the deferred instruction (count the enqueued packet) executes on
	;both paths.
	bne[not_round_0#], defer[1]
	alu[lm_q_enqueued_packets_count, lm_q_enqueued_packets_count, +, 1]

	alu[final_port_and_round, --, B, 1]
	
not_round_0#:	
	
	;remember port and round as the last scheduled round of this queue; the
	;top byte (quantum shift) is left unchanged
	ld_field[lm_q_last_enq_round_and_quantum_shift, 0111, final_port_and_round]

.end

#endm
///////////////////////////////////////////////////////////////////////////////
// activate_enq_base_round_for_next_dequeue_port
//	 	Description: 
//			Select, through local memory index 1, the enqueue base round
//			entry of the port named in a Next Dequeue Round Request. After
//			this macro lm_enq_base_round refers to that port's entry.
//
//	 	Outputs: 	none (active_lm_addr_1 is reloaded)
//
//		Inputs:	per_port_enq_base_rnd_start - base address in local memory of 
//										all enqueue base round data entries
//				$next_dequeue_round_request - request message; it is shifted
//										left by ENQ_BASE_ROUND_SHIFT and OR-ed
//										with the base to form the entry address
//
///////////////////////////////////////////////////////////////////////////////
#macro activate_enq_base_round_for_next_dequeue_port(per_port_enq_base_rnd_start, \
											$next_dequeue_round_request)
.begin
	.reg enq_base_lm_addr

	;entry address = table base | (request << ENQ_BASE_ROUND_SHIFT)
	alu_shf[enq_base_lm_addr, per_port_enq_base_rnd_start, OR, \
			$next_dequeue_round_request, <<ENQ_BASE_ROUND_SHIFT]

	;The code that follows must leave 3 cycles after this CSR write before
	;it reads the enqueue base round through the new pointer.
	local_csr_wr[active_lm_addr_1, enq_base_lm_addr]
.end
#endm

///////////////////////////////////////////////////////////////////////////////
/*
* Class Scheduler macro
	
Phase 1
	
	Read Scratch ring for Next Dequeue Round Request. 
	
	If no message on NN-ring

		Swap out. Wait for (scratch read done  && prev_thread)
		    
	Else
		Read message on NN-ring
		Extract packet length, queue number, port number, SOP from the message 
		Calculate queue address in SRAM. Use Queue address as tag to look up CAM.
		
		If CAM miss
			
			SRAM write to write back the LRU
			SRAM read for Queueu Data Structure of the new queue 
			and the Dequeued Packets Counter of the new queue
			Ctx_arb. Wait for (scratch read done  & prev_thread & SRAM_write_done &
								SRAM_read_done)
		Else (CAM hit)
			
			SRAM read the Dequeued Packets Counter of the new queue
			Ctx_arb. Wait for (scratch read done  & prev_thread & SRAM_read_done)			
		
		Endif
	Endif
	 
Phase 2
	If no message on NN-ring

		If (valid Nex Dequeue Round Request from Scratch ring)
			br[valid_next_dequeue_round_request#]
		Else
			br[beginning]
		Endif
	Else
		    	
		If CAM miss
			Move queue data to lmem
		Endif
		 
		If  (Enqueue Packets Counter == Dequeue Packets Counter) --> queue is empty
			
			last round scheduled = current round
			total bytes already used from queue credit = 0
		
		Else If (the difference between last schedule round and current round is
					less than MAX_ROUNDS_FOR_PENDING_COUNTER_UPDATE)
			
			When dequeue counter not equal to enqueue counter, there are 2 posibilities:
			the queue is really not empty or it's empty but the SRAM write in TX_HELPER ME
			to update dequeue counter is pending.
			If the number of rounds have passed is less than the 
			MAX_ROUNDS_FOR_PENDING_COUNTER_UPDATE, assume that the dequeue counter
			lags the enqueue counter, and the queue is actually empty.
			
			last round scheduled = current round
			total bytes already used from queue credit = 0
				
		Endif

		Compute how many more rounds from the current round this packet will
		have to wait before it can be dequeued. The number of rounds to
		wait is dependent on the total queue credit, the credit already used 
		in this round, and the packet size

		final round = last round schedule + number of rounds to wait

		
		If (valid Next Dequeue Round Request from Scratch ring)

valid_next_dequeue_round_request#:

			Extract port number from Next Dequeue Round Request
			
			Find the location of the current round of that port
			
			Next Dequeue Port and Round = current round
			
			Increment current round (this makes the current round no longer available 
			for enqueueing. Future packets to this port will be enqueued to the 
			new current round)

		Else

			Next Dequeue Port and Round = 1 << INVALID_BIT

		Endif

		Send message on NN-ring: SOP, enqueue port and round, next dequeue port 
								and round

*/
///////////////////////////////////////////////////////////////////////////////
#macro class_scheduler ()

.begin
	// Fields of the enqueue message, plus the addresses and results derived
	// from it. port_number, queue_number, packet_length, total_bytes_used and
	// $xfer_in are also referenced by name from inside the macros expanded
	// below.
	// NOTE(review): addr_out and enqueue_round are not referenced anywhere
	// in this file.
	.reg 	sop_handle packet_length port_number queue_number
	.reg 	port_queue_data_base enq_base_rnd_addr final_port_and_round

	.reg 	enq_struct_addr addr_out total_bytes_used enqueue_round
	.reg 	 $next_dequeue_round_request temp

	//Allocate xfer regs to read in new queue structure in CAM miss case
	//and 1 more for dequeue_counter		
	xbuf_alloc($xfer_in, NUM_LWS, read)
	xbuf_alloc($xfer_out, NUM_LWS, write)

	;Phase 1
new_phase_start#:

	;check scratch ring for NEW ROUND REQUEST message from COUNT or PORT SCHEDULER ME
	;(asynchronous; completion is signalled by get_round_request_done)
	read_sratch_ring($next_dequeue_round_request)

	;let the next thread start, then see whether there is a packet to enqueue
	local_csr_wr[same_me_signal, next_th_signal]	
	br_inp_state[NN_EMPTY, no_enqueue_msg#]

	read_nn_ring_msg(packet_length, queue_number, port_number, \
									sop_handle)

	;Calculate address of current round data for the port
	calc_addrs(enq_base_rnd_addr, enq_struct_addr, port_queue_data_base)

	;lm_enq_base_round now refers to the entry of the packet's port
	local_csr_wr[active_lm_addr_1, enq_base_rnd_addr]
	
    ;CAM lookup and read queue structure. The thread swaps out inside this
    ;macro; its CAM-hit path resumes at class_scheduler_phase_2# below.
	cam_lookup_and_get_queue_structures(enq_struct_addr, $xfer_in, $xfer_out)

class_scheduler_phase_2#:

	;Calculate enqueue round
	calc_round_and_update_queue_structure(final_port_and_round, drop_packet#)

	;Take care of the next dequeue round request from scratch ring
	;First, check for invalid new request message (bit 31 clear = no request)
	br_bclr[$next_dequeue_round_request, 31, invalid_next_dequeue_round_request#]

valid_next_dequeue_round_request#:
	;if valid message, then calculate local memory address for enqueue base round data
	activate_enq_base_round_for_next_dequeue_port(per_port_enq_base_rnd_start, \
											$next_dequeue_round_request)

	;send the current enqueue base round of the requested port as the next
	;round to dequeue from, then advance that base round
	write_to_nn_ring(sop_handle, final_port_and_round, lm_enq_base_round)

	update_enq_base_round_data()

	br[new_phase_start#]

invalid_next_dequeue_round_request#:
	
	alu[temp,--, B, 1, <<31]	;invalid response to Next Dequeue Round Request
	write_to_nn_ring(sop_handle, final_port_and_round, temp)

	br[new_phase_start#]

no_enqueue_msg#:
	
	;If NN-ring is empty, swap out until scratch ring get is done
	ctx_arb[prev_th_sig, get_round_request_done], all

check_new_request#:
	;Upon waking up, check for invalid new request message. With neither a
	;packet nor a request there is nothing to send.
	br_bclr[$next_dequeue_round_request, 31, new_phase_start#]
	
	;There is a request but no packet: send the response together with an
	;enqueue part that is marked invalid.
	alu[sop_handle, --, B, 0]
	;invalid enqueue message
	alu[final_port_and_round, --, B, 1, <<31]	;invalid enqueue message
	
	br[valid_next_dequeue_round_request#]

drop_packet#:
	;reached from calc_round_and_update_queue_structure: free the packet,
	;then still service a pending next dequeue round request
	dl_buf_drop[sop_handle]
	br[check_new_request#]

;this label is not branched to from anywhere in this file
done#:

.end

#endm

/////////////////////////////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////////////////////////////
//
// Program entry point: initialize once, then run the Class Scheduler forever.
//
/////////////////////////////////////////////////////////////////////////////////////////
main#:
.begin

	;per-thread constants, plus the one-time setup done by thread 0
	class_scheduler_init()

	;main loop
	.while (1)
		class_scheduler()
	.endw

	;never reached; this instruction only avoids warning 383
	nop
.end

/////////////////////////////////////////////////////////////////////////////////////////

#endif // __CLASS_SCHEDULER_UC__

/////////////////////////////////////////////////////////////////////////////////////////