/////////////////////////////////////////////////////////////////////////////////////////
//                                                                      
//                  I N T E L   P R O P R I E T A R Y                   
//                                                                      
//     COPYRIGHT (c)  2001 BY  INTEL  CORPORATION.  ALL RIGHTS          
//     RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//     BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//     RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//     LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//     MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//     THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                      
//                        INTEL  CORPORATION                            
//                                                                     
//                     2200 MISSION COLLEGE BLVD                        
//                                                                      
//               SANTA  CLARA,  CALIFORNIA  95052-8119 
//
/////////////////////////////////////////////////////////////////////////////////////////

#ifndef __COUNT_UC__
#define __COUNT_UC__

#include 	"scheduler_packet.h"
#include 	"stdmac.uc"
#include	"xbuf.uc"

// Inter-thread ordering signal. Contexts other than 0 wait on prev_th_sig in
// count_init(); next_th_signal is the value written to the SAME_ME_SIGNAL CSR
// at the start of each phase to raise this signal (NEXT_CONTEXT_BIT set,
// signal number placed at SIGNAL_NUMBER_FIELD).
.sig	volatile prev_th_sig
#define next_th_signal		((1 << NEXT_CONTEXT_BIT) | (&prev_th_sig << SIGNAL_NUMBER_FIELD))

// I/O completion signals used by the macros below:
//   read_enq_ctr_done              - SRAM read of the enqueue round counter
//   read_next_deq_ctr_done         - SRAM read of the next dequeue round counter
//   wrback_lru_ctr_done            - SRAM write-back of the evicted (LRU) counter
//   empty_prev_next_round_ctr_done - SRAM write that zeroes a dequeue round counter
//   put_round_request_done         - scratch ring put of a new dequeue round request
.sig	volatile read_enq_ctr_done
.sig	volatile read_next_deq_ctr_done
.sig 	volatile wrback_lru_ctr_done
.sig 	volatile empty_prev_next_round_ctr_done
.sig	volatile put_round_request_done

// Location in local memory of COUNT ME where the per round data structures are stored
// NOTE(review): the CAM lookups in this file use lm_addr0[0]/lm_addr1[0] and the
// init loop uses (port << 6) without adding this base, so a non-zero value
// would presumably need matching changes there - confirm before changing.
#define_eval    ROUND_CNTRS_LM_BASE        0

// Wakeup mask loaded at the start of phase 1. Signals that turn out not to be
// needed are cleared from sig_mask before the context swaps out;
// put_round_request_done is not part of the default and is OR-ed in only when
// a scratch ring put is issued.
#define	SIGMASK_DEFAULT_PHASE1	( (1 << &prev_th_sig) | \
								  (1 << &read_enq_ctr_done) | \
								  (1 << &wrback_lru_ctr_done) | \
								  (1 << &empty_prev_next_round_ctr_done) )

// Wakeup mask loaded at the start of phase 2. read_next_deq_ctr_done is
// cleared when no SRAM read is issued (CAM hit or invalid dequeue response).
#define	SIGMASK_DEFAULT_PHASE2	( (1 << &prev_th_sig) | \
								  (1 << &read_next_deq_ctr_done) )
									 
// Optional statistics, compiled in only when _DEBUG_COUNTERS_ is defined.
// The registers are zeroed in thread_0_count_init() and updated in
// write_to_nn_ring().
#ifdef _DEBUG_COUNTERS_
	//number of valid enqueue messages COUNT sent out
	.reg @count_enq_ctr
	//sum of the packet counts carried by the valid dequeue messages COUNT sent out
	.reg @count_deq_ctr
	//number of times the nn-ring between COUNT and PORT_SCHEDULER was polled and found full
	.reg @count_nn_ring_full
#endif

///////////////////////////////////////////////////////////////////////////////
// count_init
//	 	Description: 
//			Per-context initialization of the registers used by the Count code.
//			Every context loads its constants and marks the "previous dequeue
//			response" as invalid. Context 0 then runs thread_0_count_init();
//			all other contexts swap out until they receive prev_th_sig.
//
//	 	Outputs: (registers declared in the enclosing scope)
//			zero, $zero, valid_scratch_ring_msg, port_only_mask,
//			round_pkts_ctr_base, prev_deq_port, prev_deq_round,
//			prev_deq_round_pkts_count, prev_next_deq_pkts_ctr_offset
//
//		Inputs:  none
//		
///////////////////////////////////////////////////////////////////////////////
#macro count_init()

.begin

	;---- constants ---------------------------------------------------------

	immed[zero, 0]
	immed[$zero, 0]

	;a valid scratch ring message has bit 31 = 1 (the opposite of the nn-ring
	;convention), because reading an empty scratch ring returns 0
	move(valid_scratch_ring_msg, 0x80000000)

	;port and round number are sent to the port_scheduler block packed into
	;16 bits, the most significant 4 bits being the port number
	move(port_only_mask, 0xf000)

	;NOTE(review): the error text asks for alignment at
	;1 << NUMBER_OF_BITS_FOR_ROUNDS; whether the shift below tests exactly that
	;depends on the preprocessor's integer width - confirm.
	#if (SCHED_ROUND_COUNTERS_BASE << (31 - NUMBER_OF_BITS_FOR_ROUNDS) )
		#error "Base of round counters must be aligned at boundary of \
				1 << NUMBER_OF_BITS_FOR_ROUNDS"
	#endif

	immed32(round_pkts_ctr_base, SCHED_ROUND_COUNTERS_BASE)

	;---- "previous next-dequeue response" state: start out invalid ---------

	;the invalid flag is bit 31 of a message; port = message >> 16, so the
	;flag appears as bit 15 of the port register
	alu_shf[prev_deq_port, --, B, 1, <<15]
	immed[prev_deq_round, 0]
	immed[prev_deq_round_pkts_count, 0]
	immed[prev_next_deq_pkts_ctr_offset, 0]

	;---- one-time ME setup by context 0, everybody else waits for its turn --

	.if (ctx() == 0)

		thread_0_count_init()

	.else

		ctx_arb[prev_th_sig]

	.endif

.end
#endm

///////////////////////////////////////////////////////////////////////////////
// thread_0_count_init
//	 	Description: 
//			Initialization that only needs to be done once (by thread 0):
//			  - zero the debug counters (when _DEBUG_COUNTERS_ is defined)
//			  - OR 0x0008FF00 into the CTX_ENABLES CSR
//			  - reset the NN_GET and NN_PUT CSRs to 0
//			  - initialize the CAM and local memory
//			  - point local memory address 0 at ROUND_CNTRS_LM_BASE
//			  - swap out until system_init_sig is received
//
//	 	Outputs: none
//
//		Inputs:  none
//		
///////////////////////////////////////////////////////////////////////////////
#macro thread_0_count_init()
.begin

	.reg ctx_enable_data
	.reg set_bits

#ifdef _DEBUG_COUNTERS_
	immed[@count_enq_ctr, 0]
	immed[@count_deq_ctr, 0]
	immed[@count_nn_ring_full, 0]
#endif

	; Set bits [15:8] to FF to enable all 8 contexts 
	; Set bits [19:18] to 10 to specify that the message on NN-ring is 3 longwords
	; (read_nn_ring_message consumes 3 longwords per message).
	; local_csr_rd followed by immed[reg, 0] is the read idiom: the immed
	; receives the CSR value, which is why ctx_enable_data can be OR-ed below.
	local_csr_rd[ctx_enables]
	immed[ctx_enable_data, 0]

	immed32[set_bits, 0x0008FF00]
	alu[ctx_enable_data, ctx_enable_data, OR, set_bits]
	local_csr_wr[ctx_enables, ctx_enable_data]

	; start with empty NN-ring get/put pointers
	local_csr_wr[nn_get, 0]
	local_csr_wr[nn_put, 0]

	init_local_memory_and_cam()

	; init_local_memory_and_cam leaves active_lm_addr_0 pointing past the last
	; entry it initialized; bring it back to the base
	local_csr_wr[active_lm_addr_0, ROUND_CNTRS_LM_BASE]

	; Thread 0 of this ME waits for system initialization.
	; NOTE(review): the original comment said "Sort ME"; this file is the
	; Count ME - presumably a leftover from a copied file, confirm.
	ctx_arb[system_init_sig]

.end

#endm
///////////////////////////////////////////////////////////////////////////////
// init_local_memory_and_cam
//	 	Description: 
//			Initialize all 16 cam entries with valid tag for 16 ports (one
//			entry per port) 
//			The tag value of an entry N equals the offset of the
//			packets counter associated with the initial enqueue
//			base round (INIT_ENQ_ROUND) of port N.
//			Even if INIT_ENQ_ROUND = 0, only 1 CAM entry has tag 0 (the 
//			offset to the packets counter of port 0).
//			This will enable CAM_lookup with tag = 0x0 without unpredictable LRU (for 
//			details, refer to IXP2800 Programmer Reference Manual under cam_lookup).
//			Init 16 long words in local memory (one per CAM entry, 64 bytes
//			apart) with the initial packets count, which is 0.
//			Also loads @default_cam_tag with (0 - SCHED_QUEUE_STRUCTURES_SIZE),
//			the tag later written into CAM entries to invalidate them.
//
//	 	Outputs: @default_cam_tag
//
//		Inputs:  none
//
//		Side effects: clears and rewrites the CAM; leaves active_lm_addr_0
//			at (16 << 6) - the caller resets it afterwards.
//		
///////////////////////////////////////////////////////////////////////////////

#macro init_local_memory_and_cam()
.begin
	
	.reg counter_offset port lm_offset

	cam_clear

	#define_eval _TAG_VAL (0 - SCHED_QUEUE_STRUCTURES_SIZE)
	move(@default_cam_tag, _TAG_VAL)
	#undef _TAG_VAL

	;point local memory at the slot of CAM entry 0 before entering the loop
	local_csr_wr[active_lm_addr_0, ROUND_CNTRS_LM_BASE]
	immed[port, 0]


	.while (port < 16)

		;shift port number to find the offset of counter for that port
		alu_shf[counter_offset, --, B, port, <<ROUND_PKTS_COUNTER_SHIFT]

		;within each port, find the offset for the initial round
		alu_shf[counter_offset, counter_offset, OR, INIT_ENQ_ROUND, \
				<<PACKETS_CTR_SHIFT]

		;CAM entry number == port number
		cam_write[port, counter_offset, NORMAL_CAM_STATE]
		
		alu[port, port, +, 1]

		;zero the counter of the entry just written: the local memory pointer
		;still holds the address set up before this iteration (port has
		;already been advanced, the pointer has not)
		immed[*l$index0[0], 0]		
		;each CAM entry is stored in 64-byte block in local memory
		alu_shf[lm_offset, --, B, port, <<6]

		;advance the pointer for the next iteration; the instructions at the
		;top of the loop run before it is used again
		local_csr_wr[active_lm_addr_0, lm_offset]
	.endw

.end

#endm

///////////////////////////////////////////////////////////////////////////////
// read_nn_ring_message
//	 	Description: 
//			Read NN-ring to get message from previous ME (Class_scheduler).
//			A message is 3 longwords, consumed in this order:
//			  1. SOP handle
//			  2. enqueue message      (round in bits 15:0, port = message >> 16)
//			  3. next dequeue message (round in bits 15:0, port = message >> 16)
//			If the NN-ring is empty, control branches to
//			no_class_scheduler_msg#, which must exist in the code that
//			invokes this macro.
//
//	 	Outputs: (registers of the enclosing scope)
//			sop_handle, enq_round, enq_port, next_deq_round, next_deq_port
//
//		Inputs:  none
//		
///////////////////////////////////////////////////////////////////////////////

#macro read_nn_ring_message()

	br_inp_state[NN_EMPTY, no_class_scheduler_msg#]

	alu[sop_handle,--,B,*n$index++]

	;the same longword is read twice: first without advancing the ring
	;pointer (low 16 bits -> round), then with post-increment (>> 16 -> port)
	ld_field_w_clr[enq_round, 0011, *n$index]
	alu_shf[enq_port, --, B, *n$index++, >>16]

	ld_field_w_clr[next_deq_round, 0011, *n$index]
	alu_shf[next_deq_port, --, B, *n$index++, >>16]

#endm
///////////////////////////////////////////////////////////////////////////////
// write_to_nn_ring
//	 	Description: 
//			Write to NN-ring the message to next ME (Port_scheduler) and update
//			debug counters. Spins while the NN-ring is full.
//			The message is 4 longwords, written in this order:
//			  1. enqueue word : in_enq_round | (in_enq_port << NUMBER_OF_BITS_FOR_ROUNDS),
//			                    byte 3 taken from (in_enq_port << 16)
//			  2. in_sop
//			  3. dequeue word : same packing, built from in_deq_round/in_deq_port
//			  4. in_deq_round_pkts_count
//			Bit 15 of a port register therefore ends up in bit 31 (the INVALID
//			flag) of the corresponding word.
//
//	 	Outputs: none
//
//		Inputs:		in_sop - packet SOP
//					in_enq_port, in_enq_round - port and round where enqueue 
//												will happen 
//					in_deq_port, in_deq_round - port and round where dequeue 
//												will happen 
//				 	in_deq_round_pkts_count	  - the number of packets available
//												in the dequeue round
//		
///////////////////////////////////////////////////////////////////////////////
#macro write_to_nn_ring(in_sop, in_enq_port, in_enq_round, in_deq_port, \
						in_deq_round, in_deq_round_pkts_count)
.begin
	.reg temp_enq
	.reg temp_deq

	;busy-wait until the NN-ring has room
count_nn_ring_full#:

#ifdef _DEBUG_COUNTERS_

	;debug build: same wait loop, but count every poll that finds the ring full
	br_!inp_state[NN_full, count_nn_ring_not_full#]
	alu[@count_nn_ring_full, @count_nn_ring_full, +, 1]
	br[count_nn_ring_full#]

count_nn_ring_not_full#:
#else
	br_inp_state[NN_full, count_nn_ring_full#]
#endif
	
	alu[temp_enq, in_enq_round, OR, in_enq_port, <<NUMBER_OF_BITS_FOR_ROUNDS]
	;load the bit 15 of port so that if the message is invalid bit 31 will be set
	;(byte 3 is replaced as a whole, so any port bits the shift above left in
	;bits 31:24 are overwritten)
	ld_field[temp_enq, 1000, in_enq_port, <<16]


	alu[temp_deq, in_deq_round, OR, in_deq_port, <<NUMBER_OF_BITS_FOR_ROUNDS]
	;load the bit 15 of port so that if the message is invalid bit 31 will be set
	ld_field[temp_deq, 1000, in_deq_port, <<16]

	;send message to Port Scheduler	
	

	alu[*n$index++, --, B, temp_enq]

	alu[*n$index++, --, B, in_sop]

	alu[*n$index++, --, B, temp_deq]

	;this label is not referenced inside the macro
	;NOTE(review): presumably a marker for performance measurement - confirm
count_measure_perf#:
	alu[*n$index++, --, B, in_deq_round_pkts_count]

#ifdef _DEBUG_COUNTERS_

	;count only valid (bit 31 clear) enqueue messages
	br_bset[temp_enq, 31, skip_enq_ctr_incr#]

	alu[@count_enq_ctr, @count_enq_ctr, +, 1]

	skip_enq_ctr_incr#:

	;for valid dequeue messages, accumulate the packet count that was sent
	br_bset[temp_deq, 31, skip_deq_ctr_incr#]

	alu[@count_deq_ctr, @count_deq_ctr, +, in_deq_round_pkts_count]

	skip_deq_ctr_incr#:

#endif

.end	
#endm
///////////////////////////////////////////////////////////////////////////////
// read_next_dequeue_round_pkts_counter
//	 	Description: 
//			Fetch the packets counter of the round to be dequeued next and
//			save it in prev_deq_round_pkts_count, to be acted upon the next
//			time through phase 1.
//			  CAM miss: the counter is read from SRAM.
//			  CAM hit : the counter is taken from local memory and the CAM
//			            entry is invalidated (tag = @default_cam_tag).
//			In both cases the context swaps out waiting on sig_mask and, once
//			woken, branches to start_phase_1#. The macro never falls through.
//
//	 	Outputs: (registers of the enclosing scope)
//			prev_next_deq_pkts_ctr_offset - SRAM offset of the counter
//			prev_deq_round_pkts_count     - value of the counter
//			sig_mask                      - read_next_deq_ctr_done cleared on a hit
//
//		Inputs:	 (registers of the enclosing scope)
//			next_deq_port, next_deq_round, round_pkts_ctr_base, sig_mask,
//			$xfer_in, @default_cam_tag
///////////////////////////////////////////////////////////////////////////////
#macro read_next_dequeue_round_pkts_counter()

.begin

	.reg	cam_result		// result of CAM lookup
	.reg	cam_entry		// entry number from CAM lookup
	

	;For next dequeue port and round, find the enqueued packets counter 
	;of that round and check if the round has any packets. 

	;address of the round counter =  
	; port number * (number of rounds in each port * size of each counter )
	; + round number * size of each counter
	#define_eval _SHF_BITS  (NUMBER_OF_BITS_FOR_ROUNDS + PACKETS_CTR_SHIFT)
	alu_shf[prev_next_deq_pkts_ctr_offset, --, B, next_deq_port, <<_SHF_BITS]
	#undef _SHF_BITS
	alu_shf[prev_next_deq_pkts_ctr_offset, prev_next_deq_pkts_ctr_offset, OR, \
			next_deq_round, <<PACKETS_CTR_SHIFT]

	;CAM lookup and read packets counter of next dequeue round
	;(local memory pointer 1 is set up from the lookup result)
	cam_lookup[cam_result, prev_next_deq_pkts_ctr_offset], lm_addr1[0]

	;bit 7 of the lookup result is the hit flag
	br_bset[cam_result, 7, read_next_dequeue_round_pkts_counter_cam_hit#]

	;CAM miss
	sram[read, $xfer_in[0], round_pkts_ctr_base, prev_next_deq_pkts_ctr_offset, 1], \
		sig_done[read_next_deq_ctr_done]	

	;swap out; the wakeup mask is programmed in the defer slot
	ctx_arb[--] , defer[1]
	local_csr_wr[active_ctx_wakeup_events, sig_mask]
	
	;save the packets counter to be use in next time around. Then, this thread will
	;send dequeue message based on this previous next dequeue response
	.io_completed read_next_deq_ctr_done
	alu[prev_deq_round_pkts_count, --, B, $xfer_in[0]]
		
	br[start_phase_1#]

read_next_dequeue_round_pkts_counter_cam_hit#:	

	;entry number is in bits 6:3 of the lookup result
	alu[cam_entry, 0xF, and, cam_result, >>3]

	;The counter is cached in local memory. It is not written back to SRAM
	;here: its value is captured below, and the SRAM copy is zeroed by
	;process_prev_next_deq_pkts_count_data if the round turns out to have
	;packets.

	;invalidate the hit entry
	cam_write[cam_entry, @default_cam_tag, NORMAL_CAM_STATE]
	;swap out; the two instructions below run in the defer slots
	ctx_arb[--] , defer[2]

	;no SRAM read was issued: mask out signal from SIGMASK_DEFAULT_PHASE2
	alu_shf[sig_mask, sig_mask, AND~, 1, <<(&read_next_deq_ctr_done)]
	local_csr_wr[active_ctx_wakeup_events, sig_mask]
	
	;upon waking up, save the packets counter to be use in next time around. 
	;Then, this thread will send dequeue message based on this previous 
	;next dequeue response
	alu[prev_deq_round_pkts_count, --, B, *l$index1[0]]	
	br[start_phase_1#]

.end

#endm
///////////////////////////////////////////////////////////////////////////////
// process_prev_next_deq_pkts_count_data
//	 	Description: 
//			Check if the potential next dequeue round (read from NN-ring in 
//			previous message) has packets. If yes, forward it to port_scheduler. 
//			If no, resend the request for a dequeue round back to Class Scheduler.
//			The reason for processing ONLY the dequeue round from the previous message:
//			once a round is sent as a next dequeue round by the Class Scheduler, 
//			enqueueing is no longer done to that round, to guarantee that enqueuing
//			and dequeueing do NOT happen on the same round. 
//			This is achieved by incrementing the enqueue base round since 
//			enqueueing can only happen from enqueue base round onward. 
//			However, there may be enqueueing already started by other threads on that 
//			same round. If we wait until the next message, all enqueueing activities 
//			on the dequeue round must be done.   
//
//			Three outcomes:
//			  - previous response invalid (bit 15 of prev_deq_port set):
//			      empty_prev_next_round_ctr_done is cleared from sig_mask
//			  - round has packets: SRAM write of $zero to the round's counter
//			      (signals empty_prev_next_round_ctr_done); sig_mask unchanged
//			  - round has no packets: wait for room in the scratch ring, put
//			      (valid_scratch_ring_msg | prev_deq_port) on it, set bit 15 of
//			      prev_deq_port, add put_round_request_done to sig_mask and
//			      clear empty_prev_next_round_ctr_done from it
//
//	 	Outputs: 	prev_deq_port (bit 15 set when the round is empty)
//					sig_mask
//
//		Inputs:		prev_deq_round_pkts_count
//					prev_deq_port
//					prev_deq_round  (not referenced in the body)
//					prev_next_deq_pkts_ctr_offset
//					sig_mask
//
//		Also uses, from the enclosing scope (not passed as parameters):
//					$next_deq_round_request, valid_scratch_ring_msg,
//					$zero, zero, round_pkts_ctr_base
//	
///////////////////////////////////////////////////////////////////////////////
#macro process_prev_next_deq_pkts_count_data(prev_deq_round_pkts_count, \
		prev_deq_port, prev_deq_round, prev_next_deq_pkts_ctr_offset, sig_mask)

.begin

	;if invalid prev next dequeue port and round message, no need to empty 
	;round counter. Just mask off empty_prev_next_round_ctr_done signal in 
	;default sig_mask.
	;Invalid message has bit 31==1. Since port is message >>16, bit 15 of
	;port is set if invalid
	br_bset[prev_deq_port, 15, mask_sig_empty_prev_next_round_ctr_done#]

	;bit 31 = 1 is VALID bit for scratch ring request (opposite to message on nn-ring)
	;The request word is prepared unconditionally; it is only sent when the
	;round turns out to be empty.
	alu_shf[$next_deq_round_request, valid_scratch_ring_msg, OR, prev_deq_port]

	;set the condition codes from the packet count
	alu[--, --, B, prev_deq_round_pkts_count]
	beq[round_has_no_pkts#]

round_has_pkts#:
	;Zero out the packets counter of the dequeue round
	;because dequeuing will happen until the round is empty
	sram[write, $zero, round_pkts_ctr_base, prev_next_deq_pkts_ctr_offset, 1], \
		sig_done[empty_prev_next_round_ctr_done]
	
	br[end_process_prev_next_deq_pkts_count_data#]

	
round_has_no_pkts#:
	;dequeue port has no packet. Send New Dequeue Round Request message back 
	;to Class_Scheduler ME
	;busy-wait while the scratch ring is full
count_sort_scratch_ring_full#:	
	br_inp_state[RING_FULL, count_sort_scratch_ring_full#]

	scratch[put, $next_deq_round_request, zero, RING_ADDR, 1], \
			sig_done[put_round_request_done]

	;if round has no packet, set INVALID bit in the outgoing dequeue message
	;so that the next block (Port_Scheduler) doesn't have to spend time checking 
	;the empty round. Since bit 31 of the outgoing dequeue message 
	;comes from bit 15 of prev_deq_port, setting bit 15 in prev_deq_port
	;will result in invalid dequeue message
	alu[prev_deq_port, prev_deq_port, OR, 1, <<15]

	;the caller must now also wait for the scratch put to complete
	alu_shf[sig_mask, sig_mask, OR, 1, <<(&put_round_request_done)]

	;reached both from the invalid-response branch and by falling through from
	;the empty-round path: no SRAM zeroing write was issued, so do not wait on it
mask_sig_empty_prev_next_round_ctr_done#:
	alu_shf[sig_mask, sig_mask, AND~, 1, <<(&empty_prev_next_round_ctr_done)]

end_process_prev_next_deq_pkts_count_data#:

.end
#endm

///////////////////////////////////////////////////////////////////////////////
// read_enqueue_pkts_counter
//	 	Description: 
//			Increment the packets counter of the round being enqueued to.
//			The counters are cached in local memory, indexed through the CAM
//			(tag = SRAM offset of the counter).
//			  CAM hit : the context swaps out waiting on sig_mask (with the
//			            two SRAM signals cleared), then increments the cached
//			            counter and branches to start_phase_2#.
//			  CAM miss: the counter is read from SRAM, the counter held by
//			            the entry returned by the lookup is written back to
//			            SRAM at that entry's old tag, the entry is re-tagged,
//			            and after the swap-out the value read plus 1 is stored
//			            in local memory. Control then falls out of the macro.
//
//	 	Outputs: 	sig_mask (SRAM signals cleared on a hit)
//
//		Inputs:		enq_port, enq_round - port and round being enqueued to
//					$xfer_in            - read transfer register for the SRAM read
//					sig_mask            - wakeup mask prepared by the caller
//
//		Also uses, from the enclosing scope (not passed as parameters):
//					$xfer_out, round_pkts_ctr_base
///////////////////////////////////////////////////////////////////////////////

#macro read_enqueue_pkts_counter(enq_port, enq_round, $xfer_in, sig_mask)
.begin

	.reg 	enq_pkts_ctr_offset 
	.reg	cam_result		// result of CAM lookup
	.reg	cam_entry		// entry number from CAM lookup
	.reg	cam_tag			// tag in CAM for entry 
	

	;offset of the round counter =
	; port number * (number of rounds in each port * size of each counter)
	; + round number * size of each counter
	#define_eval _SHF_BITS (NUMBER_OF_BITS_FOR_ROUNDS + PACKETS_CTR_SHIFT)
	alu_shf[enq_pkts_ctr_offset, --, B, enq_port, <<_SHF_BITS]
	alu_shf[enq_pkts_ctr_offset, enq_pkts_ctr_offset, OR, enq_round, <<PACKETS_CTR_SHIFT]
	#undef _SHF_BITS

	;local memory pointer 0 is set up from the lookup result
	cam_lookup[cam_result, enq_pkts_ctr_offset],lm_addr0[0]

	;check lookup result (bit 7 = hit)
	br_bset[cam_result, 7, count_read_enq_ctr_cam_hit#]

	;this is a CAM miss case
	;LRU LM port => SRAM; SRAM enqueue counter =>$Xfer0
	;read enqueue counter
	sram[read, $xfer_in[0], round_pkts_ctr_base, enq_pkts_ctr_offset, 1], \
		sig_done[read_enq_ctr_done]

	;get CAM entry (bits 6:3 of the lookup result)
	alu[cam_entry, 0xF, and, cam_result, >>3]
	; read the tag currently held by that entry; it is used below as the SRAM
	; offset for the write-back.
	; NOTE(review): the original comment described the tag as "port number in
	; bits 19:16 and round number in 11:0", which does not match how tags are
	; built in this file (counter offsets) - presumably stale, confirm.
	cam_read_tag[cam_tag, cam_entry]
									
	;Move LRU LM data to the $xfer_out	
	;On the first miss, LRU = 0. This means the content of local memory address 0
	;is written back to the SRAM location of port 0 counter. Since this local mem 
	;value is initialized to 0, this will not cause any problem
	alu[$xfer_out[0], --, B, *l$index0[0]]

	sram[write, $xfer_out[0], round_pkts_ctr_base, cam_tag, 1], \
		sig_done[wrback_lru_ctr_done]
	
	; update CAM LRU entry
	cam_write[cam_entry, enq_pkts_ctr_offset, NORMAL_CAM_STATE]

	;swap out; the wakeup mask is programmed in the defer slot and execution
	;resumes at move_data_to_lm#
	ctx_arb[--] , defer[1], br[move_data_to_lm#]
	local_csr_wr[active_ctx_wakeup_events, sig_mask]

count_read_enq_ctr_cam_hit#:

	;no SRAM read was issued: mask out signal from SIGMASK_DEFAULT_PHASE1
	alu[sig_mask, sig_mask, AND~, 1, <<(&read_enq_ctr_done)]

	;swap out; the two instructions below run in the defer slots
	ctx_arb[--] , defer[2]

	;no write-back was issued: mask out signal from SIGMASK_DEFAULT_PHASE1
	alu[sig_mask, sig_mask, AND~, 1, <<(&wrback_lru_ctr_done)]
	local_csr_wr[active_ctx_wakeup_events, sig_mask]

	;upon waking up from context arb, increment enqueue packets counter
	alu[*l$index0[0], *l$index0[0], +, 1]
	br[start_phase_2#]

move_data_to_lm#:
	.io_completed read_enq_ctr_done wrback_lru_ctr_done
	;move enqueue packets counter data to local mem and increment counter
	alu[*l$index0[0], 1, +, $xfer_in[0]]
				
.end

#endm

///////////////////////////////////////////////////////////////////////////////
// Count
//	 	Description: Perform several functions:
//			- read message from Class_scheduler
//			- forward enqueue port and round
//			- save next dequeue round to be processed after reading the
//			  next message
//			- process the next dequeue round from previous message: check
//				if this round has any packets. If yes, forward the dequeue round.
//				If no, send a Scratch ring message back to Class scheduler to
//				request another dequeue round

/*	
	////////////////Phase 1///////////////////////////////////////////////////
start_phase_1#:

	Initialize signal_mask = previous thread signal

	//First, process previous next dequeue data
	If (previous dequeue round packets count == 0)
	
		Extract the port from previous dequeue port and round
		
		Send port number to CLASS SCHEDULER again on Scratch ring to request another
		next dequeue round

		Add scratch_write_done to signal_mask

		Set invalid bit in previous dequeue port and round so that the next
		block (Port_Scheduler) doesn't have to waste time checking empty round

	Else
	
		SRAM write to zero out the Packets Counter because once a round number 
		is sent for dequeue, all packets in that round will be dequeued (this 
		write does not pose any conflict with the packets counter of the enqueue
		round since the round was marked unavailable for enqueue as soon as it was
		sent as next dequeue packet round)

		Add empty_prev_next_round_ctr_done to signal mask
		
	Endif
	
	//Second, read and process the new message from the nn-ring
	If (nn-ring empty)
	 branch to no_class_scheduler_msg#
	
	Else
		Read message from nn-ring: sop, enqueue port and round, next dequeue 
								   port and round
		
		If (invalid enqueue message)
		 	Branch to invalid_enqueue_message#
		Endif

		Write message to nn-ring: sop, enqueue port and round, previous dequeue port
									and round, previous dequeue packets count

		Extract enqueue port and round from enqueue message
		
		Calculate SRAM offset of the Packets Counter for that round
		
		Use  SRAM offset as a tag to look up CAM

		If CAM miss
			
			SRAM write back the LRU
			
			SRAM read to get Packets Counter

			Add sram_read_done, sram_write_back_done to signal mask
		
		Endif

		Ctx_arb[--]. Wait on all signals in signal_mask


		If CAM miss

			Move Packets Counter value of enqueue round to local memory
		Endif


		Increment Packets Counter of enqueue round in local memory

	////////////////Phase 2///////////////////////////////////////////////////
start_phase_2#:	

		//Third, process next dequeue port and round message

		If (invalid next dequeue port and round message)
		 	Branch to invalid_next_dequeue_message#
		Endif

		Extract next dequeue port and round from next dequeue message
		
		Calculate SRAM offset of the Packets Counter for the next dequeue round
		
		Use  SRAM offset as a tag to look up CAM

		If CAM hit			
		
			Save the packets count from CAM entry to previous_next_dequeue_pkts_count

			Invalidate the entry
		
		Else
			SRAM read to get Packets Counter

			Add sram_read_done to signal mask
		
		Endif
		
		Ctx_arb[--]. Wait on all signals in signal_mask
								
		If CAM miss		
		
			Save the Packets Counter to previous_next_dequeue_pkts_count
		Endif

	Endif	//if no nn-ring empty


invalid_enqueue_message#:

		
		If ( (valid previous dequeue port and round) && 
			 (previous_next_dequeue_pkts_count != 0) )
			Write message to nn-ring: sop=0, invalid enqueue port and round, 
									previous dequeue port and round, 
									previous dequeue packets count
								
		Endif

		Ctx_arb[--]. Wait on all signals in signal_mask	
		br[start_phase_2#]

invalid_next_dequeue_message#:		
		Ctx_arb[--]. Wait on all signals in signal_mask	
		br[start_phase_1#]


no_class_scheduler_msg#:
		sop=0
		enqueue port and round = invalid
		next dequeue port and round = invalid

		br[invalid_enqueue_message#]

*/
////////////////////////////////////////////////////////////////////////////////
#macro Count()

; Main loop of the Count ME. It never returns: each context alternates
; forever between phase 1 (handle the previous dequeue response, read a new
; message, forward it, update the enqueue counter) and phase 2 (fetch the
; counter of the newly announced dequeue round), swapping out at the end of
; each phase.
;
; Relies on registers declared by the invoking scope: sop_handle, sig_mask,
; prev_deq_port, prev_deq_round, prev_deq_round_pkts_count,
; prev_next_deq_pkts_ctr_offset, $zero (plus those used inside the macros
; invoked below).

.begin

	.reg	enq_port 						//port number from enqueue message
	.reg	enq_round 						//round number from enqueue message
	.reg	next_deq_port 					//port number from next dequeue round 
											//response message
	.reg	next_deq_round 					//round number from next dequeue 
											//round response message

	.reg write $next_deq_round_request 		//port number to send back the CLASS SCHEDULER

	xbuf_alloc($xfer_in, 1, read)
	xbuf_alloc($xfer_out, 1, write)	
				
	//------------------Phase 1------------------------------------------------------
start_phase_1#:

	;write wake up signal for next thread
	local_csr_wr[same_me_signal, next_th_signal]	

	;Default value for SIG_MASK is prev_th_sig, read_enq_ctr_done,
	;wrback_lru_ctr_done and empty_prev_next_round_ctr_done; the signals
	;that turn out not to be needed are cleared before swapping out
	alu_shf[sig_mask, --, B, SIGMASK_DEFAULT_PHASE1]

	process_prev_next_deq_pkts_count_data(prev_deq_round_pkts_count, \
		prev_deq_port, prev_deq_round, prev_next_deq_pkts_ctr_offset, sig_mask)	

	;branches to no_class_scheduler_msg# when the nn-ring is empty
	read_nn_ring_message()
		
	;check for invalid enqueue message: bit 31=1. This means bit 15 of
	;enq_port = 1 (enq_port = enqueue message >> 16) 
	br_bset[enq_port, 15, invalid_enqueue_message#]

	;write nn-ring message with enqueue message and prev next dequeue response
	write_to_nn_ring(sop_handle, enq_port, enq_round, prev_deq_port, \
					prev_deq_round, prev_deq_round_pkts_count)
						
	;CAM lookup with tag = enqueue counter offset
	;If CAM miss: read the counter and write-back LRU
	;(on a hit the macro branches to start_phase_2#, on a miss it falls through)
	read_enqueue_pkts_counter(enq_port, enq_round, $xfer_in, sig_mask)
	
	//------------------Phase 2------------------------------------------------------
start_phase_2#:
	.io_completed $zero $next_deq_round_request

	;write wake up signal for next thread
	local_csr_wr[same_me_signal, next_th_signal]	

	alu_shf[sig_mask, --, B, SIGMASK_DEFAULT_PHASE2]

	;move current next_deq_round response to previous next_deq_round response
	alu[prev_deq_port, --, B, next_deq_port]
	alu[prev_deq_round, --, B, next_deq_round]
		
	;check for invalid dequeue message (bit 31=1). This means bit 15 of
	;deq_port = 1 (deq_port = dequeue message >> 16) 
	br_bset[next_deq_port, 15, invalid_next_dequeue_round_response#]

	;CAM lookup with tag == the dequeue counter offset	
	;CAM miss: read the counter and save it in prev_deq_round_pkts_count
	;CAM hit: save the counter value in prev_deq_round_pkts_count and invalidate
	;the CAM entry. This counter value will be zero out the next time the
	;dequeue round is sent to Port Scheduler, so no need to write it back.
	;Then branch to Phase 1 at the end (the macro never falls through)

	read_next_dequeue_round_pkts_counter()

invalid_next_dequeue_round_response#:
	;no need to get dequeue round packets counter. Just
	;swap out and begin at phase 1 upon waking up
	ctx_arb[--] , defer[2], br[start_phase_1#]
	
	//mask out signal from SIGMASK_DEFAULT_PHASE2
	alu_shf[sig_mask, sig_mask, AND~, 1, <<(&read_next_deq_ctr_done)]
	local_csr_wr[active_ctx_wakeup_events, sig_mask]

invalid_enqueue_message#:	
	;if previous next dequeue round response is valid AND the round has 
	;>0 packets count, write message to nn-ring	

	;first check for invalid previous dequeue response: bit 31=1. This means
	;bit 15 of prev_deq_port = 1 (deq_port = dequeue message >> 16) 
	br_bset[prev_deq_port, 15, no_valid_msg#]

	;set the condition codes from the packet count
	alu[--, prev_deq_round_pkts_count, OR, 0]
	beq[no_valid_msg#]

	;write nn-ring message with (invalid) enqueue message and prev next
	;dequeue response
	write_to_nn_ring(sop_handle, enq_port, enq_round, prev_deq_port, \
					prev_deq_round, prev_deq_round_pkts_count)

no_valid_msg#:	
	;no enqueue counter access was started in this pass:
	//mask signals from SIGMASK_DEFAULT_PHASE1
	alu[sig_mask, sig_mask, AND~, 1, <<(&read_enq_ctr_done)]

	ctx_arb[--] , defer[2], br[start_phase_2#]

	//mask signals from SIGMASK_DEFAULT_PHASE1
	alu[sig_mask, sig_mask, AND~, 1, <<(&wrback_lru_ctr_done)]
	local_csr_wr[active_ctx_wakeup_events, sig_mask]

no_class_scheduler_msg#:
	;nn-ring was empty: fabricate an all-invalid message so that the common
	;path still forwards a pending dequeue response
	
	immed[sop_handle, 0]

	immed[enq_round, 0]	
	alu_shf[enq_port, --, B, 1, <<15]	
	
	immed[next_deq_round, 0]	
	alu_shf[next_deq_port, --, B, 1, <<15]	
	
	;branch to invalid_enqueue_message#
	br[invalid_enqueue_message#]

	;no branch in this macro targets end_count#, and the code above never
	;falls through to it; the frees pair up with the xbuf_alloc calls above
end_count#:

	xbuf_free($xfer_in)
	xbuf_free($xfer_out)	
		
.end

#endm


/////////////////////////////////////////////////////////////////////////////////////////
//
// This is where the code begins..................
//
/////////////////////////////////////////////////////////////////////////////////////////

; Entry point of the Count microengine program. Declares the registers that
; are shared between count_init() and Count() (the macros reference them by
; name rather than through parameters), runs the initialization and then
; enters the Count loop.
main#:

.begin

	.reg round_pkts_ctr_base zero 		//SRAM base of the round counters; constant 0
	.reg sop_handle 					//SOP handle read from the nn-ring
	.reg prev_deq_round_pkts_count		//packets count of the previous next-dequeue round
	.reg prev_deq_port					//port of the previous next-dequeue response (bit 15 = invalid)
	.reg prev_deq_round					//round of the previous next-dequeue response
	.reg @default_cam_tag 				//tag written to a CAM entry to invalidate it
	.reg valid_scratch_ring_msg			//0x80000000: VALID bit of a scratch ring request
	.reg prev_next_deq_pkts_ctr_offset 	//offset of the packets counter for the round
										//in previous next_dequeue_round response
	.reg $zero							//write transfer register holding 0
	.reg port_only_mask					//mask to extract only port number
	.reg	sig_mask 					//mask of signals to write in ctx_wakeup_events_active
	
	count_init()

	Count();

	;no branch in this file targets error#
error#:

	nop; to avoid warning
.end

/////////////////////////////////////////////////////////////////////////////////////////

#endif // __COUNT_UC__

