//------------------------------------------------------------------------------------
//
//                   I N T E L   P R O P R I E T A R Y
//
//      COPYRIGHT (c)  2003 BY  INTEL  CORPORATION.  ALL RIGHTS
//      RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY
//      BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A
//      RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER
//      LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,
//      MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT
//      THE PRIOR WRITTEN PERMISSION OF :
//
//                         INTEL  CORPORATION
//
//                      2200 MISSION COLLEGE BLVD
//
//                SANTA  CLARA,  CALIFORNIA  95052-8119
//
//------------------------------------------------------------------------------------
/////////////////////////////////////////////////////////////////////////////
//
//
//      File Name: 	dl_source.uc
//
//      Purpose: 	Contains macros for dl_sink and dl_source for POS and IPv4 Blocks.
//
//      History:
//
//
//      Date            Comment                         By
//      ---------------------------------------------------------------------
//
//      01/22/2003      Created                         Tomasz Madajczak
//
//
/////////////////////////////////////////////////////////////////////////////

#ifndef __QUAD_GBETH_DL_SOURCE_UC__
#define __QUAD_GBETH_DL_SOURCE_UC__

#include <stdmac.uc>
#include <sig_macros.uc>
#include <dispatch_loop.h>
#include <dl_misc.uc>

/////////////////////////////
//	Some local defines
/////////////////////////////

// Build the names of the scratch-ring "full" input states that the sink macros
// in this file test with br_inp_state:
//   DL_RING_FULL          -> SCR_Ring<DL_RING_NUMBER>_Full        (used by dl_sink)
//   DL_IPV4_QM_RING_FULL  -> SCR_Ring<IPV4_TO_QM_SCR_RING>_Full   (used by dl_qm_sink
//                                                                  and dl_get_exception_pkt)
// The empty comments (/**/) only separate the macro name from the surrounding
// text so that it is substituted; no whitespace may be left between the pieces,
// otherwise the pasted state name is split in two.
#if (DL_RING_NUMBER > 11)
#error			"For Ring Number Greater than 11, we cannot use the SCR_Ring#_Full input state. Check DL_RING_NUMBER"
#else
#define_eval	DL_RING_FULL			SCR_Ring/**/DL_RING_NUMBER/**/_Full
#endif

#if (IPV4_TO_QM_SCR_RING > 11)
#error			"For Ring Number Greater than 11, we cannot use the SCR_Ring#_Full input state. Check IPV4_TO_QM_SCR_RING"
#else
#define_eval	DL_IPV4_QM_RING_FULL	SCR_Ring/**/IPV4_TO_QM_SCR_RING/**/_Full
#endif

/////////////////////////////
// Global variables/Registers
/////////////////////////////


// Registers shared by every macro in this file (and by the blocks that call them).
.reg 	global	dl_ring					// Ring operand for dl_sink's scratch[put]; _dl_pos_sink_init loads it with DL_RING_NUMBER << 2. This could be made an absolute register.
.reg 	global	dl_buf_handle			// The current buffer handle (buffer holding SOP). Set to IX_NULL by dl_source when the ring is empty.
.reg	global	dl_eop_buf_handle		// For large packets, the last buffer in the chain; IX_NULL for a single-buffer packet.
.reg	global	dl_next_block			// Next block that should process the buffer/packet (BID_*, IX_EXCEPTION, IX_DROP or IX_NULL).
.reg    global  dl_exception_reg 		// Exception register; passed as the exception code when packets are sent to the core.
// NOTE(review): presumably marks dl_exception_reg as "set" so the assembler does not
// warn about a use before any visible write - confirm against the assembler manual.
.set            dl_exception_reg

//	Signal used for previous-thread -> next-thread ordering.
//	sig_prev is placed at signal number DL_SIG_WAKE, the same number that the
//	signal_next_ctx / signal_me calls in this file raise, so ctx_arb[sig_prev]
//	wakes when the previous thread signals us.

.sig volatile 	sig_prev
.addr			sig_prev				DL_SIG_WAKE


///////////////////////////////////////////////////////////////////////////////
// _dl_pos_sink_init:
//	 	Description:
//			Initialisation for POS dl_sink. Should be called before using dl_sink.
//			It simply sets a global variable dl_ring to the corresponding ring number
//			This will later save one instruction in dl_sink (for POS).
//
//			[Note: This is an internal macro not meant to be used by the user.
//			This macro will undergo some more changes.]
//
//	 	Outputs:
//
//		Inputs:
//
//		Size:   				:
//			1 instruction
//

#macro _dl_pos_sink_init[]
.begin

	// dl_ring = DL_RING_NUMBER << 2. dl_sink uses dl_ring directly as the ring
	// operand of its scratch[put], so the shift is paid once here and not per packet.
	alu_shf[dl_ring, --, B, DL_RING_NUMBER, <<2]	; ring number to be used in get/put

.end
#endm

///////////////////////////////////////////////////////////////////////////////
// dl_source_init:
//	 	Description:
//
//			Initialisation for dl_source. Should be called before using dl_source.
//			Currently not used/required. Place holder.
//
//	 	Outputs:
//
//		Inputs:
//
//		Size:   				:
//
//
#macro dl_source_init[]

	// Intentionally empty: expands to no instructions. Place holder so that
	// callers already have an init hook to invoke before using dl_source.

#endm


///////////////////////////////////////////////////////////////////////////////
// _dl_eop_buf_handle_init:
//	 	Description:
//			THIS IS NOT AN EXPORTED MACRO. PLS DO NOT USE IT. THIS WILL CHANGE.
//			THIS IS ONLY TEMPORARY.
//
//	 	Outputs:
//
//		Inputs:
//
//		Size:   				:
//
//
#macro _dl_eop_buf_handle_init[]
.begin

	// Start with "no EOP buffer". The sink macros in this file compare
	// dl_eop_buf_handle against IX_NULL to tell a single-buffer packet from a
	// buffer chain, so it must hold IX_NULL until a chain is actually built.
	immed32[dl_eop_buf_handle, IX_NULL]

.end
#endm

///////////////////////////////////////////////////////////////////////////////
// dl_sink_init:
//	 	Description:
//
//			Initialisation for dl_sink. Should be called before using dl_sink.
//			Currently not used/required. Place holder.
//
//	 	Outputs:
//
//		Inputs:
//
//		Size:   				:
//
//
#macro dl_sink_init[]

	// Intentionally empty: expands to no instructions. Place holder so that
	// callers already have an init hook to invoke before using dl_sink.

#endm

///////////////////////////////////////////////////////////////////////////////
// dl_source:
//	 	Description:
//
//			This macro is used by the IPv4 block (for now) to get packets and other info
//			from the RX block. It receives them through a scratch ring (the macro body
//			consumes from ETH_RX_TO_IPV4_SCR_RING).
//			The following info is available from the ring.
//			dl_buf_handle		// The current buffer containing SOP
//			dl_eop_buf_handle	// for large packets, the buffer containing EOP. Otherwise NULL
//			meta1				// Long Word 1 from the Meta data
//			meta2				// Long Word 2 from the Meta Data
//			input port			// port from which the packet is rxed.
//
//
//			THREAD_ORDER means if the execution of the threads need to be in
//			order. i.e wait for a signal from prev. thread, do some work and
//			signal next thread. (This thread ordering is required, if the
//			threads access some shared data, in order to get exclusive access
//			to that shared data).
//
//	 	Outputs:
//
//		Inputs:
//			THREAD_ORDER	:	Specifies if the execution of the threads need to be in
//								order. Possible (constant) values are
//								DL_THREAD_ORDER
//								DL_THREAD_NO_ORDER.
//
//			sig_mask		:	Signal mask to wait on. This is the bitmask to be used in a
//								ctx_arb[--] instruction. (The bitmask contains the set of signals
//								used in previous I/O on which we want to wait here.)
//		Size:   			:
//			N/A
//
// 		Notes:
//		dl_source is meant to be used along with dl_qm_sink in the following way (only).
//
//		dl_source[DL_THREAD_ORDER, 0]
//		.while (1)
//			do_what_ever[]
//			dl_qm_sink[...]
//			dl_source[DL_THREAD_NO_ORDER, ...]
//		.endw
//
//		dl_qm_sink is always ordered, meaning, it will always wait for a signal from a previous
//		thread, do its job and then signal the next thread. Because dl_qm_sink is always ordered,
//		we don't need the dl_source just following it to be ordered. (but the first time we call
//		dl_source it needs to be ordered).
//
//		Performance optimisation: 
//		-------------------------
//		original code:
//		dl_qm_sink signals the next thread after producing on to the scratch ring, which was
//		at the end of that macro. Then dl_source is called (no thread order). At the very top
//		we had a scratch[get...] ctx_swap[..]. This ctx_swap happened too soon (because the signal
//		we sent to the next thread takes about 10 cycles to reach) so the next thread is still
//		not ready to execute. This lead to large "Idle" time in the pipeline affecting throughput.
//		
//		The following changes were made to improve performance.
//		1. In dl_qm_sink we signal the next thread at the first available opportunity, even
//		before the scratch[put...]. This is ok because the next thread (in the same ME) cannot
//		run until we sleep. So make sure there is no ctx_swap between this signal and the scratch[put...].
//
//		2. If the next thread is another ME, then 1 is a wrong solution. So if we have to
//		signal another ME (because we are in thread 7), we postpone the signalling until 
//		later (in dl_source macro). Thus, for thread 7, dl_source will always signal the 
//		next ME.
//
//		3. Soon after we make a scratch[put...] in dl_qm_sink, we make a scratch[get...] in
//		dl_source. So we wait for the combined signals (in the dl_source macro) rather than
//		waiting for each of them separately. (This is the reason for having 3 parameters
//		to dl_qm_sink, and the second parameter to dl_source.)
//
//		For ease of understanding, look at dl_qm_sink and dl_source as one BIG combined 
//		macro in the while loop.


#define	RX_TO_FUNC_MSG_SIZE		5		// 5 longwords per ring message (LW0..LW4, see below)

// dl_source[THREAD_ORDER, sig_mask]
//	Consumes one RX_TO_FUNC_MSG_SIZE-longword message from scratch ring
//	ETH_RX_TO_IPV4_SCR_RING and loads the global packet state from it.
//
//	THREAD_ORDER	: compile-time constant. With DL_THREAD_ORDER the macro first waits
//					  for sig_prev and (for threads other than 7) signals the next
//					  context. Thread 7 always signals DL_NEXT_ME, whatever the value.
//	sig_mask		: register holding the wakeup-event mask to wait on. It is both read
//					  and written here (the scratch[get] signal bit is OR-ed in), so it
//					  must be a writable register.
//
//	On return:
//		ring not empty	: dl_buf_handle = LW0, dl_eop_buf_handle = LW1, the meta cache is
//						  seeded from LW2..LW4, the IP header cache is loaded and
//						  dl_next_block = DL_SOURCE_NEXT1.
//		ring empty		: dl_buf_handle = dl_next_block = IX_NULL.
#macro dl_source[THREAD_ORDER, sig_mask]
.begin
.sig scr_get									; signal for scratch get
.reg ring

	xbuf_alloc[$rdata, RX_TO_FUNC_MSG_SIZE, read]		; allocate read xfer registers $rdata0..$rdata4. No code generated.

#if	(THREAD_ORDER == DL_THREAD_ORDER)

	// Wait for the signal from the previous thread to ensure thread ordering.

	ctx_arb[sig_prev]

#endif
	
	//	Issue the ring read now; its completion (scr_get) is waited for further
	//	down, together with whatever the caller already put in sig_mask.
	alu_shf[ring, --, B, ETH_RX_TO_IPV4_SCR_RING, <<2]		; ring to consume from (ring number * 4)
	scratch[get, $rdata0, 0, ring, RX_TO_FUNC_MSG_SIZE], sig_done[scr_get]	; get 5 longwords from the ring

	// 	Signal the next thread. See the Notes in the macro header for why the
	//	signalling is arranged this way.

	br=ctx[7, thread_7#]									; Special case for thread 7.

#if	(THREAD_ORDER == DL_THREAD_ORDER)
	signal_next_ctx[DL_SIG_WAKE]							; !7, so signal the next thread
#endif

	br[wait_for_signals#]

thread_7#:

// If signalling to the first ME, then signal context 1. Context 0 is not
// part of the functional pipeline
#if (defined(START_ME) && (DL_NEXT_ME == START_ME))


	signal_me_ctx1[DL_NEXT_ME, DL_SIG_WAKE]	

#else
	signal_me[DL_NEXT_ME, DL_SIG_WAKE]						; Thread 7, so signal the next ME

#endif


	//	We wait for the above scratch[get...] and for the signals the caller
	//	passed in sig_mask (e.g. the scratch[put...] issued by dl_qm_sink).
	//	See the Notes in the macro header.

wait_for_signals#:
.reg tmp

	//	Add the scr_get signal to the signal mask on which to wait:
	//	sig_mask |= 1 << (signal number of scr_get)

	alu[tmp, --, B, &scr_get]
	alu[--, tmp, OR, 0]										; result supplies the shift amount for "<<indirect" in the next instruction
	alu_shf[sig_mask, sig_mask, OR, 1, <<indirect]			; shift 1 left by the signal number of scr_get

	local_csr_wr[active_ctx_wakeup_events, sig_mask]		; tells which signals to wait on.
	ctx_arb[--]

	//	The assembler cannot see that the ctx_arb[--] above waited for the
	//	scratch[get..] signal, so tell it explicitly that the I/O has completed.
.io_completed scr_get

check_ring_empty#:

	//	Check if the ring was empty: in that case the get returned zero in LW0.

	alu[--, $rdata0, -, 0]									; Check for 0, i.e. check if ring empty
	beq[ring_empty#]								

	//	$rdata0...$rdata4 hold valid data now.
	//	Seed the meta data cache with $rdata2, $rdata3 and $rdata4.
	//	The comments of the producing macro (dl_sink) describe them as:
	// 	$rdata2 (buffer size, offset)				=> LW1 of the meta data
	//	$rdata3 (pkt size, rx stat, free_list etc)	=> LW2 of the meta data
	//	$rdata4 (input port)						=> LW3 of the meta data
	//	NOTE(review): the meaning of each dl_meta_init_cache argument position is
	//	defined outside this file - confirm the mapping there.


	dl_meta_init_cache[0, $rdata2, $rdata3, $rdata4, 0, 0, 0, 0]
      dl_meta_set_nexthop_id[0xffff]	// set nexthop ID to -1


	// 	We branch here so that we can take advantage of the defer slots.
	//	This branch with defer finally saves a branch to the end of this macro
	//	(where 2 cycles might be wasted). The two moves below sit in the defer slots.

	br[load_ip_cache#], defer[2]
	move[dl_buf_handle, $rdata0]				; LW0 from ring is the current buffer handle (containing SOP)
	move[dl_eop_buf_handle, $rdata1]			; LW1 from ring is the last buffer handle (for large packets)

	// 	program flow continues in load_ip_cache#:

	//	Handle the ring-empty condition.
ring_empty#:

	//	If the ring is empty, just set dl_buf_handle and dl_next_block to IX_NULL.
	// 	Is this the right thing to do ? TBD ???

	move[dl_buf_handle, IX_NULL]				; Note IX_NULL is NOT ZERO.
	move[dl_next_block, IX_NULL]
	br[nothing_to_do#]

load_ip_cache#:
.sig	sig_ip

	// Read the IP header of the packet and cache it. The same signal is passed
	// for both signal arguments.
	// NOTE(review): presumably this makes the macro wait for the read itself - confirm
	// in dl_load_iphdr_cache.

	dl_load_iphdr_cache[dl_buf_handle, sig_ip, sig_ip]
	move[dl_next_block, DL_SOURCE_NEXT1]

nothing_to_do#:

	xbuf_free[$rdata]							; Free xfer registers. No code generated.


//	The OC48 project spits out 100s of warnings.
//	We'll investigate. Until then, the .set below is kept to quieten them for
//	the IP header cache registers $$iphdr0..$$iphdr9.

//eme .set	$$iphdr0, $$iphdr1, $$iphdr2, $$iphdr3, $$iphdr4, $$iphdr5, $$iphdr6, $$iphdr7 
.set	$$iphdr0, $$iphdr1, $$iphdr2, $$iphdr3, $$iphdr4, $$iphdr5, $$iphdr6, $$iphdr7 , $$iphdr8, $$iphdr9

.end
#endm


///////////////////////////////////////////////////////////////////////////////
// dl_sink:
//	 	Description: 
//			Called by POS RX to handover the packet to the next block (IPv4) in the
//			ingress pipeline. We do not do thread ordering in this macro. (It is done
//			by POS RX itself because of tight budget)
//			
//			This macro will produce the following on to the scratch ring.
//			dl_buf_handle		// The current buffer containing SOP 
//			dl_eop_buf_handle	// for large packets, the buffer containing EOP. Otherwise NULL
//			meta1				// Long Word 1 from the Meta data (starting from 0)
//			meta2				// Long Word 2 from the Meta Data
//			meta3				// Long Word 3 from the meta data
//
//			This info will be used by the IPV4 block (by calling dl_source)
//
//			Note:
//			sig_mask is a bit mask of signals to wait on. This is passed in a regiter.
//			If there is no need to wait for any signal, pass SIG_NONE. 
//
//	 	Outputs:
//
//		Inputs:
//			wxfer_prefix:		SRAM Write Transfer Register name prefix (using xbuf_alloc).
//			req_sig			:	signal to be used in the I/O (scratch put)
//			sig_mask		:	What to do with I/O operation - 
//								- wait for signal(s) as specified by sig_mask (register)
//								- do not wait for signal, just return (SIG_NONE - constant)
//			The following Global variables will also act as inputs.
//			dl_next_block		- IX_EXCEPTION, for exception packets
//								- BID_NEXT_BLOCK, block ID of the next block to handle this packet.
//			dl_buf_handle		: Buffer handle of the buffer at the start of the chain (containing SOP)
//			dl_eop_buf_handle	: Buffer handle of the last buffer in the chain
//		Size:   				:
//			N/A
//
// dl_sink[wxfer_prefix, req_sig, sig_mask]
//	Dispatches on byte 0 of dl_next_block:
//		BID_NEXT_BLOCK	-> produce a 5-longword message on the ring selected by dl_ring
//		IX_EXCEPTION	-> count and drop the packet
//		anything else	-> no ring access
//	Unless sig_mask is the literal SIG_NONE, every path ends by waiting on sig_mask.
//	Besides its parameters and the dl_* globals, the body refers to names declared
//	outside this macro: dl_meta1, dl_meta2, dl_meta3, @pkt_rx_drop, @pkt_rx_enqueued
//	and, under FIX_SEQUENCING, sig_seq and seq_sig_reg.
#macro dl_sink[wxfer_prefix, req_sig, sig_mask]
.begin

	// We need to act only on EOP. Otherwise just return.

	br=byte[dl_next_block, 0, BID_NEXT_BLOCK, sink_it#], defer[3]

	//	Move required data to xfer registers in the three defer slots of the
	//	branch above, so the moves cost nothing extra on the sink_it# path.

		move[wxfer_prefix/**/0, dl_buf_handle]		; dl_buf_handle -> xfer0
		move[wxfer_prefix/**/1, dl_eop_buf_handle]	; dl_eop_buf_handle -> xfer1
		move[wxfer_prefix/**/3, dl_meta2]			; LW2 of meta data -> xfer3.


	//	In the case of an exception packet, it needs to be sent to the core (Xscale)
	//	through a different ring. (Not implemented here yet: see drop_pkt# below.)

check_exception#:

	br!=byte[dl_next_block, 0, IX_EXCEPTION, nothing_to_do#]
	
	// Do Exception processing here.
	
	// For now just drop the packet.

drop_pkt#:

	alu[@pkt_rx_drop, @pkt_rx_drop, +, 1]					; count the drop
	
	//tkh (added sram counter increment for drop)
	//	Per-port drop counter, indexed by the packet's input port from the meta data.
	.begin
	.reg port_in
	dl_meta_get_input_port( port_in )
	_packet_rx_incr_counter(port_in, PACKET_PKTS_DROPPED)
	.end
	//tkh

	// 	Drop the packet. Two cases to handle: single buffer or a buffer chain.
	//	It is determined by looking at the dl_eop_buf_handle

	alu[--, dl_eop_buf_handle, -, IX_NULL]					; if dl_eop_buf_handle == IX_NULL
	beq[drop_single_buffer#]								; then it is single buffer.

	dl_drop_buffer_chain[dl_buf_handle, dl_eop_buf_handle]	; otherwise, buffer chain.
	br[nothing_to_do#]

drop_single_buffer#:

	dl_drop_buffer[dl_buf_handle]

	br[nothing_to_do#]

sink_it#:


	// Ok. We can sink. Check if the ring is full and then produce the data
	// onto the ring. If the ring is full we do NOT drop any more: the branch
	// below loops back to sink_it# until the ring has room (the original
	// drop-on-full branch is kept commented out for reference).

// 	br_inp_state[DL_RING_FULL, drop_pkt#]				; Check if ring is full.
	br_inp_state[DL_RING_FULL, sink_it#]	;tkh - buffer drop is causing q_array corruption, let mac flow control or overrun drop packets

	// Ring Not Full. Put the packet on the ring.

	//	Fill in the remaining longwords of the message (xfer0, 1 and 3 were
	//	already loaded in the defer slots at the top of the macro).

	move[wxfer_prefix/**/2, dl_meta1]					; buffersize:offset -> xfer2
	move[wxfer_prefix/**/4, dl_meta3]					; inport | 0x0

	scratch[put, wxfer_prefix/**/0, 0, dl_ring, 5], sig_done[req_sig]	; Produce 5 Long words on the ring

	alu[@pkt_rx_enqueued, @pkt_rx_enqueued, +, 1]		; count the enqueue

	
#if (!streq('sig_mask', 'SIG_NONE'))
	//	Add the put signal to the signal mask on which to wait:
	//	sig_mask |= 1 << (signal number of req_sig)
	alu_shf[sig_mask, sig_mask, OR, 1, <<&req_sig]		; shift 1 left by the signal number of req_sig
#endif


nothing_to_do#:

	//	Common exit. When a mask register was passed, wait here for all signals in it.
	//	With the literal SIG_NONE no code is generated and the caller is
	//	responsible for waiting on req_sig.

#if (!streq('sig_mask', 'SIG_NONE'))

#ifdef FIX_SEQUENCING
	alu_shf[sig_mask, sig_mask, OR, 1, <<&sig_seq]		; tkh add wait on seq sig
#endif

	local_csr_wr[active_ctx_wakeup_events, sig_mask]
	ctx_arb[--]

#ifdef FIX_SEQUENCING
	_packet_rx_signal_next_thread(seq_sig_reg)			; tkh signal next seq
#endif

#endif

	
.end
#endm


///////////////////////////////////////////////////////////////////////////////
// dl_qm_sink:
//	 	Description: 
//			Called by IPv4 Block to send the packet to Queue Manager thro' a scratch ring.
//			This macro will do thread ordering always. 
//		
//			This macro will produce the following on to the scratch ring.
//			dl_buf_handle		// The current buffer containing SOP 
//			dl_eop_buf_handle	// for large packets, the buffer containing EOP. Otherwise NULL
//			Queue Number		// Queue number into which this packet is to be queued.
//								// Queue number = 16 * output Fabric id. (available in meta data)
//	 	Outputs:
//			sig_mask			- if the I/O operation is made (scratch put) then the sig_mask
//								  will have the bit corresponding to sig_scr set. (sig_scr is an
//								  input parameter)
//
//		Inputs:
//			sig_scr				- Signal to use in the I/O (scracth put ) operation
//			wxfer				- SRAM write xfer regsiter to be used in I/O.
//								  Should have been allocated using xbuf_alloc. 
//								  Atleast 3 regsiters should be allocated.
//
//			The following Global variables will also act as inputs.
//			dl_next_block		- IX_EXCEPTION, for exception packets
//								- IX_NULL, for null packets.
//								- BID_NEXT_BLOCK, block ID of the next block to handle this packet.
//			dl_buf_handle		: Buffer handle of the buffer at the start of the chain (containing SOP)
//			dl_eop_buf_handle	: Buffer handle of the last buffer in the chain
//		
//		Size:   				:
//			N/A
//
//		Notes:
//
//		See notes in dl_source macro

// dl_qm_sink[sig_mask, sig_scr, wxfer]
//	Dispatches on byte 0 of dl_next_block. Every path waits for sig_prev and, for
//	threads other than 7, signals the next context (thread 7 leaves the signalling
//	of the next ME to the dl_source call that follows).
//		BID_NEXT_BLOCK	: flush meta data and IP header caches, then put 3 longwords
//						  (buffer handle, EOP handle, queue number | bit 31) on
//						  IPV4_TO_QM_SCR_RING. The put is NOT waited for here; its
//						  signal bit is OR-ed into sig_mask for the caller to wait on.
//		IX_EXCEPTION	: flush meta data; then either drop the packet (USE_IMPORT_VAR
//						  not defined) or send it to the core with dl_exception_send.
//		IX_DROP			: drop the packet.
//		anything else	: nothing (null packet handling is TBD).
#macro dl_qm_sink[sig_mask, sig_scr, wxfer]
.begin
.sig	sig_meta, sig_ip

	xbuf_alloc[$meta_cache, 8, write]					; This line doesn't produce any code.

	//	This macro can be seen in two parts: the normal packet case (which is the critical
	//	path) and the other (drop, null pkt etc) cases.

	//	First the normal case.
	// now check for next block

	br!=byte[dl_next_block, 0, BID_NEXT_BLOCK, check_exception#]  
	
	//	Write back the cached meta data, passing 1 and META_SIZE as the last two
	//	arguments.
	//	NOTE(review): presumably (first longword, longword count), i.e. LW0 is
	//	skipped - confirm in dl_meta_flush_cache.


//	The 1st LW of the meta data is next buffer pointer and should not be updated
//  when doing dl_meta_flush_cache
#define_eval META_SIZE (META_CACHE_SIZE -1)
	dl_meta_flush_cache[$meta_cache, dl_buf_handle, sig_meta, SIG_NONE, 1, META_SIZE]		; Write back Meta data to SRAM
	dl_flush_iphdr_cache[dl_buf_handle, sig_ip, SIG_NONE]					; Write back IP hdr to DRAM

	//	Wait for both write-backs above as well as the signal from the previous thread.

	ctx_arb[sig_meta, sig_ip, sig_prev]

	// 	Signal Next Thread. Signaling next thread has a latency of 10 cycles.
	//  So we signal at the first available opportunity. (see notes in dl_source macro).
	//	It is IMPORTANT that there is no ctx_swap/arb from here to the scratch[put...]
	//	below.

	br=ctx[7, thread_7#]								; Special case for thread 7.
	signal_next_ctx[DL_SIG_WAKE]						; !7, so signal the next thread

thread_7#:

	// 	If thread 7 we have to signal next ME. But we cannot do that until we have issued
	//	the scratch[put..] command. We'll do it in dl_source macro. (see notes in dl_source).


	// Check if the ring is full. If full, just stall the pipeline (i.e. keep looping
	// between check_ring_full# and ring_is_full#).

check_ring_full#:
 	br_inp_state[DL_IPV4_QM_RING_FULL, ring_is_full#]; Check if ring is full and loop

	// At this point, ring has enough space. Sink the data.

	move[wxfer/**/0, dl_buf_handle]						; LW0 on the ring is the current buffer handle
	move[wxfer/**/1, dl_eop_buf_handle]					; LW1 on the ring is the last buffer in the chain, 
														; for big pkts only. Otherwise IX_NULL
.reg	queue_number

	dl_meta_get_output_port[queue_number]				; LW2 on the ring is Queue number for Queue Manager
	alu_shf[queue_number, --, B, queue_number, <<4]		; output_port * 16 is the actual queue number
	alu_shf[wxfer/**/2, queue_number, OR, 1, <<31]		; The msb is always set to 1, to prevent producing a value of 0 on to the ring.

.reg ring

	alu_shf[ring, --, b, IPV4_TO_QM_SCR_RING, <<2]		; ring nos in get/put is always multiple of 4.

	scratch[put, wxfer/**/0, 0, ring, 3], sig_done[sig_scr]	; Produce 3 Long words on the ring

	//	To improve performance, we do not ctx_swap on scratch[put..] above. We'll wait
	//	on this signal in dl_source macro. (see notes in dl_source macro). However this (dl_qm_sink)
	//	macro has multiple paths in which we do not issue this scratch[put..] at all,
	//	e.g. exception packets, or NULL packets etc. So only if we issue scratch[put..]
	//	do we set the corresponding bit in sig_mask to indicate that.

.reg tmp

	//	Add the put signal to the signal mask on which to wait:
	//	sig_mask |= 1 << (signal number of sig_scr)

	alu[tmp, --, B, &sig_scr]

	//	The two instructions after the branch sit in its defer slots.
	br[end#], defer[2]
		alu[--, tmp, OR, 0]								; result supplies the shift amount for "<<indirect" in the next instruction
		alu_shf[sig_mask, sig_mask, OR, 1, <<indirect]	; shift 1 left by the signal number of sig_scr


ring_is_full#:
	nop													; to set break point
	br[check_ring_full#]								; go back and check again

	// 	Check for exception Packets. Exception packets are sent to the core (Xscale)
	//	on a separate ring. 

check_exception#:

	br!=byte[dl_next_block, 0, IX_EXCEPTION, check_drop#]

#define_eval META_SIZE (META_CACHE_SIZE -1)
	dl_meta_flush_cache[$meta_cache, dl_buf_handle, sig_meta, SIG_NONE, 1, META_SIZE]		; Write back Meta data to SRAM
	// 	Wait for the meta data write and the signal from the prev thread, then signal next context.
	
	ctx_arb[sig_meta, sig_prev]

#ifndef USE_IMPORT_VAR

	//	No exception path to the core in this build: the exception packet is dropped.

	// Signal next context.

	br=ctx[7, thread_7_2#]								; Special case for thread 7.
	signal_next_ctx[DL_SIG_WAKE]						; !7, so signal the next thread

thread_7_2#:

	// 	Drop the packet. Two cases to handle: single buffer or a buffer chain.
	//	It is determined by looking at the dl_eop_buf_handle

	alu[--, dl_eop_buf_handle, -, IX_NULL]					; if dl_eop_buf_handle == IX_NULL
	beq[drop_single_buffer#]								; then it is single buffer.

	dl_drop_buffer_chain[dl_buf_handle, dl_eop_buf_handle]	; otherwise, buffer chain.
	br[end#]

drop_single_buffer#:
	dl_drop_buffer[dl_buf_handle]
	br[end#]

#else

    //This code will send pkts to the CORE on two rings based on priority.
    dl_exception_send(dl_buf_handle)

    // Signal next context.

	br=ctx[7, thread_7_2#]								; Special case for thread 7.
	signal_next_ctx[DL_SIG_WAKE]						; !7, so signal the next thread

thread_7_2#:

	br[end#]

#endif //USE_IMPORT_VAR
check_drop#:

	//	Neither a normal nor an exception packet: nothing was written back, so only
	//	thread ordering has to be maintained.
	// 	Wait for signal from prev thread and then signal next context.
	
	ctx_arb[sig_prev]

	// Signal next context.

	br=ctx[7, thread_7_2_drop#]								; Special case for thread 7.
	signal_next_ctx[DL_SIG_WAKE]						; !7, so signal the next thread

thread_7_2_drop#:


	br!=byte[dl_next_block, 0, IX_DROP, check_ix_null#]

	// 	Drop the packet. Two cases to handle: single buffer or a buffer chain.
	//	It is determined by looking at the dl_eop_buf_handle

	alu[--, dl_eop_buf_handle, -, IX_NULL]					; if dl_eop_buf_handle == IX_NULL
	beq[drop_single_buffer_1#]								; then it is single buffer.

	dl_drop_buffer_chain[dl_buf_handle, dl_eop_buf_handle]	; otherwise, buffer chain.
	br[end#]

drop_single_buffer_1#:
	dl_drop_buffer[dl_buf_handle]
	br[end#]

check_ix_null#:

	// Do something to handle null packet. TBD
	// Currently every dl_next_block value not handled above ends up here.
	nop

end#:
	xbuf_free[$meta_cache]								; free xfer register. No Code generated.

.end
#endm

///////////////////////////////////////////////////////////////////////////////
// nn_prod_init:
//	 	Description: 
//			Initialise the Next Neighbour Mode (NN mode) for the producer. 
// 			For the producer, NN Registers are wrriten: from This ME. [bit 20 = 1]
//
//
//	 	Outputs:
//
//		Inputs:
//		
//		Size:   				:
//	
//

#macro nn_prod_init[]

.begin
.reg csr_data

	// Update CTX_ENABLES: set bit 20 and write the value back.
	// NOTE(review): immed[csr_data,0] directly after local_csr_rd is presumably the
	// idiom through which csr_data receives the CSR value that was just read (rather
	// than a plain clear to 0) - confirm against the programmer's reference manual.
	local_csr_rd[CTX_ENABLES]
	immed[csr_data,0]

	// For the source (producer), NN registers are written from this ME. [bit 20 = 1]

	bits_set[csr_data, 20, 1]						; [20] = 1

	local_csr_wr[CTX_ENABLES, csr_data]

.end
#endm

///////////////////////////////////////////////////////////////////////////////
// nn_cons_init:
//	 	Description: 
//			Initialise the Next Neighbour Mode (NN mode) for the consumer.
// 			For the consumer, NN registers are written from the previous ME. [bit 20 = 0]
//			We do some one-time initialisation in the consumer???
//			(so the producer need not initialise them) Should it be the producer???? TBD
//
//	 	Outputs:
//
//		Inputs:
//		
//		Size:   				:
//	
//
#macro nn_cons_init[]
.begin
.reg csr_data

	// Update CTX_ENABLES (clear bit 20 and bits 19:18), then zero NN_GET and NN_PUT.
	// NOTE(review): immed[csr_data, 0] directly after local_csr_rd is presumably the
	// idiom through which csr_data receives the CSR value that was just read (rather
	// than a plain clear to 0) - confirm against the programmer's reference manual.
	local_csr_rd[CTX_ENABLES]
	immed[csr_data, 0]

// 	For the sink (consumer), NN registers are written from the previous ME. [bit 20 = 0]

	bits_clr[csr_data, 20, 1]						; [20] = 0

	// Tell when to trigger ring empty, i.e. when nn_get == nn_put (0 entries valid).
	// This bits_clr and the one above are kept separate for clarity;
	// they can always be merged when the need arises.

	bits_clr[csr_data, 18, 3]						; [19:18] = 0
	local_csr_wr[CTX_ENABLES, csr_data]

	// Start with both next-neighbour ring pointers at 0.
	local_csr_wr[NN_GET, 0]
	local_csr_wr[NN_PUT, 0]
	
.end
#endm

////////////////////////////////////////////////////////////////////////
// Macro Name  : dl_exception_send
// Description : Sends an exception packet to the core. The priority obtained
//				 from dl_exception_get_priority selects the ring/interrupt pair:
//				   bit 0 set   -> EXCEPTION_RING_OUT_1 / XSCALE_INT_B
//				   bit 0 clear -> EXCEPTION_RING_OUT_0 / XSCALE_INT_A
//				 The actual ring write and interrupt are done by
//				 dl_exception_write_ring.
// Input       : in_dl_buf_handle - buffer handle of the exception packet
// Globals     : dl_exception_reg - passed on as the exception code
// Constant    : nil
// Size        : N/A (depends on the macros expanded below)
////////////////////////////////////////////////////////////////////////
#macro dl_exception_send(in_dl_buf_handle)

.begin
    .reg    priority

	// Select the ring by priority (bit 0 of the value returned).
    dl_exception_get_priority[priority]
	br_bclr[priority, 0, low_priority#]

	dl_exception_write_ring(in_dl_buf_handle, dl_exception_reg, EXCEPTION_RING_OUT_1, XSCALE_INT_B)
	br[end#]

low_priority#:

	dl_exception_write_ring(in_dl_buf_handle, dl_exception_reg, EXCEPTION_RING_OUT_0, XSCALE_INT_A)
	// No branch needed here: end# is the very next location, so we simply fall through.
.end

end#:

#endm
////////////////////////////////////////////////////////////////////////
// Macro Name  : dl_exception_write_ring
// Description : Puts the two exception longwords (buffer handle, exception
//				 code) on scratch ring IN_RING_NUM, waits for the put to
//				 complete and then triggers IN_INTERRUPT to activate the RM ISR
//				 for exception packets.
//				 If the ring is full the packet is dropped instead (single
//				 buffer or buffer chain, decided by dl_eop_buf_handle).
// Input       : in_buffer_handle  - buffer handle of the exception packet
//				 in_exception_code - second longword of the message
//				 IN_RING_NUM       - constant, scratch ring number
//				 IN_INTERRUPT      - constant, passed to cap[fast_wr, ...]
// Globals     : dl_eop_buf_handle - read on the ring-full (drop) path
// Size        : N/A (depends on the drop macros expanded below)
// Note        : This macro context-swaps (ctx_arb) while waiting for the put.
////////////////////////////////////////////////////////////////////////
#macro dl_exception_write_ring(in_buffer_handle, in_exception_code, IN_RING_NUM, IN_INTERRUPT)

.begin
   .reg $excep_lw0, $excep_lw1, ring
   .xfer_order $excep_lw0, $excep_lw1
   .sig scratch_wr_done


   // set the ring number (ring number * 4, as used by scratch get/put)

   alu[ring, --,b, IN_RING_NUM, <<2]

   // set the exception longwords

   alu[$excep_lw0, --,b, in_buffer_handle]
   alu[$excep_lw1, --,b, in_exception_code]

   // check if transmit ring is full
   // RING_NUM is evaluated first so that the numeric ring number, not the
   // parameter name, is pasted into the input-state name below.

   #define_eval RING_NUM IN_RING_NUM

   scr_write#:
	
   br_inp_state[SCR_RING/**/RING_NUM/**/_FULL, scratch_ring_full#]

   // write the two longwords to the exception ring

   scratch[put, $excep_lw0, ring, 0,2], sig_done[scratch_wr_done]
   ctx_arb[scratch_wr_done]

   // trigger the interrupt

   cap[fast_wr, 0, IN_INTERRUPT]

   // take three  cycles for the CSR to be updated
   // using two defer cycles
   nop
 
   br[end#], defer[2]
   nop
   nop

scratch_ring_full#:
	// 	Ring full: drop the packet that was handed to this macro (in_buffer_handle),
	//	not whatever happens to be in the global dl_buf_handle.
	//	Two cases to handle: single buffer or a buffer chain.
	//	It is determined by looking at the dl_eop_buf_handle

	alu[--, dl_eop_buf_handle, -, IX_NULL]					; if dl_eop_buf_handle == IX_NULL
	beq[drop_single_buffer_1#]								; then it is single buffer.

	dl_drop_buffer_chain[in_buffer_handle, dl_eop_buf_handle]	; otherwise, buffer chain.
	br[end#]

drop_single_buffer_1#:
	dl_drop_buffer[in_buffer_handle]

end#:
.end

#undef RING_NUM

#endm 

///////////////////////////////////////////////////////////////////////
// Macro Name  : dl_get_exception_pkt
// Description : Macro to fetch a packet coming back from the core. It reads
//				 three longwords from scratch ring EXCEPTION_RING_IN_0; if the
//				 first one is non-zero it is taken as the buffer handle, the
//				 packet's meta data is loaded, and a three-longword message
//				 (buffer handle, EOP handle, queue number | bit 31) is put on
//				 IPV4_TO_QM_SCR_RING. In the current implementation
//               all packets go to queue manager and it is assumed 
//               that all packet processing has been completed
//               at the core.
// Input       : none
// Output      : dl_buf_handle, dl_eop_buf_handle (globals) when a packet was read
// Constant    : nil
// Size        : N/A
// Note        : Context-swaps while waiting for the ring and SRAM accesses, and
//				 spins while the queue manager ring is full.
////////////////////////////////////////////////////////////////////////
#macro dl_get_exception_pkt()


.begin
   .reg ring, temp
   .reg queue_number
   .reg read $excep_ctrl_lw0 $excep_ctrl_lw1 $excep_ctrl_lw2
   .xfer_order $excep_ctrl_lw0, $excep_ctrl_lw1, $excep_ctrl_lw2
   .reg write $excep_data_lw0, $excep_data_lw1, $excep_data_lw2
   .xfer_order $excep_data_lw0, $excep_data_lw1, $excep_data_lw2
   .sig scratch_rd_done, meta_read_signal         

   // set the ring number (ring number * 4, as used by scratch get/put)

   alu[ring, --,b, EXCEPTION_RING_IN_0, <<2]

   // read three longwords from the exception ring and wait for them

   scratch[get, $excep_ctrl_lw0, ring, 0,3], sig_done[scratch_rd_done]

   ctx_arb[scratch_rd_done]


   // read one longword from exception ring 1 (disabled)
#if 0
   scratch[get, $excep_ctrl_lw0, ring, 0,1], sig_done[scratch_rd_done]
   ctx_arb[scratch_rd_done]
#endif

   // A first longword of 0 is treated as "nothing to do"; the whole body is skipped.
   // NOTE(review): presumably 0 is what an empty ring returns, as for dl_source - confirm.
   .if( $excep_ctrl_lw0 != 0 )

	 move[dl_buf_handle, $excep_ctrl_lw0]				; LW0 from ring is the current buffer handle (containing SOP)

	 // allocate 7 read/write xfer registers ($dl_meta prefix) for the meta data

	 xbuf_alloc[$dl_meta, 7, read_write]

	 // read meta data
	 // NOTE(review): the trailing 1, 6 are presumably (first longword, longword count) -
	 // confirm in dl_meta_load_cache.

     dl_meta_load_cache[$excep_ctrl_lw0, $dl_meta, meta_read_signal, 1, 6]

	 // set the ring number
#if 0	
	 alu[ring, --,b, EXCEPTION_RING_IN_1, <<2]

	 // read the message from the other ring

	 scratch[get, $excep_data_lw0, ring, 0,1], sig_done[scratch_rd_done]

	 // wait for scratch read and meta data read to complete

	 ctx_arb[scratch_rd_done, meta_read_signal]
	 move[dl_eop_buf_handle, $excep_data_lw0]

#else 

	 // wait for the meta data read to complete

     ctx_arb[meta_read_signal]

	 // set the eop handle from the second longword read from the ring

	 move[dl_eop_buf_handle, $excep_ctrl_lw1]
#endif
      
	 // Build the three longwords to be written to the queue manager scratch ring

	 move[$excep_data_lw0, dl_buf_handle]
	 move[$excep_data_lw1, dl_eop_buf_handle]

	 ; Retrieve the output port to be the queue number.
	 ; NOTE(review): the "+16" ALU op presumably keeps only the low 16 bits of $dl_meta3 - confirm.
	 alu[queue_number, 0, +16, $dl_meta3]     ; LW2 on the ring is Queue number for Queue Manager
	 alu[temp, --, B, queue_number, <<4]      ; output_port * 16 is the actual queue number
	 alu[$excep_data_lw2, temp, OR, 1, <<31]  ; The msb is always set to 1, to prevent producing a value of 0 on to the ring.

     alu[ring, --, b, IPV4_TO_QM_SCR_RING, <<2]

     // check for ring full condition. if full then keep trying

check_ring#:
 	 br_inp_state[DL_IPV4_QM_RING_FULL, ring_full#];

	 // scratch_rd_done is reused as the completion signal of this put
	 scratch[put, $excep_data_lw0, 0, ring, 3], sig_done[scratch_rd_done]
	 ctx_arb[scratch_rd_done]

	 br[end#]

     // if the ring is full go back and retry to write to scratch ring
ring_full#:
     br[check_ring#]      

end#:

    xbuf_free[$dl_meta]
  .endif


.end
// NOTE(review): read_again# is not referenced anywhere in this macro.
read_again#:


#endm


///////////////////////////////////////////////////////////////////////
// Macro Name  : _init_l2_validation
// Description : Macro to initialize l2 validation registers
// Output      : out_id_header_exist - loaded with _L2_HEADER_ALREADY_EXIST_ID
//				 out_l2_table_base   - loaded with L2_TABLE_SRAM_BASE
// Note        : _l2_validate reads registers named id_l2_header_exist and
//				 l2_table_base.
//				 NOTE(review): presumably these are the registers callers pass
//				 here - confirm at the call site.
//
////////////////////////////////////////////////////////////////////////
#macro _init_l2_validation(out_id_header_exist, out_l2_table_base)
	move(out_id_header_exist, _L2_HEADER_ALREADY_EXIST_ID)
	immed32(out_l2_table_base, L2_TABLE_SRAM_BASE)
#endm

///////////////////////////////////////////////////////////////////////
// Macro Name  : _l2_validate
// Description : Macro to check whether the l2 address for an IP address
//				 (particularly for a given next hop id) is known. If not
//				 the packet is sent via the exception path to the core for
//               ARP resolution (USE_IMPORT_VAR defined) or marked to be
//               dropped (USE_IMPORT_VAR not defined).
//
// Input       : none as macro parameters. The macro reads:
//				   dl_next_block       - only packets with BID_QM are checked
//				   next hop id         - from the meta data (dl_meta_get_nexthop_id)
//				   id_l2_header_exist  - next hop id meaning "L2 header already present"
//				   l2_table_base       - SRAM base of the L2 table
//				 id_l2_header_exist and l2_table_base are not declared in this macro.
// Output      : dl_next_block (IX_EXCEPTION or IX_DROP) when the L2 entry is not
//				 valid; with USE_IMPORT_VAR also dl_exception_reg.
// Note        : Context-swaps while reading the L2 table entry from SRAM.
////////////////////////////////////////////////////////////////////////
#macro _l2_validate()
.begin
.reg nexthop l2_entry_addr
.sig sig_sram_read

// NOTE(review): sig_mask and sigmask_default are declared but not referenced in the
// body of this macro.
.reg sig_mask
.reg sigmask_default; default value for signal mask

	; check if the packet is not for us
	alu[--, dl_next_block, -, BID_QM]
	bne[end#]

	xbuf_alloc($l2_entry_lw, 1, read)

 	// get next_hop_id; the special id held in id_l2_header_exist needs no lookup
	dl_meta_get_nexthop_id(nexthop)
	alu[--, nexthop, -, id_l2_header_exist]
	beq[ethernet_l2_header_exist#]

	// read one longword of the L2 table entry: offset = nexthop << _L2_TABLE_ENTRY_SHFT
	alu_shf[l2_entry_addr, --, B, nexthop, <<_L2_TABLE_ENTRY_SHFT]
	sram[read, $l2_entry_lw0, l2_table_base, l2_entry_addr, 1], ctx_swap[sig_sram_read]

	// If the valid bit of the L2 table entry is set we are done. Otherwise fall
	// through: raise an exception request to the core component in XSCALE if
	// there is one (USE_IMPORT_VAR), else mark the packet to be dropped.
	br_bset[$l2_entry_lw0, _L2_TABLE_VALID_BIT, ethernet_l2_header_exist#] 

exception_to_core#:	

#ifdef USE_IMPORT_VAR

	// issue exception message to core component
	alu[dl_exception_reg, --,b,0]
 	dl_set_exception(IX_CC_ETH_TX_PKT_ID, IX_NOL2_ADDR_IP_PACKET)
	move[dl_next_block, IX_EXCEPTION]
   	
#else // #ifdef USE_IMPORT_VAR

	// since there is no core component in this build, mark the packet as IX_DROP
	move[dl_next_block, IX_DROP]

#endif // #ifdef USE_IMPORT_VAR

	xbuf_free($l2_entry_lw)

.end // nexthop l2_entry_addr


// Both labels mark the end of the macro; they sit after .end and are targeted
// by the branches inside the scope above.
ethernet_l2_header_exist#:
end#:
#endm


///////////////////////////////////////////////////////////////////////////////
// dl_packet_tx_sink:
//	 	Description: 
//			Called by IPv4 Block to send the packet to packet Tx block by a scratch ring.
//			This macro will do thread ordering always (it waits on sig_prev and then
//			signals the next context on every path).
//		
//			This macro will produce the following on to the scratch ring.
//			dl_buf_handle		// The current buffer containing SOP 
//
//	 	Outputs:
//			sig_mask			- if the I/O operation is made (scratch put) then the sig_mask
//								  will have the bit corresponding to sig_scr set. (sig_scr is an
//								  input parameter)
//
//		Inputs:
//			sig_scr				- Signal to use in the I/O (scratch put) operation
//			wxfer				- SRAM write xfer register to be used in I/O.
//								  Should have been allocated using xbuf_alloc. 
//								  At least 3 registers should be allocated (wxfer0..wxfer2
//								  are all written by this macro).
//
//			The following Global variables will also act as inputs.
//			dl_next_block		- IX_EXCEPTION, for exception packets
//								- IX_DROP, for packets to be dropped
//								- BID_NEXT_BLOCK, block ID of the next block to handle this packet.
//								- anything else is treated as a null packet (no action yet).
//			dl_buf_handle		: Buffer handle of the buffer in the start of chain (containing SOP)
//			dl_eop_buf_handle	: Buffer handle of the last buffer in the chain,
//								  or IX_NULL for a single-buffer packet
//			sig_prev			: signal from the previous thread, waited on for ordering
//		
//		Size:   				:
//			N/A
//
//		Notes:
//
//		See notes in dl_source macro
//
//		As written, IX_EXCEPTION packets are freed in this macro exactly like
//		IX_DROP packets; nothing is put on an exception ring here.

#macro dl_packet_tx_sink[sig_mask, sig_scr, wxfer]
.begin
.sig	sig_meta, sig_ip


	//	This macro can be seen in two parts. The normal packet case (which is the critical
	//	path) and other (exception, drop, null pkt) cases

	//	First the normal case.

	// compute the cell count

	// ESWAR : no need to compute cells
	// compute_sop_buf_cframes(dl_buf_handle)

	// now check for next block; anything not destined for BID_NEXT_BLOCK leaves the critical path

	br!=byte[dl_next_block, 0, BID_NEXT_BLOCK, check_exception#]  
	
	xbuf_alloc[$meta_cache, 8, write]					; This line doesn't produce any code.

	//	Write back Meta data. LW0 and LW1 were already written by POS.
	//	So we need to write the rest. (LW2 ...LW7)


//	The 1st LW of the meta data is next buffer pointer and should not be updated
//  when doing dl_meta_flush_cache
#define_eval META_SIZE (META_CACHE_SIZE -1)
	dl_meta_flush_cache[$meta_cache, dl_buf_handle, sig_meta, SIG_NONE, 1, META_SIZE]		; Write back Meta data to SRAM
	dl_flush_iphdr_cache[dl_buf_handle, sig_ip, SIG_NONE]					; Write back IP hdr to DRAM

	//	Wait for both write-backs as well as the signal from the previous thread.

	ctx_arb[sig_meta, sig_ip, sig_prev]

	xbuf_free[$meta_cache]								; free xfer register. No Code generated.

	// 	Signal Next Thread. Signaling next thread has a latency of 10 cycles. 
	//  So we signal at the first available opportunity. (see notes in dl_source macro).
	//	It is IMPORTANT that there is no ctx_swap/arb from here to the scratch[put...]
	//	below.

	br=ctx[7, thread_7#]								; Special case for thread 7.
	signal_next_ctx[DL_SIG_WAKE]						; !7, so signal the next thread

thread_7#:

	// 	If thread 7 we have to signal next ME. But we cannot do that until we have issued
	//	the scratch[put..] command. We'll do it in dl_source macro. (see notes in dl_source).


	//	Build the ring message. The ring-full condition is polled inside
	//	_ethernet_bmk_encap_wr_scr_ring, which stalls (keeps looping) until the ring has room.
	//
	//	NOTE(review): only one longword (in_num = 1, starting at wxfer0) is put on the ring
	//	below, so LW1 and LW2 are prepared but never transmitted. They are still written here
	//	so that the wxfer contents seen by the caller are unchanged - confirm whether they
	//	can be dropped.

	move[wxfer/**/0, dl_buf_handle]						; LW0 on the ring is current buffer handle
	move[wxfer/**/1, dl_eop_buf_handle]					; LW1 is the last buffer in the chain, 
														; for big pkts only. Otherwise IX_NULL
.reg	queue_number

	dl_meta_get_fabric_port[queue_number]				; LW2 is Queue number for Queue Manager
	alu_shf[queue_number, --, B, queue_number, <<4]		; fabric * 16 is the actual queue number
	alu_shf[wxfer/**/2, queue_number, OR, 1, <<31]		; The msb is always set to 1, to prevent producing a value of 0 on to the ring.

	// Write the buf_handle to the scratch ring

	_ethernet_bmk_encap_wr_scr_ring(1, wxfer/**/0, sig_scr)
//	_ethernet_bmk_encap_wr_scr_ring_drct(1, wxfer/**/0, sig_scr)
	//	To improve performance, we do not ctx_swap on scratch[put..] above. We'll wait
	//	on this signal in dl_source macro. (see notes in dl_source macro). However this
	//	macro has multiple paths in which we do not issue this scratch[put..] at all. 
	//	e.g exception packets, or NULL packets etc. So if we issue scratch[put..], 
	//	we set the sig_mask to indicate that.

.reg tmp

	//	Add the signal to the signal mask on which to wait

	alu[tmp, --, B, &sig_scr]							; tmp = signal number of sig_scr

	br[end#], defer[2]
		alu[--, tmp, OR, 0]								; tmp will be used  by "indirect" in next instruction
		alu_shf[sig_mask, sig_mask, OR, 1, <<indirect]	; sig_mask |= 1 << (signal number of sig_scr)


	// 	Non-critical path: packets whose next block is not BID_NEXT_BLOCK.
	//	No scratch[put..] is issued on any of the paths below, so sig_mask is left untouched.

check_exception#:

	// 	Wait for signal from prev thread and then signal next context.
	
	ctx_arb[sig_prev]

	// Signal next context.

	br=ctx[7, thread_7_2#]								; Special case for thread 7.
	signal_next_ctx[DL_SIG_WAKE]						; !7, so signal the next thread

thread_7_2#:

	br!=byte[dl_next_block, 0, IX_EXCEPTION, check_drop#]

	// 	Exception packet: as written, the packet is freed here (same handling as IX_DROP).
	//	Two cases to handle. Single buffer or a buffer chain.
	//	It is determined by looking at the dl_eop_buf_handle

	alu[--, dl_eop_buf_handle, -, IX_NULL]					; if dl_eop_buf_handle == IX_NULL
	beq[drop_single_buffer#]								; then it is single buffer.

	dl_drop_buffer_chain[dl_buf_handle, dl_eop_buf_handle]	; otherwise, buffer chain.
	br[end#]

drop_single_buffer#:
	dl_drop_buffer[dl_buf_handle]
	br[end#]

check_drop#:

	br!=byte[dl_next_block, 0, IX_DROP, check_ix_null#]

	// 	Drop the packet. Two cases to handle. Single buffer or a buffer chain.
	//	It is determined by looking at the dl_eop_buf_handle

	alu[--, dl_eop_buf_handle, -, IX_NULL]					; if dl_eop_buf_handle == IX_NULL
	beq[drop_single_buffer_1#]								; then it is single buffer.

	dl_drop_buffer_chain[dl_buf_handle, dl_eop_buf_handle]	; otherwise, buffer chain.
	br[end#]

drop_single_buffer_1#:
	dl_drop_buffer[dl_buf_handle]
	br[end#]

check_ix_null#:

	// Do something to handle null packet. TBD
	nop

end#:

.end
#endm


///////////////////////////////////////////////////////////////////////////////
// Macro Name  : _ethernet_bmk_encap_wr_scr_ring
// Description : Macro to write a message to the scratch ring that corresponds
//				 to the value returned by dl_meta_get_output_port (one of
//				 IPV4_TO_PACKET_TX_SCR_RING_0 .. _3). The macro checks the ring
//				 status first. If the ring is full, the macro does a busy wait
//				 (re-testing the ring-full input state) until the ring is not full.
// Output      : none. The scratch[put] is issued with sig_done[sig_scr]; this
//				 macro does not wait for that signal.
// Input       : in_num     - reference count passed to scratch[put]
//				 in_message - first write transfer register of the message
//				 sig_scr    - signal attached to the scratch[put]
// Size        : four statements per tbl_N# section as written (the #if 0
//				 sections are compiled out) - TODO confirm against the list file
// Notes       : The jump index is (output port << 2), so the dispatch relies on
//				 every tbl_N# section occupying exactly four instructions, in
//				 this order. Do not add or remove instructions inside a section
//				 without adjusting the shift.
//				 NOTE(review): the port value is not range-checked or masked
//				 before the jump; only four targets exist - confirm the port is
//				 always 0..3.
///////////////////////////////////////////////////////////////////////////////
#macro _ethernet_bmk_encap_wr_scr_ring(in_num, in_message, sig_scr)

.begin
.reg tmp, ring

		//	Get output port number

		dl_meta_get_output_port[tmp]
		
		// Compute the offset into the jump table (port * 4, one table
		// section per port) and then jump to the corresponding section

		alu[tmp, --, b, tmp, <<2]
		jump[tmp, tbl_0#], targets[tbl_0#,tbl_1#,tbl_2#,tbl_3#]
		

tbl_0#:
		// Wait for scratch ring not full (branches back to itself while full)

		br_inp_state[SCR_Ring/**/IPV4_TO_PACKET_TX_SCR_RING_0/**/_Full, tbl_0#]

		// Ring operand for the put: ring number shifted left by 2

		alu[ring, --, b, IPV4_TO_PACKET_TX_SCR_RING_0, <<2]

		// Write to the scratch ring; completion is reported on sig_scr

		scratch[put, in_message, 0, ring, in_num], sig_done[sig_scr]		

		br[tbl_done#]

#if 0
tbl_0_busy#:
#ifdef	_DEBUG_COUNTERS_
		alu[@func_ring_full_0,@func_ring_full_0,+,1]
#endif
		nop
		br[tbl_0#]
#endif

tbl_1#:
		// Wait for scratch ring not full (branches back to itself while full)

		br_inp_state[SCR_Ring/**/IPV4_TO_PACKET_TX_SCR_RING_1/**/_Full, tbl_1#]

		// Ring operand for the put: ring number shifted left by 2

		alu[ring, --, b, IPV4_TO_PACKET_TX_SCR_RING_1, <<2 ]

		// Write to the scratch ring; completion is reported on sig_scr

		scratch[put, in_message, 0, ring, in_num], sig_done[sig_scr]		

		br[tbl_done#]

#if 0
tbl_1_busy#:
#ifdef	_DEBUG_COUNTERS_
		alu[@func_ring_full_1,@func_ring_full_1,+,1]
#endif
		nop
		br[tbl_1#]
#endif

tbl_2#:
		// Wait for scratch ring not full (branches back to itself while full)

		br_inp_state[SCR_Ring/**/IPV4_TO_PACKET_TX_SCR_RING_2/**/_Full, tbl_2#]

		// Ring operand for the put: ring number shifted left by 2

		alu[ring, --, b, IPV4_TO_PACKET_TX_SCR_RING_2, <<2 ]

		// Write to the scratch ring; completion is reported on sig_scr

		scratch[put, in_message, 0, ring, in_num], sig_done[sig_scr]		
		br[tbl_done#]

#if 0
tbl_2_busy#:
#ifdef	_DEBUG_COUNTERS_
		alu[@func_ring_full_2,@func_ring_full_2,+,1]
#endif
		nop
		br[tbl_2#]
#endif

tbl_3#:
		// Wait for scratch ring not full (branches back to itself while full)

		br_inp_state[SCR_Ring/**/IPV4_TO_PACKET_TX_SCR_RING_3/**/_Full, tbl_3#]

		// Ring operand for the put: ring number shifted left by 2

		alu[ring, --, b, IPV4_TO_PACKET_TX_SCR_RING_3, <<2 ]

		// Write to the scratch ring; completion is reported on sig_scr

		scratch[put, in_message, 0, ring, in_num], sig_done[sig_scr]		
		br[tbl_done#]
#if 0
tbl_3_busy#:
#ifdef	_DEBUG_COUNTERS_
		alu[@func_ring_full_3,@func_ring_full_3,+,1]
#endif
		nop
		br[tbl_3#]
#endif

// Common exit for all four table sections
tbl_done#:

.end

#endm



#endif	// __QUAD_GBETH_DL_SOURCE_UC__
