////////////////////////////////////////////////////////////////////////////////
//                                                                     
//                  I N T E L   P R O P R I E T A R Y                   
//                                                                      
//     COPYRIGHT (c)  2001-2002 BY  INTEL  CORPORATION.  ALL RIGHTS          
//     RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//     BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//     RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//     LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//     MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//     THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                      
//                        INTEL  CORPORATION                            
//                                                                     
//                     2200 MISSION COLLEGE BLVD                        
//                                                                      
//               SANTA  CLARA,  CALIFORNIA  95052-8119                  
//                                                                      
////////////////////////////////////////////////////////////////////////////////
//
//
//      File Name: sphy_mphy4_tx.uc
//
//      Purpose: Packet (POS/Ethernet) transmitter for Egress IXP2400
//
///////////////////////////////////////////////////////////////////////////////


#ifndef	__SPHY_MPHY4_TX_UC__
#define	__SPHY_MPHY4_TX_UC__

///////////////////////////////////////////////////////////////////////////////
// 
// This SPHY/MPHY4 packet (POS/Ethernet) transmitter (TX) runs in POS-PHY 
// SPHY/MPHY4 transmit mode on the egress IXP2400. POS SPHY mode can be 
// configured as 1x32 (one port with 32 bit device bus width), 2x16 (two ports 
// with 16 bit device bus width), 4x8 (four ports with 8 bit device bus width),
// or 2x8+1x16 (two ports with 8 bit device bus width and one with 16 bit 
// device bus width). This microblock supports SPHY_1x32 (one OC48), SPHY_4x8 
// (four OC12) and MPHY-4 modes, which are selected through "#define 
// TX_PHY_MODE SPHY_1_32", "#define TX_PHY_MODE SPHY_4_8" and "#define 
// TX_PHY_MODE MPHY_4" in dl_system.h of the project. In SPHY_1x32 mode this 
// microblock runs on one ME, and all 8 threads in this ME serve the same port.
// In SPHY_4x8 and MPHY-4 mode, this microblock can run on one ME (two threads 
// handle one port) or on two MEs (four threads handle one port; 
// "THIS_ME=PACKET_TX_FIRST_ME" needs to be specified in the build options for
// the ME which serves ports 0 and 1, and "THIS_ME=PACKET_TX_SECOND_ME" needs 
// to be specified in the build options for the ME which serves ports 2 and 3).
// The tbuf element size is configured as 128 bytes.
// 
// In the egress pipeline, there is an L2-encap block which adds the 
// appropriate PPP or Ethernet header before the tx request is sent to this 
// microblock, so this microblock can handle both POS and Ethernet 
// transmission. For POS, the min packet (49 bytes long) is the critical case.
// For Ethernet, the 129 bytes long packet is the critical case. Each thread 
// usually handles just one mpacket at a time. However, in order to handle the
// critical case for Ethernet, if a packet does not fit into one mpacket but 
// is less than 240 bytes long, the thread handles two mpackets at a time, 
// because such a packet is guaranteed to fit into two mpackets (offset_mod_8
// is always less than 8): the first mpacket transmits a fixed 120 bytes and 
// the second mpacket transmits the remaining bytes.
//
// In order to avoid overflowing the packet queue in local memory, this
// microblock reads the scratch ring to get a tx request for a specific port 
// only when there is space in the packet queue for that port. It also keeps 
// count of how many packets have been transmitted for each port, and updates 
// the pkts_txd count in the Egress Scheduler through a reflect write when it 
// finishes the transmission of a packet, so that the Egress Scheduler does 
// not overflow its downstream. 
//
// Following are the assumptions used to develop this microblock:
// 1. The meta data at SRAM and the L2 header and payload at DRAM are correct,
//    this microblock will not do any sanity check in the size and order of 
//    the buffers in any packet.
// 2. The L2 header and payload in DRAM will be contiguous from the offset to 
//    the end of a buffer in a packet.
// 3. The offset for all buffers in a packet except SOP buffer is 0.
//
// There are three issues that need to be addressed in this microblock. The
// first two are imposed by the IXP2400 hardware; the third one keeps this 
// microblock capable of handling packets received on different receive 
// interfaces (POS, ATM, Ethernet) by the ingress microblocks. They are:
// 1. The DRAM[tbuf_wr,...] instruction, which is used to move packet data from
//    DRAM to TBUF, always accesses DRAM and TBUF on an 8-byte boundary.
// 2. The sum of the prepend length and payload length of an mpacket must be an
//    integral multiple of the bus width, except for an EOP mpacket. In SPHY
//    1X32 and MPHY4 mode, the bus width is 32 bits (4 bytes).
// 3. The offset of the L2 header and payload data in the first buffer of 
//    a packet can be any value. 
// 
// In order to address the three issues mentioned above, POS TX transmits the
// maximum allowable payload bytes for every non-EOP mpacket while making sure 
// that the sum of the prepend length and payload length of a non-EOP mpacket
// is an integral multiple of the device bus width. If necessary, a maximum of
// 3 bytes at the end of one buffer is saved in local memory and transmitted 
// with the payload of the next buffer of the same packet in the next mpacket.
//
// This microblock keeps the descriptor info of the active buffer (the buffer 
// the microblock is currently processing) and of the secondary buffer (the 
// buffer that follows the active buffer in the packet; its descriptor is in 
// the software next pointer field of the meta data of the active buffer) in 
// local memory. This microblock reads the necessary info (buffer size, 
// data offset, freelist, and the software next pointer) of the secondary 
// buffer from sram in parallel with its processing of the active buffer. When
// the processing of the active buffer finishes, the secondary buffer  
// becomes the active buffer, and this continues until all buffers in the 
// packet are processed.
//  
// This microblock has the option to add the L2 header before the payload by 
// msf[write, ...], so the L2_ENCAP microblock can be removed from the pipeline
// for the following possible purposes: a. reducing DRAM accesses, b. saving 
// MEs used in the pipeline. In order to add an Ethernet header, 
// "ADD_L2_HEADER,ETHERNET_TX" needs to be specified in the build option of 
// this microblock. "ADD_L2_HEADER,POS_TX" needs to be specified in the build 
// option of this microblock to add a PPP header.
// 

///////////////////////////////////////////////////////////////////////////////

// The following definition makes this microblock wait, near the end of the 
// initialization done by thread 0 of this microblock, for an inter-ME signal 
// generated by the ME which initializes common resources (such as the scratch
// ring), to make sure all common resources are available. This option should 
// be enabled when this POS TX microblock is integrated with other egress 
// microblocks.  
#define WAIT_FOR_COMMON_RESOURCE_INITIALIZATION 	


// following definition selects sram transfer registers to be used with 
// dl_meta.uc dispatch loop
#define DL_META_DATA_IN_SXFER


// include stdmac.uc in IXPblocks Portable library

#include <stdmac.uc>

// include local memory macros in IXPblocks 

#include <localmem.uc>

// global definition of common constants which are shared by all microblocks

#include "dl_system.h"

// definition of hardware register address 

#include <hardware.h>

// definition of PHY operation mode and misc.

#include <definitions.h>

// definition of constants used

#include "sphy_mphy4_tx.h"

// definition of utilization macros used

#include "sphy_mphy4_tx_util.uc"

// initialization code for POS/Ethernet TX

#include "sphy_mphy4_tx_init.uc"

// dispatch loop macros

#include "dispatch_loop.uc"

#if(TX_PHY_MODE == SPHY_1_32)
// The following definition adjusts the mpkt length to avoid setting the SKIP
// bit in the last mpkt of a non-EOP buffer; the SKIP bit comes into play 
// because of the hardware limitation that the sum of payload length and 
// prepend length must be a multiple of the device bus width.
// This is necessary for MPHY_16 mode on IXP2400 chip rev. A0 and A1 due to 
// an MSF hardware bug (HW-bug#1294); it is kept here (disabled) in case it is
// also necessary in SPHY_1X32 mode. More instructions are required to do 
// this, but fortunately this does not happen in the worst case path.
// #define AVOID_USING_SKIP_BIT_IN_TCW0
#endif //#if(TX_PHY_MODE == SPHY_1_32)

#if(TX_PHY_MODE == MPHY_4)
// The following definition adjusts the mpkt length to avoid setting the SKIP
// bit in the last mpkt of a non-EOP buffer; the SKIP bit comes into play 
// because of the hardware limitation that the sum of payload length and 
// prepend length must be a multiple of the device bus width.
// This is necessary for MPHY_16 mode on IXP2400 chip rev. A0 and A1 due to 
// an MSF hardware bug (HW-bug#1294); it is kept here (disabled) in case it is
// also necessary in this mode (the original text named SPHY_1X32 mode here 
// although this block is guarded by TX_PHY_MODE == MPHY_4). More instructions
// are required to do this, but fortunately this does not happen in the worst 
// case path.
// #define AVOID_USING_SKIP_BIT_IN_TCW0
#endif //#if(TX_PHY_MODE == MPHY_4)

// File-level register and signal declarations used by the TX macros in this 
// file. Names starting with '@' and '$' follow the microengine assembler 
// naming for absolute GPRs and transfer registers respectively; plain names 
// are context relative GPRs.

#ifdef COUNTERS
.reg @cntr_base					; start address of Packet TX counters in sram
								; initialized to PACKET_TCX_COUNTER_BASE
.reg $byte_cnt					; sram xfer register to keep value to add to 
								; packets transmitted counter in sram		
#endif //#ifdef	COUNTERS


#ifdef UNIT_TEST
.reg @buf_freed			; buffers freed, for debugging
#endif //UNIT_TEST

.reg addr_tx_seq		; MSF register address to read mpkt txed out of tbuf
// context relative GPR variables:

// the following context relative GPR variables are preassigned during 
// initialization to save instruction cycles in the POS TX process
.reg sring_tr           		; scratch ring which QM put tx requests
.reg sig1_next_context_gpr		; value to write to SAME_ME_SIGNAL csr to wake up
								; next thread
.reg sig2_next_context_gpr		; value to write to SAME_ME_SIGNAL csr to wake up
								; next thread
.reg sig3_next_context_gpr		; value to write to SAME_ME_SIGNAL csr to wake up
								; next thread
.reg port_entry_turnaround		; packet queue head/tail offset value to reset 
								; head/tail offset
.reg $txed_port					; xfer register to keep packet txed to notify
								; egress scheduler by reflect write							


///////////////////////////////////////////////////////////////////////////////

//	Global Signals
.sig sig_scratch_access_1 			; signal for scratch ring read
// the following three signals need to be declared as volatile; they are the
// per-phase "my turn" signals passed from thread to thread
.sig volatile sig1_next_context		; signal to wakeup next thread, declared as
									; volatile to eliminate assembling warning
.sig volatile sig2_next_context		; signal to wakeup next thread, declared as
									; volatile to eliminate assembling warning
.sig volatile sig3_next_context		; signal to wakeup next thread, declared as
									; volatile to eliminate assembling warning
.sig sig_sram_read_1 				; signal 1 for sram read
.sig sig_sram_read_2 				; signal 2 for sram read
.sig sig_msf_access_1				; signal 1 for msf access (read or write)
.sig sig_msf_access_2				; signal 2 for msf access (read or write)
.sig sig_msf_access_3				; signal 3 for msf access (read or write)
// the reflect write signal only exists when feedback to the egress scheduler
// is enabled (DISABLE_TX2SCHED_FEEDBACK not defined)
#ifndef DISABLE_TX2SCHED_FEEDBACK
.sig sig_reflect_write 				; signal for reflect write
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK
.sig sig_dram_access_1				; signal 1 for dram access (dram[tbuf_wr..])
.sig sig_dram_access_2				; signal 2 for dram access (dram[tbuf_wr..])
.sig volatile common_init_complete_sig_num		; this is the inter-ME signal 
										; generated by the ME which initializes
										; common resources, such as scratch ring,
										; POS TX MEs will wait for this signal at
										; end of their initialization to make 
										; sure all common resources are available 

//	Manual signal allocation
.addr common_init_complete_sig_num	ME_INIT_SIGNAL	; manual signal allocation,
													; ME_INIT_SIGNAL defined as
													; in dl_system.h  

#ifdef DEBUG_TX_PENDING_LOCKED
.sig sig_sram_write_1 				; signal 1 for sram write
#endif //#ifdef DEBUG_TX_PENDING_LOCKED

#ifdef COUNTERS
.sig sig_counter_update									; signal for update counter
#endif //#ifdef COUNTERS		

#ifdef	_DEBUG_COUNTERS_

//---------------------------------------------------------------------
// Declare some counters to ease debugging.
//---------------------------------------------------------------------
.reg @pkt_tx_num_tx_requests_rxed	// number of tx requests received
.reg @pkt_tx_num_pkts_notified	 	// number of packets notified to tx
.reg @pkt_tx_num_tbufs_txed			// number of tbufs txed
#endif // _DEBUG_COUNTERS_


///////////////////////////////////////////////////////////////////////////////
// _sphy_mphy4_tx_sop_eop_mpacket()
//
// Description: handle a packet which fits into one mpacket (SOP and EOP in 
//				the same mpacket). The body runs as three phases separated by
//				context swaps:
//				phase 1 - save the buffer descriptor, advance the port queue
//						  head, notify the scheduler (unless 
//						  DISABLE_TX2SCHED_FEEDBACK), swap out on sig_mask_1
//				phase 2 - move the payload to the tbuf element (optionally 
//						  writing an L2 header in front of it), start the 
//						  meta data read for a newly received tx request, 
//						  swap out on sig_mask_2
//				phase 3 - write the transmit control word, free the buffer,
//						  save the meta data of the new request, then swap
//						  out and branch to PHASE1_START_LABEL
//
// Outputs:					
//							None
//
// Inputs: 
//		in_payload			payload length
//      in_offset			payload offset
//		in_offset_mod_8		remainder of offset divided by 8
//
// Constants:
//						 	None 
//
//	Labels:
//		PHASE1_START_LABEL	symbolic address for start of infinite TX 
//							processing loop
//
//	Context:
//		The body uses names that are not declared inside this macro and must
//		be in scope where it is expanded, e.g. sig_mask_1, exe_stat_flag, 
//		deq_port, tbuf_ele_1, addr_of_tbuf_1, addr_of_tx_control_1, tcw0_1,
//		indir_ref, dram_addr, indiref_base, $w0, $tx_request_lw0, 
//		sigmask_phs1_default and sigmask_phs2_default (plus l2_table_base 
//		when ADD_L2_HEADER and ETHERNET_TX are defined).
//
//	Size:
//		NOTE(review): the original header stated "4 instructions", which does
//		not match the length of the body below - recount before relying on it.
//
///////////////////////////////////////////////////////////////////////////////
#macro _sphy_mphy4_tx_sop_eop_mpacket(in_payload, in_offset, \
		in_offset_mod_8, PHASE1_START_LABEL)
.begin
// sig_mask_2: wakeup event mask used for the swap-out at the end of phase 2
// eob_bd:     buffer descriptor saved in phase 1 and freed in phase 3
.reg sig_mask_2
.reg eob_bd

	// allocate 5 read xfer registers for reading in the meta data 
	xbuf_alloc($dl_meta, 5, read)	


#ifdef ADD_L2_HEADER		
#ifdef ETHERNET_TX
	// allocate 4 read_write xfer registers for l2_table entry operation 
	xbuf_alloc($l2_entry_lw, 4, read_write)	
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER

	// it is a sop_eop packet

#ifdef ADD_L2_HEADER
	// check whether the layer 2 header already exists by comparing the 
	// nexthop id in the queue entry with L2_HEADER_ALREADY_EXIST_ID
#ifdef ETHERNET_TX
.set $l2_entry_lw0 $l2_entry_lw1 $l2_entry_lw2 $l2_entry_lw3
#endif /* ETHERNET_TX */
.begin
.reg tmp_yz
	move[tmp_yz, L2_HEADER_ALREADY_EXIST_ID]
    alu[--, *l$index0[NEXTHOPID_OFFSET], -, tmp_yz]
	// equal: header is already there, skip the table read and leave the 
	// NEED_PREPEND_L2_HEADER flag clear
	beq[sop_eop_check_l2h_exist_done#]

#ifdef ETHERNET_TX
//.set $l2_entry_lw0 $l2_entry_lw1 $l2_entry_lw2 $l2_entry_lw3
	// start reading the L2 table entry selected by the nexthop id; sig_mask_1
	// is passed in, presumably so the helper can add its completion signal 
	// (sig_sram_read_2 is marked .io_completed after the phase 1 swap-out)
	_sphy_mphy4_tx_read_l2_table_entry(sig_mask_1, l2_table_base, \
											*l$index0[NEXTHOPID_OFFSET])
#endif // ETHERNET_TX
	// set the "need to prepend l2 header" flag in exe_stat_flag; it is 
	// tested in phase 2
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, \
						<<NEED_PREPEND_L2_HEADER_FLAG_BIT] 

sop_eop_check_l2h_exist_done#:
.end // tmp_yz
#endif // ADD_L2_HEADER

	// before _advance_queue_head, save eob_bd (low 3 bytes of the 
	// ABD_0_OFFSET word of the queue entry) for freeing the buffer in phase 3
	// NOTE(review): the original comment also mentioned saving eob_flist, but
	// only eob_bd is loaded here
   	ld_field_w_clr[eob_bd, 0111, *l$index0[ABD_0_OFFSET]]

	// advance the queue head, so the next packet in the queue of this port 
	// will be processed when this port is selected again [6]
	_sphy_mphy4_tx_advance_queue_head(port_entry_turnaround)

#ifndef DISABLE_TX2SCHED_FEEDBACK
	// notify scheduler that one packet has been transmitted from this port
	// 2 instructions for SPHY_1X32 mode, and 10 instructions for other modes
	_sphy_mphy4_tx_notify_scheduler($txed_port, deq_port)
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK


	// swap out to wait for scratch access, reflect write, msf read and my turn
	// (the exact set of signals is whatever is set in sig_mask_1)
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_1]	; set active_ctx_wakeup_events
														; csr
	ctx_arb[--]	, defer[2]								; ctx_arb on sig_mask_1
		// tbuf_ele_1 already set before, use defer slots to update the next 
		// available tbuf element in local memory to save cycles for this 
		// critical POS min packet case [2]
		// (the helper has to fill exactly the two defer slots declared above)
		_sphy_mphy4_tx_update_next_availble_tbuf_ele()

	// following .io_completed directives are used to eliminate assembling 
	// warnings due to the usage of the "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig2_next_context
#ifndef DISABLE_TX2SCHED_FEEDBACK
	.io_completed sig_reflect_write
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK
	.io_completed sig_msf_access_3
	.io_completed sig_scratch_access_1
#ifdef ADD_L2_HEADER
#ifdef ETHERNET_TX
	.io_completed sig_sram_read_2
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER

// end of phase 1 with sop_eop mpkt ready for tx	


//end of phase 1
sop_eop_mpkt_phase2#:
// beginning of phase 2

#ifdef ADD_L2_HEADER
#ifdef POS_TX
	// allocate 1 write xfer register for writing the L2 (PPP) header 
	xbuf_alloc($prepend_w, 1, write)	
#endif // POS_TX
#endif // ADD_L2_HEADER

// beginning of phase 2 with sop_eop mpkt ready for tx	
	// wake up next thread in beginning of phase 2 to reduce wakeup latency
	local_csr_wr[SAME_ME_SIGNAL, sig2_next_context_gpr] 

	// initialize sig_mask_2 for phase 2 of the sop_eop mpkt path; signals are
	// added/removed below depending on which operations are actually issued
	alu[sig_mask_2, --, b, sigmask_phs2_default]			; initialize sig_mask_2

	// tbuf_ele_1 already got in phase 1 and next_avail_tbuf_ele already 
	// updated in defer slots of phase 1, now get this tbuf_ele address 
	// (addr_of_tbuf_1) [2]
	_sphy_mphy4_tx_get_tbuf_addr(addr_of_tbuf_1, tbuf_ele_1)

	// check tbuf full to avoid overwriting tbuf [8]
	_sphy_mphy4_tx_sop_eop_check_tbuf_full_and_wait_not_full(addr_of_tx_control_1, \
					tcw0_1, tbuf_ele_1, in_payload)

#ifndef ADD_L2_HEADER
	// dram access takes longer latency, so do it first
	// move payload to tbuf [9]
	_sphy_mphy4_tx_move_sop_paylo_to_tbuf(indir_ref, dram_addr, eob_bd, \
				addr_of_tbuf_1, indiref_base, in_offset, in_payload, \
				in_offset_mod_8, sig_dram_access_1) 

#else /* ADD_L2_HEADER */
	// flag clear: the packet already carries its L2 header, use the plain 
	// payload move at the out-of-line label near the end of this macro, which
	// branches back to sop_eop_add_l2_header_done#
	br_bclr[exe_stat_flag, NEED_PREPEND_L2_HEADER_FLAG_BIT, \
								sop_eop_no_l2_header_move_paylo_to_tbuf#]

	// dram access takes longer latency, so do it first
	// move payload to tbuf, leaving space for the L2 header [9]
		_sphy_mphy4_tx_move_sop_paylo_to_tbuf_with_l2_hdr_space(indir_ref, \
				dram_addr, eob_bd, addr_of_tbuf_1, indiref_base, \
				in_offset, in_payload, in_offset_mod_8, sig_dram_access_1) 

	// add necessary prepend info in tcw0
	// NOTE(review): TCWO_W1_SHF is spelled with the letter 'O' while the 
	// other constants use "TCW0" - confirm it matches its definition
	ld_field[tcw0_1, 0100, TCW0_SOP_PREPEND_BYTE, <<TCWO_W1_SHF]							 
	
	// the header write below completes on sig_msf_access_1, so wait for it
	_sphy_mphy4_tx_set_signal[sig_mask_2, sig_msf_access_1]	; add sig_msf_access_1

#ifdef POS_TX
    // write PPP header for SOP/EOP mpacket
.begin
.reg tmp_mpls_nexthopid tmp_ppp_mpls_protocol
	move(tmp_mpls_nexthopid, NEXT_HOP_ID_MPLS)
	alu[--, *l$index0[NEXTHOPID_OFFSET], -, tmp_mpls_nexthopid]
	// the three deferred instructions always execute: they preload the IP
	// protocol value into $prepend_w0 and the MPLS protocol value into 
	// tmp_ppp_mpls_protocol. Only when the nexthop id equals 
	// NEXT_HOP_ID_MPLS (branch not taken) is $prepend_w0 overwritten with 
	// the MPLS protocol value.
	bne[sop_eop_ppp_hdr_selection_done#], defer[3]
		alu_shf[$prepend_w0, --, b, PPP_IP_PROTOCOL, <<IP_PROTOCOL_SHFT]
		alu[tmp_ppp_mpls_protocol, --, B, 0]
		immed_w0[tmp_ppp_mpls_protocol, PPP_MPLS_PROTOCOL]
	alu_shf[$prepend_w0, --, b, tmp_ppp_mpls_protocol, <<IP_PROTOCOL_SHFT]
 sop_eop_ppp_hdr_selection_done#:
 .end
    // write PPP header (1 longword) to tbuf
	msf[write, $prepend_w0, addr_of_tbuf_1, 0, 1], sig_done[sig_msf_access_1]
#endif // POS_TX

#ifdef ETHERNET_TX

	// the invalid L2 table entry case has already been handled in the 
	// ethernet_arp microblock, so here all L2 table entries are valid
	// copy each longword of the entry onto itself; with read_write xfer 
	// registers this presumably moves the value read from sram to the write
	// side used by the msf[write] below - confirm
	alu[$l2_entry_lw0, --, b, $l2_entry_lw0]
	alu[$l2_entry_lw1, --, b, $l2_entry_lw1]
	alu[$l2_entry_lw2, --, b, $l2_entry_lw2]
	alu[$l2_entry_lw3, --, b, $l2_entry_lw3]

    // write Ethernet L2 header for SOP mpacket
    // write Ethernet L2 header (4 longwords) to tbuf
	msf[write, $l2_entry_lw0, addr_of_tbuf_1, 0, 4], sig_done[sig_msf_access_1]
#endif // ETHERNET_TX

sop_eop_add_l2_header_done#:
#endif //ADD_L2_HEADER
 	// now, check whether a valid tx request was obtained by the scratch read 
	// from the scratch ring in phase 1. If there is a valid tx request, 
	// enqueue that packet to the queue of that port 
.begin 
// NOTE(review): enq_port is not referenced by name in this block; presumably
// it is used inside the helper macros below - confirm
.reg enq_port
// following .set used to eliminate assembling warning
.set $dl_meta0 $dl_meta1 $dl_meta2 $dl_meta3 $dl_meta4 
	// didn't get tx request due to queue full
	br_bclr[exe_stat_flag, GET_TX_REQUEST_FLAG_BIT, \
		sop_eop_no_tx_request_read#] 
	// check whether the scratch ring returned a null request (zero)
	alu[--, $tx_request_lw0, -, 0x0]		; valid tx request
	// no valid tx request, not worst case, can take branchout penalty
	beq[sop_eop_no_valid_tx_request#]		; no valid tx request
	
	// read meta data for new tx request [4]
	_sphy_mphy4_tx_read_meta_data_from_sram($dl_meta, $tx_request_lw0) 
	// set sop meta read flag in exe_stat_flag; tested in phase 3
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<READ_SOP_BD_FLAG_BIT] 

	// set local memory index 0 to queue entry at queue tail of 
	// that port and enqueue this packet [6]
	_sphy_mphy4_tx_set_lmindex0_to_queue_tail(port_entry_turnaround)
.end // enq_port

sop_eop_end_of_phase_2#:
	// end of phase 2
	// swap out to wait for dram and msf access to complete, and for my turn
	// (the signals specified in sig_mask_2); also use the defer slots to 
	// update the payload offset field of tcw0_1 and move it to $w0
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_2]	; set active_ctx_wakeup_events
														; csr
	ctx_arb[--]	, defer[2]								; ctx_arb on sig_mask
		// add payload offset to tcw0_1 and move it to $w0
		alu_shf[tcw0_1, tcw0_1, OR, in_offset_mod_8, <<TCW0_PAYLOAD_OFFSET_LSB_LOC]
		alu[$w0, --, b, tcw0_1]					; element trnsmit control word


	// following .io_completed directives are used to eliminate assembling 
	// warnings due to the usage of the "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig3_next_context
	.io_completed sig_dram_access_1
	.io_completed sig_sram_read_1
    .io_completed sig_msf_access_1
#ifdef DEBUG_TX_PENDING_LOCKED
	.io_completed sig_sram_write_1 	
#endif //#ifdef DEBUG_TX_PENDING_LOCKED
 
#ifdef ADD_L2_HEADER
#ifdef POS_TX
	xbuf_free($prepend_w)
#endif // POS_TX
#endif // ADD_L2_HEADER
// end of phase 2 in sop_eop mpacket path
//end of phase 2

sop_eop_mpkt_phase3#:
// beginning of phase 3

// beginning of phase 3	in sop_eop mpacket path
	// wake up next thread in the very beginning of phase 3 to reduce wakeup 
	// latency
	local_csr_wr[SAME_ME_SIGNAL, sig3_next_context_gpr]

    // write transmit control word (TCW) to TBUF_ELEMENT_CTRL_V_# corresponding
	// to the tbuf_element to initiate transmission; 2 longwords are written 
	// starting at $w0 (the second one presumably being $w1 - confirm)
	msf[write, $w0, addr_of_tx_control_1, 0, 2], sig_done[sig_msf_access_1]

	_sphy_mphy4_tx_debug_incr_counter[@pkt_tx_num_tbufs_txed]

#ifdef COUNTERS
	// update counters in sram (10) 	
	_sphy_mphy4_tx_update_counters_sop_eop($byte_cnt, tcw0_1, \
													sig_counter_update)			
#endif //#ifdef	COUNTERS

	_sphy_mphy4_tx_free_buffer(eob_bd)			; free buffer [4]
	
	// if didn't read meta data in phase 2, just branch out
	br_bclr[exe_stat_flag, READ_SOP_BD_FLAG_BIT, sop_eop_cont_1#] 

	// meta data for sop is ready, move it to local memory
	_sphy_mphy4_tx_save_sop_meta_to_lm($tx_request_lw0) 

sop_eop_cont_1#:
	// both variants below swap out and then branch back to 
	// PHASE1_START_LABEL; the two deferred instructions reset exe_stat_flag
	// and reload sig_mask_1 with its phase 1 default for the next run
#ifndef COUNTERS
    // swap out to wait for msf access to complete and my turn
	ctx_arb[sig_msf_access_1, sig1_next_context], defer[2], \
			br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]		; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default]

#else // #ifndef COUNTERS
    // swap out to wait for msf access and sram access to complete and my turn
	ctx_arb[sig_msf_access_1, sig_counter_update, \
			sig1_next_context], defer[2], br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]		; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default]

#endif //#ifndef COUNTERS
// end of phase 3 in the sop_eop mpacket path

///////////////////////////////////////////////////////////////////////////////
// Out-of-line branch targets of the sop_eop mpacket path (reached only by
// branches from phase 2, never by fall-through).
//
// No tx request was read in phase 1, or the request read was null: no meta 
// data read was started in phase 2, so remove sig_sram_read_1 from the 
// phase 2 wakeup mask and rejoin at the end of phase 2.
///////////////////////////////////////////////////////////////////////////////

sop_eop_no_tx_request_read#:
sop_eop_no_valid_tx_request#:
	_sphy_mphy4_tx_clear_signal(sig_mask_2, sig_sram_read_1)	; clear sig_sram_read_1
	br[sop_eop_end_of_phase_2#]

#ifdef ADD_L2_HEADER

	// the packet already has its L2 header: plain payload move without 
	// header space, then rejoin after the header-writing code
sop_eop_no_l2_header_move_paylo_to_tbuf#:
	// dram access takes longer latency, so do it first
	// move payload to tbuf [9]
	_sphy_mphy4_tx_move_sop_paylo_to_tbuf(indir_ref, dram_addr, eob_bd, \
				addr_of_tbuf_1, indiref_base, in_offset, in_payload, \
				in_offset_mod_8, sig_dram_access_1) 

	br[sop_eop_add_l2_header_done#]


	// NOTE(review): the xbuf_free lines below follow an unconditional branch;
	// presumably they only release the xfer buffer names at assembly time 
	// and emit no instructions - confirm
#ifdef ETHERNET_TX
	// free 4 read_write xfer registers for l2_table entry operation
	xbuf_free($l2_entry_lw)	
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER

	// free 5 read xfer registers for reading in the meta data
	xbuf_free($dl_meta)	

.end // sig_mask_2 eob_bd

// end of sop_eop mpacket processing code
#endm // end of #macro _sphy_mphy4_tx_sop_eop_mpacket()

///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// _sphy_mphy4_tx_get_tx_request_for_no_tx()
//
// Description: handle the case where no port is ready to transmit: nothing
//				is transmitted in this run, but the tx request read from the 
//				scratch ring (if any, and if valid) is still checked and 
//				enqueued. The body keeps the same three phase structure as
//				the transmit paths:
//				phase 1 - swap out on sig_mask_1 (with the reflect write and
//						  msf access 3 signals removed)
//				phase 2 - start the meta data read for a valid tx request, 
//						  swap out on sig_mask_2
//				phase 3 - save the meta data to local memory, then swap out
//						  and branch to PHASE1_START_LABEL
//
// Outputs:					
//							None
//
// Inputs:  
//							None
//
// Constants:
//						 	None 
//
//	Labels:
//		PHASE1_START_LABEL	symbolic address for start of infinite TX 
//							processing loop
//
//	Context:
//		The body uses names that are not declared inside this macro and must
//		be in scope where it is expanded, e.g. sig_mask_1, exe_stat_flag, 
//		deq_port, $w1, $tx_request_lw0 and sigmask_phs1_default.
/////////////////////////////////////////////////////////////////////////////////
#macro _sphy_mphy4_tx_get_tx_request_for_no_tx(PHASE1_START_LABEL)
.begin
// sig_mask_2: wakeup event mask used for the swap-out at the end of phase 2
.reg sig_mask_2
// beginning of no port ready to transmit code 
	// allocate 5 read xfer registers for reading in the meta data 
	xbuf_alloc($dl_meta, 5, read)

    // not ready for tx, check whether valid tx request received
	// swap out to wait scratch read Tx request to complete and my turn
	// remove from sig_mask_1 the signal bits that are not waited for on this 
	// path (presumably because no reflect write / msf access 3 was issued in 
	// this run - confirm)
#ifndef DISABLE_TX2SCHED_FEEDBACK
	_sphy_mphy4_tx_clear_signal(sig_mask_1, sig_reflect_write)
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK

	_sphy_mphy4_tx_clear_signal(sig_mask_1, sig_msf_access_3)
	// swap out to wait for scratch access, and my turn
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_1]	; set active_ctx_wakeup_events
														; csr
	// the two defer slots build the phase 2 wakeup mask: start from zero and
	// add sig3_next_context (the helper has to fill exactly one slot)
	ctx_arb[--], defer[2]								; ctx_arb on sig_mask_1
		alu[sig_mask_2, --, B, 0]		;reset sig_mask
		_sphy_mphy4_tx_set_signal(sig_mask_2, sig3_next_context)	; initialize to sig3_next_context

	// following .io_completed directives are used to eliminate assembling 
	// warnings due to the usage of the "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig2_next_context
	.io_completed sig_scratch_access_1
// end of phase 1 of no port ready to transmit

// beginning of phase 2 of no port ready to transmit 
	// wake up next thread in beginning of phase 2 to reduce wakeup latency
	local_csr_wr[SAME_ME_SIGNAL, sig2_next_context_gpr]

	// now, check whether a valid tx request was obtained by the scratch read 
	// from the scratch ring in phase 1. If there is a valid tx request, 
	// enqueue that packet to the queue of that port 
.begin 
// NOTE(review): enq_port is not referenced by name in this block; presumably
// it is used inside the helper macros below - confirm
.reg enq_port
// following .set used to eliminate assembling warning
.set $dl_meta0 $dl_meta1 $dl_meta2 $dl_meta3 $dl_meta4

	// didn't get tx request due to queue full
	br_bclr[exe_stat_flag, GET_TX_REQUEST_FLAG_BIT, \
		no_port_rdy_no_tx_request_read#]
	// a zero first longword means a null request: nothing to enqueue
	alu[--, $tx_request_lw0, -, 0x0]
	beq[no_port_rdy_no_tx_request#]

	// read meta data for the new tx request and wait for it at the end of
	// phase 2
	_sphy_mphy4_tx_read_meta_data_from_sram($dl_meta, $tx_request_lw0) 
	_sphy_mphy4_tx_set_signal(sig_mask_2, sig_sram_read_1)	; add sig_sram_read_1
	// set sop meta read flag in exe_stat_flag; tested in phase 3
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<READ_SOP_BD_FLAG_BIT] 

	// following stuff can be done in phase 3, but put here for consuming sram
	// access latency, set local memory index 0 to queue entry at queue tail of 
	// that port and enqueue this packet 
	_sphy_mphy4_tx_set_lmindex0_to_queue_tail(port_entry_turnaround)
.end // enq_port

no_port_rdy_no_tx_request_read#:
no_port_rdy_no_tx_request#:
	// swap out to wait signals specified in sig_mask_2
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_2]	; set active_ctx_wakeup_events
														; csr
	ctx_arb[--]											; ctx_arb on sig_mask
	// following .io_completed directives are used to eliminate assembling 
	// warnings due to the usage of the "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig3_next_context
	.io_completed sig_sram_read_1
// end of phase 2 of no port ready to transmit

// beginning of phase 3 of no port ready to transmit
	// wake up next thread in beginning of phase 3 to reduce wakeup latency
	local_csr_wr[SAME_ME_SIGNAL, sig3_next_context_gpr]

	// set $w1 to deq_port in case the next run is the sop_eop case (POS min 
	// packet case), to save one instruction in that critical case
	// workaround for HW-bug#1249, copy channel # into tcw1 (reserved in PRM)
	alu[$w1, --, B, deq_port]	


	// if didn't read meta data in phase 2, just branch out
	br_bclr[exe_stat_flag, READ_SOP_BD_FLAG_BIT, no_port_rdy_cont_1#] 

	// meta data for sop is ready, move it to local memory
	_sphy_mphy4_tx_save_sop_meta_to_lm($tx_request_lw0) 

no_port_rdy_cont_1#:
	// swap out to wait for my turn, then branch back to PHASE1_START_LABEL;
	// the two deferred instructions reset exe_stat_flag and reload sig_mask_1
	// with its phase 1 default for the next run
	// NOTE(review): this ctx_arb carries an "and" token that the ctx_arb 
	// instructions in the sop_eop macro do not use - confirm it is intended
	ctx_arb[sig1_next_context], and, defer[2], br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]		; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default]

	// free the 5 read xfer registers allocated at the top of this macro for
	// reading in the meta data
	xbuf_free($dl_meta)	

.end // sig_mask_2
// end of not ready for transmission code
#endm // end of macro _sphy_mphy4_tx_get_tx_request_for_no_tx()


///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// _sphy_mphy4_tx_sop_only_non_critical_mpkt()
//
// Description: set local memory index 0 to the head of the queue entry to 
//				dequeue one packet
//
// Outputs:					
//							None
//
// Inputs:  
//		in_payload			payload length
//      in_offset			payload offset
//		in_offset_mod_8		remaider of offset divided by 8
//
// Constants:
//						 	None 
//
//	Labels:
//		PHASE1_START_LABEL	symbolic address for start of inifinite TX 
//							processing loop
/////////////////////////////////////////////////////////////////////////////////
#macro _sphy_mphy4_tx_sop_only_non_critical_mpkt(in_payload, in_offset, \
		in_offset_mod_8, PHASE1_START_LABEL)

// Overview (three phases separated by context swaps):
//	phase 1 (tail): clear the SOP flag in the queue entry, optionally start
//			the read of the secondary buffer meta data and of the L2 table
//			entry, compute sop_paylo, update the queue entry, then swap 
//			out on sig_mask_1
//	phase 2: start moving sop_paylo bytes of payload (and the L2 header when
//			needed) to the tbuf element, build tcw0_1, save the secondary
//			buffer meta data, start the meta data read for a newly received
//			tx request, then swap out on sig_mask_2
//	phase 3: write the transmit control words, save the SOP meta data of the
//			new tx request, then swap out and branch to PHASE1_START_LABEL
//
// Besides its parameters, this macro reads and writes registers and signals
// that are not declared here and must exist in the enclosing scope (for
// example sig_mask_1, exe_stat_flag, deq_port, tbuf_ele_1, addr_of_tbuf_1,
// addr_of_tx_control_1, tcw0_1, $w0, $w1, $tx_request_lw0).

// beginning of sop_only_non_critical mpacket processing code
// sop_only_non_critical_mpacket path in middle of phase 1
 .begin
// NOTE(review): rmnd_ofset is not referenced directly in this macro body; it
// is kept because helper macros expanded below may rely on it - TODO confirm
.reg sig_mask_2 sop_bd rmnd_ofset enq_port sop_paylo tmp

	// allocate 5 read xfer registers for reading in the meta data 
	xbuf_alloc[$dl_meta, 5, read]

#ifdef ADD_L2_HEADER		
#ifdef ETHERNET_TX
	// allocate 4 read_write xfer registers for l2_table entry operation 
	xbuf_alloc($l2_entry_lw, 4, read_write)	
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER

#ifndef DISABLE_TX2SCHED_FEEDBACK
	// since no complete packet is sent with this thread, clear the signal bit
	// related to the reflect write in sig_mask_1
	_sphy_mphy4_tx_clear_signal(sig_mask_1, sig_reflect_write)
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK

	// reset p_sop_flag in queue entry
	alu_shf[*l$index0[ABD_0_OFFSET], *l$index0[ABD_0_OFFSET], and~, 1, \
															<<SOP_BIT_LOC]

// following .set used to eliminate assembling warning
.set $dl_meta0 $dl_meta1 $dl_meta2 $dl_meta3 $dl_meta4 enq_port   
	// check whether to read secondary BD 
    br_bclr[*l$index0[ABD_0_OFFSET], RD_NBD_BIT_LOC, \
							sop_only_non_critical_rd_sbd_done#]

	 // read secondary buffer meta data from SRAM 
	_sphy_mphy4_tx_read_sb_meta_data_from_sram($dl_meta, sig_mask_1, \
														*l$index0[SBD_0_OFFSET]) 
	// reset the "read secondary buffer meta data" bit 
	alu_shf[*l$index0[ABD_0_OFFSET], *l$index0[ABD_0_OFFSET], and~, 1, \
															<<RD_NBD_BIT_LOC]
	// set read secondary BD flag in exe_stat_flag for use in phase 2
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<READ_SND_BD_FLAG_BIT]	; set flag

	// check whether this port should be blocked from transmission for any 
	// following threads which serve the same port in the same run, to prevent
	// them from accessing the secondary buffer meta data (which is not ready 
	// yet in this phase, but will be ready in next phase) when it becomes the
	// active buffer.
	// calculate the max bytes to consume before secondary buffer meta data 
	// need to be ready
	alu_shf[tmp, --, b, TBUF_ELE_SIZE, <<ME_THREAD_NUM_SHFT] ; max bytes required
	alu[--, in_payload, -, tmp]			; compare with how much left
	bgt[sop_only_non_critical_rd_sbd_done#]	; enough payload left, no blocking
	// set *l$index1[GLOBAL_NOT_LOCK_SNDB_RD_FLAG_INDEX] to 0 to prevent any 
	// threads handling the same port in the same run from accessing the 
	// secondary buffer data (which is not ready yet) when it becomes the 
	// active buffer. The flag is set back to 1 in phase 2, once the secondary
	// meta data has been saved.
	alu[*l$index1[GLOBAL_NOT_LOCK_SNDB_RD_FLAG_INDEX], --, B, 0]

sop_only_non_critical_rd_sbd_done#:

  	// get sop_bd (low 3 bytes of the ABD_0 word) for use in phase 2
    ld_field_w_clr[sop_bd, 0111, *l$index0[ABD_0_OFFSET]]

#ifdef ADD_L2_HEADER
	// check whether layer 2 header already exists by checking nexthop_id
#ifdef ETHERNET_TX
.set $l2_entry_lw0 $l2_entry_lw1 $l2_entry_lw2 $l2_entry_lw3
#endif /* ETHERNET_TX */
.begin
.reg tmp_yz
	move[tmp_yz, L2_HEADER_ALREADY_EXIST_ID]
    alu[--, *l$index0[NEXTHOPID_OFFSET], -, tmp_yz]
	beq[sop_only_non_critical_check_l2h_exist_done#]

#ifdef ETHERNET_TX
//.set $l2_entry_lw0 $l2_entry_lw1 $l2_entry_lw2 $l2_entry_lw3
	_sphy_mphy4_tx_read_l2_table_entry(sig_mask_1, l2_table_base, \
											*l$index0[NEXTHOPID_OFFSET])
#endif // ETHERNET_TX
	// set adding l2 header flag in exe_stat_flag 
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, \
						<<NEED_PREPEND_L2_HEADER_FLAG_BIT] 

sop_only_non_critical_check_l2h_exist_done#:
.end // tmp_yz
#endif // ADD_L2_HEADER

	.set sop_paylo	
	// now consider the restriction that the sum of the prepend length and 
	// payload length must be a multiple of the device bus width.
	// sop_paylo is derived from the in_offset_mod_8 parameter so that it is
	// consistent with the value handed to the tbuf move and tcw0 setup in 
	// phase 2.
#ifndef ADD_L2_HEADER
	alu[sop_paylo, TBUF_ELE_SIZE, -, in_offset_mod_8]
	alu[sop_paylo, sop_paylo, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]
#else //#ifndef ADD_L2_HEADER
#ifdef POS_TX
.begin
.reg allow_total
	alu[sop_paylo, TBUF_ELE_SIZE_MINUS_8, -, in_offset_mod_8]
	// round (payload + L2 header) down with the bus width mask, then take 
	// the L2 header size back out
	alu[allow_total, sop_paylo, +, L2_HEADER_SIZE]
	alu[allow_total, allow_total, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]
	alu[sop_paylo, allow_total, -, L2_HEADER_SIZE]
.end //allow_total
#endif // POS_TX
#ifdef ETHERNET_TX
.begin
.reg allow_total
	alu[sop_paylo, TBUF_ELE_SIZE_MINUS_16, -, in_offset_mod_8]
	// round (payload + L2 header) down with the bus width mask, then take 
	// the L2 header size back out
	alu[allow_total, sop_paylo, +, L2_HEADER_SIZE]
	alu[allow_total, allow_total, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]
	alu[sop_paylo, allow_total, -, L2_HEADER_SIZE]
.end //allow_total
#endif // ETHERNET_TX
#endif //#ifndef ADD_L2_HEADER
	
	// update ab_paylo_rmnd and ab_offset_rpaylo in queue entry with 
	// sop_only_non_critical_paylo [3]
	_sphy_mphy4_tx_update_ab_paylo_rmnd_and_offset_rpaylo(sop_paylo)

 	// swap out to wait signals specified in sig_mask_1
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_1] 	; set csr
	ctx_arb[--]	, defer[2]					
		// tbuf_ele_1 already set, use defer slot to update the next available
		// tbuf element in local memory to save cycles [2]
		_sphy_mphy4_tx_update_next_availble_tbuf_ele()

	// following .io_complete is used to eliminate assembling warning due to the
	// usage of "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig2_next_context
	.io_completed sig_scratch_access_1
	.io_completed sig_msf_access_3
	.io_completed sig_sram_read_1
#ifdef ADD_L2_HEADER
#ifdef ETHERNET_TX
	.io_completed sig_sram_read_2
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER

// end of phase 1 with sop_only non critical mpkt ready for tx


#ifdef ADD_L2_HEADER		
#ifdef POS_TX
	// allocate 1 write xfer register for writing PPP header 
	xbuf_alloc($prepend_w, 1, write)	
#endif // POS_TX
#endif // ADD_L2_HEADER
// beginning of phase 2 with sop_only mpkt ready for tx	
	// wake up next thread in beginning of phase 2 to reduce wakeup latency
	local_csr_wr[SAME_ME_SIGNAL, sig2_next_context_gpr] 

	alu[sig_mask_2, --, b, sigmask_phs2_default]			; initialize sig_mask_2

	// workaround for HW-bug#1249, copy channel # into tcw1 (reserved in PRM)
	alu_shf[$w1, --, b, deq_port]

	// tbuf_ele_1 already got in phase 1 and next_avail_tbuf_ele already 
	// updated in defer slots of phase 1, now get this tbuf_ele address 
	// (addr_of_tbuf_1) 
	_sphy_mphy4_tx_get_tbuf_addr(addr_of_tbuf_1, tbuf_ele_1)

	// check tbuf full to avoid overwriting tbuf [8]
	_sphy_mphy4_tx_not_seop_check_tbuf_full_and_wait_not_full(tbuf_ele_1)

   	// setup tcw0_1 base for sop_only case
	alu_shf[tcw0_1, --, B, 1, <<TCW0_SOP_BIT_LOC]	; tcw0 base for sop only mpacket

#ifndef ADD_L2_HEADER

	// dram access takes longer latency, so do it first
	// move payload to tbuf
	_sphy_mphy4_tx_move_sop_paylo_to_tbuf(indir_ref, dram_addr, \
			sop_bd, addr_of_tbuf_1, indiref_base, in_offset, sop_paylo, \
			in_offset_mod_8, sig_dram_access_1)

#else /* ADD_L2_HEADER */

	// the flag was set in phase 1 only when the nexthop id differs from
	// L2_HEADER_ALREADY_EXIST_ID; when clear, take the plain payload move
	// placed after the end of phase 3
	br_bclr[exe_stat_flag, NEED_PREPEND_L2_HEADER_FLAG_BIT, \
							sop_only_non_critical_no_l2_header_move_paylo_to_tbuf#]

	// dram access takes longer latency, so do it first
	// move payload to tbuf
	_sphy_mphy4_tx_move_sop_paylo_to_tbuf_with_l2_hdr_space(indir_ref, \
			dram_addr, sop_bd, addr_of_tbuf_1, indiref_base, in_offset, \
			sop_paylo, in_offset_mod_8, \
			sig_dram_access_1)

	// add necessary prepend info in tcw0
	ld_field[tcw0_1, 0100, TCW0_SOP_PREPEND_BYTE, <<TCWO_W1_SHF]

	_sphy_mphy4_tx_set_signal[sig_mask_2, sig_msf_access_1]	; add sig_msf_access_1
							 
#ifdef POS_TX
    // write  PPP header for SOP mpacket 
.begin
.reg tmp_mpls_nexthopid tmp_ppp_mpls_protocol
	move(tmp_mpls_nexthopid, NEXT_HOP_ID_MPLS)
	alu[--, *l$index0[NEXTHOPID_OFFSET], -, tmp_mpls_nexthopid]
	// the 3 defer slots always execute: they preload the IP protocol value
	// and build the MPLS protocol value. When the nexthop id is not the MPLS
	// one the branch is taken and the IP value stays; otherwise the 
	// fall-through instruction overwrites it with the MPLS value.
	bne[sop_only_non_critical_ppp_hdr_selection_done#], defer[3]
		alu_shf[$prepend_w0, --, b, PPP_IP_PROTOCOL, <<IP_PROTOCOL_SHFT]
		alu[tmp_ppp_mpls_protocol, --, B, 0]
		immed_w0[tmp_ppp_mpls_protocol, PPP_MPLS_PROTOCOL]
	alu_shf[$prepend_w0, --, b, tmp_ppp_mpls_protocol, <<IP_PROTOCOL_SHFT]
 sop_only_non_critical_ppp_hdr_selection_done#:
 .end
   // write PPP header to tbuf
	msf[write, $prepend_w0, addr_of_tbuf_1, 0, 1], sig_done[sig_msf_access_1]
#endif // POS_TX
#ifdef ETHERNET_TX
	// not valid L2 table entry case has already been handled in ethernet_arp 
	// microblock, so here all L2 table entry are valid.
	// copy the 4 long words of the L2 table entry from the read side to the 
	// write side of the read_write xfer registers (one copy per register)
	alu[$l2_entry_lw0, --, b, $l2_entry_lw0]
	alu[$l2_entry_lw1, --, b, $l2_entry_lw1]
	alu[$l2_entry_lw2, --, b, $l2_entry_lw2]
	alu[$l2_entry_lw3, --, b, $l2_entry_lw3]

    // write Ethernet L2 header for SOP mpacket
    // write Ethernet L2 header to tbuf
	msf[write, $l2_entry_lw0, addr_of_tbuf_1, 0, 4], sig_done[sig_msf_access_1]
#endif // ETHERNET_TX

sop_only_non_critical_add_l2_header_done#:
#endif //#ifndef ADD_L2_HEADER
	// set payload_length, payload_offset, and channel fields in tcw0
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset_chnl(tcw0_1, sop_paylo, \
												in_offset_mod_8, deq_port)

	// check whether we need to save secondary buffer meta data 
	// if didn't read meta data in phase 1, just branch out
	br_bclr[exe_stat_flag, READ_SND_BD_FLAG_BIT, sop_only_non_critical_save_sbd_meta_done#]
	 
	// meta data for secondary buffer is ready, move them to local memory
	_sphy_mphy4_tx_save_sbd_meta_to_lm() 
	// this port may have been blocked in phase 1 to prevent any following 
	// threads (which serve this port) in the same run from accessing the 
	// secondary buffer data when it becomes active buffer; now the secondary
	// meta data is saved, put this port back
	alu[*l$index1[GLOBAL_NOT_LOCK_SNDB_RD_FLAG_INDEX], --, B, 1]	; unblock

sop_only_non_critical_save_sbd_meta_done#:

	// now, check whether a valid tx request was obtained by the scratch read 
	// from the scratch ring in phase 1. If there is a valid tx request, 
	// enqueue that packet to the queue of that port 

// following .set used to eliminate assembling warning
.set $dl_meta0 $dl_meta1 $dl_meta2 $dl_meta3 $dl_meta4
	// didn't get tx request due to queue full
	br_bclr[exe_stat_flag, GET_TX_REQUEST_FLAG_BIT, \
		sop_only_not_critical_no_tx_request_read#] 
	// check whether there is null requests in scratch ring
	alu[--, $tx_request_lw0, -, 0x0]		; valid tx request
	// no valid tx request, not worst case, can take branchout penalty
	beq[sop_only_non_critical_no_valid_tx_request#]	; no valid tx request
	
    // read SOP meta data, read earlier to reduce latency 
	// read meta data from SRAM by using the BD pointer in tx request
 	_sphy_mphy4_tx_read_meta_data_from_sram($dl_meta, $tx_request_lw0) 
	// set sop meta read flag in exe_stat_flag 
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<READ_SOP_BD_FLAG_BIT] 

	// set local memory index 0 to queue entry at queue tail of 
	// that port and enqueue this packet 
	_sphy_mphy4_tx_set_lmindex0_to_queue_tail(port_entry_turnaround)

sop_only_non_critical_end_of_phase_2#:
	// end of phase 2
	// swap out to wait for dram and msf access complete, and wait for my turn
	// also calculate address of TBUF_ELEMENT_CONTROL_V# for tbuf_element for
	// later use to save 2 instruction cycles in ctx_arb defer slots
	// swap out to wait signals specified in sig_mask_2
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_2]	; set active_ctx_wakeup_events
														; csr
	ctx_arb[--]	, defer[2]								; ctx_arb on sig_mask
#ifdef IXP2800
		alu_shf[tmp, --, B, tbuf_ele_1, <<ELE_2_TX_CNTRL_ADDR]
#else // #ifdef IXP2800
		alu_shf[tmp, --, B, tbuf_ele_1, <<3]
#endif // #ifdef IXP2800
		alu[addr_of_tx_control_1, tmp, +, *l$index1[GLOBAL_CTW_VD_BASE_INDEX]]
	
	// following .io_complete is used to eliminate assembling warning due to the
	// usage of "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig3_next_context
	.io_completed sig_dram_access_1
	.io_completed sig_sram_read_1
	.io_completed sig_msf_access_1
#ifdef DEBUG_TX_PENDING_LOCKED
	.io_completed sig_sram_write_1 	
#endif //#ifdef DEBUG_TX_PENDING_LOCKED

#ifdef ADD_L2_HEADER		
#ifdef POS_TX
	// free 1 write xfer register for writing PPP header 
	xbuf_free($prepend_w)	
#endif // POS_TX
#endif // ADD_L2_HEADER

// end of phase 2 in sop_only mpacket path	


// beginning of phase 3	in sop_only mpacket path
	// wake up next thread in the very beginning of phase 3 to reduce wakeup 
	// latency
	local_csr_wr[SAME_ME_SIGNAL, sig3_next_context_gpr]

    // write transmit control word (TCW) to TBUF_ELEMENT_CTRL_V_# corresponding
	// the tbuf_element to initiate transmission
	// ($w1 was loaded with deq_port at the start of phase 2)
	alu[$w0, --, b, tcw0_1]					; element transmit control word
	msf[write, $w0, addr_of_tx_control_1, 0, 2], sig_done[sig_msf_access_1]

	_sphy_mphy4_tx_debug_incr_counter[@pkt_tx_num_tbufs_txed]

#ifdef COUNTERS
	// update counters in sram (9) 	
	_sphy_mphy4_tx_update_counters_sop_only($byte_cnt, tcw0_1, \
												sig_counter_update)					
#endif //#ifdef	COUNTERS

	// if didn't read meta data in phase 2, just branch out
	br_bclr[exe_stat_flag, READ_SOP_BD_FLAG_BIT, sop_only_non_critical_cont_1#] 

	// meta data for sop is ready, move them to local memory
	_sphy_mphy4_tx_save_sop_meta_to_lm($tx_request_lw0) 


sop_only_non_critical_cont_1#:
	// the two defer slots below re-initialize exe_stat_flag and sig_mask_1
	// for the next pass through PHASE1_START_LABEL
#ifndef COUNTERS
    // swap out to wait for  msf access to complete and my turn
	ctx_arb[sig_msf_access_1, sig1_next_context], defer[2], br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]		; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default]

#else // #ifndef COUNTERS
    // swap out to wait for  msf access and sram access to complete and my turn
	ctx_arb[sig_msf_access_1, sig_counter_update, sig1_next_context], \
			defer[2], br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]		; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default]

#endif //#ifndef COUNTERS
// end of phase 3 in the sop_only_non_critical mpacket path

///////////////////////////////////////////////////////////////////////////////
// out-of-line branch targets of phase 2
///////////////////////////////////////////////////////////////////////////////
		
	// no SRAM meta data read was issued in phase 2, so do not wait for its
	// signal at the end of phase 2
sop_only_not_critical_no_tx_request_read#:
sop_only_non_critical_no_valid_tx_request#:
	_sphy_mphy4_tx_clear_signal(sig_mask_2, sig_sram_read_1)	; clear sig_sram_read_1
	br[sop_only_non_critical_end_of_phase_2#]

#ifdef ADD_L2_HEADER

sop_only_non_critical_no_l2_header_move_paylo_to_tbuf#:
	// dram access takes longer latency, so do it first
	// move payload to tbuf
	_sphy_mphy4_tx_move_sop_paylo_to_tbuf(indir_ref, dram_addr, sop_bd, addr_of_tbuf_1, \
			indiref_base, in_offset, sop_paylo, in_offset_mod_8, \
			sig_dram_access_1)
	br[sop_only_non_critical_add_l2_header_done#]

	// NOTE(review): nothing executable follows the branch above; the 
	// xbuf_free lines below presumably only release the transfer register 
	// names at assembly time - TODO confirm
#ifdef ETHERNET_TX
	// free 4 read_write xfer registers for l2_table entry operation 
	xbuf_free($l2_entry_lw)	
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER		

	// free 5 read xfer registers for reading in the meta data
	xbuf_free[$dl_meta]	

.end // sig_mask_2 sop_bd rmnd_ofset enq_port sop_paylo tmp

// end of sop_only_non_critical mpacket processing code
#endm // end of #macro _sphy_mphy4_tx_sop_only_non_critical_mpkt()

///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// 	_sphy_mphy4_tx_ethernet_critical_pkt()
//
// Description: For Ethernet, a 129-byte packet is the critical case. If the
//				packet cannot be fitted into one mpacket but is shorter than 
//				240 bytes, it can always be transmitted in two mpackets (since
//				offset_mod_8 is always less than 8): the first mpacket 
//				transmits a fixed 120 bytes and the second mpacket transmits
//				the remaining bytes
//
// Outputs:					
//							None
//
// Inputs:  
//		in_payload			payload length
//      in_offset			payload offset
//		in_offset_mod_8		remainder of offset divided by 8
//
// Constants:
//						 	None 
//
//	Labels:
//		PHASE1_START_LABEL	symbolic address for start of infinite TX 
//							processing loop
//
///////////////////////////////////////////////////////////////////////////////
#macro _sphy_mphy4_tx_ethernet_critical_pkt(in_payload, in_offset, \
		in_offset_mod_8, PHASE1_START_LABEL)

// Overview (three phases separated by context swaps):
//	phase 1 (tail): get sop_bd, optionally start the read of the L2 table
//			entry, advance the queue head, notify the scheduler, reserve a
//			second tbuf element (tbuf_ele_2), then swap out on sig_mask_1
//	phase 2: start the payload moves for both mpackets, build tcw0_1 (SOP)
//			and tcw0_2 (EOP), start the meta data read for a newly received
//			tx request, then swap out on sig_mask_2
//	phase 3: write both transmit control words, free the buffer, save the
//			SOP meta data of the new tx request, then swap out and branch 
//			to PHASE1_START_LABEL
//
// Side effects on the arguments: in_offset is advanced by the size of the
// first mpacket payload, and (with ADD_L2_HEADER) in_offset_mod_8 is 
// recomputed for the second mpacket.
//
// Besides its parameters, this macro reads and writes registers and signals
// that are not declared here and must exist in the enclosing scope (for
// example sig_mask_1, exe_stat_flag, deq_port, tbuf_ele_1, addr_of_tbuf_1,
// addr_of_tx_control_1, tcw0_1, $w0, $w1, $tx_request_lw0).

// beginning of ethernet_critical_pkt mpacket processing code
// ethernet_critical_pkt path in middle of phase 1
 .begin
// NOTE(review): rmnd_ofset and enq_port are not referenced directly in this
// macro body; they are kept because helper macros expanded below may rely on
// them - TODO confirm
.reg sig_mask_2 sop_bd rmnd_ofset enq_port sop_paylo tmp
.reg indir_ref_2 dram_addr_2
.reg tbuf_ele_2				; tbuf element number for mpacket 2
.reg addr_of_tbuf_2			; address of tbuf for mpkt 2
.reg addr_of_tx_control_2	; address of TBUF_ELEMENT_CTRL_V_# for mpkt 2
.reg tcw0_2					; mpkt 2 transmit control word 0
.reg $w2 $w3
.xfer_order $w2 $w3

	// allocate 5 read xfer registers for reading in the meta data 
	xbuf_alloc($dl_meta, 5, read)	

#ifdef ADD_L2_HEADER		
#ifdef ETHERNET_TX
	// allocate 4 read_write xfer registers for l2_table entry operation 
	xbuf_alloc($l2_entry_lw, 4, read_write)	
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER

  	// get sop_bd (low 3 bytes of the ABD_0 word)
    ld_field_w_clr[sop_bd, 0111, *l$index0[ABD_0_OFFSET]]
	
#ifdef ADD_L2_HEADER
	// check whether layer 2 header already exists by checking nexthop_id
#ifdef ETHERNET_TX
.set $l2_entry_lw0 $l2_entry_lw1 $l2_entry_lw2 $l2_entry_lw3
#endif /* ETHERNET_TX */
.begin
.reg tmp_yz
	move[tmp_yz, L2_HEADER_ALREADY_EXIST_ID]
    alu[--, *l$index0[NEXTHOPID_OFFSET], -, tmp_yz]
	beq[ethernet_critical_pkt_check_l2h_exist_done#]

#ifdef ETHERNET_TX
//.set $l2_entry_lw0 $l2_entry_lw1 $l2_entry_lw2 $l2_entry_lw3
	_sphy_mphy4_tx_read_l2_table_entry(sig_mask_1, l2_table_base, \
											*l$index0[NEXTHOPID_OFFSET])
#endif // ETHERNET_TX
	// set adding l2 header flag in exe_stat_flag 
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, \
						<<NEED_PREPEND_L2_HEADER_FLAG_BIT] 

ethernet_critical_pkt_check_l2h_exist_done#:
.end // tmp_yz
#endif // ADD_L2_HEADER	
	// advanced queue head, so next packet in the queue of this port will be 
	// processed when this port is selected again [6]
	_sphy_mphy4_tx_advance_queue_head(port_entry_turnaround)

#ifndef DISABLE_TX2SCHED_FEEDBACK
	// notify scheduler that one packet has been transmitted from this port [10]
	// 2 instructions for SPHY_1X32 mode, and 10 instructions for other modes
	_sphy_mphy4_tx_notify_scheduler($txed_port, deq_port)
	// swap out to wait scratch access, reflect write, msf read to complete 
	// and my turn
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK

	// tbuf_ele_1 already set, updating next available tbuf element in local 
	// memory	
	_sphy_mphy4_tx_update_next_availble_tbuf_ele()
	// allocating tbuf_ele_2 (the element made available by the update above)
 	alu[tbuf_ele_2, --, b, *l$index1[GLOBAL_AVAIL_TBUF_ELEMENT_INDEX]]

 	// swap out to wait signals specified in sig_mask_1
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_1] 	; set csr
	ctx_arb[--]	, defer[2]					
		// after allocating tbuf_ele_2 in phase 1,
		// use defer slots to update the next available tbuf element in local
		// memory to save cycles [2]
		_sphy_mphy4_tx_update_next_availble_tbuf_ele()

	// following .io_complete is used to eliminate assembling warning due to the
	// usage of "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig2_next_context
#ifndef DISABLE_TX2SCHED_FEEDBACK
	.io_completed sig_reflect_write
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK
	.io_completed sig_msf_access_3
	.io_completed sig_scratch_access_1
#ifdef ADD_L2_HEADER
#ifdef ETHERNET_TX
	.io_completed sig_sram_read_2
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER

// end of phase 1 in ethernet_critical_pkt path


// beginning of phase 2 in ethernet_critical_pkt path

#ifdef ADD_L2_HEADER
#ifdef POS_TX
	// allocate 1 write xfer registers for writing L2 header 
	xbuf_alloc($prepend_w, 1, write)	
#endif // POS_TX
#endif // ADD_L2_HEADER

	// wake up next thread in beginning of phase 2 to reduce wakeup latency
	local_csr_wr[SAME_ME_SIGNAL, sig2_next_context_gpr] 

	alu[sig_mask_2, --, b, sigmask_phs2_default]			; initialize sig_mask_2

	// workaround for HW-bug#1249, copy channel # into tcw1 (reserved in PRM)
	alu_shf[$w1, --, b, deq_port]

	// tbuf_ele_1 already got in phase 1, now get this tbuf_ele address 
	// (addr_of_tbuf_1) 
	_sphy_mphy4_tx_get_tbuf_addr(addr_of_tbuf_1, tbuf_ele_1)

	// tbuf_ele_2 already got in phase 1,
	// and next available tbuf element in local memory already updated in defer
	// slots of phase 1, now get this tbuf_ele address (addr_of_tbuf_2) 
	_sphy_mphy4_tx_get_tbuf_addr(addr_of_tbuf_2, tbuf_ele_2)

	// check tbuf full to avoid overwriting tbuf [8]
	_sphy_mphy4_tx_not_seop_check_tbuf_full_and_wait_not_full(tbuf_ele_2)

	.set sop_paylo	
	// payload length of the first mpacket is a fixed, build-time constant
#ifndef ADD_L2_HEADER
	alu[sop_paylo, --, B, TBUF_ELE_SIZE_MINUS_8]
#else //#ifndef ADD_L2_HEADER
#ifdef POS_TX
	alu[sop_paylo, --, B, TBUF_ELE_SIZE_MINUS_18]
#endif // POS_TX
#ifdef ETHERNET_TX
	alu[sop_paylo, --, B, TBUF_ELE_SIZE_MINUS_26]
#endif // ETHERNET_TX
#endif //#ifndef ADD_L2_HEADER

   	// setup tcw0_1 base for the first (SOP) mpacket
	alu_shf[tcw0_1, --, B, 1, <<TCW0_SOP_BIT_LOC]	; tcw0 base for first mpacket

#ifndef ADD_L2_HEADER
	// dram access takes longer latency, so do it first
	// move data to tbuf for first mpkt [9]
	_sphy_mphy4_tx_move_sop_paylo_to_tbuf(indir_ref, dram_addr, sop_bd, \
				addr_of_tbuf_1, indiref_base, in_offset, sop_paylo, \
				in_offset_mod_8, sig_dram_access_1) 

#else /*  ADD_L2_HEADER */
	// the flag was set in phase 1 only when the nexthop id differs from
	// L2_HEADER_ALREADY_EXIST_ID; when clear, take the plain payload move
	// placed after the end of phase 3
	br_bclr[exe_stat_flag, NEED_PREPEND_L2_HEADER_FLAG_BIT, \
							ethernet_critical_pkt_no_l2_header_move_paylo_to_tbuf#]

	// move data to tbuf for first mpkt [9]
	_sphy_mphy4_tx_move_sop_paylo_to_tbuf_with_l2_hdr_space(indir_ref, \
				dram_addr, sop_bd, addr_of_tbuf_1, indiref_base, in_offset, \
				sop_paylo, in_offset_mod_8, sig_dram_access_1) 

	// add necessary prepend info in tcw0
	ld_field[tcw0_1, 0100, TCW0_SOP_PREPEND_BYTE, <<TCWO_W1_SHF]							 

	_sphy_mphy4_tx_set_signal[sig_mask_2, sig_msf_access_1]	; add sig_msf_access_1

#ifdef POS_TX
    // write  PPP header for SOP mpacket
.begin
.reg tmp_mpls_nexthopid tmp_ppp_mpls_protocol
	move(tmp_mpls_nexthopid, NEXT_HOP_ID_MPLS)
	alu[--, *l$index0[NEXTHOPID_OFFSET], -, tmp_mpls_nexthopid]
	// the 3 defer slots always execute: they preload the IP protocol value
	// and build the MPLS protocol value. When the nexthop id is not the MPLS
	// one the branch is taken and the IP value stays; otherwise the 
	// fall-through instruction overwrites it with the MPLS value.
	bne[ethernet_critical_ppp_hdr_selection_done#], defer[3]
		alu_shf[$prepend_w0, --, b, PPP_IP_PROTOCOL, <<IP_PROTOCOL_SHFT]
		alu[tmp_ppp_mpls_protocol, --, B, 0]
		immed_w0[tmp_ppp_mpls_protocol, PPP_MPLS_PROTOCOL]
	alu_shf[$prepend_w0, --, b, tmp_ppp_mpls_protocol, <<IP_PROTOCOL_SHFT]
 ethernet_critical_ppp_hdr_selection_done#:
 .end

    // write PPP header to tbuf
	msf[write, $prepend_w0, addr_of_tbuf_1, 0, 1], sig_done[sig_msf_access_1]
#endif // POS_TX

#ifdef ETHERNET_TX

	// not valid L2 table entry case has already been handled in ethernet_arp 
	// microblock, so here all L2 table entry are valid.
	// copy the 4 long words of the L2 table entry from the read side to the 
	// write side of the read_write xfer registers (one copy per register)
	alu[$l2_entry_lw0, --, b, $l2_entry_lw0]
	alu[$l2_entry_lw1, --, b, $l2_entry_lw1]
	alu[$l2_entry_lw2, --, b, $l2_entry_lw2]
	alu[$l2_entry_lw3, --, b, $l2_entry_lw3]

    // write Ethernet L2 header for SOP mpacket
    // write Ethernet L2 header to tbuf
	msf[write, $l2_entry_lw0, addr_of_tbuf_1, 0, 4], sig_done[sig_msf_access_1]
#endif // ETHERNET_TX

ethernet_critical_pkt_add_l2_header_done#:
#endif //ADD_L2_HEADER

	// set payload_length, payload_offset, and channel fields in tcw0 [3]
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset_chnl(tcw0_1, sop_paylo, \
												in_offset_mod_8, deq_port)

   	// setup tcw0_2: sop_paylo is reused to hold the payload length of the 
	// second mpacket (total payload minus what the first mpacket carries)
#ifndef ADD_L2_HEADER
	alu[sop_paylo, in_payload, -, TBUF_ELE_SIZE_MINUS_8] 
#else //#ifndef ADD_L2_HEADER
#ifdef POS_TX
	alu[sop_paylo, in_payload, -, TBUF_ELE_SIZE_MINUS_18]
	// update in_offset_mod_8 for the second mpacket: 
	// (old offset_mod_8 + first mpacket payload) mod 8
	alu[in_offset_mod_8, in_offset_mod_8, +, TBUF_ELE_SIZE_MINUS_18]
	alu[in_offset_mod_8, in_offset_mod_8, and, 0x7] ; update in_offset_mod_8
#endif // POS_TX
#ifdef ETHERNET_TX
	alu[sop_paylo, in_payload, -, TBUF_ELE_SIZE_MINUS_26]
	// update in_offset_mod_8 for the second mpacket: 
	// (old offset_mod_8 + first mpacket payload) mod 8
	alu[in_offset_mod_8, in_offset_mod_8, +, TBUF_ELE_SIZE_MINUS_26]
	alu[in_offset_mod_8, in_offset_mod_8, and, 0x7] ; update in_offset_mod_8
#endif // ETHERNET_TX
#endif //#ifndef ADD_L2_HEADER
	alu_shf[tcw0_2, --, B, 1, <<TCW0_EOP_BIT_LOC]	; tcw0 base for tcw0_2
	// set payload_length, payload_offset, and channel fields in tcw0 [3]
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset_chnl(tcw0_2, sop_paylo, \
												in_offset_mod_8, deq_port)

	// advance in_offset past the bytes carried by the first mpacket
#ifndef ADD_L2_HEADER
	alu[in_offset, in_offset, +, TBUF_ELE_SIZE_MINUS_8]
#else //#ifndef ADD_L2_HEADER
#ifdef POS_TX
	alu[in_offset, in_offset, +, TBUF_ELE_SIZE_MINUS_18]
#endif // POS_TX
#ifdef ETHERNET_TX
	alu[in_offset, in_offset, +, TBUF_ELE_SIZE_MINUS_26]
#endif // ETHERNET_TX
#endif //#ifndef ADD_L2_HEADER
	// 	move data to tbuf for second mpkt, payload offset and payload length can 
	// be retrieved from tcw0_2  [11]
	_sphy_mphy4_tx_move_not_sop_paylo_to_tbuf(indir_ref_2, dram_addr_2, sop_bd, \
		addr_of_tbuf_2, indiref_base, in_offset, tcw0_2, sig_dram_access_2) 
	_sphy_mphy4_tx_set_signal(sig_mask_2, sig_dram_access_2)	; add sig_dram_access_2
	_sphy_mphy4_tx_set_push_signal(sig_mask_2, sig_dram_access_2)	; add push signal

	// now, check whether a valid tx request was obtained by the scratch read 
	// from the scratch ring in phase 1. If there is a valid tx request, 
	// enqueue that packet to the queue of that port 

// following .set used to eliminate assembling warning
.set $dl_meta0 $dl_meta1 $dl_meta2 $dl_meta3 $dl_meta4
	// didn't get tx request due to queue full
	br_bclr[exe_stat_flag, GET_TX_REQUEST_FLAG_BIT, \
		ethernet_critical_pkt_no_tx_request_read#] 
	// check whether there is null requests in scratch ring
	alu[--, $tx_request_lw0, -, 0x0]		; valid tx request
	// no valid tx request, not worst case, can take branchout penalty
	beq[ethernet_critical_pkt_no_valid_tx_request#]	; no valid tx request
	
    // read SOP meta data, read earlier to reduce latency 
	// read meta data from SRAM by using the BD pointer in tx request
 	_sphy_mphy4_tx_read_meta_data_from_sram($dl_meta, $tx_request_lw0) 
	// set sop meta read flag in exe_stat_flag 
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<READ_SOP_BD_FLAG_BIT] 

	// set local memory index 0 to queue entry at queue tail of 
	// that port and enqueue this packet 
	_sphy_mphy4_tx_set_lmindex0_to_queue_tail(port_entry_turnaround)

ethernet_critical_end_of_phase_2#:
	// end of phase 2
	// swap out to wait for dram and msf access complete, and wait for my turn
	// also calculate address of TBUF_ELEMENT_CONTROL_V# for tbuf_element for
	// later use to save 2 instruction cycles in ctx_arb defer slots
	// swap out to wait signals specified in sig_mask_2
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_2]	; set active_ctx_wakeup_events
														; csr
	ctx_arb[--]	, defer[2]								; ctx_arb on sig_mask
#ifdef IXP2800
		alu_shf[tmp, --, B, tbuf_ele_1, <<ELE_2_TX_CNTRL_ADDR]
#else // #ifdef IXP2800
		alu_shf[tmp, --, B, tbuf_ele_1, <<3]
#endif // #ifdef IXP2800
		alu[addr_of_tx_control_1, tmp, +, *l$index1[GLOBAL_CTW_VD_BASE_INDEX]]
	
	// following .io_complete is used to eliminate assembling warning due to the
	// usage of "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig3_next_context
	.io_completed sig_dram_access_1
	.io_completed sig_sram_read_1
	.io_completed sig_dram_access_2
	.io_completed sig_msf_access_1
#ifdef DEBUG_TX_PENDING_LOCKED
	.io_completed sig_sram_write_1 	
#endif //#ifdef DEBUG_TX_PENDING_LOCKED

#ifdef ADD_L2_HEADER
#ifdef POS_TX
	// free 1 write xfer register for writing L2 header 
	xbuf_free($prepend_w)
#endif // POS_TX
#endif // ADD_L2_HEADER

// end of phase 2 in ethernet_critical_pkt path	


// beginning of phase 3	in ethernet_critical_pkt path
	// wake up next thread in the very beginning of phase 3 to reduce wakeup 
	// latency
	local_csr_wr[SAME_ME_SIGNAL, sig3_next_context_gpr]

	// write tcw for first mpkt
    // write transmit control word (TCW) to TBUF_ELEMENT_CTRL_V_# corresponding
	// the tbuf_element to initiate transmission
	// ($w1 was loaded with deq_port at the start of phase 2)
	alu[$w0, --, b, tcw0_1]					; element transmit control word
	msf[write, $w0, addr_of_tx_control_1, 0, 2], sig_done[sig_msf_access_1]

	_sphy_mphy4_tx_debug_incr_counter[@pkt_tx_num_tbufs_txed]

	// address of TBUF_ELEMENT_CTRL_V_# for the second mpacket, computed the
	// same way as addr_of_tx_control_1 at the end of phase 2
#ifdef IXP2800
	alu_shf[tmp, --, B, tbuf_ele_2, <<ELE_2_TX_CNTRL_ADDR]
#else // #ifdef IXP2800
	alu_shf[tmp, --, B, tbuf_ele_2, <<3]
#endif // #ifdef IXP2800
	alu[addr_of_tx_control_2, tmp, +, *l$index1[GLOBAL_CTW_VD_BASE_INDEX]]

	// write tcw for second mpkt
    // write transmit control word (TCW) to TBUF_ELEMENT_CTRL_V_# corresponding
	// the tbuf_element to initiate transmission
	alu[$w2, --, B, tcw0_2]					; element transmit control word
	// workaround for HW-bug#1249, copy channel # into tcw1 (reserved in PRM)
	alu[$w3, --, B, deq_port]
	msf[write, $w2, addr_of_tx_control_2, 0, 2], sig_done[sig_msf_access_2]

	_sphy_mphy4_tx_debug_incr_counter[@pkt_tx_num_tbufs_txed]


#ifdef COUNTERS
	// update counters in sram (12) 	
	_sphy_mphy4_tx_update_counters_ether_critical($byte_cnt, tcw0_1, \
												tcw0_2, sig_counter_update)					
#endif //#ifdef	COUNTERS

	_sphy_mphy4_tx_free_buffer(sop_bd)			; free buffer

	// if didn't read meta data in phase 2, just branch out
	br_bclr[exe_stat_flag, READ_SOP_BD_FLAG_BIT, ethernet_critical_pkt_cont_1#] 

	// meta data for sop is ready, move them to local memory
	_sphy_mphy4_tx_save_sop_meta_to_lm($tx_request_lw0) 

ethernet_critical_pkt_cont_1#:
	// the two defer slots below re-initialize exe_stat_flag and sig_mask_1
	// for the next pass through PHASE1_START_LABEL
#ifndef COUNTERS
    // swap out to wait for  msf access to complete and my turn
	ctx_arb[sig_msf_access_1, sig_msf_access_2, sig1_next_context], \
			defer[2], br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]		; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default]

#else // #ifndef COUNTERS
    // swap out to wait for  msf access and sram access to complete and my turn
	ctx_arb[sig_msf_access_1, sig_msf_access_2, sig_counter_update, \
			sig1_next_context], defer[2], br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]		; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default]

#endif //#ifndef COUNTERS
// end of phase 3 in the ethernet_critical_pkt mpacket path

///////////////////////////////////////////////////////////////////////////////
// out-of-line branch targets of phase 2
///////////////////////////////////////////////////////////////////////////////
	// no SRAM meta data read was issued in phase 2, so do not wait for its
	// signal at the end of phase 2
ethernet_critical_pkt_no_tx_request_read#:
ethernet_critical_pkt_no_valid_tx_request#:
	_sphy_mphy4_tx_clear_signal(sig_mask_2, sig_sram_read_1)	; clear sig_sram_read_1
	br[ethernet_critical_end_of_phase_2#]
		
#ifdef ADD_L2_HEADER

ethernet_critical_pkt_no_l2_header_move_paylo_to_tbuf#:
	// move data to tbuf for first mpkt [9]
	_sphy_mphy4_tx_move_sop_paylo_to_tbuf(indir_ref, dram_addr, sop_bd, \
				addr_of_tbuf_1, indiref_base, in_offset, sop_paylo, \
				in_offset_mod_8, sig_dram_access_1) 
	br[ethernet_critical_pkt_add_l2_header_done#]

	// NOTE(review): nothing executable follows the branch above; the 
	// xbuf_free lines below presumably only release the transfer register 
	// names at assembly time - TODO confirm
#ifdef ETHERNET_TX
	// free 4 read_write xfer registers for l2_table entry operation
	xbuf_free($l2_entry_lw)	
#endif // ETHERNET_TX
#endif // ADD_L2_HEADER

	// free 5 read xfer registers for reading in the meta data
	xbuf_free($dl_meta)	

.end // sig_mask_2 sop_bd rmnd_ofset enq_port sop_paylo tmp
	 // indir_ref_2 dram_addr_2
	 // tbuf_ele_2 addr_of_tbuf_2 addr_of_tx_control_2 tcw0_2	
	 //	$w2 $w3

// end of ethernet_critical_pkt mpacket processing code
#endm // end of #macro 	_sphy_mphy4_tx_ethernet_critical_pkt()


///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
// _sphy_mphy4_tx_not_sop_mpkt()
//
// Description: handle mpacket which is not the first mpacket of a 
//				multiple-mpacket packet
//
// Outputs:					
//							None
//
// Inputs:  
//						 	None 
//
// Constants:
//						 	None 
//
//	Labels:
//		PHASE1_START_LABEL	symbolic address for start of inifinite TX 
//							processing loop
//
///////////////////////////////////////////////////////////////////////////////

#macro _sphy_mphy4_tx_not_sop_mpkt(PHASE1_START_LABEL)
///////////////////////////////////////////////////////////////////////////////
// Transmit one mpacket that is NOT the first mpacket of its packet.
//
// The macro is expanded inside sphy_mphy4_tx() and relies on registers and 
// signals declared by the enclosing scope (sig_mask_1, tcw0_1, deq_port, 
// exe_stat_flag, tbuf_ele_1, addr_of_tbuf_1, addr_of_tx_control_1, $w0, $w1, 
// indir_ref, dram_addr, indiref_base, offset_mod_8, sigmask_phs1_default, 
// $tx_request_lw0). Local memory index 0 addresses the queue entry at the 
// queue head and index 1 addresses the per-port globals.
//
// Processing is split into three phases separated by ctx_arb:
//	phase 1 - decide how many bytes of the active buffer go into this 
//			  mpacket and build tcw0_1; optionally start the secondary
//			  buffer-descriptor read and the leftover DRAM read
//	phase 2 - move prepended leftover and payload into the tbuf element, 
//			  save new leftover / secondary BD meta data to local memory,
//			  start the SOP meta data read for a newly dequeued tx request
//	phase 3 - write the transmit control word, update counters, free the
//			  buffer on end-of-buffer, save SOP meta data, then branch to
//			  PHASE1_START_LABEL
//
// Phase 1 distinguishes four cases:
//	leftover from previous buffer present / absent  
//	x  active buffer is / is not the last buffer of the packet
///////////////////////////////////////////////////////////////////////////////
// beginning of not_sop_mpacket processing code
// now still in phase 1
not_sop_mpkt#:
.begin 
.reg ab_bd lov_to_next_offset rmnd_paylo offset_rpaylo tmp
.reg sig_mask_2 
.reg $$dw0 $$dw1 $$dw2 $$dw3
.xfer_order $$dw0 $$dw1 $$dw2 $$dw3
	// allocate 5 read xfer registers for reading in the meta data 
	xbuf_alloc($dl_meta, 5, read)	
#ifndef DISABLE_TX2SCHED_FEEDBACK
	// a complete packet may not be sent by this thread, so first clear the 
	// signal bit related to the reflect write in sig_mask_1; it is added back
	// below only on the EOP path that actually notifies the scheduler
	_sphy_mphy4_tx_clear_signal(sig_mask_1, sig_reflect_write)
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK

	alu[tcw0_1, --, B, deq_port]	; initialize tcw0_1 to deq_port

// following .set used to eliminate assembling warning
.set $dl_meta0 $dl_meta1 $dl_meta2 $dl_meta3 $dl_meta4
.set $w0 $w1
.set $$dw0 $$dw1 $$dw2 $$dw3
.set lov_to_next_offset

	// rmnd_paylo    = remaining payload field of the active buffer 
	// offset_rpaylo = low 16 bits of the same queue-entry word
	alu_shf[rmnd_paylo, --, b, *l$index0[ABD_1_OFFSET], >>PAYLO_RMND_LOC]
    ld_field_w_clr[offset_rpaylo, 0011, *l$index0[ABD_1_OFFSET]]

	// check whether to read secondary BD 
    br_bclr[*l$index0[ABD_0_OFFSET], RD_NBD_BIT_LOC, not_sop_sbd_rd_done#]

	 // read secondary buffer meta data from SRAM 
	_sphy_mphy4_tx_read_sb_meta_data_from_sram($dl_meta, sig_mask_1, \
													*l$index0[SBD_0_OFFSET]) 

	// reset need to read secondary buffer meta data flag 
	alu_shf[*l$index0[ABD_0_OFFSET], *l$index0[ABD_0_OFFSET], and~, 1, \
			<<RD_NBD_BIT_LOC]
	// set read secondary BD flag in exe_stat_flag for use in phase 2
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<READ_SND_BD_FLAG_BIT]

	// Check whether this port must be blocked from transmission for the 
	// following threads which serve the same port in the same run, to prevent
	// them from accessing the secondary buffer meta data (which is not ready
	// yet in this phase, but will be ready in the next phase) when it becomes
	// the active buffer. Calculate the max bytes that can be consumed before
	// the secondary buffer meta data needs to be ready.

	alu_shf[tmp, --, b, TBUF_ELE_SIZE, <<ME_THREAD_NUM_SHFT] ; max bytes required
	alu[--, rmnd_paylo, -, tmp]			; compare with how much left
	bgt[not_sop_sbd_rd_done#]
	// set *l$index1[GLOBAL_NOT_LOCK_SNDB_RD_FLAG_INDEX] to 0 to prevent any 
	// thread handling the same port in the same run from accessing the 
	// secondary buffer data (which is not ready yet) when it becomes the active
	// buffer; the flag is set back to 1 in phase 2 once the secondary meta 
	// data has been saved
	alu[*l$index1[GLOBAL_NOT_LOCK_SNDB_RD_FLAG_INDEX], --, B, 0]

not_sop_sbd_rd_done#:
  	// get ab_bd and offset_mod_8 for the active buffer (rmnd_paylo and 
	// offset_rpaylo were already extracted above)
    ld_field_w_clr[ab_bd, 0111, *l$index0[ABD_0_OFFSET]] ; get active buffer descriptor
	alu[offset_mod_8, offset_rpaylo, and, CONST_07]
.begin
.reg lpb_paylo
	// check whether there are some leftover bytes (maximum 8 bytes) from the
	// previous buffer which need to be moved to the tbuf at the beginning
	// of this mpacket
	alu[--, *l$index1[GLOBAL_LEFTOVER_FLAG_INDEX], -, NO_LEFTOVER_FLAG]
    beq[not_sop_no_leftover#]
	// yes, there is leftover from previous buffer in the same packet
	// move leftover bytes to $w0 for later use, also clear the flag,
	// so later threads in the same run will not repeat the same thing
    alu[*l$index1[GLOBAL_LEFTOVER_FLAG_INDEX], --, B, NO_LEFTOVER_FLAG] 	; reset
	alu[$w0, --, b, *l$index1[GLOBAL_LEFTOVER_DATA_INDEX]] ; get leftover from previous buffer
	alu[$w1, --, b, 0]						  ; dummy
	// payload leftover will be put into prepend to eliminate any alignment problem
	// also before saved to local memory, byte alignment is done on leftover bytes
	// so make sure the offset of those leftover bytes is 0
	alu[lpb_paylo, --, B, *l$index1[GLOBAL_LEFTOVER_LEN_INDEX]]
	alu_shf[tcw0_1, tcw0_1, or, lpb_paylo, <<TCWO_PREPEND_LEN_LSB_LOC]
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, \
			<<LEFTOV_FROM_PB_FLAG_BIT]				;set flag

	// check whether current mpacket is eop packet
	// first check whether this is a last buffer in a packet
    ld_field_w_clr[tmp, 0111, *l$index0[SBD_0_OFFSET]]
	alu[--, tmp, -, IX_NULL]
	bne[lp_not_last_buffer#]
	// check whether all remaining payload can fit into one tbuf; the leftover
	// already occupies 8 bytes in the tbuf 
	alu[tmp, offset_mod_8, +, rmnd_paylo]
	alu[--, TBUF_ELE_SIZE_MINUS_8, -, tmp]
	blt[lp_last_bp_not_eop_mpkt#]
	// it is EOP case, set EOP bit in tcw0_1
	alu_shf[tcw0_1, tcw0_1, or, 1, <<TCW0_EOP_BIT_SHFT]
	// set payload_length and payload_offset of tcw0_1 with rmnd_paylo and
	// offset_mod_8
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset(tcw0_1, rmnd_paylo, offset_mod_8)
	// eop mpacket is eob mpacket too, need to free buffer at phase 3
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<EOB_MPKT_FLAG_BIT]
	br[not_sop_need_to_advance_queue_head#] 
	
	// following case need to consider device bus width restriction
lp_last_bp_not_eop_mpkt#:
	// calculate the max that can be transmitted from this buffer:
	// the total mpacket payload (including lpb_paylo) must be a multiple of 
	// the device bus width
.begin 
.reg alw_mpkt_paylo	; maximum allowable mpkt payload from active buffer
					; with consideration of multiple of device bus width

	alu[tmp, TBUF_ELE_SIZE_MINUS_8, -, offset_mod_8]
	alu[tmp, tmp, +, lpb_paylo]	;  now offset_mod_8 should be 0
	alu[tmp, tmp, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]	
	alu[alw_mpkt_paylo, tmp, -, lpb_paylo]

	// update ab_paylo_rmnd and ab_offset_rpaylo in queue entry with 
	// alw_mpkt_paylo
	_sphy_mphy4_tx_update_ab_paylo_rmnd_and_offset_rpaylo(alw_mpkt_paylo)
	// set payload_length and payload_offset of tcw0_1 with alw_mpkt_paylo
	// and offset_mod_8
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset(tcw0_1, alw_mpkt_paylo, \
											offset_mod_8)	
	br[not_sop_end_of_phase1#]
.end //	 alw_mpkt_paylo
lp_not_last_buffer#:
	// not the last buffer in the packet, so only check whether this is an 
	// eob (end of buffer) mpacket.
	// calculate the max that can be transmitted from this buffer:
	// the total mpacket payload (including lpb_paylo) must be a multiple of 
	// the device bus width
.begin
.reg alw_mpkt_paylo	; maximum allowable payload from active buffer
					; with consideration of multiple of device bus width
	alu[tmp, TBUF_ELE_SIZE_MINUS_8, -, offset_mod_8]	;  now offset_mod_8 should be 0
	alu[tmp, tmp, +, lpb_paylo]	
	alu[tmp, tmp, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]
	alu[alw_mpkt_paylo, tmp, -, lpb_paylo]

	// if rmnd_paylo is not more than this max allowable byte count, this 
	// mpacket finishes the buffer (eob) 
	alu[--, alw_mpkt_paylo, -, rmnd_paylo]
	blt[lp_not_last_bp_not_eob#]

	// it is eob, set eob bit
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<EOB_MPKT_FLAG_BIT]

.begin
.reg need_to_transmit
.reg allow_to_transmit
	// need_to_transmit  = prepended leftover + everything left in the buffer
	// allow_to_transmit = share of the active buffer that keeps the total
	//					   mpacket payload a multiple of the device bus width
	alu[need_to_transmit, lpb_paylo, +, rmnd_paylo]
	alu[allow_to_transmit, need_to_transmit, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]
	// FIX: subtract the prepend length from the MASKED total. The previous
	// code subtracted it from need_to_transmit, which discarded the masked 
	// value, always yielded rmnd_paylo, and made the save-leftover sequence 
	// below unreachable. This now mirrors the alw_mpkt_paylo computation above.
	// NOTE(review): if lpb_paylo + rmnd_paylo is smaller than one bus word the
	// masked total is 0 and allow_to_transmit becomes negative; that 
	// degenerate case (tiny mid-chain buffer) is not handled here - confirm
	// whether buffer chaining can ever produce it.
	alu[allow_to_transmit, allow_to_transmit, -, lpb_paylo]

	// set payload_length and payload_offset of tcw0_1 with allow_to_transmit
	// and offset_mod_8
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset(tcw0_1, allow_to_transmit, \
									offset_mod_8)

	// check whether some bytes are left over for the next buffer, i.e. 
	// whether they must be read into $$dw0, $$dw1, $$dw2, and $$dw3
	alu[--, rmnd_paylo, -, allow_to_transmit]		
	beq[lp_not_last_buffer_no_need_to_save_leftover#]	; total already a multiple of bus width
	// set flag in exe_stat_flag for later use 
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<SAVE_LEFTOV_TO_LM_FLAG_BIT]
	// get true active buffer BD from ab_bd  
	alu_shf[tmp, --, b, ab_bd, <<2]
    // use following dl_buf_get_data_from_meta macro from dispatch_loop.uc to
	// get dram address at the beginning of the buffer from true_dl_buf_hdl
	// now it takes one instruction
    dl_buf_get_data_from_meta(dram_addr, tmp)	; get beginning dram addr for buffer  
	
	// tmp = byte offset of the first leftover byte inside the buffer
	alu[tmp, offset_rpaylo, +, allow_to_transmit]
	alu[dram_addr, dram_addr, +, tmp]	
	// in some cases the sum of offset_mod_8 + bytes_leftover is over 8 bytes 
	// (for example, offset_mod_8=7, and bytes_leftover is 3), so in order to 
	// cover all cases, read 16 bytes here 
	dram[read, $$dw0, dram_addr, 0, 2], sig_done[sig_dram_access_1]	
	_sphy_mphy4_tx_set_signal(sig_mask_1, sig_dram_access_1)	; add sig_dram_access_1
	// since we use sig_mask and ACTIVE_CTX_WAKEUP_EVENTS csr, we need to add to 
	// wait for the push signal of dram access
	_sphy_mphy4_tx_set_push_signal(sig_mask_1, sig_dram_access_1)	; add push signal

	// this port should be blocked from transmission until $$dw0, $$dw1, 
	// $$dw2, and $$dw3 have arrived and been saved to LM (done in phase 2)
	alu[*l$index1[GLOBAL_NOT_LOCK_LEFTOVER_FLAG_INDEX], --, B, 0]	; block this port
															; until leftover ready
	alu[lov_to_next_offset, tmp, and, CONST_07]	; 8 bytes boundary due to dram[read..] 
	alu[tmp, rmnd_paylo, -, allow_to_transmit]
	alu[*l$index1[GLOBAL_LEFTOVER_LEN_INDEX], --, B, tmp]	; save leftover length
.end // need_to_transmit allow_to_transmit 
lp_not_last_buffer_no_need_to_save_leftover#:
	_sphy_mphy4_tx_secondary_buffer_become_active_buffer()
	br[not_sop_end_of_phase1#]	

lp_not_last_bp_not_eob#:	
	// update ab_paylo_rmnd and ab_offset_rpaylo in queue entry with 
	// alw_mpkt_paylo
	_sphy_mphy4_tx_update_ab_paylo_rmnd_and_offset_rpaylo(alw_mpkt_paylo)
	// set payload_length and payload_offset of tcw0_1 with alw_mpkt_paylo
	// and offset_mod_8
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset(tcw0_1, alw_mpkt_paylo, \
											offset_mod_8)
	br[not_sop_end_of_phase1#]
.end //	 alw_mpkt_paylo
.end //  lpb_paylo, lp_offset

// no leftover from previous buffer
 not_sop_no_leftover#:

	// check whether current mpacket is eop packet
	// first check whether this is a last buffer in a packet
    ld_field_w_clr[tmp, 0111, *l$index0[SBD_0_OFFSET]]
	alu[--, tmp, -, IX_NULL]
	bne[no_lp_not_last_buffer#]
	// check whether all remaining payload can fit into one tbuf; there is no
	// prepended leftover here, so the whole tbuf element is available 
	alu[tmp, offset_mod_8, +, rmnd_paylo]

	alu[--, TBUF_ELE_SIZE, -, tmp]
	blt[no_lp_last_bp_not_eop_mpkt#]
	// it is EOP case, set EOP bit in tcw0_1
	alu_shf[tcw0_1, tcw0_1, or, 1, <<TCW0_EOP_BIT_SHFT]
	// set payload_length and payload_offset of tcw0_1 with rmnd_paylo
	// and offset_mod_8
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset(tcw0_1, rmnd_paylo, offset_mod_8)
	// eop mpacket is eob mpacket too, need to free buffer at phase 3
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<EOB_MPKT_FLAG_BIT]
	br[not_sop_need_to_advance_queue_head#] 
	
	// following case need to consider device bus width restriction
no_lp_last_bp_not_eop_mpkt#:
	// calculate the max that can be transmitted from this buffer:
	// the mpacket payload must be a multiple of the device bus width
.begin 
.reg alw_mpkt_paylo		; maximum allowable payload from active buffer
						; with consideration of multiple of device bus width
	alu[tmp, TBUF_ELE_SIZE, -, offset_mod_8]
	alu[alw_mpkt_paylo, tmp, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]
	// update ab_paylo_rmnd and ab_offset_rpaylo in queue entry with 
	// alw_mpkt_paylo
	_sphy_mphy4_tx_update_ab_paylo_rmnd_and_offset_rpaylo(alw_mpkt_paylo)
	// set payload_length and payload_offset of tcw0_1 with alw_mpkt_paylo
	// and offset_mod_8
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset(tcw0_1, alw_mpkt_paylo, \
											offset_mod_8)	
	br[not_sop_end_of_phase1#]
.end //	 alw_mpkt_paylo
no_lp_not_last_buffer#:
	// check whether this is an EOB mpacket.
	// calculate the max that can be transmitted from this buffer:
	// the mpacket payload must be a multiple of the device bus width
.begin
.reg alw_mpkt_paylo	; maximum allowable payload from active buffer
					; with consideration of multiple of device bus width
	alu[tmp, TBUF_ELE_SIZE, -, offset_mod_8]
	alu[alw_mpkt_paylo, tmp, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]

	// if rmnd_paylo is not more than this max allowable byte count, this 
	// mpacket finishes the buffer (eob) 
	alu[--, alw_mpkt_paylo, -, rmnd_paylo]
	blt[no_lp_not_last_bp_not_eob#]

	// it is eob, set eob bit
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<EOB_MPKT_FLAG_BIT]
.begin
.reg allow_to_transmit 
	alu[allow_to_transmit, rmnd_paylo, and, DEVICE_BUS_WIDTH_RESTRICTION_MASK]

	// check whether we have something to transmit
	alu[--, allow_to_transmit, -, 0]
	bgt[no_lp_not_last_buffer_something_to_transmit#]
	// nothing can be transmitted from this buffer: set skip bit in tcw0_1
	alu_shf[tcw0_1, tcw0_1, or, 1, <<TCW0_SKIP_BIT_SHFT]	; set skip bit
	// set flag in exe_stat_flag for later use 
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<SKIP_TRANSMIT_FLAG_BIT]
no_lp_not_last_buffer_something_to_transmit#:
	// set payload_length and payload_offset of tcw0_1 with allow_to_transmit
	// and offset_mod_8
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset(tcw0_1, allow_to_transmit, \
										offset_mod_8)
	// check whether some bytes are left over for the next buffer, i.e. 
	// whether they must be read into $$dw0, $$dw1, $$dw2, and $$dw3
	alu[--, rmnd_paylo, -, allow_to_transmit]		
	beq[no_lp_not_last_buffer_no_need_to_save_leftover#] ; it meets multiple of device 
														 ; bus width criteria
	// set flag in exe_stat_flag for later use 
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<SAVE_LEFTOV_TO_LM_FLAG_BIT]
	// get true active buffer BD from ab_bd  
	alu_shf[tmp, --, b, ab_bd, <<2]
    // use following dl_buf_get_data_from_meta macro from dispatch_loop.uc to
	// get dram address at the beginning of the buffer from true_dl_buf_hdl
	// now it takes one instruction
    dl_buf_get_data_from_meta(dram_addr, tmp)
	
	// tmp = byte offset of the first leftover byte inside the buffer
	alu[tmp, offset_rpaylo, +, allow_to_transmit]
	alu[dram_addr, dram_addr, +, tmp]
	// in some cases the sum of offset_mod_8 + bytes_leftover is over 8 bytes 
	// (for example, offset_mod_8=7, and bytes_leftover is 3), so in order to 
	// cover all cases, read 16 bytes here 
	dram[read, $$dw0, dram_addr, 0, 2], sig_done[sig_dram_access_1]
	// add sig_dram_access_1 to sig_mask_1
	_sphy_mphy4_tx_set_signal(sig_mask_1, sig_dram_access_1)	; add sig_dram_access_1
	// since we use sig_mask and ACTIVE_CTX_WAKEUP_EVENTS csr, we need to add to 
	// wait for the push signal of dram access
	_sphy_mphy4_tx_set_push_signal(sig_mask_1, sig_dram_access_1)	; add push signal
	// this port should be blocked from transmission until $$dw0, $$dw1, 
	// $$dw2, and $$dw3 have arrived and been saved to LM (done in phase 2)
	alu[*l$index1[GLOBAL_NOT_LOCK_LEFTOVER_FLAG_INDEX], --, B, 0]	; block this port
															; until leftover ready
	alu[lov_to_next_offset, tmp, and, CONST_07]	; 8 bytes boundary due to dram[read..] 
	alu[tmp, rmnd_paylo, -, allow_to_transmit]
	alu[*l$index1[GLOBAL_LEFTOVER_LEN_INDEX], --, B, tmp]	; save leftover length
.end // allow_to_transmit 
no_lp_not_last_buffer_no_need_to_save_leftover#:
	_sphy_mphy4_tx_secondary_buffer_become_active_buffer()
	br[not_sop_end_of_phase1#]	
no_lp_not_last_bp_not_eob#:	
	// update ab_paylo_rmnd and ab_offset_rpaylo in queue entry with 
	// alw_mpkt_paylo
	_sphy_mphy4_tx_update_ab_paylo_rmnd_and_offset_rpaylo(alw_mpkt_paylo)
	// set payload_length and payload_offset of tcw0_1 with alw_mpkt_paylo
	// and offset_mod_8
	_sphy_mphy4_tx_set_tcw0_with_paylo_len_offset(tcw0_1, alw_mpkt_paylo, \
											offset_mod_8)
.end //	 alw_mpkt_paylo
	br[not_sop_end_of_phase1#]	   

	// advance queue header for EOP mpacket 
not_sop_need_to_advance_queue_head#:
	// the deq_port_bit will be calculated again in the 
	// _sphy_mphy4_tx_advance_queue_head[] macro, but it is the same as the
	// original deq_port_bit [6]
	_sphy_mphy4_tx_advance_queue_head(port_entry_turnaround)
#ifndef DISABLE_TX2SCHED_FEEDBACK
	// notify scheduler that one packet has been transmitted from this port 
	// 2 instructions for SPHY_1X32 mode, and 10 instructions for other modes
	_sphy_mphy4_tx_notify_scheduler($txed_port, deq_port)
	// add sig_reflect_write to sig_mask_1, since reflect write used in
	// _sphy_mphy4_tx_notify_scheduler
	_sphy_mphy4_tx_set_signal(sig_mask_1, sig_reflect_write)	; add sig_reflect_write
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK

not_sop_end_of_phase1#:
	// swap out to wait signals specified in sig_mask
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_1]	; set csr
	ctx_arb[--], defer[2]
		// tbuf_ele_1 already set, use the defer slots to update the next 
		// available tbuf element in local memory to save cycles [2]
		_sphy_mphy4_tx_update_next_availble_tbuf_ele()

	// following .io_completed is used to eliminate assembling warning due to 
	// the usage of "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig2_next_context
	.io_completed sig_scratch_access_1
	.io_completed sig_msf_access_3
	.io_completed sig_sram_read_1
	.io_completed sig_dram_access_1
#ifndef DISABLE_TX2SCHED_FEEDBACK
	.io_completed sig_reflect_write
#endif	// #ifndef DISABLE_TX2SCHED_FEEDBACK

// end of phase 1 with not_sop mpkt ready for tx

// beginning of phase 2 with not_sop mpkt ready for tx	
	// wake up next thread in beginning of phase 2 to reduce wakeup latency
	local_csr_wr[SAME_ME_SIGNAL, sig2_next_context_gpr] 

	alu[sig_mask_2, --, B, 0]			; reset sig_mask_2

	_sphy_mphy4_tx_set_signal(sig_mask_2, sig3_next_context)	; initialize to
															; sig3_next_context
															 
	// workaround for HW-bug#1249, copy channel # into tcw1 (reserved in PRM)
	alu_shf[$w1, --, b, deq_port]

	// tbuf_ele_1 already got in phase 1 and next_avail_tbuf_ele already 
	// updated in defer slots of phase 1, now get this tbuf_ele address 
	// (addr_of_tbuf_1) 
	_sphy_mphy4_tx_get_tbuf_addr(addr_of_tbuf_1, tbuf_ele_1)

	// check tbuf full to avoid overwriting tbuf [10]
	_sphy_mphy4_tx_not_seop_check_tbuf_full_and_wait_not_full(tbuf_ele_1)

// following .set used to eliminate assembling warning
//.set addr_of_tbuf ab_bd lov_to_next_offset $w0 $w1 $$dw0 $$dw1 $$dw2

	// check whether need to move leftover ($w0) to the beginning of tbuf
	br_bclr[exe_stat_flag, LEFTOV_FROM_PB_FLAG_BIT, ph2_move_lp_to_tbuf_done#]
	msf[write, $w0, addr_of_tbuf_1, 0, 1], sig_done[sig_msf_access_1]
	_sphy_mphy4_tx_set_signal(sig_mask_2, sig_msf_access_1)	; add sig_msf_access_1 
	// payload from the active buffer goes after the 8-byte prepend area
	alu[addr_of_tbuf_1, addr_of_tbuf_1, +, 8]
ph2_move_lp_to_tbuf_done#:

	// check whether need to move payload from active buffer to tbuf
	br_bset[exe_stat_flag, SKIP_TRANSMIT_FLAG_BIT, ph2_move_paylo_done#]
	// payload offset and payload length can be retrieved from tcw0_1
	 
	_sphy_mphy4_tx_move_not_sop_paylo_to_tbuf(indir_ref, dram_addr, ab_bd, \
		addr_of_tbuf_1, indiref_base, offset_rpaylo, tcw0_1, sig_dram_access_1) 
	_sphy_mphy4_tx_set_signal(sig_mask_2, sig_dram_access_1)	; add sig_dram_access_1
	_sphy_mphy4_tx_set_push_signal(sig_mask_2, sig_dram_access_1)	; add push signal
ph2_move_paylo_done#:

	// check whether we have leftover which needs to be saved in LM
	br_bclr[exe_stat_flag, SAVE_LEFTOV_TO_LM_FLAG_BIT, ph2_save_leftov_to_lm_done#]
	// $$dw0, $$dw1, $$dw2, and $$dw3 ready now, do necessary bytes alignment with
	// lov_to_next_offset, so the leftover will be realigned in first two longword
	// without any offset to the first byte of the first long word
.begin 
.reg r0
	_sphy_mphy4_tx_leftover_bytes_alignment(r0, lov_to_next_offset, $$dw0, $$dw1, \
											$$dw2, $$dw3)
	alu[*l$index1[GLOBAL_LEFTOVER_DATA_INDEX], --, b, r0]	; save leftover to local memory
.end // r0	
	// set flag in queue status
    alu[*l$index1[GLOBAL_LEFTOVER_FLAG_INDEX], --, B, PAYLO_LEFTOVER_FLAG] ; set
   	// data saved, port ready to be transmitted again, put this port back
	alu[*l$index1[GLOBAL_NOT_LOCK_LEFTOVER_FLAG_INDEX], --, B, 1]	; unblock
ph2_save_leftov_to_lm_done#:

	// check whether we need to save secondary buffer meta data 
	// if the secondary BD read was not issued in phase 1, just branch out
	br_bclr[exe_stat_flag, READ_SND_BD_FLAG_BIT, not_sop_save_sbd_meta_done#]

	// meta data for secondary buffer is ready, move it to local memory
	_sphy_mphy4_tx_save_sbd_meta_to_lm() 
	// this port was blocked to prevent any following threads (which serve 
	// this port) in the same run from accessing the secondary buffer data when
	// it becomes active buffer; now secondary meta data saved, put this port back
	alu[*l$index1[GLOBAL_NOT_LOCK_SNDB_RD_FLAG_INDEX], --, B, 1]	; unblock
not_sop_save_sbd_meta_done#:

	// now, check whether a valid tx request was obtained by the scratch read 
	// from the scratch ring in phase 1. If there is a valid tx request, enqueue
	// that packet to the queue of that port 

// following .set used to eliminate assembling warning
.set $dl_meta0 $dl_meta1 $dl_meta2 $dl_meta3 $dl_meta4 

	// didn't get tx request due to queue full
	br_bclr[exe_stat_flag, GET_TX_REQUEST_FLAG_BIT, not_sop_no_tx_request_read#] 

	// check whether there is null requests in scratch ring
	alu[--, $tx_request_lw0, -, 0x0]		; valid tx request
	// no valid tx request, not worst case, can take branchout penalty
	beq[not_sop_no_valid_tx_request#]		; no valid tx request

    // read SOP meta data, read earlier to reduce latency 
	// read meta data from SRAM by using the BD pointer in tx request
  	_sphy_mphy4_tx_read_meta_data_from_sram($dl_meta, $tx_request_lw0) 
	_sphy_mphy4_tx_set_signal(sig_mask_2, sig_sram_read_1) 	; add sig_sram_read_1
	// set sop meta read flag in exe_stat_flag 
	alu_shf[exe_stat_flag, exe_stat_flag, or, 1, <<READ_SOP_BD_FLAG_BIT] 
	// set local memory index 0 to queue entry at queue tail of 
	// that port and enqueue this packet 
	_sphy_mphy4_tx_set_lmindex0_to_queue_tail(port_entry_turnaround)

not_sop_no_tx_request_read#:
not_sop_no_valid_tx_request#:
	// end of phase 2
	// swap out to wait for dram and msf access complete, and wait for my turn
	// also calculate address of TBUF_ELEMENT_CONTROL_V# for tbuf_element for
	// later use to save 2 instruction cycles in ctx_arb defer slots
	// swap out to wait signals specified in sig_mask
	local_csr_wr[ACTIVE_CTX_WAKEUP_EVENTS, sig_mask_2]	; set csr
	ctx_arb[--]	, defer[2]								; ctx_arb on sig_mask
#ifdef IXP2800
		alu_shf[tmp, --, B, tbuf_ele_1, <<ELE_2_TX_CNTRL_ADDR]
#else // #ifdef IXP2800
		alu_shf[tmp, --, B, tbuf_ele_1, <<3]
#endif // #ifdef IXP2800
		alu[addr_of_tx_control_1, tmp, +, *l$index1[GLOBAL_CTW_VD_BASE_INDEX]]
	
	// following .io_completed is used to eliminate assembling warning due to 
	// the usage of "ACTIVE_CTX_WAKEUP_EVENTS" csr
	.io_completed sig3_next_context
	.io_completed sig_dram_access_1
	.io_completed sig_msf_access_1
	.io_completed sig_sram_read_1
#ifdef DEBUG_TX_PENDING_LOCKED
	.io_completed sig_sram_write_1 	
#endif //#ifdef DEBUG_TX_PENDING_LOCKED

// end of phase 2 in not_sop mpacket path	


// beginning of phase 3	in not_sop mpacket path
	// wake up next thread in the very beginning of phase 3 to reduce wakeup 
	// latency
	local_csr_wr[SAME_ME_SIGNAL, sig3_next_context_gpr]

    // write transmit control word (TCW) to TBUF_ELEMENT_CTRL_V_# corresponding
	// the tbuf_element to initiate transmission
	alu[$w0, --, b, tcw0_1]		; element transmit control word
	// workaround for HW-bug#1249, copy channel # into tcw1 (reserved in PRM)
	alu_shf[$w1, --, b, deq_port]
	msf[write, $w0, addr_of_tx_control_1, 0, 2], sig_done[sig_msf_access_1]

	_sphy_mphy4_tx_debug_incr_counter[@pkt_tx_num_tbufs_txed]

#ifdef COUNTERS
	// update counters in sram (15) 	
	_sphy_mphy4_tx_update_counters_not_sop($byte_cnt, tcw0_1, \
												sig_counter_update)
#endif //#ifdef	COUNTERS

	// check whether need to free buffer
	br_bclr[exe_stat_flag, EOB_MPKT_FLAG_BIT, not_sop_free_buffer_done#] 

	_sphy_mphy4_tx_free_buffer(ab_bd)			; free buffer

not_sop_free_buffer_done#:


	// if the SOP meta data read was not issued in phase 2, just branch out
	br_bclr[exe_stat_flag, READ_SOP_BD_FLAG_BIT, not_sop_save_sop_meta_done#] 
	// meta data for sop is ready, move it to local memory
	_sphy_mphy4_tx_save_sop_meta_to_lm($tx_request_lw0) 

not_sop_save_sop_meta_done#:
#ifndef COUNTERS
    // swap out to wait for msf access to complete and my turn, then restart
	// the TX loop; the two defer slots re-arm the per-iteration state
	ctx_arb[sig_msf_access_1, sig1_next_context], defer[2], \
		br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]					; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default] 	; Set signal mask 1

#else // #ifndef COUNTERS
    // swap out to wait for msf access and sram access to complete and my turn,
	// then restart the TX loop; the two defer slots re-arm the per-iteration
	// state
	ctx_arb[sig_msf_access_1, sig_counter_update, sig1_next_context], \
		defer[2], br[PHASE1_START_LABEL]

		alu[exe_stat_flag, --, B, 0]		; reset exe_stat_flag

		alu[sig_mask_1, --, B, sigmask_phs1_default]

#endif //#ifndef COUNTERS
// end of phase 3 in the not_sop mpacket path

///////////////////////////////////////////////////////////////////////////////
// NOTE(review): the original banner here described a branch taken to wait 
// until tbuf elements have been transmitted before they are overwritten (to
// avoid tx_pending bits in the Tx_MPHY_Status msf register being locked at 1
// and corrupting previous mpackets). No such code follows in this macro; the
// tbuf-full check for this path is the 
// _sphy_mphy4_tx_not_seop_check_tbuf_full_and_wait_not_full call in phase 2.
///////////////////////////////////////////////////////////////////////////////

	// free the 5 read xfer registers allocated at the top of this macro for 
	// reading in the meta data
	xbuf_free($dl_meta)	


.end	// ab_bd lov_to_next_offset  rmnd_paylo offset_rpaylo
		// $$dw0 $$dw1 $$dw2 $$dw3
// end of not_sop mpacket processing
#endm // end of #macro _sphy_mphy4_tx_not_sop_mpkt()


///////////////////////////////////////////////////////////////////////////////

											
///////////////////////////////////////////////////////////////////////////////
// sphy_mphy4_tx()
//
// Description:		Packet (POS/Ethernet) transmission
//
// Outputs:					
//					None
//
// Inputs:  
//					None
//
// Constants:
//					None 
//
//	Size:
//		 			?? instructions for worst case
//
///////////////////////////////////////////////////////////////////////////////

#macro sphy_mphy4_tx()
// Top-level transmit loop. After one-time register initialization the thread
// waits for sig1_next_context and then loops at phase1#: it optionally pulls
// one tx request from the scratch ring, decides whether the port has an
// mpacket ready, classifies that mpacket (SOP+EOP, SOP "critical", SOP 
// non-critical, or not-SOP) and expands the matching handler macro. Every
// handler is passed phase1# as its loop-restart label.
// NOTE(review): the handler macros are laid out back to back with no branch
// between them, so each one is presumably expected to end by branching to
// the label it is given (the not-SOP handler does) - confirm for the others.

.begin 
.reg sig_mask_1				; wakeup events for a thread to wakeup in phase 1
.reg tbuf_ele_1				; tbuf element number for mpacket 1
.reg addr_of_tbuf_1			; address of tbuf for mpkt 1
.reg addr_of_tx_control_1	; address of TBUF_ELEMENT_CTRL_V_# for mpkt 1
.reg tcw0_1					; mpkt 1 transmit control word 0
.reg sigmask_phs1_default 	; default value for signal mask for phase 1
.reg sigmask_phs2_default 	; default value for signal mask for phase 2
.reg exe_stat_flag			; thread execution status flags 
.reg deq_port
.reg indiref_base			; base to calculate the address field in indirect ref
.reg l2_table_base			; base of L2 table in sram
.reg tcw0_sop_eop_base		; use gpr to save instruction count in min packet 
							; path
.reg $w0 $w1
.reg indir_ref dram_addr
.xfer_order $w0 $w1
.reg volatile $tx_request_lw0		; transmit request

	// allocate 1 read xfer register for reading mpackets actually transmitted
	xbuf_alloc[$mpkts_sent, 1, read]

	// gprs initialized so they may save some cycles 

	_sphy_mphy4_tx_init_gprs(indiref_base, l2_table_base, \
				sigmask_phs1_default, sigmask_phs2_default)

	// The first time through the loop, wait for next thread signal only 

	// wait for my turn
	ctx_arb[sig1_next_context]

	// initialize deq_port to port number handled by this thread
	alu[deq_port, --, B, *l$index1[GLOBAL_PORT_ID_INDEX]]

	// initialize $w1 to deq_port; $w1 should either not be changed or be set 
	// back to deq_port before returning to phase1#:, so we can save
	// one instruction cycle in sop_eop (POS min packet case)
	// workaround for HW-bug#1249, copy channel # into tcw1 (reserved in PRM)
	alu[$w1, --, B, deq_port]	

	// use tcw0_sop_eop_base to save instruction count in min packet path
	immed[tcw0_sop_eop_base, TCW0_SOP_EOP_BASE_W0]
	immed_w1[tcw0_sop_eop_base, TCW0_SOP_EOP_BASE_W1]

	alu[exe_stat_flag, --, B, 0]					; reset exe_stat_flag

	alu[sig_mask_1, --, B, sigmask_phs1_default] 	; Set signal mask 1


phase1#:

// beginning of phase 1

.begin

.reg tmp

	// check whether there is enough space in local memory to store an 
	// incoming packet tx request 
	alu[--, *l$index1[GLOBAL_PKTS_IN_QUEUE_INDEX], -, SKIP_GET_TXR_THRESHOLD]
	
	// no space, so don't get tx request, keep tx requests in scratch queue
	// use defer slots to save instruction cycles: the three instructions 
	// below sit in the branch's defer slots, so they run on both the taken
	// and the not-taken path (tmp is consumed after get_tx_request_done#)
	bgt[get_tx_request_done#], defer [3]

		// wake up next thread in beginning of phase 1 to reduce wakeup latency
		local_csr_wr[SAME_ME_SIGNAL, sig1_next_context_gpr] 

		// set local memory index 0 to queue head for the specific port, set 
		// earlier to let it settle down before we actually use it
		local_csr_wr[active_lm_addr_0, *l$index1[GLOBAL_HEAD_OFFSET_INDEX]]

		// get *l$index1[GLOBAL_NOT_LOCK_SNDB_RD_FLAG_INDEX] for later use  	
		alu[tmp, --, B, *l$index1[GLOBAL_NOT_LOCK_SNDB_RD_FLAG_INDEX]]
		

	// get tx request from scratch ring
	scratch[get, $tx_request_lw0, sring_tr, 0, 1], sig_done[sig_scratch_access_1]

	// remember that a scratch get was issued so later phases know that
	// $tx_request_lw0 must be examined
	alu_shf[exe_stat_flag, exe_stat_flag, OR, 1, <<GET_TX_REQUEST_FLAG_BIT]	

	_sphy_mphy4_tx_set_signal(sig_mask_1, sig_scratch_access_1)	; sig_scratch_access_1

get_tx_request_done#:

check_mpkt_ready_for_tx#:

	// nothing queued for this port -> nothing to transmit
    alu[--, *l$index1[GLOBAL_PKTS_IN_QUEUE_INDEX], -, 0] 

	beq[not_ready_to_tx#]	

	// tmp holds the secondary-BD not-lock flag; AND it with the leftover
	// not-lock flag so the result is zero if either one blocks the port.
	// The beq after the .end below tests the condition codes of this alu.
    alu[tmp, tmp, AND, *l$index1[GLOBAL_NOT_LOCK_LEFTOVER_FLAG_INDEX]] 
.end // tmp

	beq[not_ready_to_tx#]

	// get mpackets actually sent out of the tbuf for use in phase 2 to avoid 
	// overwriting existing mpackets which have not been transmitted out yet

	msf[read, $mpkts_sent0, addr_tx_seq, 0, 1], sig_done[sig_msf_access_3]


	// set local memory index 0 to queue head for SPHY 
	// initialize tcw0 for SOP_EOP mpacket (worst case) [3] 
	
	alu[tcw0_1, --, B, tcw0_sop_eop_base]

	// tbuf element this mpacket will use
 	alu[tbuf_ele_1, --, b, *l$index1[GLOBAL_AVAIL_TBUF_ELEMENT_INDEX]]

	// SOP bit clear in the queue-head entry -> continuation mpacket
    br_bclr[*l$index0[ABD_0_OFFSET], SOP_BIT_LOC, not_sop_mpacket#]

.begin 
.reg offset_mod_8
.reg payload_len
.reg payload_offset

check_whether_sop_eop_mpacket#:
	// payload_len    = remaining payload field of the active buffer
	// payload_offset = low 16 bits of the same queue-entry word
	// offset_mod_8   = payload_offset AND CONST_07
	alu_shf[payload_len, --, b, *l$index0[ABD_1_OFFSET], >>PAYLO_RMND_LOC]
    ld_field_w_clr[payload_offset, 0011, *l$index0[ABD_1_OFFSET]]
	alu[offset_mod_8, payload_offset, and, CONST_07]
.begin
.reg tmp
	// tmp = room left for payload in one tbuf element; when an L2 header is
	// added, a smaller element size constant is used (8 less for POS, 16 less
	// for Ethernet)
#ifndef ADD_L2_HEADER 
   alu[tmp, TBUF_ELE_SIZE, -, offset_mod_8]
#else //#ifndef ADD_L2_HEADER 
#ifdef POS_TX
   alu[tmp, TBUF_ELE_SIZE_MINUS_8, -, offset_mod_8]
#endif // POS_TX
#ifdef ETHERNET_TX
   alu[tmp, TBUF_ELE_SIZE_MINUS_16, -, offset_mod_8]
#endif // ETHERNET_TX
#endif //#ifndef ADD_L2_HEADER 
	alu[--, payload_len, -, tmp]
.end //tmp
	// not sop_eop packet, can take branchout penalty
	bgt[sop_mpkt#]
	// whole packet fits in one tbuf element: SOP and EOP in the same mpacket
	_sphy_mphy4_tx_sop_eop_mpacket(payload_len, payload_offset, \
								offset_mod_8, phase1#)


sop_mpkt#:
	// a non-null secondary buffer descriptor means the packet is chained over
	// several buffers, which is handled by the non-critical path
	alu[--, *l$index0[SBD_0_OFFSET], -, IX_NULL]	; is null (0xFF)?
	bne[sop_only_non_critical_mpkt#]
#ifndef ADD_L2_HEADER
	// a packet less than or equal to 240 bytes can be fitted into 
	// two mpackets; to keep it simple, the first mpkt takes 120 bytes, the 
	// rest goes in the second mpacket
 	alu[--, payload_len, -, TWO_TBUF_ELE_SIZE_MINUS_16]
#else //#ifndef ADD_L2_HEADER
#ifdef POS_TX
	// a packet with payload (except L2 header) less than or equal to 
	// 232 bytes can be fitted into two mpackets; to keep it simple, the first
	// mpkt takes 112 bytes payload, the other 120 bytes go in the second mpacket
 	alu[--, payload_len, -, TWO_TBUF_ELE_SIZE_MINUS_26]
#endif // POS_TX
#ifdef ETHERNET_TX
	// a packet with payload (except L2 header) less than or equal to 
	// 222 bytes can be fitted into two mpackets; to keep it simple, the first
	// mpkt takes 102 bytes payload, the other 120 bytes go in the second mpacket
	// NOTE(review): the "????????" marker below is from the original author and
	// suggests the MINUS_34 constant was never confirmed - verify it
 	alu[--, payload_len, -, TWO_TBUF_ELE_SIZE_MINUS_34] // ????????
#endif // ETHERNET_TX
#endif //#ifndef ADD_L2_HEADER
	// not critical packet, can take branchout penalty
	bgt[sop_only_non_critical_mpkt#]
	// single-buffer packet that fits in two mpackets
	_sphy_mphy4_tx_ethernet_critical_pkt(payload_len, payload_offset, \
								offset_mod_8, phase1#)

sop_only_non_critical_mpkt#:

	// first mpacket of a packet that is chained or longer than two mpackets
	_sphy_mphy4_tx_sop_only_non_critical_mpkt(payload_len, payload_offset, \
								offset_mod_8, phase1#)

not_sop_mpacket#:
	// continuation mpacket; expanded inside this scope because the macro uses
	// the offset_mod_8 register declared above
	_sphy_mphy4_tx_not_sop_mpkt(phase1#)

.end // payload_len payload_offset offset_mod_8 

not_ready_to_tx#:
	// nothing to transmit this round (queue empty or port blocked)
	_sphy_mphy4_tx_get_tx_request_for_no_tx(phase1#)

.end 
#endm // end of #macro sphy_mphy4_tx()

///////////////////////////////////////////////////////////////////////////////
// The main code starts here

main#:
	// One-time microblock initialization. Exactly one of the blocks below is
	// assembled, selected by the build-time TX_PHY_MODE setting, and the 
	// selected mode is passed on to the init macro.

#if (TX_PHY_MODE == SPHY_1_32)
	// SPHY_1_32 mode
	sphy_mphy4_tx_init[port_entry_turnaround, SPHY_1_32]
#endif // TX_PHY_MODE == SPHY_1_32

#if (TX_PHY_MODE == SPHY_4_8)
	// SPHY_4_8 mode
	sphy_mphy4_tx_init[port_entry_turnaround, SPHY_4_8]
#endif // TX_PHY_MODE == SPHY_4_8

#if (TX_PHY_MODE == MPHY_4)
	// MPHY_4 mode
	sphy_mphy4_tx_init[port_entry_turnaround, MPHY_4]
#endif // TX_PHY_MODE == MPHY_4

	// enter the packet transmit block
	sphy_mphy4_tx()

/////////////////////////////////////////////////////////////////////////////////////

#endif 		// __SPHY_MPHY4_TX_UC__

/////////////////////////////////////////////////////////////////////////////////////


      

