//------------------------------------------------------------------------------------
//                                                                     
//                  I N T E L   P R O P R I E T A R Y                   
//                                                                      
//     COPYRIGHT (c)  1998-2000 BY  INTEL  CORPORATION.  ALL RIGHTS          
//     RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//     BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//     RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//     LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//     MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//     THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                      
//                        INTEL  CORPORATION                            
//                                                                     
//                     2200 MISSION COLLEGE BLVD                        
//                                                                      
//               SANTA  CLARA,  CALIFORNIA  95052-8119                  
//                                                                      
//-----------------------------------------------------------------------------------
// tx_scheduler_f.uc
// transmit scheduler and transmit arbitor for 16 100M ports and 1 Gig port
//-----------------------------------------------------------------------------------
//
//
// system: SA1200
// subsystem: transmit microcode
// usage: reference design
// author: dfh 12/08/97
// revisions:
//		dfh		5/20/98		base level 2	each transmit fill gets 1 task
//		dfh		7/4/98		FAST PORT, assign 1 or 2 elements, 100M block via signals
//		dfh		7/27/98		Support 1 Fast Port. Moved armed_to_clear logic to Arbitor
//		dfh		12/16/98	BL4 remove include config.h
//      mff     9/10/99     ifdef A1_CHIP - workaround optimize_mem hw bug
//
//-----------------------------------------------------------------------------------

// design:
//
// 1. Arbitor decides for each port which priority queue to use
//		It is assumed a port can have n priorities. (for example n=8)
//		This is placed in a 32x4 array shared as a global GPR with Scheduler
//
// 2. Scheduler assigns ports and T-FIFO Elements to TFill threads
//		the task assignment message consists of
//
//		The tx slow port task assignment message format
//		+--------+---------+----------+---------------+
//		|invalid |  unused |  Element |    queue      |
//		|   31   |  30:12  |   11:8   |     7:0       |
//		+--------+---------+----------+---------------+
//		invalid			1 = assign is invalid
//		Element			identifies the tfifo element
//		Queue			identifies the queue
//
//		The tx fast port task assignment message format
//
//		+-----+------+---------+------+----------+------+------+
//		|Valid|unused|EleCount |unused|  Element |unused|QueSel|
//		| 31  | 30:24|  23:21  |      |   19:16  |      | 2:0  |
//		+-----+------+---------+------+----------+------+------+
//		Valid			1 = assign is valid
//		EleCount		indicates number of elements assigned (0 = no assignment)
//						bit 12 on = 1 element
//						bit 13 on = 2 elements
//		Element			identifies the tfifo element
//		QueSel			identifies the queue
//
// 3. Four TFill threads will be used to copy packet data in SDRAM to T-FIFO
//		TFill can calculate the packet location given port, queue and 
//		queue_descriptor_base
//
// 4. The Scheduler writes predetermined task assignment mailboxes for
//	assignments to the TFills. A 17 word block is used as a fifo for these assignments.
//
// 7. The TFills read the task assignment mailboxes to pick up task assignments. If the
//	assignment valid bit is on, it clears it, increments the global message id, and
//	signals the next tfill thread to go.
//
// 8. TFill runs with the assignment, reading packet descriptor from sram and transferring 
//	packet data from SDRAM to T-FIFO.
// 
// 9. When TFill has updated the packet queue linked list and freed the used descriptor, 
//	it writes status to FBI, then loops back to do the next assignment
//	



// registers:
//
// $tx_rdy_copy				bit per port ready, copy of tx_rdy CSR pushed by FBI to $xfer1
// $tfifo_outptr			real tfifo output pointer from hardware, pushed to $xfer0
// $xfer0 has				$tfifo_outptr
// $task_assignment1		task_assignment message
// @pw1e					bit per port queue, 1 if packet on that port's queue
// ele_inptr				pointer to next free element
// ele_outptr				pointer to element that has been busy the longest
// current_port				current port index
// task_msg_addr			temporary variable
// temp						temporary variable
// this_assign				transmit assignment while being assembled
//
// @fast_ether_qselects		queues assigned by arbitor



// this ucode goes to fbox 5


// sdram, sram, scratch shared addresses
#include "mem_map.h"

// standard macros
#include "stdmac.uc"
#include "ixplib.uc"



// FAST_PORT1 is the bit position of the fast (1 Gb) port in the port-ready vector;
// TxSched_FastPort uses it as a shift count. It defaults to 16 unless the build
// has already defined it.
#ifndef FAST_PORT1
#define FAST_PORT1 16
#endif

// "offset" is an alias for the scratch register "temp" (used as the jump-table index).
.operand_synonym offset temp

; Ele_FreeBinding		get the port that is bound to an element, and release the element
; up to 32 ports, each port's binding is 5 bits; the 16 element bindings are packed
; six per register in bindings_5_0 and bindings_11_6 and four in bindings_15_12
; output: current_port	the 5-bit binding read from the element's slot
; in/out: ele_outptr	element to free on entry; advanced by one (wrapped to 4 bits) on exit
; side effects: elements_free is incremented, the element's binding is set back to 0x1f
;               (the value every slot is initialised to), and offset/temp is overwritten
;
; The body is a jump table indexed by ele_outptr with 4 instruction slots per element
; (alu / br / alu / nop), which is why the index is scaled by <<2. Do not add or remove
; instructions inside an entry without changing that scale.
;
#macro Ele_FreeBinding[current_port, ele_outptr]							
	alu[offset, --, B, ele_outptr, <<2]									; adjust for array element size
    jump[offset, ele_free0binding#], defer[3]							; next 3 instructions run in the defer slots
	alu[elements_free, elements_free, +, 1]								; increment number of free elements
	alu[ele_outptr, 1, +4, ele_outptr]									; increment output ptr
	alu[ele_outptr, 0xf, AND, ele_outptr]								; make it 4 bits again

ele_free0binding#:
; elements 0..5 live in bindings_5_0
	alu[current_port, 0x1f, AND, bindings_5_0]							; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu[bindings_5_0, bindings_5_0, OR, 0x1f]							; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_5_0, >>5]					; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_5_0, bindings_5_0, OR, 0x1f, <<5]					; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_5_0, >>10]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_5_0, bindings_5_0, OR, 0x1f, <<10]					; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_5_0, >>15]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_5_0, bindings_5_0, OR, 0x1f, <<15]					; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_5_0, >>20]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_5_0, bindings_5_0, OR, 0x1f, <<20]					; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_5_0, >>25]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_5_0, bindings_5_0, OR, 0x1f, <<25]					; invalidate the 5 bit binding
	nop

; elements 6..11 live in bindings_11_6
	alu[current_port, 0x1f, AND, bindings_11_6]							; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu[bindings_11_6, bindings_11_6, OR, 0x1f]							; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_11_6, >>5]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_11_6, bindings_11_6, OR, 0x1f, <<5]				; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_11_6, >>10]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_11_6, bindings_11_6, OR, 0x1f, <<10]				; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_11_6, >>15]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_11_6, bindings_11_6, OR, 0x1f, <<15]				; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_11_6, >>20]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_11_6, bindings_11_6, OR, 0x1f, <<20]				; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_11_6, >>25]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_11_6, bindings_11_6, OR, 0x1f, <<25]				; invalidate the 5 bit binding
	nop

; elements 12..15 live in bindings_15_12
	alu[current_port, 0x1f, AND, bindings_15_12]						; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu[bindings_15_12, bindings_15_12, OR, 0x1f]						; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_15_12, >>5]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_15_12, bindings_15_12, OR, 0x1f, <<5]				; invalidate the 5 bit binding
	nop

	alu_shf[current_port, 0x1f, AND, bindings_15_12, >>10]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_15_12, bindings_15_12, OR, 0x1f, <<10]				; invalidate the 5 bit binding
	nop

; last entry: no trailing nop needed because nothing is indexed after it
	alu_shf[current_port, 0x1f, AND, bindings_15_12, >>15]				; extract the 5 bit binding
    br[ele_free_binding_done#], defer[1]
	alu_shf[bindings_15_12, bindings_15_12, OR, 0x1f, <<15]				; invalidate the 5 bit binding

ele_free_binding_done#:
#endm



; Ele_SetBinding		bind a port to an element and start the slow port assignment
; up to 32 ports, each port's binding is 5 bits; the 16 element bindings are packed
; six per register in bindings_5_0 and bindings_11_6 and four in bindings_15_12
; output: current_port	port number taken from the pending find_bset result
;                       (bit 8, the result-valid flag, is cleared)
; output: this_assign	single_assign with the element number inserted at bit 8
; input:  ele_ptr		element to bind (0..15)
; side effects: offset/temp is overwritten; the element's 5-bit binding slot is rewritten
;
; The caller must have issued find_bset early enough for load_bset_result1 to be valid
; in the second defer slot of the jump below.
;
; The body is a jump table indexed by ele_ptr with 4 instruction slots per element
; (alu / br / alu / nop), which is why the index is scaled by <<2. Do not add or remove
; instructions inside an entry without changing that scale.
;
; The element number is taken from the ele_ptr parameter throughout; it used to be
; read from the global ele_inptr when building this_assign, which silently ignored
; the parameter.
;
#macro Ele_SetBinding[current_port, ele_ptr]							
	alu[offset, const_7C, AND, ele_ptr, <<2]							; adjust for array element size
    jump[offset, ele_set0binding#], defer[3]							; next 3 instructions run in the defer slots
	alu[this_assign, single_assign, OR, ele_ptr, <<8]					; insert element in assignment
	load_bset_result1[current_port]										; get next ready port
	alu_shf[current_port, current_port, AND~, 1, <<8]					; clear the valid bit

ele_set0binding#:
; elements 0..5 live in bindings_5_0
	alu[bindings_5_0, bindings_5_0, AND~, 0x1f]							; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu[bindings_5_0, bindings_5_0, OR, current_port]					; insert new binding
	nop
	alu_shf[bindings_5_0, bindings_5_0, AND~, 0x1f, <<5]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_5_0, bindings_5_0, OR, current_port, <<5]			; insert new binding
	nop
	alu_shf[bindings_5_0, bindings_5_0, AND~, 0x1f, <<10]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_5_0, bindings_5_0, OR, current_port, <<10]			; insert new binding
	nop
	alu_shf[bindings_5_0, bindings_5_0, AND~, 0x1f, <<15]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_5_0, bindings_5_0, OR, current_port, <<15]			; insert new binding
	nop
	alu_shf[bindings_5_0, bindings_5_0, AND~, 0x1f, <<20]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_5_0, bindings_5_0, OR, current_port, <<20]			; insert new binding
	nop
	alu_shf[bindings_5_0, bindings_5_0, AND~, 0x1f, <<25]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_5_0, bindings_5_0, OR, current_port, <<25]			; insert new binding
	nop

; elements 6..11 live in bindings_11_6
	alu[bindings_11_6, bindings_11_6, AND~, 0x1f]						; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu[bindings_11_6, bindings_11_6, OR, current_port]					; insert new binding
	nop
	alu_shf[bindings_11_6, bindings_11_6, AND~, 0x1f, <<5]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_11_6, bindings_11_6, OR, current_port, <<5]		; insert new binding
	nop
	alu_shf[bindings_11_6, bindings_11_6, AND~, 0x1f, <<10]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_11_6, bindings_11_6, OR, current_port, <<10]		; insert new binding
	nop
	alu_shf[bindings_11_6, bindings_11_6, AND~, 0x1f, <<15]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_11_6, bindings_11_6, OR, current_port, <<15]		; insert new binding
	nop
	alu_shf[bindings_11_6, bindings_11_6, AND~, 0x1f, <<20]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_11_6, bindings_11_6, OR, current_port, <<20]		; insert new binding
	nop
	alu_shf[bindings_11_6, bindings_11_6, AND~, 0x1f, <<25]				; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_11_6, bindings_11_6, OR, current_port, <<25]		; insert new binding
	nop

; elements 12..15 live in bindings_15_12
	alu[bindings_15_12, bindings_15_12, AND~, 0x1f]						; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu[bindings_15_12, bindings_15_12, OR, current_port]				; insert new binding
	nop
	alu_shf[bindings_15_12, bindings_15_12, AND~, 0x1f, <<5]			; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_15_12, bindings_15_12, OR, current_port, <<5]		; insert new binding
	nop
	alu_shf[bindings_15_12, bindings_15_12, AND~, 0x1f, <<10]			; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_15_12, bindings_15_12, OR, current_port, <<10]		; insert new binding
	nop
	alu_shf[bindings_15_12, bindings_15_12, AND~, 0x1f, <<15]			; clear the 5 bit binding
    br[ele_set_binding_done#], defer[1]
    alu_shf[bindings_15_12, bindings_15_12, OR, current_port, <<15]		; insert new binding
	nop
ele_set_binding_done#:
#endm



// TxSched_InsertQueue
// block the port (set its bit in ports_in_tfifo)
// insert the port and the selected queue for that port into the assignment and
// move the finished assignment to the transfer register $task_assignment1
// this macro assumes up to 16 ports, 4 queues per port
//
// input:  current_port			port number 0..15
// input:  this_assign			assignment being assembled (element already inserted)
// input:  @fast_ether_qselects	2-bit queue select per port, port n in bits 2n+1:2n
// output: $task_assignment1	this_assign | (current_port << 3) | queue select of current_port
// side effects: temp and qsel are overwritten; this_assign gets current_port << 3 ORed in
//
// The body is a jump table indexed by current_port with 4 instruction slots per port,
// which is why the index is scaled by <<2. Do not add or remove instructions inside an
// entry without changing that scale.
//
// Only the 2-bit select belonging to current_port is inserted. Previously the whole
// 32-bit @fast_ether_qselects word was ORed into the assignment and the per-port value
// extracted in each table entry was never used, which would corrupt the other message
// fields as soon as the arbiter wrote a non-zero select. With all selects at 0 the
// generated message is unchanged.
//
#macro TxSched_InsertQueue[current_port]
#define_eval shift_token 0
	alu_shf[temp, --, B, current_port, <<2]						; shift for 4 words per jump target
	jump[temp, queue_array#], defer[3]							; next 3 instructions run in the defer slots
	alu_shf[this_assign, this_assign, OR, current_port, <<3]	; insert portx8
	alu[qsel, --, B, @fast_ether_qselects]						; fetch all ports' queue selects
	nop															; keeps the defer slot count (and timing) unchanged
queue_array#:
	alu[ports_in_tfifo, ports_in_tfifo, OR, 1]					; block port 0
	alu[qsel, 0x3, AND, qsel]									; isolate this port's 2 bit queue select
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<1]			; block port 1
	alu[qsel, 0x3, AND, qsel, >>2]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<2]			; block port 2
	alu[qsel, 0x3, AND, qsel, >>4]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<3]			; block port 3
	alu[qsel, 0x3, AND, qsel, >>6]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<4]			; block port 4
	alu[qsel, 0x3, AND, qsel, >>8]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<5]			; block port 5
	alu[qsel, 0x3, AND, qsel, >>10]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<6]			; block port 6
	alu[qsel, 0x3, AND, qsel, >>12]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<7]			; block port 7
	alu[qsel, 0x3, AND, qsel, >>14]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<8]			; block port 8
	alu[qsel, 0x3, AND, qsel, >>16]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<9]			; block port 9
	alu[qsel, 0x3, AND, qsel, >>18]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<10]		; block port 10
	alu[qsel, 0x3, AND, qsel, >>20]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<11]		; block port 11
	alu[qsel, 0x3, AND, qsel, >>22]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<12]		; block port 12
	alu[qsel, 0x3, AND, qsel, >>24]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<13]		; block port 13
	alu[qsel, 0x3, AND, qsel, >>26]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<14]		; block port 14
	alu[qsel, 0x3, AND, qsel, >>28]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

	alu_shf[ports_in_tfifo, ports_in_tfifo, OR, 1, <<15]		; block port 15
	alu[qsel, 0x3, AND, qsel, >>30]
	br[tx_got_queue#], defer[1]
	alu[$task_assignment1, this_assign, OR, qsel]				; insert queue, move assignment to xfer reg

tx_got_queue#:
#endm


//  TFIFO_BlockElements
//		claim elements_to_assign TFIFO elements starting at ele_inptr:
//		step ele_inptr past them (wrapping at 16) and take them off elements_free
//
#macro TFIFO_BlockElements[ele_inptr, elements_to_assign]
	alu[ele_inptr, elements_to_assign, +, ele_inptr]					; move the input pointer over the claimed elements
	alu[elements_free, elements_free, -, elements_to_assign]			; that many fewer elements are free
	alu[ele_inptr, 0xf, AND, ele_inptr]									; wrap the pointer back into 4 bits
#endm



// TFIFO_TryFreeElement
//		free elements for as long as the fbi outptr is ahead of the local outptr
//
//		For each element freed: Ele_FreeBinding returns the bound port in current_port,
//		advances ele_outptr and increments elements_free; then that port's bit is
//		cleared in ports_in_tfifo so the port can be scheduled again.
//		Reads latest_tfifo_outptr; overwrites temp and current_port.
//		Yields with ctx_arb[voluntary] once per freed element.
//
#macro TFIFO_TryFreeElement[ele_outptr]
	alu[--, latest_tfifo_outptr, -, ele_outptr]							; compare fbi outptr with local outptr
	br=0[try_free_done#], defer[1]										; cannot free any if equal (local outptr can't pass )
																		; note: the defer slot is filled by the first instruction of
																		; Ele_FreeBinding below, which only writes offset/temp

// otherwise we can free the element associated with ele_outptr
free_element#:
	Ele_FreeBinding[current_port, ele_outptr]							; 7 insns
	ctx_arb[voluntary]
	alu[temp, current_port, B-A, 1, <<5]								; setup left indirect: temp = 32 - current_port
	alu[--, temp, B, 0]													; no destination; presumably supplies temp as the indirect shift amount for the next instruction - confirm against the programmer's reference
	alu_shf[ports_in_tfifo, ports_in_tfifo, AND~, 1, <<indirect]		; clear bit	to unblock the port
	alu[--, latest_tfifo_outptr, -, ele_outptr]							; compare fbi outptr with local outptr
	br!=0[free_element#]												; keep freeing until the local outptr catches up
try_free_done#:
#endm



// TxSched_WaitForAssignSignal
//		swap out until the assignment write signals completion: waits on the fbi
//		signal when SCRATCH_ASSIGN is defined, otherwise on the sram signal
//		(matching the memory used by TX_Sched_WriteAssign).
//		The ctx_arb carries defer[1], so the instruction that follows the macro
//		invocation is executed in the defer slot.
//
#macro TxSched_WaitForAssignSignal
#ifdef SCRATCH_ASSIGN
	ctx_arb[fbi], defer[1]
#else
	ctx_arb[sram], defer[1]
#endif
#endm

//TX_Sched_WriteAssign[assignment_xfer_reg, task_msg_addr, offset]
//	write assignment for tfill
//
//	Writes one word from $task_assignment1 to task_msg_addr + offset, in scratch
//	memory when SCRATCH_ASSIGN is defined and in sram otherwise, and requests a
//	completion signal (sig_done).
//
//	NOTE(review): the assignment_xfer_reg parameter is not referenced in the body;
//	$task_assignment1 is always the register written. Confirm whether callers rely
//	on this before changing it.
//
#macro TX_Sched_WriteAssign[assignment_xfer_reg, task_msg_addr, offset]
write_assign#:
#ifdef SCRATCH_ASSIGN
	scratch[write, $task_assignment1, task_msg_addr, offset, 1], sig_done
#else
	sram[write, $task_assignment1, task_msg_addr, offset, 1], sig_done
#endif

#endm


// TxSched_FastPort
//		schedule 1Gb fast port
//
//		If the fast port is ready and the arbiter's element count
//		(@remote_gig_ele_count) differs from the count already assigned
//		(local_qele_count), assign min(queued, elements_free, 4) consecutive tfifo
//		elements starting at ele_inptr and send the assignment to
//		fport_task_msg_addr.
//
//		reads:  ports_rdy_to_proc, @remote_gig_ele_count, @gig_que_select,
//		        elements_free, ele_inptr, fport_msg_id, const_0
//		writes: ports_rdy_to_proc (reduced to the fast port ready bit), qelements,
//		        elements_to_assign, qselect, this_assign, local_qele_count,
//		        elements_free, ele_inptr, $task_assignment1
//
//		Both element counters are revolving 16 bit values, so their difference is
//		reduced to 16 bits before it is used. Without that, a remote count that has
//		wrapped past 0xffff while the local count has not yields a huge 32 bit
//		difference and more elements are assigned than are actually queued.
//
#macro TxSched_FastPort
	alu[ports_rdy_to_proc, 1, AND, ports_rdy_to_proc, >>FAST_PORT1]					; check fast port ready
	br=0[tx_fast_done#], defer[1]														; skip if fast port not ready

	alu[qelements, @remote_gig_ele_count, -, local_qele_count]				; (defer slot) if remote and local counts are the same
	br=0[tx_fast_done#]														;  then there are no packets to process
	alu[qelements, const_0, +16, qelements]									; difference modulo 2^16: counters are 16 bit revolving
tx_fast_assign#:
		.if (qelements > elements_free)										; 1-4 elements are assigned at a time
			.if(elements_free > 4)
				alu[elements_to_assign, --, B, 4]
			.else
				alu[elements_to_assign, --, B, elements_free]
			.endif
		.else
			.if(qelements > 4)
				alu[elements_to_assign, --, B, 4]
			.else
				alu[elements_to_assign, --, B, qelements]
			.endif
		.endif
		alu[qselect, --, B, @gig_que_select]								; get preferred queue
		alu[this_assign, qselect, OR, ele_inptr, <<16]						; insert ele_inptr and qselect
		TFIFO_BlockElements[ele_inptr, elements_to_assign]					; 3 insns, block the element, inc ele_inptr
		msgq_wait															; wait for prev msgq_send to complete
		alu[local_qele_count, elements_to_assign, +, local_qele_count]		; update local ele count for q
		alu[local_qele_count, const_0, +16, local_qele_count]				; limit number to 16 bits
		alu_shf[this_assign, this_assign, OR, fport_msg_id, <<24]			; insert msg_id
		alu_shf[$task_assignment1, this_assign, OR, elements_to_assign, <<21]	; insert element count assigned
		msgq_send[$task_assignment1, fport_task_msg_addr, fport_msg_id, ASYNC]	; 1 insn
tx_fast_done#:
#endm


// TxSched_SlowPort
//		schedule 100Mb port												27 insns
//
//		Narrows ports_rdy_to_proc to ports that also have new packets
//		(@ports_with_new_packets). If any remain, takes the first set bit as
//		current_port, binds it to the element at ele_inptr, claims that element,
//		and sends a one-element assignment to task_msg_addr.
//
//		Instruction order matters here: find_bset sits in the br=0 defer slot and
//		its result is collected by load_bset_result1 inside Ele_SetBinding a fixed
//		number of instructions later.
//		
#macro TxSched_SlowPort
	alu[ports_rdy_to_proc, @ports_with_new_packets, AND, ports_rdy_to_proc]
	br=0[end#], defer[1]												; if ports blocked, skip
	find_bset[ports_rdy_to_proc], clr_results							; (defer slot) find first bit set in 15:0
	Ele_SetBinding[current_port, ele_inptr]								; 8 insns, set current port fbset result, bind to element
	TFIFO_BlockElements[ele_inptr, 1]									; 3 insns, inc ele_inptr, decr elements_free by 1
	msgq_wait															; wait for prev msgq_send to complete
	TxSched_InsertQueue[current_port]									; 9 insns, get selected priority queue
	msgq_send[$task_assignment1, task_msg_addr, msg_id, ASYNC]			; 1 insn
end#:
#endm



// TxSched_TrySchedule
//		issue a macro function if tfifo elements free, 
//		otherwise let tx_arb run
//
//		function	name of the scheduling macro to expand (TxSched_SlowPort or
//					TxSched_FastPort in this file)
//
//		Steps: snapshot the auto-pushed tfifo outptr and ready bits (retrying while
//		the push_protect input state is set), free any elements the hardware has
//		finished with, yield once, then run function if elements_free > 0.
//
#macro TxSched_TrySchedule[function]
autopush_tx_rdy#:
	br_inp_state[push_protect, autopush_tx_rdy#], defer[2]				; loop while push_protect is set; both copies below run in the defer slots
	alu[latest_tfifo_outptr, --, B, $tfifo_outptr]
	alu[ports_rdy_to_proc, $tx_rdy_copy, AND~, ports_in_tfifo]			; get ports rdy, with packet, not blocked
	TFIFO_TryFreeElement[ele_outptr]									; 6 insns
	ctx_arb[voluntary]
	.if (elements_free > 0)
		function
	.endif
#endm


// transfer register aliases used by the scheduler (ctx 0) and the arbiter (ctx 1)
// $xfer0/$xfer1 receive the FBI autopush configured through xmit_rdy_ctl below:
// xmit_ptrs goes to $xfer0 and the ready bits go to $xfer1
.operand_synonym $tfifo_outptr $xfer0
.operand_synonym $tx_rdy_copy $xfer1
.operand_synonym $task_assignment1 $xfer4
.operand_synonym $task_assignment2 $xfer5
.operand_synonym $pwp $xfer6				; ports with 1 element in 15:0	(set by receive, cleared by tx_fill)
.operand_synonym $gig_ele_count $xfer7			; holds revolving count of elements for 8 FAST PORT queues

// fix the relative order of the transfer registers
.xfer_order $xfer0 $xfer1 $xfer2 $xfer3 $xfer4 $xfer5 $xfer6 $xfer7

//#define SCRATCH_ASSIGN

// --------------------------------startup------------------------------------
// All contexts start here. Context 0 becomes the transmit scheduler, context 1
// becomes the transmit arbitor, and any other context falls through and kills itself.
// 
	br=ctx[0, tx_scheduler#]		; transmit scheduler
	br=ctx[1, tx_arb#]				; transmit arbitor


// in hardware pass 0 these nops are needed instead of just ctx_arb[kill]to prevent a hang
kill_unused_contexts#:
	nop
	nop
	ctx_arb[kill]					; kill context 2 and 3
	


; on startup, branch here if this context is the transmit scheduler
;
; Initialises every register the scheduling macros read before they first write it,
; programs xmit_rdy_ctl so the FBI auto-pushes the ready bits and tfifo pointers to
; this thread, and primes both assignment message queues.
;
; ports_in_tfifo and local_qele_count are cleared here as well: the macros read
; both (blocked-port mask, fast port elements already assigned) and nothing else
; in this file gives them a starting value.
;
tx_scheduler#:
;
; initialize free element pointer
    immed[ele_inptr, 0]
    immed[ele_outptr, 0]
	immed[msg_id, 0]
	immed[fport_msg_id, 0]
	immed[const_7c, 0x7c]
	immed[task_msg_addr, XMIT_TASK_MSG_BASE]		; slow port assignments
	alu[fport_task_msg_addr, 18, +, task_msg_addr]	; fast port assignments
	immed32[single_assign, 0x1000]					; 1 element assignment
	immed[elements_free, 16]						; all 16 tfifo elements start free
	immed[ports_in_tfifo, 0]						; no port is blocked yet
	immed[local_qele_count, 0]						; no fast port elements assigned yet (arbiter starts its count at 0 too)
	immed_w0[bindings_15_12, 0xffff]				; all bindings start as 0x1f (unbound)
	immed_w0[bindings_11_6, 0xffff]
	immed_w0[bindings_5_0, 0xffff]
	immed_w1[bindings_15_12, 0xffff]
	immed_w1[bindings_11_6, 0xffff]
	immed_w1[bindings_5_0, 0xffff]
	immed[const_0, 0]

// setup transmit ready control to identify which thread fbi will auto_push ready bits to
// xmit_rdy_ctl 
// +--------+--------+-------+----+---------+-----+----------+
// |tfifo v |reserved|enab ap|when| pushcmd | sig |thread id |
// | 31:16  |  15:10 |   9   |  8 |  7:6    |  5  |   4:0    |
// +--------+--------+-------+----+---------+-----+----------+
// field			value
// enab ap		1	enable autopush
// when			0   after xmit_rdybits lo/hi have been assembled
// pushcmd		2	xmit_rdybits_lo<31:0> to $xfer1, xmit_ptrs to $xfer0
// sig			1	signal
// thread_id		20	tx_scheduler
//
// 0x2b4 = enab ap (bit 9) | pushcmd 2 (bits 7:6) | sig (bit 5) | thread id 20

.operand_synonym ii temp
	immed[$xfer2, 0x2b4]
write_rdy_ctl#:
	csr [write, $xfer2, xmit_rdy_ctl], ctx_swap				; swap out until the csr write completes

// initialize the message queue first address to be an invalid message
// the succeeding msgq_wait will consume the signal
//
	msgq_init[$task_assignment1, task_msg_addr, SYNC]
	msgq_init[$task_assignment1, fport_task_msg_addr, ASYNC]
 


// initializations done



//--------------------- transmit scheduling main loop --------------------------
// Runs forever: yield, then give the slow ports two scheduling attempts for every
// one attempt given to the fast port. Each attempt also frees finished elements
// (see TxSched_TrySchedule).
//
tx_sched_loop#:

	ctx_arb[voluntary]
	TxSched_TrySchedule[TxSched_SlowPort]						; schedule slow port
	TxSched_TrySchedule[TxSched_FastPort]						; schedule fast port
	TxSched_TrySchedule[TxSched_SlowPort]						; schedule slow port
	br[tx_sched_loop#]


//--------------------- end transmit scheduling main loop ----------------------


// fast port element count check
// Deliberate infinite loop used as a fatal-error trap. Nothing in this file
// branches to it.
bad_count#:															; if bit count from receive is bad, come here to die
	br[bad_count#]



//-------------------------------tx arbitor-------------------------------------
// on startup, branch here if this context is the transmit arbitor
//
// Initialises the absolute (@) registers shared with the scheduler, then loops
// forever copying two scratch words into them:
//   @ports_with_new_packets	<- word at XMIT_PWP_VECTOR
//   @remote_gig_ele_count		<- low 16 bits of the word at XMIT_FPORT1_ELE_COUNT
// The queue selects (@gig_que_select, @fast_ether_qselects) are set to 0 once and
// never changed in this version.
//
// @ports_with_new_packets is cleared during initialisation too, so the scheduler
// cannot act on an undefined value if it runs before the first scratch read has
// completed.
//

tx_arb#:

.local pwp_addr gig_ele_count_addr

// get ports with new packets and ports with empty queues

	immed[pwp_addr, XMIT_PWP_VECTOR]					; ports with new packets
	immed[gig_ele_count_addr, XMIT_FPORT1_ELE_COUNT]	; fast port elements queued
	immed[@gig_que_select, 0]							; assign gig qselect = 0
	immed[@fast_ether_qselects, 0]							; assign fast ether qselects = 0
	immed[@remote_gig_ele_count, 0]
	immed[@ports_with_new_packets, 0]					; nothing to send until the first read completes
	immed[const_0, 0]

tx_arb_loop#:
// tbd: for priority select, this would decide which queue is next for each port
// note: this version always picks the same queue for each port
// 

#ifdef FAST_PORT1
	scratch[read, $gig_ele_count, gig_ele_count_addr, 0, 1]		; no signal requested; the next read is the one waited on

#endif
	scratch[read, $pwp, pwp_addr, 0, 1], ctx_swap				; swap out until this read completes

	alu[@ports_with_new_packets, --, B, $pwp]				; local ports with 1 element to tx_scheduler	

#ifdef FAST_PORT1
	alu[@remote_gig_ele_count, const_0, +16, $gig_ele_count]					; FAST PORT element counts	
#endif

	br[tx_arb_loop#]

.endlocal
