//------------------------------------------------------------------------------------
//                                                                      
//                   I N T E L   P R O P R I E T A R Y                   
//                                                                       
//      COPYRIGHT (c)  1998-2000 BY  INTEL  CORPORATION.  ALL RIGHTS          
//      RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//      BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//      RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//      LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//      MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//      THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                       
//                         INTEL  CORPORATION                            
//                                                                      
//                      2200 MISSION COLLEGE BLVD                        
//                                                                       
//                SANTA  CLARA,  CALIFORNIA  95052-8119                  
//                                                                       
//------------------------------------------------------------------------------------
// rec_scheduler2.uc
// receive scheduler 16 100Mbit ports
//
//
// system: SA1200
// subsystem: receive microcode
// usage: example
// author: dfh 4/27/98
// revisions:
//		dfh		10/1/98		support all 16 100M ports
//		dfh		12/16/98	BL4 remove include config.h
//
// ---------------------------SA1200 microcode--------------------------

// Assembly time DEFINE switches
//		BUFFER_COUNT				how many 2KB packet buffers to allocate
//		FETCH9						if 1, set bit 29 on in receive request


// FBOX microcode assignment
//
//    1. the receive threads are threads 0-11
//    2. receive scheduler is thread 16

// general design flow:

// 1. initialize memory freelists
// 2. initialize registers used in the main loop
// 3. major loop:
//
//  a) get thd_done and calculate rec_rdy_true
//          because there is a delay on receiving rec_ready from the fbi,
//          we will delay making a new assignment to a port if it was assigned on previous iteration
//     assume:
//     CSR Rec_rdycnt is pushed to $xfer0
//     CSR Rec_rdybits are pushed to $xfer1
// 
//	For an iteration of the major loop, we expect to amortize these 30 cycles in the minor loop,
//	which picks up two thread dones per iteration.
//
//  b) minor loop: find_next_threads
//     
//      1)  find first bit set in thread_done_copy 15:0 and 31:16
//       
//		2) process two thread dones, alternating between the fast port and any slow port   


// key registers:
//
// thread_done_copy			copy of thd_done_reg0 CSR.
// current_thread			current thread index
// current_port				current port index
// rec_rdy_true				calc from rec_rdy AND !port_mask0 AND !port_mask1
// this_rr					receive request while being assembled

//---------------------------------definitions-------------------------------
#include "mem_map.h"
#include "stdmac.uc"
#include "mem.uc"			// memory allocation macros
#include "refdes_macros.uc"


//----------------------------------startup----------------------------------
// Entry point for every context on this microengine.
// Contexts 0 and 1 branch to their own routines; any context that takes
// neither branch falls through to kill_unused_contexts# and is terminated.
//
// Declare the transfer registers used below, in this order.
.xfer_order $xfer0 $xfer1 $xfer2


	br=ctx[0, rec_scheduler#]		; receive scheduler
	br=ctx[1, tdone_rdybit_reader#]	; context to read thread done and rdy bits

// in hardware pass 0 these nops are needed instead of just ctx_arb[kill]to prevent a hang
kill_unused_contexts#:
	nop
	nop
	ctx_arb[kill]					; kill contexts 2-3


// Context 0 entry point: one-time initialization, then the scheduler loop.
rec_scheduler#:

// setup receive ready control to identify which thread fbi will auto_push ready bits to
//
// +--------+----------+-------+----+---------+-----+----------+
// |1thread |push count|enab ap|when| pushcmd | sig |thread id |
// |  14    |  12:10   |   9   |  8 |  7:6    |  5  |   4:0    |
// +--------+----------+-------+----+---------+-----+----------+
// field		description
// 1thread		for fast port put the thread of receive request to receive control
// push count	delay timer for both receive and xmit
// enab ap		enable autopush
// when			after rec_rdycnt has incremented	
// pushcmd		push nothing
// sig			signal
// thread_id	rec_scheduler
//
// NOTE(review): the table above describes the register layout; the code below
// writes 0 to the whole register, so every field is cleared. Ready bits are
// instead polled by context 1 (tdone_rdybit_reader#).

setup_ctl#:
	immed[$xfer2, 0]							// autopush nothing and signal nobody
	// Write the CSR and swap out until the write completes.
	csr [write, $xfer2, rcv_rdy_ctl], ctx_swap 


// call	#macro freelist_create[freelist_id, base_addr, stride, count]
//		initialize freelist 0 for BUFFER_COUNT 2KB_packet_descriptors
//		see also: mem_map.h, stdmac.uc, stdfunc.uc
//
// Assembly-time variants:
//   XMIT_INIT defined     - no freelist is created here; this context waits for
//                           an inter-thread signal from the transmit init code.
//   XMIT_INIT not defined - this context creates the freelist(s) itself:
//       ALT_BANKS defined     - two freelists (0 and 1), each holding half of
//                               BUFFER_COUNT descriptors, laid out back to back
//                               from SRAM_BUFF_DESCRIPTOR_BASE.
//       ALT_BANKS not defined - a single freelist 0 holding BUFFER_COUNT descriptors.
// BUFFER_COUNT defaults to FREELIST_BTYPE_SRAM8_SDRAM2K_COUNT when not supplied.
//
#ifdef XMIT_INIT
	ctx_arb[inter_thread]					; wait for signal from xmit init
#else	// not XMIT_INIT
#ifndef BUFFER_COUNT
	#define BUFFER_COUNT FREELIST_BTYPE_SRAM8_SDRAM2K_COUNT
#endif

#ifdef ALT_BANKS
// create freelists 0 and 1 for odd and even banks
// NOTE(review): HALF_BUFFER_COUNT is an integer division; if BUFFER_COUNT is odd
// one descriptor is left unused - confirm BUFFER_COUNT is always even.
#define_eval HALF_BUFFER_COUNT (BUFFER_COUNT / 2)
freelist_create[0, SRAM_BUFF_DESCRIPTOR_BASE, 4, HALF_BUFFER_COUNT]
// Second freelist starts right after the first one (stride 4 per descriptor).
#define_eval NEXT_BUFF_DESCRIPTOR_BASE (SRAM_BUFF_DESCRIPTOR_BASE + (HALF_BUFFER_COUNT * 4))
freelist_create[1, NEXT_BUFF_DESCRIPTOR_BASE, 4, HALF_BUFFER_COUNT]

#else	// not ALT_BANKS
freelist_create[0, SRAM_BUFF_DESCRIPTOR_BASE, 4, BUFFER_COUNT]
#endif	// not ALT_BANKS
#endif	// not XMIT_INIT

// Reached once the packet-buffer freelists exist (created above or by xmit init).
freelist_created#:
// set the autopush synch rate for 16 port version = 100
//
// NOTE(review): $xfer7 is not listed in the .xfer_order directive at the top of
// the file ($xfer0 $xfer1 $xfer2) - confirm the assembler allocates it as intended.
	immed[$xfer7, 100]
	csr[write, $xfer7, rdybus_synch_count_default], ctx_swap

// signal receive threads to go
// Three inter-thread signals are written, with values 0, 4 and 8.
// NOTE(review): presumably these are the thread ids being signalled (first
// context of each receive microengine) - confirm against the receive microcode.
fast_wr[0, inter_thd_sig]
fast_wr[4, inter_thd_sig]
fast_wr[8, inter_thd_sig]


// program fbi bus mode and rdybus sequencer program for transmit/receive autopush and rdy bit polling
// note: Core can do this by writing to the FBI CSRs
// prerequisite: define constants as described in 
//		Fbus_SetupRdyProg,  refdes_macros.uc
//
// The ready-bus program is only loaded from microcode when REAL_RDYBUS_PROG is
// defined at assembly time; otherwise it is left to whoever else programs the FBI.
#ifdef REAL_RDYBUS_PROG
setup_rdyprg#:
	Fbus_SetupRdyProg
#endif

// Initialize registers that stay live across the scheduler loop.
	immed[fbi_req_outstanding, 0]								; initialize no fbi receive request in progress

// const_0x101 is the mask used with AND~ in the scheduler loop to strip
// bits 8 and 0 from a find_bset result.
	immed[const_0x101, 0x101]
// NOTE(review): port_mask_fairness is not referenced again in this file;
// presumably it is consumed by the RxSched_* macros - confirm in refdes_macros.uc.
	immed[port_mask_fairness, 0xffff]

// context 0: job is to select thread/port  and write receive requests
//
// Startup gate: spin here, yielding to the other contexts on every pass, until
// context 1 has published both a non-zero thread-done capture and a non-zero
// receive-ready value. Each alu[--, --, B, reg] only sets the condition codes
// from reg so the following br=0 can test it; no register is written.
//
// NOTE(review): @thread_done_capture and @rec_rdy are not initialized anywhere
// in this file before being tested - confirm they are zero when this loop is
// first entered.
rx_sched_first#:
	ctx_arb[voluntary]											; allow other contexts to run
	alu[--, --, B, @thread_done_capture]
	br=0[rx_sched_first#]
	alu[--, --, B, @rec_rdy]
	br=0[rx_sched_first#]

// Major loop head: yield once, then rebuild the working set of "thread done"
// bits from (a) bits still pending from the previous pass, (b) new bits that
// context 1 captured, and (c) threads that were skipped earlier.
//
// The capture register is read and then cleared with no context swap in
// between, so context 1 cannot add bits between the two instructions
// (assumes contexts only switch at explicit swap points - TODO confirm).
//
// NOTE(review): thread_done_copy and thread_done_skips are used as source
// operands here but are not initialized in the visible part of this file;
// thread_done_skips is presumably written by RxSched_RequestSlowPort - confirm.
rx_sched_major#:
	ctx_arb[voluntary]
	alu[thread_done_copy, thread_done_copy, OR, @thread_done_capture]
	immed[@thread_done_capture, 0]									; clear captured bits
	alu[thread_done_copy, thread_done_copy, OR, thread_done_skips]	; recover skipped threads due to port not ready
	immed[thread_done_skips, 0]										; clear skip bits

//--------------------------rec scheduler loop -------------------------

// Minor loop: each pass issues two find_bset operations on thread_done_copy
// (low half, then high half shifted down by 16) and services up to two done
// threads, one per half.
//
// Entry points:
//   rx_find_next_thread#  - issues both find_bset operations itself.
//   rx_find_next_thread2# - used by the branch at the bottom of the loop, whose
//                           defer slot has already issued the low-half find_bset.
//
// RxSched_UpdatePortMask and RxSched_RequestSlowPort are macros defined outside
// this file (presumably refdes_macros.uc); their register side effects are not
// visible here.
rx_find_next_thread#:
	find_bset[thread_done_copy], clr_results					; find first bit set in 15:0
rx_find_next_thread2#:
	find_bset[thread_done_copy, >>16]							; find first bit set in 31:16
	RxSched_UpdatePortMask[port_mask_prev, port_mask_current]

// Service the thread found in the low half. If result 1 is zero there is
// nothing to schedule, so go back to the major loop and pick up new captures.
// The load of result 2 sits in the branch defer slot, so it executes whether or
// not the branch is taken.
rx_service_thread_a#:
	load_bset_result1[current_threadx2a]
	br=0[rx_sched_major#], defer[1]								; no thread done set
	load_bset_result2[current_threadx2b]
	alu_shf[current_threadx2, current_threadx2a, AND~, const_0x101]	; clear the valid bit and low order bit
	RxSched_RequestSlowPort[current_threadx2]					; ~27 insns, schedule receive request for slow thread

// Service the thread found in the high half. The alu_shf in the defer slot
// computes current_threadx2 even when the branch is taken; in that case the
// value is not used before being recomputed on the next pass.
// NOTE(review): the branch comment below says bits 23:16 while the find_bset
// comment above says 31:16 - confirm which range actually carries thread dones.
rx_service_thread_b#:
	alu[--, --, B, current_threadx2b]	
	br=0[rx_find_next_thread#], defer[1]						; no thread done in 23:16
	alu_shf[current_threadx2, current_threadx2b, AND~, const_0x101]	; clear the valid bit and low order bit
	RxSched_UpdatePortMask[port_mask_prev, port_mask_current]
	RxSched_RequestSlowPort[current_threadx2]					; ~27 insns, schedule receive request for slow thread
	// Loop back to the second entry point; the low-half find_bset for the next
	// pass is issued from the defer slot.
	br[rx_find_next_thread2#], defer[1]							; iterate
	find_bset[thread_done_copy], clr_results					; find first bit set in 15:0

//---------------------end rec scheduler loop --------------------------



//---------------------context 1 --------------------------
// context 1: job is to read the THREAD_DONE_REG0 register to get receive thread done status
//				and to read ready bits and ready count
//
// Results are published to context 0 through the absolute registers
// @thread_done_capture, @rec_rdy_count and @rec_rdy.
//
// Note: $rec_rdy and $thread_done_reg0 are both synonyms for $xfer1. They are
// used in separate phases of the loop, so each value must be consumed before
// the next CSR read into $xfer1 is issued.
.operand_synonym $rec_rdy_count $xfer0
.operand_synonym $rec_rdy $xfer1
.operand_synonym $thread_done_reg0 $xfer1

tdone_rdybit_reader#:
	// Zero constant used as the A operand of the +8 / +16 ALU ops below.
	immed[const0, 0]
tdone_rdybit_reader_loop#:													; 7 instr overhead
	csr[read, $thread_done_reg0, THREAD_DONE_REG0], ctx_swap	; thread done two bits per thread
	alu[$thread_done_reg0, --, B, $thread_done_reg0]			; write back to thread done to clear captured bits
	// The OR into the capture register sits in the defer slot of the CSR write,
	// so it executes before this context swaps out.
	csr[write, $thread_done_reg0, THREAD_DONE_REG0], ctx_swap, defer[1]
 	alu[@thread_done_capture, @thread_done_capture, OR, $thread_done_reg0]	; take thread done snapshot
	ctx_arb[voluntary]
// must read these two together
// Both reads are issued back to back; only the second one swaps out.
	csr[read, $rec_rdy_count, RCV_RDY_CNT]
	csr[read, $rec_rdy, RCV_RDY_LO], ctx_swap
	// NOTE(review): presumably +8 / +16 add only the low 8 / 16 bits of the
	// transfer register to const0 (zero), i.e. a masked copy - confirm against
	// the instruction set reference.
	alu[@rec_rdy_count, const0, +8, $rec_rdy_count]
	// The @rec_rdy update sits in the defer slot of the loop branch.
	br[tdone_rdybit_reader_loop#], defer[1]
	alu[@rec_rdy, const0, +16, $rec_rdy]


