//------------------------------------------------------------------------------------
//                                                                      
//                   I N T E L   P R O P R I E T A R Y                   
//                                                                       
//      COPYRIGHT (c)  2000 BY  INTEL  CORPORATION.  ALL RIGHTS          
//      RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//      BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//      RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//      LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//      MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//      THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                       
//                         INTEL  CORPORATION                            
//                                                                      
//                      2200 MISSION COLLEGE BLVD                        
//                                                                       
//                SANTA  CLARA,  CALIFORNIA  95052-8119                  
//                                                                       
//------------------------------------------------------------------------------------
// rec_scheduler_f2.uc
// receive scheduler 2 Gig ports
//
//
// system: SA1200
// subsystem: receive microcode
// usage: example
// author: dfh 1/27/99
// revisions:
//
// ---------------------------SA1200 microcode--------------------------

// FBOX microcode assignment
//
//    1. the receive threads are threads 0-11
//    2. receive scheduler is thread 16-17

// general design flow:

// context 0 (thread 16)
//
// 1. initialize memory freelists and fast port mutex location on behalf of receive threads
// 2. initialize registers used in the main loop
// 3. main loop: find_next_threads
//
//     
//      1) find first bit set in thread_done_copy 15:0, representing receive threads 0-7
//       
//      2) find first bit set in thread_done_copy 23:16, representing receive threads 8-11
//
//		3) choose a port based on ready flags, attempting to alternate between ports
//
//		4) assemble and issue receive request for each receive thread done


// context 1 (thread 17)
//
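// polls (reads) the following CSRs and publishes them to context 0:
//	RCV_RDY_CNT			rec ready count (carries the fast port ready flags)
//	THREAD_DONE_REG0	receive thread done status, two bits per thread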

// key registers:
//
// thread_done_copy			copy of thd_done_reg0 CSR.
// current_thread			current thread index
// current_port				current port index
// this_rr					receive request while being assembled



//---------------------------------definitions-------------------------------
#include "mem_map.h"
#include "stdmac.uc"
#include "mem.uc"			// memory allocation macros
#include "refdes_macros.uc"

#define FAST_PORT_ENABLED



//-----------------------------------macros----------------------------------


// RxSched_SendF2ReceiveRequest									; normal case 8 cycles
//		send the receive request
//
//		Completes the receive request being assembled in this_rr and writes it
//		to the rcv_req CSR.
//
//		portnum				port number; ORed into this_rr (port field, bits 5:0 below)
//
//		Registers used from the enclosing scope (not macro parameters):
//		  this_rr				request under assembly; portnum is ORed into it here
//		  current_thread		ORed in at <<18 when the request is copied to $xfer0
//		  fbi_req_outstanding	tested for a previous request; always left set to 1
//		  $xfer0				transfer register that carries the request to the CSR
//
#macro RxSched_SendF2ReceiveRequest[portnum]

// present format of RCV_REQ (3/13/98 transactor) is
// +-+------+-----+----+-----+-----+-----+----+-----+------+-------+------+
// | |fetch9| msg |stat| elem| elem|seq# |1or2|misc | sig  | thread| port |
// | |      | pkt |    | #2  | #1  |     |    |     | sched|       |      |
// | |   29 |28:27| 26 |25:22|21:18|17:16| 15 |14:12|  11  | 10:6  | 5:0  |
// +-+------+-----+----+-----+-----+-----+----+-----+------+-------+------+
// this will cause FBI to set RCV_CNTL

; write receive request register
; this code does one element xfers
 
	alu[--, --, B, fbi_req_outstanding]							; test for previous receive request outstanding 
	br=0[rec_req_avail_check#], defer[1]						; if request is not outstanding skip the ctx_arb
	immed[fbi_req_outstanding, 1]								; set request outstanding (deferred instruction, so it is on both paths)
	ctx_arb[fbi]												; wait on the fbi signal (presumably the sig_done of the previous rcv_req write below)
rec_req_avail_check#:
; poll the rec_req_avail input state; this loop does not give up the context while it polls
	br_inp_state[rec_req_avail, _got_rr_avail#], defer[1]
	alu_shf[this_rr, this_rr, OR, portnum]						; insert port num into new receive request (repeated on every poll; ORing the same value again changes nothing)
 	br[rec_req_avail_check#]

_got_rr_avail#:
write_rr#:
    alu_shf[$xfer0, this_rr, OR, current_thread, <<18]			; insert element num into new receive request (current_thread is used as the element #1 value)
    csr[write, $xfer0, rcv_req], sig_done						; send req to FBI; sig_done requests a signal when the write is done
#endm


// RxSched_RequestFastPort
//		schedule the receive request for a gigabit port				; 14 insns
//		we believe it is ready, and can do a speculative assign (without blocking the port)
//
//		portnum				port number, passed through to RxSched_SendF2ReceiveRequest
//		current_threadx2	bit position of the thread's 2-bit field in thread_done_copy
//							(callers pass the find_bset result with the valid bit and bit 0 cleared)
//
//		Registers used from the enclosing scope (not macro parameters):
//		  current_thread		thread number placed in the request at <<6
//		  thread_done_copy		the serviced thread's two bits are cleared
//		  this_rr				overwritten with the new request
//		  last_was_fport1		bit 26 is toggled on every invocation
//		  current_inv_threadx2	scratch
//
#macro RxSched_RequestFastPort[portnum, current_threadx2]
update_tdone_copy#:
// update thread_done_copy
	alu_shf[current_inv_threadx2, current_threadx2, B-A, 1, <<5]	; setup left indirect shift: (1<<5) - current_threadx2
	alu[--, current_inv_threadx2, B, 0]								; clear the wait encode 2 bits of 
	alu[thread_done_copy, thread_done_copy, AND~, 0x3, <<indirect]	; the thread_done_copy
// insert sequence number for fast mpackets
	alu_shf[this_rr, --, B, current_thread, <<6]					; start a fresh request holding only the thread number at bit 6
	alu_shf[this_rr, this_rr, OR, 1, <<16]							; insert seq_field (fast port) into this receive request
	alu_shf[last_was_fport1, last_was_fport1, XOR, 1, <<26]			; toggle bit 26; done for every request, whichever port was chosen
//
// TBD: insert mpacket sequence number into element 2 field for correct sequenced 
//		operation of state save/restore
//
// send request
req_send#:
	RxSched_SendF2ReceiveRequest[portnum]								; 8 insns
#endm

// RxSched_ChooseFPort												; 8 insns
//		Pick the fast port for the next receive request, waiting (with voluntary
//		context swaps) until at least one fast port is ready.
//
//		current_port		out: set to FAST_PORT1 or FAST_PORT2
//		@rec_rdy_count		NOTE(review): the body reads @rec_rdy_count_reg directly;
//							this parameter name does not appear on its own in the body
//		last_was_fport1		alternation state; only bit 26 is meant to be set (it is
//							ORed with the ready count, so any other set bits would be
//							seen as ready flags)
//
//		Uses temp as scratch.
//
//		The jump index is taken from bits 26:24 of (ready count OR last_was_fport1):
//		  bit 24	fport1 ready
//		  bit 25	fport2 ready
//		  bit 26	alternation bit from last_was_fport1
//		Each table entry is two instructions, so the index is pre-multiplied by 2.
//		When both ports are ready the alternation bit decides: clear -> FAST_PORT1,
//		set -> FAST_PORT2. When only one port is ready that port is taken.
//
#macro RxSched_ChooseFPort[current_port, @rec_rdy_count, last_was_fport1]
start#:
	alu[temp, @rec_rdy_count_reg, OR, last_was_fport1]		; bits 26:24
	alu_shf[temp, 0xE, AND, temp, >>23]			; effectively shift left<<1 for jump stride 2
	jump[temp, port_choice#]
	nop
	nop
	nop
port_choice#:									
	ctx_arb[voluntary]							; 0, neither port ready
	br[start#]
	br[end#], defer[1]
	immed[current_port, FAST_PORT1]				; 1, only fport1 ready
	br[end#], defer[1]
	immed[current_port, FAST_PORT2]				; 2, only fport2 ready
	br[end#], defer[1]
	immed[current_port, FAST_PORT1]				; 3, last was fport2, both fports ready
	ctx_arb[voluntary]							; 4, neither port ready
	br[start#]
	br[end#], defer[1]
	immed[current_port, FAST_PORT1]				; 5, only fport1 ready
	br[end#], defer[1]
	immed[current_port, FAST_PORT2]				; 6, only fport2 ready
	br[end#], defer[1]
	immed[current_port, FAST_PORT2]				; 7, last was fport1, both fports ready: alternate to fport2
end#:
#endm



//----------------------------------startup----------------------------------
// Entry point for every context of this microengine: dispatch on context number.
// Contexts that match neither branch fall through and are killed.
	br=ctx[0, rec_scheduler#]					; receive scheduler
	br=ctx[1, tdone_rdybit_reader#]				; context to read thread done and rdy bits

// in hardware pass 0 these nops are needed instead of just ctx_arb[kill]to prevent a hang
kill_unused_contexts#:
	nop
	nop
	ctx_arb[kill]								; kill contexts 2-3


.xfer_order $xfer0 $xfer1 $xfer2


rec_scheduler#:

// Context 0: one-time initialisation, then the receive scheduler loop.
//
// Registers shared with context 1 (absolute registers):
//	@thread_done_capture	thread done bits accumulated by context 1, consumed and cleared here
//	@rec_rdy_count_reg		latest rec ready count read by context 1 (read in RxSched_ChooseFPort)
//
// Both are read (and @thread_done_capture is ORed into) before anything in this
// file gives them a value, so they are cleared here, before this context's first
// swap. A zero ready count makes RxSched_ChooseFPort wait until context 1 has
// stored a real value.
// NOTE(review): assumes registers are not guaranteed to be zero at reset - confirm.
//
	immed[@thread_done_capture, 0]								; no thread done bits captured yet
	immed[@rec_rdy_count_reg, 0]								; no fast port known to be ready yet


// setup receive ready control to identify which thread fbi will auto_push ready bits to
//
// +--------+----------+-------+----+---------+-----+----------+
// |1thread |push count|enab ap|when| pushcmd | sig |thread id |
// |  14    |  12:10   |   9   |  8 |  7:6    |  5  |   4:0    |
// +--------+----------+-------+----+---------+-----+----------+
// field			value
// 1thread		1	for fast port put the thread of receive request to receive control
// push count	3	delay timer for both receive and xmit
// enab ap		0	enable autopush
// when			0   after rec_rdycnt has incremented	
// pushcmd		0	rec_rdybits_lo<31:0> to $xfer1, rec_rdycnt to $xfer0
// sig			1	signal
// thread_id	16	rec_scheduler
//
// NOTE(review): the value written below, 0x4c10, has bit 5 (sig) clear, which
// does not match the "sig 1" row above. Confirm which of the two is intended.

setup_ctl#:
	immed[$xfer2, 0x4c10]		
	csr [write, $xfer2, rcv_rdy_ctl], ctx_swap 


#ifndef FAST_PORT1
#define FAST_PORT1 0
#endif

#ifndef FAST_PORT2
#define FAST_PORT2 8
#endif

	fast_wr[3, THREAD_DONE_INCR1]			; increment seq1 "" for first receive
	fast_wr[3, THREAD_DONE_INCR2]			; increment seq2 "" for first receive



// call	#macro freelist_create[freelist_id, base_addr, stride, count]
//		initialize freelist 0 for BUFFER_COUNT 2KB_packet_descriptors
//		see also: mem_map.h, stdmac.uc, stdfunc.uc
//
#ifndef BUFFER_COUNT
	#define BUFFER_COUNT FREELIST_BTYPE_SRAM8_SDRAM2K_COUNT
#endif
//
freelist_create[0, SRAM_BUFF_DESCRIPTOR_BASE, 4, BUFFER_COUNT]

freelist_created#:

// signal receive threads to go
fast_wr[0, inter_thd_sig]
fast_wr[4, inter_thd_sig]
fast_wr[8, inter_thd_sig]


// program fbi bus mode and rdybus sequencer program for transmit/receive autopush and rdy bit polling
// note: Core can do this by writing to the FBI CSRs
// prerequisite: define constants as described in 
//		Fbus_SetupRdyProg,  refdes_macros.uc
//
#ifdef REAL_RDYBUS_PROG
setup_rdyprg#:
	Fbus_SetupRdyProg
#endif


// initialize registers used in the main loop
// thread_done_copy, thread_done_skips and last_was_fport1 are all read by the
// loop (OR / XOR with their own previous value) before anything else in this
// file writes them, so give them a defined starting value here.
	immed[fbi_req_outstanding, 0]								; initialize no fbi receive request in progress
	immed[thread_done_copy, 0]									; no thread done bits pending
	immed[thread_done_skips, 0]									; no skipped threads to recover
	immed[last_was_fport1, 0]									; alternation bit (bit 26) clear; the other bits must stay 0
																; because the register is ORed with the ready count

	immed[const_0x101, 0x101]									; mask: find_bset valid bit (bit 8) and low order bit

// context 0: job is to select thread/port  and write receive requests
//
// wait here until context 1 has captured at least one thread done bit
rx_sched_first#:
	ctx_arb[voluntary]											; allow other contexts to run
	alu[--, --, B, @thread_done_capture]
	br=0[rx_sched_first#]

// major loop: fold newly captured (and previously skipped) done bits into thread_done_copy
rx_sched_major#:
	ctx_arb[voluntary]
	alu[thread_done_copy, thread_done_copy, OR, @thread_done_capture]
	immed[@thread_done_capture, 0]									; clear captured bits
	alu[thread_done_copy, thread_done_copy, OR, thread_done_skips]	; recover skipped threads due to port not ready
	immed[thread_done_skips, 0]										; clear skip bits

//--------------------------rec scheduler loop -------------------------------

rx_find_next_thread#:
	find_bset[thread_done_copy], clr_results					; find first bit set in 15:0
rx_find_next_thread2#:
	find_bset[thread_done_copy, >>16]							; find first bit set in 31:16
	RxSched_ChooseFPort[current_port, @rec_rdy_count, last_was_fport1]	; waits until a fast port is ready

// service the thread found in bits 15:0
// NOTE(review): when nothing is found in 15:0 the loop returns to rx_sched_major#
// without looking at the 31:16 result - confirm this is intended.
rx_service_thread_a#:
	load_bset_result1[current_threadx2a]
	br=0[rx_sched_major#], defer[1]							; no thread done set in 15:0
	load_bset_result2[current_threadx2b]
	alu_shf[current_threadx2a, current_threadx2a, AND~, const_0x101]	; clear the valid bit and low order bit
	alu_shf[current_thread, --, B, current_threadx2a, >>1]				; current threadx2 / 2
	RxSched_RequestFastPort[current_port, current_threadx2a]			; 14 insns, schedule receive request for the thread found in 15:0

// service the thread found in bits 31:16
rx_service_thread_b#:
	alu[--, --, B, current_threadx2b]									; test 23:16
	br=0[rx_find_next_thread#], defer[1]								; no thread done in 23:16
	alu_shf[current_threadx2b, current_threadx2b, AND~, const_0x101]	; clear the valid bit and low order bit
	alu_shf[current_thread, --, B, current_threadx2b, >>1]				; current threadx2 / 2
	RxSched_ChooseFPort[current_port, @rec_rdy_count, last_was_fport1]	; 8 insns
	// pass current_threadx2b so that the done bits cleared in thread_done_copy are
	// those of the thread being serviced here, not those of the 15:0 thread again
	RxSched_RequestFastPort[current_port, current_threadx2b]			; 14 insns, schedule receive request for the thread found in 31:16

rx_iterate#:
	br[rx_find_next_thread2#], defer[1]									; iterate
	find_bset[thread_done_copy], clr_results							; find first bit set in 15:0


//---------------------end minor loop --------------------------



//---------------------context 1 --------------------------
// context 1: job is to read the THREAD_DONE_REG0 CSR to get receive thread done status
//				and to read rec ready count which contains the fast ready bits
//
// Results are handed to context 0 through two absolute registers:
//	@thread_done_capture	done bits are ORed in here; context 0 clears it after use
//	@rec_rdy_count_reg		overwritten with the latest RCV_RDY_CNT value
//
.operand_synonym $rec_rdy_count $xfer0
.operand_synonym $thread_done_reg0 $xfer1

tdone_rdybit_reader#:
tdone_rdybit_reader_loop#:													
	csr[read, $thread_done_reg0, THREAD_DONE_REG0], ctx_swap	; thread done two bits per thread
	alu[$thread_done_reg0, --, B, $thread_done_reg0]			; write back to thread done to clear captured bits
	csr[write, $thread_done_reg0, THREAD_DONE_REG0], ctx_swap, defer[1]
 	alu[@thread_done_capture, @thread_done_capture, OR, $thread_done_reg0]	; take thread done snapshot (deferred instruction of the csr write above)
	ctx_arb[voluntary]											; let the other context run between the two CSR accesses
	nop															; NOTE(review): the reason for this nop is not stated
	csr[read, $rec_rdy_count, RCV_RDY_CNT], ctx_swap			; read rec ready count
	br[tdone_rdybit_reader_loop#], defer[1]
	alu_shf[@rec_rdy_count_reg, --, B, $rec_rdy_count]			; publish the ready count for RxSched_ChooseFPort (deferred instruction of the branch)


