//------------------------------------------------------------------------------------
//                                                                      
//                   I N T E L   P R O P R I E T A R Y                   
//                                                                       
//      COPYRIGHT (c)  1998-2000 BY  INTEL  CORPORATION.  ALL RIGHTS          
//      RESERVED.   NO  PART  OF THIS PROGRAM  OR  PUBLICATION  MAY      
//      BE  REPRODUCED,   TRANSMITTED,   TRANSCRIBED,   STORED  IN  A    
//      RETRIEVAL SYSTEM, OR TRANSLATED INTO ANY LANGUAGE OR COMPUTER    
//      LANGUAGE IN ANY FORM OR BY ANY MEANS, ELECTRONIC, MECHANICAL,    
//      MAGNETIC,  OPTICAL,  CHEMICAL, MANUAL, OR OTHERWISE,  WITHOUT    
//      THE PRIOR WRITTEN PERMISSION OF :                                
//                                                                       
//                         INTEL  CORPORATION                            
//                                                                      
//                      2200 MISSION COLLEGE BLVD                        
//                                                                       
//                SANTA  CLARA,  CALIFORNIA  95052-8119                  
//                                                                       
//------------------------------------------------------------------------------------
// rec_scheduler.uc
// receive scheduler 12 100Mbit ports
//
//
// system: SA1200
// subsystem: receive microcode
// usage: example
// author: dfh 4/27/98
// revisions:
//		dfh		10/23/98	add BUFFER_COUNT switch to enable larger freelists
//		dfh		12/16/98	BL4 remove include config.h
//
//
// ---------------------------SA1200 microcode--------------------------

// FBOX microcode assignment
//
//    1. the receive threads must be within threads 0-15
//    2. receive thread 0 always gets element 0, receive thread 1 gets element 1, etc.
//    3. receive scheduler is thread 16

// general design flow:

// 1. get port and control store configuration from memory
// 2. initialize registers used in the main loop
// 3. main loop
//
//  a) get thd_done and calculate rec_rdy_true
//          because rec_ready could indicate a port just assigned,
//          we need to delay making a new assignment to a port
//          until the FBI has indicated by incrementing CSR Rec_rdycnt
//          that the rec_ready is indeed fresh.
//     assume:
//     CSR Rec_rdybits are pushed to $xfer1
//     CSR Rec_rdycnt is pushed to $xfer0
//            
//
//  b) find_next_threads
//    check thread waits (all threads type 1 then type 2)
//          find first bit set in thread_done_copy
//                if found, look at 2 bit encode for that thread. 
//
//       1) wait encode = 0x3	eop
//               
//               get next free port that is not a continue and assign (Sched_WriteRR)
//
//       2) wait encode = 0x1	not eop continue
//				
//				check port ready and if so, assign (Sched_WriteRR)
//               
//
//  c) repeat the main loop


// key context registers:
//
// thread_done_copy			copy of thd_done_reg0 CSR.
// rec_rdy_true				calc from rec_rdy AND !prev_port_mask
// current_thread			current thread index
// this_rr					receive request while being assembled

;-----------------------begin---------------------
#include "mem_map.h"
#include "stdmac.uc"
#include "mem.uc"			// memory allocation macros
#include "refdes_macros.uc"


#ifndef FID
#define FID 4						; normally this is in FBOX 4
#endif

#define_eval CONTEXT2_TID (FID*4+2)

.xfer_order $xfer0 $xfer1 $xfer2 $xfer3 $xfer4 $xfer5 $xfer6 $xfer7

; startup dispatch: each context branches to its own entry point.
; Context 0 runs the receive scheduler, context 2 runs the service thread,
; and any context that takes neither branch falls through and is killed.
	br=ctx[0, rec_scheduler#]		; context 0: receive scheduler
	br=ctx[2, _context_2#]			; context 2: service thread, thread id CONTEXT2_TID (18 when FID is 4)

// in hardware pass 0 these nops are needed ahead of ctx_arb[kill] (instead of
// ctx_arb[kill] alone) to prevent a hang
kill_unused_contexts#:
	nop
	nop
	ctx_arb[kill]					; kill contexts 1 and 3


.local thread_done_capture thread_done_copy current_thread rec_rdy_count current_inv_thread current_port_mask fbi_req_outstanding thread_done_skips this_rr rec_rdycnt_copy current_inv_threadx2 current_threadx2 prev_port_mask rec_rdy_true

rec_scheduler#:
// Receive scheduler entry point (context 0).
//
// First step: program the receive ready control CSR so the fbi knows which
// thread it should autopush the ready bits to.
//
// rcv_rdy_ctl layout:
// +----------+-------+----+---------+-----+----------+
// |push count|enab ap|when| pushcmd | sig |thread id |
// |  12:10   |   9   |  8 |  7:6    |  5  |   4:0    |
// +----------+-------+----+---------+-----+----------+
// field			value
// push count	3	delay timer for both receive and xmit
// enab ap		1	enable autopush
// when			0   after rec_rdycnt has incremented	
// pushcmd		2	rec_rdybits_lo<31:0> to $xfer1, rec_rdycnt to $xfer0
// sig			1	signal
// thread_id		16	rec_scheduler (12 when built with XMIT_INIT)
//
// RCV_RDY_CTL_INIT is the assembled field value:
//		0xeb0	thread id 16
//		0xeac	thread id 12 (XMIT_INIT)

setup_ctl#:
#ifdef XMIT_INIT
#define RCV_RDY_CTL_INIT 0xeac
#else
#define RCV_RDY_CTL_INIT 0xeb0
#endif
	immed[$xfer2, RCV_RDY_CTL_INIT]
	csr[write, $xfer2, rcv_rdy_ctl], ctx_swap			// swap out until the csr write is done

// Freelist setup, using
//		#macro freelist_create[freelist_id, base_addr, stride, count]
//		see also: mem_map.h, stdmac.uc, stdfunc.uc
//
// XMIT_INIT builds create no freelist here; they only wait for the
// inter-thread signal from xmit init.
// All other builds create the freelist(s) themselves, sized by BUFFER_COUNT
// (2KB_packet_descriptors), which defaults to FREELIST_BTYPE_SRAM8_SDRAM2K_COUNT:
//		without ALT_BANKS	freelist 0 holds all BUFFER_COUNT descriptors
//		with ALT_BANKS		freelists 0 and 1 (odd and even banks) each hold half,
//							freelist 1 starting right behind freelist 0's descriptors
//
#ifdef XMIT_INIT
	ctx_arb[inter_thread]					; wait for signal from xmit init
#else	// not XMIT_INIT
#ifndef BUFFER_COUNT
	#define BUFFER_COUNT FREELIST_BTYPE_SRAM8_SDRAM2K_COUNT
#endif

#ifndef ALT_BANKS
freelist_create[0, SRAM_BUFF_DESCRIPTOR_BASE, 4, BUFFER_COUNT]

#else	// ALT_BANKS
#define_eval HALF_BUFFER_COUNT (BUFFER_COUNT / 2)
freelist_create[0, SRAM_BUFF_DESCRIPTOR_BASE, 4, HALF_BUFFER_COUNT]
#define_eval NEXT_BUFF_DESCRIPTOR_BASE (SRAM_BUFF_DESCRIPTOR_BASE + (HALF_BUFFER_COUNT * 4))
freelist_create[1, NEXT_BUFF_DESCRIPTOR_BASE, 4, HALF_BUFFER_COUNT]
#endif	// ALT_BANKS
#endif	// not XMIT_INIT

freelist_created#:

// set the autopush synch rate for 12 port version = 40
// (written through $xfer7; the thread swaps out until the csr write is done)
//
	immed[$xfer7, 40]
	csr[write, $xfer7, rdybus_synch_count_default], ctx_swap

// signal receive threads to go
// writes 0, 4 and 8 to inter_thd_sig
// NOTE(review): presumably these are the ids of the first receive thread in
// each receive microengine - confirm against the receive thread startup code
fast_wr[0, inter_thd_sig]
fast_wr[4, inter_thd_sig]
fast_wr[8, inter_thd_sig]

// program fbi bus mode and rdybus sequencer program for transmit/receive autopush and rdy bit polling
// note: Core can do this by writing to the FBI CSRs
// prerequisite: define constants as described in 
//		Fbus_SetupRdyProg,  refdes_macros.uc
// only assembled when REAL_RDYBUS_PROG is defined
//
#ifdef REAL_RDYBUS_PROG
setup_rdyprg#:
	Fbus_SetupRdyProg
#endif



// take cycles to verify that collection count is incrementing by comparing it 
// to thread_done_count
// (debug only: thread_done_count starts at -12 so that the 12 thread dones
// seen right after startup bring it to 0)
#ifdef CHECK_COUNT
	immed[tempa, 12]
	alu[thread_done_count, 0, -, tempa]		; we get 12 thread dones right away
	immed[coll_count_total, 0]									
	immed[coll_count, 0]									
#endif

// initialize every register the main loop reads before it writes.
// Without this the first pass would OR stale thread_done_skips bits into
// thread_done_copy (phantom thread dones) and mask rec_rdy with a stale
// prev_port_mask / current_port_mask.
	immed[fbi_req_outstanding, 0]								; initialize no fbi receive request in progress
	immed[thread_done_skips, 0]									; no threads skipped yet
	immed[current_port_mask, 0]									; no ports assigned in the current rec_rdycnt interval
	immed[prev_port_mask, 0]									; no ports assigned in the previous rec_rdycnt interval
	immed[rec_rdycnt_copy, 0]									; known starting value for the rec_rdycnt change detection

//---------------------begin the main loop --------------------------

// Top of the scheduler loop.
// fbi_req_outstanding is set to 1 when a receive request is about to be
// written to rcv_req with sig_done (see assemble_rr#); if it is still set
// here, the fbi signal for that write is consumed before going on.
rec_sched_loop#:
	alu[--, --, B, fbi_req_outstanding]								; test for previous receive request outstanding 
	br=0[read_thread_done#], defer[1], guess_branch					; if request is not outstanding skip the ctx_arb
	immed[fbi_req_outstanding, 0]									; defer slot (runs on both paths): clear request outstanding
	ctx_arb[fbi]													; wait on the fbi signal of the outstanding receive request

// Start reading the thread done CSR into $xfer2. The read is issued with
// sig_done, so execution continues; the result is waited for at the
// ctx_arb[fbi] that precedes process_thread_done#.
.operand_synonym $thread_done_reg0 $xfer2
read_thread_done#:
	csr[read, $thread_done_reg0, thread_done_reg0], sig_done		; 32 bits, 2 per thread


// the fbi autopushes rdy_count and rec_ready to $xfer0 and $xfer1 respectively
// (pushcmd field programmed at setup_ctl#)
// NOTE(review): this comment used to say "configured for 16 ports" while the
// rest of the file describes the 12 port version - confirm which is intended
//
.operand_synonym $rec_rdy_count $xfer0
.operand_synonym $rec_rdy $xfer1

// this push protect check prevents a collision of the read with a write
// at rd $xfer0 and $xfer1 regs
// the loop repeats, re-reading $rec_rdy_count in the defer slot each time,
// for as long as push_protect is asserted
//
autopush_rec_rdy_count#:
	br_inp_state[push_protect, autopush_rec_rdy_count#], defer[1]
	ld_field_w_clr[rec_rdy_count, 0001, $rec_rdy_count]					; defer slot: extract just the count (byte mask 0001)

// test to verify collection count (debug only)
// accumulates in coll_count_total the increments of the count field read
// from $rec_rdy_count >>8, adding 1<<8 when the new value is below the
// previous one (wrap)
#ifdef CHECK_COUNT
	ld_field_w_clr[tempa, 0001, $rec_rdy_count, >>8]
	alu[tempb, --, B, tempa]											; save
	.if (tempa < coll_count)											; if it has wrapped, add the carry
		alu[tempa, tempa, +, 1, <<8]
	.endif
	alu[tempa, tempa, -, coll_count]									; get difference
	alu[coll_count_total, coll_count_total, +, tempa]					; add diff to total
	alu[coll_count, --, B, tempb]										; save this collection count
#endif
// update port mask if rec ready count has incremented
// when the count changed, the ports assigned so far (current_port_mask)
// become prev_port_mask, which is used to mask the ready bits, and a fresh
// current_port_mask is started
 	alu[--, rec_rdy_count, -, rec_rdycnt_copy]				; compare new count with old count
	br=0[port_mask_done#], defer[1]							; if no change keep the port mask
	alu[rec_rdycnt_copy, --, B, rec_rdy_count]				; defer slot (runs on both paths): save a copy of fbi rec_rdy_count
	alu[prev_port_mask, --, B, current_port_mask]			; used to mask off rdy bits this iteration
	immed[current_port_mask, 0]								; if change, start with fresh port mask
port_mask_done#:


	ctx_arb[fbi]														; wait for prev read (thread_done_reg0, issued with sig_done) to finish

// Capture the thread done bits that were just read, write the captured value
// back to the thread_done_reg0 csr, and merge in the threads that were skipped
// during the previous pass (set_skips#) to form thread_done_copy.
process_thread_done#:
	ld_field_w_clr[thread_done_capture, 0111, $thread_done_reg0]		; take thread done snapshot (byte mask 0111)
    alu[$thread_done_reg0, --, B, thread_done_capture]					; write back to thread done to clear captured bits
	csr[write, $thread_done_reg0, thread_done_reg0]
	alu[thread_done_copy, thread_done_capture, OR, thread_done_skips]	; skip bits from prev top iteration
	br[find_next_thread#], defer[1]
	immed[thread_done_skips, 0]											; defer slot: clear skip bits (already merged above)


// set_skips#: re-mark the current thread's 2 bit field in thread_done_skips so
// it is merged back into thread_done_copy on the next pass through
// process_thread_done#, then fall through to look for the next thread.
set_skips#:														; this is entered if port is busy 
	alu[--, current_inv_threadx2, B, 0]							; setup left indirect shift (32 - threadx2)
	alu[thread_done_skips, thread_done_skips, OR, 0x3, <<indirect]

// find_next_thread#: find the first set bit in thread_done_copy; the bit
// position is 2 * thread id because there are 2 bits per thread.
// find_next_thread2# is the re-entry point from rx_assigned#, which has
// already issued the find_bset for 15:0 in its defer slot.
find_next_thread#:
	find_bset[thread_done_copy], clr_results					; find first bit set in 15:0
find_next_thread2#:
	find_bset[thread_done_copy, >>16]							; find first bit set in 31:16
// re-read the autopushed ready bits until push_protect is clear
autopush_rec_rdy#:
	br_inp_state[push_protect, autopush_rec_rdy#], defer[2]
	alu[rec_rdy_true, $rec_rdy, AND~, prev_port_mask]			; block ports just assigned last iteration
	nop
	nop
	load_bset_result1[current_threadx2]
	br=0[rec_sched_loop#], defer[1]								; no thread done set
	alu_shf[current_threadx2, current_threadx2, AND~, 1, <<8]	; defer slot: clear the valid bit
	alu_shf[current_thread, --, B, current_threadx2, >>1]		; current threadx2 / 2

// clear the wait encode 2 bits of the thread_done_copy

    alu_shf[current_inv_threadx2, current_threadx2, B-A, 1, <<5]; setup left indirect shift: (1<<5) - threadx2
	alu[--, current_inv_threadx2, B, 0]
	alu[thread_done_copy, thread_done_copy, AND~, 0x3, <<indirect]

	br_bclr_ind[rec_rdy_true, current_thread, set_skips#]		; stdmac.uc if port not ready (port = thread), skip
    alu_shf[current_inv_thread, current_thread, B-A, 1, <<5]	; 32 - thread id for later (used at rx_assigned#)

// now go ahead and write receive request

//----------------------------------------------------------------
assemble_rr#:

// present format of RCV_REQ (3/13/98 transactor) is
// +-+------+-----+----+-----+-----+-----+----+-----+------+-------+------+
// | |fetch9| msg |stat| elem| elem|seq# |1or2|misc | sig  | thread| port |
// | |      | pkt |    | #2  | #1  |     |    |     | sched|       |      |
// | |   29 |28:27| 26 |25:22|21:18|17:16| 15 |14:12|  11  | 10:6  | 5:0  |
// +-+------+-----+----+-----+-----+-----+----+-----+------+-------+------+
// this will cause FBI to set RCV_CNTL

// this code does one element xfers
// port, thread and element #1 are all filled with current_thread
 
	alu[--, --, B, fbi_req_outstanding]								; test for previous receive request outstanding 
	br=0[rec_req_avail_check#], defer[1], guess_branch				; if request is not outstanding skip the ctx_arb
	immed[fbi_req_outstanding, 1]									; defer slot (runs on both paths): set request outstanding
	ctx_arb[fbi]													; wait on the fbi signal of the previous receive request

// test to see if a receive request register has been made available by the FBI
// polls rec_req_avail; the defer slot (re)builds this_rr on every pass
rec_req_avail_check#:
	br_inp_state[rec_req_avail, got_rr_avail#], defer[1]

#ifdef FETCH9
	alu_shf[this_rr, current_thread, OR, 1, <<29]	; fetch 9, insert port num into new receive request
#else
	alu[this_rr, --, B, current_thread]				; insert port number (= current thread)
#endif
	br[rec_req_avail_check#]						; not available yet: poll again

got_rr_avail#:
// write receive request register
    alu_shf[this_rr, this_rr, OR, current_thread, <<6]				; insert thread num into new receive request

write_rr#:
    alu_shf[$xfer0, this_rr, OR, current_thread, <<18]				; insert element num into new receive request
    csr[write, $xfer0, rcv_req], sig_done							; send req to FBI; its signal is consumed later via fbi_req_outstanding
// A receive request has been issued for current_thread: record the port
// (= thread) in current_port_mask and go look for the next done thread.
rx_assigned#:

// this is quick and dirty, good for 5 billion fbus rec_req transactions
// (debug only) traps in coll_count_err# if coll_count_total falls behind thread_done_count
#ifdef CHECK_COUNT
	alu[thread_done_count, thread_done_count, +, 1]
	alu[--, coll_count_total, -, thread_done_count]
	br<0[coll_count_err#]											; tdone count should be behind or equal
#endif
	alu[--, current_inv_thread, B, 0]								; setup left indirect shift, port=thread
 	alu_shf[current_port_mask, current_port_mask, OR, 1, <<indirect]; block the port
	br[find_next_thread2#], defer[1]								; iterate
	find_bset[thread_done_copy], clr_results						; defer slot: find first bit set in 15:0


// error trap for the CHECK_COUNT test: spins here forever
coll_count_err#:
	br[coll_count_err#]

.endlocal

// service thread 
//		supports hash and memory functions calls from core
//		runs in context 2 of this microengine (see the startup dispatch)
//
.local to_core_mb from_core_mb bcopy_len len addr to from format my_thread_id func msvbufs thread_mb_base link_pc indirect_wr_len

// entry point: record this thread's id (CONTEXT2_TID = FID*4+2) and go wait
// for a call; Sv_WaitForCall# is not defined in this file
// NOTE(review): presumably it comes from one of the service thread includes below - confirm
_context_2#:
	immed[my_thread_id, CONTEXT2_TID]
	br[Sv_WaitForCall#]

// service thread microcode
#include "sv_msg.uc"
#include "sv_disp.uc"
#include "memsv.uc"
#include "hashsv.uc"

.endlocal