;;; -*- syntax: common-lisp; package: clm; base: 10; mode: lisp -*-

(in-package :clm)

;;;
;;; A monitor, debugger, and associated run-time library of support routines for the 56000 compiler in
;;; code56.lisp.  The assembler is dsp56.lisp.  C code to talk to the chip is in next56.c (or macintosh56.c).
;;; Some referenced Ariel QP code is in qplib56.lisp (stubs in qpstubs.lisp).
;;;
;;; 
;;; ------------------- 56000 MONITOR --------------------------------
;;;
;;; A monitor for the 56000 -- closely modelled on /NextLibrary/Documentation/Ariel/degnext.asm
;;; All we get from the Next DSP world is DSPOpenNoBoot and a memory-mapped "host interface".

(defconstant fractional-one #x7fffff)	; (24 bit 2's complement 1.0-epsilon)

;;; Host Command Vectors
(defconstant BREAK-HC #x24)		; drop into debugger
#+QP (defconstant QP-HC #x26)		; location of QP-merge host command (or R-T loc if not QP-monitor)
#+QP (defconstant QP-MRG #x28)
(defconstant RESTART-HC #x2A)		; restart instrument
(defconstant OUT-HC #x2C)		; forced output

(defconstant DEGMON #x2E)		; P mem locs used for temp storage by degmon
					; #x3E=illegal instruction (just after last of user HC locs)

(defun Initialize-monitor-names ()	; separate from monitor itself to help debugger find symbols
  (emit '(DEFINE M-BCR   #xFFFE))	; Port A bus control register 
  (emit '(DEFINE M-PBC   #xFFE0))	; Port B control register
  (emit '(DEFINE M-PBDDR #xFFE2))	; Port B data direction register
  (emit '(DEFINE M-PBD   #xFFE4))	; Port B data register
  (emit '(DEFINE M-PCC   #xFFE1))	; Port C control register
  (emit '(DEFINE M-PCDDR #xFFE3))	; Port C data direction register
  (emit '(DEFINE M-PCD   #xFFE5))	; Port C data register
  (emit '(DEFINE M-HCR   #xFFE8))	; host control register
  (emit '(DEFINE M-HSR   #xFFE9))	; host status register
  (emit '(DEFINE M-HRX   #xFFEB))	; host receive data register
  (emit '(DEFINE M-HTX   #xFFEB))	; host transmit data register
  (emit '(DEFINE M-HRIE  0))		; host receive interrupt enable
  (emit '(DEFINE M-HTIE  1))		; host transmit interrupt enable
  (emit '(DEFINE M-HCIE  2))		; host command interrupt enable (CVR set to #x12 by reset)

  (emit '(DEFINE M-HF2   3))		; host flag 2
  (emit '(DEFINE M-HF3   4))		; host flag 3

  ;; HF3="at a breakpoint" flag (i.e. in the monitor awaiting attention of user or lisp).
  ;; HF2="data ready" flag (i.e. in .flush-buffers ready to send blocks of output).
  ;; HF2&HF3="at .input" flag (for random access IO).
  ;; both 0 = "I'm busy, or I'm not in use"
  ;; I could use about 10 more of these bits!!

  (emit '(DEFINE M-HRDF  0))		; host receive data full
  (emit '(DEFINE M-HTDE  1))		; host transmit data empty
  (emit '(DEFINE M-HCP   2))		; host command pending
  (emit '(DEFINE M-HF    #x18))		; host flag mask
  (emit '(DEFINE M-HF0   3))		; host flag 0
  (emit '(DEFINE M-HF1   4))		; host flag 1
  (emit '(DEFINE M-DMA   7))		; DMA status
  (emit '(DEFINE M-IPR   #xFFFF))	; interrupt priority register
  (emit '(DEFINE fractional-one #x7fffff))
  (emit '(DEFINE <nil>   #x10000)) 
  (emit '(DEFINE <t>     0))
  (emit '(DEFINE temp-loc 1))		;L side (these are the only reserved data memory locations)
  (emit '(DEFINE temp-loc-1 2))		;all three are used as temporary storage by the run-time library
  (emit '(DEFINE temp-loc-2 3))
  (emit `(DEFINE DEGMON    ,DEGMON))	
  (emit `(DEFINE DE-HPD    ,DEGMON))
  (emit `(DEFINE DE-FLAG   ,(+ DEGMON 1)))
  (emit `(DEFINE DE-HPD2   ,(+ DEGMON 2)))
  (emit `(DEFINE DE-FLAG2  ,(+ DEGMON 3)))
  (emit `(DEFINE DE-SR     ,(+ DEGMON 4)))
  (emit `(DEFINE DE-PC     ,(+ DEGMON 5)))
  (emit `(DEFINE DE-IPR    ,(+ DEGMON 6)))
  (emit `(DEFINE DE-SR2    ,(+ DEGMON 7))))

(defun get-dsp-monitor (&optional (QP-mon nil))
  (Init-emit)				; initialize the assembly environment
  (Initialize-monitor-names)		; first names from ioequ.asm
  #+QP (QP-initialize-names)		; QP names (qplib56.lisp)
  (emit '(ORG 0))
  (emit '(JMP DE-RESET))		; hardware reset vector (don't make these jumps SHORT)
  (emit '(JMP DE-TRACER))		; stack error
  (emit '(JSR DE-TRACER))		; TRACE interrupt handler (i.e. single-stepping)
  (emit '(JSR DE-TRACER))		; SWI handler
  (emit '(ORG #x1E))			; skip all NOP interrupt vectors
					;(IRQA IRQB SSI (4) SCI (5) then...
  (emit '(JSR DE-TRACER))		; NMI ("reserved for hardware development")
					;then Host receive data, host transmit data, then...
  (emit '(ORG #x24))			;host command to force jump to debugger
  (emit '(JSR .break))
  (emit '(ORG #x26))
  (if QP-mon
      (progn
	#+QP (emit '(JSR .QP-merge-monitor)) ;QP-merge command (when QP DRAM used as cache)
	#+QP (emit '(ORG #x28))
	#+QP (emit '(JSR .QP-master-flush-and-clear-DRAM))
	)
    (emit '(JSR .R-T)))

  (emit '(ORG #x2A))
  (emit '(JSR .User-Start))		;HC to restart an already-loaded instrument
  (when (not QP-mon)
    (emit '(ORG #x2C))
    (emit '(JSR .force-flush-buffers)))
    
  (emit `(ORG ,DEGMON))			;currently #x2E -- overwrite all the unused host commands
  (emit '(DE-RESET))			;this block over-written as temp storage by break-point handler
  (emit '(LOAD R0 0 SHORT))
  #-MultiSound (emit '(STORE R0 X-IO M-BCR))		; no wait states in any memory
  #+MultiSound (emit '(STORE #x3330 X-IO M-BCR))
  (emit '(STORE R0 X-IO M-HCR))		; clear host control register
  (emit '(STORE R0 P DE-FLAG SHORT))	; DEGMON registers
  (emit '(STORE R0 P DE-FLAG2 SHORT))
  (emit '(STORE R0 P DE-IPR SHORT))
  (emit '(STORE R0 P DE-SR2 SHORT))
  (emit '(LOAD A1 #xBF080))		; JSR
  (emit '(STORE A1 P 0 SHORT))
  (emit (LIST 'LOAD 'A1 (+ dsp-pc 2) 'SHORT)) ; (it appears that RESET vector is now JSR DE-RUN [changed to DE-RSTC])
  (emit '(STORE A1 P 1 SHORT))		; JSR destination
  (emit '(JMP DE-RSTC SHORT))		; enter main program loop
  
  (emit '(.break))
  (emit '(DE-TRACER))
  (emit '(MOVE X-IO M-HSR P DE-FLAG))	; flush out host interface
  (emit '(MOVE X-IO M-HRX P DE-HPD))
  (emit '(MOVE X-IO M-HSR P DE-FLAG2))
  (emit '(MOVE X-IO M-HRX P DE-HPD2))
  (emit '(STORE SSL P DE-SR SHORT))	; old SR
  (emit '(STORE SSH P DE-PC SHORT))	; old PC -- this auto-decrements SP -- unlike DEGMON, we will restore it
  (emit '(LOAD SSH P DE-PC SHORT))
  (emit '(LOAD SSL P DE-SR SHORT))
  (emit '(BSET M-HF3 X-IO M-HCR))	; set HF3 -- signal to outer world that we've hit a break-point
  
  (emit '(break-loop))			; sit in infinite loop executing commands loaded down from host

  #+mcl (emit '(BSET 0 X-IO M-PBC))	; (re-)activate host port just in case
					; (this sets carry bit causing me confusion during testing)
  (emit '(JCLR M-HRDF X-IO M-HSR break-loop))
  (emit '(MOVE X-IO M-HRX P uop-1))	;treat value received as one-word instruction
  (emit '(loop-2))
  (emit '(JCLR M-HRDF X-IO M-HSR loop-2))
  (emit '(MOVE X-IO M-HRX P uop-2))	;two words total (room for 2-word instruction)
  (emit '(uop-1))
  (emit '(NOP))
  (emit '(uop-2))
  (emit '(NOP))
  (emit '(JMP break-loop SHORT))	;stay in debugger loop until explicit RTS or whatever
  (emit '(HIDE break-loop loop-2))

  (emit '(DE-RSTC))			; normally execution starts here (reset clears some things and jumps here)
  (emit '(BSET 0 X-IO M-PBC))		; activate host port
  (emit '(STORE #x0C00 X-IO M-IPR))	; disable most interrupts -- leaves HC enabled
  (emit '(BSET M-HCIE X-IO M-HCR))	; enable host command interrupts 
					; (used here only to force a drop into the debugger)
  (emit '(BCLR M-HF3 X-IO M-HCR))	; clear HF3
  (emit '(BCLR M-HF2 X-IO M-HCR))	; clear HF2

  ;; M-PCC = 0 at reset, so all pins default to general purpose IO
  #+Next  (emit '(BSET 3 X-IO M-PCDDR))	; enable pc3 as output (see degmon.asm on /usr/lib/dsp/smsrc)
  #+Next  (emit '(BCLR 3 X-IO M-PCD))	; clear to enable external RAM -- that is, if these two commands
					;   are omitted, you only have 1K total memory (internal
					;   P memory (512 words) and 256 each of X and Y).  With this, the
					;   three memory types are overlayed according to the
					;   two images shown in Appendix H of sysrefman:
					;   NextLibrary/Documentation/NeXT/SysRefMan/ApH_DSPDetails.wn
					;   (P mem overlays X mem)
					;   The clm world ignores Next's image 1 entirely.  The same map
					;   works for the QP board, the 32K memory expansion module from NeXT,
					;   but the 192K module from SFSU is laid out like the Digidesign
					;   Sound Accelerator -- all the address lines are "real", and
					;   external memory is shadowed on the low side by the 56000's 
					;   internal memory (which we want to use as much as possible to
					;   avoid bus contention), and on the high side by the various
					;   memory-mapped registers above #xFFC0.
  #+mcl (emit '(STORE 5 X-IO M-PCDDR))  ; These are taken from Digidesign code -- I've forgotten what the bits mean.
  #+mcl (emit '(STORE 1 X-IO M-PCD))
  
  (emit '(ORI 2 MR))			; set I1 (in status register 6-9)
  (emit '(ANDI #xFE MR))		; clear I0  => exceptions IPL 0 and 1 are now masked
					; OMR (different from MR) is 2 at start-up (set by NeXT?) = normal expanded mode
					;   DE=0, so data ROMs are disabled
  ;; this block of NOP's is for QP (Ariel QuintProcessor) use 
  #+QP (progn
	 (emit '(NOP))
	 (emit '(NOP))
	 (emit '(NOP))
	 (emit '(NOP)))

  (emit '(DE-END))			;fall into user program (actually into the loader)
  (emit '(HIDE DE-TRACER DE-RSTC DE-END DE-RESET UOP-1 UOP-2))
  (if (> dsp-pc Internal-P-size)
      (error "This monitor will not load correctly -- pc = ~D" dsp-pc))
  dsp-pc)				; return end of monitor (around P 68)

;;; after booting, R1=49152 and R2=65513 (FFE9=M-HSR), so apparently Motorola's bootstrap routine uses them.
;;; see test-prog and dsp-debug for examples of using the monitor.


;;; DSP DEBUGGING AIDS (also see debugger at end of this file) ------------------------------------

(defun dsp-set (&optional (slot 0) (dsp 0))
  (setf current-slot slot)
  (setf current-dsp dsp)
  (set-current-active-dsp (if (zerop slot) 0 (1+ (+ (* (1- (/ slot 2)) 5) dsp))))
  #+QP (if (> dsps 1) (qp-set-current-dsp slot dsp))
  )

(defun dsp-debug (&optional (slot 0) (dsp 0) (memmap #xe00000))
  #-QP (declare (ignore memmap))
  (if (zerop slot) (dsp-close))
  (initialize-everything_56)
  (let ((monend (- (get-dsp-monitor) 4)))
    #-QP (declare (ignore monend))
    (emit '(NOP))
    (emit '(NOP))
    (emit '(JSR .break))
    #+QP (when (and (/= 0 slot) (= 4 dsp)) (QP-initialize-DRAM-load))
    (dsp-set slot dsp)
    (if (zerop slot)
	(dsp-set-up (min (+ dsp-pc 1) (1- internal-p-size)) internal-p-memory)
      #+QP (qp-boot-dsp slot 
		   dsp 
		   (min (+ dsp-pc 1) (1- internal-p-size)) 
		   internal-p-memory 
		   monend 
		   (if (= 4 dsp) 0 memmap))
      )))

(defun dsp-close (&optional (slot 0) (dsp 0))
  (set-current-active-dsp (if (zerop slot) 0 (1+ (+ (* (1- (/ slot 2)) 5) dsp))))
  #+QP (qp-set-current-dsp slot dsp)		;make sure hostInterface is right
  (if (zerop slot)
      (if (/= 0 (dsp-is-open))
	  (dsp-1-close))		;otherwise this contains a free that might be garbage
    #+QP (qp-close)
    ))

(defun dsp-open (i arr &optional (slot 0) (dsp 0) (monend 0) (memmap #xe00000))
  #-QP (declare (ignore monend memmap))
  (if (and (zerop slot) 
	   (/= 0 (dsp-is-open)))
      (dsp-close))			;leave QP alone (i.e. don't bother "closing" individual chips)
  (dsp-set slot dsp)
  (if (zerop slot)
      (dsp-set-up (min i (1- internal-p-size)) arr)
    #+QP (qp-boot-dsp slot 
		 dsp 
		 (min i (1- internal-p-size)) 
		 arr 
		 monend 
		 (if (= 4 dsp) 0 memmap))
    ))




;;; 
;;; ----------------- 56000 RUN-TIME LIBRARY ----------------------------------------------------------------
;;;
;;; Overall organization is perhaps suboptimal -- the actual 56000 code is normally here, called
;;; when Run (run.lisp) emits a call on the code56.lisp macro that JSR's to a public entry point
;;; here, causing the associated run-time data loading procedure (in ins56.lisp) to be inserted
;;; into the "instrument".  There is also a display function (also in ins56.lisp) for every special
;;; data structure -- so for example, OSCIL in an instrument becomes <OSCIL> in RUN, causing
;;; the <OSCIL> macro (code56) to be expanded, emitting JSR .OSCIL (below) and inserting PASS-OSC
;;; (ins56) into the instrument, with DM-OSC available if we call display-dsp-state.  

;;; Memory is allocated at load (init!) time by Get-X-Memory, Get-Y-Memory, and Get-L-Mem, each taking an optional
;;; argument saying how big a block to allocate.  L and P pointers are then saved in X:0, Y:0.
;;; L 1..3 are used as debugger temporaries, and by other routines as temporary storage. (Run time
;;; memory allocation uses push and pop -- see code56.lisp).

;;; If a routine needs a safe location for a local variable, it should use:
;;; (emit (list 'DEFINE 'local-var-name (get-X-memory))) or some variant thereof in an init function.
;;; This will allocate a location in SRAM that is guaranteed to be left alone by
;;; the rest of the world, and gives it a name.  The name should either be unique,
;;; or should be UNDEFINEd when the routine is done with it (like HIDE with labels).

;;; If you want to add to the library, see addons.lisp.


;;; get-datum and put-datum shovel un-interpreted data to or from the HI ("host interface")
;;;   from DSP's viewpoint (put-one-word and get-one-word are the associated routines for Lisp (next56.c))

(defun get-datum (loc)
  (let ((lab (new-label)))
    (emit (list lab))
    (emit `(JCLR M-HRDF X-IO M-HSR ,lab))
    (if (listp loc)
	(emit `(MOVE X-IO M-HRX ,@loc))
      (emit `(LOAD ,loc X-IO M-HRX)))
    (emit `(HIDE ,lab))))

(defun put-datum (loc)
  (let ((lab (new-label)))
    (emit (list lab))
    (emit `(JCLR M-HTDE X-IO M-HSR ,lab))
    (if (listp loc)
	(emit `(MOVE ,@loc X-IO M-HTX))
      (emit `(STORE ,loc X-IO M-HTX)))
    (emit `(HIDE ,lab))))

(defun get-array (temp-R mem org size)
  ;; read in an array from the host, using TEMP-R to shovel it from the interface to x or y memory.
  ;; MEM is X or Y, ORG is where to start, SIZE is how many words to read and store.
  (let ((do-lab (new-label)))
    (if (< org 256)
	(emit `(LOAD ,temp-R ,org SHORT))
      (emit `(LOAD ,temp-R ,org)))
    (emit `(DO ,size ,do-lab))
    (get-datum (list mem temp-R 'R+1))
    (emit '(NOP))
    (emit `(,do-lab LOCAL))))

(defun put-array (temp-R mem org size)
  ;; same as above, but in other direction
  (let ((do-lab (new-label))
	(lab (new-label)))
    (if (< org 256)
	(emit `(LOAD ,temp-R ,org SHORT))
      (emit `(LOAD ,temp-R ,org)))
    (emit `(DO ,size ,do-lab))
    (put-datum (list mem temp-R 'R+1))
    (emit `(,do-lab))
    (emit `(HIDE ,do-lab ,lab))))

(defun emil (op)
  (if (and (zerop relocation-bit) 
	   (< dsp-pc 450) 
	   (not (eq (car (last op)) 'SHORT)))
      (emit (append op '(SHORT)))
    (emit op)))
	   

(defstruct libinfo loads uses needs)
(defun add-uses (l1 l2)
  (setf (libinfo-uses l1) (union (libinfo-uses l1) (libinfo-uses l2))))
(defun add-all (use need)
  (let ((uses use))
    (loop for i in need do (if (boundp i) (setf uses (union uses (libinfo-uses (eval i))))))
    uses))
(defmacro libdecl (name load use need)
  `(defvar ,name (make-libinfo :loads ,load :uses (add-all ,use ,need) :needs ,need)))

;;; To add something to the run-time library, add the call to run.lisp (if desired), a hook into code56.lisp (if desired), 
;;; then the actual code here, with libdecl to tie everything together.  
;;; Libdecl args are:
;;; NAME -- the entry point for a JSR.
;;; LOAD -- the routine here to call to load the code into the dsp program
;;; USES -- what registers the routine expects to use freely - a list
;;; NEED -- what other entry points need to be loaded (only top level is needed) -- a list

(libdecl .clear-x-or-y-memory 'clear-x-or-y-memory-load '(A B R7) #+QP '(.QP-initialize-DRAM) #-QP nil)
(libdecl .R-clear-x-or-y-memory 'clear-x-or-y-memory-load '(A B R7) #+QP '(.QP-initialize-DRAM) #-QP nil)
(defun clear-x-or-y-memory-load ()	;A1=mem base, A0=mem size, B=0 if X, 1 if Y
  (emit '(.R-clear-x-or-y-memory))
  (emit '(CLR A))			;check for external delays, or nil delays
  (emit '(LOAD A1 X R4 R+1))		;delay and all-comb jump here to clear its delay line
  (emit '(JSET 3 Y R4 R done))		;maybe initial delay line values set explicitly by user
  #+QP (emit '(JSET 2 Y R4 R DRAM-mem-clear))
  (emit '(TST A))			;base=0 means this can't be a real delay line (nil or whatever)
  (emil '(JEQ done))
  (emit '(JSET 1 Y R4 R done))		;external delay line
  (emit '(LOAD B Y R4 R+1))		;0=x, 1=y, 2=external, 4=QP DRAM, 8=don't clear
  (emit '(LOAD A0 X R4 R))		;dly-size
  (emit '(TST B)     '(COPY A1 R7))
  (emil '(JEQ clear-x-memory))
  (emit '(CLR B))
  (emit '(REP A0))
  (emit '(STORE B Y R7 R+1))
  (emit '(RTS))
  (emit '(clear-x-memory LOCAL))
  (emit '(REP A0))
  (emit '(STORE B X R7 R+1))
  (emit '(done LOCAL))
  (emit '(RTS))
  #+QP (progn
	 (emit '(DRAM-mem-clear LOCAL))
	 (emit '(JSSET DMC-reset Y-IO DMC-MCR .QP-initialize-DRAM))
	 (emit '(UPDATE R4 R+1))
	 ;; here the main complication is that we can have 24 bit addresses and sizes, so we can't use DO or Rn
	 (QP-refresh-off)
	 (emit '(CLR B)    '(LOAD X1 0))
	 (emit '(LOAD B X R4 R))
	 (emit '(LOAD X0 1))
	 (emit '(loop-start))
	 (emit '(TST B))
	 (emil '(JEQ QP-done))
	 (QP-write-DRAM 'A1 'X1)
	 (emit '(SUB X0 B))
	 (emit '(ADD X0 A))
	 (emil '(JMP loop-start))
	 (emit '(QP-done LOCAL))
	 (emit '(HIDE loop-start))
	 (QP-refresh-on)
	 (emit '(RTS)))
  )
  
(libdecl .r-array-clear-x-or-y-memory 'r-array-clear-x-or-y-memory-load '(A B X R7 R4) '(.r-clear-x-or-y-memory))
(defun r-array-clear-x-or-y-memory-load ()
  (emit '(.r-array-clear-x-or-y-memory)) ;for arrays of delay lines
  ;; R5=>array, should end up => 1st element delay portion, N5=element size, B1=array size
  (emit '(CLR A) '(LOAD N5 Y R5 R+1))	;array element size
  (emit '(LOAD X0 Y R5 R-1))		;address of array top
  (emit '(LOAD A X R5 R))		;A=base*2
  (emit '(ASR A))
  (emit '(COPY A1 R5))			;R5=>current element of array
  (emit '(NOP))
  (emit '(arr-loop))			;do loop by hand (increment (element size) may be /= 1)
  (emit '(COPY R5 R4))
  (emit '(JSR .r-clear-x-or-y-memory))
  (emit '(CLR A)       '(UPDATE R5 R+N))
  (emit '(COPY R5 A))
  (emit '(CMP X0 A))			;S2-S1 (i.e. cur-pos - top-pos)
  (emit '(JLE arr-loop))
  (emit '(HIDE arr-loop))
  (emit '(RTS)))

(libdecl .clear-L-memory 'clear-L-memory-load '(A B R7) nil)
(defun clear-L-memory-load ()
  (emit '(.clear-L-memory))
  (emit '(TST A))
  (emil '(JEQ done)) 
  (emit '(CLR B)     '(COPY A1 R7))
  (emit '(REP A0))
  (emit '(STORE B L R7 R+1))
  (emit '(done LOCAL))
  (emit '(RTS)))


(defun load-chip-data ()

  ;; once the initial boot program has been loaded, CLM instruments jump into this code to
  ;;  load the rest of the program (into external P memory) and the data (X/Y/external).
  ;;  There are two possibilities in existing hardware -- P mem overlays X mem externally,
  ;;  or P mem is a separate block.  In the first case, there's a notion of an external
  ;;  memory block split into P+X and Y where Y can be accessed in the upper half of the
  ;;  block.  In the second case, the external memory is shadowed by the internal, so
  ;;  from the software point of view, there's no explicit external memory.  In the first
  ;;  case we load internal X, internal Y, external X, external Y.  In the second case, we 
  ;;  load internal X (which automatically spills out into the extension memory), internal Y 
  ;;  (ditto) and external P.  We split out the external Y in the first case to save loading
  ;;  explicitly a million zeros.

  ;; load internal X and Y, then (if needed) external X and Y, then (if available and needed) external P.
  (emit '(CLR B))
  (      get-datum 'B1)			;number of internal X words to load
  (emit '(TST B))			; 0?
  (emit '(JLE internal-Y))
  (      get-datum 'R3)			;address of first to-be-loaded word
  (emit '(DO B1 internal-X-load))
  (emit     '(ix))
  (emit     '(JCLR M-HRDF X-IO M-HSR ix))
  (emit     '(MOVE X-IO M-HRX X R3 R+1))
  (emit     '(NOP))
  (emit     '(internal-X-load LOCAL))
  (emit '(internal-Y LOCAL))
  (emit '(NOP))
  (      get-datum 'B1)			;ditto internal Y
  (emit '(TST B))			; 0?
  (emit '(JLE external-X))
  (      get-datum 'R3)			;address of first to-be-loaded word
  (emit '(DO B1 internal-Y-load))
  (emit     '(iy))
  (emit     '(JCLR M-HRDF X-IO M-HSR iy))
  (emit     '(MOVE X-IO M-HRX Y R3 R+1))
  (emit     '(NOP))
  (emit     '(internal-Y-load LOCAL))
  (emit '(external-X LOCAL))
  (emit '(NOP))
  (      get-datum 'B1)			;ditto external X
  (emit '(TST B))			; 0?
  (emit '(JLE external-Y))
  (      get-datum 'R3)			;address of first to-be-loaded word
  (emit '(DO B1 external-X-load))
  (emit     '(ex))
  (emit     '(JCLR M-HRDF X-IO M-HSR ex))
  (emit     '(MOVE X-IO M-HRX X R3 R+1))
  (emit     '(NOP))
  (emit     '(external-X-load LOCAL))
  (emit '(external-Y LOCAL))
  (emit '(NOP))
  (      get-datum 'B1)			;ditto external Y
  (emit '(TST B))			; 0?
  (emit '(JLE external-P))
  (      get-datum 'R3)			;address of first to-be-loaded word
  (emit '(DO B1 external-Y-load))
  (emit     '(ey))
  (emit     '(JCLR M-HRDF X-IO M-HSR ey))
  (emit     '(MOVE X-IO M-HRX Y R3 R+1))
  (emit     '(NOP))
  (emit     '(external-Y-load LOCAL))
  (emit '(external-P LOCAL))
  (emit '(NOP))
  (      get-datum 'B1)			;ditto external P
  (emit '(TST B))			; 0?
  (emit '(JLE all-done))
  (      get-datum 'R3)			;address of first to-be-loaded word
  (emit '(DO B1 external-P-load))
  (emit     '(ep))
  (emit     '(JCLR M-HRDF X-IO M-HSR ep))
  (emit     '(MOVE X-IO M-HRX P R3 R+1))
  (emit     '(NOP))
  (emit     '(external-P-load LOCAL))
  (emit '(all-done LOCAL))
  (emit '(NOP))
  (emit '(HIDE ix iy ex ey ep)))



(libdecl .out-n 'out-n-load '(A B X R0 R5 R7) nil)
(defun out-n-load ()			;assume R4 has home, X0 has input
  (emit '(.out-n))			;X:home=start, Y:home=end, X:(home+1)=current ptr, Y:(home+1)=0 if X side, 1 if Y
  (emit '(CLR A)    '(LOAD X1 Y R4 R))	;X1=end
  (emit '(CLR B)    '(LOAD A X R4 R+1))	;A=start -- 0 if no buffer present
  (emit '(TST A)    '(LOAD B X R4 R))	;B=cur ptr in buffer
  (emit '(JEQ out-done))		;out on closed chan is a nop
  (emit '(CMP X1 B))			;are we full? (B-X1 = cur-end)

  ;; here we have full output buffers (normally 3) -- we have to signal the 68000 via HF2,
  ;; then wait for all the data to be transferred, then reset the cur-ptr to start, and store X0.

  ;; One problem is that READINs and OUTns can be arbitrarily interleaved, so we're assuming
  ;; that the output buffers will fill up together on a pass, then on a subsequent pass we'll
  ;; flush them all (i.e. if the first buffer to fill triggered the flush, we'd be trying to
  ;; flush the second channel before it was ready for that, and it might actually want input).
  ;; An added source of woe is a timing race in the reading of the host flags -- see next56.c.

  ;; but we can't just hope there's an INA or READIN at the start of a pass -- we might be
  ;; using external delays, but no input, and that causes this system to hang.  So we need
  ;; an end-of-pass or start-of-pass check for full output buffers.

  (emil '(JLT store-it))
  (emil '(JGT gotta-flush))
  (emit '(BSET 0 X <output+1>))
  (emil '(JMP store-it))
  (emit '(gotta-flush LOCAL))

  (emit '(STORE X0 X temp-loc))
  (emit '(STORE R4 Y temp-loc))
  (emit '(JSR .flush-buffers))
  (emit '(LOAD R4 Y temp-loc))		;R4 should be home+1 still (i.e. leave temp-loc alone in .flush-buffers)
  (emit '(LOAD X0 X temp-loc))
  (emit '(store-it LOCAL))
  (emit '(LOAD R5 X R4 R))		;current pointer into buffer (may have changed in .flush-buffers)
  (emit '(JSET 0 Y R4 R y-mem))		;assume loader put 1 in Y:(home+1) if Y side delay line
  (emit '(STORE X0 X R5 R+1))		;store current input, increment buffer pointer
  (emil '(JMP ok))
  (emit '(y-mem LOCAL))
  (emit '(STORE X0 Y R5 R+1))		;same, but on Y side 
  (emit '(ok LOCAL))
  (emit '(STORE R5 X R4 R))		;update curp
  (emit '(BSET 1 Y R4 R))		;warn subsequent calls not to update the pointer
  (emit '(out-done LOCAL))
  (emit '(RTS)))

(libdecl .out-n-again 'out-n-again-load '(A R5) '(.out-n))
(defun out-n-again-load ()		;here R4 has home+1 (needed for X/Y mem choice), X0 has new input
					;  we assume current pointer has been incremented already
  (emit '(.out-n-again))
  (emit '(JCLR 1 Y R4 R not-again))
  (emit '(LOAD R5 X R4 R))		;current pointer (actually ahead 1)
  (emit '(JSET 0 Y R4 R y-mem))		;X/Y mem table
  (emit '(LOAD A X R5 1-R))		;get current sample
  (emit '(ADD X0 A))			;merge in current input
  (emit '(STORE A X R5 R))
  (emit '(RTS))
  (emit '(y-mem LOCAL))			;same on Y side
  (emit '(LOAD A Y R5 1-R))
  (emit '(ADD X0 A))
  (emit '(STORE A Y R5 R))
  (emit '(RTS))
  (emit '(not-again LOCAL))
  (emit '(UPDATE R4 R-1))
  (emit '(JMP .out-n)))

(libdecl .outsig 'outsig-load nil nil)
(defun outsig-load ()
  (emit '(.outsig))			;assume X0=datum, A=pass, X1=fil-chn
  (emit '(BSET M-HF2 X-IO M-HCR))
  (emit '(BSET M-HF3 X-IO M-HCR))
  (put-datum %external-output)
  (put-datum 'X0)
  (put-datum 'X1)
  (put-datum 'A0)
  (put-datum 'A1)
  (emit '(BCLR M-HF3 X-IO M-HCR))
  (emit '(BCLR M-HF2 X-IO M-HCR))
  (emit '(RTS)))

(libdecl .overflow-warning 'overflow-warning-load nil nil)
(defun overflow-warning-load ()
  (emit '(.overflow-warning))
  (emit '(BSET M-HF2 X-IO M-HCR))
  (emit '(BSET M-HF3 X-IO M-HCR))
  (put-datum %external-overflow-warning)
  (put-datum 'B1)
  (put-datum 'A0)
  (put-datum 'A1)
  (emit '(BCLR M-HF3 X-IO M-HCR))
  (emit '(BCLR M-HF2 X-IO M-HCR))
  (emit '(RTS)))

(libdecl .array-index-warning 'array-index-warning-load nil nil)
(defun array-index-warning-load ()
  (emit '(.array-index-warning))
  (emit '(BSET M-HF2 X-IO M-HCR))
  (emit '(BSET M-HF3 X-IO M-HCR))
  (put-datum %external-array-index-warning)
  (put-datum 'A1)
  (put-datum 'B1)
  (put-datum 'A0)
  (put-datum 'B0)
  (emit '(BCLR M-HF3 X-IO M-HCR))
  (emit '(BCLR M-HF2 X-IO M-HCR))
  (emit '(RTS)))

(libdecl .UG-nil-warning 'UG-nil-warning-load nil nil)
(defun UG-nil-warning-load ()
  (emit '(.UG-nil-warning))
  (emit '(BSET M-HF2 X-IO M-HCR))
  (emit '(BSET M-HF3 X-IO M-HCR))
  (put-datum %external-nil-UG-warning)
  (put-datum 'A1)
  (emit '(RTS)))

(libdecl .locsig-n 'locsig-n-load nil nil)
(defun locsig-n-load ()
  (emit '(.locsig-n))			;assume X0=datum, A=pass, X1=id
  (emit '(BSET M-HF2 X-IO M-HCR))
  (emit '(BSET M-HF3 X-IO M-HCR))
  (put-datum %external-locsig)
  (put-datum 'X0)
  (put-datum 'X1)
  (put-datum 'A0)
  (put-datum 'A1)
  (emit '(BCLR M-HF3 X-IO M-HCR))
  (emit '(BCLR M-HF2 X-IO M-HCR))
  (emit '(RTS)))

(defun load-flush-buffers (input output)
  (emit '(.flush-with-no-fill))
  (emit '(CLR B))
  (emit `(STORE B Y ,input))
  (emit '(.flush-buffers))

  (emit '(BCLR 0 X <output+1>))

  (emit '(BSET M-HF2 X-IO M-HCR))	;tell 68000 we're ready to transmit data
  (emit '(CLR B)   `(LOAD R5 X ,output)) ;R5=>address of first out-n structure on the list
  (emit '(COPY B R0))
  (emit '(size-wait))			;wait for 68000 to say how much data it wants (last buffer confusion)
  (emit '(JCLR M-HRDF X-IO M-HSR size-wait))
  (emit '(LOAD B1 X-IO M-HRX))
  (emit '(LOAD X1 #x10000))		;65536=16 bits (if bigger, can't be SRAM buffer size)
  (emit '(CMP X1 B)    '(LOAD X1 #x8000)) ;for 8 bit right shift (from 24 DSP to 16 SND)
  #+QP (emit '(JGE QP-DRAM-merge))

  ;; in the normal case, B1 is the size of each of the IO buffers.  If it is >#x10000 here,
  ;;   we are dealing with Ariel's QP board which caches as much of an on-going computation 
  ;;   as possible in its DRAM (this saves lots of host-interface IO and merging in next56.c).
  ;;   In the latter case, rather than MOVE data Host-Interface, we write the data to the
  ;;   QP master or do the merge to DRAM -- all handled by qplib56.lisp.

  ;; sigadrs (passed to c-read-dsp-block in grab-dsp-data) has the addresses of the file
  ;;   buffers -- this way we can handle any order of OUTA, OUTB, and REVIN in the instrument.
  ;;   *sigarr* has the file id and channel number and this is mapped at initialization time
  ;;   to the current address of the corresponding buffer.  For the QP cache, we map from the
  ;;   file buffer address to the corresponding address in DRAM, keeping track of the current
  ;;   DRAM frame and so on.

  ;; It is safe to use DO and Rn as address here because we are dealing with the local SRAM
  ;;   buffer, not the QP DRAM, so the sizes and addresses will fit in 16 bits.

  (emit `(DO Y ,output all-bufs))	;Y <output> = number of output streams open
  (emit     '(CLR A)  '(LOAD R4 X R5 R+1))	;R4->out-n structure base, R5->next out-n struct
  (emit     '(NOP))			;let pipeline settle on R4
  (emit     '(LOAD A X R4 R))		;check for inactive streams
  (emit     '(TST A)  '(LOAD R7 X R4 R+1))	;R7->first word of this buffer, R4->2nd word of out-n structure
  (emil     '(JEQ no-data))		;can't jump to LA (DO restriction)

  (emit     '(LOAD X0 X R4 R))		;now check for ridiculous case where it's a real channel and we have an output
					;  structure open for it, but neglected to actually send any output!
  (emit     '(CMP X0 A))
  (emit     '(JEQ no-data))

  ;; now check for the case where the instrument is calling outb (or whatever), but it's a 1-channel output file
  (emit     '(CLR A)  '(COPY R0 X0))
  (emit     '(LOAD A Y <output+1>))
  (emit     '(CMP X0 A))		;have we finished?
  (emit     '(JLE no-useful-data))

  (emit     '(JSET 0 Y R4 R y-mem))	;is buffer in X or Y memory?
  ;; save one instruction in the (otherwise idle-time) IO loop (31-Jan-92)
  (emit     '(LOAD X0 X R7 R+1))	;(31-Jan-92)
  (emit     '(DO B1 end-x-buf))		
  (emit         '(x-buf-wait))
  (emit         '(JCLR M-HTDE X-IO M-HSR x-buf-wait))
  (emit         '(MPYR X0 X1 A) '(LOAD X0 X R7 R+1)) ;(31-Jan-92) ;R added 18-May-94
  (emit         '(STORE A X-IO M-HTX))
  (emit         '(end-x-buf LOCAL))
  (emit     '(NOP))
  (emil     '(JMP ok))
  (emit     '(y-mem LOCAL))
  (emit     '(LOAD X0 Y R7 R+1))	;(31-Jan-92)
  (emit     '(DO B1 end-y-buf))
  (emit         '(y-buf-wait))
  (emit         '(JCLR M-HTDE X-IO M-HSR y-buf-wait))
  (emit         '(MPYR X0 X1 A) '(LOAD X0 Y R7 R+1)) ;(31-Jan-92) (18-May-94)
  (emit         '(STORE A X-IO M-HTX))
  (emit         '(end-y-buf LOCAL))
  (emit     '(NOP))
  (emit     '(ok LOCAL))
  (emit     '(UPDATE R0 R+1))
  (emit     '(no-useful-data LOCAL))
  (emit     '(CLR A)    '(UPDATE R4 R-1))
  (emit     '(LOAD A X R4 R+1))		;get buf start (to reset curptr)
  (emit     '(no-data LOCAL))
  (emit     '(STORE A X R4 R))		;now buffer is ready to go on (writes 0 to bufstart in dummy case of no data)
  (emit '(all-bufs LOCAL))
  
  ;; finally, check for the case where, for example, it's a stereo file, but we only wrote outa, or even 
  ;; sillier, it's stereo output and we have outb, but for some reason never actually called it!
  (emit '(check-for-left-over-output-buffers))
  (emit '(CLR A)  '(COPY R0 X0))
  (emit '(LOAD A Y <output+1>))
  (emit '(CMP X0 A))			;have we finished?
  (emit '(JLE no-more-data))
  (emit '(CLR A)  '(UPDATE R0 R+1))
  (emit '(DO B1 end-no-buf))
  (emit     '(no-buf-wait))
  (emit     '(JCLR M-HTDE X-IO M-HSR no-buf-wait))
  (emit     '(STORE A X-IO M-HTX))
  (emit     '(NOP))
  (emit     '(end-no-buf LOCAL))
  (emit '(NOP))
  (emit '(JMP check-for-left-over-output-buffers))
  (emit '(no-more-data LOCAL))

  (emit '(HIDE size-wait x-buf-wait y-buf-wait no-buf-wait check-for-left-over-output-buffers))
  (emit '(input-check))			;QP-DRAM-merge jumps here when finished (to clear HF2 and so on)
  (emit '(BCLR M-HF2 X-IO M-HCR))
  (emit `(LOAD A Y ,input))
  (emit '(TST A))
  (emil '(JEQ no-input))
  (emit '(JSR .fill-buffers))
  (emit '(no-input LOCAL))
  (emit '(NOP))
  (emit '(RTS))
  #+QP (progn
	 (emit '(QP-DRAM-merge LOCAL))
	 ;; here we decide whether we are a slave or master, then jump to the appropriate merger
	 (emit '(JSR .QP-DRAM-merge))
	 ;; when done merging output, check for input (caller has to be smart about this!! -- if there is input,
	 ;; and we are merging to DRAM, the caller has to wait for the merge to finish before expecting to
	 ;; run the SRAM loading process).
	 (emil '(JMP input-check))
	 (emit '(HIDE input-check)))
  (emit '(.force-flush-buffers))
  (emit `(DEFINE flush-x ,(get-L-mem)))	;these allocations at (dsp) compile time should be safe since they happen before load-library
  (emit `(DEFINE flush-B ,(get-L-mem)))
  (emit `(DEFINE flush-A ,(get-L-mem)))
  (emit `(DEFINE flush-RA ,(get-L-mem)))
  (emit `(DEFINE flush-RB ,(get-L-mem)))
  (emit '(STORE A L flush-A))
  (emit '(STORE B L flush-B))
  (emit '(STORE X L flush-X))
  (emit '(STORE R5 X flush-RA))
  (emit '(STORE R7 Y flush-RA))
  (emit '(STORE R4 X flush-RB))
  (emit '(JSR .flush-buffers))
  (emit '(LOAD A L flush-A))
  (emit '(LOAD B L flush-B))
  (emit '(LOAD X L flush-X))
  (emit '(LOAD R5 X flush-RA))
  (emit '(LOAD R7 Y flush-RA))
  (emit '(LOAD R4 X flush-RB))
  (emit '(UNDEFINE flush-A flush-B flush-RA flush-RB flush-X))
  (emit '(RTI)))



(libdecl .in-n 'in-n-load '(A B X R5 R7) nil)
(defun in-n-load ()			;assume R4 has home, X0 gets input
  (emit '(.in-n))			;X:home=start, Y:home=end, X:(home+1)=current ptr, Y:(home+1)=0 if X side, 1 if Y
					;X:(home+2) is used if there's no buffer space (currently unimplemented)
					;everything is 0 if pass-xy-IO passed a null file in
  (emit '(CLR B)    '(LOAD X1 Y R4 R+1));X1=end
  (emit '(LOAD B X R4 R))		;B=cur
  (emit '(CMP X1 B))			;are we at end (B-X1 = cur-end)
  (emit '(JLE get-it))
  (emit '(STORE R4 Y temp-loc))
  (emit '(JSR .flush-buffers))
  (emit '(LOAD R4 Y temp-loc))		;R4 should be home+1 still (i.e. leave temp-loc alone in .flush-buffers)
  (emit '(get-it LOCAL))
  (emit '(LOAD R5 X R4 R))		;current pointer into buffer (may have changed in .flush-buffers)
  (emit '(JSET 0 Y R4 R y-mem))		;assume loader put 1 in Y:(home+1) if Y side delay line
  (emit '(LOAD X0 X R5 R+1))
  (emil '(JMP ok))
  (emit '(y-mem LOCAL))
  (emit '(LOAD X0 Y R5 R+1))		;same, but on Y side 
  (emit '(NOP))				;let R5 settle
  (emit '(ok LOCAL))
  (emit '(STORE R5 X R4 R))		;update curp
  (emit '(RTS)))


(defun load-fill-buffers (input)
  (emit '(.fill-buffers))
  (emit '(CLR B)   `(LOAD R5 X ,input)) ;R5=>address of first in-n structure on the list
  (emit '(size-wait))			;wait for 68000 to say how much data it wants (last buffer confusion)
  (emit '(JCLR M-HRDF X-IO M-HSR size-wait))
  (emit '(LOAD B1 X-IO M-HRX))
  (emit '(LOAD X1 #x80))		;16 SND to 24 DSP (in low reg)
  (emit `(DO Y ,input all-bufs))	;Y <input> = number of input streams open
  (emit     '(CLR A) '(LOAD R4 X R5 R+1))	;R4->in-n structure base, R5->next in-n struct
  (emit     '(NOP))
  (emit     '(LOAD A X R4 R))
  (emit     '(TST A) '(LOAD R7 X R4 R+1))	;R7->first word of this buffer, R4->2nd word of in-n structure
  (emil     '(JEQ no-data))
  (emit     '(JSET 0 Y R4 R y-mem))	;is buffer in X or Y memory?
  (emit     '(DO B1 x-end-buf))		
  (emit         '(x-buf-wait))
  (emit         '(JCLR M-HRDF X-IO M-HSR x-buf-wait))
  (emit         '(LOAD X0 X-IO M-HRX))
  (emit         '(MPY X0 X1 A))
  (emit         '(STORE A0 X R7 R+1))
  (emit         '(x-end-buf LOCAL))
  (emit     '(NOP))
  (emil     '(JMP ok))
  (emit     '(y-mem LOCAL))
  (emit     '(DO B1 y-end-buf))		
  (emit         '(y-buf-wait))
  (emit         '(JCLR M-HRDF X-IO M-HSR y-buf-wait))
  (emit         '(LOAD X0 X-IO M-HRX))
  (emit         '(MPY X0 X1 A))
  (emit         '(STORE A0 Y R7 R+1))
  (emit         '(y-end-buf LOCAL))
  (emit     '(NOP))
  (emit     '(ok LOCAL))
  (emit     '(CLR A)    '(UPDATE R4 R-1))
  (emit     '(LOAD A X R4 R+1))		;get buf start (to reset curptr)
  (emit     '(no-data LOCAL))
  (emit     '(STORE A X R4 R))		;now buffer is ready to go on
  (emit '(all-bufs LOCAL))
  (emit '(NOP))				;this NOP is vital!
  (emit '(HIDE size-wait x-buf-wait y-buf-wait))
  (emit '(RTS)))


;;; basic (fractional) square root from Motorola
;;; Y1 = 24 bit input fraction, A1=>24 bit sqrt result

(libdecl .frac-sqrt 'frac-sqrt-load '(A B X Y) nil)
(defun frac-sqrt-load ()
  (emit '(.frac-sqrt))
  (emit '(COPY Y1 A))
  (emit '(TST A))
  (emil '(JLE done))
  (emit '(CLR B)      '(LOAD X0 #x40 SHORT))
  (emit '(COPY X0 X1))
  (emit '(DO 23 endl))
  (emit     '(SMPY X0 X0 A))
  (emit     '(ADD Y1 A))
  (emit     '(TGE X0 B))
  (emit     '(TFR X1 A))
  (emit     '(ASR A))
  (emit     '(ADD B A)    '(COPY A X1))
  (emit     '(COPY A X0))
  (emit     '(endl LOCAL))
  (emit '(COPY B1 A))
  (emit '(done LOCAL))
  (emit '(RTS)))

;;; integer sqrt
;;; Y1 = 24 bit input integer, A=>24 bit real sqrt result (i.e. it looks like a real, but has only 12 bits fractional accuracy)

(libdecl .int-sqrt 'int-sqrt-load '(A B X Y) nil)
(defun int-sqrt-load ()
  (emit '(.int-sqrt))
  (emit '(COPY Y1 A))
  (emit '(TST A))
  (emil '(JLE done))
  (emit '(.inner-int-sqrt))
  (emit '(CLR B)      '(LOAD X0 #x40 SHORT))
  (emit '(COPY X0 X1))
  (emit '(DO 23 endl))
  (emit     '(SMPY X0 X0 A))
  (emit     '(ASR A))			;make it integer mpy
  (emit     '(ADD Y1 A))
  (emit     '(TGE X0 B))
  (emit     '(TFR X1 A))
  (emit     '(ASR A))
  (emit     '(ADD B A)    '(COPY A X1))
  (emit     '(COPY A X0))
  (emit     '(endl LOCAL))
  (emit '(COPY B1 X0))
  (emit '(LOAD X1 #x800))		;ash -12
  (emit '(MPY X1 X0 A))
  (emit '(done LOCAL))
  (emit '(RTS)))

;;; real sqrt
;;; A = input real, A=output real sqrt.  

(libdecl .real-sqrt 'real-sqrt-load  '(A B X Y R5) '(.int-sqrt))
(defun real-sqrt-load ()
  (emit '(.real-sqrt))
  (emit '(TST A)       '(LOAD B #x10000))
  (emil '(JLE done))
  (emit '(CMP B A)     '(LOAD R5 0 SHORT))
  (emil '(JGE int-time))
  (emit '(shift-up))			;shift "real" left until it looks like a big integer
  (emit '(REP 4))
  (emit '(ASL A))
  (emit '(CMP B A)     '(UPDATE R5 R+1))
  (emil '(JLT shift-up))
  (emit '(int-time LOCAL))
  (emit '(COPY A1 Y1))
  (emit '(JSR .inner-int-sqrt))		;then take the square root of that integer
  (emit '(COPY R5 B))
  (emit '(TST B))
  (emil '(JEQ done))
  (emit '(LSL B))			;and now shift it back by half the previous up-shift
  (emit '(REP B))
  (emit '(ASR A))
  (emit '(done LOCAL))
  (emit '(HIDE shift-up))
  (emit '(RTS)))


(libdecl .ash 'ash-load '(A B) nil)
(defun ash-load ()
  (emit '(.ash))		;A=int, B=shift
  (emit '(TST B))
  (emil '(JEQ done))
  (emil '(JMI minus))
  (emit '(REP B))
  (emit '(ASL A))
  (emit '(RTS))
  (emit '(minus LOCAL))
  (emit '(ABS B))
  (emit '(REP B))
  (emit '(ASR A))
  (emit '(done LOCAL))
  (emit '(RTS)))


;;; Random numbers (from JOS AP vrand macro from Knuth)

(defvar ran-seed 431)			;Y random-seed set to this in ins56.lisp at init time

(libdecl .random 'random-load '(A X) nil)
(defun random-load ()
  (emit '(.random))
  (emit (list 'DEFINE 'random-seed (get-Y-memory)))
  (emit '(LOAD X0 Y random-seed))   
  (emit '(LOAD A 2 SHORT))
  (emit '(LOAD X1 5609937))
  (emit '(MAC X1 X0 A))
  (emit '(ASR A))
  (emit '(STORE A0 Y random-seed))
  (emit '(RTS)))

;;; The Fm-violin appeared to depend on the peculiar noise we used on the Samson box.
;;; It used L0+L1*M0, 20 bits, overflow ignored, L1 reset to new value upon trigger.  In SAIL, this
;;; is:
;;;    val:=(L0+((L1*M0) LAND '3777777) LAND '3777777)
;;; where
;;;    L0 is intialized to '660623
;;;    L1 to '2336545 (actually (round ('3777777*RAN(Seed)) LSH 16) ASH -16) where RAN=SAIL's (i.e. LIB40's) RAN
;;;    M0 to '1254635

(libdecl .sambox-random-init 'sambox-random-init nil nil)
(defun sambox-random-init ()
  (emit '(.sambox-random-init))
  (emit (list 'DEFINE 'sambox-seed (get-Y-memory)))
  (emit '(Y-ORG sambox-seed))
  (emit `(Y-DATA #o2336545)))

(libdecl .sambox-random 'Sambox-random-load '(A B X) '(.sambox-random-init))
(defun Sambox-random-load ()
  (emit '(.sambox-random))
  (emit '(CLR A)          '(LOAD X0 Y sambox-seed))
  (emit '(CLR B)          `(LOAD A0 ,(ash #o660623 1)))
  (emit                   '(LOAD X1 #o1254635))
  (emit '(MAC X0 X1 A))
  (emit '(ASR A)          '(LOAD X0 #o3777777))
  (emit '(COPY A0 B1))
  (emit '(AND X0 B))
  (emit '(STORE B1 Y sambox-seed))
  (emit '(REP 4))			;attempts to avoid this by preshifting failed, perhaps because of sign interpretation
  (emit '(ASL B))
  (emit '(COPY B1 A0))
  (emit '(RTS)))


;;; Real multiply (from Motorola "Fractional and Integer Arithmetic")

(libdecl .real-mpy 'real-mpy-load '(A B X Y) nil)
(defun real-mpy-load ()			;A*B, both 48 bit "reals"
  (emit '(.real-mpy))
  (emit '(TST A))
  (emil '(JEQ done))			;X=0, so done
  (emil '(JPL x-pos))
  (emit '(ABS A)    '(STORE B L temp-loc))
  (emit '(LSL A))
  (emit '(ASR A))
  (emit '(COPY A0 B))
  (emit '(NEG B)    '(LOAD A0 0 SHORT))
  (emit '(NEG A)    '(COPY B1 X0))
  (emit '(COPY A1 X1))
  (emit '(LOAD B L temp-loc))
  (emil '(JMP set-y))
  (emit '(x-pos LOCAL))
  (emit '(LSL A)    '(COPY A1 X1))
  (emit '(ASR A))
  (emit '(COPY A0 X0))
  (emit '(set-y LOCAL))
  (emit '(TST B))
  (emil '(JPL y-pos))
  (emit '(ABS B))
  (emit '(LSL B))
  (emit '(ASR B))
  (emit '(COPY B0 A))
  (emit '(NEG A)     '(LOAD B0 0 SHORT))
  (emit '(NEG B)     '(COPY A1 Y0))
  (emit '(COPY B1 Y1))
  (emil '(JMP do-mpy))
  (emit '(y-pos LOCAL))
  (emit '(LSL B)     '(COPY B1 Y1))
  (emit '(ASR B))
  (emit '(COPY B0 Y0))
  (emit '(do-mpy LOCAL))
  (emit '(MPY X0 Y0 B))			;fractional parts
  (emit '(ASL B))			;remove sign bit
  (emit '(COPY B2 A))			;shift right 24 bits
  (emit '(COPY B1 A0))
  (emit '(MAC X1 Y0 A))			;cross terms
  (emit '(MAC X0 Y1 A))
  (emit '(COPY A0 B1))
  (emit '(COPY A1 A0))			;another 24 bit right shift
  (emit '(COPY A2 A1))
  (emit '(ASL A))			;it appears that the Motorola realmult procedure is buggy -- 
					;if there is any integer part from the cross-terms, it is
					;shifted right one too many bits because the ensuing
					;integer mpy below shifts for fractional mpy before adding!
  (emit '(MAC X1 Y1 A))			;integer part
  (emit '(ASR A))			;eliminate zero fill bit
  (emit '(COPY A0 A))			;I think this zeros A0 and A2 becomes sign extension
  (emit '(COPY B1 A0))
  (emit '(done LOCAL))
  (emit '(RTS)))

(libdecl .real-frac-mpy 'real-frac-mpy-load '(A B X Y) nil)
(defun real-frac-mpy-load ()		;A=real, X1=fraction
  (emit '(.real-frac-mpy))
  (emit '(TST A))
  (emil '(JEQ done))			;Y=0, so done
  (emil '(JPL y-pos))
  (emit '(ABS A))
  (emit '(LSL A))
  (emit '(ASR A))
  (emit '(COPY A0 B))
  (emit '(NEG B)     '(LOAD A0 0 SHORT))
  (emit '(NEG A)     '(COPY B1 Y0))
  (emit '(COPY A1 Y1))
  (emil '(JMP mpy-y))
  (emit '(y-pos LOCAL))
  (emit '(LSL A)     '(COPY A1 Y1))
  (emit '(ASR A))
  (emit '(COPY A0 Y0))
  (emit '(mpy-y LOCAL))			;now Y=signed integer and signed fraction
  (emit '(MPY X1 Y0 B))			;frac * frac
  (emit '(ASL B))			;remove sign bit
  (emit '(COPY B2 A))			;shift right 24 bits
  (emit '(COPY B1 A0))
  (emit '(MAC X1 Y1 A))			;cross term
  (emit '(done LOCAL))
  (emit '(RTS)))
	

(libdecl .real-int-mpy 'real-int-mpy-load '(A B X Y) '(.real-mpy))
(defun real-int-mpy-load ()		;real in A, integer in X1
  (emit '(.real-int-mpy))
  (emit '(TST A))
  (emil '(JEQ done))			;Y=0, so done
  (emil '(JPL y-pos))
  (emit '(ABS A))
  (emit '(LSL A))
  (emit '(ASR A))
  (emit '(COPY A0 B))
  (emit '(NEG B)     '(LOAD A0 0 SHORT))
  (emit '(NEG A)     '(COPY B1 Y0))
  (emit '(COPY A1 Y1))
  (emil '(JMP mpy-y))
  (emit '(y-pos LOCAL))
  (emit '(LSL A)     '(COPY A1 Y1))
  (emit '(ASR A))
  (emit '(COPY A0 Y0))
  (emit '(mpy-y LOCAL))			;now Y=signed integer and signed fraction
  (emit '(MPY X1 Y0 A))			;int * frac
  (emit '(COPY A0 B1))
  (emit '(COPY A1 A0))			;another 24 bit right shift
  (emit '(COPY A2 A1))
  (emit '(ASL A))			;see note in real-mpy
  (emit '(MAC X1 Y1 A))			;integer part
  (emit '(ASR A))			;eliminate zero fill bit
  (emit '(COPY A0 A))			;I think this zeros A0 and A2 becomes sign extension
  (emit '(COPY B1 A0))
  (emit '(done LOCAL))
  (emit '(RTS)))
 

;;; Long integer multiplies would be easy except that the low 24 bits are unsigned, but the
;;; MAC instruction and friends consider it to be signed.  So in each case, we have to first
;;; clear that bit, do the multiplies, adds and shifts, then fix up the case where bit 23
;;; happened to be 1.  (And of course we have to use memory because the BCLR #n,D instruction
;;; doesn't work on rev A and rev B 56000's).  Each of these returns a long-int result,
;;; truncating any fractional part (i.e. just shifting it into oblivion).

(libdecl .int-long-int-mpy 'int-long-int-mpy-load '(A B X Y) nil)
(defun int-long-int-mpy-load ()		;long-int in A, integer in X1
  (emit '(.int-long-int-mpy))
  (emit '(TST A)       '(STORE A L temp-loc))
  (emil '(JEQ done))
  (emit '(ABS A)       '(STORE X1 X temp-loc-1))
  (emit '(COPY X1 B))
  (emit '(TST B)       '(STORE A L temp-loc-2))
  (emil '(JEQ clear-A))
  (emit '(BCLR 23 Y temp-loc-2))	;this is the troublesome "sign" bit
  (emit '(ABS B)       '(COPY A Y1))   
  (emit                '(LOAD Y0 Y temp-loc-2)    '(COPY B X1))
  (emit '(MPY Y1 X1 A) '(STORE A L temp-loc-2))
  (emit '(COPY A0 A1))
  (emit '(LOAD A0 0))
  (emit '(MAC Y0 X1 A))
  (emit '(JCLR 23 Y temp-loc-2 no-sign-bit))
  (emit '(ADD B A))
  (emit '(no-sign-bit LOCAL))
  (emit '(ASR A))
  (emit '(JCLR 23 X temp-loc A-pos))
  (emit '(JSET 23 X temp-loc-1 done))
  (emit '(NEG A))
  (emit '(RTS))
  (emit '(A-pos LOCAL))
  (emit '(JCLR 23 X temp-loc-1 done))
  (emit '(NEG A))
  (emit '(done LOCAL))
  (emit '(RTS))
  (emit '(clear-A LOCAL))
  (emit '(CLR A))
  (emit '(RTS)))


(libdecl .frac-long-int-mpy 'frac-long-int-mpy-load '(A B X Y) nil)
(defun frac-long-int-mpy-load ()		;long-int in A, fraction in X1
  (emit '(.frac-long-int-mpy))
  (emit '(TST A)       '(STORE A L temp-loc))
  (emil '(JEQ done))
  (emit '(ABS A)       '(STORE X1 X temp-loc-1))
  (emit '(COPY X1 B))
  (emit '(TST B)       '(STORE A L temp-loc-2))
  (emil '(JEQ clear-A))
  (emit '(BCLR 23 Y temp-loc-2))
  (emit '(ABS B)       '(COPY A Y1))
  (emit                '(LOAD Y0 Y temp-loc-2)   '(COPY B X1))
  (emit '(MPY Y1 X1 A) '(STORE A L temp-loc-2))
  (emit '(COPY A1 A2))
  (emit '(COPY A0 A1))
  (emit '(LOAD A0 0))
  (emit '(MAC Y0 X1 A))
  (emit '(JCLR 23 Y temp-loc-2 no-sign-bit))
  (emit '(ADD B A))
  (emit '(no-sign-bit LOCAL))
  (emit '(COPY A1 A0))
  (emit '(COPY A2 A1))
  (emit '(LOAD A2 0))
  (emit '(JCLR 23 X temp-loc A-pos))
  (emit '(JSET 23 X temp-loc-1 done))
  (emit '(NEG A))
  (emit '(RTS))
  (emit '(A-pos LOCAL))
  (emit '(JCLR 23 X temp-loc-1 done))
  (emit '(NEG A))
  (emit '(done LOCAL))
  (emit '(RTS))
  (emit '(clear-A LOCAL))
  (emit '(CLR A))
  (emit '(RTS)))


(libdecl .real-long-int-mpy 'real-long-int-mpy-load '(A B X Y) nil)
(defun real-long-int-mpy-load ()	;long-int in A, real in B, returns long-int result (truncates)
  (emit '(.real-long-int-mpy))
  (emit '(TST A)    '(STORE A L temp-loc))
  (emit '(JEQ done))
  (emit '(ABS A)    '(STORE B L temp-loc-1))
  (emit '(TST B)    '(STORE A L temp-loc-2))
  (emit '(JEQ clear-A))
  (emit '(BCLR 23 Y temp-loc-2))
  (emit '(LOAD Y0 Y temp-loc-2))
  (emit '(STORE A L temp-loc-2))
  (emit '(ABS B)    '(COPY A Y1))
  (emit '(CLR A)    '(COPY B X1))
  (emit '(LSL B))			;make B0 a true fraction
  (emit '(ASR B))
  (emit '(MPY Y1 X1 B) '(COPY B0 X0))
  (emit '(COPY B0 A1))
  (emit '(MAC Y0 X1 A))
  (emit '(ASR A))			;shift integer parts, but not fractional parts (below)
  (emit '(MAC Y1 X0 A))
  (emit '(JCLR 23 Y temp-loc-2 no-sign-int))
  (emit '(ADD X A))
  (emit '(no-sign-int LOCAL))
  (emit '(COPY A1 A2))
  (emit '(COPY A0 A1))
  (emit '(LOAD A0 0))
  (emit '(MAC X0 Y0 A))
  (emit '(COPY A1 A0))
  (emit '(COPY A2 A1))
  (emit '(LOAD A2 0))
  (emit '(JCLR 23 X temp-loc A-pos))
  (emit '(JSET 23 X temp-loc-1 done))
  (emit '(NEG A))
  (emit '(RTS))
  (emit '(A-pos LOCAL))
  (emit '(JCLR 23 X temp-loc-1 done))
  (emit '(NEG A))
  (emit '(done LOCAL))
  (emit '(RTS))
  (emit '(clear-A LOCAL))
  (emit '(CLR A))
  (emit '(RTS)))


;;; added complication here is that both bit 23's may be on, so we have to add a special fix up for that case.
;;; (Motorola's version of this code doesn't work for negative arguments)

(libdecl .long-int-mpy 'long-int-mpy-load '(A B X Y) nil)
(defun long-int-mpy-load ()		;long-ints in A and B
  (emit '(.long-int-mpy))
  (emit '(TST A)        '(STORE A L temp-loc))
  (emit '(JEQ done))
  (emit '(ABS A)        '(STORE B L temp-loc-1))
  (emit '(TST B)        '(STORE A0 X temp-loc-2))
  (emit '(JEQ clear-A))
  (emit '(BCLR 23 X temp-loc-2))
  (emit '(ABS B)        '(COPY A X1))
  (emit                 '(LOAD X0 X temp-loc-2)     '(COPY B Y1))
  (emit '(STORE B0 Y temp-loc-2))
  (emit '(BCLR 23 Y temp-loc-2))
  (emit '(LOAD Y0 Y temp-loc-2))
  (emit '(MPY Y1 X0 A)  '(STORE A0 X temp-loc-2))
  (emit '(MAC Y0 X1 A)  '(STORE B0 Y temp-loc-2))
  (emit '(JCLR 23 X temp-loc-2 no-B))
  (emit '(ADD B A))
  (emit '(no-B LOCAL))
  (emit '(JCLR 23 Y temp-loc-2 no-A))
  (emit '(LOAD B L temp-loc))
  (emit '(ABS B)         '(LOAD X1 #x800))
  (emit '(ADD B A)       '(LOAD Y1 #x800))
  (emit '(JCLR 23 X temp-loc-2 no-A))
  (emit '(MAC X1 Y1 A))
  (emit '(no-A LOCAL))
  (emit '(COPY A0 A1))
  (emit '(LOAD A0 0))
  (emit '(MAC Y0 X0 A))
  (emit '(ASR A))
  (emit '(JCLR 23 X temp-loc A-pos))
  (emit '(JSET 23 X temp-loc-1 done))
  (emit '(NEG A))
  (emit '(RTS))
  (emit '(A-pos LOCAL))
  (emit '(JCLR 23 X temp-loc-1 done))
  (emit '(NEG A))
  (emit '(done LOCAL))
  (emit '(RTS))
  (emit '(clear-A LOCAL))
  (emit '(CLR A))
  (emit '(RTS)))


(libdecl .shift-AB-up 'shift-AB-up-load '(X0) nil)
(defun shift-AB-up-load ()
  (emit '(.shift-AB-up))
  (emit '(TST A))			;protect against 0 here
  (emil '(JEQ all-done))
  (emit '(TST B)    '(LOAD X0 #x100000))
  (emil '(JEQ all-done))
  (emit '(cmpm-AB))
  (emit '(CMPM X0 A))
  (emil '(JGE all-done))
  (emit '(CMPM X0 B))
  (emil '(JGE all-done))
  (emit '(ASL A))			;since this pulls in 0's on the right, it should never get caught
					;(i.e. ASR -1 = -1 -> infinite loop)
  (emit '(ASL B))
  (emil '(JMP cmpm-AB))
  (emit '(all-done LOCAL))
  (emit '(RTS))
  (emit '(HIDE cmpm-AB)))


(libdecl .basic-divide 'basic-divide-load '(A B X Y R5) nil)
(defun basic-divide-load ()
  (emit '(.basic-divide))		;A1 / B1 in 24 bit accuracy
  ;; to get more accuracy we need to write the divide algorithm explicitly (can't use DIV)
  (emit '(TST A)    '(COPY A1 X0))	;needed for EOR instruction
  (emil '(JEQ all-done))		;A=0 so no divide needed
  (emit '(TST B)    '(COPY B1 X1))	;save B (clobbered during sign check below)
  (emil '(JEQ all-done))		;B=0 (actually an error) so return numerator (as per SAIL)
  (emit '(EOR X0 B) '(LOAD Y0 fractional-one))
  (emil '(JPL pos-div))			;i.e. EOR didn't set sign bit, so result is positive
  (emit '(LOAD Y0 #xffffff))		;-1.0
  (emit '(pos-div LOCAL))		;have to ABS both anyway (might both be negative)
  (emit '(COPY X1 B))			;restore divisor
  (emit '(ABS A)    '(STORE Y0 X temp-loc))
  (emit '(ABS B)    '(STORE X0 X temp-loc-1))
  (emit '(LOAD R5 1 SHORT))		;R5 is shift counter (always 1 shift because real result, not signed fraction)
  (emit '(cmp-AB))			;DIV instruction wants divisor>dividend (fractional result)
  (emit '(CMP B A))
  (emil '(JEQ rtn-one))
  (emil '(JLT div-time))		;A < B, so we're ready to crank DIV 24 times
  (emit '(ASR A)   '(UPDATE R5 R+1))
  (emit '(ASL B)   '(UPDATE R5 R+1))	;trying to keep close to normalized data
  (emil '(JMP cmp-AB))

  ;; we could get more bits of accuracy here by shifting A left as long as it is less than B, then a fixup right
  ;;   followed, after the divide, by the reverse shifts at the end (remainder?)
  ;; in its current state, if A is very small, and B relatively large (as happens in
  ;;  .trig-sine when we calculate (mod phase two-pi)), we may get very few bits from the DIV loop
  ;;  a quick scan of the library leads me to believe that R6 is safe to use here, so...
  ;; (emit '(LOAD R6 0))
  ;; (emit '(shift-A-again))
  ;; (emit '(ASL A)  '(UPDATE R6 R+1))
  ;; (emit '(CMP A B))                  ;A and B both positive here
  ;; (emit '(JGT shift-a-again))
  ;; (emit '(HIDE shift-a-again))
  ;; (emit '(ASR A) '(UPDATE R6 R-1))
  ;; followed later by fixup via R6 (ASR A)

  (emit '(div-time LOCAL))
  (emit '(COPY B1 X0))			;DIV X0 A = 1 bit of A1 / X0
  (emit '(ANDI #xFE CCR))		;clear carry bit (used by DIV)
  (emit '(REP 24))
  (emit '(DIV X0 A))			;now (this is confused) A0=quotient, A1=remainder?
  (emit '(ADD X0 A) '(LOAD B 0 SHORT))	;fix up remainder
  (emil '(JMP shifts))
  (emit '(rtn-one LOCAL))		;we jumped here if B=A
  (emit '(CLR A)    '(LOAD B 0 SHORT))
  (emit '(LOAD A0 #x800000))		;fractional-one???	;will fix up later (A1=remainder at this point)
  (emit '(shifts LOCAL))		;now take account of shifting done to make B>A above
  (emit '(COPY A1 B0))
  (emit '(LOAD A1 0 SHORT))
  (emit '(REP R5))			;R5 is at least 1 (get rid of sign bit)
  (emit '(ASL A))
  (emit '(REP R5))
  (emit '(ASL B))
  ;; (emit '(no-shifts LOCAL))
  (emit '(JCLR 23 X temp-loc pos-result))
  (emit '(NEG A))
  (emit '(pos-result LOCAL))
  (emit '(JCLR 23 X temp-loc-1 pos-rem))
  (emit '(NEG B))
  (emit '(pos-rem LOCAL))
  (emit '(all-done LOCAL))
  (emit '(RTS))
  (emit '(HIDE cmp-AB)))


(libdecl .int-mod 'int-mod-load '(A B X Y R5) '(.basic-divide))
(defun int-mod-load ()
  (emit '(.int-mod))			;A mod B
  (emit '(STORE A X temp-loc-2))	;first two temp-locs used by basic-divide
  (emit '(STORE B Y temp-loc-2))
  (emit '(JSR .basic-divide))		;A1 / B1 -- real result in A
  (emit '(COPY A1 X0))			;floor
  (emit '(LOAD X1 Y temp-loc-2))
  (emit '(MPY X1 X0 B)    '(LOAD A X temp-loc-2))
  (emit '(ASR B))			;make it integer mpy
  (emit '(COPY B0 X1))
  (emit '(SUB X1 A))
  (emit '(RTS)))


(libdecl .real-mod-init 'real-mod-init nil nil)
(defun real-mod-init ()
  (emit `(.real-mod-init))
  (emit `(DEFINE somewhere ,(get-l-mem))))

(libdecl .real-mod 'real-mod-load '(A B X Y R5) '(.real-mod-init .basic-divide .real-int-mpy .shift-AB-up))
(defun real-mod-load ()
  (emit '(.real-mod))			;A mod B
  ;; try to avoid the divide -- if B>0 (common case) and |A|<B then if A>=0, return A else B+A
  (emit '(CMPM A B)  '(STORE A L temp-loc-2)) ;|B|-|A|
  ;; (emit '(STORE A L temp-loc-2))	;first two temp-locs used by basic-divide
  (emit '(JNE not-equal))
  (emit '(CLR A))
  (emit '(RTS))
  (emit '(not-equal LOCAL))
  (emit '(JMI hard-case))
  (emit '(TST B))
  (emit '(JMI b-neg))
  (emit '(TST A))
  (emit '(JMI add-a-b))			;B>0 but A<0 = add A and B (since B>|A|)
  (emit '(RTS))				;B>0, A>0 so return A
  (emit '(add-a-b))
  (emit '(ADD B A))
  (emit '(RTS))
  (emit '(b-neg LOCAL))
  (emit '(TST A))
  (emit '(JGT add-a-b))			;B<0 but A>0, and |B|>A, so return A+B
  (emit '(HIDE add-a-b))
  (emit '(RTS))				;B<0 and A<=0 and |B|>|A| => A
  (emit '(hard-case LOCAL))		;here we have to divide
  (emit '(STORE B L somewhere))
  (emit '(JSR .shift-AB-up))
  (emit '(JSR .basic-divide))		;A1 / B1 -- real result in A

  (emit '(CLR B))			;check for bits dangling off divide result causing horrific trouble after floor
  (emit '(LOAD B0 #xff))
  (emit '(ADD B A))

  (emit '(COPY A1 X1))
  (emit '(LOAD A L somewhere))
  (emit '(JSR .real-int-mpy))
  (emit '(TFR A B)  '(LOAD A L temp-loc-2))
  (emit '(SUB B A))
  (emit '(UNDEFINE somewhere))		;still allocated
  (emit '(RTS)))

(libdecl .int-rem 'int-rem-load '(A B X Y R5) '(.basic-divide .real-int-mpy))
(defun int-rem-load ()
  (emit '(.int-rem))
  (emit '(STORE A X temp-loc-2))	;first two temp-locs used by basic-divide
  (emit '(STORE B Y temp-loc-2))
  (emit '(JSR .basic-divide))		;A1 / B1 -- real result in A
  (emit '(COPY A1 X0))			;truncate
  (emit '(LOAD X1 Y temp-loc-2))
  (emit '(MPY X1 X0 B)   '(LOAD A X temp-loc-2))
  (emit '(ASR B))			;make it integer mpy
  (emit '(COPY B0 X1))
  (emit '(SUB X1 A))
  (emit '(RTS)))


(libdecl .real-rem-init 'real-rem-init nil nil)
(defun real-rem-init ()
  (emit `(.real-rem-init))
  (emit `(DEFINE rem-somewhere ,(get-l-mem))))

(libdecl .real-rem 'real-rem-load '(A B X Y R5) '(.real-rem-init .basic-divide .real-int-mpy .shift-AB-up))
(defun real-rem-load ()
  (emit '(.real-rem))			;A rem B
  (emit '(STORE A L temp-loc-2))	;first two temp-locs used by basic-divide
  (emit '(STORE B L rem-somewhere))
  (emit '(JSR .shift-AB-up))
  (emit '(JSR .basic-divide))		;A1 / B1 -- real result in A
  (emit '(LOAD X1 X rem-somewhere))		;see note above
  (emit '(LOAD A L rem-somewhere))
  (emit '(JSR .real-int-mpy))
  (emit '(TFR A B)  '(LOAD A L temp-loc-2))
  (emit '(SUB B A))
  (emit '(RTS)))

(libdecl .gcd 'gcd-load '(A B X Y R5) '(.int-rem))
(defun gcd-load ()
  (emit '(.gcd))			;GCD A B
  (emit '(CMP B A))
  (emil '(JGE no-swap))
  (emit '(TFR A B)  '(STORE B L temp-loc))
  (emit '(LOAD A L temp-loc))
  (emit '(no-swap))
  (emit '(JSR .int-rem))
  (emit '(TST A))			;rem = 0 -> found it
  (emil '(JEQ ok))
  (emit '(TFR B A)  '(STORE A L temp-loc))
  (emit '(LOAD B L temp-loc))
  (emil '(JMP no-swap))
  (emit '(ok LOCAL))
  (emit '(TFR B A))
  (emit '(HIDE no-swap))
  (emit '(RTS)))

(libdecl .lcm-init 'lcm-init nil nil)
(defun lcm-init ()
  (emit `(.lcm-init))
  (emit `(DEFINE lcm-loc ,(get-l-mem))))

(libdecl .lcm 'lcm-load '(A B X Y R5) '(.lcm-init .gcd .basic-divide))
(defun lcm-load ()
  (emit '(.lcm))
  (emit '(STORE A X lcm-loc))
  (emit '(STORE B Y lcm-loc))
  (emit '(JSR .gcd))			;both A and B have result
  (emit '(LOAD X L lcm-loc))
  (emit '(MPY X1 X0 A))
  (emit '(JSR .basic-divide))
  (emit '(RTS)))

;;; envelopes (linear or exponential)
;;;

(libdecl .env 'env-load '(A B X Y R4 R5 R3 R6) nil)
(defun env-load (&optional (load-expt nil))
  (emit '(.env))			;R7=>home (i.e. "current-value" normally), N7=1 we assume
  (emit '(LOAD R3 X R7 RN))		;X:home+1 = ptr into env data
  (emit '(LOAD A L <pass-counter>))	;kill a cycle to let the pipeline settle
  (emit '(try-again))
  (emit '(LOAD B L R3 R+1))		;get pass counter of end of the current segment, R3=>current rate
  (emit '(TST B)       '(STORE B L temp-loc-1))	;only negative if end of env or time to get next external segments
  (emit '(JPL env-update))		;JPL => N=0 => not negative (i.e. not the same as plusp)

  ;; here we look for a continued envelope (i.e. only partial data in DSP memory)
  ;; for the other half, see handle-external-envelope in next56.lisp
  ;; B1 = (- index) B0 = segs
  (emit '(JSET 23 Y temp-loc-1 stick-point)) ;-1 in B0 => end of envelope
  (emit '(BSET M-HF2 X-IO M-HCR))	; set HF2
  (emit '(BSET M-HF3 X-IO M-HCR))	; set HF3 -- next56 now can tell we are awaiting data
  (put-datum %external-envelope)	; see table in next56.lisp
  (put-datum 'B1)
  (put-datum 'B0)
  (get-datum 'B)			;tsegs again if still more to come, -1 otherwise
  (get-datum 'B0)			;tsegs coming back
  (get-datum 'R3)
  (emit '(BCLR M-HF3 X-IO M-HCR))	; clear HF3
  (emit '(BCLR M-HF2 X-IO M-HCR))	; clear HF2 and go on to reset the envelope pointers and what-not
					; also set end indication if B1 /= 0
  (emit '(STORE B L temp-loc-2))
  (emit '(STORE R3 X R7 RN))		;pointer now reset
  (emit '(ASL B))
  (emit '(DO B0 load-new-env-data))
  (        get-datum 'A)		;A not A1! to set A2 for L mem move below
  (        get-datum 'A0)
  (emit   '(STORE A L R3 R+1))
  (emit   '(load-new-env-data LOCAL))
  (emit '(LOAD B L temp-loc-1))		;old end marker (index in B1 is right in any case)
  (emit '(LOAD B0 X temp-loc-2))	;either -1 = end marker or tsegs as before
  (emit '(STORE B L R3 R))
  (emit '(JMP .env))			;start all over

  (emit '(stick-point LOCAL))
  (emit '(LOAD A L R7 R))		;current value -> stick at end
  (emit '(RTS))

  (emit '(env-update LOCAL))
  (emit '(CMP B A)      '(LOAD B L R3 R+1))
  (emil '(JLT ptr-ok))			;if B>A, current rate is correct 
  ;; here we have to decide whether magify-seg is dealing with the pass counter at the
  ;; end of the current segment, or at the start of the next, and keep this part of the
  ;; code in sync with env in mus.lisp.
  (emit '(STORE R3 X R7 RN))		;otherwise update ptr
  (emit '(JMP try-again))		;added 10-June-92 -- we were off by one because of the
					;  confusion about whether we mean the end of this segment
					;  or the start of the next, but we can't just read the new
					;  rate because it might be an end marker or some other
					;  special thing, and we might accidentally add it into
					;  the "current value".  So, we just start over.
  (emit '(HIDE try-again))
  (emit '(ptr-ok LOCAL))
  (emit '(LOAD A L R7 R))		;"current-value" or "power" -- this is what we return (prog1 style)
  (emit '(ADD A B)      '(LOAD X1 1))	;X1=1 is used in exp check
  (emit '(STORE B L R7 R+1))		;save new version of current-value (not used on this pass unless step function)
  (when load-expt
    (emit '(LOAD B L R7 RN))		;N7=1 so this is func-base (at [cur-val-loc+2] = 1.0 if linear, 0.0 if step
					; if exp func, this same location is a pointer to data-base (base of env data)
    (emit '(CMP X1 B)   '(COPY B R6))	; so R6 => data-base if exp-func, otherwise garbage
    (emil '(JEQ seg-done))		;pointer to base=1.0 means it's linear
    (emit '(TST B)      '(LOAD N6 3))
    (emil '(JNE exp-time))		;in step function, so...
    (emit '(LOAD A L R7 1-R))		;  return rate
    (emit '(STORE B L R7 R))		;  zero "current-value"
    (emil '(JMP seg-done))
    (emit '(exp-time LOCAL))
    (emit '(LOAD B L R3 1-R))		;check for rate=0 --> skip expt in that case
    (emit '(TST B))
    (emit '(JNE full-expt))
    (emit '(LOAD A L R6 RN))		;current-value ([data-base+3])
    (emit '(RTS))
    (emit '(full-expt LOCAL))

    (emit '(TRANSFER A B) '(LOAD A L R6 R+1))
    ;;    (emit '(JSR .expt))			;uses A B X Y R4 R5, A=base B=power upon entry, returns value in A
    ;; old form used full expt on every pass -- this is silly because the "base" never changes
    (emit '(STORE A L orig-A))		;mimic setup in .expt
    (emit '(STORE B L temp-loc-1))
    (emit '(LOAD A L R6 RN))		;pick up preloaded (log (abs base))
    (emit '(JSR .constant-base-expt))

    (emit '(LOAD B 1))			;(- (expt base power) 1.0)
    (emit '(SUB B A)      '(LOAD B L R6 R+1))
    (emit '(JSR .real-mpy))		;* scaler
    (emit '(LOAD B L R6 R+1))
    (emit '(ADD B A))			;+ offset
    (emit '(STORE A L R6 R))
    (emit '(seg-done LOCAL)))
  (emit '(RTS)))

(libdecl .restart-env 'restart-env-load '(A B X Y R3 R6) nil)
(defun restart-env-load ()
  (emit '(.restart-env))		;R7=>home (current-value), N7=3
					;    x[home+1] => env data current segment, y[home+1]=(data-size + (true-data-base ash 8))
					;    l[home+2] = (real)base
					;    x[home+3] => restart-info: 
					;       x[restart]: true-data-base  y[restart]:number of pairs
					;       x[restart+1]: current-value-location if expenv, else 0 (y=0)
					;       then L cur-val-loc val (power or current-value)
					;            L start-sample
					;            L current-value of expenv, else unset
					; data base if seg is data-base [segs]
					;           if exp is base scaler offset current-value (log (abs base)) [segs]
					;   segs are pairs (long-int:pass real:rate) followed by -1 -1
					; true-data-base => segs base, there are data-size/2 pairs
  ;; if seg, put init-y into current-value,
  ;;         run through pairs, subtract out start-sample from x, add in <pass-counter>
  ;; if exp, put power in current-value, put init-y in true-current-value at (- true-base 2), run through pairs
  ;; restart info is at (R7)+4 -- if 0, fall into error handler with %external-unrestartable-envelope-error
  (emit '(LOAD A X R7 RN))
  (emit '(TST A)       '(LOAD N3 2))
  (emit '(JNE restart-ok))
  (emit '(BSET M-HF2 X-IO M-HCR))	;if restart-info ptr = 0, we've got trouble
  (emit '(BSET M-HF3 X-IO M-HCR))
  (put-datum %external-unrestartable-envelope-error)
  (emit '(JSR .break))			;is this the right thing?
  (emit '(restart-ok LOCAL))		;if we get here, we've got what appears to be a restart block
  (emit '(LOAD R3 X R7 RN))		;  R3=>restart block
  (emit '(LOAD A L <pass-counter>))
  (emit '(LOAD B L R3 RN))		;cur-val-loc, whatever it actually is
  (emit '(STORE B L R7 R+1))
  (emit '(LOAD R6 X R3 R))		;R6=>true-data-base
  (emit '(LOAD X0 Y R3 R+1))		;X0=number of pairs, R3=>current-value-loc if expenv
  (emit '(LOAD B L R3 RN))		;B=start-sample
  (emit '(STORE A L R3 RN))		; update start-sample for next iteration
  (emit '(STORE R6 X R7 R))		;update env's ptr into current segment
  (emit '(SUB B A)     '(LOAD B L R3 R+1))
  (emit '(TST B)       '(LOAD Y L R3 RN))
  (emit '(COPY B1 R3))
  (emit '(JEQ not-exp-env))
  (emit   '(STORE Y L R3 R))
  (emit '(not-exp-env LOCAL))
  (emit '(LOAD N6 2))
  (emit '(DO X0 reset-samples))
  (emit     '(LOAD B L R6 R))
  (emit     '(ADD A B))
  (emit     '(STORE B L R6 R+N))
  (emit '(reset-samples LOCAL))
  (emit '(NOP))
  (emit '(RTS)))


;;; sine and cosine
;;;

(libdecl .sine-init 'sine-init nil nil)
(defun sine-init ()
  (emit '(.sine-init))
  (emit (list 'DEFINE 'sine-table-base (get-L-mem 8)))
  (emit '(Y-ORG sine-table-base))
  (emit (list 'Y-DATA
	      (make-fraction (sin 0))
	      (make-fraction (sin 1))
	      (make-fraction (sin 2))
	      (make-fraction (sin 3))
	      (make-fraction (sin 4))
	      (make-fraction (sin 5))
	      (make-fraction (sin 6))
	      (make-fraction (sin 7))))
  (emit '(X-ORG sine-table-base))
  (emit (list 'X-DATA
	      (make-fraction (cos 0))
	      (make-fraction (cos 1))
	      (make-fraction (cos 2))
	      (make-fraction (cos 3))
	      (make-fraction (cos 4))
	      (make-fraction (cos 5))
	      (make-fraction (cos 6))
	      (make-fraction (cos 7)))))

(libdecl .cosine-init 'cosine-init nil nil)
(defun cosine-init ()			;put pi/2 at loc half-pi as "real" -- int in X, unsigned frac in Y
  (emit '(.cosine-init))
  (emit (list 'DEFINE 'half-pi (get-L-mem)))
  (emit '(X-ORG half-pi))
  (emit '(X-DATA 1))
  (emit '(Y-ORG half-pi))
  (emit (list 'Y-DATA (logand (floor (scale-float (- (/ (coerce pi 'single-float) 2) 1.0) 24)) #xffffff))))

;;; since sin (z1 + z2) = sin z1 cos z2 + cos z1 sin z2   (think of z1=integer, z2=fraction),
;;; we can do sin and cos without any huge table and table lookup as follows:
;;; store sin 0 ... sin 7 and cos 0 .. cos 7 in data memory (to handle the integer part),
;;; then use the power series representations to get the fractional part.  Multiply
;;; cross terms, and add -- voila!  This is a reasonable thing to do because we can
;;; interleave the sin and cos power series calculations, taking maximal advantage of
;;; the parallel move operations (X Y, X R, Y R, and so on).  We assume the caller
;;; of the basic routine has checked that the argument (A as a "real") is between 0 and two-pi.
;;; Only specially messy case involves A1=0 (that's the only time the result of the integer
;;; part is not fractional), so we check for that explicitly.  

;;; change made 13-Oct-90: add two terms to the polynomial to give use ca 22 bits of accuracy (previous
;;;   was 16) -- this mainly to make huge FFTs more likely to be correct, but I belatedly realized
;;;   the factor of 1/N in the FFT would shift all my data away for a really big FFT...  

(libdecl .sine 'sine-load '(A B X Y R1) '(.sine-init))
(defun sine-load ()			;R1 used to point to sine table base (also A B X Y and N1)
  (emit '(.sine))			;here phase is assumed to be 0..three-pi
  (emit '(TST A) '(COPY A1 B))		;if phase=0, return 0, put integer part in B
  (emil '(JEQ all-done))

  (emit '(LSL A) '(COPY A1 N1))		;prepare to make A0 a true fraction, save integer part
					;  (emit '(ASR A) '(LOAD Y0 -.0001984))
  (emit '(ASR A) '(LOAD Y0 .00000275))	;now A0 is ok as arg to MPY, Y0 is first sin coeff
  (emit '(TST B) '(COPY A0 X0))		;is there integer part?  Put fractional part in X0.
  (emil '(JNE normal-sine))		;(JPL = not negative), JGT worries about overflow

  ;; use Abramowitz and Stegun 4.3.97 polynomial approximation to sine
  (emit '(MPY X0 X0 B))			;x^2 in B (B1)
					; inserted term here
  (emit                '(COPY B X1)   '(LOAD B -.00019841))
  (emit '(MAC X1 Y0 B))
					; end inserted term
  (emit                '(COPY B Y0)   '(LOAD B .00833333)) ;now x^2 in X1, second sin coeff in B
  (emit '(MAC X1 Y0 B))
  (emit                '(COPY B Y0)   '(LOAD B -.16666666))
  (emit '(MAC X1 Y0 B) '(COPY X0 A))	
  (emit                '(COPY B Y0))
  (emit '(MPY X1 Y0 B))
  (emit                '(COPY B Y0))
  (emit '(MACR X0 Y0 A))
  (emit '(RTS))

  ;; interleave 4.3.97 as above with 4.3.99 -- polynomial approximation to cosine
  (emit '(normal-sine LOCAL))			;integer part /= 0, so do the double power series mentioned above
					;  (emit '(MPY X0 X0 A)                 '(LOAD Y1 -.001388))
					;  (emit                '(COPY A X1)    '(LOAD A .00833333))
					;  (emit '(MAC X1 Y0 A)                 '(LOAD B .041666))
; new form: (next term is .00000026 = 2 as a 24 bit fraction)
  (emit '(LOAD R1 sine-table-base))

  (emit '(MPY X0 X0 A)                 '(LOAD Y1  .00002476))
  (emit                '(COPY A X1)    '(LOAD A  -.00019841))
  (emit '(MAC X1 Y0 A)                 '(LOAD B  -.00138884))
  (emit '(MAC X1 Y1 B) '(COPY A Y0)    '(LOAD A   .00833333))
  (emit '(MAC X1 Y0 A) '(COPY B Y1)    '(LOAD B   .04166664))
;back to old...
  (emit '(MAC X1 Y1 B) '(COPY A Y0)    '(LOAD A  -.16666666))
  (emit '(MAC X1 Y0 A) '(COPY B Y1)    '(LOAD B  -.49999999))
  (emit '(MAC X1 Y1 B) '(COPY A Y0)    '(STORE X0 X temp-loc))
  (emit '(MPY X1 Y0 A) '(COPY B Y1))
  (emit '(MPY X1 Y1 B) '(COPY A Y0)    '(LOAD A X temp-loc))
  (emit '(MAC X0 Y0 A) '(LOAD X1 fractional-one))
  (emit '(ADD X1 B)    '(COPY A Y0)    '(LOAD X1 X R1 RN))
  (emit '(MPYR X1 Y0 A) '(COPY B X0)   '(LOAD Y1 Y R1 RN))
  (emit '(MPYR X0 Y1 B))
  (emit '(ADD B A)     '(LOAD B fractional-one))
  (emit '(CMP B A))
  (emit '(TGT B A))
  (emit '(all-done LOCAL))
  (emit '(RTS)))

(libdecl .cosine 'cosine-load '(A B X Y) '(.sine .sine-init .cosine-init))
(defun cosine-load ()
  (emit '(.cosine))
  (emit '(LOAD B L half-pi SHORT))		;just use Cos(x) = Sin(x + pi/2)
  (emit '(ADD B A))
  (emit '(JMP .sine)))

	      
(libdecl .oscil-init 'oscil-init nil nil)
(defun oscil-init ()			;put pi/2 at loc half-pi as "real" -- int in X, unsigned frac in Y
  (emit '(.oscil-init))
  (emit (list 'DEFINE 'two-pi (get-L-mem)))
  (emit '(X-ORG two-pi))
  (emit '(X-DATA 6))
  (emit '(Y-ORG two-pi))
  (emit (list 'Y-DATA (logand 
		       (floor (scale-float (- (* (coerce pi 'single-float) 2) 6.0) 24)) 
		       #xffffff))))

(libdecl .fix-up-phase 'fix-up-phase-load '(A B) '(.oscil-init)) ;for two-pi definition
(defun fix-up-phase-load ()
  (emit '(.compute-and-fixup-phase))
  (emit '(CLR B)     '(LOAD A L R3 R+1));A=current phase, R=>current frequency
  (emit '(STORE B L temp-loc))		;clear temp-loc (will be trigger indication)
  (emit '(LOAD B L R3 R+1))		;B=frequency, R=>fm term
  (emit '(ADD B A)   '(LOAD B L R3 R+1));A=freq+phase, B=fm, R=>increment (i.e. alias for base here)
					;must use B (or A) to get proper sign extension
  (emit '(ADD B A)   '(LOAD B L two-pi));A=instantaneous phase, B=two-pi for trigger check
  (emit '(.fix-up-phase))		;phase in A, two-pi in B, condition codes already set, unaffected by JSR
  (emil '(JGE check-two-pi))		;A not negative
  (emit '(make-pos))
  (emit '(ADD B A)     '(STORE B L temp-loc))	;A negative, so add in two-pi until positive
  (emil '(JMI make-pos))		;was JLT
  (emit '(check-two-pi LOCAL))
  (emit '(CMP B A))			;is A less than two-pi
  (emil '(JLE ok))
  (emit '(make-two-pi))
  (emit '(SUB B A)     '(STORE B L temp-loc))	;nope -- subtract out two-pi until it is
  (emil '(JGT make-two-pi))
  (emit '(ADD B A))			;ran over by one on preceeding loop
  (emit '(ok LOCAL))
  (emit '(HIDE make-two-pi make-pos))
  (emit '(RTS)))
;;; since we loop subtracting here, FM with an astronomical index will slow us down a lot


;;; OSCIL -- uses R2 to point to the OSCIL structure on chip, stored as:
;;;          current-fm (set to 0 at start, I hope) (all three values are L values -- "reals")
;;;          frequency  (set to (osc-freq s) at start)
;;;          current-phase (set to (osc-phase at start) 
;;; R2 points at current-phase, N2=2.

(libdecl .oscil 'oscil-load '(A B X Y) '(.sine .fix-up-phase .oscil-init .sine-init))
(defun oscil-load ()			;assume address of oscil data is in R2, N2=2
  (emit '(.oscil))
  (emit '(STORE A L temp-loc-1))
  (emit '(CLR B)    '(LOAD A L R2 R+1))	;get current phase, R=>freq
  (emit '(LOAD B L R2 R+1))		;current frequency, R=>fm
  (emit '(ADD B A) '(LOAD B L R2 R-N))	;phase+freq, get fm, point back at phase for later update
  (emit '(ADD B A) '(LOAD B L two-pi))	;phase+fm, get limit check (must use B, not X or Y)
					;we only save one instruction by noticing <zero> at compile time
  (emit '(JSR .fix-up-phase))
  (emit '(LOAD B L temp-loc-1))
					;by updating phase first, we start at sample 1 in effect, rather than 0.
  (emit '(TST B)    '(STORE A L R2 R))
  (emit '(JEQ .sine))			;will RTS, so this is an exit from .oscil
  (emit '(ADD B A)  '(LOAD B L two-pi))	;phase modulation term is not zero, so deal with it.
  (emit '(JSR .fix-up-phase))
  (emit '(JMP .sine)))

(libdecl .trig-sine 'trig-sine-load '(A B X Y) '(.fix-up-phase .sine .oscil-init .sine-init .real-mod))
(defun trig-sine-load ()
  (emit '(.trig-sine))			;arbitrary phase in A
  (emit '(LOAD B L two-pi))		;mimic OSCIL and friends
  (emit '(JSR .real-mod))
  (emit '(JMP .sine)))

(libdecl .trig-cosine 'trig-cosine-load '(A B X Y) '(.fix-up-phase .cosine .cosine-init .sine .oscil-init .sine-init .real-mod))
(defun trig-cosine-load ()
  (emit '(.trig-cosine))		;arbitrary phase in A
  (emit '(LOAD B L two-pi))		;mimic OSCIL and friends
  (emit '(JSR .real-mod))		;get A between 0 and 2*pi
  (emit '(JMP .cosine)))

	      
(libdecl .sw-init 'sw-init nil nil)
(defun sw-init ()			;put pi at loc one-pi as "real" -- int in X, unsigned frac in Y
  (emit '(.sw-init))
  (emit (list 'DEFINE 'one-pi (get-L-mem)))
  (emit '(X-ORG one-pi))
  (emit '(X-DATA 3))
  (emit '(Y-ORG one-pi))
  (emit (list 'Y-DATA (logand 
		       (floor (scale-float (- one-pi 3.0) 24)) 
		       #xffffff)))
  (emit (list 'DEFINE 'three-half-pi (get-L-mem)))
  (emit '(X-ORG three-half-pi))
  (emit '(X-DATA 4))
  (emit '(Y-ORG three-half-pi))
  (emit (list 'Y-DATA (logand 
		       (floor (scale-float (- three-half-pi 4.0) 24)) 
		       #xffffff))))

(libdecl .square-wave 'square-wave-load '(A B R6) '(.fix-up-phase .oscil-init .sw-init))
(defun square-wave-load ()		;stored: cur-val phase freq fm base (5 words)
  (emit '(.square-wave))		;assume R3 points at square wave data block, phase term (i.e. base+1)
  (emit '(COPY R3 R6))			;need phase pointer later
  (emit '(JSR .compute-and-fixup-phase))
  (emit '(STORE A L R6 R))		;save current phase
  (emit '(LOAD B L one-pi))
  (emit '(CMP A B)    '(LOAD A L R3 R-N)) ;is phase > pi, load up sw-base -> A
  (emil '(JGE ok))
  (emit '(CLR A))			;phase > pi, so return 0.0
  (emit '(ok LOCAL))
  (emit '(STORE A L R3 R))
  (emit '(RTS)))
 

(libdecl .pulse-train 'pulse-train-load '(A B R6) '(.fix-up-phase .oscil-init))
(defun pulse-train-load ()		;stored: cur-val phase freq fm base (5 words)
  (emit '(.pulse-train))		;assume R3 points at pulse-train data block, phase term (i.e. base+1)
  (emit '(COPY R3 R6))
  (emit '(JSR .compute-and-fixup-phase))
  (emit '(STORE A L R6 R))		;save current phase
  (emit '(LOAD A L temp-loc))		;triggered?
  (emit '(TST A)     '(LOAD B L R3 R))	;increment=base (set in make-pulse)
  (emit '(TNE B A))			;i.e. if no trigger, then A already has the right output value
  (emit '(RTS)))


(libdecl .triangle-wave 'triangle-wave-load '(A B R6) '(.fix-up-phase .oscil-init .sw-init .cosine-init .real-mpy))
(defun triangle-wave-load ()
  (emit '(.triangle-wave))
  (emit '(COPY R3 R6))
  (emit '(JSR .compute-and-fixup-phase))
  (emit '(STORE A L R6 R))
  (emit '(LOAD B L half-pi))
  (emit '(CMP B A)    '(LOAD B L three-half-pi))
  (emil '(JLT do-mpy))
  (emit '(CMP B A)    '(LOAD B L two-pi))
  (emil '(JLT three-case))
  (emit '(SUB B A))
  (emil '(JMP do-mpy))
  (emit '(three-case LOCAL))
  (emit '(TFR A B)    '(LOAD A L one-pi))
  (emit '(SUB B A))
  (emit '(do-mpy LOCAL))
  (emit '(LOAD B L R3 R-N))
  (emit '(JSR .real-mpy))		;we used to use increments rather than multiplies, but that wanders off the map
  (emit '(STORE A L R3 R))
  (emit '(RTS)))


(libdecl .sawtooth-wave 'sawtooth-wave-load '(A B R6) '(.fix-up-phase .oscil-init .sw-init .real-mpy))
(defun sawtooth-wave-load ()
  (emit '(.sawtooth-wave))
  (emit '(COPY R3 R6))
  (emit '(JSR .compute-and-fixup-phase))
  (emit '(STORE A L R6 R))
  (emit '(LOAD B L one-pi))
  (emit '(SUB B A)   '(LOAD B L R3 R-N))
  (emit '(JSR .real-mpy))
  (emit '(STORE A L R3 R))
  (emit '(RTS)))


;;; table-shift
(libdecl .table-shift 'table-shift-load '(A B R4 R5) nil)
(defun table-shift-load ()
  (emit '(.table-shift))		;R3->table   start:size   <>:x-or-y
  (emit '(LOAD R4 X R3 R))
  (emit '(LOAD B Y R3 R+1))
  (emit '(LOAD A 1))
  (emit '(SUB A B))
  (emit '(TST B))
  (emit '(JLE all-done))
  (emit '(LUA R5 R4 R+1))
  (emit '(JCLR 0 Y R3 R x-side))
  (emit '(DO B yy))
  (emit   '(LOAD A Y R5 R+1))
  (emit   '(STORE A Y R4 R+1))
  (emit   '(NOP))
  (emit   '(yy LOCAL))
  (emit '(NOP))
  (emit '(RTS))
  (emit '(x-side LOCAL))
  (emit '(DO B xx))
  (emit   '(LOAD A X R5 R+1))
  (emit   '(STORE A X R4 R+1))
  (emit   '(NOP))
  (emit   '(xx LOCAL))
  (emit '(NOP))
  (emit '(all-done LOCAL))
  (emit '(RTS)))
  

;;; dot-product of tables (arrays are easier except we need .real-mpy)
(libdecl .table-dot-product 'table-dot-product-load '(A B X Y R5 R0) nil)
(defun table-dot-product-load ()	;R3->one table, R4->other
  (emit '(.table-dot-product))
  (emit '(LOAD A Y R3 R))
  (emit '(LOAD B Y R4 R))
  (emit '(CMP A B)     '(LOAD R0 X R4 R+1))
  (emit '(TGT A B))			;B <- MIN A B   
  (emit '(LOAD R5 X R3 R+1))
  (emit '(CLR A))
  (emit '(JCLR 0 Y R4 R a-x))
  (emit '(LOAD X0 Y R0 R+1))
  (emit '(JCLR 0 Y R3 R b-x-a-y))
  (emit '(DO B yy))
  (emit '(LOAD X1 Y R5 R+1))
  (emit '(MAC X0 X1 A))
  (emit '(LOAD X0 Y R0 R+1))
  (emit '(yy LOCAL))
  (emit '(NOP))
  (emit '(RTS))
  (emit '(b-x-a-y LOCAL))
  (emit '(LOAD Y1 X R5 R+1))
  (emit '(REP B))
  (emit '(MAC X0 Y1 A)   '(LOAD Y1 Y R0 R+1)   '(LOAD X0 X R5 R+1))
  (emit '(RTS))
  (emit '(a-x LOCAL))
  (emit '(LOAD X0 X R0 R+1))
  (emit '(JCLR 0 Y R3 R b-x-a-x))
  (emit '(LOAD Y1 Y R5 R+1))
  (emit '(REP B))
  (emit '(MAC X0 Y1 A)   '(LOAD X0 X R0 R+1)   '(LOAD Y1 Y R5 R+1))
  (emit '(RTS))
  (emit '(b-x-a-x LOCAL))
  (emit '(DO B xx))
  (emit '(LOAD X1 X R5 R+1))
  (emit '(MAC X0 X1 A))   
  (emit '(LOAD X0 X R0 R+1))
  (emit '(xx LOCAL))
  (emit '(NOP))
  (emit '(RTS)))



;;; for array dot product, x[0]=2*base, x[1]=size-1

;;; dot-product of arrays

(libdecl .array-dot-product-init 'array-dot-product-init nil nil)
(defun array-dot-product-init ()
  (emit `(.array-dot-product-init))
  (emit `(DEFINE dot-product-sum ,(get-l-mem))))

(libdecl .array-dot-product 'array-dot-product-load '(A B X Y R5 R0 N0) '(.array-dot-product-init .real-mpy))
(defun array-dot-product-load ()	;R3->one array, R4->other
  (emit '(.array-dot-product))
  (emit '(LOAD A X R3 R+1))
  (emit '(ASR A)       '(LOAD B X R4 R+1))
  (emit '(ASR B)       '(COPY A R0))
  (emit '(COPY B R5))
  (emit '(LOAD A X R3 R))
  (emit '(LOAD B X R4 R))
  (emit '(CMP A B))
  (emit '(TGT A B))			;B <- MIN A B  
  (emit '(CLR A)      '(COPY B N0))
  (emit '(STORE A L dot-product-sum))
  (emit '(DO N0 sum))
  (emit     '(LOAD A L R0 R+1))
  (emit     '(LOAD B L R5 R+1))
  (emit     '(JSR .real-mpy))
  (emit     '(LOAD B L dot-product-sum))
  (emit     '(ADD B A))
  (emit     '(STORE A L dot-product-sum))
  (emit '(sum LOCAL))
  (emit '(NOP))
  (emit '(RTS)))
 

;;; table-interp "unit generator"
(libdecl .table-interp 'table-interp-load '(B X Y R4) nil)
(defun table-interp-load ()
  (emit '(.table-interp))		;A=index (real), R3->table header: base  base-1
					;                                  size  x-or-y
  (emit '(ABS A)     '(LOAD R4 X R3 R))	;R4->base of table
  (emit '(LSL A)     '(LOAD B Y R3 R+1)) ;B=size of table
  (emit '(ASR A)     '(LOAD X1 1))	;convert A0 to fraction
  (emit '(COPY A0 Y0))
  (emit '(LOAD A0 0))			;truncate (since we're positive)
  (emit '(check-index))
  (emit '(CMP B A))
  (emil '(JLT ok))
  (emit '(SUB B A))
  (emil '(JMP check-index))
  (emit '(HIDE check-index))
  (emit '(ok LOCAL))
  (emit '(SUB X1 B)   '(COPY A N4))			;N4=index of lower entry
  (emit '(JCLR 0 Y R3 R x-side))
  (emit '(LOAD X0 Y R4 RN))
  (emil '(JMP got-low-data))
  (emit '(x-side LOCAL))
  (emit '(LOAD X0 X R4 RN))
  (emit '(got-low-data LOCAL))
  (emit '(COPY N4 X1))
  (emit '(CMP X1 B)    '(UPDATE R4 R+N)) ;are we at the end of the table?
  (emil '(JNE go-on))
  (emit '(LOAD R4 X R3 R))
  (emit '(NOP))
  (emit '(go-on LOCAL))
  (emit '(UPDATE R4 R+1))
  (emit '(JCLR 0 Y R3 R x-1-side))
  (emit '(LOAD B Y R4 R))		;now Y0=fraction in index, X0=first entry, B=second entry
  (emil '(JMP got-data))
  (emit '(x-1-side LOCAL))
  (emit '(LOAD B X R4 R))
  (emit '(got-data LOCAL))
  (emit '(SUB X0 B)     '(COPY X0 A))	;B=diff, A=first
  (emit '(COPY B X1))			;X1=diff
  (emit '(MAC X1 Y0 A))			;A=first + diff*frac
  (emit '(RTS)))


;;; array-interp "unit generator"
(libdecl .array-interp 'array-interp-load '(B X Y R4) '(.real-frac-mpy))
(defun array-interp-load ()
  (emit '(.array-interp))		;A=index (real), R3->array header: base*2      size-1
					;                                  1 (el-siz)  top
  (emit '(ABS A)     '(LOAD B X R3 R+1))
  (emit '(ASR B)     '(LOAD X0 1))
  (emit '(LSL A)     '(COPY B1 Y0))	;Y0 and R4->base of array
  (emit '(ASR A)     '(LOAD B X R3 R))  ;B=size-1 of array, convert A0 to fraction
  (emit '(ADD X0 B)  '(COPY A0 X1))	;B=size, X1=interp fraction
  (emit '(LOAD A0 0))			;truncate (since we're positive)
  (emit '(check-index))
  (emit '(CMP B A)   '(COPY Y0 R4))
  (emil '(JLT ok))
  (emit '(SUB B A))
  (emil '(JMP check-index))
  (emit '(HIDE check-index))
  (emit '(ok LOCAL))
  (emit '(SUB X0 B)   '(COPY A N4))	;N4=index of lower entry
  (emit '(NOP))
  (emit '(LOAD A L R4 RN))
  (emit '(COPY N4 X0))
  (emit '(CMP X0 B)   '(UPDATE R4 R+N))	;are we at the end of the table?
  (emil '(JNE go-on))
  (emit '(COPY Y0 R4))
  (emit '(JMP got-base))
  (emit '(go-on LOCAL))
  (emit '(UPDATE R4 R+1))
  (emit '(got-base LOCAL))
  (emit '(TFR A B)    '(STORE A L temp-loc))
  (emit '(LOAD A L R4 R))
  (emit '(SUB B A))
  (emit '(JSR .real-frac-mpy))
  (emit '(LOAD B L temp-loc))
  (emit '(ADD B A))
  (emit '(RTS)))


(libdecl .sum-of-cosines 'sum-of-cosines-load '(A B X Y R6) '(.trig-sine .basic-divide .real-mpy .sine .fix-up-phase))
;; sin((n+.5)x)/sin(x/2)*scaler -- compute and fix-up phase as per simple-wave,

(defun sum-of-cosines-load ()		;stored phase freq fm scaler:temp (n+.5)
  (emit '(.sum-of-cosines))
  (emit '(COPY R3 R6))
  (emit '(JSR .compute-and-fixup-phase))
  (emit '(STORE A L R6 R))
  (emit '(ASR A))
  (emit '(JSR .sine))
  (emit '(STORE A Y R3 R+1))		;divisor eventually
  (emit '(LOAD B L R3 R-1))
  (emit '(LOAD A L R6 R))
  (emit '(JSR .real-mpy))		;(n+.5) * x
  (emit '(JSR .trig-sine))
  (emit '(COPY A X1))
  (emit '(LOAD X0 X R3 R))		;scaler
  (emit '(MPY X1 X0 A)    '(LOAD B Y R3 R))
  (emit '(JMP .basic-divide)))


(libdecl .randh 'randh-load '(A B X Y R6) '(.oscil-init .fix-up-phase .random .sambox-random .real-frac-mpy))
(defun randh-load ()			;assume N3=2, R3=>noi struct
  (emit '(.randh))
  (emit '(COPY R3 R6))
  (emit '(JSR .compute-and-fixup-phase))
  (emit '(STORE A L R6 R))
  (emit              '(LOAD B L temp-loc))
  (emit '(TST B)     '(LOAD A L R3 R+1))	;A=current-value, R=>amplitude
  (emil '(JEQ done))
  ;; here we need a jump to Sambox-random rather than random if randh option set -- jump to (noi-ran-op r) in effect
  (emit '(JCLR 0 X R3 RN normal-case))	;if X[6]=1 use sambox noise
  (emit '(JSR .Sambox-random))
  (emil '(JMP fix-up-result))
  (emit '(normal-case LOCAL))
  (emit '(JSR .random))
  (emit '(fix-up-result LOCAL))
  (emit '(COPY A0 X1))
  (emit '(LOAD A L R3 R-1))
  (emit '(JSR .real-frac-mpy))
  (emit '(STORE A L R3 R))
  (emit '(done LOCAL))
  (emit '(RTS)))


(libdecl .randi 'randi-load '(A B X Y R6) '(.oscil-init .fix-up-phase .random .real-frac-mpy))
(defun randi-load ()			         ;R3->randi info block [phase, freq, fm, output, base, incr, 1/passes
  (emit '(.randi))			         ;assumes N3=2
  (emit '(COPY R3 R6))
  (emit '(JSR .compute-and-fixup-phase))
  (emit '(STORE A L R6 R))		;R3->output
  (emit                    '(LOAD A L R3 R+N))   ;current output
  (emit                    '(LOAD B L R3 R-N))   ;current increment
  (emit '(ADD B A)         '(LOAD B L temp-loc))
  (emit '(TST B)           '(STORE A L R3 R+1))  ;store new output, R3->base
  (emil '(JEQ done))
  (emit '(JSR .random))
  (emit                    '(COPY A0 X1))        ;X1=new random number
  (emit                    '(LOAD A L R3 R-1))	 ;A=base amp, R3->output
  (emit '(JSR .real-frac-mpy))                   ;A=base*random
  (emit                    '(LOAD B L R3 R+N))   ;B=output, R3->incr
  (emit '(SUB B A)         '(UPDATE R3 R+1))     ;R3->1/passes
  (emit                    '(LOAD X1 Y R3 R-1))  ;Y0=1/passes (fraction)
  (emit '(JSR .real-frac-mpy))
  (emit                    '(STORE A L R3 R-N))  ;new increment, R3->output
  (emit                    '(LOAD A L R3 R))     ;return output
  (emit '(done LOCAL))
  (emit '(RTS)))


(libdecl .ur-table-lookup 'ur-table-lookup-load '(A B X) nil)
(defun ur-table-lookup-load ()		;phase freq fm tblsiz:tbl-base tblsiz/two-pi 0:mem (for higher level table lookup)
  (emit '(.ur-table-lookup))		;R3->base of tbl struct
  (emit '(LOAD A L R3 R+1))		     ;current phase
  (emit '(LOAD B L R3 R+1))		     ;frequency
  (emit '(ADD B A)       '(LOAD B L R3 R+1)) ;fm or gliss
  (emit '(add-B))
  (emit '(ADD B A)       '(LOAD B X R3 R))   ;table-size : table-base
  (emil '(JLT add-B))			;add in table-size until phase >= 0
  (emit '(cmp-B))
  (emit '(CMP A B)       '(LOAD N3 3 SHORT))	;are we within table bounds?
  (emil '(JGT ok-phase))		;yes -- go do table lookup
  (emit '(SUB B A))			;no -- subtract out table size
  (emil '(JMP cmp-B))			;and try again
  (emit '(.table-lookup-with-ok-phase))	;others who jump here should have A=true phase as real
					;  B=0 (not vital), and R3->table base.  R-N(3)=cur phase
  (emit '(ok-phase LOCAL))
  (emit '(COPY A1 B))			;B=int part of phase
  (emit '(LOAD X1 Y R3 R-N))		;X1=table base, R->current phase
  (emit '(ADD X1 B)      '(STORE A L R3 R))   ;B=lower table entry addr, update current phase
  (emit '(LSL A)         '(COPY B R3))	;make A0 into true fraction, R3->lower table entry
  (emit '(ASR A))			;finish A0 conversion, wait for R3 to settle
  (emit '(COPY A0 X0))			;X0=fraction (if any)
  (emit '(JSET 0 Y temp-loc-1 y-mem))	;where is the table?
  (emit '(LOAD A X R3 R+1))		;lower table entry
  (emit '(LOAD B X R3 R))		;upper entry
  (emil '(JMP got-diff))
  (emit '(y-mem LOCAL))
  (emit '(LOAD A Y R3 R+1))
  (emit '(LOAD B Y R3 R))
  (emit '(got-diff LOCAL))
  (emit '(SUB A B))			;B=difference
  (emit '(COPY B X1))
  (emit '(MACR X1 X0 A))
  (emit '(HIDE cmp-B add-B))
  (emit '(RTS)))

#|
(libdecl .table-lookup 'table-lookup-load '(A B X) '(.fix-up-phase))
(defun table-lookup-load ()		;this is now handled as a macro in mus.lisp
  (emit '(.table-lookup))
  (emit '(COPY R3 R6))			;save pointer to current phase
  (emit '(JSR .compute-and-fixup-phase))
  (emit '(STORE A L R6 R))		;save new phase
  (emit '(UPDATE R3 R+1))		;R3->"mag" in effect
  (emit '(LOAD B L R3 R-1))		;R3->base (in Y) for ur-table-lookup
  (emit '(JSR .real-MPY))		;scale "phase"
  (emit '(LOAD N3 3 SHORT))
  (emit '(JMP .table-lookup-with-ok-phase)))
|#


(libdecl .delay 'delay-load '(A B X R5) #+QP '(.QP-initialize-DRAM .UG-nil-warning) #-QP '(.UG-nil-warning))
(defun delay-load ()			;assume R4 has home+1, X0 has input
  (emit '(.delay))			;X:home=start, Y:home=end, X:(home+1)=current ptr, Y:(home+1)=0 if X side, 1 if Y
  (emit '(JSET 1 Y R4 R ext-dly-mem))	;Y:(home+1) has 2 on = external memory reference
  (emit '(JCLR 2 Y R4 R normal-mem))
					; if QP DRAM...
  #+QP (progn
	 (emit '(LOAD X1 X R4 R))	;current pointer into buffer (R5=16 bit, so should use some 24 bit register -- X1)
	 (QP-refresh-off)
	 (QP-read-DRAM 'X1 'A)
	 (QP-write-DRAM 'X1 'X0)
	 (QP-refresh-on)
	 (emit '(CLR B)    '(UPDATE R4 R-1))
	 (emit '(LOAD B1 1))
	 (emit '(ADD X1 B) '(LOAD X1 Y R4 R))
	 (emit '(CMP X1 B) '(LOAD X1 X R4 R+1))
	 (emit '(JLE store-Q-ptr))
	 (emit '(STORE X1 X R4 R))
	 (emit '(RTS))
	 (emit '(store-Q-ptr LOCAL))
	 (emit '(STORE B X R4 R))
	 (emit '(RTS)))
					;if normal 56000 memory...
  (emit '(normal-mem LOCAL))
  (emit '(LOAD R5 X R4 R))		;current pointer into buffer
  (emit '(JSET 0 Y R4 R-1 y-mem))	;assume loader put 1 in Y:(home+1) if Y side delay line
  (emit '(LOAD A X R5 R))		;output of delay line
  (emit '(STORE X0 X R5 R+1))		;store current input, increment buffer pointer
  (emil '(JMP ok))
  (emit '(y-mem LOCAL))
  (emit '(LOAD A Y R5 R))
  (emit '(STORE X0 Y R5 R+1))		;same, but on Y side 
  (emit '(ok LOCAL))
					;now update delay structure data
  (emit '(CLR B)    '(LOAD X1 Y R4 R))	;X1=end of buffer
  (emit '(COPY R5 B))			;B=incremented pointer
  (emit '(CMP X1 B) '(LOAD X1 X R4 R+1)) ;past end of buffer?
  (emil '(JLE store-ptr))		;set in pass-dly (code56.lisp) as actual buffer last address
  (emit '(STORE X1 X R4 R))
  (emit '(RTS))
  (emit '(store-ptr LOCAL))
  (emit '(STORE R5 X R4 R))		;update current pointer
  (emit '(RTS))

  (emit '(ext-dly-mem LOCAL))		;delay line is in external (68000) memory
  ;; if XY:home=0, then this is an uninitialized delay and something has gone awry.
  ;; Overall pattern is 0-0:0-3:0-0 in that case (pass-nil-dly in ins56.lisp)
  (emit '(CLR B)  '(LOAD A L R4 R))
  (emit '(LOAD B0 3))
  (emit '(CMP A B)  '(COPY R4 A))
  (emit '(JSEQ .UG-nil-warning))

  (emit '(BSET M-HF3 X-IO M-HCR))
  (get-datum 'A)
  (emit '(BCLR M-HF3 X-IO M-HCR))
  (put-datum 'X0)			;X0=input, A=output (let 68000 handle pointers and so on in this case)
  (emit '(RTS)))

(libdecl .tap 'tap-load '(A B X R5) #+QP '(.QP-initialize-DRAM .UG-nil-warning) #-QP '(.UG-nil-warning))
(defun tap-load ()			;assume R4 has home+1, X0=offset
  (emit '(.tap))			;X:home=start, Y:home=end, X:(home+1)=current ptr, Y:(home+1)=0 if X side, 1 if Y
  (emit '(LOAD A X R4 R-1))		;buffer pointer for delay line
  (emit '(SUB X0 A) '(LOAD B X R4 R+1))	; ditto - offset => A (might be less than X:home), B=start
  (emit '(CMP B A)  '(LOAD X1 1))	; = A-B = negative if less than buffer start
  (emit '(JGE buf_ok))
  (emit '(SUB B A)  '(LOAD B Y R4 1-R))
  (emit '(ADD B A)  '(LOAD X0 Y R4 R+1)) ;increment R4 back to home+1
  (emit '(ADD X1 A))			; home-start is off by 1 (we doing mod size here)
  (emit '(buf_ok LOCAL))		;now A = location of tap data in QP/X/Y mem
  (emit '(JSET 1 Y R4 R ext-dly-mem))	;Y:(home+1) has 2 on = external memory reference
  (emit '(JCLR 2 Y R4 R normal-mem))
  
  ;; if QP DRAM...
  #+QP (progn
	 (emit '(COPY A X1))
	 (QP-refresh-off)
	 (QP-read-DRAM 'X1 'A)
	 (QP-refresh-on)
	 (emit '(RTS)))
					;if normal 56000 memory...
  (emit '(normal-mem LOCAL))
  (emit '(COPY A R5))			;current pointer into buffer
  (emit '(JSET 0 Y R4 R-1 y-mem))	;assume loader put 1 in Y:(home+1) if Y side delay line
  (emit '(LOAD A X R5 R))		;output of delay line
  (emit '(RTS))
  (emit '(y-mem LOCAL))
  (emit '(LOAD A Y R5 R))
  (emit '(RTS))
  
  (emit '(ext-dly-mem LOCAL))		;delay line is in external (68000) memory
  ;; if XY:home=0, then this is an uninitialized delay and something has gone awry.
  ;; Overall pattern is 0-0:0-3:0-0 in that case (pass-nil-dly in ins56.lisp)
  ;; this actually won't work, I'm afraid.  We need a way for next56.c to tell a tap from a delay line.
  ;; until someone actually wants this, the other cases will have to suffice.
  (emit '(JMP .UG-nil-warning))
  (emit '(RTS)))


(libdecl .zdelay 'zdelay-load '(A B X Y R3 R6) #+QP '(.QP-initialize-DRAM .delay .UG-nil-warning) #-QP '(.UG-nil-warning))
(defun zdelay-load ()			;assume R4 has (home+1) as in .delay, X0 has input
  (emit '(.zdelay))			;L:home=delay structure (see above), L:(home+3)=pm, L:(home+4)=phase, {N4=2}, L:(home+5)=size
					;delay unit is at 0 loc in struct for .R-array-clear if array of zdlys
  (emit '(JSET 1 Y R4 R ext-zdly-mem))	;Y:(home+1) has 2 on = external memory reference
  (emit '(LOAD A L R5 R+1))		;R5 assumed to point at pm
  (emit '(LOAD B L R5 R+1))
  ;; (emit '(ADD A B))			;B = phase -- 5-Nov-92 surely this should be SUB (see mus.lisp)
  (emit '(SUB A B))			;i.e. B-A=>B where A=pm, B=phase
  (emit '(tst-B))
  (emit '(TST B)    '(LOAD A L R5 R))	;A <= size (as a real), see if phase is negative
  (emit '(JLT neg-phase))
  (emit '(cmp-B))
  (emit '(CMP B A)  '(COPY B1 Y1))	;B>=0 -- is it in range? (CMP B A = A-B -- very confusing)
  (emit '(JGT ok-low))			;i.e. A-B>0 -- so B is less than A
  (emit '(SUB A B))
  (emit '(JMP cmp-B))
  (emit '(neg-phase LOCAL))
  (emit '(ADD A B))
  (emit '(JMP tst-B))
  (emit '(HIDE tst-B cmp-B))
  (emit '(ok-low LOCAL))
  (emit '(LSL B))			;convert B0 to a true fraction
  (emit '(ASR B))
  (emit '(COPY B0 Y0))
  (emit '(CLR B))
  (emit '(LOAD B1 1))
  (emit '(ADD Y1 B))
  (emit '(CMP B A))			;check for wrap around (is low index = last buffer location?)
  (emit '(JGT high-ok))
  (emit '(CLR B))
  (emit '(high-ok LOCAL))		;B=high index, Y1=low, Y0=frac part, X0=input, A=size, R5=>size, R4=>(home+1)

  (emit '(JCLR 2 Y R4 R-1 normal-zmem))
					; if QP DRAM...
  #+QP (progn
	 (emit '(LOAD A X R4 R+1))	;start of buffer
	 (emit '(ADD A B))
	 (emit '(ADD Y1 A))
	 (QP-refresh-off)
	 (QP-read-DRAM 'B 'X1)		;buf[high] => X1
	 (QP-read-DRAM 'A 'Y1)		;buf[low] => Y1
	 (emit '(LOAD A X R4 R-1))	;A=current ptr
	 (QP-write-DRAM 'A 'X0)		;write new input at current buffer loc
	 (QP-refresh-on)
	 
	 (emit '(CLR B))
	 (emit '(LOAD B1 1))
	 (emit '(ADD A B)    '(LOAD X0 Y R4 R))
	 (emit '(CMP X0 B)   '(LOAD X0 X R4 R+1))
	 (emit '(JLE store-ptr))
	 (emit '(STORE X0 X R4 R))
	 (emit '(JMP interp))
	 (emit '(store-ptr LOCAL))
	 (emit '(STORE B X R4 R))
	 (emit '(JMP interp)))

  (emit '(normal-zmem LOCAL))
  (emit '(LOAD A X R4 R+1))		;start of buffer
  (emit '(ADD A B)        '(COPY R5 N5))
  (emit '(LOAD R5 X R4 R))		;current pointer into buffer
  (emit '(ADD Y1 A)       '(COPY B R6))
  (emit '(COPY A R3))

  (emit '(JSET 0 Y R4 R-1 y-mem))	;assume loader put 1 in Y:(home+1) if Y side delay line
  (emit '(LOAD X1 X R6 R))
  (emit '(LOAD Y1 X R3 R))
  (emit '(STORE X0 X R5 R+1))		;store current input, increment buffer pointer
  (emil '(JMP ok))
  (emit '(y-mem LOCAL))
  (emit '(LOAD X1 Y R6 R))
  (emit '(LOAD Y1 Y R3 R))
  (emit '(STORE X0 Y R5 R+1))		;same, but on Y side 
  (emit '(ok LOCAL))
					;now update delay structure data
  (emit '(CLR B)    '(LOAD X0 Y R4 R))	;X1=end of buffer
  (emit '(COPY R5 B))			;B=incremented pointer
  (emit '(CMP X0 B) '(LOAD X0 X R4 R+1)) ;past end of buffer?
  (emil '(JLE store-ptr))		;set in pass-dly (code56.lisp) as actual buffer last address
  (emit '(STORE X0 X R4 R))
  (emit '(JMP interp-1))
  (emit '(store-ptr LOCAL))
  (emit '(STORE R5 X R4 R))		;update current pointer

  (emit '(interp-1 LOCAL))
  (emit '(COPY N5 R5))			;restore pointer to pm et al

  (emit '(interp LOCAL))		; here X1=buf[high], Y1=buf[low], Y0=frac
  (emit '(COPY X1 B))
  (emit '(SUB Y1 B)   '(COPY Y1 A))
  (emit '(COPY B Y1))
  (emit '(MAC Y0 Y1 A) '(UPDATE R5 R-1))
  ;; here A is our result, R5 points at size
					;  (emit '(UPDATE R5 R-1))
  (emit '(LOAD B L R5 R+1))
  (emit '(LOAD Y0 1))
  (emit '(ADD Y0 B)     '(LOAD X0 X R5 R-1))
  (emit '(CMP X0 B))			;B-X0 => phase-size
  (emit '(JLT phase-ok))
  (emit '(CLR B))
  (emit '(phase-ok LOCAL))
  (emit '(STORE B L R5 R))
  (emit '(RTS))

  (emit '(ext-zdly-mem LOCAL))		; as of 23-May-92, handled as an external function

  (emit '(CLR B)  '(LOAD A L R4 R))
  (emit '(LOAD B0 3))
  (emit '(CMP A B)  '(COPY R4 A))
  (emit '(JSEQ .UG-nil-warning))

  (emit '(UPDATE R4 R+1))
  (emit '(BSET M-HF2 X-IO M-HCR))	; set HF2
  (emit '(BSET M-HF3 X-IO M-HCR))	; set HF3
  (put-datum %external-zdelay)		;see table next56.lisp
  ;; now send back delay id, input, pm-int, pm-frac, and get outval (fractional), return outval (A)
  ;; backpointer id is in Y:(home+2)
  (put-datum '(Y R4 R))			; id
  (put-datum 'X0)			; input
  (put-datum '(X R5 R))			; pm-int
  (put-datum '(Y R5 R))			; pm-frac
  (get-datum 'A)			; returned value
  (emit '(BCLR M-HF3 X-IO M-HCR))	; clear HF3
  (emit '(BCLR M-HF2 X-IO M-HCR))	; clear HF2
  (emit '(RTS)))

(libdecl .ztap 'ztap-load '(A B X Y R3 R6) '(.QP-initialize-DRAM .delay .UG-nil-warning))
(defun ztap-load ()			;assume R4 has (home+1) as in .delay,
  (emit '(.ztap))			;L:home=delay structure (see above), L:(home+3)=pm, L:(home+4)=phase, {N4=2}, L:(home+5)=size
					;delay unit is at 0 loc in struct for .R-array-clear if array of zdlys
  (emit '(JSET 1 Y R4 R ext-ztap-mem))	;Y:(home+1) has 2 on = external memory reference
  (emit '(LOAD A L R5 R+1))		;R5 assumed to point at pm
  (emit '(LOAD B L R5 R+1))
  ;; (emit '(ADD A B))			;B = phase
  (emit '(SUB A B))			;i.e. B-A=>B where A=pm, B=phase
  (emit '(tst-B))
  (emit '(TST B)    '(LOAD A L R5 R))	;A = size (as a real), see if phase is negative
  (emit '(JLT neg-phase))
  (emit '(cmp-B))
  (emit '(CMP B A)  '(COPY B1 Y1))	;B>=0 -- is it in range? (CMP B A = A-B -- very confusing)
  (emit '(JGT ok-low))			;i.e. A-B>0 -- so B is less than A
  (emit '(SUB A B))
  (emit '(JMP cmp-B))
  (emit '(neg-phase LOCAL))
  (emit '(ADD A B))
  (emit '(JMP tst-B))
  (emit '(HIDE tst-B cmp-B))
  (emit '(ok-low LOCAL))
  (emit '(LSL B))			;convert B0 to a true fraction
  (emit '(ASR B))
  (emit '(COPY B0 Y0))
  (emit '(CLR B))
  (emit '(LOAD B1 1))
  (emit '(ADD Y1 B))
  (emit '(CMP B A))			;check for wrap around (is low index = last buffer location?)
  (emit '(JGT high-ok))
  (emit '(CLR B))
  (emit '(high-ok LOCAL))		;B=high index, Y1=low, Y0=frac part, A=size, R5=>size, R4=>(home+1)

  (emit '(JCLR 2 Y R4 R-1 normal-zmem))
					; if QP DRAM...
  (emit '(LOAD A X R4 R+1))		;start of buffer
  (emit '(ADD A B))
  (emit '(ADD Y1 A))
  (QP-refresh-off)
  (QP-read-DRAM 'B 'X1)			;buf[high] => X1
  (QP-read-DRAM 'A 'Y1)			;buf[low] => Y1
  (QP-refresh-on)
  (emit '(JMP interp))

  (emit '(normal-zmem LOCAL))
  (emit '(LOAD A X R4 R+1))		;start of buffer
  (emit '(ADD A B)        '(COPY R5 N5))
  (emit '(LOAD R5 X R4 R))		;current pointer into buffer
  (emit '(ADD Y1 A)       '(COPY B R6))
  (emit '(COPY A R3))

  (emit '(JSET 0 Y R4 R-1 y-mem))	;assume loader put 1 in Y:(home+1) if Y side delay line
  (emit '(LOAD X1 X R6 R))
  (emit '(LOAD Y1 X R3 R))
  (emil '(JMP ok))
  (emit '(y-mem LOCAL))
  (emit '(LOAD X1 Y R6 R))
  (emit '(LOAD Y1 Y R3 R))
  (emit '(ok LOCAL))
  (emit '(COPY N5 R5))			;restore pointer to pm et al

  (emit '(interp LOCAL))		; here X1=buf[high], Y1=buf[low], Y0=frac
  (emit '(COPY X1 B))
  (emit '(SUB Y1 B)   '(COPY Y1 A))
  (emit '(COPY B Y1))
  (emit '(MAC Y0 Y1 A) '(UPDATE R5 R-1))
  ;; here A is our result, R5 points at size
  (emit '(RTS))

  (emit '(ext-ztap-mem LOCAL))		; as of 23-May-92, handled as an external function

  (emit '(CLR B)  '(LOAD A L R4 R))
  (emit '(LOAD B0 3))
  (emit '(CMP A B)  '(COPY R4 A))
  (emit '(JSEQ .UG-nil-warning))

  (emit '(UPDATE R4 R+1))
  (emit '(BSET M-HF2 X-IO M-HCR))	; set HF2
  (emit '(BSET M-HF3 X-IO M-HCR))	; set HF3
  (put-datum %external-ztap)		;see table next56.lisp
  ;; now send back delay id, input, pm-int, pm-frac, and get outval (fractional), return outval (A)
  ;; backpointer id is in Y:(home+2)
  (put-datum '(Y R4 R))			; id
;  (put-datum 'X0)			; input
  (put-datum '(X R5 R))			; pm-int
  (put-datum '(Y R5 R))			; pm-frac
  (get-datum 'A)			; returned value
  (emit '(BCLR M-HF3 X-IO M-HCR))	; clear HF3
  (emit '(BCLR M-HF2 X-IO M-HCR))	; clear HF2
  (emit '(RTS)))



(libdecl .all-comb 'all-comb-load '(A B X Y R5) '(.UG-nil-warning))
(defun all-comb-load ()			          ;A=input, N4=2 we hope
  (emit '(.all-comb))			          ;R4 has home+1, as in delay
  (emit '(JSET 1 Y R4 R ext-all-comb-mem))        ;associated delay line is in external memory
  (emit '(CLR B)            '(LOAD R5 X R4 R))	  ;R5=cur ptr in delay buf (i.e. X:home+1)

  #+QP (emit '(JCLR 2 Y R4 R normal-all-comb-mem))     ;QP DRAM case (type = 4)
  #+QP (progn
	 (emit '(UPDATE R4 R+N))
	 (QP-refresh-off)
	 (emit '(LOAD Y1 Y R4 R))                 ; Y1 <- G1
	 (QP-read-DRAM 'R5 'X1)
	 (emit '(MAC X1 Y1 A)      '(LOAD Y1 X R4 R-N))  ; A=IN+DRAM*G1, Y1=G0
	 (QP-write-DRAM 'R5 'A)
	 (QP-refresh-on)
	 (emit '(UPDATE R5 R+1))                  ; increment buffer pointer
	 (emil '(JMP ok-1)))

  (emit '(normal-all-comb-mem LOCAL))
  (emit '(JSET 0 Y R4 R+N y-mem))	          ;Y:home+1=1 if Y side, 0 if X
  (emit '(LOAD Y1 Y R4 R))		          ;X1=DM, Y1=G1 (Y:home+3)
  (emit '(LOAD X1 X R5 R))   
  (emit '(MACR X1 Y1 A)     '(LOAD Y1 X R4 R-N))  ;A=(IN+DM*G1), Y1=G0, R4->home+1
  (emit '(STORE A X R5 R+1) '(COPY A Y0))
  (emil '(JMP ok))
  (emit '(y-mem LOCAL))
  (emit '(LOAD Y1 Y R4 R))                        ;X1=DM, Y1=G1 (Y:home+3)
  (emit '(LOAD X1 Y R5 R))
  (emit '(MACR X1 Y1 A)     '(LOAD Y1 X R4 R-N))  ;A=(IN+DM*G1), Y1=G0
  (emit '(STORE A Y R5 R+1))

  (emit '(ok-1 LOCAL))
  (emit '(COPY A Y0))                             ;DM:=(IN+DM*G1), increment buffer pointer (R5)
  (emit '(ok LOCAL))
  (emit '(TRANSFER X1 A)    '(UPDATE R4 R-1))     ;R4->buffer limits
  (emit '(MAC Y0 Y1 A)      '(COPY R5 B))         ;A=DM+G0*(IN+DM*G1), B=current incremented buffer pointer
  (emit '(LOAD X1 Y R4 R))                        ;X1=end of buffer
  (emit '(CMP X1 B)         '(LOAD X1 X R4 R+1))  ;are we past end of buffer?  X1=start
  (emil '(JLE store-ptr))
  (emit '(STORE X1 X R4 R))
  (emit '(RTS))
  (emit '(store-ptr LOCAL))
  (emit '(STORE R5 X R4 R))
  (emit '(RTS))

  (emit '(ext-all-comb-mem LOCAL))                ;external memory reference (68030 handles pointers, we do multiplies)
  (emit '(STORE A L temp-loc))
  (emit '(CLR B)  '(LOAD A L R4 R))
  (emit '(LOAD B0 3))
  (emit '(CMP A B)  '(COPY R4 A))
  (emit '(JSEQ .UG-nil-warning))
  (emit '(LOAD A L temp-loc))

  (emit '(BSET M-HF3 X-IO M-HCR))
  (get-datum 'X1)                                 ;current value in delay line (DM)
  (emit '(BCLR M-HF3 X-IO M-HCR))
  (emit '(UPDATE R4 R+N))
  (emit '(LOAD Y1 Y R4 R))
  (emit '(MACR X1 Y1 A)      '(LOAD Y1 X R4 R))    ;A=IN+DM*G1, Y1=G0
  (put-datum 'A)
  (emit '(COPY A Y0))
  (emit '(TRANSFER X1 A))
  (emit '(MAC Y0 Y1 A))
  (emit '(RTS)))


(libdecl .clear-table 'clear-table-load '(A R4) nil)
(defun clear-table-load ()
  (emit '(.clear-table))		;R3->table header, X0=end index of clear
  (emit '(CLR A)           '(LOAD R4 X R3 R+1))
  (emit '(JCLR 0 Y R3 R-1 x-side))
  (emit   '(REP X0))
  (emit     '(STORE A Y R4 R+1))
  (emit   '(RTS))
  (emit   '(x-side LOCAL))
  (emit   '(REP X0))
  (emit     '(STORE A X R4 R+1))
  (emit   '(RTS)))


(libdecl .run-block 'run-block-load '(A B X Y R3 N3 R4 R5) '(.clear-table))
(defun run-block-load ()                ;R2->blk: L: loc  L: ctr   table-header (base   base-1)
					;                                       (size   x-or-y)
					;CTR is trigger value, LOC is current location in BUF
  (emit '(.run-block))
  (emit '(CLR B)        '(LOAD A X R2 R+1))            ;A<-LOC (as an integer -- frac part in Y)
  (emit '(STORE B L temp-loc))  
  (emit '(UPDATE R2 R+1)) 
  (emit '(LOAD X0 Y R2 R))		;X0<-SIZE
  (emit '(CMP X0 A)     '(COPY A N3))	;beyond end of table?
  (emit '(JGE check-CTR))		;yes -- 0 already in temp-loc (i.e. A >= X0 = index >= size)
  (emit '(LOAD R3 X R2 R+1))		;rblk-b[0]
  (emit '(JCLR 0 Y R2 R-1 x-side))
  (emit   '(LOAD A Y R3 RN))		;rblk-b[loc]
  (emit   '(JMP got-value))
  (emit   '(x-side LOCAL))
  (emit   '(LOAD A X R3 RN))
  (emit '(got-value LOCAL))
  (emit '(STORE A Y temp-loc))		;now we have return value (assumed fractional)
  (emit '(check-CTR LOCAL))
  (emit '(COPY R2 R3))			;table-header-word-0
  (emit '(LOAD A L R2 1-R))		;A<-CTR (assumed real)
  (emit '(LOAD B L R2 1-R))		;B<-LOC actually an integer, passes as real
  (emit '(LOAD Y0 1))
  (emit '(ADD Y0 B))			;loc<-loc+1
  (emit '(STORE B X R2 R))
  (emit '(CMP B A))			;if LOC >= CTR then go prepare the table for the next contents (LOC=0 is the trigger)
  (emit '(JGT all-done))		;we're in bounds, loc has been incremented, temp-loc has current value
  (emit '(CMP X0 B)     '(COPY X0 X1))
  (emit '(JGE easy-case))		;if loc>=size just clear entire table (otherwise shift current contents)
  (emit '(COPY B X0))			;X0<-loc
  (emit '(JSR .clear-table))		;R3->table header, X0=index of end of clear (REP X0 buf[i]<-0)
  (emit '(LOAD R4 X R3 R+1))		;rblk-b[0]
  (emit '(SUB X1 B)     '(COPY B N5))	;loc-siz, N5=loc
  (emit '(ABS B)        '(COPY R4 R5))	;R5<-rblk-b[0] (ABS because previous SUB was backwards)
  (emit '(CLR A))               
  (emit '(UPDATE R5 R+N))		;R5<-rblk-b[loc]
  (emit '(JCLR 0 Y R3 R-1 x-side-shift))
  (emit   '(DO B shift-y))
  (emit     '(LOAD X0 Y R5 R))		;b[i]<-b[i+loc], b[i+loc]<-0
  (emit     '(STORE X0 Y R4 R+1))
  (emit     '(STORE A Y R5 R+1))
  (emit     '(NOP))
  (emit   '(shift-y LOCAL))
  (emit   '(NOP))
  (emit   '(JMP decf-ctr))
  (emit '(x-side-shift LOCAL))
  (emit   '(DO B shift-x))
  (emit     '(LOAD X0 X R5 R))
  (emit     '(STORE X0 X R4 R+1))
  (emit     '(STORE A X R5 R+1))
  (emit     '(NOP))
  (emit   '(shift-x LOCAL))
  (emit   '(NOP))
  (emit '(JMP decf-ctr))
  (emit '(easy-case LOCAL))
  (emit '(JSR .clear-table))
  (emit '(decf-ctr LOCAL))
  (emit '(LOAD B L R2 R+1))		;B<-LOC
  (emit '(LOAD A L R2 R))		;A<-CTR
  (emit '(SUB B A))			;CTR<-CTR - LOC
  (emit '(CLR B)     '(STORE A L R2 R-1))
  (emit '(STORE B L R2 R))		;LOC<-0
  (emit '(all-done LOCAL))
  (emit '(LOAD A Y temp-loc))
  (emit '(RTS)))


(libdecl .wave-train-init 'wave-train-init nil nil)
(defun wave-train-init ()
  (emit `(.wave-train-init))
  (emit `(DEFINE wave-index-loc ,(get-L-mem)))
  (emit `(DEFINE current-index-loc ,(get-L-mem))))

(libdecl .wave-train 'wave-train-load '(A B X Y R3 R4 R5) '(.wave-train-init .table-interp .clear-table 
							    .basic-divide .run-block .shift-AB-up .real-mpy))
(defun wave-train-load ()		;R2->wt block (i.e. blk struct within wt struct)
  (emit '(.wave-train))			;wt: blk [4], wave-data [2 -- a table], phase freq fm old-freq old-ctr internal-mag sampling-rate
  (emit '(LOAD A X R2 R))		;A<-loc (rblk-loc (wt-b w)) 
  (emit '(TST A)        '(LOAD N2 4))	;is it zero (signal to refill the block)
  (emit '(JNE just-data))
  (emit '(LUA R3 R2 R+N))		;point to wave-data (wt-wave) (R3 needed by table-interp, index in A)
  (emit '(LOAD N2 6))
  (emit '(STORE R3 X wave-index-loc))	;save pointer to table base (might be changed by table-interp)
  (emit '(LOAD A L R2 RN))		;phase, as a real
  (emit '(TST A)        '(LOAD N2 2))	;is phase 0? (easy case)
  (emit '(JNE need-interp))
  (emit '(LOAD R4 X R2 RN))		;R4->blk table base (R2->blk struct, R3->wt struct)
  (emit '(LOAD N2 3))
  (emit '(LOAD R5 X R3 R))		;R5->wt wave data table base
  (emit '(LOAD X0 Y R3 R+1))		;R3->wt-wave[2], X0=size
  (emit '(UPDATE R2 R+N))		;R2->wt-b[3] = rblk-tab[2]
  (emit '(DO X0 add-in-another-wave))	;tables can be in either X or Y memory...
  (emit     '(JCLR 0 Y R2 R rblk-x))
  (emit     '(LOAD A Y R4 R))
  (emit     '(JMP get-wt))
  (emit     '(rblk-x LOCAL))
  (emit     '(LOAD A X R4 R))
  (emit     '(get-wt LOCAL))
  (emit     '(JCLR 0 Y R3 R wt-x))
  (emit     '(LOAD B Y R5 R+1))
  (emit     '(JMP got-both))
  (emit     '(wt-x LOCAL))
  (emit     '(LOAD B X R5 R+1))
  (emit     '(got-both LOCAL))
  (emit     '(ADD B A))
  (emit     '(JCLR 0 Y R2 R add-rblk-x))
  (emit     '(STORE A Y R4 R+1))
  (emit     '(JMP go-on))
  (emit     '(add-rblk-x LOCAL))
  (emit     '(STORE A X R4 R+1))
  (emit     '(go-on LOCAL))
  (emit     '(NOP))
  (emit     '(NOP))
  (emit '(add-in-another-wave LOCAL))
  (emit '(NOP))
  (emit '(JMP update-ctr))
  (emit '(need-interp LOCAL))
  ;; here we have to call table-interp to get the wt data, then add it into blk 
  ;; A = initial phase (and will be index to table interp), R3->1st word of wt-tab, R2->1st wd of blk (i.e of wt), N2=2
  ;; R3 lives at X wave-index-loc (needs to be restored after table-interp, to be safe)
  ;; current index lives at L current-index-loc, R2 at Y wave-index-loc
  (emit '(LOAD R4 X R2 RN))		;R4->base of rblk-tab (N2=2,R2->wt-b)
  (emit '(LOAD N2 3))
  (emit '(LOAD X0 Y R3 R))		;X0=size, R3->wt-wave[1]
  (emit '(UPDATE R2 R+N))		;R2->blk[2]
  (emit '(NOP))
  (emit '(STORE R2 Y wave-index-loc))
  (emit '(DO X0 interp-and-incf))
  (emit     '(STORE A L current-index-loc))
  (emit     '(JSR .table-interp))
  (emit     '(JCLR 0 Y R2 R x-interp))
  (emit     '(LOAD B Y R4 R))
  (emit     '(JMP interp-ok))
  (emit     '(x-interp LOCAL))
  (emit     '(LOAD B X R4 R))
  (emit     '(interp-ok))
  (emit     '(ADD B A))
  (emit     '(JCLR 0 Y R2 R interp-rblk-x))
  (emit     '(STORE A Y R4 R+1))
  (emit     '(JMP interp-go-on))
  (emit     '(interp-rblk-x LOCAL))
  (emit     '(STORE A X R4 R+1))
  (emit     '(interp-go-on LOCAL))
  (emit     '(CLR B)     '(LOAD A L current-index-loc))
  (emit     '(LOAD R2 Y wave-index-loc))
  (emit     '(LOAD B 1))
  (emit     '(ADD B A)    '(LOAD R3 X wave-index-loc))
  (emit '(interp-and-incf LOCAL))
  (emit '(UPDATE R3 R+1))		;R3->wave[2]
  (emit '(update-ctr))			;here R2->4th word of blk struct, R3->2nd word of wt struct
  (emit '(LOAD N2 2))
  (emit '(LOAD N3 3))
  (emit '(UPDATE R2 R-N))		;R2->ctr -- eventually needs to point to first word (pipeline delay here...)
  (emit '(LOAD A L R3 RN))		;A<-fm
  (emit '(TST A)     '(LOAD N3 4))
  (emit '(JNE real-hard-case))
  (emit '(LOAD A L R3 RN))		;A<-old freq
  (emit '(LUA R4 R3 R+N))		;R4->old freq
  (emit '(LOAD N3 2))
  (emit '(UPDATE R4 R+1))		;R4->old ctr
  (emit '(LOAD B L R3 RN))		;B<-freq
  (emit '(CMP B A)    '(LOAD B L R4 R))	;freq=old freq?  B<-old ctr
  (emit '(CLR A))			;i.e. fm*mag=0 (from earlier)
  (emit '(JNE hard-case))
  (emit '(LOAD A L R2 R))		;A<-ctr
  (emit '(JMP got-data))
  (emit '(real-hard-case LOCAL))
  (emit '(LOAD N3 6))			;A=fm here
  (emit '(NOP))
  (emit '(LOAD B L R3 RN))		;B<-mag
  (emit '(JSR .real-mpy))		;(* fm internal-mag)
  (emit '(hard-case LOCAL))
  ;; here we have to implement (incf (rblk-ctr (wt-b w)) (/ sampling-rate (+ (wt-freq w) (* fm (wt-internal-mag w)))))
  ;; R2->rblk-ctr, R3->wt-tab-2nd word, A = (fm * internal-mag)
  (emit '(LOAD N3 2))
  (emit '(NOP))
  (emit '(LOAD B L R3 RN))		;B<-freq
  (emit '(LOAD N3 7))
  (emit '(ADD A B))			;B = current freq with fm, if any (the divisor)
  (emit '(LOAD A L R3 RN))		;A = sampling-rate
  (emit '(JSR .shift-AB-up))
  (emit '(JSR .basic-divide))		;uses R5
  (emit '(LOAD B L R2 R))		;B<-ctr
  (emit '(got-data LOCAL))	
  (emit '(ADD B A))
  (emit '(STORE A L R2 R-1))
  (emit '(just-data LOCAL))
  (emit '(JSR .run-block))
  (emit '(RTS)))



;;; Filters
#|
;;; First code is an example taken from Motorola "Digital Filters on the DSP56000"
;;; assumes R3->current "states" (i.e. zn)
;;;         M3=states-1 (modulo addressing)
;;;         R6->coeffs (IIR then FIR) 
;;;         R4->current buf loc in states (R3<-R4 before start)
;;;         B1=filter order - 1

(defun direct-form-canonic-filter-load ()
  (emit '(.direct-form-canonic-filter))
  ;; first IIR section
  (emit                  '(LOAD X0 X R3 R+1)   '(LOAD Y0 Y R6 R+1))
  (emit '(REP B1))
  (emit '(MAC X0 Y0 A)   '(LOAD X0 X R3 R+1)   '(LOAD Y0 Y R6 R+1))
  (emit '(MACR X0 Y0 A))
  (emit                  '(STORE A X R3 R+1))
  ;; now FIR section
  (emit                  '(LOAD X0 X R3 R+1)   '(LOAD Y0 Y R6 R+1))
  (emit '(REP B1))
  (emit '(MAC X0 Y0 A)   '(LOAD X0 X R3 R+1)   '(LOAD Y0 Y R6 R+1))
  (emit '(MACR X0 Y0 A))		;SMACR??
  (emit                  '(STORE R3 X R4 R))
  (emit '(RTS)))

|#
;;; now our version of the same thing (used for all IIR/FIR filters)
;;; all three types assume all coefficients (and SO in ladder) are fractional -- not right!

(libdecl .direct-filter 'direct-filter-load '(A B X Y R0 R3 R4 R5 R6) nil)
(defun direct-filter-load ()		;Y:p Y:a X:d+1
  (emit '(.direct-filter))		;R2->flt struct: type   (end c)  (start d)  so
					;                order  (end p)  (end a)  (end d) (d has extra end word of 0)
  (emit '(LOAD N2 Y R2 R+1))		;N2=m
  (emit '(LOAD R3 Y R2 R+1))		;R3->end of p (steps backwards)
  (emit '(CLR B)   '(LOAD R6 X R2 R))   ;R6->start of d (B=xout=0.0)
  (emit '(LOAD R0 Y R2 R+1))		;R0->end of a
  (emit '(LOAD R5 Y R2 R))		;R5->end of d
  (emit '(STORE A X R6 R))		;d[0]=inp
  (emit '(LUA R4 R5 R+1))		;R4->dummy word 1 past end of d
  (emit '(LOAD X0 X R5 R-1))
  (emit '(DO N2 direct-loop))
  (emit                      '(STORE X0 X R4 R-1) '(LOAD Y0 Y R3 R-1))  ; X0=d[j], Y0=p[j]
  (emit     '(MAC X0 Y0 B)   '(LOAD Y0 Y R0 R-1)  '(LOAD A X R6 R))     ; Y0=a[j], A=d[0]
  (emit     '(SMAC X0 Y0 A))
  (emit                      '(STORE A X R6 R)     '(COPY A Y1))        ; d[0]=A
  (emit                      '(LOAD X0 X R5 R-1))
  (emit    '(direct-loop LOCAL))
  (emit '(TFR B A)    '(STORE X0 X R4 R)   '(LOAD Y0 Y R3 R))
  (emit '(MAC Y0 Y1 A))
  (emit '(RTS)))

(libdecl .lattice-filter 'lattice-filter-load '(A B X Y R0 R3 R4 R5 R6) nil)
(defun lattice-filter-load ()
  (emit '(.lattice-filter))		;R2->flt struct as above
  (emit '(LOAD N2 Y R2 R+1))		;N2=m
  (emit '(LOAD R3 Y R2 R+1))		;R3->end of p (steps backwards)
  (emit '(LOAD R6 X R2 R))		;R6->start of d
  (emit '(CLR B)   '(LOAD R0 Y R2 R+1))	;R0->end of a
  (emit '(LOAD R5 Y R2 R))		;R5->end of d
  (emit '(LOAD N5 2))
  (emit '(UPDATE R0 R-1))
  (emit '(UPDATE R5 R-1))
  (emit '(DO N2 lattice-loop))
  (emit                            '(LOAD Y0 Y R0 R)      '(LOAD X1 X R5 R))
  (emit     '(SMAC Y0 X1 A)        '(LOAD X1 Y R0 R-1))
  (emit                            '(LOAD A X R5 R+1)     '(COPY A Y0))
  (emit     '(MAC X1 Y0 A)         '(LOAD X0 Y R3 R-1))
  (emit                            '(STORE A X R5 R-N)    '(COPY A Y1))
  (emit     '(MAC X0 Y1 B)         '(COPY Y0 A))
  (emit     '(lattice-loop LOCAL))
  (emit '(TFR B A)    '(STORE A X R6 R)    '(LOAD Y1 Y R3 R))
  (emit '(MAC Y0 Y1 A))
  (emit '(RTS)))

(libdecl .ladder-filter 'ladder-filter-load '(A B X Y R0 R3 R4 R5 R6 R7) nil)
(defun ladder-filter-load ()
  (emit '(.ladder-filter))		;R2->flt struct as above
  (emit '(LOAD N2 Y R2 R+1))		;N2=m
  (emit '(LOAD R7 X R2 R))		;R7->end of c
  (emit '(LOAD R3 Y R2 R+1))		;R3->end of p (steps backwards)
  (emit '(LOAD R6 X R2 R))		;R6->start of d
  (emit '(CLR B)   '(LOAD R0 Y R2 R+1))	;R0->end of a
  (emit '(LOAD R5 Y R2 R))		;R5->end of d
  (emit '(LOAD N5 2))
  (emit '(UPDATE R0 R-1))
  (emit '(UPDATE R5 R-1))
  (emit '(LOAD X0 X R7 1-R)  '(COPY A Y1))                  ; X0=c[i], Y1=inp
  (emit '(DO N2 ladder-loop))
  (emit      '(LOAD X1 X R5 R+1)  '(LOAD Y0 Y R0 R-1))      ; X1=d[i], Y0=a[i] 
  (emit      '(MPY X0 X1 A))
  (emit      '(MAC Y0 Y1 A))
  (emit      '(MPY X0 Y1 A)       '(STORE A X R5 R))
  (emit      '(SMAC X1 Y0 A)      '(LOAD Y1 Y R3 R-1))
  (emit                           '(LOAD X0 X R5 R-N))
  (emit      '(MAC X0 Y1 B)       '(LOAD X0 X R7 1-R)    '(COPY A Y1))             ; X0=c[i], Y1=inp
  (emit      '(ladder-loop LOCAL))
  (emit '(STORE A X R6 R)  '(LOAD Y0 Y R3 R))
  (emit '(MAC Y0 Y1 B))
  (emit '(COPY B Y0)  '(LOAD X0 X R2 R))
  (emit '(MPY X0 Y0 A))
  (emit '(RTS)))

(libdecl .filter 'filter-load '(A B X Y R0 R3 R4 R5 R6 R7) '(.direct-filter .lattice-filter .ladder-filter))
(defun filter-load ()			;A=input
  (emit '(.filter))			;R2->flt struct: type:
  (emit '(LOAD B X R2 R))		;get type
  (emit '(TST B)        '(LOAD X0 1))
  (emit '(JEQ .direct-filter))		;  0 = direct form
  (emit '(CMP X0 B)     '(LOAD X0 2))
  (emit '(JEQ .lattice-filter))		;  1 = lattice form
  (emit '(CMP X0 B))
  (emit '(JEQ .ladder-filter))		;  2 = ladder form
  (emit '(JSR .break)))			;  t = error


;;; LOG (natural log) 
;;; Abramowitz and Stegun 4.1.44

(libdecl .log-init 'log-init nil nil)
(defun log-init ()
  (let ((x-side nil) (y-side nil))
    (emit '(.log-init))
    (emit (list 'DEFINE 'log-table-base (get-L-mem 25)))
    (loop for i from 24 downto 0 do
      (multiple-value-bind
	  (int frac) (make-real (log (expt 2 i)))
	(push int x-side)
	(push frac y-side)))
    (emit '(X-ORG log-table-base))
    (emit `(X-DATA ,@x-side))
    (emit '(Y-ORG log-table-base))
    (emit `(Y-DATA ,@y-side))))

(libdecl .log 'log-load '(A B X R4 N4) '(.log-init))
(defun log-load ()
   (emit '(.log))
   (emit '(TST A)       '(LOAD B 1))	;not SHORT here -- SHORT puts the 1 in B1 MSB
   (emil '(JLE log-done))
   (emit '(CMP B A)     '(LOAD R4 0))
   (emil '(JEQ log-one))
   (emil '(JLT neg-log))
   (emit '(LOAD B 2))
   (emit '(STORE R4 Y temp-loc))	;i.e. temp-loc <= 0 for positive log
   (emit '(cmp-p))
   (emit '(CMP B A))
   (emil '(JLT frac-part))
   (emit '(ASR A)       '(UPDATE R4 R+1))
   (emil '(JMP cmp-p))
   (emit '(neg-log))
   (emit '(ASL A)       '(UPDATE R4 R+1))
   (emit '(CMP B A)     '(STORE B1 Y temp-loc))
   (emil '(JLT neg-log))
   (emit '(frac-part LOCAL))
   (emit '(LSL A)       '(LOAD X1 -.00645))
   (emit '(ASR A)       '(LOAD B .036088))
					;now the polynomial approximation -- can't interleave here
					;because the two halves overflow -1.0 to 1.0 (ruining fractional multiplies)
   (emit                '(COPY A0 X0))
   (emit '(MAC X1 X0 B))
   (emit                '(COPY B X1)    '(LOAD A -.095329))
   (emit '(MAC X1 X0 A))
   (emit                '(COPY A X1)    '(LOAD A .16765))
   (emit '(MAC X1 X0 A)) 
   (emit                '(COPY A X1)    '(LOAD A -.240733))
   (emit '(MAC X1 X0 A))
   (emit                '(COPY A X1)    '(LOAD A .331799))
   (emit '(MAC X1 X0 A))
   (emit                '(COPY A X1)    '(LOAD A -.49987))
   (emit '(MAC X1 X0 A)) 
   (emit                '(COPY A X1)    '(LOAD A fractional-one))
   (emit '(MAC X1 X0 A) '(LOAD N4 log-table-base)) 
   (emit '(CLR B)       '(COPY A X1))
   (emit '(MPY X1 X0 A))
   (emit                '(COPY A1 B0))
   (emit '(ASL B)       '(LOAD A L R4 RN))
   (emit '(JCLR 0 Y temp-loc add-log-time))
   (emit '(NEG A))
   (emit '(add-log-time LOCAL))
   (emit '(ADD B A))
   (emit '(log-done LOCAL))
   (emit '(RTS))
   (emit '(log-one LOCAL))
   (emit '(CLR A))
   (emit '(RTS))
   (emit '(HIDE neg-log cmp-p)))


(libdecl .expt-init 'expt-init nil nil)
(defun expt-init ()
  (let ((x-side nil) (y-side nil))
    (emit '(.expt-init))
    (emit (list 'DEFINE 'orig-A (get-L-mem)))
    (emit (list 'DEFINE 'orig-B (get-L-mem)))
    (emit (list 'DEFINE 'expt-table-base (get-L-mem 24)))
    (loop for i from 23 downto 0 do
      (multiple-value-bind
	  (int frac) (make-real (exp i))
	(push int x-side)
	(push frac y-side)))
    (emit '(X-ORG expt-table-base))
    (emit `(X-DATA ,@x-side))
    (emit '(Y-ORG expt-table-base))
    (emit `(Y-DATA ,@y-side))))

(libdecl .expt 'expt-load '(A B X Y R4 R5) '(.expt-init .log .real-mpy .shift-AB-up .basic-divide))
(defun expt-load ()			;(A = base   B = power) (.basic-divide uses R5)
  (emit '(.expt))
  (emit '(TST B)         '(STORE A L orig-A)) ;anything to 0th power (including 0) = 1
  (emit '(JNE not-1))
  (emit '(CLR A))
  (emit '(LOAD A1 1))			;load A with 1.0
  (emit '(RTS))
  (emit '(not-1 LOCAL))
  (emit '(TST A)         '(STORE B L temp-loc-1))
  (emit '(JEQ expt-all-done))		;0^n if n /= 0 => 0
  (emit '(ABS A)         '(STORE B L orig-B)) ;if A < 0, put off sign decision until later (if B integer, this is not complex)
  (emit '(JSR .log))			;(uses temp-loc)

  (emit '(.constant-base-expt))
  ;; this entry point is for (common) case where we know the base is constant and can preload
  ;; A with (log (abs base)), power (unmodified) in temp-loc-1, base (unmodified) in orig-A

  (emit '(STORE A L temp-loc-2))	;L 3 (temp-loc-1 is L 2)
  (emit '(ABS A)         '(LOAD B L temp-loc-1))
  (emit '(ABS B))			;will pick up sign later
  (emit '(JSR .real-mpy))		;we now have b*log a (in a ^ b which is (e^(b log a)) (uses temp-loc)
  (emit '(.inner-exp))
  ;; if A>23 then we'll overflow our table, so return 0
  (emit '(LOAD B 23))
  (emit '(CMP A B))
  (emit '(JGE no-overflow))
  (emit '(CLR A))
  (emit '(RTS))
  (emit '(no-overflow LOCAL))
  (emit '(LSL A)         '(COPY A1 R4))
  (emit '(ASR A)         '(LOAD N4 expt-table-base))
  (emit '(COPY A0 X0))
  (emit                                 `(LOAD Y1 ,(/ .001388 4.0)))	
					;these coeff/4 are to keep numbers fractional for mpy benefit
  (emit '(MPY X0 X0 A)                  `(LOAD Y0 ,(/ .0000198 4.0)))
  (emit                  '(COPY A X1)   `(LOAD B ,(/ .008333 4.0)))
  (emit '(MAC X1 Y0 B)                  `(LOAD A ,(/ .04166667 4.0)))
  (emit '(MAC X1 Y1 A)   '(COPY B Y0)   `(LOAD B ,(/ .1666667 4.0)))
  (emit '(MAC X1 Y0 B)   '(COPY A Y1)   `(LOAD A ,(/ .5 4.0)))
  (emit '(MAC X1 Y1 A)   '(COPY B Y0)   `(LOAD B ,(floor fractional-one 4)))
  (emit '(MAC X1 Y0 B)   '(COPY A Y1))
  (emit '(MPY X1 Y1 A)   '(COPY B Y0)   `(LOAD B ,(floor fractional-one 4)))
  (emit '(MAC X0 Y0 B))
  (emit '(ADD B A))
  (emit '(COPY A1 A0))			;A was fraction--make it a real (has to be positive here)
  (emit '(LOAD A1 0 SHORT))
  (emit '(ASL A))			;one for frac-real conversion, 2 for /4 above
  (emit '(ASL A))
  (emit '(ASL A)         '(LOAD B L R4 RN))
  (emit '(JSR .real-mpy))		;e^(int+frac) = e^int * e^frac
  (emit '(JCLR 23 X temp-loc-2 log-pos))
  (emit '(JCLR 23 X temp-loc-1 invert))	;i.e. A<0 and B>=0
  (emil '(JMP expt-done))		;i.e. A<0 and B<0
  (emit '(log-pos LOCAL))
  (emit '(JCLR 23 X temp-loc-1 expt-done)) ;i.e. A>0 and B>=0
  (emit '(invert LOCAL))
  (emit '(TFR A B)       '(LOAD A 1))	;1/A -- brute force negative power calc
  (emit '(JSR .shift-AB-up))
  (emit '(JSR .basic-divide))		;clobbers temp-loc and temp-loc-1
  (emit '(expt-done LOCAL))
  (emit '(LOAD B L orig-A))
  (emit '(TST B)          '(LOAD X L orig-B))
  (emit '(JGE expt-all-done))
  (emit '(STORE X L temp-loc))
  (emit '(JCLR 0 X temp-loc expt-all-done)) ; this is a kludge -- assume for now that user has used integer power
  (emit '(NEG A))
  (emit '(expt-all-done LOCAL))
  (emit '(RTS)))

(libdecl .exp 'exp-load '(A B X Y R4) '(.expt))
(defun exp-load ()
  (emit '(.exp))
  (emit '(STORE A L temp-loc-1))
  (emit '(ABS A)   '(LOAD B 1))
  (emit '(STORE B L temp-loc-2))	;(log e)
  (emit '(STORE B L orig-A))
  (emit '(JMP .inner-exp)))


(libdecl .basic-tan 'basic-tan-load '(A B X Y) nil)
(defun basic-tan-load ()		;argument is in X0 and is a positive fraction (Abramowitz and Stegun 4.3.101)
  (emit '(.basic-tan))
  (emit '(MPY X0 X0 A)   '(STORE X0 X temp-loc))
  (emit                  '(COPY A Y0)  `(LOAD B  ,(/ .024565 2.0)))   ;Y0=x^2
  (emit '(MPY Y0 Y0 A)                 `(LOAD X1 ,(/ .0095168 2.0)))
  (emit                  '(STORE Y0 X temp-loc-1))
  (emit                  '(COPY A X0)  `(LOAD Y1 ,(/ .00290052 2.0))) ;X0=x^4
  (emit '(MAC X0 X1 B)                 `(LOAD A  ,(/ .0533740 2.0)))
  (emit '(MAC X0 Y1 A)   '(COPY B Y0)  `(LOAD B  ,(/ .1333923 2.0)))
  (emit '(MAC X0 Y0 B)   '(COPY A Y1)  `(LOAD A  ,(/ .3333314 2.0)))
  (emit '(MAC X0 Y1 A)   '(COPY B Y0)  `(LOAD B  ,(floor fractional-one 2)))
  (emit '(MAC X0 Y0 B)   '(COPY A Y1)  '(LOAD X0 X temp-loc))
  (emit '(MPY X0 Y1 A)   '(COPY B Y0))
  (emit '(MPY X0 Y0 B)   '(COPY A Y1)  '(LOAD X0 X temp-loc-1))
  (emit '(MPY X0 Y1 A))
  (emit '(ADD B A))
  (emit '(COPY A1 A0))			;A was fraction--make it a real (has to be positive here)
  (emit '(LOAD A1 0 SHORT))
  (emit '(ASL A))			;one for frac-real conversion, 1 for /2 above
  (emit '(ASL A))
  (emit '(RTS)))

(libdecl .atan-init 'atan-init nil nil)
(defun atan-init ()			;put 1.0 at loc
  (emit '(.atan-init))
  (emit (list 'DEFINE 'real-one (get-L-mem)))
  (emit '(X-ORG real-one))
  (emit '(X-DATA 1))
  (emit '(Y-ORG real-one))
  (emit '(Y-DATA 0))
  (emit `(DEFINE atan-temp ,(get-L-mem))))

(libdecl .tan-init 'tan-init nil nil)
(defun tan-init ()
  (emit '(.tan-init))
  (emit `(DEFINE tan-temp ,(get-L-mem)))
  (emit `(DEFINE tan-temp-1 ,(get-L-mem)))
  (emit `(DEFINE tan-temp-2 ,(get-L-mem))))

(libdecl .tan 'tan-load '(A B X Y R5) '(.tan-init .basic-tan .atan-init .sw-init .cosine-init .real-mod .trig-sine .trig-cosine .basic-divide))
(defun tan-load ()		;argument is in A and is a real
  (emit '(.tan))
  (emit '(TST A)     '(STORE A L tan-temp))
  (emit '(JNE not-zero))
  (emit '(RTS))
  (emit '(not-zero LOCAL))
  (emit '(ABS A)     '(LOAD B L real-one))
  (emit '(CMP B A)   '(LOAD R5 tan-temp))
  (emit '(JGE quad-reduce))
  (emit '(tan-ok))
  (emit '(ASR A))
  (emit '(COPY A0 X0))
  (emit '(JSR .basic-tan))
  (emit '(JCLR 23 X R5 R not-neg))
  (emit '(NEG A))
  (emit '(not-neg LOCAL))
  (emit '(RTS))
  (emit '(quad-reduce LOCAL))
  (emit '(LOAD B L one-pi))
  (emit '(JSR .real-mod))
  (emit '(LOAD B L real-one))
  (emit '(CMP B A))
  (emit '(JLT tan-ok))
  (emit '(STORE A L tan-temp-2))
  (emit '(JSR .trig-cosine))
  (emit '(STORE A X tan-temp-1))
  (emit '(LOAD A L tan-temp-2))
  (emit '(JSR .trig-sine))
  (emit '(LOAD B X tan-temp-1))
  (emit '(JSR .basic-divide))
  (emit '(LOAD B L tan-temp))
  (emit '(TST B))
  (emit '(JGE A-not-neg))
  (emit '(NEG A))
  (emit '(A-not-neg LOCAL))
  (emit '(RTS)))

(libdecl .basic-atan 'basic-atan-load '(A B X Y) nil)
#|
(defun basic-atan-load ()			;4.4.47
  (emit '(.basic-atan))			;argument in X0
  (emit '(MPY X0 X0 B)   '(LOAD A -.085133))
  (emit                  '(COPY B X1)   '(LOAD Y0 .0208351))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   '(LOAD A .180141))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   '(LOAD A -.3302995))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   '(LOAD A .999866))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0))
  (emit '(MPY X0 Y0 A))
  (emit '(RTS)))
|#
;;; that version of atan not accurate enough
(defun basic-atan-load ()			;4.4.49
  (emit '(.basic-atan))			;argument in X0
  (emit '(MPY X0 X0 B)   `(LOAD A ,(* .5 -.0161657367)))
  (emit                  '(COPY B X1)   `(LOAD Y0 ,(* .5 .0028662257)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A ,(* .5 .0429096138)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A ,(* .5 -.07528964)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A ,(* .5 .1065626393)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A ,(* .5 -.1420889944)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A ,(* .5 .1999355085)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A ,(* .5 -.3333314528)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A ,(* .5 1.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0))
  (emit '(MPY X0 Y0 A))
  (emit '(ASL A))
  (emit '(RTS)))


;;; atan x = pi/2 - atan (1/x)

(libdecl .atan 'atan-load '(A B X Y R5) '(.basic-divide .basic-atan .cosine-init .shift-AB-up .atan-init))
(defun atan-load ()			;arg in A, result in A, both real
  (emit '(.atan))
  (emit '(TST A)     '(STORE A L atan-temp))
  (emit '(JNE not-easy-case))
  (emit '(RTS))				;(atan 0) => 0
  (emit '(not-easy-case LOCAL))
  (emit '(ABS A)     '(LOAD B L real-one))
  (emit '(CMP B A))
  (emit '(JGE arrcot))			;arg<1.0 => goto basic-atan
  (emit '(atan-ok))
  (emit '(ASR A)     '(LOAD R5 atan-temp)) ;convert arg to fraction
  (emit '(COPY A0 X0))
  (emit '(JSR .basic-atan))
  (emit '(LOAD X0 1))			;and convert result to real
  (emit '(COPY A1 X1))
  (emit '(MPY X0 X1 A))
  (emit '(JCLR 23 X R5 R not-neg))
  (emit '(NEG A))
  (emit '(not-neg LOCAL))
  (emit '(RTS))
  (emit '(arrcot LOCAL))
  (emit '(TFR A B))
  (emit '(LOAD A1 1))
  (emit '(LOAD A0 0))
  (emit '(JSR .shift-AB-up))
  (emit '(JSR .basic-divide))
  (emit '(JSR atan-ok))
  (emit '(TST A)       '(LOAD B L half-pi))
  (emit '(JMI add-half-pi))
  (emit '(SUB A B))
  (emit '(TFR B A))
  (emit '(RTS))
  (emit '(add-half-pi LOCAL))
  (emit '(ADD B A))
  (emit '(NEG A))
  (emit '(RTS))
  (emit '(HIDE atan-ok)))

(libdecl .atan2-init 'atan2-init nil nil)
(defun atan2-init ()
  (emit '(.atan2-init))
  (emit `(DEFINE atan2-x ,(get-L-mem)))
  (emit `(DEFINE atan2-y ,(get-L-mem))))

(libdecl .atan2 'atan2-load '(A B X Y R5) '(.atan2-init .atan .basic-divide .shift-AB-up .sw-init))
(defun atan2-load ()			;y in A, x in B, result in A, both real
  (emit '(.atan2))
  (emit '(STORE A L atan2-y))
  (emit '(STORE B L atan2-x))
  (emit '(JSR .shift-AB-up))
  (emit '(JSR .basic-divide))
  (emit '(JSR .atan))
  ;; if both pos, return A or if yneg xpos
  ;; if ypos or zero and x neg, add pi
  ;; if yneg, xneg subtract pi
  (emit '(LOAD B L atan2-x))
  (emit '(TST B))
  (emit '(JGE done))
  (emit '(LOAD B L atan2-y))
  (emit '(TST B))
  (emit '(LOAD B L one-pi))
  (emit '(JGE add-pi))
  (emit '(SUB B A))
  (emit '(JMP done))
  (emit '(add-pi LOCAL))
  (emit '(ADD B A))
  (emit '(done LOCAL))
  (emit '(RTS)))



#|
(defun Ax (x) 
  (multiple-value-bind
      (int frac) (make-real x)
    (ex `(LOAD A ,int))
    (ex `(LOAD A0 ,frac))))

(defun Bx (x) 
  (multiple-value-bind
      (int frac) (make-real x)
    (ex `(LOAD B ,int))
    (ex `(LOAD B0 ,frac))))

(defun testax (y x) 
  (format nil "~A ~A" 
	  (progn 
	    (ax y) 
	    (bx x)
	    (ex '(JSR .atan2)) 
	    (real56 (dsp-reg 'A2) 
		    (dsp-reg 'A1) 
		    (dsp-reg 'A0))) 
	  (atan y x)))

|#
  

(libdecl .asin 'asin-load '(A B X Y) nil)
(defun asin-load ()			;4.4.40 -- coeffs are same as asinh with sign changes 
					;this is not a very good approximation if arg > .5 or so
  (emit '(.asin))			;argument in A
  (emit '(ABS A)         '(STORE A L temp-loc))
  (emit '(COPY A1 X0))
  (emit '(MPY X0 X0 B)                  `(LOAD Y0 ,(/ .0446428 2.0)))
  (emit                  '(COPY B X1)   `(LOAD A  ,(/ .075 2.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(/ .16666667 2.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(floor fractional-one 2)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0))
  (emit '(MPY X0 Y0 A))
  (emit '(COPY A1 A0))			;A was fraction--make it a real (has to be positive here)
  (emit '(LOAD A1 0 SHORT))
  (emit '(ASL A))			;one for frac-real conversion, 1 for /2 above
  (emit '(ASL A))
  (emit '(JCLR 23 X temp-loc ok))
  (emit '(NEG A))
  (emit '(ok LOCAL))
  (emit '(RTS)))


(libdecl .acos 'acos-load '(A B X Y) '(.asin .cosine-init)) ;cosine-init defines and preloads half-pi
(defun acos-load ()
  (emit '(.acos))
  (emit '(JSR .asin))
  (emit '(TFR A B)       '(LOAD A L half-pi))
  (emit '(SUB B A))
  (emit '(RTS)))


;;; these hyperbolic functions only work for fractional arguments -- these could be extended if anyone needs them

(libdecl .sinh 'sinh-load '(A B X Y) nil)
(defun sinh-load ()			;4.5.62
  (emit '(.sinh))			;argument in A
  (emit '(ABS A)         '(STORE A L temp-loc))
  (emit '(COPY A1 X0))
  (emit '(MPY X0 X0 B)                  `(LOAD A  ,(/ .008333 2.0)))
  (emit                  '(COPY B X1)   `(LOAD Y0 ,(/ .0000198 2.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(/ .16666667 2.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(floor fractional-one 2)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0))
  (emit '(MPY X0 Y0 A))
  (emit '(COPY A1 A0))			;A was fraction--make it a real (has to be positive here)
  (emit '(LOAD A1 0 SHORT))
  (emit '(ASL A))			;one for frac-real conversion, 1 for /2 above
  (emit '(ASL A))
  (emit '(JCLR 23 X temp-loc ok))
  (emit '(NEG A))
  (emit '(ok LOCAL))
  (emit '(RTS)))


(libdecl .cosh 'cosh-load '(A B X Y) nil)
(defun cosh-load ()			;4.5.63
  (emit '(.cosh))			;argument in X0
  (emit '(MPY X0 X0 B)                  `(LOAD A  ,(/ .04166667 2.0)))
  (emit                  '(COPY B X1)   `(LOAD Y0 ,(/ .0013888 2.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(/ .5 2.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(floor fractional-one 2)))
  (emit '(MAC X1 Y0 A))
  (emit '(COPY A1 A0))	
  (emit '(LOAD A1 0 SHORT))
  (emit '(ASL A))			;one for frac-real conversion, 1 for /2 above
  (emit '(ASL A))
  (emit '(RTS)))


(libdecl .tanh 'tanh-load '(A B X Y) nil)
(defun tanh-load ()			;4.5.64
  (emit '(.tanh))			;argument in X0
  (emit '(MPY X0 X0 B)                  '(LOAD A -.053968254))
  (emit                  '(COPY B X1)   '(LOAD Y0 .0218694))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   '(LOAD A .13333333))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   '(LOAD A -.3333333))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   '(LOAD A fractional-one))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0))
  (emit '(MPY X0 Y0 A))
  (emit '(RTS)))


(libdecl .atanh 'atanh-load '(A B X Y) nil)
(defun atanh-load ()			;4.6.32 -- this not very good if arg > .5 or so
  (emit '(.atanh))			;argument in A (next coeff is 1/11 and so on)
  (emit '(ABS A)         '(STORE A L temp-loc))
  (emit '(COPY A1 X0))
  (emit '(MPY X0 X0 B)                  `(LOAD A  ,(/ 1.0 28.0)))
  (emit                  '(COPY B X1)   `(LOAD Y0 ,(/ 1.0 36.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(/ 1.0 20.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(/ 1.0 12.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0)   `(LOAD A  ,(/ 1.0 4.0)))
  (emit '(MAC X1 Y0 A))
  (emit                  '(COPY A Y0))
  (emit '(MPY X0 Y0 A))
  (emit '(COPY A1 A0))	
  (emit '(LOAD A1 0 SHORT))
  (emit '(ASL A))			;one for frac-real conversion, 2 for /4 above
  (emit '(ASL A))
  (emit '(ASL A))
  (emit '(JCLR 23 X temp-loc ok))
  (emit '(NEG A))
  (emit '(ok LOCAL))
  (emit '(RTS)))


(libdecl .asinh 'asinh-load '(A B X Y R4) '(.log .real-sqrt .real-mpy))
(defun asinh-load ()			;4.6.20 (simpler definition doesn't converge very well)
  (emit '(.asinh))			;argument in A
  (emit '(STORE A L temp-loc))
  (emit '(TFR A B))			;get x^2 via real-mpy
  (emit '(JSR .real-mpy))
  (emit '(LOAD B 1))
  (emit '(ADD B A))			;x^2 + 1
  (emit '(JSR .real-sqrt))		;sqrt of that in A
  (emit '(LOAD B L temp-loc))		;add in x
  (emit '(ADD B A))
  (emit '(JSR .log))			;asinh in A we hope
  (emit '(RTS)))


(libdecl .acosh 'acosh-load '(A B X Y R4) '(.log .real-sqrt .real-mpy))
(defun acosh-load ()			;4.6.21 (can't find a simpler definition)
  (emit '(.acosh))			;argument in A, must be geq 1.0
  (emit '(STORE A L temp-loc))
  (emit '(TFR A B))			;get x^2 via real-mpy
  (emit '(JSR .real-mpy))
  (emit '(LOAD B 1))
  (emit '(SUB B A))			;x^2 - 1
  (emit '(JSR .real-sqrt))		;sqrt of that in A
  (emit '(LOAD B L temp-loc))		;add in x
  (emit '(ADD B A))
  (emit '(JSR .log))			;acosh in A we hope
  (emit '(RTS)))


(defun ld-real (reg num)
  (multiple-value-bind
      (int frac) (make-real num)
    (if (member reg '(A B))
	(let ((low-reg (if (eq reg 'A) 'A0 'B0)))
	  (emit `(LOAD ,reg ,int))
	  (emit `(LOAD ,low-reg ,frac)))
      (let ((high-reg (if (eq reg 'X) 'X1 'Y1))
	    (low-reg (if (eq reg 'X) 'X0 'Y0)))
	(emit `(LOAD ,high-reg ,int))
	(emit `(LOAD ,low-reg ,frac))))))

;;; Modified Bessel function (for Kaiser windows and asymmetric fm)

(libdecl .I0-init 'I0-init nil nil)
(defun I0-init ()
  (emit '(.I0-init))
  (emit `(DEFINE I0-local-1 ,(get-L-mem)))
  (emit `(DEFINE I0-local-2 ,(get-L-mem)))
  (emit `(DEFINE I0-local-3 ,(get-L-mem))))

(libdecl .I0 'i0-load '(A B X Y) '(.I0-init .real-sqrt .real-frac-mpy .real-mpy .exp .shift-AB-up .basic-divide))
(defun i0-load ()
  (emit '(.I0))				;input (real) in A, I0(x) returned in A
  (emit '(ABS A))    
  (emit '(STORE A L I0-local-3))
  (ld-real 'B 3.75)
  (emit '(CMP A B)    `(LOAD X1 ,(/ 1.0 3.75)))
  (emit '(JLE harder-case))
  (emit '(JSR .real-frac-mpy))		;x/3.75
  (emit '(TFR A B))
  (emit '(JSR .real-mpy))		;sqr
  (emit '(STORE A L I0-local-1))	;"y"
  (ld-real 'B 0.45813e-2)
  (emit '(JSR .real-mpy))
  (ld-real 'B 0.360768e-1)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 0.2659732)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 1.2067492)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 3.0899424)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 3.5156229)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 1.0)
  (emit '(ADD B A))
  (emit '(RTS))
  (emit '(harder-case LOCAL))
  (emit '(JSR .real-frac-mpy))		;ax/3.75
  (emit '(STORE A L I0-local-1))
  (ld-real 'B 0.392377e-2)
  (emit '(JSR .real-mpy))
  (ld-real 'B -0.1647633e-1)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 0.2635537e-1)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B -0.2057706e-1)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 0.916281e-2)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B -0.157565e-2)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 0.225319e-2)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 0.1328592e-1)
  (emit '(ADD A B)    '(LOAD A L I0-local-1))
  (emit '(JSR .real-mpy))
  (ld-real 'B 0.39894228)
  (emit '(ADD A B)    '(LOAD A L I0-local-3)) ;ax
  (emit '(STORE B L I0-local-1))
  (emit '(JSR .exp))
  (emit '(STORE A L I0-local-2))
  (emit '(LOAD A L I0-local-3))
  (emit '(JSR .real-sqrt))
  (emit '(TFR A B)    '(LOAD A L I0-local-2))
  (emit '(JSR .shift-AB-up))
  (emit '(JSR .basic-divide))		;uses temp-loc and friends, so we get local-1 et al for safety
  (emit '(LOAD B L I0-local-1))
  (emit '(JSR .real-mpy))
  (emit '(RTS)))


(libdecl .input 'input-load '(B X Y R5) nil)
(defun input-load ()			;get a new buffer full for insig
  (emit '(.input))			;B=bufstart, R4=struct ptr (to bufloc), N4=offset to get bufsize (better be 1)
					; we exit with R4 changed to point at bufstart
  (emit '(BSET M-HF2 X-IO M-HCR))	; set HF2
  (emit '(BSET M-HF3 X-IO M-HCR))	; set HF3
  (put-datum %external-input)				;see table next56.lisp
  (emit '(LOAD X L R4 RN))		;buffer size
  (put-datum 'X0)			;wait for chance to write
  (put-datum '(Y R4 R-1))		;channel/file/X-or-Y, R4->bufstart
  (put-datum 'B0)			;first pass of new buffer (low word, then high)
  (put-datum 'B1)
  (emit '(BCLR M-HF3 X-IO M-HCR))	; clear HF3
  (emit '(BCLR M-HF2 X-IO M-HCR))	; clear HF2
  (emit '(ADD X B)    '(STORE B L R4 R-1)) ;new bufstart, R4->bufend
  (emit '(STORE B L R4 R+1))		;new bufend, R4->bufstart
;  (emit '(LOAD B L R4 R))		;we want B = bufstart after exit
  (emit '(LOAD R5 X R4 RN))
  (emit '(LOAD Y1 #x80))
  (emit '(JCLR 0 Y R4 RN x-side))
  (emit '(DO X0 get-y-buf))
  (emit       '(wait-1))
  (emit       '(JCLR M-HRDF X-IO M-HSR wait-1))
  (emit       '(HIDE wait-1))
  (emit       '(LOAD Y0 X-IO M-HRX))
  (emit       '(MPY Y0 Y1 B))
  (emit       '(STORE B0 Y R5 R+1))
  (emit       '(get-y-buf LOCAL))
  (emit '(NOP))				;now DO needs time to settle
  (emit '(LOAD B L R4 R))		;we want B = bufstart after exit
  (emit '(RTS))
  (emit '(x-side LOCAL))
  (emit '(DO X0 get-x-buf))
  (emit       '(wait-2))
  (emit       '(JCLR M-HRDF X-IO M-HSR wait-2))
  (emit       '(HIDE wait-2))
  (emit       '(LOAD Y0 X-IO M-HRX))
  (emit       '(MPY Y0 Y1 B))
  (emit       '(STORE B0 X R5 R+1))
  (emit       '(get-x-buf LOCAL))
  (emit '(NOP))
  (emit '(LOAD B L R4 R))		;we want B = bufstart after exit
  (emit '(RTS)))
  
(libdecl .insig-n 'insig-n-load '(A B R4 R5 X Y) '(.input))
(defun insig-n-load ()			;random access input (see .in-n above for outa-based input, as used in reverbs)
  (emit '(.insig-n))			;R4->struct base, A=pass (long-int) (R4 is changed by .insig-n)
					;struct: L:file-end  
					;        L:bufend  
					;        L:bufstart 
					;        X:bufloc  Y:X-or-Y/fil/chn
					;        L:bufsize
  (emit '(TST A)    '(LOAD B L R4 R+1))	;B<-file-end (if past it, just return 0)
  (emil '(JLT Clear-A))			;if A<0 return 0
  (emit '(CMP B A)  '(LOAD B L R4 R+1)) ;A-B (pass-lastpass), R->bufstart
  (emil '(JLE go-on))			;if pass is within file, get it
  (emit '(Clear-A LOCAL))
  (emit '(CLR A))			;otherwise return 0
  (emit '(RTS))
  (emit '(go-on LOCAL))
  (emit '(CMP B A)   '(LOAD N4 1))	;is pass less than bufend?
  (emil '(JGE go-on-1))
  (emit '(LOAD B L R4 R))		;mimic .input call (i.e. get B=bufstart, R4->bufstart)
  (emil '(JMP next-check))
  (emit '(go-on-1 LOCAL))
  (emit '(TFR A B)   '(UPDATE R4 R+1))	;B = bufstart in .input, R->bufloc, N4=1
  (emit '(JSR .input))			;fixes up buffer boundaries in struct, fills buffer with input data, 
  (emit '(JMP go-on-2 LOCAL))		;returns with B=bufstart, R->bufstart
  (emit '(next-check LOCAL))
  (emit '(CMP B A))			;is pass >= bufstart?
  (emil '(JGE go-on-2))
  (emit '(TFR A B)   '(UPDATE R4 R+1))	;R->bufloc
  (emit '(LOAD X L R4 RN))		;bufsize as long-int

  (emit '(SUB X B)   '(LOAD Y0 1))	;moving backwards here (check for 0)
;;; here we were getting a bogus end-of-buffer N5 value (i.e. N5=buffer size, not (1- buffer size))
;;; previous line was LOAD Y1 0 followed by TLT Y1 B -- this could never have worked!! 19-Mar-92
  (emit '(LOAD Y1 0))
  (emit '(ADD Y B))

  (emit '(TLT Y1 B))			;informal tests indicate this does indeed 0 B
  (emit '(JSR .input))			;R->bufstart
  (emit '(go-on-2 LOCAL))		;here we have A=pass and it's within the buffer somewhere, B=bufstart
  (emit '(SUB B A)   '(LOAD R5 X R4 RN)) ;pass - bufstart, exited .input with R4->bufstart, N4=1, so R5->bufloc
  (emit '(COPY A0 N5))			;N5<-location of A within buffer
  (emit '(JCLR 0 Y R4 RN x-side))	;bit 0 of Y[R4] = 0 if X side buffer, R4 RN (N4=1) -> bufloc data
  (emit '(LOAD A Y R5 RN))
  (emit '(RTS))
  (emit '(x-side LOCAL))
  (emit '(LOAD A X R5 RN))
  (emit '(RTS)))


(libdecl .resample 'resample-load '(A B X Y R2 R4 R5 R6) '(.insig-n))
(defun resample-load ()			;R2->s1:s2  cur-x:cur-out  incr[L]  i[L] (L5):insig-block, R4 points to it
  (emit '(.resample))
  (emit                 '(LOAD A X R2 R))		   ;A=s1
  (emit '(TFR A B)      '(LOAD X1 Y R2 R+1))               ;X1=s2, B=s1, R2=>cur-x
  (emit '(SUB X1 A)     '(LOAD N2 2 SHORT))                ;A=s1-s2
  (emit                 '(COPY A Y0)  '(LOAD X0 X R2 R+1)) ;Y0=-diff, X0=cur-x (fractional), R2=>incr (real)
  (emit '(SMAC X0 Y0 B) '(LOAD A L R2 R-1))                ;B=output, A=incr (real), R2=>cur-out
  (emit '(CLR B)        '(STORE B Y R2 R))                 ;update cur-out, R2=>cur-x
  (emit                 '(COPY X0 B0))                     ;make cur-x a real (in B)
  (emit '(ASL B))
  (emit '(ADD B A))                                        ;A=new interpolation point as diff from i
  (emit '(TFR A B)      '(LOAD A1 0))                      ;B=interp, get new cur-x
  (emit '(ASR A)        '(LOAD X1 2))
  (emit                 '(STORE A0 X R2 R))                ;update cur-x, R2=>cur-out
  (emit                 '(LOAD Y1 1))
  (emit '(CMP Y1 B)     '(LOAD Y1 0))			   ;CMP using 24 bit, sign extends and zero fills, 
                                                           ;  so this is equivalent to asking B>1.0
  (emit '(JLT done))                                       ;if cur-x < 1.0, we're done
  (emit                 '(COPY B1 Y0))                     ;now get integer part of B (new i coming up)
  (emit                 '(LOAD A L R2 RN))                 ;A=i
  (emit '(ADD Y A)      '(COPY R4 R6))                     ;A=new i, save R4 which gets changed by .insig-n
  (emit '(CMP X1 B)     '(STORE A L R2 RN))                ;update i
  (emil '(JLT shift-back-one))
  (emit '(JSR .insig-n))		                   ;changes R4 (12-Jan-93) -- must reset for second call
  (emit '(CLR B)        '(LOAD N2 3 SHORT))
  (emit                 '(STORE A X R2 1-R))               ;s1 updated
  (emit                 '(LOAD A L R2 RN))                 ;A=new i
  (emit                 '(LOAD B0 1))
  (emit '(ADD B A)      '(COPY R6 R4))                     ;restore original R4
  (emit '(JSR .insig-n))
  (emit                 '(STORE A Y R2 R+1))               ;s2 updated, R=>cur-out
  (emil '(JMP done))
  (emit '(shift-back-one LOCAL))
  (emit '(JSR .insig-n))
  (emit                 '(LOAD Y1 Y R2 1-R))               ;s2=>s1
  (emit                 '(STORE Y1 X R2 R))
  (emit                 '(STORE A Y R2 R+1))               ;s2 updated, R2=>cur-out
  (emit '(done LOCAL))
  (emit                 '(LOAD A Y R2 R))
  (emit '(RTS)))


(libdecl .readin 'readin-load '(A B X Y R5) '(.insig-n))
(defun readin-load ()			;R2->rdin, R4->block  L:i L:inc (+/-1)  (L5):insig-block, result in X0, R4 changed too
  (emit '(.readin))			;used also by readin-reverse (inc negative)
  (emit '(LOAD A L R2 R))		;i
  (emit '(JSR .insig-n))		;IN[i]
  (emit '(CLR B)        '(COPY A X0))	;     ->X0
  (emit '(LOAD A L R2 R+1))		;i
  (emit '(LOAD B L R2 R-1))		;inc
  (emit '(ADD B A))			;i+inc
  (emit '(STORE A L R2 R))		;     ->i
  (emit '(RTS)))


(libdecl .src-init 'src-init nil nil)
(defun src-init ()
  (emit '(.src-init))
  (emit `(DEFINE sr-change ,(get-L-mem)))
  (emit `(DEFINE sr-sum ,(get-L-mem)))	; used as temporary storage until final convolution loop
  (emit `(DEFINE sr-base-data[0] ,(get-L-mem))) 
  (emit `(DEFINE sr-fsx-lim ,(get-L-mem)))  
  (emit `(DEFINE sr-rd-datawd0 ,(get-L-mem)))
  (emit `(DEFINE sr-rd[2]-filtwd0 ,(get-L-mem)))
  (emit `(DEFINE sr-temp ,(get-L-mem)))
  (emit `(DEFINE sr-ctrs ,(get-L-mem)))
  (emit `(DEFINE sr-datawd2-loc ,(get-L-mem)))

  ;;  X[sr-base-data[0]]=pointer to sr struct base (R2),     Y[sr-base-data[0]]=ptr to data array
  ;;  X[sr-fsx-lim]=fsx,                                     Y[sr-fsx-lim]=lim
  ;;  X[sr-rd-datawd0]=ptr to sr-rd base [sr+6],             Y[sr-rd-datawd0]=ptr to sr-data header
  ;;  X[sr-rd[2]-filtwd0]=ptr to readin (rdin+2),            Y[sr-rd[2]-filtwd0]=ptr to filt header
  ;;  X[sr-datawd2-loc]=data header 2nd word,                Y[sr-datawd2-loc]=current value out loc (see lisp code)
  ;;  sr-ctrs and sr-temp are temporary storage locations (for loop counters and so forth)
  )

(libdecl .src 'src-load '(A B X Y R2 R3 R4 R5 R6) '(.src-init .table-interp .table-shift .readin .real-int-mpy))
(defun src-load ()
  (emit '(.src))			;R2->sr struct with sinc table, A=sr-change, if any
					;    left:right  width:samples-per-pass  x  incr  data[2]  rdin[7]  filt[2]  sinc[2] filt-mult:nil
					;if we just let x increment, left and right need to be long integers (pass counters)

  (emit '(STORE A L sr-change))		;save sr-change for later (incr+sr-change at very end)
  (emit '(CLR B)       '(STORE R2 X sr-base-data[0]))
  (emit '(STORE B L sr-sum))
  (emit '(STORE B L sr-ctrs))		;clear these out
  (emit '(STORE B L sr-temp))
  (emit '(STORE B L sr-datawd2-loc))
  (emit '(COPY R2 B))			;save various pointers into our data structure (rd rd-blk data)
  (emit '(LOAD X0 4))
  (emit '(ADD X0 B)    '(LOAD X0 1))
  (emit '(COPY B R5))
  (emit '(ADD X0 B)    '(STORE B Y sr-rd-datawd0))
  (emit '(ADD X0 B)    '(STORE B X sr-datawd2-loc))
  (emit '(LOAD X0 2))
  (emit '(ADD X0 B)    '(STORE B X sr-rd-datawd0))
  (emit '(LOAD X0 5))
  (emit '(ADD X0 B)    '(STORE B X sr-rd[2]-filtwd0))
  (emit                '(STORE B Y sr-rd[2]-filtwd0))
  (emit '(LOAD B X R5 R))
  (emit                '(STORE B Y sr-base-data[0]))
  (emit '(LOAD A L R2 R+1))		;A1=left, A0=right, R2->width
  (emit '(LOAD X0 X R2 R+1))		;X0=width, R2->x
  (emit '(LOAD B L R2 R))		;B=x
  (emit '(SUB X0 B)    '(COPY A0 Y1))	;B=start-x (x - width) (SUB leaves LSW of B alone in this case), Y1=right
  (emil '(JGE ok))			;if B<0 set it to 0 (fsx max 0)
  (emit '(CLR B))
  (emit '(ok LOCAL))
  (emit '(COPY B1 Y0))			;Y0=fsx (floor start-x) -- start-x no longer needed
  (emit                '(STORE Y0 X sr-fsx-lim))
					;  (setf fsx (max 0 (floor start-x)))
  (emit '(COPY X0 B))			;B=width
  (emit '(ASL B)       '(LOAD A0 0 SHORT))      ;B=lim (2*width), A=left as real (a0 0 short does not sign extend or zero a1)
  (emit '(CMP Y0 A)    '(STORE B Y sr-fsx-lim))    ;left-fsx
					;  (setf lim (* 2 (sr-width s)))
  (emit '(JLT align))			;                <0 then align
					;  (when (or (< (sr-left s) fsx) (> (+ fsx lim) (sr-right s)))...
  (emit   '(ADD Y0 B))			;B=fsx+lim
  (emit   '(SUB Y1 B))			;(lim+fsx)-right
  (emit   '(COPY B N4))
  (emit   '(JLE no-align))		;               <=0 then we're in bounds so no need to readin and whatnot
  (emit   '(JMP already-have-N4))
  (emit   '(align LOCAL))		;here we get our window on the data to match the current sinc bounds
					;  (if (<= fsx (sr-right s)) ...
  (emit   '(ADD Y0 B))			;B=fsx+lim
  (emit   '(SUB Y1 B))			;(lim+fsx)-right
  (emit   '(COPY B N4))

  (emit   '(already-have-N4 LOCAL))
  (emit   '(COPY Y0 B))			;B<-fsx
  (emit   '(SUB Y1 B)     '(LOAD N2 3))	;fsx - right, also R2 RN => 2nd word of sr-data table header
 
  (emit   '(JGT no-data-shift))		;         <= 0 then shift data in table, else readin until we find real data

  ;; here we shift left in the data array (fsx is <= right limit, so we bring those down before reading new data)
  ;; B = i in lisp version, Y1=sr-right, R3 is "loc", R4->data[k]

  (emit     '(ABS B)       '(LOAD R3 0))
  (emit     '(LOAD R4 Y sr-base-data[0]))	;R4->bufstart of data table
					;  (emit     '(LOAD R5 Y srbase-data[0]))	;R5 ditto
  (emit     '(LOAD X0 1))
  (emit     '(UPDATE R4 R+N))
  (emit     '(ADD X0 B)    '(LOAD R5 Y sr-base-data[0]))
  (emit     '(JCLR 0 Y R2 RN x-side-table))
  (emit       '(DO B y-side))
  (emit         '(LOAD A Y R4 R+1))
  (emit         '(STORE A Y R5 R+1))
  (emit         '(UPDATE R3 R+1))
  (emit         '(y-side LOCAL))
  (emit       '(NOP))
  (emit       '(JMP loc-to-lim))
  (emit     '(x-side-table LOCAL))
  (emit       '(DO B x-side))
  (emit         '(LOAD A X R4 R+1))
  (emit         '(STORE A X R5 R+1))
  (emit         '(UPDATE R3 R+1))
  (emit         '(x-side LOCAL))
  (emit       '(NOP))
  (emit       '(JMP loc-to-lim))
  (emit   '(no-data-shift LOCAL))
  ;; here we throw away samples until we finally reach fsx
  (emit   '(LOAD A -1))			;(if (/= (sr-right s) -1) (loop...)
  (emit   '(CMP Y1 A)    '(LOAD R3 0))
  (emit   '(JEQ loc-to-lim))
  (emit     '(STORE B X sr-ctrs))
  (emit     '(loop-toss-data))
  (emit       '(LOAD R2 X sr-rd-datawd0))	;rd[0]
  (emit       '(LOAD R4 X sr-rd[2]-filtwd0))
  (emit       '(JSR .readin))		;affects A B X Y R5
  (emit       '(LOAD X0 1))
  (emit       '(LOAD A Y sr-ctrs))
  (emit       '(ADD X0 A)    '(LOAD B X sr-ctrs))
  (emit       '(CMP A B)     '(STORE A Y sr-ctrs))
  (emit       '(JGT loop-toss-data))
  (emit     '(HIDE loop-toss-data))
  (emit   '(loc-to-lim LOCAL))
  ;; now we get true new data, if any is needed (this is all within the top check)

  (emit   '(LOAD B Y sr-fsx-lim))		;B=lim
  (emit   '(COPY R3 Y1))
  (emit   '(SUB Y1 B))
  (emit   '(JLT setfs))
  (emit   '(STORE R3 Y sr-datawd2-loc))	;loc (after possible shifting above)
  (emit   '(LOAD R5 Y sr-rd[2]-filtwd0))	;address of filter, if any
  (emit   '(LOAD N3 Y sr-datawd2-loc))
  (emit   '(LOAD R3 Y sr-base-data[0]))	;data[0], R5->2nd word of sr-data header
  (emit   '(CLR A)    '(STORE B X sr-ctrs))
  (emit   '(STORE A Y sr-ctrs))
  (emit   '(LOAD A Y R5 R))		;size of filter table -- 0 if filter nil (x=<nil>)
  (emit   '(TST A)    '(UPDATE R3 R+N))	;is there a filter on this run?
  (emit   '(JNE smooth-table))

  ;get table size and readin for loc to lim
  (emit   '(loop-store-data))
  (emit     '(LOAD R2 X sr-rd-datawd0))
  (emit     '(LOAD R4 X sr-rd[2]-filtwd0))
  (emit     '(STORE R3 Y sr-temp))
  (emit     '(JSR .readin))
  (emit     '(LOAD R5 X sr-datawd2-loc))
  (emit     '(LOAD R3 Y sr-temp))	;is this necessary? (.readin, .input, .insig-n leave R3 alone) -- pipeline delay
  (emit     '(JCLR 0 Y R5 R x-side))
  (emit       '(STORE X0 Y R3 R+1))
  (emit       '(JMP ok))
  (emit       '(x-side LOCAL))
  (emit       '(STORE X0 X R3 R+1))
  (emit       '(ok LOCAL))
  (emit     '(LOAD X0 1))
  (emit     '(LOAD A Y sr-ctrs))
  (emit     '(ADD X0 A)    '(LOAD B X sr-ctrs))
  (emit     '(CMP A B)     '(STORE A Y sr-ctrs))
  (emit     '(JGE loop-store-data))
  (emit   '(HIDE loop-store-data))
  (emit   '(JMP setfs))
  
  (emit   '(smooth-table LOCAL))
  (emit   '(STORE R3 Y sr-temp))
  (emit   '(loop-smoothed-data))
  (emit     '(LOAD R3 Y sr-rd[2]-filtwd0))

  ;;low pass filter if new srate lower than old -- use "modified least squares" from Hamming "Digital Filters"
  ;;R3->filter table header X:bufstart Y:size
  ;;initial table shift via .table-shift where R3->table, R4 and R5 are changed
  ;;here main problem is that we are summing fractions and may overflow

  (emit     '(JSR .table-shift))	;upon return R4->last table entry location, R3->2nd word of table header
  (emit     '(STORE R4 X sr-temp))	;temporary shelter for R4
  (emit     '(LOAD R2 X sr-rd-datawd0))
  (emit     '(LOAD R4 X sr-rd[2]-filtwd0))
  (emit     '(JSR .readin))		;upon return, result in X0 (R3 not affected)
  (emit     '(LOAD R4 X sr-temp))
  (emit     '(JCLR 0 Y R3 R-1 x-side))	;R3->first word of (filter) table header
  (emit       '(STORE X0 Y R4 R))
  (emit       '(JMP smooth-it))
  (emit       '(x-side LOCAL))
  (emit       '(STORE X0 X R4 R))
  (emit     '(smooth-it LOCAL))
  (emit     '(CLR A)     '(LOAD B Y R3 R))	;B=(filter) table size, A=0
  (emit     '(LOAD R4 X R3 R+1))		;R4->buf1
  (emit     '(LOAD X0 2))		;first and last samples are halved, rest just added ("modified least squares")
  (emit     '(LOAD R2 X sr-base-data[0]))
  (emit     '(LOAD N2 17))
  (emit     '(SUB X0 B))			;B=number of samples summed without scaling
  (emit     '(LOAD X1 X R2 RN))			
  (emit     '(JCLR 0 Y R3 R x-side))	;does not affect the condition codes
  (emit       '(LOAD Y0 Y R4 R+1))
  (emit       '(LOAD X0 Y R4 R+1))
  (emit       '(LOAD Y1 Y R4 R+1))
  (emit       '(JLE last))
  (emit       '(REP B))
  (emit         '(MAC X1 Y1 A)   '(LOAD Y1 Y R4 R+1))
  (emit       '(JMP last))
  (emit     '(x-side LOCAL))
  (emit       '(LOAD Y0 X R4 R+1))
  (emit       '(LOAD X0 X R4 R+1))
  (emit       '(LOAD Y1 X R4 R+1))
  (emit       '(JLE last))
  (emit       '(REP B))
  (emit         '(MAC X1 Y1 A)   '(LOAD Y1 X R4 R+1))
  (emit     '(last LOCAL))
  (emit     '(MPY X0 X1 B))
  (emit     '(MAC X1 Y0 B))
  (emit     '(ASR B)      '(LOAD R5 X sr-datawd2-loc)) 
  (emit     '(ADD B A)    '(LOAD R3 Y sr-temp)) 

  ;;A=modified least squares low pass filter result 
  ;; now store the result in the src data table

  (emit     '(JCLR 0 Y R5 R x-side))
  (emit       '(STORE A Y R3 R+1))
  (emit       '(JMP ok))
  (emit       '(x-side LOCAL))
  (emit       '(STORE A X R3 R+1))
  (emit     '(ok LOCAL))
  (emit     '(STORE R3 Y sr-temp))
  (emit     '(LOAD X0 1))
  (emit     '(LOAD A Y sr-ctrs))
  (emit     '(ADD X0 A)     '(LOAD B X sr-ctrs))
  (emit     '(CMP A B)      '(STORE A Y sr-ctrs))
  (emit     '(JGE loop-smoothed-data))
  (emit    '(HIDE loop-smoothed-data))
  (emit   '(setfs LOCAL))
  (emit   '(LOAD R2 X sr-base-data[0]))	;ptr to sr base

  ;;  (emit   '(LOAD X0 X sr-fsx-lim))		;fsx
  (emit   '(LOAD B X sr-fsx-lim))
  (emit   '(LOAD A Y sr-fsx-lim))		;lim

  ;;in the following block we subtract out lim from left right and x 
  ;;to keep them from overflowing a short integer (24 bits)

  (emit   '(CMP B A)     '(COPY B X0))
  (emit   '(JGE no-mod))
  (emit   '(SUB A B)     '(LOAD N2 2))
  (emit   '(LOAD R2 X sr-base-data[0]))
  (emit   '(COPY B X0))
  (emit   '(LOAD B L R2 RN))
  (emit   '(SUB A B))
  (emit   '(STORE B L R2 RN))
  (emit   '(no-mod LOCAL))

  (emit   '(ADD X0 A)    '(STORE X0 X R2 R)) 
					;  (setf (sr-left s) fsx)
  (emit   '(STORE A Y R2 R))		;  (setf (sr-right s) (+ fsx lim))
  (emit '(no-align LOCAL))		;at this point nothing is in any known state
  
  ;; end of alignment block  
  ;; now convolve with sinc offset by frac part of (sr-x s)
  
  (emit '(CLR A)       '(LOAD N2 15))
  (emit '(LOAD R2 X sr-base-data[0]))
  (emit '(STORE A L sr-sum))		;(setf sum 0.0)
  (emit '(LUA R3 R2 R+N))		;R3->sinc table header

  ;;(loop for i from 0 to lim and j from left-x by samples-per-zero-crossing do incf sum data[i]*sinc(j))
  ;; table-interp takes index (real) in A, R3-> table header, interpolated lookup value in A
  ;; so we need a place to protect the sum of the convolution
  ;; table-interp affects B X Y R4, increments R3 by 1

  (emit '(LOAD N2 2))
  (emit '(CLR B)      '(LOAD A X R2 R))	;sr-left
  (emit '(STORE B L sr-ctrs))
  (emit '(LOAD B L R2 RN))		;sr-x
  (emit '(LOAD N2 1))
  (emit '(SUB B A))
  (emit '(LOAD X1 Y R2 RN))		;samples-per-pass
  (emit '(JSR .real-int-mpy))		;result will be negative (- (sr-left sr) (sr-x sr)), but reflected in sinc table
  (emit '(LOAD R6 Y sr-base-data[0]))
  (emit '(LOAD B Y R3 R))		;sinc table size (need size-1 for end of table check)
  (emit '(LOAD X1 1))
  (emit '(SUB X1 B))
  (emit '(STORE B Y sr-rd[2]-filtwd0))	;this word is not used from here on, so we'll grab it
  (emit '(loop-convolve))
  (emit   '(STORE A L sr-temp))		;sinc interp loc
  (emit   '(ABS A)      '(LOAD B Y sr-rd[2]-filtwd0))
  (emit   '(CMP A B))
  (emit   '(JGT not-zero-A))		;.table-interp wraps pointer around, but we want just 0
  (emit   '(UPDATE R3 R+1))		; mimic .table-interp (since fixed up later)
  (emit   '(UPDATE R6 R+1))
  (emit   '(LOAD X0 Y R2 RN))		;samples-per-pass
  (emit   '(JMP got-A))
  (emit   '(not-zero-A LOCAL))
  (emit   '(JSR .table-interp))		;R3<-R3+1 (so fixed up later), R4 changed
  (emit   '(LOAD R5 X sr-datawd2-loc))
  (emit   '(COPY A X1))
  (emit   '(JCLR 0 Y R5 R x-side))
  (emit     '(LOAD Y0 Y R6 R+1))
  (emit     '(JMP ok))
  (emit     '(x-side LOCAL))
  (emit     '(LOAD Y0 X R6 R+1))
  (emit     '(ok LOCAL))
  (emit   '(LOAD A L sr-sum))
  (emit   '(MAC X1 Y0 A)    '(LOAD X0 Y R2 RN))
  (emit   '(STORE A L sr-sum))		;X side is the fractional running sum
  (emit   '(got-A LOCAL))
  (emit   '(LOAD Y1 1))
  (emit   '(LOAD A Y sr-ctrs))
  (emit   '(ADD Y1 A)       '(LOAD B Y sr-fsx-lim))
  (emit   '(CMP A B)        '(UPDATE R3 R-1))
  (emit   '(JLT convolve-done))
  (emit   '(STORE A Y sr-ctrs))
  (emit   '(LOAD A L sr-temp))
  (emit   '(ADD X0 A))
  (emit   '(JMP loop-convolve))
  (emit '(convolve-done LOCAL))
  (emit '(HIDE loop-convolve))
  (emit '(LOAD N2 2))
  (emit '(LOAD X0 X sr-sum))
  (emit '(UPDATE R2 R+N))
  (emit '(NOP))
  ;; save X0 (sum), R2->addr+2
  (emit '(LOAD B L R2 R+1))		;B=x, R2->incr
  (emit '(LOAD A L R2 R-1))		;A=incr, R2->x
  (emit '(ADD B A)    '(LOAD B L sr-change))
  (emit '(ADD B A))			;A=x+incr+sr-change
  (emit '(STORE A L R2 R))		;A=incremented x
  ;; value returned in X0
  (emit '(RTS)))



(libdecl .expand 'expand-load '(A B X Y R0 R3 R4 R6) '(.readin .random))
(defconstant expand-readin-offset 7)	;if first thing in E is RD, then 2, else add on offset necessary
(defun expand-load ()			;R2->spd structure, N2=1
  (emit '(.expand))			;  cur-val : x-or-y 
					;  ctr : ptr-to-buf 
					;  cur-out : (len - 1)
					;  steady : rmp 
					;  incr = (/ amp rmp) at init time : temp
  					;  RD BLOCK [7 words]
					;  out-spd : s50
					;  cur-in (long-int)
					;  in-spd :s20
  ;; first get cur-val, increment buffer ctr, check trigger
  (emit '(COPY R2 R6))			;save a pointer to the e struct that we can trust everywhere
  (emit '(LOAD N3 Y R2 RN))		;base of buffer
  (emit '(LOAD R3 X R2 RN))		;index into buffer (this order so we can increment easily)
  (emit '(JCLR 0 Y R2 R x-side))
  (emit '(LOAD A Y R3 RN))		;cur-val
  (emit '(STORE A X R2 R+1))		;save it, R2->ctr
  (emil '(JMP got-cur-val))
  (emit '(x-side LOCAL))
  (emit '(LOAD A X R3 RN))		;same but X side
  (emit '(STORE A X R2 R+1))
  (emit '(got-cur-val LOCAL))
  (emit '(CLR A)      '(UPDATE R3 R+1))	;increment ctr
  (emit '(LOAD B X R2 RN))		;B<-cur-out
  (emit '(COPY R3 A))			;check for ctr>cur-out
  (emit '(CMP A B)    '(STORE R3 X R2 R+1)) ;ctr has been incremented, (cur-out - ctr in CC), R2->cur-out
  (emit '(JGT done))			;if cur-out > ctr, we're done
  (emit '(LOAD N6 1 SHORT))
  ;; shift-block-back
  (emit '(LOAD A Y R2 R))		  ;A<-len
  (emit '(SUB B A)    '(LOAD R3 Y R6 RN)) ;A<-len - cur-out - 1, R3->buffer base
  (emit '(CLR B)      '(LOAD N3 X R2 R+1)) ;N3=cur-out, R2->steady
  (emit '(JCLR 0 Y R6 R do-x-side))
  (emit '(DO A y-shift-loop))
  (emit    '(LOAD X0 Y R3 RN))
  (emit    '(STORE B Y R3 RN))		;set old to 0
  (emit    '(STORE X0 Y R3 R+1))
  (emit    '(y-shift-loop LOCAL))
  (emit '(NOP))
  (emil '(JMP got-shift))
  (emit '(do-x-side LOCAL))
  (emit '(DO A x-shift-loop))		;same, but X side
  (emit    '(LOAD X0 X R3 RN))
  (emit    '(STORE B X R3 RN))
  (emit    '(STORE X0 X R3 R+1))
  (emit    '(x-shift-loop LOCAL))
  (emit '(NOP))
  (emit '(got-shift LOCAL))		;shift back and clear done
  ;; add-one-segment
  (emit '(LOAD N3 Y R2 R))		;rmp
  (emit '(LOAD R0 X R2 R))		;steady
  (emit '(LOAD N2 2 SHORT))
  (emit '(LOAD R3 Y R6 RN))		;buf[0] address
  (emit '(LOAD N6 4 SHORT))		;incr = X R6 RN
					;set up for readin, R2->rd struct, R4=R2+2 (R2 untouched, R4 changed)
  (emit '(UPDATE R2 R+N))		;R2->rd struct
  (emit '(CLR A)   '(STORE B Y R6 RN))	;amp<-0
  (emit '(LUA R4 R2 R+N))		;R4->insig struct
  (emit '(COPY N3 A))
  (emit '(TST A))
  (emil '(JLE past-incr-add-loop))
  (emit '(DO N3 incr-add-loop))
  (emit     '(JSR .readin))		;result in X0, all data registers may be changed (.insig, .input)
  (emit     '(LOAD Y0 X R6 RN))		;incr->Y0
  (emit     '(LOAD A Y R6 RN))		;current amp
  (emit     '(ADD Y0 A))		;amp->amp+incr
  (emit     '(COPY A Y0))		;prepare for fractional mpy
  (emit     '(STORE A Y R6 RN))		;save new amp
  (emit     '(JCLR 0 Y R6 R incr-x-side))
  (emit     '(LOAD A Y R3 R))		;current buffer value
  (emit     '(MAC Y0 X0 A))		;cur + amp*readin
  (emit     '(STORE A Y R3 R+1))	;save new and increment buffer pointer
  (emil     '(JMP got-incr))
  (emit     '(incr-x-side LOCAL))
  (emit     '(LOAD A X R3 R))		;current buffer value
  (emit     '(MAC Y0 X0 A))		;cur + amp*readin
  (emit     '(STORE A X R3 R+1))	;save new and increment buffer pointer
  (emit     '(got-incr LOCAL))
  (emit     '(LUA R4 R2 R+N))		;R4->insig struct
  (emit     '(incr-add-loop LOCAL))
  (emit '(past-incr-add-loop LOCAL))
  (emit '(CLR A))
  (emit '(COPY R0 A))
  (emit '(TST A))
  (emit '(JLE past-steady-add-loop))
  (emit '(DO R0 steady-add-loop))
  (emit     '(JSR .readin))		;result in X0, all data registers may be changed (.insig, .input)
  (emit     '(LOAD Y0 Y R6 RN))		;current amp
  (emit     '(JCLR 0 Y R6 R incr-x-side))
  (emit     '(LOAD A Y R3 R))		;current buffer value
  (emit     '(MAC Y0 X0 A))		;cur + amp*readin
  (emit     '(STORE A Y R3 R+1))	;save new and increment buffer pointer
  (emil     '(JMP got-incr))
  (emit     '(incr-x-side LOCAL))
  (emit     '(LOAD A X R3 R))		;current buffer value
  (emit     '(MAC Y0 X0 A))		;cur + amp*readin
  (emit     '(STORE A X R3 R+1))	;save new and increment buffer pointer
  (emit     '(got-incr LOCAL))
  (emit     '(LUA R4 R2 R+N))		;R4->insig struct
  (emit     '(steady-add-loop LOCAL))
  (emit '(past-steady-add-loop LOCAL))
  (emit '(CLR A))
  (emit '(COPY N3 A))
  (emit '(TST A))
  (emil '(JLE past-decr-add-loop))
  (emit '(DO N3 decr-add-loop))
  (emit     '(JSR .readin))		;result in X0, all data registers may be changed (.insig, .input)
  (emit     '(LOAD Y0 X R6 RN))		;incr->Y0
  (emit     '(LOAD A Y R6 RN))		;current amp
  (emit     '(SUB Y0 A))		;amp->amp-incr
  (emit     '(COPY A Y0))		;prepare for fractional mpy
  (emit     '(STORE A Y R6 RN))		;save new amp
  (emit     '(JCLR 0 Y R6 R incr-x-side))
  (emit     '(LOAD A Y R3 R))		;current buffer value
  (emit     '(MAC Y0 X0 A))		;cur + amp*readin
  (emit     '(STORE A Y R3 R+1))	;save new and increment buffer pointer
  (emil     '(JMP got-incr))
  (emit     '(incr-x-side LOCAL))
  (emit     '(LOAD A X R3 R))		;current buffer value
  (emit     '(MAC Y0 X0 A))		;cur + amp*readin
  (emit     '(STORE A X R3 R+1))	;save new and increment buffer pointer
  (emit     '(got-incr LOCAL))
  (emit     '(LUA R4 R2 R+N))		;R4->insig struct
  (emit     '(decr-add-loop LOCAL))
  (emit '(past-decr-add-loop LOCAL))
  ;; overlap-add is done -- now set-expansion-triggers
  (emit '(CLR B)       '(LOAD N6 12 SHORT)) ;to jump over the rd block in the e structure
  (emit '(LUA R2 R6 R+1))		;R2->ctr (to be zeroed)
  (emit '(LUA R3 R6 R+N))		;R3->just past rd block
  (emit '(STORE B X R2 R+1))		;ctr<-0, R2->cur-out (X side)
  (emit '(JSR .random))			;get a random number (in A0) (-1 to 1 unfortunately)
  (emit '(LOAD B Y random-seed))
  (emit '(ABS B)        '(LOAD A X R3 R)) ;A=out-spd
  (emit '(COPY B Y0))
  (emit '(LOAD X0 Y R3 R+1))		;X0=s50, R3->cur-in et al
  (emit '(MAC X0 Y0 A)  '(LOAD B L R3 R+1)) ;A=new cur-out
  (emit '(STORE A X R2 R))		;save new cur-out
  (emit '(JSR .random))
  (emit '(LOAD A Y random-seed))	;get 0..1, not -1..1
  (emit '(ABS A)        '(LOAD X0 Y R3 R))
  (emit '(COPY A Y0))
  (emit '(MPY X0 Y0 A)  '(LOAD N6 5 SHORT))
  (emit '(COPY A1 A0))			;s20*random as long-int
  (emit '(LOAD A1 0))
  (emit '(ADD A B)      '(LOAD A0 X R3 R-1)) ;B=new readin i, A=in-spd (as long-int)
  (emit '(STORE B L R6 RN))		;save new rdin-i
  (emit '(LOAD B L R3 R))		;now get new cur-in
  (emit '(ADD A B))
  (emit '(STORE B L R3 R))
  (emit '(done LOCAL))
  (emit '(LOAD A X R6 R))		;assume R6 is never moved
  (emit '(RTS)))



#|
;;; This is the Motorola version of the FFT -- it is very fast, and somewhat inconvenient.
;;; This first version assumes you have the sine/cosine coeffs loaded, and returns the data
;;; in bit-reversed order (!).  It can be made faster by splitting out multiplies by 1, and so on.
;;; Another assumption is that all intermediate and final values are "fractional" (i.e. between
;;; -1 and 1) -- this is surely false for IFFT's and probably false for FFTs -- 
;;; you must prescale by 1/n, but since we have
;;; only 24 bits here, that risks shifting out data when n>256.  So many constraints...
;;; One slightly confusing thing is the SUBL instruction in the innermost loop -- here we are
;;; basically getting the x[j]:=x[i]-tr, then putting x[i] in A, and using SUBL to get
;;; 2*x[i]-(x[i]-tr) == x[i]+tr (or whatever).  

(defun Motorola-fft-load ()		;needs points/2 points/4 log-points data-loc coeff-loc
					;complex in and out data (bit reversed output in place)
					;base address of data must have log2(points) low order bits = 0
					;real in X, imag in Y, cos in X, sin in Y
					;points must be power of 2
					;coming in, temp-loc->points/2:points/4,
					;           temp-loc-1->log-points:old run-time stack top 
					;           temp-loc-2->data-loc:coeff-loc
  (emit '(.Motorola-fft))
  (emit '(LOAD N0 X temp-loc SHORT))	;points/2
  (emit '(LOAD N2 1 SHORT))
  (emit '(LOAD R2 X temp-loc-1 SHORT))	;log-points (handled as an assembly-time constant in Motorola ref)
  (emit '(LOAD N6 Y temp-loc SHORT))	;points/4
  (emit '(LOAD M0 #xffffff))		;these shouldn't be necessary
  (emit '(COPY M0 M1))
  (emit '(COPY M0 M4))
  (emit '(COPY M0 M5))
  (emit '(LOAD M6 0 SHORT))		;bit reversed addressing
  (emit '(DO R2 end-pass))
  (emit     '(LOAD R0 X temp-loc-2 SHORT)) ;data-loc
  (emit     '(COPY R0 R4))
  (emit     '(LUA R1 R0 R+N))
  (emit     '(LOAD R6 Y temp-loc-2 SHORT)) ;coeff-loc
  (emit     '(LUA R5 R1 R-1))
  (emit     '(COPY N0 N1))
  (emit     '(COPY N0 N4))
  (emit     '(COPY N0 N5))
  (emit     '(DO N2 end-grp))
  (emit         '(LOAD X1 X R1 R)    '(LOAD Y0 Y R6 R))
  (emit         '(LOAD A X R5 R)     '(LOAD B Y R0 R))
  (emit         '(LOAD X0 X R6 R+N))
  (emit         '(DO N0 end-bfy))
  (emit             '(MAC X1 Y0 B)   '(LOAD Y1 Y R1 R+1))
  (emit             '(SMACR X0 Y1 B) '(STORE A X R5 R+1)   '(LOAD A Y R0 R))
  (emit             '(SUBL B A)      '(LOAD B X R0 R)      '(STORE B Y R4 R))
  (emit             '(SMAC X1 X0 B)  '(LOAD A X R0 R+1)    '(STORE A Y R5 R))
  (emit             '(SMACR Y1 Y0 B) '(LOAD X1 X R1 R))
  (emit             '(SUBL B A)      '(STORE B X R4 R+1)   '(LOAD B Y R0 R))
  ;; ignoring preloading, phase confusion, and so on, this is basically doing the following:
  ;; MAC:     Ai + Br(sin) => B,  Bi => Y1
  ;; SMACR:   (B) - Bi(cos) => B, Ai => A
  ;; SUBL:    2Ai - (B) => A,     Ar => B
  ;; SMAC:    Ar - Br(cos) => B,  Ar => A
  ;; SMACR:   (B) - Bi(sin) => B, Br => X1
  ;; SUBL:    2Ar - (B) => A,     Ai => B
  (emit             '(end-bfy LOCAL))
  (emit         '(STORE A X R5 R+N)  '(LOAD Y1 Y R1 R+N))
  (emit         '(LOAD X1 X R0 R+N)  '(LOAD Y1 Y R4 R+N))
  (emit         '(end-grp LOCAL))
  (emit     '(COPY N0 B1))
  (emit     '(LSR B)     '(COPY N2 A1))
  (emit     '(LSL A)     '(COPY B1 N0))
  (emit     '(COPY A1 N2))
  (emit     '(end-pass LOCAL))
  (emit '(NOP))
  (emit '(RTS)))

|#


;;; the following FFT is taken from Numerical Recipes in Pascal, Press, Flannery, Teukolsky, and Vetterling.
;;; for a Lisp version of the same thing, see num.lisp (or four1.lisp).
;;; We use this rather than Motorola because 
;;;    1) we don't want to have to load sine/cosine/envelope tables, and don't want to have to ensure that
;;;       the data is on a power of 2 boundary (for the bit reversed addressing mode)
;;;    2) we want to use nearly the same code on the Ariel QP board where DRAM (data) access is more complicated.
;;;    3) we want to protect our run-time library's assumptions about AGU registers (i.e. protect R1 principally)
;;;    4) we want to do big(ger) FFTs.
;;;    5) we want normally-ordered output data.
;;; (for convolution, I think we could skip the bit reversal stuff), and for fft-filtering, we could address
;;;    the freq env with bit reversal, so normally ordered output is mostly a debugging convenience).
;;;
;;; The version here is actually a revision of FOUR1 in Numerical Recipes to use two 0-based arrays rather
;;; than one 1-based array with interleaved data, and to use a more obvious trigonometric recurrence.  I
;;; believe the original code was trying to get around a prohibitively expensive SIN function call by saving
;;; sin(x/2) for the subsequent run.  The price paid was n adds for log n (base 2) SINs -- for small FFTs
;;; this may be reasonable, but surely this loses in bigger cases, and in any case it makes the trig messier.
;;; (they use the identity cos(x) - 1 = -2 (sin(x/2))^2, then later have to get rid of the
;;; extra factor the -1 brings in).  (Abramowitz and Stegun 4.3.20).  The actual recurrences used to
;;; get sin((n+1)x) from sin(nx) are:
;;;     A and S 4.3.17: cos((n+1)x) = cos(nx)cos(x) - sin(nx)sin(x) and
;;;     A and S 4.3.16: sin((n+1)x) = 2 sin(nx)cos(x)


(libdecl .fft-init 'fft-init nil nil)
(defun fft-init ()
  (emit '(.fft-init))
  ;; get space for various local variables
  (emit (list 'DEFINE 'n     (get-X-memory)))
  (emit (list 'DEFINE 'ipow  (get-Y-memory)))
  (emit (list 'DEFINE 'mmax  (get-X-memory)))
  (emit (list 'DEFINE 'pow   (get-Y-memory)))
  (emit (list 'DEFINE 'prev  (get-X-memory)))
  (emit (list 'DEFINE 'isign (get-Y-memory)))
  (emit (list 'DEFINE 'wr    (get-X-memory)))
  (emit (list 'DEFINE 'wi    (get-Y-memory)))
  (emit (list 'DEFINE 'theta (get-L-mem)))
  (emit (list 'DEFINE 'theta-fixup (get-L-mem)))
  (emit (list 'DEFINE 'wpr   (get-X-memory)))
  (emit (list 'DEFINE 'wpi   (get-Y-memory)))
  (emit (list 'DEFINE 'data-addr (get-X-memory)))
  ;; It's a small bug in my assembler that you can't attach the memory type to the location.
  ;; For now, you have to keep the X's and Y's straight by hand.
  )

(libdecl .basic-fft 'basic-fft-load '(A B X Y R0 R4 R5 R6 R7) '(.fft-init .sine .cosine))
(defun basic-fft-load ()		;A=number of points (a power of 2), B=isign (-1=ifft, 1=fft)
					;R3->data (X=real, Y=imaginary)
					;data is assumed to be "fractional", but right shifted by the caller
					;(the notion here is to shift before FFT, then filter, IFFT, and voila -- data).
  (emit '(.basic-fft))

  ;;get power of two from number of points (assume not less than 4)
  (emit '(CLR B)     '(STORE B Y isign))
  (emit '(LOAD B 4))
  (emit '(LOAD R6 2))
  (emit '(power-loop))
  (emit '(CMP A B))
  (emil '(JGE ok))			;should be "E" not "G", or user must zero memory up to next power of 2
  (emit '(ASL B)      '(UPDATE R6 R+1))	;try next power
  (emil '(JMP power-loop))
  (emit '(HIDE power-loop))
  (emit '(ok LOCAL))
  (emit '(STORE R6 Y ipow))		;(setf ipow (ceiling (/ (log n) (log 2))))

  ;; in-place shuffle for bit-reversed data ordering where R3->data location (need not be mod addr)
  ;; A = number of data points (must be power of 2) -- from the point of view of the Lisp code (below)
  ;; R3=j, R5=i, x and y base address are in R4 and R6.
					;    (dotimes (i n)			;bit reversal section starts here
					;      (when (> j i)
					;	(setf tempr (aref xdata j))	;swap (as complex) data[j] and data[i]
					;	(setf tempi (aref ydata j))
					;	(setf (aref xdata j) (aref xdata i))
					;	(setf (aref ydata j) (aref ydata i))
					;	(setf (aref xdata i) tempr)
					;	(setf (aref ydata i) tempi))
					;      (let ((m (floor n 2)))            ;i.e. 1 in bit reversed arithmetic
					;	(do () 
					;	    ((or (< m 2) (< j m)))
					;	  (decf j m)                     ;here we are "carrying" to the right
					;	  (setf m (floor m 2)))
					;	(incf j m)))
  (emit '(ASR A)      '(STORE A X n))
  (emit '(ASL A)      '(STORE A Y pow))	;(setf pow (floor n 2)) (used later)
  (emit '(STORE R3 X data-addr))
  (emit '(COPY R3 R6))
  (emit '(COPY R3 R4))
  (emit '(LOAD R3 0))			;pretend this is our base address to get bit reversed arithmetic to work
  (emit '(LOAD R5 0))
  (emit '(LOAD N3 Y pow))		;pts/2 (for bit-reversed addressing)
  (emit '(LOAD N5 X n))			;number of points total
  (emit '(LOAD M3 0))			;bit reversed addressing for R3
  (emit '(DO N5 shuffle-done))
  (emit    '(COPY R3 B))
  (emit    '(COPY R5 A))
  (emit    '(CMP B A))			;are we swapping down? (is j>i)
  (emil    '(JLE shuffle-done-1))	;E for case where shuffle is no-op
  (emit    '(COPY R5 N6))
  (emit    '(COPY R3 N4))
  (emit    '(LOAD B L R6 RN))
  (emit    '(LOAD A L R4 RN))
  (emit    '(STORE A L R6 RN))
  (emit    '(STORE B L R4 RN))
  (emit    '(shuffle-done-1 LOCAL))
  (emit    '(UPDATE R3 R+N))
  (emit    '(UPDATE R5 R+1))
  (emit    '(shuffle-done LOCAL))
  (emit '(LOAD M3 #xffffff))
  ;; data is now ready for fft (which will end up with freq data in normal order)

  (emit '(CLR B)   '(LOAD A 2))
  (emit '(ASR A)   '(STORE A X mmax))	;(setf mmax 2)
  (emit '(STORE A X prev))		;(setf prev 1)
  (emit '(LOAD A1 3))			;(setf theta (* 3.14159265 isign))

  (emit '(LOAD R7 isign))

  (emit `(LOAD A0 ,(logand (floor (scale-float (- (coerce pi 'single-float) 3.0) 24)) #xffffff)))
  (emit '(JCLR 23 Y R7 R setf-theta))

  (emit '(NEG A)   '(LOAD B1 6))	;use positive angle (in .sine and .cosine)
  (emit `(LOAD B0 ,(logand (floor (scale-float (- (* 2 (coerce pi 'single-float)) 6.0) 24)) #xffffff)))
  (emit '(setf-theta LOCAL))
  (emit '(STORE A L theta))
  (emit '(LOAD R7 Y ipow))		;use R7 as DO counter (DO Y ipow ... assumes ipow<64)
  (emit '(STORE B L theta-fixup))
  ;; now run the FFT

  (emit '(DO R7 fft-loop))		;(dotimes (lg ipow)...
  (emit     '(ADD B A)  '(LOAD N3 X mmax))
  (emit     '(JSR .sine))
  (emit     '(STORE A Y wpi))		;  (setf wpi (sin theta))
  (emit     '(LOAD A L theta))
  (emit     '(LOAD B L theta-fixup))
  (emit     '(ADD B A)  '(LOAD R0 X data-addr))
  (emit     '(JSR .cosine))		;  (setf wpr (cos theta))
  (emit     '(CLR A)    '(STORE A X wpr))
  (emit     '(STORE A Y wi))		;  (setf wi 0.0)
  (emit     '(LOAD A #x7fffff))
  (emit     '(STORE A X wr))		;  (setf wr 1.0) (fractional)
  ;;  (emit     '(LOAD R0 X data-addr))	;    used as ii in next loop
  ;;  (emit     '(LOAD N3 X mmax))		; R3="i"
  (emit     '(COPY N3 N4))		; R4="j"
  (emit     '(COPY N3 N5))		; R5 also = "i" (saves a command in the inner loop)
  (emit     '(COPY N3 N6))		; R6="j" also
  (emit     '(LOAD N7 X prev))
  (emit     '(LOAD X0 X wr))
  (emit     '(DO N7 loop-1))		;  (dotimes (ii prev)...
  ;;  (emit         '(LOAD X0 X wr))	;    (setf wrs wr) (see below)
  ;;  (emit         '(LOAD Y0 Y wi))	;    (setf wis wi)

  (emit         '(COPY R0 R3))		; set up i and j for loop below
  (emit         '(COPY R0 R5))
  (emit         '(COPY R0 A))
  (emit         '(LOAD B X prev))
  (emit         '(ADD B A)    '(LOAD Y0 Y wi))
  (emit         '(COPY A R4))

  (emit         '(LOAD A X R3 R))	;preload x[i] 
  (emit         '(COPY R4 R6))
  (emit         '(LOAD M7 Y pow))
  (emit         '(LOAD X1 X R4 R))	;preload x[j]
  ;;  (emit         '(LUA R6 R4 R-N))   ;    (see comment below) -- we should also (LOAD B Y R6 R) here
					;	(do* ((jj 0 (+ jj 1))
					;	      (i ii (+ i mmax))
					;	      (j (+ i prev) (+ j mmax)))
					;	    ((>= jj pow))
					;	  (setf tempr (- (* wrs (aref xdata j)) (* wis (aref ydata j))))
					;	  (setf tempi (+ (* wrs (aref ydata j)) (* wis (aref xdata j))))
					;	  (setf (aref xdata j) (- (aref xdata i) tempr))
					;	  (setf (aref ydata j) (- (aref ydata i) tempi))
					;	  (incf (aref xdata i) tempr)
					;	  (incf (aref ydata i) tempi))
  (emit         '(DO M7 inner-loop))
  (emit             '(MAC X0 X1 A)    '(LOAD Y1 Y R4 R))
  (emit             '(SMAC Y0 Y1 A)   '(LOAD B X R3 R))      ;      '(STORE B Y R6 R+N)) (see comment below)
  (emit             '(SUBL A B)       '(STORE A X R3 R)     '(LOAD A Y R5 R))
  (emit             '(MAC X1 Y0 A)    '(STORE B X R4 R+N))
  (emit             '(MAC X0 Y1 A)    '(LOAD B Y R3 R+N)    '(LOAD X1 X R4 R))
  (emit             '(SUBL A B)       '(STORE A Y R5 R+N)   '(LOAD A X R3 R))
  (emit             '(STORE B Y R6 R+N))
  ;; here we add the last instruction to the inner loop, slowing down our FFT by a lot, when on first
  ;; glance we should be able to put it up with the SMAC intruction, then deal with the very last y[j]
  ;; by hand, but that means predecrementing the R6 pointer by "prev" (i.e. prev-mmax = -prev), which
  ;; means we can easily get funny (i.e. negative) indices, and may end up stomping around in Y-IO space.
  ;; I believe the code would still work and not mess anything up, but it just made me feel queasy.
  (emit             '(inner-loop LOCAL))
					;	(setf wtemp wr)
					;	(setf wr (- (* wr wpr) (* wi wpi)))
					;	(setf wi (+ (* wi wpr) (* wtemp wpi))))
  (emit         '(LOAD X1 X wpr))
  (emit         '(MPY X0 X1 A)    '(LOAD Y1 Y wpi))
  (emit         '(SMACR Y0 Y1 A)  '(UPDATE R0 R+1))

  (emit         '(LOAD B fractional-one)) ; this overflow check maybe not needed (1.0 => 0 if not careful -- in A2)
  (emit         '(CMP B A))
  (emit         '(TGT B A))

  (emit         '(MPY X1 Y0 B)    '(STORE A X wr))
  (emit         '(MACR X0 Y1 B)   '(COPY A X0))

  (emit         '(LOAD A fractional-one))
  (emit         '(CMP A B))
  (emit         '(TGT A B))

  (emit         '(STORE B Y wi))
  (emit         '(loop-1 LOCAL))
					;      (setf pow (* pow .5))
					;      (setf prev mmax)
					;      (setf theta (* theta .5))
					;      (setf mmax (* mmax 2)))))
  (emit     '(LOAD B Y pow))
  (emit     '(ASR B)    '(LOAD A L theta))
  (emit     '(ASR A)    '(STORE B Y pow))
  (emit     '(STORE A L theta))
  (emit     '(LOAD B X mmax))
  (emit     '(ASL B)    '(STORE B X prev))
  (emit     '(STORE B X mmax))
  (emit     '(LOAD B L theta-fixup))
  (emit     '(fft-loop LOCAL))
  (emit '(LOAD M7 #xffffff))
  (emit '(RTS)))


(libdecl .fft-window-init 'fft-window-init nil nil)
(defun fft-window-init ()
  (emit '(.fft-window-init))
  (emit (list 'DEFINE 'mult  (get-L-mem)))
  (emit (list 'DEFINE 'angle (get-L-mem)))
  (emit (list 'DEFINE 'sweep (get-L-mem)))
  (emit (list 'DEFINE 'parzen-table-base (get-L-mem 15)))
  (emit '(Y-ORG parzen-table-base))
  ;; 2/(n+1) table for powers of 2 from 0 to 14
  (emit (list 'Y-DATA
	      #x7fffff
	      (make-fraction .66666667)
	      (make-fraction 0.4000000)
	      (make-fraction 0.2222222)
	      (make-fraction 0.1176471)
	      (make-fraction 0.0606061)
	      (make-fraction 0.0307692)
	      (make-fraction 0.0155039)
	      (make-fraction 0.0077821)
	      (make-fraction 0.0038986)
	      (make-fraction 0.0019512)
	      (make-fraction 0.0009761)
	      (make-fraction 0.0004882)
	      (make-fraction 0.0002441)
	      (make-fraction 0.0001221))))

(defun window-name (number)
  (case number
    (0 "Rectangular") (1 "Hanning") (2 "Welch") (3 "Parzen") (4 "Bartlett") (5 "Hamming") (6 "order 0")
    (7 "order 1") (8 "order 2") (9 "order 3") (10 "order 4") (11 "exponential") (12 "Kaiser")))

(libdecl .basic-fft-window 'basic-fft-window-load '(A B X) '(.fft-window-init .cosine))
(defun basic-fft-window-load ()
  (emit '(.basic-fft-window))
  ;; B=window type, A=pts, R3->data
  (emit '(TST B)     '(COPY A N3))
  (emit '(JEQ done))			;0=rectangular (i.e. no change to data)
  (emit '(LUA R6 R3 R+N))		;R6->end of table (all windows are symmetrical around the midpoint)
  (emit '(LSR A)     '(LOAD X0 1))
  (emit '(COPY A N6))			;N6=pts/2
  (emit '(CMP X0 B)  '(LOAD X0 2))
  (emit '(JEQ .hanning-window))		;1=Hanning: cos^2 x (other powers are discussed in the literature)
  (emit '(CMP X0 B)  '(LOAD X0 3))
  (emit '(JEQ .welch-window))		;2=Welch: Parzen squared in a sense
  (emit '(CMP X0 B)  '(LOAD X0 4))
  (emit '(JEQ .parzen-window))		;3=Parzen: triangular with small offset from 0
  (emit '(CMP X0 B)  '(LOAD X0 5))
  (emit '(JEQ .bartlett-window))	;4=Bartlett: triangular (Parzen without the offset)
  (emit '(CMP X0 B)  '(LOAD X0 6))
  (emit '(JEQ .hamming-window))		;5=Hamming: a + (1-a)*cos 2*i*pi/N (a=.54 for now)
  (emit '(CMP X0 B)  '(LOAD X0 7))
  (emit '(JEQ .order0-window))		;6=order0 -- same as rectangular
  (emit '(CMP X0 B)  '(LOAD X0 8))
  (emit '(JEQ .order1-window))		;7=order1 --same as Hamming
  (emit '(CMP X0 B)  '(LOAD X0 9))
  (emit '(JEQ .order2-window))		;8=order2 -- apparently a "Blackman-Harris" window of 2 cosines
  (emit '(CMP X0 B)  '(LOAD X0 10))
  (emit '(JEQ .order3-window))		;9=order3 -- 3 cosines
  (emit '(CMP X0 B)  '(LOAD X0 11))
  (emit '(JEQ .order4-window))		;10=order4 -- 4 cosines
  (emit '(CMP X0 B)  '(LOAD X0 12))
  (emit '(JEQ .exponential-window))	;11=exponential (some day...)
  (emit '(CMP X0 B))
  (emit '(JEQ .kaiser-window))		;12=Kaiser (some day -- too complicated for this first pass)
  (emit '(JMP error))

  (emit '(.bartlett-window LOCAL))
  (emit '(CLR B))			;get rid of type parameter
  (emit '(LSR A)   '(LOAD B0 #x7fffff)) ;get 2/(N-1) as increment of ramp
  (emit '(get-shift))
  (emit '(TST A))
  (emit '(JEQ ok))
  (emit '(LSR A))
  (emit '(ASR B)    '(COPY B0 Y1))
  (emit '(JMP get-shift))
  (emit '(HIDE get-shift))
  (emit '(ok LOCAL))
  (emit '(CLR A)    '(COPY B0 Y0))
  (emit '(MPY Y0 Y1 A))
  (emit '(COPY A1 A0))
  (emit '(LOAD A1 0))
  (emit '(ADD A B))
  (emit '(CLR B)    '(COPY B0 Y0))
  (emit '(inner-bartlett))
  (emit '(DO N6 bartlett))
  (emit     '(COPY B Y1)      '(LOAD X0 X R3 R))
  (emit     '(MPY X0 Y1 A)    '(LOAD X1 X R6 1-R))
  (emit     '(MPY X1 Y1 A)    '(STORE A X R3 R+1))
  (emit     '(ADD Y0 B)       '(STORE A X R6 R))
  (emit     '(bartlett LOCAL))
  (emit '(NOP))
  (emit '(RTS))

  (emit '(.parzen-window LOCAL))	;get (N-1)/(N+1) as offset and increment 
  (emit '(LOAD R0 1))
  (emit '(LSR A)    '(LOAD N0 parzen-table-base))
  (emit '(get-power))
  (emit '(TST A))
  (emit '(JEQ ok))
  (emit '(LSR A)    '(UPDATE R0 R+1))
  (emit '(JMP get-power))
  (emit '(HIDE get-power))
  (emit '(ok LOCAL))
  (emit '(LOAD B Y R0 RN))
  (emit '(COPY B Y0))
  (emit '(JMP inner-bartlett))
  (emit '(HIDE inner-bartlett))

  (emit '(.welch-window LOCAL))
  (emit '(LOAD R0 1))
  (emit '(LSR A)    '(LOAD N0 parzen-table-base))
  (emit '(get-power))
  (emit '(TST A))
  (emit '(JEQ ok))
  (emit '(LSR A)    '(UPDATE R0 R+1))
  (emit '(JMP get-power))
  (emit '(HIDE get-power))
  (emit '(ok LOCAL))
  (emit '(LOAD B Y R0 RN))
  (emit '(COPY B Y1))
  (emit '(DO N6 welch))
  (emit     '(COPY B Y0)      '(LOAD X0 X R3 R))
  (emit     '(TFR B A))
  (emit     '(SMAC Y0 Y0 A))		;triangle squared (in a sense) -- this is slightly different from Recipes code
  (emit     '(ADD B A))
  (emit     '(COPY A Y0))
  (emit     '(MPY X0 Y0 A)    '(LOAD X1 X R6 1-R))
  (emit     '(MPY X1 Y0 A)    '(STORE A X R3 R+1))
  (emit     '(ADD Y1 B)       '(STORE A X R6 R))
  (emit     '(welch LOCAL))
  (emit '(NOP))
  (emit '(RTS))

  (emit '(.get-angle))
  (emit '(LSR A)    '(LOAD B 3))
  (emit `(LOAD B0 ,(logand (floor (scale-float (- (coerce pi 'single-float) 3.0) 24)) #xffffff)))
  (emit '(tst-angle))
  (emit '(TST A))
  (emit '(JEQ ok))
  (emit '(ASR B))
  (emit '(LSR A))
  (emit '(JMP tst-angle))
  (emit '(HIDE tst-angle))
  (emit '(ok LOCAL))
  (emit '(RTS))
  
  (emit '(.hanning-window LOCAL))
  (emit '(JSR .get-angle))
  (emit '(LOAD A1 .5))
  (emit '(LOAD A0 .5))
  (emit '(STORE A L mult))
  (emit '(inner-hanning))
  (emit '(CLR A)    '(STORE B L sweep))
  (emit '(DO N6 hanning))
  (emit     '(STORE A L angle))
  (emit     '(JSR .cosine))		;A B X Y clobbered!
  (emit     '(LOAD X0 X mult))		;A1=factor on cos
  (emit     '(COPY A X1))
  ;; to get higher order Hanning windows, we could MPY X1 X1 B here as often as necessary (well, MPY X0 X0 B...)
  (emit     '(MPY X0 X1 A)    '(LOAD X1 Y mult))
  (emit     '(SUB X1 A)       '(LOAD Y L sweep))		;A0=constant
  (emit     '(NEG A)          '(LOAD X0 X R3 R))
  (emit     '(COPY A X1))
  (emit     '(MPY X0 X1 A)    '(LOAD X0 X R6 1-R))
  (emit     '(MPY X0 X1 B)    '(STORE A X R3 R+1))
  (emit     '(LOAD A L angle))
  (emit     '(ADD Y A)        '(STORE B X R6 R))
  (emit     '(hanning LOCAL))
  (emit '(NOP))
  (emit '(RTS))

  (emit '(.hamming-window LOCAL))
  (emit '(.order1-window LOCAL))
  (emit '(JSR .get-angle))
  (emit '(LOAD A1 .46))			;JOS always uses 1.0 as the constant, so all his factors are scaled to fit
  (emit '(LOAD A0 .54))
  (emit '(STORE A L mult))
  (emit '(JMP inner-hanning))
  (emit '(HIDE inner-hanning))

  ;; cos 2x = 2cos^2 x -1
  ;; cos 3x = 4cos^3 x - 3cos x
  ;; cos 4x = 8cos^4 x - 8cos^2 x +1

  (emit '(.order2-window LOCAL))
  ;; Harris:  .42323 - .49755*cos x + .07922 cos 2x = .34401 - .49755*cos x + .15844*cos^2 x
  ;; JOS:     .41776 - .5*cos x + .082236*cos 2x 
  ;; I'll use the Harris numbers until someone complains...
  (emit '(JSR .get-angle))
  (emit '(CLR A)     '(STORE B L sweep))
  (emit '(DO N6 order2))
  (emit     '(STORE A L angle))
  (emit     '(JSR .cosine))
  (emit     '(COPY A X0))
  (emit     '(LOAD B -.49755))
  (emit     '(LOAD X1 .15844))
  (emit     '(MAC X0 X1 B)    '(LOAD A .34401))
  (emit     '(COPY B X1))
  (emit     '(MAC X0 X1 A)    '(LOAD X1 X R3 R))
  (emit     '(COPY A X0))
  (emit     '(MPY X0 X1 A)    '(LOAD X1 X R6 1-R))
  (emit     '(MPY X0 X1 B)    '(STORE A X R3 R+1))
  (emit     '(LOAD A L angle))
  (emit     '(LOAD Y L sweep))
  (emit     '(ADD Y A)        '(STORE B X R6 R))
  (emit     '(order2 LOCAL))
  (emit '(NOP))
  (emit '(RTS))

  (emit '(.order3-window LOCAL))
  ;; Harris:  .35875 - .48829*cos x + .14128*cos 2x - .01168*cos 3x = .2174699 - .45325*cos x + .28256*cos^2x - .04672*cos^3x
  ;; JOS:     .33388 - .47944*cos x + .16611*cos 2x - .02055*cos 3x    
  (emit '(JSR .get-angle))
  (emit '(CLR A)     '(STORE B L sweep))
  (emit '(DO N6 order3))
  (emit     '(STORE A L angle))
  (emit     '(JSR .cosine))
  (emit     '(COPY A X0))
  (emit     '(LOAD B .28256))
  (emit     '(LOAD X1 -.04672))
  (emit     '(MAC X0 X1 B)    '(LOAD A -.45325))
  (emit     '(COPY B X1))
  (emit     '(MAC X0 X1 A)    '(LOAD B .2174699))
  (emit     '(COPY A X1))
  (emit     '(MAC X0 X1 B)    '(LOAD X1 X R3 R))
  (emit     '(COPY B X0))
  (emit     '(MPY X0 X1 A)    '(LOAD X1 X R6 1-R))
  (emit     '(MPY X0 X1 B)    '(STORE A X R3 R+1))
  (emit     '(LOAD A L angle))
  (emit     '(LOAD Y L sweep))
  (emit     '(ADD Y A)        '(STORE B X R6 R))
  (emit     '(order3 LOCAL))
  (emit '(NOP))
  (emit '(RTS))

  (emit '(.order4-window LOCAL))
  ;; JOS:     .286509 - .448752*cos x + .20784*cos 2x - .051756*cos 3x + .00513456*cos 4x
  ;; but this gives negative window values!! (Am I missing something?)  I'll make a slight tweak...
  ;; JOS/BIL: .287333 - .4471689*cos x + .2084454*cos 2x - .0519053*cos 3x + .00514933*cos 4x
  ;;     = .084037 - .29145*cos x + .375696*cos^2x - .20762*cos^3x + .041194*cos^4x
  (emit '(JSR .get-angle))
  (emit '(CLR A)     '(STORE B L sweep))
  (emit '(DO N6 order4))
  (emit     '(STORE A L angle))
  (emit     '(JSR .cosine))
  (emit     '(COPY A X0))
  (emit     '(LOAD B -.20762))
  (emit     '(LOAD X1 .041194))
  (emit     '(MAC X0 X1 B)    '(LOAD A .375696))
  (emit     '(COPY B X1))
  (emit     '(MAC X0 X1 A)    '(LOAD B -.29145))
  (emit     '(COPY A X1))
  (emit     '(MAC X0 X1 B)    '(LOAD A .084037))
  (emit     '(COPY B X1))
  (emit     '(MAC X0 X1 A)    '(LOAD X1 X R3 R))
  (emit     '(COPY A X0))
  (emit     '(MPY X0 X1 A)    '(LOAD X1 X R6 1-R))
  (emit     '(MPY X0 X1 B)    '(STORE A X R3 R+1))
  (emit     '(LOAD A L angle))
  (emit     '(LOAD Y L sweep))
  (emit     '(ADD Y A)        '(STORE B X R6 R))
  (emit     '(order4 LOCAL))
  (emit '(NOP))
  (emit '(RTS))

  (emit '(.kaiser-window LOCAL))
  ;;(/ (I0 (* pi alpha (sqrt (- 1.0 (sqr (/ i (/ N 2))))))) (I0 (* pi alpha))) 
  ;; also given as (/ (I0 (* beta (sqrt (- 1.0 (sqr (/ (* 2 k) (1- N))))))) (I0 beta))

#|
  ;; here is an untested version of the Kaiser window:

  ;; B=window type, A and N3=pts, R3->data, R6->end of data
  ;; assume also that X has beta

  (emit '(.kaiser-window LOCAL))
  (emit '(STORE X L mult))
  (emit '(STORE A X sweep))
  (emit '(CLR A)         '(STORE R3 Y angle))
  (emit '(CLR B)         '(STORE R6 X angle))
  (emit '(LOAD A L mult))
  (emit '(JSR .I0))			;I0(beta)
  (emit '(TFR A B))
  (emit '(LOAD A 1))			;get 1/I0(beta) as multiplier (avoid divides within loop)
  (emit '(JSR .shift-AB-up))
  (emit '(JSR .basic-divide))
  (emit '(LOAD R3 Y angle))
  (emit '(LOAD R6 X angle))
  (emit '(STORE A Y sweep))		;get it out of harm's way
  (emit '(LOAD A X sweep))
  (emit '(LSR A)   '(LOAD B0 #x7fffff)) ;get 2/(N-1) as increment of ramp
  (emit '(get-shift))
  (emit '(TST A))
  (emit '(JEQ ok))
  (emit '(LSR A))
  (emit '(ASR B)    '(COPY B0 Y1))
  (emit '(JMP get-shift))
  (emit '(HIDE get-shift))
  (emit '(ok LOCAL))
  (emit '(CLR A)    '(COPY B0 Y0))
  (emit '(MPY Y0 Y1 A))
  (emit '(COPY A1 A0))
  (emit '(LOAD A1 0))
  (emit '(ADD A B))
  (emit '(CLR B)    '(COPY B0 Y0))
  (emit '(STORE B L angle))
  (emit '(DO N6 kaiser))
  (emit     '(COPY B Y1))
  (emit     '(JSR .frac-sqrt))		;input in Y1, result in A1
  (emit     '(COPY A1 X1))		;sqrt(1-ramp)
  (emit     '(LOAD A L mult))		;beta
  (emit     '(JSR .real-frac-mpy))	;input in X1, result in A
  (emit     '(JSR .I0))
  (emit     '(LOAD X1 Y sweep))		;1/I0(beta)
  (emit     '(JSR .real-frac-mpy))
  (emit     '(LOAD B L angle))
  (emit     '(COPY A Y1)      '(LOAD X0 X R3 R))
  (emit     '(MPY X0 Y1 A)    '(LOAD X1 X R6 1-R))
  (emit     '(MPY X1 Y1 A)    '(STORE A X R3 R+1))
  (emit     '(ADD Y0 B)       '(STORE A X R6 R))
  (emit     '(STORE B L angle))
  (emit     '(kaiser LOCAL))
  (emit '(NOP))
|#
  (emit '(RTS))

  (emit '(.exponential-window LOCAL))

  (emit '(exponential LOCAL))
  (emit '(RTS))

  ;; another window is Gaussian: (exp (* -.5 (sqr (* alpha (/ i ( N 2))))))
  ;;    (i as above, alpha usually between 2.5 and 3.5)

  (emit '(error LOCAL))
  (emit '(JSR .break))
  (emit '(.order0-window LOCAL))
  (emit '(done LOCAL))
  (emit '(HIDE .get-angle))
  (emit '(RTS)))


(libdecl .basic-spectrum 'basic-spectrum-load '(A B X Y) '(.frac-sqrt))
(defun basic-spectrum-load ()		;(sqrt (sqr x) + (sqr y))
  (emit '(.basic-spectrum))		;pts in A, R3->data
  (emit '(COPY A N3))			;A clobbered in .frac-sqrt, so can't be DO loop counter
  (emit '(DO N3 end-spectrum))
  (emit     '(LOAD B X R3 R))
  (emit     '(ASR B))			;all this shifting is an attempt to avoid overflows in MAC
  (emit     '(COPY B X0))
  (emit     '(MPY X0 X0 A)  '(LOAD B Y R3 R))
  (emit     '(ASR B))
  (emit     '(COPY B X0))
  (emit     '(MACR X0 X0 A))
  (emit     '(COPY A Y1))
  (emit     '(JSR .frac-sqrt))		;uses A B X Y, input in Y1
  (emit     '(ASL A))
  (emit     '(STORE A X R3 R+1))

  ;; here we could get the phase as follows:
  ;; (emit '(LOAD A Y R3 R))
  ;; (emit '(LOAD B X R3 R))
  ;; ;; here XJS uses (emit '(NEG A)) which strikes me as unnecessary
  ;; (emit '(JSR .basic-divide)) ;imag/real (result is a real, not a fraction)
  ;; (emit '(JSR .atan))         ;phase as a real in radians
  ;; (emit '(ASR A))
  ;; (emit '(ASR A))             ;make it storeable in 16 bits
  ;; (emit '(STORE A Y R3 R+1))

  (emit     '(end-spectrum LOCAL))
  (emit '(NOP))
  (emit '(RTS)))


(libdecl .spectrum-init 'spectrum-init nil nil)
(defun spectrum-init ()
  (emit '(.spectrum-init))
  (emit (list 'DEFINE 'sp-pts (get-X-memory)))
  (emit (list 'DEFINE 'sp-typ (get-Y-memory)))
  (emit (list 'DEFINE 'sp-adr (get-X-memory))))

(libdecl .spectrum 'spectrum-load '(A B X Y) '(.spectrum-init .basic-spectrum .basic-fft-window .basic-fft))
(defun spectrum-load ()
  ;; takes ptr to data, size of data, window type
  ;; scales and windows data, does fft on data, returns power spectrum.
  ;; A=pts, R3->data, B=window type
  ;; is it necessary to scale the data here??
  (emit '(.spectrum))
  (emit '(STORE A X sp-pts))
  (emit '(STORE B Y sp-typ))
  (emit '(CLR B)    '(STORE R3 X sp-adr))
  (emit '(COPY A1 B0))
  (emit '(LOAD A 1))
  (emit '(get-multiplier))
  (emit '(TST B))
  (emit '(JEQ ok))
  (emit '(ASR B))
  (emit '(ASR A))
  (emit '(JMP get-multiplier))
  (emit '(HIDE get-multiplier))
  (emit '(ok LOCAL))
  (emit '(COPY R3 R6))
  (emit '(LOAD N6 X sp-pts))
  (emit '(COPY A0 X0))
  (emit '(LOAD X1 X R3 R+1))
  (emit '(DO N6 scale-data))
  (emit     '(MPY X0 X1 B)    '(LOAD X1 X R3 R+1))
  (emit     '(STORE B X R6 R+1))
  (emit     '(NOP))
  (emit     '(scale-data LOCAL))
  (emit '(LOAD A X sp-pts))
  (emit '(LOAD R3 X sp-adr))
  (emit '(LOAD B Y sp-typ))
  (emit '(JSR .basic-fft-window))
  (emit '(LOAD A X sp-pts))
  (emit '(LOAD R3 X sp-adr))
  (emit '(LOAD B 1))			;fft, not ifft
  (emit '(JSR .basic-fft))
  (emit '(LOAD A X sp-pts))
  (emit '(LOAD R3 X sp-adr))
  (emit '(JMP .basic-spectrum)))


(libdecl .fft-window 'fft-window-load '(R2 R3 R5 R6 X Y A B) nil)
;;; windowing can be optimized by passing a table with the fft-data 
(defun fft-window-load ()
  (emit '(.fft-window))
  ;; assume fft real data table header pointed to by R3, window table header pointed to by R5
  (emit '(LOAD R2 X R3 R))
  (emit '(LOAD X0 Y R3 R))
  (emit '(LOAD R6 X R5 R+1))
  (emit '(JCLR 0 Y R5 R x-side))
  (emit   '(DO X0 y-window-it))
  (emit	    '(LOAD X1 X R2 R) '(LOAD Y0 Y R6 R+1))
  (emit     '(MPY Y0 X1 A)) 
  (emit     '(STORE A X R2 R+1)) 
  (emit     '(y-window-it LOCAL))
  (emit   '(NOP))
  (emit   '(RTS))
  (emit '(x-side LOCAL))
  (emit   '(LOAD Y0 X R6 R+1))
  (emit   '(DO X0 x-window-it))
  (emit	    '(LOAD X1 X R2 R)) 
  (emit     '(MPY Y0 X1 A) '(LOAD Y0 X R6 R+1))
  (emit     '(STORE A X R2 R+1))
  (emit     '(x-window-it LOCAL))
  (emit   '(NOP))
  (emit   '(RTS)))


(libdecl .fft-filter-init 'fft-filter-init nil nil)
(defun fft-filter-init ()
  (emit '(.fft-filter-init))
  (emit `(DEFINE ff-base-datawd0 ,(get-L-mem)))
  (emit `(DEFINE ff-ctrs ,(get-L-mem)))
  (emit `(DEFINE ff-rd-addrs ,(get-L-mem)))
  (emit `(DEFINE ff-wrt-addrs ,(get-L-mem)))
  (emit `(DEFINE ff-env-val ,(get-L-mem))))

(libdecl .fft-filter 'fft-filter-load '(A B X Y R0 R2 R3 R4 R5 R6 R7) 
	 '(.fft-filter-init .sine-init .cosine-init .sine .cosine .basic-fft .run-block .readin .random))

;;; run-block data: loc ctr tbl(2)
;;; rdin is 7 and needs ptr to base+2 as well as base
;;; env is 3 + dynamically loaded space (or 2 if table)
;;; we will assume that datar is X side, datai Y side (since .basic-fft wants that), and lined up (i.e. use get-L-mem)

;;; so ff struct is allocated: 
;;;  0:  b [4]  (L:loc L:ctr table-header)
;;;  4:  siz:hop 
;;;  5:  half-siz:fourth-siz 
;;;  6:  data tables header (i.e. header for datar, datai assumed Y) [2]
;;;  8:  rdin block [7]
;;; 15:  env [3]
;;; 18 words of static space + b-blk, datar datai, envelope, and rdin-blk

(defun fft-filter-load ()
  (emit '(.fft-filter))			;R2->fftflt base
  (emit '(LOAD A L R2 R))		;(rblk-loc fftflt-b ff))
  (emit '(TST A)     '(LOAD N2 6))	;loc=0 is sign that we need to process another block of data
  (emit '(JNE get-result))
  (emit '(LUA R4 R2 R+N))		;datar header
  (emit '(STORE R2 X ff-base-datawd0))	;save fftflt base
  (emit '(STORE R4 Y ff-base-datawd0))	;save data array header addr
  (emit '(JSR readin-data))
  ;; .basic-fft wants A=num of pts, B=1 for fft, -1 for ifft, R3->data
  (emit '(LOAD R2 X ff-base-datawd0))	;just in case rdin forgets to restore it
  (emit '(LOAD R4 Y ff-base-datawd0))
  (emit '(LOAD B 1))
  (emit '(LOAD N2 4))
  (emit '(LOAD R3 X R4 R))		;datar[0]
  (emit '(LOAD A X R2 RN))		;(fftflt-siz ff)
  (emit '(JSR .basic-fft))
  (emit '(JSR apply-envelope-and-shift))
  (emit '(LOAD R2 X ff-base-datawd0))	;just in case apply-envelope forgets to restore it
  (emit '(LOAD R4 Y ff-base-datawd0))	;data array header addr
  (emit '(LOAD N2 4))
  (emit '(LOAD B -1))			;for inverse fft
  (emit '(LOAD R3 X R4 R))		;datar[0]
  (emit '(LOAD A X R2 RN))		;(fftflt-siz ff)
  (emit '(JSR .basic-fft))
  (emit '(LOAD R2 X ff-base-datawd0))
  (emit '(LOAD N2 4))
  (emit '(COPY R2 R3))
  (emit '(LOAD N3 5))
  (emit '(LOAD B Y R2 RN))		;(fftflt-hop ff)
  (emit '(TST B)    '(LOAD A X R3 RN))	;(fftflt-half-siz ff)
  (emit '(JLT first-time))
  (emit '(JSR readout-data))
  (emit '(CLR B)    '(LOAD R2 X ff-base-datawd0))
  (emit '(LOAD N2 4))
  (emit '(LOAD N3 1))
  (emit '(COPY R2 R3))
  (emit '(LOAD B Y R2 RN))
  (emit '(STORE B L R3 RN))		;ctr (as "real") <- hop (short int)
  (emit '(JMP get-result))
  (emit '(first-time LOCAL))
  (emit '(CLR B)    '(STORE A Y R2 RN)) ;hop <- half-siz
  (emit '(LOAD N2 1))
  (emit '(LOAD B Y R3 RN))		;fourth-size
  (emit '(STORE B L R2 RN))		;ctr (as "real") <- fourth size (normal integer)
  (emit '(JSR readout-first-data))
  (emit '(LOAD R2 X ff-base-datawd0))	;just to be safe...
  (emit '(get-result LOCAL))
  (emit '(JSR .run-block))		;fractional result in A
  (emit '(RTS))

  (emit '(readin-data))
  (emit '(LOAD R3 Y ff-base-datawd0))	;data array header
  (emit '(LOAD N2 5))
  (emit '(LOAD R4 X R3 R))		;datar[0]
  (emit '(CLR A)     '(LOAD B Y R3 R))	;B=data size, A=0 to clear entire data array
  (emit '(REP B))
  (emit   '(STORE A L R4 R+1))
  (emit '(STORE A X ff-ctrs))		;loop ctr <- 0
  (emit '(LOAD R4 X R3 R))		;datar[0]
  (emit '(LOAD N4 Y R2 RN))		;fftflt-siz / 4
  (emit '(LOAD A X R2 RN))		;(fftflt-half-siz ff)
  (emit '(LOAD N2 17))
  (emit '(UPDATE R4 R+N))
  (emit '(LOAD X1 Y R2 RN))		;scaler (1/fft-size)
  (emit '(LOAD N2 8))
  (emit '(STORE A Y ff-ctrs))		;loop limit <- half-siz
  (emit '(UPDATE R2 R+N))		;R2 -> rdin block
  (emit '(LOAD N2 2))
  (emit '(STORE R4 X ff-wrt-addrs))	;data[siz/4]
  (emit '(STORE X1 Y ff-wrt-addrs))
  (emit '(LUA R4 R2 R+N))		;R4 -> 3rd word of rdin block
  (emit '(STORE R2 X ff-rd-addrs))
  (emit '(STORE R4 Y ff-rd-addrs))
  (emit '(readin-loop))
  (emit   '(LOAD R2 X ff-rd-addrs))
  (emit   '(LOAD R4 Y ff-rd-addrs))

  (emit '(LOAD A L R4 R))
  (emit '(TST A))
  (emit '(JNE not-ran))
  (emit '(JSR .random))
  (emit '(COPY A0 X0))
  (emit '(JMP got-x0))
  (emit '(not-ran LOCAL))

  (emit   '(JSR .readin))		;result in X0

  (emit '(got-x0 LOCAL))

  (emit   '(LOAD X1 Y ff-wrt-addrs))
  (emit   '(MPY X0 X1 A)   '(LOAD R4 X ff-wrt-addrs))
  (emit   '(STORE A X R4 R+1))
  (emit   '(LOAD X0 1))
  (emit   '(STORE R4 X ff-wrt-addrs))
  (emit   '(LOAD B X ff-ctrs))
  (emit   '(ADD X0 B)      '(LOAD A Y ff-ctrs)) ;limit
  (emit   '(CMP B A)       '(STORE B X ff-ctrs))
  (emit   '(JGT readin-loop))
  (emit '(HIDE readin-loop readin-data))
  (emit '(RTS))

  (emit '(apply-envelope-and-shift))
  ;; X[17] = 0 if table, 1 if envelope (Y[17] = scaler 1/N), 2 if convolution desired
  (emit '(LOAD R2 X ff-base-datawd0))

  (emit '(LOAD R6 Y ff-base-datawd0))
  (emit '(LOAD N2 15))
  (emit '(LOAD R0 X R6 R))		;datar[0]
  (emit '(LUA R7 R2 R+N))		;filtr table header base
  (emit '(LOAD N2 4))
  (emit '(LOAD R3 X R7 R+1))		;r3->filtr[0], r7->second word of header
  (emit '(COPY R0 R4))
  (emit '(CLR A)     '(LOAD N0 X R2 RN))
  (emit '(STORE A L ff-ctrs))
  (emit '(LUA R6 R0 R+N))
  (emit '(LOAD N2 5))
  (emit '(UPDATE R6 R-1))
  (emit '(LOAD A X R2 RN))		;half-siz
  (emit '(COPY R6 R5))
  (emit '(LOAD N2 17))
  (emit '(STORE A Y ff-ctrs))
  (emit '(JSET 1 X R2 RN convolution))
  (emit '(JCLR 0 X R2 RN table-value))

  (emit   '(LOAD N7 1))
  (emit   '(LOAD N3 1))
  (emit   '(LOAD R3 X R7 R-1))		;X:home+1 = ptr into env data
  (emit   '(LOAD B L R7 R))		;initial envelope value
  (emit   '(STORE B L ff-env-val))

  ;; need datar[0..half] and datar[end..half (by -1)]
  ;; and datai for both -- get env val, then mpy on all four, and store
  ;; let R0->datar[low], R6->datar[high]
  ;;     R4->datai[low], R5->datai[high]
  ;;     R7->env base,   R3->env data
  ;;     X0, X1, Y1 be data, Y0 be env val, B current incremental env-val
  ;; for env home(L)=initial value, home+1(X)=ptr to data list

  (emit '(env-loop))

  (emit '(JCLR 0 X R2 RN table-value))

  (emit   '(LOAD B X ff-ctrs))		;first check whether we are at right place in env data list
  (emit   '(LOAD X0 Y R3 R))		;pass counter of next rate (long-int originally)
  (emit   '(CMP X0 B)      '(LOAD A L R3 RN))
  (emil   '(JLT ptr-ok))		;if B>A, current rate is correct 
  (emit     '(UPDATE R3 R+1))		;otherwise update ptr
  (emit     '(UPDATE R3 R+1))
  (emit   '(ptr-ok LOCAL))
  (emit   '(LOAD B L ff-env-val))
  (emit   '(ADD A B)       '(LOAD X0 X R0 R))
  (emit   '(LOAD Y0 1))
  (emit   '(CMP Y0 B)      '(STORE B L ff-env-val))
  (emit   '(JGT fixup))
  (emit   '(ASR B))       
  (emit   '(COPY B0 Y1))
  ;; here we assume the envelope values are positive fractions

  (emit   '(JMP scale-data))
  (emit '(table-value LOCAL))
  (emit   '(LOAD X0 X R0 R))
  (emit   '(JCLR 0 Y R7 R x-side))
  (emit     '(LOAD Y1 Y R3 R+1))
  (emit     '(JMP scale-data))
  (emit     '(x-side LOCAL))
  (emit     '(LOAD Y1 X R3 R+1))
  (emit   '(scale-data LOCAL))

  (emit   '(MPY X0 Y1 B)   '(LOAD X0 X R6 R))          ;datar is X side (R0 and R6)
  (emit   '(MPY X0 Y1 B)   '(STORE B X R0 R+1))
  (emit   '(STORE B X R6 R-1))
  (emit   '(LOAD X0 Y R4 R))                           ;datai is Y side (R4 and R5)
  (emit   '(MPY X0 Y1 B)   '(LOAD X0 Y R5 R))    
  (emit   '(MPY X0 Y1 B)   '(STORE B Y R4 R+1))
  (emit   '(STORE B Y R5 R-1))
  (emit   '(do-ctr))
  (emit   '(LOAD X0 1))
  (emit   '(LOAD A X ff-ctrs))
  (emit   '(ADD X0 A)      '(LOAD B Y ff-ctrs))
  (emit   '(CMP A B)       '(STORE A X ff-ctrs))
  (emit   '(JGT env-loop))
  (emit   '(HIDE env-loop apply-envelope-and-shift))
  (emit '(RTS))
  (emit '(fixup LOCAL))
  (emit '(UPDATE R0 R+1))
  (emit '(UPDATE R6 R-1))
  (emit '(UPDATE R5 R-1))
  (emit '(UPDATE R4 R+1))
  (emit '(JMP do-ctr))

  (emit '(convolution))

  ;;addr+17 = table header where filtr is X side, filti is Y side
  ;;do a complex multiply of these with datar and datai, returning results in datar and datai
  ;;do entire loop here to keep things somewhat simple
  ;;when we arrive here, Y ff-ctrs=limit of loop (midpoint), X=0
  ;; let R0->datar[low], R6->datar[high]
  ;;     R4->datai[low], R5->datai[high]
  ;;     R7->filtr base (wd2), R3->filtr data

  ;;complex multiply is X[R0]*X[R3] - Y[R4]*Y[R3] => temp
  ;;                    X[R0]*Y[R3] + X[R3]*Y[R4] => Y[R4] (R4<-R4+1)
  ;;                    temp => X[R0] (R0<-R0+1)
  ;;                    X[R6]*X[R3] - Y[R5]*Y[R3] => temp
  ;;                    X[R6]*Y[R3] + X[R3]*Y[R5] => Y[R5] (R5<-R5-1)
  ;;                    temp => X[R6] (R6<-R6-1, R3<-R3+1)

  (emit   '(LOAD X0 X R0 R) '(LOAD Y0 Y R4 R))

  (emit   '(LOAD X1 Y R7 R))
  (emit   '(MPY X0 X1 A))
  (emit   '(ASR A)          '(LOAD Y1 Y R3 R))
  (emit   '(MPY X1 Y0 B)    '(COPY A0 X0))
  (emit   '(ASR B)          '(LOAD X1 X R3 R))

  (emit   '(MPY X0 X1 A)    '(COPY B0 Y0))
  (emit   '(SMAC Y0 Y1 A))
  (emit   '(MPY X0 Y1 A)    '(STORE A X R0 R+1)   '(LOAD Y1 Y R7 R))
  (emit   '(MAC X1 Y0 A)    '(LOAD X0 X R6 R))

  (emit   '(MPY X0 Y1 B)    '(STORE A Y R4 R+1))
  (emit   '(ASR B))
  (emit   '(COPY B0 X0))

  (emit   '(MPY X0 X1 A)    '(LOAD Y0 Y R5 R))

  (emit   '(MPY Y0 Y1 B))
  (emit   '(ASR B)          '(LOAD Y1 Y R3 R+1))
  (emit   '(COPY B0 Y0))

  (emit   '(SMAC Y0 Y1 A)   '(LOAD B X ff-ctrs))
  (emit   '(MPY X0 Y1 A)    '(STORE A X R6 R-1))
  (emit   '(MAC X1 Y0 A)    '(LOAD X0 1))
  (emit   '(STORE A Y R5 R-1))
  (emit   '(ADD X0 B)       '(LOAD A Y ff-ctrs))
  (emit   '(CMP B A)        '(STORE B X ff-ctrs))
  (emit   '(JGT convolution))
  (emit   '(HIDE convolution))
  (emit '(RTS))

  (emit '(readout-first-data))
  (emit '(LOAD R2 X ff-base-datawd0))	;almost certainly redundant, but...
  (emit '(LOAD N2 4))
  (emit '(LOAD R3 Y ff-base-datawd0))
  (emit '(LOAD A X R2 RN))		;fftflt-siz
  (emit '(LOAD N2 5))
  (emit '(LOAD N5 Y R2 RN))		;fourth-size
  (emit '(LOAD B Y R2 RN))
  (emit '(SUB B A)    '(LOAD N2 2))
  (emit '(LOAD R5 X R3 R))		;datar[0]
  (emit '(LOAD R3 X R2 RN))		;rblk-b[0]
  (emit '(LOAD N2 3))
  (emit '(UPDATE R5 R+N))		;datar[siz/4]
  (emit '(JCLR 0 Y R2 RN x-side))
  (emit '(DO A fill-y-buf))
  (emit   '(LOAD X0 X R5 R+1))
  (emit   '(STORE X0 Y R3 R+1))
  (emit   '(NOP))
  (emit   '(fill-y-buf LOCAL))
  (emit '(NOP))
  (emit '(RTS))
  (emit '(x-side LOCAL))
  (emit '(DO A fill-x-buf))
  (emit   '(LOAD X0 X R5 R+1))
  (emit   '(STORE X0 X R3 R+1))
  (emit   '(NOP))
  (emit   '(fill-x-buf LOCAL))
  (emit '(NOP))
  (emit '(HIDE readout-first-data))
  (emit '(RTS))

  (emit '(readout-data))
  (emit '(LOAD R2 X ff-base-datawd0))	;almost certainly redundant, but...
  (emit '(LOAD N2 4))
  (emit '(LOAD R3 Y ff-base-datawd0))
  (emit '(LOAD R7 Y R2 RN))		;fftflt-hop
  (emit '(LOAD N0 Y R2 RN))		;R0 -> buf[j]
  (emit '(COPY N0 N4))			;R4 -> data[j]
  (emit '(LOAD N2 2))
  (emit '(LOAD R4 X R3 R))
  (emit '(LOAD R0 X R2 RN))
  (emit '(LOAD N2 3))
  (emit '(COPY R4 R5))			;R5 -> data[i]
  (emit '(COPY R0 R3))			;R3 -> buf[i]
  (emit '(UPDATE R4 R+N))
  (emit '(UPDATE R0 R+N))
  (emit '(JCLR 0 Y R2 RN x-side))
  (emit   '(LOAD X0 X R5 R+1))
  (emit   '(DO R7 fill-y-blk))
  (emit                    '(LOAD A X R4 R+1)  '(LOAD B Y R3 R))
  (emit     '(ADD X0 B)    '(STORE A Y R0 R+1))
  (emit                    '(LOAD X0 X R5 R+1) '(STORE B Y R3 R+1))
  (emit     '(fill-y-blk LOCAL))
  (emit   '(NOP))
  (emit   '(RTS))
  (emit  '(x-side LOCAL))
  ;; since both arrays are in X memory, code is less compact
  (emit  '(DO R7 fill-x-blk))
  (emit    '(LOAD X0 X R5 R+1))
  (emit    '(LOAD B X R3 R))
  (emit    '(ADD X0 B)    '(LOAD X0 X R4 R+1))
  (emit    '(STORE B X R3 R+1))
  (emit    '(STORE X0 X R0 R+1))
  (emit    '(fill-x-blk LOCAL))
  (emit   '(NOP))
  (emit   '(HIDE readout-data))
  (emit   '(RTS)))


(libdecl .convolve 'convolve-load '(A B X Y R0 R2 R3 R4 R5 R6 R7) '(.fft-filter))
(defun convolve-load ()
  (emit '(.convolve))			;nearly the same as fft-filter (and piggy backs on it mostly)
  (emit '(JCLR 2 X R2 RN .fft-filter))
  (emit '(LOAD N2 8))
  (emit '(NOP))
  (emit '(UPDATE R2 R+N))
  (emit '(JSR .readin))
  (emit '(COPY X0 A))			;compiler expects fractional result in A (.run-block)
  (emit '(RTS)))



;;; LOADER (kinda brute force, but easier than the "right" thing, which we don't need anyway)

(defun not-loaded (name)
  (multiple-value-bind 
      (val exists) (gethash name labels)
    (or (not exists)
	(eq (car val) 'UNDEFINED))))

(defun complete-union (lst)
  ;; lst is a list of library routines that need to be loaded, each routine may require others, and so on.
  (if lst
      (let ((complete-list nil))
	(loop while lst do
	  (let* ((cur-func (pop lst))
		 (cur-needs (libinfo-needs (eval cur-func))))
	    (if (not (member cur-func complete-list))
		(setf complete-list (union (adjoin cur-func complete-list) (complete-union cur-needs))))))
	complete-list)))
    
(defun load-library-routines (load-list &optional already-inited)
  ;; LOAD-LIST is a list of library routines required by the user's code, but not in any order.
  (let ((true-load-list (complete-union load-list))
	(init-list (set-difference
		    '(#+QP .QP-initialize-DRAM 
		      .sine-init .cosine-init
		      .oscil-init .log-init .expt-init .sambox-random-init .sw-init .atan-init
		      .random .fft-window-init
		      .real-mod-init .real-rem-init .lcm-init .array-dot-product-init
		      .wave-train-init .tan-init .atan2-init .I0-init .src-init
		      .fft-init .spectrum-init .fft-filter-init
		      )
		    already-inited)))
    (loop while init-list do
      (let ((cur-func (pop init-list)))
	(if (and (member cur-func true-load-list)
		 (not-loaded cur-func))
	    (funcall (libinfo-loads (eval cur-func))))))
    (loop while true-load-list do
      (let ((cur-func (pop true-load-list)))
	(if (not-loaded cur-func)
	    (funcall (libinfo-loads (eval cur-func))))))))


;;;
;;; -------------------- 56000 DEBUGGER ------------------------------
;;; 
;;; uses emit-prog to show where we are in the current program -- can read/write anywhere on chip or in memory,
;;; display values in various formats, keeps current-display list (like RAID), and so on.  Until DSP-GO (which
;;; tells the program to continue), we are in a loop executing arbitrary instructions (using the assembler in
;;; dsp56.lisp).  Break points can be cleared, and a running program can be forced into the debugger by calling
;;; dsp-force-break.  Various decoding procedures are provided wherever individual bits are used as flags.

(defun print-hash-entry (a b) (format t "~S:~S~%" a b))
(defun show-dsp-names () (maphash 'print-hash-entry names))     (defun print-dsp-names () (show-dsp-names))
(defun show-dsp-labels () (maphash 'print-hash-entry labels))   (defun print-dsp-labels () (show-dsp-labels))
(defun show-compiler-names () (maphash 'print-hash-entry vars)) (defun print-compiler-names () (show-compiler-names))

(defun report-bit (i n)
  (if (logbitp n i) "on" "off"))

(defun dsp-error (i)			;this from NeXT -- usr/include/dsp/dsperrno.h or something
  (case (- i 7000)			;we really should not see these codes, but maybe the Mach error will appear
    (11 "miscellaneous error")
    (13 "communications error")
    (14 "DSP ram broken")
    (15 "DSP not responding")
    (21 "user error?")
    (23 "apparently hardware trouble")
    (26 "Mach system error")
    (28 "DSP aborted")
    (t "unknown DSP error")))

(defun decode-ICR (i)
  (if (> i 255)
      (format nil "error: ~A" (dsp-error i))
    (format nil "RREQ: ~A, TREQ: ~A, HF0 and HF1 flags: ~A, mode: ~A, INIT: ~A"
	    (report-bit i 0) 
	    (report-bit i 1) 
	    (logand (ash i -3) 3) 
	    (case (logand (ash i -5) 3) 
	      (0 "interrupt mode (DMA off)")
	      (1 "24-bit DMA")
	      (2 "16-bit-DMA")
	      (3 "8-bit DMA"))
	    (report-bit i 7))))

(defun decode-CVR (i)
  (if (> i 255)
      (format nil "error: ~A" (dsp-error i))
    (format nil "Host vector: ~X, HC: ~X"
	    (logand i #x1F) (report-bit i 7))))

(defun decode-ISR (i)
  (if (> i 255)
      (format nil "error: ~A" (dsp-error i))
    (format nil "RXDF: ~A, TXDE: ~A, TRDY: ~A, HF2 and HF3 flags: ~A, DMA: ~A, HREQ: ~A"
	    (report-bit i 0)
	    (report-bit i 1)
	    (report-bit i 2)
	    (logand (ash i -3) 3)
	    (report-bit i 6)
	    (report-bit i 7))))

(defun HI-status ()
  (if (= 0 (dsp-check-host-interface))	;checkHostInterface in next56.c
      (if (or (/= 0 (dsp-is-open)) 
	      #+QP (/= 0 (qp-is-open))
	      )
	  (let ((nICR (dsp-read-ICR))
		(nCVR (dsp-read-CVR))
		(nISR (dsp-read-ISR)))
	    (format nil "~%ICR: #x~X~8T(~A)~%CVR: #x~X~8T(~A)~%ISR: #x~X~8T(~A)~%"
		    nICR (decode-ICR nICR)
		    nCVR (decode-CVR nCVR)
		    nISR (decode-ISR nISR)))
	(format nil "DSP is closed!"))
    (format nil "no hostInterface!")))

(defun decode-SR (sr)
  (if (= sr -1)
      (format nil "communication trouble -- SR=-1")
    (format nil "MR: LOOP FLAG: ~A, TRACE: ~A, SCALING MODE: ~A, INTERRUPT MASK: ~A,~%~
               CCR: ~A~A ~A ~A ~A ~A ~A ~A"
	    (report-bit sr 15)
	    (report-bit sr 13)
	    (logand (ash sr -10) 3)
	    (logand (ash sr -8) 3)
	    (if (logbitp 7 sr) "auto-scale large store set " "") 
	    ;; this bit doesn't work on ccrma machines, but is documented in Motorola's FFT discussion.
	    (if (logbitp 6 sr) "limit" "")
	    (if (logbitp 5 sr) "extension" "")
	    (if (logbitp 4 sr) "unnormalized" "")
	    (if (logbitp 3 sr) "negative" "")
	    (if (logbitp 2 sr) "zero" "")
	    (if (logbitp 1 sr) "overflow" "")
	    (if (logbitp 0 sr) "carry" ""))))

(defun decode-OMR (omr)
  (if (= omr -1)
      (format nil "communication trouble -- OMR=-1")
    (format nil "OMR: EA: ~A, SD: ~A, DE: ~A, Mode: ~A"
	    (report-bit omr 7)
	    (report-bit omr 6)
	    (report-bit omr 2)
	    (logand omr 3))))
  
(defun decode-SP (sp)
  (if (= -1 sp)
      (format nil "communication trouble -- SP=-1")
    (format nil "SP: ~A ~A ~A"
	    (logand sp 15)
	    (if (logbitp 4 sp) "stack error" "")
	    (if (logbitp 5 sp) "underflow" ""))))


(defun assemble (op1 &optional op2 op3)
  (if (not (gethash 'm-htx names))	;are main opsys names defined?
      (progn
	(initialize-monitor-names)
	#+QP (QP-initialize-names)
	))
  (setf pipe-1 -1)			;previous pipeline instruction class
  (setf pipe-2 -1)			;ditto but two back
  (setf pipe-3 -1)			;ditto but 3 back (need that much at end of DO loops)
  (setf last-reg nil)  
  (setf after-Do-reg nil)

  (setf dsp-pc 0)
  (edit-dsp-program 0 0)
  (edit-dsp-program 1 0)

  (if op2
      (emit op1 op2 op3)
    (if op3
	(progn
	  (emit op1)
	  (emit op3))
      (emit op1)))
  (values (get-dsp-program 0) (get-dsp-program 1) dsp-pc))

(defun check-HI ()
  (if (and (or (/= 0 (dsp-is-open)) 
	       #+QP (/= 0 (qp-is-open))
	       )
	   (/= -1 (dsp-check-host-interface)))
      (let ((ISR (dsp-read-isr)))
	(when (logbitp 0 ISR)		;RXDF bit on -- dsp has something it wants us to see
	  (print (format nil "DSP says: ~D" (dsp-get-one-word)))
	  (setf ISR (dsp-read-isr))
	  (when (logbitp 0 ISR)
	    (if (y-or-n-p "DSP has still more to say -- read it?")
		(loop while (logbitp 0 ISR) do
		  (print (dsp-get-one-word))))))
	(if (logbitp 7 (dsp-read-cvr))
	    (print "DSP is stopped -- host command stuck")
	  (if (and (logbitp 1 ISR) (not (logbitp 2 ISR)))
	      (print "DSP is wedged -- TXDF is on, but TRDY is off"))))
    (error "dsp is apparently closed or in use by some other process")))

(defun check-undef-lab (op)
  (or (not (eq 'JSR (car op)))
      (not (eq 'UNDEFINED (car (gethash (cadr op) labels))))))

(defun dsp-execute (op1 &optional op2 op3)
  (multiple-value-bind 
      (wd1 wd2) (assemble op1 op2 op3)
    (check-HI)
    (if (check-undef-lab op1)
	(progn
	  (if (= -1 (dsp-put-one-word wd1)) (error "cannot put op-1"))
	  (if (= -1 (dsp-put-one-word wd2)) (error "cannot put op-2")))
      (format nil "undefined label: ~A" (cadr op1)))))


(defun dsp-clear-HF3 ()
  (dsp-execute '(BCLR M-HF3 X-IO M-HCR)))

(defun dsp-clear-HF2 ()
  (dsp-execute '(BCLR M-HF2 X-IO M-HCR)))

(defun dsp-go ()			;exit debugger loop and continue program execution
  (dsp-clear-HF3)
  (dsp-execute '(RTS)))

(defun dsp-step ()			;single step once
  (dsp-execute '(RTI)))

(defun dsp-run ()			;turn off single-stepping and run
  (dsp-execute '(ANDI #xDF MR))		;these 2 ops cannot be combined because ANDI needs time to settle
  (dsp-clear-HF3)
  (dsp-execute '(RTI)))

(defun dsp-single-step ()		;turn on single-stepping (use dsp-run or dsp-step, not dsp-go from here)
  (dsp-execute '(ORI #x20 MR) nil '(RTS)))

(defun dsp-go-on ()
  (dsp-go)
  #+Excl (top-level:do-command "continue" 0)
  )					;get us out of lisp debugger too

(defun dsp-reg (reg)			;report current contents (bits) of REG
  (if (or (eq reg 'SSH) (eq reg 'PC))	;.break stored it for us in P DE-SP (out of harm's way)
      (dsp-execute '(MOVE P DE-PC X-IO M-HTX))
    (if (eq reg 'SSL)
	(dsp-execute '(MOVE P DE-SR X-IO M-HTX))
      (dsp-execute `(STORE ,reg X-IO M-HTX))))
  (dsp-get-one-word))			;and return REG's value

(defun dspSR () (decode-sr (dsp-reg 'SR)))
(defun dspSP () (decode-sp (dsp-reg 'SP)))
(defun dspOMR () (decode-omr (dsp-reg 'OMR)))
(defun dspICR () (decode-icr (dsp-read-icr)))
(defun dspISR () (decode-isr (dsp-read-isr)))
(defun dspCVR () (decode-cvr (dsp-read-cvr)))
	
(defun bit8 (i) (if (> i 127) (- i 256) i))
(defun bit16 (i) (if (>= i (expt 2 15)) (- i (expt 2 16)) i))
(defun bit24 (i) (if (>= i (expt 2 23)) (- i (expt 2 24)) i))
(defun frac24 (i) (scale-float (float (bit24 i)) -23))
(defun real56 (a2 a1 a0) (declare (ignore a2)) (float (+ (bit24 a1) (/ a0 (expt 2 24)))))

(defun bit48 (int frac)
  (let ((num (+ (ash int 24) frac)))
    (if (>= num (expt 2 47)) (- num (expt 2 48)) num)))

(defun dsp-mem (mem loc &optional (typ 'real) (mistake nil))
  (if (or mistake (integerp typ))
      (dsp-memory mem loc typ (if mistake mistake 'integer))
    (if (not (eq mem 'L))
	(progn
	  (if (or (eq mem 'Y-IO) (eq mem 'X-IO))
	      (progn
		(dsp-execute `(LOAD A ,mem ,loc))
		(dsp-execute `(STORE A X-IO M-HTX)))
	    (dsp-execute `(MOVE ,mem ,loc X-IO M-HTX)))
	  (if (eq typ 'fraction)
	      (frac24 (dsp-get-one-word))
	    (dsp-get-one-word)))
      (let ((int 0) (frac 0))
	(dsp-execute `(MOVE X ,loc X-IO M-HTX))
	(setf int (dsp-get-one-word))
	(dsp-execute `(MOVE Y ,loc X-IO M-HTX))
	(setf frac (dsp-get-one-word))
	(if (eq typ 'real) 
	    (real56 0 int frac)
	  (bit48 int frac))))))

(defun dsp-set-reg (reg val)
  (dsp-execute `(LOAD ,reg ,val)))

(defun dsp-get-patch-location ()
  (dsp-mem 'Y 0))

(defun dsp-alu ()
  (list (dsp-reg 'A2) (dsp-reg 'A1) (dsp-reg 'A0)
	(dsp-reg 'B2) (dsp-reg 'B1) (dsp-reg 'B0)
	(dsp-reg 'X1) (dsp-reg 'X0) (dsp-reg 'Y1) (dsp-reg 'Y0)))

(defun dsp-ctrl ()
  (list (dsp-reg 'SR) (dsp-reg 'OMR) (dsp-reg 'SP)
	(dsp-reg 'SSH) (dsp-reg 'SSL) (dsp-reg 'LA) (dsp-reg 'LC)))

(defun R-reg-name (i) (nth i '(R0 R1 R2 R3 R4 R5 R6 R7)))
(defun N-reg-name (i) (nth i '(N0 N1 N2 N3 N4 N5 N6 N7)))
(defun M-reg-name (i) (nth i '(M0 M1 M2 M3 M4 M5 M6 M7)))

(defun dsp-reg-16 (i) (bit16 (dsp-reg i)))

(defun dsp-agu ()
  (loop for i from 0 to 7 collect 
    (list (dsp-reg (R-reg-name i))
	  (dsp-reg (N-reg-name i))
	  (dsp-reg-16 (M-reg-name i)))))

(defun dsp-dpy-AB (name a2 a1 a0)
  (format nil "~%~S:~4T~X~12T~X~20T~X~%~4T~D~12T~D~20T~D~%~4T~12T~6F~20T~6F~%~4T~F"
	  name a2 a1 a0
	  (bit8 (logand a2 #xff)) (bit24 a1) (bit24 a0)
	  (frac24 a1) (frac24 a0) 
	  (real56 a2 a1 a0)))
	  
(defun dspA () (dsp-dpy-AB 'A (dsp-reg 'A2) (dsp-reg 'A1) (dsp-reg 'A0)))
(defun dspB () (dsp-dpy-AB 'B (dsp-reg 'B2) (dsp-reg 'B1) (dsp-reg 'B0)))

(defun dsp-dpy-XY (name x1 x0)
  (format nil "~%~S:~4T~X~12T~X~%~4T~D~12T~D~%~4T~6F~12T~6F~%~4T~F"
	  name x1 x0 (bit24 x1) (bit24 x0) (frac24 x1) (frac24 x0) (real56 0 x1 x0)))

(defun dspX () (dsp-dpy-XY 'X (dsp-reg 'X1) (dsp-reg 'X0)))
(defun dspY () (dsp-dpy-XY 'Y (dsp-reg 'Y1) (dsp-reg 'Y0)))

(defun dsp-dpy-R (name r0)
  (format nil "~%~S:~4T~X~12T~D"
	  name r0 (bit24 r0)))

(defun combine-lists (l1 l2)
  (loop for i in l1 and j in l2 collect i collect j))

(defun dsp-dpy-all ()
  (let ((ALU (dsp-alu))
	(AGU (dsp-agu))
	(CTL (dsp-ctrl)))
  (format nil "~{~S: ~D   ~}~%~{~S: ~S   ~}~%~{~S: ~D   ~}"
	  (combine-lists '(A2 A1 A0 B2 B1 B0 X1 X0 Y1 Y0) ALU)
	  (combine-lists '(R0 R1 R2 R3 R4 R5 R6 R7) AGU)
	  (combine-lists '(SR OMR SP SSH SSL LA LC) CTL))))

(defun dsp-memory (typ beg &optional (end nil) (d-typ 'integer))
  (if end
      (loop for i from beg to end collect 
	(dsp-mem typ i d-typ))
    (dsp-mem typ beg d-typ)))
    
(defun show-emit-prog (pc)
  (loop for i in emit-prog do
    (if (< (abs (- pc (car i))) 3) 
	(progn 
	  (if (and
	       (= pc (car i))
	       (true-op (caadr i)))	;try to point just at the actual operation, not surrounding persiflage
	      (princ " -->")
	    (princ "    "))
	  (princ (cdr i))
	  (terpri)))))

(defvar cur-max-pc 0)
(defvar cur-pc-target 0)
(defvar cur-label nil)

(defun closest-label (a b)
  (if (and (<= (car b) cur-pc-target)
	   (> (car b) cur-max-pc))
      (progn
	(setf cur-max-pc (car b))
	(setf cur-label a))))

(defun find-closest-label (pc)
  (setf cur-pc-target pc)
  (setf cur-max-pc 0)
  (setf cur-label nil)
  (maphash 'closest-label labels)
  cur-label)

(defvar display-list nil)
(defun dsp+ (reg) (push reg display-list))
(defun dsp- (reg) (setf display-list (remove reg display-list)))
(defun dsp0 () (setf display-list nil))

(defun show-display-state ()
  (loop for i in display-list do
    (print i)
    (if (listp i)
	(print (dsp-mem (car i) (cadr i)))
      (case i
	(A (print (dspA)))
	(B (print (dspB)))
	(X (print (dspX)))
	(Y (print (dspY)))
	(t (print (dsp-reg i)))))))

(defun dsp-service-break ()
  (let* ((hi-ok-1 (= 0 (dsp-check-host-interface)))
	 (hi-ok-2 (or #+QP (/= 0 (qp-is-open)) 
		      (/= 0 (dsp-is-open))))
	 (hi-ok (and hi-ok-1 hi-ok-2))
	 (isr (if hi-ok (dsp-read-isr) 0))
         (isr-ok (not (and (logbitp 1 ISR) (not (logbitp 2 ISR)))))
	 (pc (if (or (not hi-ok) 
		     (not isr-ok)	   ; TXDF on, TRDY off (probably forgot to boot the damn thing)
		     (= 0 (logand isr 7))) ; HI is off (forgot to get memory mapped interface)
		 -1
	       (progn
		 (dsp-force-break)
		 (setf isr (dsp-read-isr))
		 (if (logbitp 4 isr)	;HF3 is on -- probably ok to poke
		     (- (dsp-reg 'SSH) 2)
		   -1)))))
					;JSR stores return address in SSH, JSR .break is 2 word op
    (print (format nil "~&pc: ~A~A" 
		   (if (plusp pc) pc "unknown ") 
		   (if (not hi-ok) 
		       (if (not hi-ok-1) " -- no host interface" "dsp closed")
		     (if (not isr-ok) " -- dsp not booted or wedged"
		       (if (= 0 (logand isr 7)) " -- HI is off"
			 " -- can't see what's wrong")))))
    (when (and emit-prog hi-ok isr-ok c56-debug (plusp pc))
      (show-emit-prog pc)
      (show-display-state))
    (break)))

(defun dsp-force-break ()
  (dsp-write-CVR #x92))			;#x92=turn on HC bit and set #x24 as vector location (JSR .break)

(defun dsp-set-mem (mem loc val)
  (dsp-execute `(STORE X0 Y temp-loc))
  (dsp-execute `(LOAD X0 ,val))
  (dsp-execute `(STORE X0 ,mem ,loc))
  (dsp-execute `(LOAD X0 Y temp-loc)))

(defun dsp-set-op (op pc)
  (dsp-set-mem 'P pc (assemble op)))

(defun dsp-clear-break (pc)
  (let ((sought-op (assemble '(JSR .break))))
    (if (= (get-dsp-program (- pc 2)) sought-op)
	(progn
	  (dsp-set-op '(NOP) (- pc 2))
	  (dsp-set-op '(NOP) (- pc 1)))
      (cerror "ignore it" "not a break-point"))))

(defun dsp-help ()
  (format nil "
To go on         -- dsp-go
Decode           -- decode-ICR, -CVR, -ISR, -SR, -SP, -OMR, dspSR, dspSP, dspOMR, dspICR etc
Read             -- (dsp-reg reg), (dsp-mem mem loc)
                 -- dspA, dspB, dspX, dspY, 
                 -- (dsp-dpy-AB name a2 a1 a0), (dsp-dpy-XY name x1 x0)
                 -- dsp-dpy-all, (dsp-memory typ beg end), (dsp-dpy-R name r0)
Write            -- (dsp-set-reg reg val), (dsp-set-mem mem loc val), (dsp-set-op op pc)
Execute          -- (dsp-execute op) ;dsp-get-one-word to get result, if any.
Add to list      -- (dsp+ reg)
Remove from list -- (dsp- reg)
Clear list       -- dsp0
Force break      -- dsp-force-break
Clear break      -- (dsp-clear-break pc)
Single step      -- dsp-single-step to turn it on, and dsp-run to turn it off.  
                 -- dsp-step to step once.
Check interface  -- hi-status
Number decode    -- bit8, bit24, frac24, real56
"))

(defun dsp? () (dsp-help))

(defun ds (&optional (num 1))
  (loop for i from 0 below num do (dsp-step))
  (show-emit-prog (dsp-reg 'SSH))
  (show-display-state))


(defun ex (op) (dsp-execute op))


(defun find-location-name (loc labs)
  (if labs
      (loop for i in labs do
	(if (<= (cadr i) loc) (return (car i))))))

(defun dsp-stack-trace ()
  ;; for Jumps (JSR et al), the address of the instruction immediately following the jump is pushed on the stack.
  ;; for DO, LA is pushed, then PC.  We can't tell whether we had a long or short jump, so some guesswork is needed.
  (let* ((SP (dsp-reg 'SP))
	 (LA (1+ (dsp-reg 'LA)))
	 (loc 0)
	 (nam nil)
	 (ins-name (or (dsp-data-ins-name (find-dsp current-slot current-dsp)) *current-instrument-name*))
	 (label-names (if ins-name (get ins-name :dsp-labels))))
    (loop for i from SP downto 1 do
      (dsp-execute '(STORE SSH X-IO M-HTX))
      (setf loc (dsp-get-one-word))
      (setf nam (symbol-name (find-location-name loc label-names)))
      ;; we use symbol-name to strip off the package name
      (princ (format nil "~%  ~A~A~A" loc (if nam ": " "") (if nam nam ""))))
    (setf nam (symbol-name (find-location-name LA label-names)))
    (if nam (princ (format nil "~%  LA: ~A: ~A" LA nam))))
    nil)


#|
(defun test-prog ()
  (dsp-close)
  (initialize-everything_56)
  (get-dsp-monitor)
  (emit '(NOP))
  (log-init)
  (expt-init)
  (emit '(JSR .break))

  (basic-divide-load)
  (real-mpy-load)
  (shift-AB-up-load)
  (real-int-mpy-load)
  (real-mod-load)
  (log-load)
  (expt-load)

  (emit '(.USER-START))
  (emit '(.USER-START-1))
  (emit '(.PASS-START))
  (emit '(.R-T))
  (emit '(.FORCE-FLUSH-BUFFERS))
  (emit '(JMP .break))

  (check-for-undefined-labels)
  (dsp-set-up (min (+ dsp-pc 1) internal-p-size) internal-p-memory)

  (loop for i from 0 below (get-heap-ptr) do
    (dsp-set-mem 'X i (getf-x-mem i))
    (dsp-set-mem 'Y i (getf-y-mem i)))
  (when (> ex-ptr 0)
    (loop for i from 0 to ex-ptr do
      (dsp-set-mem 'X (+ i 40960) (getf-x-mem (+ i 40960))))))


(defun a-b (a b) 
  (multiple-value-bind 
      (aint afrac) (make-real a)
    (multiple-value-bind 
	(bint bfrac) (make-real b)
      (ex `(LOAD A ,aint)) 
      (ex `(LOAD A0 ,afrac)) 
      (ex `(LOAD B ,bint)) 
      (ex `(LOAD B0 ,bfrac)) 
      (ex '(JSR .real-mod)) 
      (format nil "~A and ~A" (dspA) (mod a b)))))
|#