unit ablas;

(*************************************************************************
Copyright (c) 2007-2008, Sergey Bochkanov (ALGLIB project).

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

- Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer listed
  in this license in the documentation and/or other materials
  provided with the distribution.

- Neither the name of the copyright holders nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************************)
{$FPUTYPE SSE2}
{$ASMMODE ATT}
{$GOTO ON}

interface

function IsSSE2Supported():Boolean;

function ASMDotProduct1(V1: PDouble; V2: PDouble; N: Integer):Double;cdecl;

procedure ASMMove1(VDst: PDouble; VSrc: PDouble; N: Integer);cdecl;
procedure ASMMoveS1(VDst: PDouble; VSrc: PDouble; N: Integer; S: Double);cdecl;
procedure ASMMoveNeg1(VDst: PDouble; VSrc: PDouble; N: Integer);cdecl;

procedure ASMAdd1(VDst: PDouble; VSrc: PDouble; N: Integer);cdecl;
procedure ASMAddS1(VDst: PDouble; VSrc: PDouble; N: Integer; S: Double);cdecl;
procedure ASMSub1(VDst: PDouble; VSrc: PDouble; N: Integer);cdecl;
procedure ASMMulS1(VDst: PDouble; N: Integer; S: Double);cdecl;

var
    // user-controlled flag, mostly for performance checking reasons
    UseSSE2IfPresent : Boolean = True;

implementation

const
    // constants for assembler codes
    Zero : Double = 0.0;
    SSE2Zero: array [0..1] of Double = (0.0, 0.0);

var
    // initialized at unit startup
    SSE2SupportEnabled : LongBool;

label
    nosse2, finished;

(************************************************************************
Returns True if SSE2 is present on current CPU, False otherwise.
************************************************************************)
function IsSSE2Supported():Boolean;
begin
    // Report the flag that the unit initialization code stored
    // in SSE2SupportEnabled.
    IsSSE2Supported:=SSE2SupportEnabled;
end;


(************************************************************************
Dot product of V1[0:N-1] and V2[0:N-1].

SSE2 is used if it is present, enabled by UseSSE2IfPresent and V2 is
8-byte aligned; generic FPU code is used otherwise.
************************************************************************)
function ASMDotProduct1(V1: PDouble; V2: PDouble; N: Integer):Double;cdecl;
var
    // Scratch area for the caller's XMM0..XMM4 (5*16 bytes) plus 16 bytes
    // of slack, so that a 16-byte aligned address always fits inside it.
    XMMBuf: array[0..16*5+16-1] of Char;
    // Receives both halves of the packed accumulator XMM0.
    XMMTot: array[0..16-1] of Char;
    // 16-byte aligned address inside XMMBuf (computed in the asm block).
    BufPtr: Pointer;
    // True when SSE2 was detected at startup and the user flag allows it.
    UseSSE2: LongBool;
    // Final sum, written by the asm block.
    V: Double;
label
    halfalignedsse2, precycle, cycle, tail2, tail1,
    fullalignedsse2, aprecycle, acycle, atail4, atail2, atail1,
    endsse, loadxmm,
    fpu, fpuprecycle, fpucycle, endfpu, loadfpu;
begin
    // Empty (or negative-length) vectors have a zero dot product.
    if N<=0 then
    begin
        Result:=0;
        Exit;
    end;
    UseSSE2:=SSE2SupportEnabled and UseSSE2IfPresent;

    //
    // assembler coding
    // Assume N>0
    // TODO: saving FPU state
    //
    // Register usage: ECX - element counter, ESI - V1, EDI - V2,
    // XMM0 - packed accumulator, ST(0) - scalar accumulator for
    // head/tail elements.
    //
    asm
        //PUSHF
        //PUSHA
        MOVL N, %ECX
        MOVL V1, %ESI
        MOVL V2, %EDI

        //
        // Save FPU state
        //


        //
        // Choose between FPU or SSE2
        //
        MOVL UseSSE2, %EAX        //
        TESTL %EAX, %EAX          // decide whether we use SSE2
        JZ fpu                    // or not depending on flag

        //CMPL $8, %ECX             // use FPU for small N's
        //JLE fpu                   //

        MOVL %EDI, %EAX           // decide whether we use
        TESTL $7, %EAX            // SSE2 or not depending on V2
        JNZ fpu                   // offset (must be 0/8 modulo 16)

        //
        // Save SSE state.
        // EBX is rounded up to the next 16-byte boundary inside XMMBuf.
        // XMM3/XMM4 are saved too because the full-aligned loop uses them.
        //
        LEAL XMMBuf, %EBX
        SHRL $4, %EBX
        INCL %EBX
        SHLL $4, %EBX
        MOVL %EBX, BufPtr
        MOVAPD %XMM0, (%EBX)
        MOVAPD %XMM1, 16(%EBX)
        MOVAPD %XMM2, 32(%EBX)
        MOVAPD %XMM3, 48(%EBX)
        MOVAPD %XMM4, 64(%EBX)

        //
        // Choose between half-aligned or full-aligned SSE2:
        // full-aligned when V1 and V2 have the same offset modulo 16.
        //
        MOVL %ESI, %EAX
        MOVL %EDI, %EBX
        ANDL $15, %EAX
        ANDL $15, %EBX
        CMPL %EAX, %EBX
        JNE halfalignedsse2
        JMP fullalignedsse2

        //
        // HALF-ALIGNED SSE2 (V2 aligned after head, V1 read unaligned)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    halfalignedsse2:
        MOVUPD SSE2Zero, %XMM0
        FLDZ
        TESTL $15, %EDI
        JZ precycle
        FLDQ (%ESI)
        FMULQ (%EDI)
        FADD
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ endsse
    precycle:
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ tail2
    cycle:
        MOVUPD   (%ESI), %XMM1
        MOVUPD 16(%ESI), %XMM2
        MULPD   (%EDI), %XMM1
        MULPD 16(%EDI), %XMM2
        ADDPD %XMM1, %XMM0
        ADDPD %XMM2, %XMM0
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ cycle
    tail2:
        CMPL $2, %EBX
        JB tail1
        MOVUPD (%ESI), %XMM1
        MULPD  (%EDI), %XMM1
        ADDPD  %XMM1, %XMM0
        ADDL $16, %ESI
        ADDL $16, %EDI
    tail1:
        TESTL $1, %EBX
        JZ endsse
        FLDQ (%ESI)
        FMULQ (%EDI)
        FADD
        JMP endsse

        //
        // FULL-ALIGNED SSE2 (V1 and V2 both aligned after head)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 8 doubles per iteration)
        // 2. Process tail with SSE2 (4 doubles, if exist)
        // 3. Process tail with SSE2 (2 doubles, if exist)
        // 4. Process tail with FPU (1 double, if exists)
        //
    fullalignedsse2:
        MOVUPD SSE2Zero, %XMM0
        FLDZ
        TESTL $15, %EDI
        JZ aprecycle
        FLDQ (%ESI)
        FMULQ (%EDI)
        FADD
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ endsse
    aprecycle:
        MOVL %ECX, %EBX
        ANDL $7, %EBX             // EBX = leftover elements (0..7)
        SHRL $3, %ECX             // ECX = number of 8-element groups
        JZ atail4
    acycle:
        MOVAPD   (%ESI), %XMM1
        MOVAPD 16(%ESI), %XMM2
        MOVAPD 32(%ESI), %XMM3
        MOVAPD 48(%ESI), %XMM4
        MULPD   (%EDI), %XMM1
        MULPD 16(%EDI), %XMM2
        MULPD 32(%EDI), %XMM3
        MULPD 48(%EDI), %XMM4
        ADDPD %XMM2, %XMM1
        ADDPD %XMM4, %XMM3
        ADDPD %XMM3, %XMM1
        ADDPD %XMM1, %XMM0
        ADDL $64, %ESI
        ADDL $64, %EDI
        DECL %ECX
        JNZ acycle
    atail4:
        CMPL $4, %EBX
        JB atail2
        MOVAPD   (%ESI), %XMM1
        MOVAPD 16(%ESI), %XMM2
        MULPD    (%EDI), %XMM1
        MULPD  16(%EDI), %XMM2
        ADDPD  %XMM1, %XMM0
        ADDPD  %XMM2, %XMM0
        ADDL $32, %ESI
        ADDL $32, %EDI
        SUBL $4, %EBX
    atail2:
        CMPL $2, %EBX
        JB atail1
        MOVAPD (%ESI), %XMM1
        MULPD  (%EDI), %XMM1
        ADDPD  %XMM1, %XMM0
        ADDL $16, %ESI
        ADDL $16, %EDI
    atail1:
        TESTL $1, %EBX
        JZ endsse
        FLDQ (%ESI)
        FMULQ (%EDI)
        FADD
        JMP endsse

        //
        // Combine accumulators: ST(0) + low(XMM0) + high(XMM0) -> V,
        // then restore the saved XMM registers
        //
    endsse:
        MOVUPD %XMM0, XMMTot
        LEAL XMMTot, %ESI
        FADDQ (%ESI)
        FADDQ 8(%ESI)
        FSTPQ V
    loadxmm:
        MOVL BufPtr, %EBX
        MOVAPD (%EBX), %XMM0
        MOVAPD 16(%EBX), %XMM1
        MOVAPD 32(%EBX), %XMM2
        MOVAPD 48(%EBX), %XMM3
        MOVAPD 64(%EBX), %XMM4
        JMP loadfpu


        //
        // NO SSE2, JUST FPU
        // One element first if N is odd, then two elements per iteration
        //
    fpu:
        FLDZ
        SHRL $1, %ECX             // CF = N mod 2, ECX = N div 2
        JNC fpuprecycle
        FLDQ (%ESI)
        FMULQ (%EDI)
        FADD
        ADDL $8, %ESI
        ADDL $8, %EDI
    fpuprecycle:
        CMPL $0, %ECX
        JE endfpu
    fpucycle:
        FLDQ  (%ESI)
        FMULQ (%EDI)
        FLDQ  8(%ESI)
        FMULQ 8(%EDI)
        FADD
        FADD
        ADDL $16, %ESI
        ADDL $16, %EDI
        DECL %ECX
        JNZ fpucycle

        //
        // Load FPU state
        //
    endfpu:
        FSTPQ V
    loadfpu:


        //POPF
        //POPA
    end ['EAX', 'EBX', 'ECX', 'ESI', 'EDI'];
    Result:=V;
end;



(************************************************************************
Move VSrc[0:N-1] to VDst[0:N-1].
************************************************************************)
procedure ASMMove1(VDst: PDouble; VSrc: PDouble; N: Integer);cdecl;
begin
    //
    // Only a positive element count leads to a copy
    //
    if N>0 then
    begin
        asm
            PUSHF                     // keep caller's flags (CLD below)
            MOVL VDst, %EDI
            MOVL VSrc, %ESI
            MOVL N, %ECX
            SHLL $2, %ECX             // one double = four 16-bit words
            CLD                       // copy in ascending address order
            REP MOVSW
            POPF
        end ['ECX', 'ESI', 'EDI'];
    end;
end;


(************************************************************************
Move S*VSrc[0:N-1] to VDst[0:N-1].
************************************************************************)
procedure ASMMoveS1(VDst: PDouble; VSrc: PDouble; N: Integer; S: Double);cdecl;
var
    // Scratch area for saving XMM0..XMM2: 3*16 bytes of payload plus 16
    // bytes of slack so that a 16-byte aligned address always fits inside.
    XMMBuf: array[0..16*3+16-1] of Char;
    // 16-byte aligned address inside XMMBuf (computed in the asm block).
    BufPtr: Pointer;
    // True when SSE2 was detected at startup and the user flag allows it.
    UseSSE2: LongBool;
label
    halfalignedsse2, precycle, cycle, tail2, tail1,
    fullalignedsse2, aprecycle, acycle, atail2, atail1,
    loadxmm,
    fpu, fpuprecycle, fpucycle, loadfpu;
begin
    // Nothing to do for an empty (or negative-length) vector.
    if N<=0 then
        Exit;
    UseSSE2:=SSE2SupportEnabled and UseSSE2IfPresent;

    //
    // assembler coding
    // Assume N>0
    // TODO: saving FPU state
    //
    // Register usage: ECX - element counter, ESI - VSrc, EDI - VDst,
    // XMM2 - S replicated into both halves.
    //
    asm
        //PUSHF
        MOVL N, %ECX
        MOVL VSrc, %ESI
        MOVL VDst, %EDI

        //
        // Save FPU state
        //


        //
        // Choose between FPU or SSE2
        //
        MOVL UseSSE2, %EAX        //
        TESTL %EAX, %EAX          // decide whether we use SSE2
        JZ fpu                    // or not depending on flag

        CMPL $8, %ECX             // use FPU for small N's
        JLE fpu                   //

        MOVL %EDI, %EAX           // decide whether we use
        TESTL $7, %EAX            // SSE2 or not depending on VDst
        JNZ fpu                   // offset (must be 0/8 modulo 16)

        //
        // Save SSE state
        // EBX is rounded up to the next 16-byte boundary inside XMMBuf
        //
        LEAL XMMBuf, %EBX
        SHRL $4, %EBX
        INCL %EBX
        SHLL $4, %EBX
        MOVL %EBX, BufPtr
        MOVAPD %XMM0, (%EBX)
        MOVAPD %XMM1, 16(%EBX)
        MOVAPD %XMM2, 32(%EBX)

        //
        // Choose between half-aligned or full-aligned SSE2:
        // full-aligned when VSrc and VDst have the same offset modulo 16
        //
        MOVL %ESI, %EAX
        MOVL %EDI, %EBX
        ANDL $15, %EAX
        ANDL $15, %EBX
        CMP %EAX, %EBX
        JNE halfalignedsse2
        JMP fullalignedsse2

        //
        // HALF-ALIGNED SSE2 (unaligned loads from VSrc, aligned stores)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    halfalignedsse2:
        TESTL $15, %EDI
        JZ precycle
        FLDQ (%ESI)
        FMULQ S
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    precycle:
        MOVLPD S, %XMM2           // XMM2 = (S, S)
        MOVHPD S, %XMM2
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ tail2
    cycle:
        MOVUPD   (%ESI), %XMM0
        MOVUPD 16(%ESI), %XMM1
        MULPD %XMM2, %XMM0
        MULPD %XMM2, %XMM1
        MOVAPD %XMM0,   (%EDI)
        MOVAPD %XMM1, 16(%EDI)
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ cycle
    tail2:
        CMPL $2, %EBX
        JB tail1
        MOVUPD   (%ESI), %XMM0
        MULPD %XMM2, %XMM0
        MOVAPD %XMM0, (%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
    tail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%ESI)
        FMULQ S
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // FULL-ALIGNED SSE2 (aligned loads and stores)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    fullalignedsse2:
        TESTL $15, %EDI
        JZ aprecycle
        FLDQ (%ESI)
        FMULQ S
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    aprecycle:
        MOVLPD S, %XMM2           // XMM2 = (S, S)
        MOVHPD S, %XMM2
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ atail2
    acycle:
        MOVAPD   (%ESI), %XMM0
        MOVAPD 16(%ESI), %XMM1
        MULPD %XMM2, %XMM0
        MULPD %XMM2, %XMM1
        MOVAPD %XMM0,   (%EDI)
        MOVAPD %XMM1, 16(%EDI)
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ acycle
    atail2:
        CMPL $2, %EBX
        JB atail1
        MOVAPD   (%ESI), %XMM0
        MULPD %XMM2, %XMM0
        MOVAPD %XMM0,   (%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
    atail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%ESI)
        FMULQ S
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // Load state (restore XMM0..XMM2 saved above)
        //
    loadxmm:
        MOVL BufPtr, %EBX
        MOVAPD (%EBX), %XMM0
        MOVAPD 16(%EBX), %XMM1
        MOVAPD 32(%EBX), %XMM2
        JMP loadfpu


        //
        // NO SSE2, JUST FPU
        // One element first if N is odd, then two elements per iteration
        //
    fpu:
        SHRL $1, %ECX             // CF = N mod 2, ECX = N div 2
        JNC fpuprecycle
        FLDQ (%ESI)
        FMULQ S
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
    fpuprecycle:
        CMPL $0, %ECX
        JE loadfpu
    fpucycle:
        FLDQ  (%ESI)
        FMULQ S
        FLDQ  8(%ESI)
        FMULQ S
        FXCH %ST(1)               // bring the first product back to ST(0)
        FSTPQ (%EDI)
        FSTPQ 8(%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
        DECL %ECX
        JNZ fpucycle

        //
        // Load FPU state
        //
    loadfpu:


        //POPF
    end ['EAX', 'EBX', 'ECX', 'ESI', 'EDI'];
end;


(************************************************************************
Move -VSrc[0:N-1] to VDst[0:N-1].
************************************************************************)
procedure ASMMoveNeg1(VDst: PDouble; VSrc: PDouble; N: Integer);cdecl;
label
    negpairs, negcycle, negdone;
begin
    //
    // Nothing to do for an empty (or negative-length) vector
    //
    if N<=0 then
        Exit;

    //
    // assembler coding
    // Assume N>0
    // TODO: saving FPU state
    //
    // Only the x87 FPU is used here, there is no SSE2 path.
    // Register usage: ECX - element counter, ESI - VSrc, EDI - VDst.
    //
    asm
        MOVL N, %ECX
        MOVL VSrc, %ESI
        MOVL VDst, %EDI

        //
        // Odd N: negate one leading element so that an even count remains
        //
        TESTL $1, %ECX
        JZ negpairs
        FLDQ (%ESI)
        FCHS
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI

        //
        // Main loop: two elements per iteration
        //
    negpairs:
        SHRL $1, %ECX             // ECX = number of remaining pairs
        TEST %ECX, %ECX
        JZ negdone
    negcycle:
        FLDQ (%ESI)
        FCHS
        FLDQ 8(%ESI)
        FCHS
        ADDL $16, %ESI
        FXCH %ST(1)               // bring the first value back to ST(0)
        FSTPQ (%EDI)
        FSTPQ 8(%EDI)
        ADDL $16, %EDI
        DECL %ECX
        JNZ negcycle
    negdone:
    end ['ECX', 'ESI', 'EDI'];
end;


(************************************************************************
Adds VSrc[0:N-1] to VDst[0:N-1].
************************************************************************)
procedure ASMAdd1(VDst: PDouble; VSrc: PDouble; N: Integer);cdecl;
var
    // Scratch area for saving XMM0..XMM1: 2*16 bytes of payload plus 16
    // bytes of slack so that a 16-byte aligned address always fits inside.
    XMMBuf: array[0..16*2+16-1] of Char;
    // 16-byte aligned address inside XMMBuf (computed in the asm block).
    BufPtr: Pointer;
    // True when SSE2 was detected at startup and the user flag allows it.
    UseSSE2: LongBool;
label
    halfalignedsse2, precycle, cycle, tail2, tail1,
    fullalignedsse2, aprecycle, acycle, atail2, atail1,
    loadxmm,
    fpu, fpuprecycle, fpucycle, loadfpu;
begin
    // Nothing to do for an empty (or negative-length) vector.
    if N<=0 then
        Exit;
    UseSSE2:=SSE2SupportEnabled and UseSSE2IfPresent;

    //
    // assembler coding
    // Assume N>0
    // TODO: saving FPU state
    //
    // Register usage: ECX - element counter, ESI - VSrc, EDI - VDst.
    //
    asm
        //PUSHF
        MOVL N, %ECX
        MOVL VSrc, %ESI
        MOVL VDst, %EDI

        //
        // Save FPU state
        //


        //
        // Choose between FPU or SSE2
        //
        MOVL UseSSE2, %EAX        //
        TESTL %EAX, %EAX          // decide whether we use SSE2
        JZ fpu                    // or not depending on flag

        CMPL $8, %ECX             // use FPU for small N's
        JLE fpu                   //

        MOVL %EDI, %EAX           // decide whether we use
        TESTL $7, %EAX            // SSE2 or not depending on VDst
        JNZ fpu                   // offset (must be 0/8 modulo 16)

        //
        // Save SSE state
        // EBX is rounded up to the next 16-byte boundary inside XMMBuf
        //
        LEAL XMMBuf, %EBX
        SHRL $4, %EBX
        INCL %EBX
        SHLL $4, %EBX
        MOVL %EBX, BufPtr
        MOVAPD %XMM0, (%EBX)
        MOVAPD %XMM1, 16(%EBX)

        //
        // Choose between half-aligned or full-aligned SSE2:
        // full-aligned when VSrc and VDst have the same offset modulo 16
        //
        MOVL %ESI, %EAX
        MOVL %EDI, %EBX
        ANDL $15, %EAX
        ANDL $15, %EBX
        CMP %EAX, %EBX
        JNE halfalignedsse2
        JMP fullalignedsse2

        //
        // HALF-ALIGNED SSE2 (unaligned loads from VSrc, aligned VDst)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    halfalignedsse2:
        TESTL $15, %EDI
        JZ precycle
        FLDQ (%ESI)
        FADDQ (%EDI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    precycle:
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ tail2
    cycle:
        MOVUPD   (%ESI), %XMM0
        MOVUPD 16(%ESI), %XMM1
        ADDPD   (%EDI), %XMM0
        ADDPD 16(%EDI), %XMM1
        MOVAPD %XMM0,   (%EDI)
        MOVAPD %XMM1, 16(%EDI)
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ cycle
    tail2:
        CMPL $2, %EBX
        JB tail1
        MOVUPD   (%ESI), %XMM0
        ADDPD   (%EDI), %XMM0
        MOVAPD %XMM0,   (%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
    tail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%ESI)
        FADDQ (%EDI)
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // FULL-ALIGNED SSE2 (aligned loads and stores)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    fullalignedsse2:
        TESTL $15, %EDI
        JZ aprecycle
        FLDQ (%ESI)
        FADDQ (%EDI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    aprecycle:
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ atail2
    acycle:
        MOVAPD   (%ESI), %XMM0
        MOVAPD 16(%ESI), %XMM1
        ADDPD   (%EDI), %XMM0
        ADDPD 16(%EDI), %XMM1
        MOVAPD %XMM0,   (%EDI)
        MOVAPD %XMM1, 16(%EDI)
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ acycle
    atail2:
        CMPL $2, %EBX
        JB atail1
        MOVAPD   (%ESI), %XMM0
        ADDPD   (%EDI), %XMM0
        MOVAPD %XMM0,   (%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
    atail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%ESI)
        FADDQ (%EDI)
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // Load state (restore XMM0..XMM1 saved above)
        //
    loadxmm:
        MOVL BufPtr, %EBX
        MOVAPD (%EBX), %XMM0
        MOVAPD 16(%EBX), %XMM1
        JMP loadfpu


        //
        // NO SSE2, JUST FPU
        // One element first if N is odd, then two elements per iteration
        //
    fpu:
        SHRL $1, %ECX             // CF = N mod 2, ECX = N div 2
        JNC fpuprecycle
        FLDQ (%ESI)
        FADDQ (%EDI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
    fpuprecycle:
        CMPL $0, %ECX
        JE loadfpu
    fpucycle:
        FLDQ  (%ESI)
        FADDQ (%EDI)
        FLDQ  8(%ESI)
        FADDQ 8(%EDI)
        //FXCH %ST(1)
        FSTPQ 8(%EDI)             // ST(0) holds the second sum, so it
        FSTPQ (%EDI)              // is stored first (no FXCH needed)
        ADDL $16, %ESI
        ADDL $16, %EDI
        DECL %ECX
        JNZ fpucycle

        //
        // Load FPU state
        //
    loadfpu:


        //POPF
    end ['EAX', 'EBX', 'ECX', 'ESI', 'EDI'];
end;


(************************************************************************
Adds S*VSrc[0:N-1] to VDst[0:N-1].
************************************************************************)
procedure ASMAddS1(VDst: PDouble; VSrc: PDouble; N: Integer; S: Double);cdecl;
var
    // Scratch area for saving XMM0..XMM2: 3*16 bytes of payload plus 16
    // bytes of slack so that a 16-byte aligned address always fits inside.
    XMMBuf: array[0..16*3+16-1] of Char;
    // 16-byte aligned address inside XMMBuf (computed in the asm block).
    BufPtr: Pointer;
    // True when SSE2 was detected at startup and the user flag allows it.
    UseSSE2: LongBool;
label
    halfalignedsse2, precycle, cycle, tail2, tail1,
    fullalignedsse2, aprecycle, acycle, atail2, atail1,
    loadxmm,
    fpu, fpuprecycle, fpucycle, loadfpu;
begin
    // Nothing to do for an empty (or negative-length) vector.
    if N<=0 then
        Exit;
    UseSSE2:=SSE2SupportEnabled and UseSSE2IfPresent;

    //
    // assembler coding
    // Assume N>0
    // TODO: saving FPU state
    //
    // Register usage: ECX - element counter, ESI - VSrc, EDI - VDst,
    // XMM2 - S replicated into both halves.
    //
    asm
        //PUSHF
        MOVL N, %ECX
        MOVL VSrc, %ESI
        MOVL VDst, %EDI

        //
        // Save FPU state
        //


        //
        // Choose between FPU or SSE2
        //
        MOVL UseSSE2, %EAX        //
        TESTL %EAX, %EAX          // decide whether we use SSE2
        JZ fpu                    // or not depending on flag

        CMPL $8, %ECX             // use FPU for small N's
        JLE fpu                   //

        MOVL %EDI, %EAX           // decide whether we use
        TESTL $7, %EAX            // SSE2 or not depending on VDst
        JNZ fpu                   // offset (must be 0/8 modulo 16)

        //
        // Save SSE state
        // EBX is rounded up to the next 16-byte boundary inside XMMBuf
        //
        LEAL XMMBuf, %EBX
        SHRL $4, %EBX
        INCL %EBX
        SHLL $4, %EBX
        MOVL %EBX, BufPtr
        MOVAPD %XMM0, (%EBX)
        MOVAPD %XMM1, 16(%EBX)
        MOVAPD %XMM2, 32(%EBX)

        //
        // Choose between half-aligned or full-aligned SSE2:
        // full-aligned when VSrc and VDst have the same offset modulo 16
        //
        MOVL %ESI, %EAX
        MOVL %EDI, %EBX
        ANDL $15, %EAX
        ANDL $15, %EBX
        CMP %EAX, %EBX
        JNE halfalignedsse2
        JMP fullalignedsse2

        //
        // HALF-ALIGNED SSE2 (unaligned loads from VSrc, aligned VDst)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    halfalignedsse2:
        TESTL $15, %EDI
        JZ precycle
        FLDQ (%ESI)
        FMULQ S
        FADDQ (%EDI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    precycle:
        MOVLPD S, %XMM2           // XMM2 = (S, S)
        MOVHPD S, %XMM2
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ tail2
    cycle:
        MOVUPD   (%ESI), %XMM0
        MOVUPD 16(%ESI), %XMM1
        MULPD %XMM2, %XMM0
        MULPD %XMM2, %XMM1
        ADDPD   (%EDI), %XMM0
        ADDPD 16(%EDI), %XMM1
        MOVAPD %XMM0,   (%EDI)
        MOVAPD %XMM1, 16(%EDI)
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ cycle
    tail2:
        CMPL $2, %EBX
        JB tail1
        MOVUPD   (%ESI), %XMM0
        MULPD %XMM2, %XMM0
        ADDPD   (%EDI), %XMM0
        MOVAPD %XMM0,   (%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
    tail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%ESI)
        FMULQ S
        FADDQ (%EDI)
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // FULL-ALIGNED SSE2 (aligned loads and stores)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    fullalignedsse2:
        TESTL $15, %EDI
        JZ aprecycle
        FLDQ (%ESI)
        FMULQ S
        FADDQ (%EDI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    aprecycle:
        MOVLPD S, %XMM2           // XMM2 = (S, S)
        MOVHPD S, %XMM2
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ atail2
    acycle:
        MOVAPD   (%ESI), %XMM0
        MOVAPD 16(%ESI), %XMM1
        MULPD %XMM2, %XMM0
        MULPD %XMM2, %XMM1
        ADDPD   (%EDI), %XMM0
        ADDPD 16(%EDI), %XMM1
        MOVAPD %XMM0,   (%EDI)
        MOVAPD %XMM1, 16(%EDI)
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ acycle
    atail2:
        CMPL $2, %EBX
        JB atail1
        MOVAPD   (%ESI), %XMM0
        MULPD %XMM2, %XMM0
        ADDPD   (%EDI), %XMM0
        MOVAPD %XMM0,   (%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
    atail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%ESI)
        FMULQ S
        FADDQ (%EDI)
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // Load state (restore XMM0..XMM2 saved above)
        //
    loadxmm:
        MOVL BufPtr, %EBX
        MOVAPD (%EBX), %XMM0
        MOVAPD 16(%EBX), %XMM1
        MOVAPD 32(%EBX), %XMM2
        JMP loadfpu


        //
        // NO SSE2, JUST FPU
        // One element first if N is odd, then two elements per iteration
        //
    fpu:
        SHRL $1, %ECX             // CF = N mod 2, ECX = N div 2
        JNC fpuprecycle
        FLDQ (%ESI)
        FMULQ S
        FADDQ (%EDI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
    fpuprecycle:
        CMPL $0, %ECX
        JE loadfpu
    fpucycle:
        FLDQ  (%ESI)
        FMULQ S
        FADDQ (%EDI)
        FLDQ  8(%ESI)
        FMULQ S
        FADDQ 8(%EDI)
        FXCH %ST(1)               // bring the first result back to ST(0)
        FSTPQ (%EDI)
        FSTPQ 8(%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
        DECL %ECX
        JNZ fpucycle

        //
        // Load FPU state
        //
    loadfpu:


        //POPF
    end ['EAX', 'EBX', 'ECX', 'ESI', 'EDI'];
end;


(************************************************************************
Subtracts VSrc[0:N-1] from VDst[0:N-1].
************************************************************************)
procedure ASMSub1(VDst: PDouble; VSrc: PDouble; N: Integer);cdecl;
var
    // Scratch area for saving XMM0..XMM3: 4*16 bytes of payload plus 16
    // bytes of slack so that a 16-byte aligned address always fits inside.
    XMMBuf: array[0..16*4+16-1] of Char;
    // 16-byte aligned address inside XMMBuf (computed in the asm block).
    BufPtr: Pointer;
    // True when SSE2 was detected at startup and the user flag allows it.
    UseSSE2: LongBool;
label
    halfalignedsse2, precycle, cycle, tail2, tail1,
    fullalignedsse2, aprecycle, acycle, atail2, atail1,
    loadxmm,
    fpu, fpuprecycle, fpucycle, loadfpu;
begin
    // Nothing to do for an empty (or negative-length) vector.
    if N<=0 then
        Exit;
    UseSSE2:=SSE2SupportEnabled and UseSSE2IfPresent;

    //
    // assembler coding
    // Assume N>0
    // TODO: saving FPU state
    //
    // Register usage: ECX - element counter, ESI - VSrc, EDI - VDst.
    //
    asm
        //PUSHF
        MOVL N, %ECX
        MOVL VSrc, %ESI
        MOVL VDst, %EDI

        //
        // Save FPU state
        //


        //
        // Choose between FPU or SSE2
        //
        MOVL UseSSE2, %EAX        //
        TESTL %EAX, %EAX          // decide whether we use SSE2
        JZ fpu                    // or not depending on flag

        CMPL $8, %ECX             // use FPU for small N's
        JLE fpu                   //

        MOVL %EDI, %EAX           // decide whether we use
        TESTL $7, %EAX            // SSE2 or not depending on VDst
        JNZ fpu                   // offset (must be 0/8 modulo 16)

        //
        // Save SSE state
        // EBX is rounded up to the next 16-byte boundary inside XMMBuf
        //
        LEAL XMMBuf, %EBX
        SHRL $4, %EBX
        INCL %EBX
        SHLL $4, %EBX
        MOVL %EBX, BufPtr
        MOVAPD %XMM0, (%EBX)
        MOVAPD %XMM1, 16(%EBX)
        MOVAPD %XMM2, 32(%EBX)
        MOVAPD %XMM3, 48(%EBX)

        //
        // Choose between half-aligned or full-aligned SSE2:
        // full-aligned when VSrc and VDst have the same offset modulo 16
        //
        MOVL %ESI, %EAX
        MOVL %EDI, %EBX
        ANDL $15, %EAX
        ANDL $15, %EBX
        CMP %EAX, %EBX
        JNE halfalignedsse2
        JMP fullalignedsse2

        //
        // HALF-ALIGNED SSE2 (unaligned loads from VSrc, aligned VDst)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    halfalignedsse2:
        TESTL $15, %EDI
        JZ precycle
        FLDQ (%EDI)
        FSUBQ (%ESI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    precycle:
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ tail2
    cycle:
        MOVUPD   (%ESI), %XMM0
        MOVUPD 16(%ESI), %XMM1
        MOVAPD   (%EDI), %XMM2
        MOVAPD 16(%EDI), %XMM3
        SUBPD %XMM0, %XMM2        // XMM2 = VDst - VSrc
        SUBPD %XMM1, %XMM3
        MOVAPD %XMM2,   (%EDI)
        MOVAPD %XMM3, 16(%EDI)
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ cycle
    tail2:
        CMPL $2, %EBX
        JB tail1
        MOVUPD (%ESI), %XMM0
        MOVAPD (%EDI), %XMM1
        SUBPD %XMM0, %XMM1
        MOVAPD %XMM1, (%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
    tail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%EDI)
        FSUBQ (%ESI)
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // FULL-ALIGNED SSE2 (aligned loads and stores)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    fullalignedsse2:
        TESTL $15, %EDI
        JZ aprecycle
        FLDQ (%EDI)
        FSUBQ (%ESI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    aprecycle:
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ atail2
    acycle:
        MOVAPD   (%ESI), %XMM0
        MOVAPD 16(%ESI), %XMM1
        MOVAPD   (%EDI), %XMM2
        MOVAPD 16(%EDI), %XMM3
        SUBPD %XMM0, %XMM2        // XMM2 = VDst - VSrc
        SUBPD %XMM1, %XMM3
        MOVAPD %XMM2,   (%EDI)
        MOVAPD %XMM3, 16(%EDI)
        ADDL $32, %ESI
        ADDL $32, %EDI
        DECL %ECX
        JNZ acycle
    atail2:
        CMPL $2, %EBX
        JB atail1
        MOVAPD (%ESI), %XMM0
        MOVAPD (%EDI), %XMM1
        SUBPD %XMM0, %XMM1
        MOVAPD %XMM1, (%EDI)
        ADDL $16, %ESI
        ADDL $16, %EDI
    atail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%EDI)
        FSUBQ (%ESI)
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // Load state (restore XMM0..XMM3 saved above)
        //
    loadxmm:
        MOVL BufPtr, %EBX
        MOVAPD   (%EBX), %XMM0
        MOVAPD 16(%EBX), %XMM1
        MOVAPD 32(%EBX), %XMM2
        MOVAPD 48(%EBX), %XMM3
        JMP loadfpu


        //
        // NO SSE2, JUST FPU
        // One element first if N is odd, then two elements per iteration
        //
    fpu:
        SHRL $1, %ECX             // CF = N mod 2, ECX = N div 2
        JNC fpuprecycle
        FLDQ (%EDI)
        FSUBQ (%ESI)
        FSTPQ (%EDI)
        ADDL $8, %ESI
        ADDL $8, %EDI
    fpuprecycle:
        CMPL $0, %ECX
        JE loadfpu
    fpucycle:
        FLDQ  (%EDI)
        FSUBQ (%ESI)
        FLDQ  8(%EDI)
        FSUBQ 8(%ESI)
        //FXCH %ST(1)
        FSTPQ 8(%EDI)             // ST(0) holds the second difference, so
        FSTPQ (%EDI)              // it is stored first (no FXCH needed)
        ADDL $16, %ESI
        ADDL $16, %EDI
        DECL %ECX
        JNZ fpucycle

        //
        // Load FPU state
        //
    loadfpu:


        //POPF
    end ['EAX', 'EBX', 'ECX', 'ESI', 'EDI'];
end;


(************************************************************************
Multiply VDst[0:N-1] by S (in place).
************************************************************************)
procedure ASMMulS1(VDst: PDouble; N: Integer; S: Double);cdecl;
var
    // Scratch area for saving XMM0..XMM2: 3*16 bytes of payload plus 16
    // bytes of slack so that a 16-byte aligned address always fits inside.
    XMMBuf: array[0..16*3+16-1] of Char;
    // 16-byte aligned address inside XMMBuf (computed in the asm block).
    BufPtr: Pointer;
    // True when SSE2 was detected at startup and the user flag allows it.
    UseSSE2: LongBool;
label
    fullalignedsse2, aprecycle, acycle, atail2, atail1,
    loadxmm,
    fpu, fpuprecycle, fpucycle, loadfpu;
begin
    // Nothing to do for an empty (or negative-length) vector.
    if N<=0 then
        Exit;
    UseSSE2:=SSE2SupportEnabled and UseSSE2IfPresent;

    //
    // assembler coding
    // Assume N>0
    // TODO: saving FPU state
    //
    // Register usage: ECX - element counter, EDI - VDst,
    // XMM2 - S replicated into both halves.
    //
    asm
        //PUSHF
        MOVL N, %ECX
        MOVL VDst, %EDI

        //
        // Save FPU state
        //


        //
        // Choose between FPU or SSE2
        //
        MOVL UseSSE2, %EAX        //
        TESTL %EAX, %EAX          // decide whether we use SSE2
        JZ fpu                    // or not depending on flag

        CMPL $8, %ECX             // use FPU for small N's
        JLE fpu                   //

        MOVL %EDI, %EAX           // decide whether we use
        TESTL $7, %EAX            // SSE2 or not depending on VDst
        JNZ fpu                   // offset (must be 0/8 modulo 16)

        //
        // Save SSE state
        // EBX is rounded up to the next 16-byte boundary inside XMMBuf
        //
        LEAL XMMBuf, %EBX
        SHRL $4, %EBX
        INCL %EBX
        SHLL $4, %EBX
        MOVL %EBX, BufPtr
        MOVAPD %XMM0, (%EBX)
        MOVAPD %XMM1, 16(%EBX)
        MOVAPD %XMM2, 32(%EBX)

        //
        // FULL-ALIGNED SSE2
        // (there is only one vector, so no half-aligned variant is needed)
        // Process:
        // 0. Process head (1 double, if needed for 16-byte alignment)
        // 1. Main loop (process 4 doubles per iteration)
        // 2. Process tail with SSE2 (2 doubles, if exist)
        // 3. Process tail with FPU (1 double, if exists)
        //
    fullalignedsse2:
        TESTL $15, %EDI
        JZ aprecycle
        FLDQ (%EDI)
        FMULQ S
        FSTPQ (%EDI)
        ADDL $8, %EDI
        DECL %ECX
        JZ loadxmm
    aprecycle:
        MOVLPD S, %XMM2           // XMM2 = (S, S)
        MOVHPD S, %XMM2
        MOVL %ECX, %EBX
        ANDL $3, %EBX             // EBX = leftover elements (0..3)
        SHRL $2, %ECX             // ECX = number of 4-element groups
        JZ atail2
    acycle:
        MOVAPD   (%EDI), %XMM0
        MOVAPD 16(%EDI), %XMM1
        MULPD %XMM2, %XMM0
        MULPD %XMM2, %XMM1
        MOVAPD %XMM0,   (%EDI)
        MOVAPD %XMM1, 16(%EDI)
        ADDL $32, %EDI
        DECL %ECX
        JNZ acycle
    atail2:
        CMPL $2, %EBX
        JB atail1
        MOVAPD   (%EDI), %XMM0
        MULPD %XMM2, %XMM0
        MOVAPD %XMM0,   (%EDI)
        ADDL $16, %EDI
    atail1:
        TESTL $1, %EBX
        JZ loadxmm
        FLDQ (%EDI)
        FMULQ S
        FSTPQ (%EDI)
        JMP loadxmm

        //
        // Load state (restore XMM0..XMM2 saved above)
        //
    loadxmm:
        MOVL BufPtr, %EBX
        MOVAPD (%EBX), %XMM0
        MOVAPD 16(%EBX), %XMM1
        MOVAPD 32(%EBX), %XMM2
        JMP loadfpu


        //
        // NO SSE2, JUST FPU
        // One element first if N is odd, then two elements per iteration
        //
    fpu:
        SHRL $1, %ECX             // CF = N mod 2, ECX = N div 2
        JNC fpuprecycle
        FLDQ (%EDI)
        FMULQ S
        FSTPQ (%EDI)
        ADDL $8, %EDI
    fpuprecycle:
        CMPL $0, %ECX
        JE loadfpu
    fpucycle:
        FLDQ  (%EDI)
        FMULQ S
        FLDQ  8(%EDI)
        FMULQ S
        FXCH %ST(1)               // bring the first product back to ST(0)
        FSTPQ (%EDI)
        FSTPQ 8(%EDI)
        ADDL $16, %EDI
        DECL %ECX
        JNZ fpucycle

        //
        // Load FPU state
        //
    loadfpu:


        //POPF
    end ['EAX', 'EBX', 'ECX', 'EDI'];
end;


initialization

    //
    // detect SSE2 support
    //
    // Stores 1 in SSE2SupportEnabled when CPUID leaf 1 reports bit 26 of
    // EDX (the SSE2 feature flag), 0 otherwise. All general-purpose
    // registers are preserved with PUSHA/POPA.
    //
    asm
        PUSHA
        MOVL $0, %EAX             // leaf 0: EAX returns highest basic leaf
        CPUID
        CMPL $1, %EAX             // leaf 1 unavailable -> no SSE2
        JB nosse2
        MOVL $1, %EAX             // leaf 1: feature flags in EDX
        CPUID
        MOVL $1, %EBX             // EBX = 1 shl 26 (mask for EDX bit 26)
        SHLL $26, %EBX
        ANDL %EBX, %EDX
        JZ nosse2
        MOVL $1, %EAX
        MOVL %EAX, SSE2SupportEnabled
        JMP finished
    nosse2:
        MOVL $0, %EAX
        MOVL %EAX, SSE2SupportEnabled
    finished:
        POPA
    end;
end.
