#ifndef included_xfma
#define included_xfma
#include "pushpop.z80"
#include "mov.z80"
#include "routines/add64.z80"
#include "routines/sub64.z80"
#include "routines/sla64.z80"
#include "routines/rr64.z80"
#include "routines/srl64.z80"
#include "routines/srl64_x4.z80"
#include "routines/swapbuf.z80"
#include "xmul.z80"
;Fused Multiply-Add
;Performs x*y+t -> z
;Inputs:
;  HL points to x
;  DE points to y
;  BC points to z (the output)
;  IX points to t (the addend)
;Scratch usage visible in this routine (addresses relative to var_z):
;  var_z+8      : output pointer handed to xmul; the 10 bytes starting here
;                 are what finally get copied to z
;  var_z-10..-1 : receives the copy of t
;NOTE(review): no entry label is visible directly above the `call pushpop`
;in this view -- confirm the routine's label is present.
call pushpop ;helper from pushpop.z80 (not shown); presumably sets up register save/restore -- confirm
push bc ;save the output location
push ix ;save the location of what to add
;First multiply x and y, but keep full 128-bits precision
;xmul is entered with HL=x, DE=y and BC=var_z+8 as its output pointer
ld bc,var_z+8
call xmul
;Now we need to perform a high-precision addition
;First we move the float to scrap
pop hl ;HL = pointer to t (the value pushed from IX above)
ld de,var_z-10
call mov10 ;presumably copies 10 bytes from (HL) to (DE) -- helper not shown
;Now do a special add
call fma_add
;Now return the result
pop de ;DE = z, the output pointer saved on entry
ld hl,var_z+8
jp mov10 ;tail call: mov10's return ends this routine
;High-precision add used by the fused multiply-add above.
;Presumably this is the target of `call fma_add` (the label line is not
;visible in this view -- confirm).
;Works on two 18-byte buffers:
;  var_z   .. var_z+17 : product; exponent/sign word at var_z+16
;  var_z-18.. var_z-1  : addend;  exponent/sign word at var_z-2
;The low 16 bytes of each buffer are handled as the mantissa.
; Zero out the bottom 8 bytes of the addend
; (four 16-bit stores clear var_z-18 .. var_z-11)
ld hl,0
ld (var_z-12),hl
ld (var_z-14),hl
ld (var_z-16),hl
ld (var_z-18),hl
; Check for special cases
; DE = addend exponent word, HL = product exponent word. Bit 15 of each is
; cleared first (the top bytes at var_z-1 / var_z+17 are XORed and sign-tested
; later as the operands' signs), so only the exponent field is tested.
ld de,(var_z-2)
ld hl,(var_z+16)
res 7,h
res 7,d
ld a,h
or l
jp z,caseadd_fma ;product has an all-zero exponent field
ld a,d
or e
jp z,caseadd1_fma ;addend has an all-zero exponent field
; Now make sure var_z has the bigger exponent
; Carry is clear here (from the OR above), so HL = product exp - addend exp.
sbc hl,de
jr nc,+_ ;no borrow: product exponent >= addend exponent, skip the swap
; Borrow: negate HL so it holds the positive exponent difference
xor a
sub l
ld l,a
sbc a,a
sub h
ld h,a
push hl
; We need to swap.
; swapbuf is given the two buffer addresses in DE/HL and BC=18;
; presumably it exchanges BC bytes between them -- helper not shown
ld de,var_z-18
ld hl,var_z
ld bc,18
call swapbuf
pop hl
; HL = exponent difference. Presumably the `+_` target above is here
; (anonymous label not visible in this view -- confirm).
; If the difference is 130 or more, return without adding anything:
; var_z is left as it is.
ld a,h
or a
ret nz
ld a,l
cp 130
ret nc
;Now we need to shift down by A bits.
or a
jr z,add_shifted_fma ;equal exponents: no alignment needed
; Bits 0,1,2 of the shift count select a 1-, 2- and 4-bit shift of the addend.
; Each RRA moves the next low bit into carry. The helpers are expected to
; hand A back unchanged (the shift helpers visible further down in this file
; save A in B and restore it).
rra \ call c,srl_var_z_m_18
rra \ call c,srl2_var_z_m_18
rra \ call c,srl4_var_z_m_18
; A = number of whole bytes left to shift (shift count / 8); the AND also
; discards whatever the three RRAs rotated into bits 5-7.
and $1F
ld l,a
ld bc,255&(var_z-19)
ld h,(var_z-18)>>8
add hl,bc ;HL = (high byte of var_z-18)*256 + low byte of (var_z-19) + A
;NOTE(review): HL is not read again before it is reloaded at the
;`ld hl,var_z-1` that follows this section, and no block move of the mantissa
;bytes is visible here; this byte-shift step looks unfinished -- confirm.
sub 18
ld c,a ;C = A-18
ld de,var_z-19
ld c,a ;same value again (A is unchanged by the LD DE above)
ld a,17
sub c ;A = 17-C = 35-(byte count)
jr z,add_shifted_fma
ld b,a
xor a
; Store B zero bytes at ascending addresses starting from var_z-19.
;NOTE(review): for a byte count of 0..16 the loop length is 19..35, which
;covers all of var_z-19..var_z-1 -- verify this is the intended fill length.
ld (de),a \ inc de \ djnz $-2
;Add or subtract the aligned mantissas.
;Presumably this is the `add_shifted_fma` jump target (label line not visible
;in this view -- confirm).
;If the signs match, then just add
;If they differ, then subtract
; XOR the top bytes of the two exponent/sign words; bit 7 of the result is
; clear when both operands have the same sign.
ld hl,var_z-1
ld a,(var_z+17)
xor (hl)
jp p,xfma_add
; Subtract the mantissas
; Two chained 64-bit steps: sub64 on the low halves, then sbc64 on the high
; halves with the borrow carried across. Presumably computes
; var_z -= (var_z-18) over 16 bytes and leaves HL/DE on the last byte
; processed -- helpers not shown; confirm.
ld hl,var_z-18
ld de,var_z
call sub64
inc hl
inc de
call sbc64
jr nc,+_ ;no borrow: skip the negation (anonymous target not visible here)
;Negate the mantissa, invert the sign
; Invert the sign
; Toggles bit 7 of the byte two past where sbc64 left DE (presumably the
; sign byte at var_z+17). The XOR also clears carry for the loop below.
inc de
inc de
ld a,(de)
xor 80h
ld (de),a
; Negate the mantissa
; B=16 bytes, C=0: each byte becomes 0 - byte - borrow, low byte first.
ld hl,var_z
ld bc,$1000
ld a,c \ sbc a,(hl) \ ld (hl),a \ inc hl
djnz $-4
; On the negate path the sign flag still reflects the last (topmost) byte
; written, so this returns when the mantissa's top bit is already set.
ret m
;need to shift up until top bit is 1. Should be at most 1 shift, I think
ld de,(var_z+16) ;DE = exponent/sign word, decremented once per shift below
; Make sure that the mantissa isn't zero
; OR together all 16 mantissa bytes, leaving early once a nonzero byte is seen.
ld hl,var_z
ld b,15
ld a,(hl)
inc hl \ or (hl) \ jr nz,+_
djnz -_
; All 16 bytes were zero (A=0, HL=var_z+15): clear the exponent/sign word.
;NOTE(review): no return is visible after these stores -- confirm.
inc hl
ld (hl),a
inc hl
ld (hl),a
; Normalization step (presumably the anonymous label targeted by the
; `jr nz,+_` above and the `jp p,-_` below sits here -- confirm).
; Decrement the exponent; if its 15-bit field reaches zero, go to the
; zero-result handler.
dec de
ld a,d
and $7F
or e
jp z,add_zero_fma
; Shift the 128-bit mantissa left one bit: low half via sla64, then the high
; half via rl64 with the carry chained in.
ld hl,var_z
call sla64
inc hl
call rl64
jp p,-_ ;repeat while the sign flag left by rl64 is clear (top bit not yet set, presumably)
ld (var_z+16),de ;store the adjusted exponent/sign word
;NOTE(review): no return is visible between this store and the addition code
;that follows -- confirm execution is not meant to fall through.
;Same-sign case: add the aligned mantissas.
;Presumably this is the `xfma_add` jump target (label line not visible in this
;view -- confirm).
;add the mantissas
; Two chained 64-bit steps: add64 on the low halves, adc64 on the high halves.
; Presumably computes var_z += (var_z-18) over 16 bytes -- helpers not shown.
ld hl,var_z-18
ld de,var_z
call add64
inc hl
inc de
call adc64
ret nc ;no carry out of the top byte: nothing more to do
; Carry out: bump the exponent and rotate the mantissa right.
; After the exchange and increment, HL is presumably at the low exponent byte
; (var_z+16), assuming adc64 leaves DE on the top mantissa byte -- confirm.
ex de,hl
inc hl
inc (hl) \ jr nz,+_
; Low exponent byte wrapped to zero: carry into the high byte
inc hl
inc (hl)
ld a,(hl)
dec hl
; If the low 7 bits of the high byte are now zero, the exponent field
; overflowed: go to the infinity handler.
and $7F
jr z,add_inf_fma
;NOTE(review): AND clears the carry flag, so on this path rr64 is entered
;with carry clear, whereas the `jr nz,+_` path (if its target is the `dec hl`
;below) still has the carry from adc64 -- confirm which is intended.
dec hl
jp rr64 ;tail call: rotate right starting from the byte below the exponent
;Shift the addend mantissa right by 4 bits.
;Presumably this is `srl4_var_z_m_18`, called during alignment (label line
;not visible in this view -- confirm).
;A is saved in B across the helper calls and restored before leaving.
ld hl,var_z-3 ;top byte of the addend's mantissa
ld b,a
call srl64_4 ;presumably a 4-bit right shift of the upper 8 bytes -- helper not shown
dec hl
call rrd8 ;presumably continues the 4-bit shift through the lower 8 bytes -- helper not shown
ld a,b
;NOTE(review): no return is visible after this point -- confirm.
;Shift the addend mantissa right by 2 bits, then (second entry) by 1 bit.
;Presumably the first line is `srl2_var_z_m_18` and the 1-bit routine
;`srl_var_z_m_18` starts on the line after it (label lines not visible in this
;view -- confirm). If so, the 2-bit shift is one call to the 1-bit routine
;followed by falling through into it.
call srl_var_z_m_18
;1-bit shift: A is saved in B across the helper calls and restored afterwards.
ld hl,var_z-3 ;top byte of the addend's mantissa
ld b,a
call srl64 ;presumably a 1-bit logical right shift of the upper 8 bytes -- helper not shown
dec hl
call rr64 ;presumably rotates the carried-out bit down through the lower 8 bytes
ld a,b
;NOTE(review): no return is visible after this point -- confirm.
;zero+x => x for all x
;NaN +x => NaN for all x
;inf-inf=> NaN
;inf +x => inf, x != inf
;x+zero => x
;x+inf => inf
;x+NaN => NaN
;Zero result: presumably the `add_zero_fma` jump target (label line not
;visible in this view -- confirm).
;Clears the top mantissa byte (var_z+15) and the exponent/sign word (var_z+16).
xor a
ld (var_z+15),a
ld h,a
ld l,a
ld (var_z+16),hl
;NOTE(review): no return is visible after this store -- confirm.
;Overflow result: presumably the `add_inf_fma` jump target (label line not
;visible in this view -- confirm).
;Clears the top mantissa byte (var_z+15) and writes $FFFF to the exponent/sign
;word (var_z+16).
xor a
ld (var_z+15),a
dec a ;A = $FF
ld h,a
ld l,a
ld (var_z+16),hl
;NOTE(review): the routine may continue past the end of this view; no return
;is visible here -- confirm.