Of course, a full 256-by-128-bit division is not required, because the lower half of x<<128 is zero anyway (so we might as well start out with "remainder = x, result = 0")
The code for that is this disaster:
Code: Select all
longdiv:
; divides big endian int128 at hl * 1<<128 by big endian int128 at de
; result at $8000 (big endian)
; returns with Carry = true if the result overflows (result must then hold x<<127 / y)
; returns with Carry = false otherwise (result must hold x<<128 / y)
.define _result $8000
.define _divisor $8020
push de
; copy dividend into upper half of result (odd but correct since the lower half is 0)
ld de,_result
ld bc,16
ldir
; copy divisor
pop hl
ld de,_divisor
ld bc,16
ldir
; zero lower half of result
ld hl,_result+16
ld (hl),0
ld de,_result+17
ld bc,15
ldir
; actual division
ld c,129
;reset carry because on the first iteration
;there hasn't been a leftshift yet that could carry
;so we must check whether the high half of the result is smaller than the divisor
or a
;also make hl point to _result because we skip the first leftshift
ld hl,_result-1
jr _entrance
_divloop:
; shift result left
ld hl,_result+31
ld b,l ;b = 31
; optimized first iteration
sla (hl)
-: dec hl
rl (hl)
djnz {-}
dec hl
; entrance here to skip first leftshift
_entrance:
; if the left shift overflowed, the high half of the result
; is definitely NOT smaller than the divisor, so skip the comparison
jr c,_dosubtract
; hl -> _result-1
ld de,_divisor-1
ld b,16
-: inc hl
inc de
ld a,(de)
cp (hl)
jr z,{+}
jr nc,_skipsubtract ; (result >> 128) > divisor
jr _dosubtract ; (result >> 128) < divisor
; loop while equal
+: djnz {-}
; if equal, no need to subtract
jr _skipsubtract
_dosubtract:
; increment result
; it was leftshifted so it can't carry
ld hl,_result+31
inc (hl)
; subtract divisor from upper half of result
ld de,_result+15
ld hl,_divisor+15
ld b,e ; b = 15
ld a,(de)
sub (hl)
ld (de),a
-: dec de
dec hl
ld a,(de)
sbc a,(hl)
ld (de),a
djnz {-}
_skipsubtract:
dec c
jr z,_exit
ld a,(_result+16)
rla
jr nc,_divloop
ret
_exit:
or a
ret
But I think there should be faster ways to do this - are there any? Or does it have to be this algorithm, but more optimized?