Implement mul, div, divmod, clz

This commit is contained in:
Wesley Moore 2025-02-15 19:45:25 +10:00
parent 10bafd8485
commit b3605f4a04
No known key found for this signature in database
9 changed files with 545 additions and 7 deletions

View file

@ -6,6 +6,11 @@ LD=riscv64-unknown-elf-ld
export JQ?=jaq
export QEMU?=qemu-riscv32
TEST_SRC := $(shell find tests -name '*.s')
# Replace .s with .elf
TESTS := $(patsubst %.s,%.elf,$(TEST_SRC))
all: calc.elf
check: tests
@ -20,7 +25,7 @@ hello.elf: hello.o
calc.elf: mem.o hex.o debug.o math.o calc.o
$(LD) -m elf32lriscv -T link.ld $^ -o $@
tests: tests/btohex.elf tests/tohex.elf tests/math_add64.elf tests/math_mul.elf
tests: $(TESTS)
tests/math_add64.elf: hex.o math.o tests/math_add64.o
$(LD) -m elf32lriscv -T link.ld $^ -o $@
@ -28,6 +33,18 @@ tests/math_add64.elf: hex.o math.o tests/math_add64.o
tests/math_mul.elf: hex.o math.o tests/math_mul.o
$(LD) -m elf32lriscv -T link.ld $^ -o $@
# Link rules for the math test programs added alongside div, divmod and clz.
# Each ELF is linked from the shared hex.o and math.o objects plus the test's
# own object file ($^ = every prerequisite, $@ = the target), using link.ld.
tests/math_div.elf: hex.o math.o tests/math_div.o
$(LD) -m elf32lriscv -T link.ld $^ -o $@
tests/math_divmod.elf: hex.o math.o tests/math_divmod.o
$(LD) -m elf32lriscv -T link.ld $^ -o $@
tests/math_mod.elf: hex.o math.o tests/math_mod.o
$(LD) -m elf32lriscv -T link.ld $^ -o $@
tests/math_clz.elf: hex.o math.o tests/math_clz.o
$(LD) -m elf32lriscv -T link.ld $^ -o $@
tests/btohex.elf: mem.o hex.o debug.o tests/btohex.o
$(LD) -m elf32lriscv -T link.ld $^ -o $@

127
math.s
View file

@ -1,5 +1,8 @@
.globl add64
.globl mul
.globl div
.globl divmod
.globl clz
.text
@ -36,17 +39,129 @@ mul:
mv t0, a1 # Save multiplier in t0
li a1, 0 # Initialize product in a1
multiply_loop:
beqz t0, done # If multiplier is 0, we're done
.multiply_loop:
beqz t0, .done # If multiplier is 0, we're done
andi t1, t0, 1 # Check least significant bit
beqz t1, shift # If LSB is 0, skip addition
beqz t1, .shift # If LSB is 0, skip addition
add a1, a1, a0 # Add multiplicand to product
shift:
.shift:
slli a0, a0, 1 # Shift multiplicand left
srli t0, t0, 1 # Shift multiplier right
j multiply_loop # Continue loop
j .multiply_loop # Continue loop
done:
.done:
mv a0, a1 # Move product to return register
ret
# 32-bit unsigned shift-subtract integer division
# arguments:
#   a0: dividend, u
#   a1: divisor, v
# return:
#   a0 = u ÷ v (unsigned, truncated)
#   a0 = 0xFFFFFFFF when v == 0, the same all-ones quotient the RISC-V DIVU
#        instruction produces for division by zero
# clobbers: a1, a5, plus whatever clz clobbers
# stack: 16 bytes (ra, s0, s1 and one spill slot); none on the early exits
#
# https://blog.segger.com/algorithms-for-division-part-2-classics/
div:
        beqz    a1, .Ldiv_by_zero       # v == 0: the loop below would produce garbage
        bltu    a0, a1, .Ldiv_zero      # if (u < v) return 0

        addi    sp, sp, -16
        sw      ra, 12(sp)
        sw      s0, 8(sp)
        sw      s1, 4(sp)
        mv      s1, a0                  # s1 = u, kept across the clz calls
        mv      s0, a1                  # s0 = v, kept across the clz calls

        mv      a0, a1
        jal     clz                     # a0 = clz(v)
        sw      a0, 0(sp)               # spill clz(v) across the second call
        mv      a0, s1
        jal     clz                     # a0 = clz(u)
        lw      a5, 0(sp)
        sub     a5, a5, a0              # k = clz(v) - clz(u); number of quotient digits - 1
                                        # u >= v > 0 here, so 0 <= k <= 31
        sll     a1, s0, a5              # v <<= k; normalize divisor
        li      a0, 0                   # q = 0; init quotient

        # Iterate k+1 times, each iteration developing one quotient bit.
.Ldiv_loop:
        slli    a0, a0, 1               # q <<= 1; record preliminary '0' quotient digit
        bltu    s1, a1, .Ldiv_skip      # if (u >= v) subtraction will succeed...
        sub     s1, s1, a1              # u -= v;
        addi    a0, a0, 1               # q += 1; turn preliminary '0' digit into '1'
.Ldiv_skip:
        addi    a5, a5, -1              # k -= 1;
        srli    a1, a1, 1               # v >>= 1;
        bgez    a5, .Ldiv_loop          # while (k >= 0);

        lw      ra, 12(sp)
        lw      s0, 8(sp)
        lw      s1, 4(sp)
        addi    sp, sp, 16
        ret

.Ldiv_zero:
        li      a0, 0                   # u < v: quotient is 0
        ret

.Ldiv_by_zero:
        li      a0, -1                  # all ones, matching DIVU's divide-by-zero result
        ret
# 32-bit unsigned division that also yields the remainder
# arguments:
#   a0: dividend, u
#   a1: divisor, v
# return:
#   a0 = u ÷ v
#   a1 = u - v × (u ÷ v), i.e. the remainder
# stack: 16 bytes (ra, s0, s1; the lowest word is unused padding)
divmod:
        # Strategy: quotient from div, product from mul, remainder by subtraction.
        addi    sp, sp, -16
        sw      ra, 12(sp)
        sw      s0, 8(sp)
        sw      s1, 4(sp)

        mv      s0, a0                  # s0 = dividend, needed after both calls
        mv      s1, a1                  # s1 = divisor, needed until mul is set up
        jal     div                     # a0 = quotient

        mv      a1, a0                  # multiplier   = quotient
        mv      a0, s1                  # multiplicand = divisor
        mv      s1, a1                  # divisor no longer needed: s1 = quotient
        jal     mul                     # a0 = divisor × quotient

        sub     a1, s0, a0              # a1 = dividend - product = remainder
        mv      a0, s1                  # a0 = quotient

        lw      s1, 4(sp)
        lw      s0, 8(sp)
        lw      ra, 12(sp)
        addi    sp, sp, 16
        ret
# Count leading zero bits of a 32-bit word
# arguments:
#   a0: input
# return:
#   a0 = number of leading zero bits (32 for an input of 0)
# clobbers: a3, a4, a5
#
# Binary search translated from the C code on
# https://blog.stephencleary.com/2010/10/implementing-gccs-builtin-functions.html
#
# Invariant: a4 is the answer assuming a5 == 0, and a5 holds the bits still to
# be examined. Each step halves the window: if the upper half of the window is
# non-zero, keep that half and lower the count by the window's half width.
clz:
        li      a4, 32                  # count if every bit turns out to be zero
        mv      a5, a0                  # window = whole word
        srli    a3, a0, 16              # upper 16 bits
        beqz    a3, .Lclz_8
        li      a4, 16                  # a bit is set in the upper half: at most 15 zeros remain
        mv      a5, a3
.Lclz_8:
        srli    a3, a5, 8               # upper 8 bits of the window
        beqz    a3, .Lclz_4
        addi    a4, a4, -8
        mv      a5, a3
.Lclz_4:
        srli    a3, a5, 4               # upper 4 bits of the window
        beqz    a3, .Lclz_2
        addi    a4, a4, -4
        mv      a5, a3
.Lclz_2:
        srli    a3, a5, 2               # upper 2 bits of the window
        beqz    a3, .Lclz_1
        addi    a4, a4, -2
        mv      a5, a3
.Lclz_1:
        # a5 is now 0..3. Values 2 and 3 have their top bit set: count - 2.
        # Values 0 and 1 contribute exactly a5 non-zero bits: count - a5.
        srli    a3, a5, 1
        addi    a0, a4, -2
        bnez    a3, .Lclz_done
        sub     a0, a4, a5
.Lclz_done:
        ret

View file

@ -23,6 +23,16 @@ inputs:
.word 1 # 1
.word 0x80000000 # 0.5
.word 2 # 2
.word 0x40000000 # 0.25
.word -1 # -1
.word 0x80000000 # 0.5
.word 2 # 2
.word 0x40000000 # 0.25
.word 1 # 1
.word 0x80000000 # 0.5
.word -2 # -2
inputs_end:
# llvm doesn't like this: error: expected relocatable expression
#.set inputs_end, .-inputs

63
tests/math_clz.s Normal file
View file

@ -0,0 +1,63 @@
# Test for clz
#
# Calls clz on every word of `inputs`, formats each result with tohex and
# writes it to stdout, one line per input.
.org 0

# Provide program starting address to linker
.global _start

.extern clz
.extern tohex

/* newlib system calls */
.equ SYSEXIT,    93
.equ SYSWRITE,   64

.equ STDOUT,     1                      # file descriptor written to
.equ HEX_DIGITS, 8                      # offset of the newline inside buf
.equ LINE_LEN,   9                      # bytes written per result

.section .rodata
inputs:
        .word 0, 1, 2, 3
        .word 0xF, 0xFF, 0xFFF, 0xFFFF
        .word 0xFFFFF, 0xFFFFFF, 0xFFFFFFF, 0xFFFFFFFF
        .word 0x80000000
        .word 0x00008000
inputs_end:

.section .bss
buf:    .skip LINE_LEN

.text
_start:
        # The newline is written once; presumably tohex only touches the
        # bytes before it -- confirm against hex.s.
        li      a0, '\n'
        la      a1, buf
        sb      a0, HEX_DIGITS(a1)

        la      s0, inputs              # s0 = cursor over the input words
        la      s1, inputs_end          # s1 = one past the last input

next_input:
        lw      a0, 0(s0)               # a0 = value under test
        jal     clz                     # a0 = leading zero count
        la      a1, buf
        jal     tohex                   # format a0 into buf

        # NOTE(review): the syscall number is passed in t0 rather than a7;
        # that looks like the RV32E convention -- confirm against the target.
        li      t0, SYSWRITE
        li      a0, STDOUT
        la      a1, buf
        li      a2, LINE_LEN
        ecall                           # write(stdout, buf, LINE_LEN)

        addi    s0, s0, 4               # advance to the next word
        bltu    s0, s1, next_input      # until the cursor reaches inputs_end

        li      t0, SYSEXIT
        li      a0, 0                   # exit status 0
        ecall                           # terminate the program

62
tests/math_div.s Normal file
View file

@ -0,0 +1,62 @@
# Test for div
#
# `inputs` holds (dividend, divisor) pairs. Each pair is passed to div and the
# quotient is written to stdout via tohex, one line per pair.
.org 0

# Provide program starting address to linker
.global _start

.extern div
.extern tohex

/* newlib system calls */
.equ SYSEXIT,    93
.equ SYSWRITE,   64

.equ STDOUT,     1                      # file descriptor written to
.equ HEX_DIGITS, 8                      # offset of the newline inside buf
.equ LINE_LEN,   9                      # bytes written per result
.equ PAIR_SIZE,  8                      # two 32-bit words per test case

.section .rodata
inputs:
        #     dividend, divisor
        .word 3,        5
        .word 21,       3
        .word 0xFFFF,   100
        .word -7,       3
inputs_end:

.section .bss
buf:    .skip LINE_LEN

.text
_start:
        # The newline is written once; presumably tohex only touches the
        # bytes before it -- confirm against hex.s.
        li      a0, '\n'
        la      a1, buf
        sb      a0, HEX_DIGITS(a1)

        la      s0, inputs              # s0 = cursor over the input pairs
        la      s1, inputs_end          # s1 = one past the last pair

next_pair:
        lw      a0, 0(s0)               # dividend
        lw      a1, 4(s0)               # divisor
        jal     div                     # a0 = quotient

        # TODO: Format as an actual decimal
        la      a1, buf
        jal     tohex                   # format a0 into buf

        # NOTE(review): the syscall number is passed in t0 rather than a7;
        # that looks like the RV32E convention -- confirm against the target.
        li      t0, SYSWRITE
        li      a0, STDOUT
        la      a1, buf
        li      a2, LINE_LEN
        ecall                           # write(stdout, buf, LINE_LEN)

        addi    s0, s0, PAIR_SIZE       # advance to the next pair
        bltu    s0, s1, next_pair       # until the cursor reaches inputs_end

        li      t0, SYSEXIT
        li      a0, 0                   # exit status 0
        ecall                           # terminate the program

83
tests/math_divmod.s Normal file
View file

@ -0,0 +1,83 @@
# Test for divmod
#
# `inputs` holds (dividend, divisor) pairs. Each pair is passed to divmod and
# the result is written to stdout as "<quotient>,<remainder>\n", both values
# formatted by tohex.
.org 0

# Provide program starting address to linker
.global _start

.extern divmod                          # the routine actually called below
.extern tohex

/* newlib system calls */
.equ SYSEXIT,    93
.equ SYSWRITE,   64

.equ STDOUT,     1                      # file descriptor written to
.equ HEX_DIGITS, 8                      # digits per value; also the newline's offset in buf
.equ LINE_LEN,   9                      # digits plus trailing newline
.equ PAIR_SIZE,  8                      # two 32-bit words per test case

.section .rodata
inputs:
        #     dividend, divisor
        .word 3,        5
        .word 21,       3
        .word 0xFFFF,   100
        # Division of this one doesn't work yet
        # .word -7
        # .word 3
inputs_end:

.section .bss
buf:    .skip LINE_LEN

.text
_start:
        # The newline is written once; presumably tohex only touches the
        # bytes before it -- confirm against hex.s.
        li      a0, '\n'
        la      a1, buf
        sb      a0, HEX_DIGITS(a1)

        la      s0, inputs              # s0 = cursor over the input pairs

next_pair:
        lw      a0, 0(s0)               # dividend
        lw      a1, 4(s0)               # divisor
        jal     divmod                  # a0 = quotient, a1 = remainder
        mv      s1, a1                  # keep the remainder across tohex and the syscalls

        # print quotient, without the trailing newline
        # NOTE(review): the syscall number is passed in t0 rather than a7;
        # that looks like the RV32E convention -- confirm against the target.
        la      a1, buf
        jal     tohex
        li      t0, SYSWRITE
        li      a0, STDOUT
        la      a1, buf
        li      a2, HEX_DIGITS
        ecall

        # print , -- buf[0] is reused as scratch; the next tohex call rewrites it
        li      t0, SYSWRITE
        li      a0, STDOUT
        la      a1, buf
        li      a2, ','
        sb      a2, 0(a1)
        li      a2, 1
        ecall

        # print remainder followed by the newline
        mv      a0, s1
        la      a1, buf
        jal     tohex
        li      t0, SYSWRITE
        li      a0, STDOUT
        la      a1, buf
        li      a2, LINE_LEN
        ecall

        addi    s0, s0, PAIR_SIZE       # advance to the next pair
        la      a0, inputs_end          # s1 is busy holding the remainder, so reload the end
        bltu    s0, a0, next_pair       # until the cursor reaches inputs_end

        li      t0, SYSEXIT
        li      a0, 0                   # exit status 0
        ecall                           # terminate the program

66
tests/math_mod.s Normal file
View file

@ -0,0 +1,66 @@
# Test for mod (remainder)
#
# `inputs` holds (dividend, divisor) pairs. Each pair is passed to divmod and
# the remainder it returns in a1 is written to stdout via tohex, one line per
# pair. (This file previously duplicated the mul test and never computed a
# modulus.)
.org 0

# Provide program starting address to linker
.global _start

.extern divmod
.extern tohex

/* newlib system calls */
.equ SYSEXIT,    93
.equ SYSWRITE,   64

.equ STDOUT,     1                      # file descriptor written to
.equ HEX_DIGITS, 8                      # offset of the newline inside buf
.equ LINE_LEN,   9                      # bytes written per result
.equ PAIR_SIZE,  8                      # two 32-bit words per test case

.section .rodata
inputs:
        #     dividend, divisor
        .word 3,        5
        .word 21,       3
        .word 0xFFFF,   100
        .word -7,       3
inputs_end:
# llvm doesn't like this: error: expected relocatable expression
#.set inputs_end, .-inputs
# turns out it was right. That was calculating a length, which was
# incorrect for how it was used for looping.

.section .bss
buf:    .skip LINE_LEN

.text
_start:
        # The newline is written once; presumably tohex only touches the
        # bytes before it -- confirm against hex.s.
        li      a0, '\n'
        la      a1, buf
        sb      a0, HEX_DIGITS(a1)

        la      s0, inputs              # s0 = cursor over the input pairs
        la      s1, inputs_end          # s1 = one past the last pair

next_pair:
        lw      a0, 0(s0)               # dividend
        lw      a1, 4(s0)               # divisor
        jal     divmod                  # a0 = quotient, a1 = remainder
        mv      a0, a1                  # only the remainder is reported

        # TODO: Format as an actual decimal
        la      a1, buf
        jal     tohex                   # format a0 into buf

        # NOTE(review): the syscall number is passed in t0 rather than a7;
        # that looks like the RV32E convention -- confirm against the target.
        li      t0, SYSWRITE
        li      a0, STDOUT
        la      a1, buf
        li      a2, LINE_LEN
        ecall                           # write(stdout, buf, LINE_LEN)

        addi    s0, s0, PAIR_SIZE       # advance to the next pair of 32-bit inputs
        bltu    s0, s1, next_pair       # until the cursor reaches inputs_end

        li      t0, SYSEXIT
        li      a0, 0                   # exit status 0
        ecall                           # terminate the program

66
tests/math_mul.s Normal file
View file

@ -0,0 +1,66 @@
# Test for mul
#
# `inputs` holds (multiplicand, multiplier) pairs. Each pair is passed to mul
# and the product is written to stdout via tohex, one line per pair.
.org 0

# Provide program starting address to linker
.global _start

.extern mul
.extern tohex

/* newlib system calls */
.equ SYSEXIT,    93
.equ SYSWRITE,   64

.equ STDOUT,     1                      # file descriptor written to
.equ HEX_DIGITS, 8                      # offset of the newline inside buf
.equ LINE_LEN,   9                      # bytes written per result
.equ PAIR_SIZE,  8                      # two 32-bit words per test case

.section .rodata
inputs:
        #     multiplicand, multiplier
        .word 3,            5
        .word 21,           3
        .word 0xFFFF,       100
        .word -7,           3
inputs_end:
# llvm doesn't like this: error: expected relocatable expression
#.set inputs_end, .-inputs
# turns out it was right. That was calculating a length, which was
# incorrect for how it was used for looping.

.section .bss
buf:    .skip LINE_LEN

.text
_start:
        # The newline is written once; presumably tohex only touches the
        # bytes before it -- confirm against hex.s.
        li      a0, '\n'
        la      a1, buf
        sb      a0, HEX_DIGITS(a1)

        la      s0, inputs              # s0 = cursor over the input pairs
        la      s1, inputs_end          # s1 = one past the last pair

next_pair:
        lw      a0, 0(s0)               # multiplicand
        lw      a1, 4(s0)               # multiplier
        jal     mul                     # a0 = product

        # TODO: Format as an actual decimal
        la      a1, buf
        jal     tohex                   # format a0 into buf

        # NOTE(review): the syscall number is passed in t0 rather than a7;
        # that looks like the RV32E convention -- confirm against the target.
        li      t0, SYSWRITE
        li      a0, STDOUT
        la      a1, buf
        li      a2, LINE_LEN
        ecall                           # write(stdout, buf, LINE_LEN)

        addi    s0, s0, PAIR_SIZE       # advance to the next pair of 32-bit inputs
        bltu    s0, s1, next_pair       # until the cursor reaches inputs_end

        li      t0, SYSEXIT
        li      a0, 0                   # exit status 0
        ecall                           # terminate the program

View file

@ -9,6 +9,8 @@ test_add64() {
expected=$(cat << END
00000003.80000000
00000003.C0000000
00000001.C0000000
FFFFFFFF.C0000000
END
)
@ -31,3 +33,57 @@ END
# different inputs.
test $? -eq 0 && test "$result" = "$expected"
}
# Run tests/math_div.elf under ${QEMU} and check that it exits successfully
# and prints the expected quotients, one 8-digit hex value per line.
# Returns 0 on success, non-zero otherwise.
test_div() {
    expected=$(cat << END
00000000
00000007
0000028F
55555553
END
)

    # Check the emulator's exit status at the point it is produced. Reading $?
    # after the `expected=` assignment would report cat's status instead.
    result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_div.elf) || return 1

    # TODO: Ideally this test would allow calling the binary repeatedly with
    # different inputs.
    test "$result" = "$expected"
}
# Run tests/math_divmod.elf under ${QEMU} and check that it exits successfully
# and prints the expected "quotient,remainder" lines.
# Returns 0 on success, non-zero otherwise.
test_divmod() {
    expected=$(cat << END
00000000,00000003
00000007,00000000
0000028F,00000023
END
)

    # Check the emulator's exit status at the point it is produced. Reading $?
    # after the `expected=` assignment would report cat's status instead.
    result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_divmod.elf) || return 1

    # TODO: Ideally this test would allow calling the binary repeatedly with
    # different inputs.
    test "$result" = "$expected"
}
# Run tests/math_clz.elf under ${QEMU} and check that it exits successfully
# and prints the expected leading-zero counts, one 8-digit hex value per line.
# Returns 0 on success, non-zero otherwise.
test_clz() {
    expected=$(cat << END
00000020
0000001F
0000001E
0000001E
0000001C
00000018
00000014
00000010
0000000C
00000008
00000004
00000000
00000000
00000010
END
)

    # Check the emulator's exit status at the point it is produced. Reading $?
    # after the `expected=` assignment would report cat's status instead.
    result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_clz.elf) || return 1

    # TODO: Ideally this test would allow calling the binary repeatedly with
    # different inputs.
    test "$result" = "$expected"
}