From b3605f4a04f86c8c3543f616471efca22675a965 Mon Sep 17 00:00:00 2001 From: Wesley Moore <wes@wezm.net> Date: Sat, 15 Feb 2025 19:45:25 +1000 Subject: [PATCH] Implement mul, div, divmod, clz --- Makefile | 19 ++++++- math.s | 127 +++++++++++++++++++++++++++++++++++++++++--- tests/math_add64.s | 10 ++++ tests/math_clz.s | 63 ++++++++++++++++++++++ tests/math_div.s | 62 +++++++++++++++++++++ tests/math_divmod.s | 83 +++++++++++++++++++++++++++++ tests/math_mod.s | 66 +++++++++++++++++++++++ tests/math_mul.s | 66 +++++++++++++++++++++++ tests/test_math.sh | 56 +++++++++++++++++++ 9 files changed, 545 insertions(+), 7 deletions(-) create mode 100644 tests/math_clz.s create mode 100644 tests/math_div.s create mode 100644 tests/math_divmod.s create mode 100644 tests/math_mod.s create mode 100644 tests/math_mul.s diff --git a/Makefile b/Makefile index b610fcd..426aaf6 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,11 @@ LD=riscv64-unknown-elf-ld export JQ?=jaq export QEMU?=qemu-riscv32 +TEST_SRC := $(shell find tests -name '*.s') + +# Replace .s with .elf +TESTS := $(patsubst %.s,%.elf,$(TEST_SRC)) + all: calc.elf check: tests @@ -20,7 +25,7 @@ hello.elf: hello.o calc.elf: mem.o hex.o debug.o math.o calc.o $(LD) -m elf32lriscv -T link.ld $^ -o $@ -tests: tests/btohex.elf tests/tohex.elf tests/math_add64.elf tests/math_mul.elf +tests: $(TESTS) tests/math_add64.elf: hex.o math.o tests/math_add64.o $(LD) -m elf32lriscv -T link.ld $^ -o $@ @@ -28,6 +33,18 @@ tests/math_add64.elf: hex.o math.o tests/math_add64.o tests/math_mul.elf: hex.o math.o tests/math_mul.o $(LD) -m elf32lriscv -T link.ld $^ -o $@ +tests/math_div.elf: hex.o math.o tests/math_div.o + $(LD) -m elf32lriscv -T link.ld $^ -o $@ + +tests/math_divmod.elf: hex.o math.o tests/math_divmod.o + $(LD) -m elf32lriscv -T link.ld $^ -o $@ + +tests/math_mod.elf: hex.o math.o tests/math_mod.o + $(LD) -m elf32lriscv -T link.ld $^ -o $@ + +tests/math_clz.elf: hex.o math.o tests/math_clz.o + $(LD) -m elf32lriscv -T link.ld $^ -o $@ + tests/btohex.elf: mem.o hex.o debug.o tests/btohex.o $(LD) -m elf32lriscv -T link.ld $^ -o $@ diff --git a/math.s b/math.s index 9e2b2d1..2b73208 100644 --- a/math.s +++ b/math.s @@ -1,5 +1,8 @@ .globl add64 .globl mul +.globl div +.globl divmod +.globl clz .text @@ -36,17 +39,129 @@ mul: mv t0, a1 # Save multiplier in t0 li a1, 0 # Initialize product in a1 -multiply_loop: - beqz t0, done # If multiplier is 0, we're done +.multiply_loop: + beqz t0, .done # If multiplier is 0, we're done andi t1, t0, 1 # Check least significant bit - beqz t1, shift # If LSB is 0, skip addition + beqz t1, .shift # If LSB is 0, skip addition add a1, a1, a0 # Add multiplicand to product -shift: +.shift: slli a0, a0, 1 # Shift multiplicand left srli t0, t0, 1 # Shift multiplier right - j multiply_loop # Continue loop + j .multiply_loop # Continue loop -done: +.done: mv a0, a1 # Move product to return register ret + +# 32-bit shift-subtract integer division +# arguments: +# a0: dividend, u +# a1: divisor, v +# return: +# a0 = a0 ÷ a1 +# +# https://blog.segger.com/algorithms-for-division-part-2-classics/ +div: + bltu a0, a1, .zero # if (u < v) return 0 + addi sp, sp, -16 + sw s1, 4(sp) + mv s1, a0 + mv a0, a1 + sw ra, 12(sp) + sw s0, 8(sp) + mv s0, a1 + jal clz # clz(u) + sw a0, 0(sp) + mv a0, s1 + jal clz # clz(v) + lw a5, 0(sp) + sub a5, a5, a0 # k = clz(v) - clz(u); Calculate number of quotient digits - 1 + sll a1, s0, a5 # v <<= k; Normalize divisor + li a0, 0 # q = 0; Init quotient + + # Iterate k+1 times, each iteration developing one quotient bit. +.loop: + slli a0, a0, 1 # q <<= 1; Record preliminary '0' quotient digit + bltu s1, a1, .skip # if (u >= v) Subtraction will succeed... + sub s1, s1, a1 # u -= v; + addi a0, a0, 1 # q += 1; Turn preliminary '0' quotient digit to '1' +.skip: + addi a5, a5, -1 # k -= 1; + srli a1, a1, 1 # v >>= 1; + bgez a5, .loop # while (k >= 0); + + lw ra, 12(sp) + lw s0, 8(sp) + lw s1, 4(sp) + addi sp, sp, 16 + ret + +.zero: + li a0, 0 + ret + +# 32-bit integer division with modulus (remainder) +# arguments: +# a0: dividend, u +# a1: divisor, v +# return: +# a0 = a0 ÷ a1 +# a1 = remainder +divmod: + # call div; multiply quotient by divisor; subtract that from dividend + addi sp, sp, -16 + sw ra, 12(sp) + sw s0, 8(sp) + sw s1, 4(sp) + mv s0, a0 # save dividend + mv s1, a1 # save divisor + jal div # a0 = a0 ÷ a1 + sw a0, 0(sp) + mv a1, a0 # a1 = quotient + mv a0, s1 # a0 = divisor + jal mul # a0 = divisor × quotient + lw ra, 12(sp) + sub a1, s0, a0 # a1 = dividend - product; remainder + lw s0, 8(sp) + lw s1, 4(sp) + lw a0, 0(sp) # a0 = quotient + addi sp, sp, 16 + ret + +# count leading zero bits +# arguments: +# a0: input +# return: +# a0 = count of leading zero bits +# +# binary search approach translated from C code on +# https://blog.stephencleary.com/2010/10/implementing-gccs-builtin-functions.html +clz: + li a4, 16 # initialise count of zeros to 16 + srli a5, a0, 16 # shift value right 16 bits + bne a5, zero, .eight # if the result is != 0 we have up to 16 leading zeros + mv a5, a0 # restore unshifted value to a5 + li a4, 32 # we have up to 32 leading zeros +.eight: + srli a3, a5, 8 # shift the value right 8 bits + beq a3, zero, .four + addi a4, a4, -8 # subtract 8 leading zeros if shift result was non-zero + mv a5, a3 +.four: + srli a3, a5, 4 # shift the value right 4 bits + beq a3, zero, .two + addi a4, a4, -4 # subtract 4 leading zeros if shift result was non-zero + mv a5, a3 +.two: + srli a3, a5, 2 # shift the value right 2 bits + beq a3, zero, .one + addi a4, a4, -2 # subtract 2 leading zeros if shift result was non-zero + mv a5, a3 +.one: + srli a3, a5, 1 # shift the value right 1 bit + sub a0, a4, a5 # a0 = count - remaining value + beq a3, zero, .end # if shift result was zero, return a0 + addi a0, a4, -2 # subtract 2 leading zeros if shift result was non-zero +.end: + ret diff --git a/tests/math_add64.s b/tests/math_add64.s index 693688a..775eeec 100644 --- a/tests/math_add64.s +++ b/tests/math_add64.s @@ -23,6 +23,16 @@ inputs: .word 1 # 1 .word 0x80000000 # 0.5 .word 2 # 2 + + .word 0x40000000 # 0.25 + .word -1 # -1 + .word 0x80000000 # 0.5 + .word 2 # 2 + + .word 0x40000000 # 0.25 + .word 1 # 1 + .word 0x80000000 # 0.5 + .word -2 # -2 inputs_end: # llvm doesn't like this: error: expected relocatable expression #.set inputs_end, .-inputs diff --git a/tests/math_clz.s b/tests/math_clz.s new file mode 100644 index 0000000..68b8a70 --- /dev/null +++ b/tests/math_clz.s @@ -0,0 +1,63 @@ +# Test for clz + +.org 0 +# Provide program starting address to linker +.global _start + +.extern clz +.extern tohex + +/* newlib system calls */ +.set SYSEXIT, 93 +.set SYSWRITE, 64 + +.section .rodata + +inputs: + .word 0 + .word 1 + .word 2 + .word 3 + .word 0xF + .word 0xFF + .word 0xFFF + .word 0xFFFF + .word 0xFFFFF + .word 0xFFFFFF + .word 0xFFFFFFF + .word 0xFFFFFFFF + .word 0x80000000 + .word 0x00008000 +inputs_end: + +.section .bss + +buf: .skip 9 + +.text + +_start: + li a0, '\n' + la a1, buf + sb a0, 8(a1) # append newline to buf + + la s0, inputs # init loop variables + la s1, inputs_end +loop: + lw a0, 0(s0) # input value + jal clz + la a1, buf + jal tohex + + li t0, SYSWRITE # "write" syscall + li a0, 1 # 1 = standard output (stdout) + la a1, buf # load address of output string + li a2, 9 # length of output string + ecall # invoke syscall to print the string + + addi s0, s0, 4 # increment input pointer to the next input + bltu s0, s1, loop # if the input address is less than inputs_end, loop + + li t0, SYSEXIT # "exit" syscall + la a0, 0 # Use 0 return code + ecall # invoke syscall to terminate the program diff --git a/tests/math_div.s b/tests/math_div.s new file mode 100644 index 0000000..3fa680f --- /dev/null +++ b/tests/math_div.s @@ -0,0 +1,62 @@ +# Test for div + +.org 0 +# Provide program starting address to linker +.global _start + +.extern div +.extern tohex + +/* newlib system calls */ +.set SYSEXIT, 93 +.set SYSWRITE, 64 + +.section .rodata + +inputs: + .word 3 + .word 5 + + .word 21 + .word 3 + + .word 0xFFFF + .word 100 + + .word -7 + .word 3 +inputs_end: + +.section .bss + +buf: .skip 9 + +.text + +_start: + li a0, '\n' + la a1, buf + sb a0, 8(a1) # append newline to buf + + la s0, inputs # init loop variables + la s1, inputs_end +loop: + lw a0, 0(s0) # dividend + lw a1, 4(s0) # divisor + jal div + # TODO: Format as an actual decimal + la a1, buf + jal tohex + + li t0, SYSWRITE # "write" syscall + li a0, 1 # 1 = standard output (stdout) + la a1, buf # load address of output string + li a2, 9 # length of output string + ecall # invoke syscall to print the string + + addi s0, s0, 8 # increment input pointer to next pair of inputs + bltu s0, s1, loop # if the input address is less than inputs_end, loop + + li t0, SYSEXIT # "exit" syscall + la a0, 0 # Use 0 return code + ecall # invoke syscall to terminate the program diff --git a/tests/math_divmod.s b/tests/math_divmod.s new file mode 100644 index 0000000..f8f6298 --- /dev/null +++ b/tests/math_divmod.s @@ -0,0 +1,83 @@ +# Test for divmod + +.org 0 +# Provide program starting address to linker +.global _start + +.extern div +.extern tohex + +/* newlib system calls */ +.set SYSEXIT, 93 +.set SYSWRITE, 64 + +.section .rodata + +inputs: + .word 3 + .word 5 + + .word 21 + .word 3 + + .word 0xFFFF + .word 100 + + # Division of this one doesn't work yet + # .word -7 + # .word 3 +inputs_end: + +.section .bss + +buf: .skip 9 + +.text + +_start: + li a0, '\n' + la a1, buf + sb a0, 8(a1) # append newline to buf + + la s0, inputs # init loop variables +loop: + lw a0, 0(s0) # dividend + lw a1, 4(s0) # divisor + jal divmod + mv s1, a1 # save remainder + + # print quotient + la a1, buf + jal tohex + li t0, SYSWRITE # "write" syscall + li a0, 1 # 1 = standard output (stdout) + la a1, buf # load address of output string + li a2, 8 # length of output string + ecall # invoke syscall to print the string + + # print , + li t0, SYSWRITE # "write" syscall + li a0, 1 # 1 = standard output (stdout) + la a1, buf # load address of output string + li a2, ',' + sb a2, 0(a1) + li a2, 1 # length of output string + ecall # invoke syscall to print the string + + # print remainder + mv a0, s1 # restore remainder + la a1, buf + jal tohex + li t0, SYSWRITE # "write" syscall + li a0, 1 # 1 = standard output (stdout) + la a1, buf # load address of output string + li a2, 9 # length of output string + ecall # invoke syscall to print the string + + addi s0, s0, 8 # increment input pointer to next pair of inputs + la a0, inputs_end + bltu s0, a0, loop # if the input address is less than inputs_end, loop + + li t0, SYSEXIT # "exit" syscall + la a0, 0 # Use 0 return code + ecall # invoke syscall to terminate the program diff --git a/tests/math_mod.s b/tests/math_mod.s new file mode 100644 index 0000000..7abcbdb --- /dev/null +++ b/tests/math_mod.s @@ -0,0 +1,66 @@ +# Test for mul + +.org 0 +# Provide program starting address to linker +.global _start + +.extern mul +.extern tohex + +/* newlib system calls */ +.set SYSEXIT, 93 +.set SYSWRITE, 64 + +.section .rodata + +inputs: + .word 3 + .word 5 + + .word 21 + .word 3 + + .word 0xFFFF + .word 100 + + .word -7 + .word 3 +inputs_end: + # llvm doesn't like this: error: expected relocatable expression + #.set inputs_end, .-inputs + # turns out it was right. That was calculating a length, which was + # incorrect for how it was used for looping. + +.section .bss + +buf: .skip 9 + +.text + +_start: + li a0, '\n' + la a1, buf + sb a0, 8(a1) # append newline to buf + + la s0, inputs # init loop variables + la s1, inputs_end +loop: + lw a0, 0(s0) # multiplicand + lw a1, 4(s0) # multiplier + jal mul + # TODO: Format as an actual decimal + la a1, buf + jal tohex + + li t0, SYSWRITE # "write" syscall + li a0, 1 # 1 = standard output (stdout) + la a1, buf # load address of output string + li a2, 9 # length of output string + ecall # invoke syscall to print the string + + addi s0, s0, 8 # increment input pointer to next pair of 64-bit inputs + bltu s0, s1, loop # if the input address is less than inputs_end, loop + + li t0, SYSEXIT # "exit" syscall + la a0, 0 # Use 0 return code + ecall # invoke syscall to terminate the program diff --git a/tests/math_mul.s b/tests/math_mul.s new file mode 100644 index 0000000..7abcbdb --- /dev/null +++ b/tests/math_mul.s @@ -0,0 +1,66 @@ +# Test for mul + +.org 0 +# Provide program starting address to linker +.global _start + +.extern mul +.extern tohex + +/* newlib system calls */ +.set SYSEXIT, 93 +.set SYSWRITE, 64 + +.section .rodata + +inputs: + .word 3 + .word 5 + + .word 21 + .word 3 + + .word 0xFFFF + .word 100 + + .word -7 + .word 3 +inputs_end: + # llvm doesn't like this: error: expected relocatable expression + #.set inputs_end, .-inputs + # turns out it was right. That was calculating a length, which was + # incorrect for how it was used for looping. + +.section .bss + +buf: .skip 9 + +.text + +_start: + li a0, '\n' + la a1, buf + sb a0, 8(a1) # append newline to buf + + la s0, inputs # init loop variables + la s1, inputs_end +loop: + lw a0, 0(s0) # multiplicand + lw a1, 4(s0) # multiplier + jal mul + # TODO: Format as an actual decimal + la a1, buf + jal tohex + + li t0, SYSWRITE # "write" syscall + li a0, 1 # 1 = standard output (stdout) + la a1, buf # load address of output string + li a2, 9 # length of output string + ecall # invoke syscall to print the string + + addi s0, s0, 8 # increment input pointer to next pair of 64-bit inputs + bltu s0, s1, loop # if the input address is less than inputs_end, loop + + li t0, SYSEXIT # "exit" syscall + la a0, 0 # Use 0 return code + ecall # invoke syscall to terminate the program diff --git a/tests/test_math.sh b/tests/test_math.sh index 9897d7c..8a0d8ed 100644 --- a/tests/test_math.sh +++ b/tests/test_math.sh @@ -9,6 +9,8 @@ test_add64() { expected=$(cat << END 00000003.80000000 00000003.C0000000 +00000001.C0000000 +FFFFFFFF.C0000000 END ) @@ -31,3 +33,57 @@ END # different inputs. test $? -eq 0 && test "$result" = "$expected" } + +test_div() { + result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_div.elf) + expected=$(cat << END +00000000 +00000007 +0000028F +55555553 +END +) + + # TODO: Ideally this test would allow calling the binary repeatedly with + # different inputs. + test $? -eq 0 && test "$result" = "$expected" +} + +test_divmod() { + result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_divmod.elf) + expected=$(cat << END +00000000,00000003 +00000007,00000000 +0000028F,00000023 +END +) + + # TODO: Ideally this test would allow calling the binary repeatedly with + # different inputs. + test $? -eq 0 && test "$result" = "$expected" +} + +test_clz() { + result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_clz.elf) + expected=$(cat << END +00000020 +0000001F +0000001E +0000001E +0000001C +00000018 +00000014 +00000010 +0000000C +00000008 +00000004 +00000000 +00000000 +00000010 +END +) + + # TODO: Ideally this test would allow calling the binary repeatedly with + # different inputs. + test $? -eq 0 && test "$result" = "$expected" +}