From b3605f4a04f86c8c3543f616471efca22675a965 Mon Sep 17 00:00:00 2001
From: Wesley Moore <wes@wezm.net>
Date: Sat, 15 Feb 2025 19:45:25 +1000
Subject: [PATCH] Implement mul, div, divmod, clz

---
 Makefile            |  19 ++++++-
 math.s              | 127 +++++++++++++++++++++++++++++++++++++++++---
 tests/math_add64.s  |  10 ++++
 tests/math_clz.s    |  63 ++++++++++++++++++++++
 tests/math_div.s    |  62 +++++++++++++++++++++
 tests/math_divmod.s |  83 +++++++++++++++++++++++++++++
 tests/math_mod.s    |  66 +++++++++++++++++++++++
 tests/math_mul.s    |  66 +++++++++++++++++++++++
 tests/test_math.sh  |  56 +++++++++++++++++++
 9 files changed, 545 insertions(+), 7 deletions(-)
 create mode 100644 tests/math_clz.s
 create mode 100644 tests/math_div.s
 create mode 100644 tests/math_divmod.s
 create mode 100644 tests/math_mod.s
 create mode 100644 tests/math_mul.s

diff --git a/Makefile b/Makefile
index b610fcd..426aaf6 100644
--- a/Makefile
+++ b/Makefile
@@ -6,6 +6,11 @@ LD=riscv64-unknown-elf-ld
 export JQ?=jaq
 export QEMU?=qemu-riscv32
 
+TEST_SRC := $(shell find tests -name '*.s')
+
+# Every tests/NAME.s source yields a tests/NAME.elf test binary
+TESTS := $(TEST_SRC:.s=.elf)
+
 all: calc.elf
 
 check: tests
@@ -20,7 +25,7 @@ hello.elf: hello.o
 calc.elf: mem.o hex.o debug.o math.o calc.o
 	$(LD) -m elf32lriscv -T link.ld $^ -o $@
 
-tests: tests/btohex.elf tests/tohex.elf tests/math_add64.elf tests/math_mul.elf
+tests: $(TESTS)
 
 tests/math_add64.elf: hex.o math.o tests/math_add64.o
 	$(LD) -m elf32lriscv -T link.ld $^ -o $@
@@ -28,6 +33,13 @@ tests/math_add64.elf: hex.o math.o tests/math_add64.o
 tests/math_mul.elf: hex.o math.o tests/math_mul.o
 	$(LD) -m elf32lriscv -T link.ld $^ -o $@
 
+# The remaining math tests all link the same support objects, so a single
+# static pattern rule replaces one copy-pasted rule per test binary.
+MATH_TESTS := tests/math_div.elf tests/math_divmod.elf tests/math_mod.elf tests/math_clz.elf
+
+$(MATH_TESTS): tests/%.elf: hex.o math.o tests/%.o
+	$(LD) -m elf32lriscv -T link.ld $^ -o $@
+
 tests/btohex.elf: mem.o hex.o debug.o tests/btohex.o
 	$(LD) -m elf32lriscv -T link.ld $^ -o $@
 
diff --git a/math.s b/math.s
index 9e2b2d1..2b73208 100644
--- a/math.s
+++ b/math.s
@@ -1,5 +1,8 @@
 .globl add64
 .globl mul
+.globl div
+.globl divmod
+.globl clz
 
 .text
 
@@ -36,17 +39,129 @@ mul:
     mv      t0, a1          # Save multiplier in t0
     li      a1, 0           # Initialize product in a1
 
-multiply_loop:
-    beqz    t0, done        # If multiplier is 0, we're done
+.multiply_loop:
+    beqz    t0, .done       # If multiplier is 0, we're done
     andi    t1, t0, 1       # Check least significant bit
-    beqz    t1, shift       # If LSB is 0, skip addition
+    beqz    t1, .shift      # If LSB is 0, skip addition
     add     a1, a1, a0      # Add multiplicand to product
 
-shift:
+.shift:
     slli    a0, a0, 1       # Shift multiplicand left
     srli    t0, t0, 1       # Shift multiplier right
-    j       multiply_loop   # Continue loop
+    j       .multiply_loop  # Continue loop
 
-done:
+.done:
     mv      a0, a1          # Move product to return register
     ret
+
+# 32-bit unsigned shift-subtract integer division
+#    arguments:
+#        a0: dividend, u
+#        a1: divisor, v
+#    return:
+#        a0 = a0 ÷ a1 (quotient only; 0 when u < v)
+#
+# https://blog.segger.com/algorithms-for-division-part-2-classics/
+div:
+    bltu    a0, a1, .zero    # if (u < v) return 0
+    addi    sp, sp, -16
+    sw      s1, 4(sp)
+    mv      s1, a0           # s1 = u; kept across the clz calls
+    mv      a0, a1           # first clz argument is the divisor
+    sw      ra, 12(sp)
+    sw      s0, 8(sp)
+    mv      s0, a1           # s0 = v; kept across the clz calls
+    jal     clz              # clz(v)
+    sw      a0, 0(sp)        # spill clz(v) to the stack
+    mv      a0, s1
+    jal     clz              # clz(u)
+    lw      a5, 0(sp)        # a5 = clz(v)
+    sub     a5, a5, a0       # k = clz(v) - clz(u); Calculate number of quotient digits - 1
+    sll     a1, s0, a5       # v <<= k;             Normalize divisor
+    li      a0, 0            # q = 0;               Init quotient
+
+    # Iterate k+1 times, each iteration developing one quotient bit.
+.loop:
+    slli    a0, a0, 1        # q <<= 1;             Record preliminary '0' quotient digit
+    bltu    s1, a1, .skip    # if (u >= v)          Subtraction will succeed...
+    sub     s1, s1, a1       # u -= v;
+    addi    a0, a0, 1        # q += 1;              Turn preliminary '0' quotient digit to '1'
+.skip:
+    addi    a5, a5, -1       # k -= 1;
+    srli    a1, a1, 1        # v >>= 1;
+    bgez    a5, .loop        # while (k >= 0);
+
+    lw      ra, 12(sp)
+    lw      s0, 8(sp)
+    lw      s1, 4(sp)
+    addi    sp, sp, 16
+    ret
+
+.zero:
+    li      a0, 0            # taken before any stack adjustment, so nothing to restore
+    ret
+
+# Unsigned 32-bit division yielding both quotient and remainder
+#    arguments:
+#        a0: dividend, u
+#        a1: divisor, v
+#    return:
+#        a0 = quotient, u ÷ v
+#        a1 = remainder, u - v × quotient
+divmod:
+    # remainder is derived as dividend - divisor × quotient
+    addi    sp, sp, -16
+    sw      s1, 4(sp)
+    sw      s0, 8(sp)
+    sw      ra, 12(sp)
+    mv      s1, a1      # s1 = divisor, preserved across calls
+    mv      s0, a0      # s0 = dividend, preserved across calls
+    jal     div         # a0 = quotient
+    mv      a1, a0      # second mul operand = quotient
+    sw      a0, 0(sp)   # spill quotient; mul overwrites a0
+    mv      a0, s1      # first mul operand = divisor
+    jal     mul         # a0 = divisor × quotient
+    sub     a1, s0, a0  # a1 = dividend - product; remainder
+    lw      a0, 0(sp)   # a0 = quotient
+    lw      s1, 4(sp)
+    lw      s0, 8(sp)
+    lw      ra, 12(sp)
+    addi    sp, sp, 16
+    ret
+
+# count leading zero bits
+#    arguments:
+#        a0: input
+#    return:
+#        a0 = count of leading zero bits (32 when the input is 0)
+#
+# binary search approach translated from C code on
+# https://blog.stephencleary.com/2010/10/implementing-gccs-builtin-functions.html
+clz:
+        li      a4, 16             # count = 16, assuming the upper half-word is non-zero
+        srli    a5, a0, 16         # a5 = upper 16 bits of the input
+        bne     a5, zero, .eight   # upper half non-zero: keep searching in it
+        mv      a5, a0             # upper half is zero: search the whole (16-bit) value
+        li      a4, 32             # and start the count from 32
+.eight:
+        srli    a3, a5, 8          # shift the value right 8 bits
+        beq     a3, zero, .four    # nothing above bit 7: keep count and value
+        addi    a4, a4, -8         # subtract 8 leading zeros if shift result was non-zero
+        mv      a5, a3             # continue with the shifted value
+.four:
+        srli    a3, a5, 4          # shift the value right 4 bits
+        beq     a3, zero, .two     # nothing above bit 3: keep count and value
+        addi    a4, a4, -4         # subtract 4 leading zeros if shift result was non-zero
+        mv      a5, a3             # continue with the shifted value
+.two:
+        srli    a3, a5, 2          # shift the value right 2 bits
+        beq     a3, zero, .one     # nothing above bit 1: keep count and value
+        addi    a4, a4, -2         # subtract 2 leading zeros if shift result was non-zero
+        mv      a5, a3             # continue with the shifted value
+.one:
+        srli    a3, a5, 1          # shift the value right 1 bit
+        sub     a0, a4, a5         # a0 = count - remaining value (value is 0 or 1 if a3 == 0)
+        beq     a3, zero, .end     # if shift result was zero, return a0
+        addi    a0, a4, -2         # otherwise the value is 2 or 3: return count - 2
+.end:
+        ret
diff --git a/tests/math_add64.s b/tests/math_add64.s
index 693688a..775eeec 100644
--- a/tests/math_add64.s
+++ b/tests/math_add64.s
@@ -23,6 +23,16 @@ inputs:
     .word 1            # 1
     .word 0x80000000   # 0.5
     .word 2            # 2
+
+    .word 0x40000000   # 0.25
+    .word -1           # -1
+    .word 0x80000000   # 0.5
+    .word 2            # 2
+
+    .word 0x40000000   # 0.25
+    .word 1            # 1
+    .word 0x80000000   # 0.5
+    .word -2           # -2
 inputs_end:
   # llvm doesn't like this: error: expected relocatable expression
   #.set inputs_end, .-inputs
diff --git a/tests/math_clz.s b/tests/math_clz.s
new file mode 100644
index 0000000..68b8a70
--- /dev/null
+++ b/tests/math_clz.s
@@ -0,0 +1,63 @@
+# Test for clz
+
+.org 0
+# Provide program starting address to linker
+.global _start
+
+.extern clz
+.extern tohex
+
+/* newlib system calls */
+.set SYSEXIT,  93
+.set SYSWRITE, 64
+
+.section .rodata
+
+inputs:
+    .word 0
+    .word 1
+    .word 2
+    .word 3
+    .word 0xF
+    .word 0xFF
+    .word 0xFFF
+    .word 0xFFFF
+    .word 0xFFFFF
+    .word 0xFFFFFF
+    .word 0xFFFFFFF
+    .word 0xFFFFFFFF
+    .word 0x80000000
+    .word 0x00008000
+inputs_end:
+
+.section .bss
+
+buf: .skip 9
+
+.text
+
+_start:
+    li a0, '\n'
+    la a1, buf
+    sb a0, 8(a1)        # append newline to buf (9th byte, after the hex digits)
+
+    la s0, inputs       # s0 = cursor over the input words
+    la s1, inputs_end   # s1 = one past the last input word
+loop:
+    lw a0, 0(s0)        # input value
+    jal clz             # a0 = number of leading zero bits
+    la a1, buf          # destination for tohex (presumably writes 8 hex digits)
+    jal tohex
+
+    li t0, SYSWRITE     # "write" syscall
+    li a0, 1            # 1 = standard output (stdout)
+    la a1, buf          # load address of output string
+    li a2, 9            # length of output string
+    ecall               # invoke syscall to print the string
+
+    addi s0, s0, 4      # increment input pointer to the next input
+    bltu s0, s1, loop   # if the input address is less than inputs_end, loop
+
+    li t0, SYSEXIT      # "exit" syscall
+    li a0, 0            # Use 0 return code (li: this is an immediate, not an address)
+    ecall               # invoke syscall to terminate the program
diff --git a/tests/math_div.s b/tests/math_div.s
new file mode 100644
index 0000000..3fa680f
--- /dev/null
+++ b/tests/math_div.s
@@ -0,0 +1,62 @@
+# Test for div
+
+.org 0
+# Provide program starting address to linker
+.global _start
+
+.extern div
+.extern tohex
+
+/* newlib system calls */
+.set SYSEXIT,  93
+.set SYSWRITE, 64
+
+.section .rodata
+
+inputs:
+    .word 3
+    .word 5
+
+    .word 21
+    .word 3
+
+    .word 0xFFFF
+    .word 100
+
+    .word -7
+    .word 3
+inputs_end:
+
+.section .bss
+
+buf: .skip 9
+
+.text
+
+_start:
+    li a0, '\n'
+    la a1, buf
+    sb a0, 8(a1)        # append newline to buf (9th byte, after the hex digits)
+
+    la s0, inputs       # s0 = cursor over the (dividend, divisor) pairs
+    la s1, inputs_end   # s1 = one past the last pair
+loop:
+    lw a0, 0(s0)        # dividend
+    lw a1, 4(s0)        # divisor
+    jal div             # a0 = quotient
+    # TODO: Format as an actual decimal
+    la a1, buf          # destination for tohex (presumably writes 8 hex digits)
+    jal tohex
+
+    li t0, SYSWRITE     # "write" syscall
+    li a0, 1            # 1 = standard output (stdout)
+    la a1, buf          # load address of output string
+    li a2, 9            # length of output string
+    ecall               # invoke syscall to print the string
+
+    addi s0, s0, 8      # increment input pointer to next pair of inputs
+    bltu s0, s1, loop   # if the input address is less than inputs_end, loop
+
+    li t0, SYSEXIT      # "exit" syscall
+    li a0, 0            # Use 0 return code (li: this is an immediate, not an address)
+    ecall               # invoke syscall to terminate the program
diff --git a/tests/math_divmod.s b/tests/math_divmod.s
new file mode 100644
index 0000000..f8f6298
--- /dev/null
+++ b/tests/math_divmod.s
@@ -0,0 +1,83 @@
+# Test for divmod
+
+.org 0
+# Provide program starting address to linker
+.global _start
+
+.extern divmod   # routine under test (this program calls divmod, not div)
+.extern tohex    # hex formatter, resolved at link time
+
+/* newlib system calls */
+.set SYSEXIT,  93
+.set SYSWRITE, 64
+
+.section .rodata
+
+inputs:
+    .word 3
+    .word 5
+
+    .word 21
+    .word 3
+
+    .word 0xFFFF
+    .word 100
+
+    # Division of this one doesn't work yet
+    # .word -7
+    # .word 3
+inputs_end:
+
+.section .bss
+
+buf: .skip 9
+
+.text
+
+_start:
+    li a0, '\n'
+    la a1, buf
+    sb a0, 8(a1)        # append newline to buf (9th byte, after the hex digits)
+
+    la s0, inputs       # s0 = cursor over the (dividend, divisor) pairs
+loop:
+    lw a0, 0(s0)        # dividend
+    lw a1, 4(s0)        # divisor
+    jal divmod          # a0 = quotient, a1 = remainder
+    mv s1, a1           # save remainder
+
+    # print quotient
+    la a1, buf
+    jal tohex
+    li t0, SYSWRITE     # "write" syscall
+    li a0, 1            # 1 = standard output (stdout)
+    la a1, buf          # load address of output string
+    li a2, 8            # length of output string (digits only, no newline)
+    ecall               # invoke syscall to print the string
+
+    # print ,
+    li t0, SYSWRITE     # "write" syscall
+    li a0, 1            # 1 = standard output (stdout)
+    la a1, buf          # load address of output string
+    li a2, ','
+    sb a2, 0(a1)        # reuse buf[0] for the separator; tohex rewrites it below
+    li a2, 1            # length of output string
+    ecall               # invoke syscall to print the string
+
+    # print remainder
+    mv a0, s1           # restore remainder
+    la a1, buf
+    jal tohex
+    li t0, SYSWRITE     # "write" syscall
+    li a0, 1            # 1 = standard output (stdout)
+    la a1, buf          # load address of output string
+    li a2, 9            # length of output string (digits plus newline)
+    ecall               # invoke syscall to print the string
+
+    addi s0, s0, 8      # increment input pointer to next pair of inputs
+    la a0, inputs_end
+    bltu s0, a0, loop   # if the input address is less than inputs_end, loop
+
+    li t0, SYSEXIT      # "exit" syscall
+    li a0, 0            # Use 0 return code (li: this is an immediate, not an address)
+    ecall               # invoke syscall to terminate the program
diff --git a/tests/math_mod.s b/tests/math_mod.s
new file mode 100644
index 0000000..7abcbdb
--- /dev/null
+++ b/tests/math_mod.s
@@ -0,0 +1,66 @@
+# Test for the remainder (modulus) returned by divmod
+
+.org 0
+# Provide program starting address to linker
+.global _start
+
+.extern divmod
+.extern tohex
+
+/* newlib system calls */
+.set SYSEXIT,  93
+.set SYSWRITE, 64
+
+.section .rodata
+
+inputs:
+    .word 3            # dividend
+    .word 5            # divisor
+
+    .word 21
+    .word 3
+
+    .word 0xFFFF
+    .word 100
+
+    .word -7           # 0xFFFFFFF9 when treated as unsigned
+    .word 3
+inputs_end:
+  # llvm doesn't like this: error: expected relocatable expression
+  #.set inputs_end, .-inputs
+  # turns out it was right. That was calculating a length, which was
+  # incorrect for how it was used for looping.
+
+.section .bss
+
+buf: .skip 9
+
+.text
+
+_start:
+    li a0, '\n'
+    la a1, buf
+    sb a0, 8(a1)        # append newline to buf
+
+    la s0, inputs       # init loop variables
+    la s1, inputs_end
+loop:
+    lw a0, 0(s0)        # dividend
+    lw a1, 4(s0)        # divisor
+    jal divmod          # a0 = quotient, a1 = remainder
+    mv a0, a1           # only the remainder is printed by this test
+    la a1, buf
+    jal tohex
+
+    li t0, SYSWRITE     # "write" syscall
+    li a0, 1            # 1 = standard output (stdout)
+    la a1, buf          # load address of output string
+    li a2, 9            # length of output string
+    ecall               # invoke syscall to print the string
+
+    addi s0, s0, 8      # increment input pointer to next pair of 32-bit inputs
+    bltu s0, s1, loop   # if the input address is less than inputs_end, loop
+
+    li t0, SYSEXIT      # "exit" syscall
+    li a0, 0            # Use 0 return code (li: this is an immediate, not an address)
+    ecall               # invoke syscall to terminate the program
diff --git a/tests/math_mul.s b/tests/math_mul.s
new file mode 100644
index 0000000..7abcbdb
--- /dev/null
+++ b/tests/math_mul.s
@@ -0,0 +1,66 @@
+# Test for mul
+
+.org 0
+# Provide program starting address to linker
+.global _start
+
+.extern mul
+.extern tohex
+
+/* newlib system calls */
+.set SYSEXIT,  93
+.set SYSWRITE, 64
+
+.section .rodata
+
+inputs:
+    .word 3
+    .word 5
+
+    .word 21
+    .word 3
+
+    .word 0xFFFF
+    .word 100
+
+    .word -7
+    .word 3
+inputs_end:
+  # llvm doesn't like this: error: expected relocatable expression
+  #.set inputs_end, .-inputs
+  # turns out it was right. That was calculating a length, which was
+  # incorrect for how it was used for looping.
+
+.section .bss
+
+buf: .skip 9
+
+.text
+
+_start:
+    li a0, '\n'
+    la a1, buf
+    sb a0, 8(a1)        # append newline to buf (9th byte, after the hex digits)
+
+    la s0, inputs       # s0 = cursor over the (multiplicand, multiplier) pairs
+    la s1, inputs_end   # s1 = one past the last pair
+loop:
+    lw a0, 0(s0)        # multiplicand
+    lw a1, 4(s0)        # multiplier
+    jal mul             # a0 = product
+    # TODO: Format as an actual decimal
+    la a1, buf          # destination for tohex (presumably writes 8 hex digits)
+    jal tohex
+
+    li t0, SYSWRITE     # "write" syscall
+    li a0, 1            # 1 = standard output (stdout)
+    la a1, buf          # load address of output string
+    li a2, 9            # length of output string
+    ecall               # invoke syscall to print the string
+
+    addi s0, s0, 8      # increment input pointer to next pair of 32-bit inputs
+    bltu s0, s1, loop   # if the input address is less than inputs_end, loop
+
+    li t0, SYSEXIT      # "exit" syscall
+    li a0, 0            # Use 0 return code (li: this is an immediate, not an address)
+    ecall               # invoke syscall to terminate the program
diff --git a/tests/test_math.sh b/tests/test_math.sh
index 9897d7c..8a0d8ed 100644
--- a/tests/test_math.sh
+++ b/tests/test_math.sh
@@ -9,6 +9,8 @@ test_add64() {
   expected=$(cat << END
 00000003.80000000
 00000003.C0000000
+00000001.C0000000
+FFFFFFFF.C0000000
 END
 )
 
@@ -31,3 +33,57 @@ END
   # different inputs.
   test $? -eq 0 && test "$result" = "$expected"
 }
+
+test_div() {
+  expected=$(cat << END
+00000000
+00000007
+0000028F
+55555553
+END
+)
+  # Run the emulator last so that $? below is QEMU's exit status, not cat's.
+  result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_div.elf)
+  # TODO: Ideally this test would allow calling the binary repeatedly with
+  # different inputs.
+  test $? -eq 0 && test "$result" = "$expected"
+}
+
+test_divmod() {
+  expected=$(cat << END
+00000000,00000003
+00000007,00000000
+0000028F,00000023
+END
+)
+  # Run the emulator last so that $? below is QEMU's exit status, not cat's.
+  result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_divmod.elf)
+  # TODO: Ideally this test would allow calling the binary repeatedly with
+  # different inputs.
+  test $? -eq 0 && test "$result" = "$expected"
+}
+
+test_clz() {
+  expected=$(cat << END
+00000020
+0000001F
+0000001E
+0000001E
+0000001C
+00000018
+00000014
+00000010
+0000000C
+00000008
+00000004
+00000000
+00000000
+00000010
+END
+)
+  # Run the emulator last so that $? below is QEMU's exit status, not cat's.
+  result=$("${QEMU}" -B 0x80000000 -s 2k tests/math_clz.elf)
+  # TODO: Ideally this test would allow calling the binary repeatedly with
+  # different inputs.
+  test $? -eq 0 && test "$result" = "$expected"
+}