diff --git a/Makefile b/Makefile
index de95d29..7517058 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,8 @@
 AS=riscv64-unknown-elf-as
+# Target ABI/ISA flags are mandatory for the elf32lriscv link below, so keep
+# them out of ASFLAGS: `make ASFLAGS=...` must not be able to drop them.
+ARCHFLAGS=-mabi=ilp32e -march=rv32ec
+ASFLAGS=-g
 LD=riscv64-unknown-elf-ld
 
 all: calc.elf
@@ -10,4 +14,4 @@ calc.elf: calc.o
 	$(LD) -m elf32lriscv -T link.ld $^ -o $@
 
 %.o : %.s
-	$(AS) -mabi=ilp32e -march=rv32ec $< -o $@
+	$(AS) $(ARCHFLAGS) $(ASFLAGS) $< -o $@
diff --git a/calc.s b/calc.s
index 58d09cf..e9cbcb4 100644
--- a/calc.s
+++ b/calc.s
@@ -95,37 +95,71 @@ regdump:
     li a2, str2_size    # length of other string
     ecall               # invoke syscall to print the string
 
-    li t1, 0
-    li t2, 16
+    # NOTE(review): s0/s1 are callee-saved under the RISC-V calling convention
+    # and no restore is visible before the final ret; confirm they are saved.
+    li s0, 0            # s0 = index into regnames
+    li s1, 16           # s1 = loop bound (16 entries)
 
 regdump_loop:
-    beq t1, t2, regdump_done
-    # a1 = regnames + (2 * t1)
-    # a1 = t1 << 1 + regnames
+    beq s0, s1, regdump_done
     la a1, regnames     # load address of regnames
-    slli a2, t1, 1
-    add a1, a1, a2
+    slli a2, s0, 1      # a2 = s0 * 2 (each name is 2 bytes)
+    add a1, a1, a2      # a1 = regnames + 2 * s0
+
+    # copy regname to buf
+    la a0, buf          # load address of buf into a0 as dest
+    li a2, 2            # copy 2 bytes
+
+    # jal overwrites ra, and regdump_done returns through ra without
+    # reloading it, so keep the return address in t2 across the call
+    # (memcpy only writes t0, t1, a0 and a1).
+    mv t2, ra
+    jal memcpy          # returns with a0 = buf + 2
+    mv ra, t2
+    # append ': \n'
+    li t0, ':
+    sb t0, 0(a0)
+    li t0, 0x20
+    sb t0, 1(a0)
+    li t0, '\n
+    sb t0, 2(a0)
 
     # print the register name
     li t0, SYSWRITE     # "write" syscall
     li a0, 1            # 1 = standard output (stdout)
-    # a1 is the address of the string, calculated above
-    li a2, 2            # length of register name string
+    la a1, buf          # the assembled line starts at buf
+    li a2, 5            # 2-byte name + 3 bytes of ": \n"
     ecall               # invoke syscall to print the string
 
-    # add newline
-    la a1, buf          # load address of buf into a1
-    li a0, '\n
-    sb a0, 0(a1)
-    li a2, 1            # length of buf
-    li t0, SYSWRITE     # "write" syscall
-    li a0, 1            # 1 = standard output (stdout)
-    ecall               # invoke syscall to print the string
-
-    addi t1, t1, 1
+    addi s0, s0, 1
     j regdump_loop
 
 regdump_done:
     addi sp, sp, 16   # deallocate stack space
     ret
 
+
+# memcpy, copy n bytes of memory from src to dest, one byte at a time
+#   arguments:
+#       a0: dest address
+#       a1: source address
+#       a2: number of bytes to copy (n); read only, never modified
+#   temporaries used:
+#       t0 (bytes copied so far), t1 (byte being moved)
+#   return:
+#       a0: address of dest + n (advanced past the copy, not the original dest)
+#       a1: address of src + n
+#
+memcpy:
+  li t0, 0                  # t0 = number of bytes copied so far
+memcpy_loop:
+  # TODO: copy in chunks of 4 bytes if n > 4
+  beq t0, a2, memcpy_done   # stop once t0 == n (n == 0 copies nothing)
+  lbu t1, 0(a1)             # t1 = next source byte
+  sb  t1, 0(a0)             # store it at the current dest position
+  addi a0, a0, 1            # advance dest
+  addi a1, a1, 1            # advance src
+  addi t0, t0, 1            # one more byte copied
+  j memcpy_loop
+memcpy_done:
+  ret                       # a0/a1 now point just past the copied ranges