diff --git a/Makefile b/Makefile index 4d87da71..4a91433e 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ OUT ?= build BIN := $(OUT)/rv32emu CFLAGS = -std=gnu99 -O2 -Wall -Wextra +CFLAGS += -Wno-unused-label CFLAGS += -include src/common.h # Set the default stack pointer @@ -88,6 +89,10 @@ gdbstub-test: $(BIN) $(Q)tests/gdbstub.sh && $(call notice, [OK]) endif +# For tail-call elimination, we need a specific set of build flags applied. +# FIXME: On macOS + Apple Silicon, -fno-stack-protector might have a negative impact. +$(OUT)/emulate.o: CFLAGS += -fomit-frame-pointer -fno-stack-check -fno-stack-protector + # Clear the .DEFAULT_GOAL special variable, so that the following turns # to the first target after .DEFAULT_GOAL is not set. .DEFAULT_GOAL := diff --git a/src/common.h b/src/common.h index 34876412..e59816f3 100644 --- a/src/common.h +++ b/src/common.h @@ -24,6 +24,16 @@ #define __ALIGNED(x) #endif +/* There is no tail-call optimization(TCO) in non-optimized builds. To work + * around this, we attempts to use a compiler attribute called musttail that + * forces the compiler to TCO even when optimizations aren't on. + */ +#if defined(__has_attribute) && __has_attribute(musttail) +#define MUST_TAIL __attribute__((musttail)) +#else +#define MUST_TAIL +#endif + /* Pattern Matching for C macros. * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms */ diff --git a/src/decode.h b/src/decode.h index 084f4130..4a9c102f 100644 --- a/src/decode.h +++ b/src/decode.h @@ -8,158 +8,160 @@ #include #include -/* RISC-V instruction list */ +#include "riscv.h" + +/* RISC-V instruction list in format _(instruction-name, can-branch) */ /* clang-format off */ #define RISCV_INSN_LIST \ - _(nop) \ + _(nop, 0) \ /* RV32I Base Instruction Set */ \ - _(lui) \ - _(auipc) \ - _(jal) \ - _(jalr) \ - _(beq) \ - _(bne) \ - _(blt) \ - _(bge) \ - _(bltu) \ - _(bgeu) \ - _(lb) \ - _(lh) \ - _(lw) \ - _(lbu) \ - _(lhu) \ - _(sb) \ - _(sh) \ - _(sw) \ - _(addi) \ - _(slti) \ - _(sltiu) \ - _(xori) \ - _(ori) \ - _(andi) \ - _(slli) \ - _(srli) \ - _(srai) \ - _(add) \ - _(sub) \ - _(sll) \ - _(slt) \ - _(sltu) \ - _(xor) \ - _(srl) \ - _(sra) \ - _(or) \ - _(and) \ - _(ecall) \ - _(ebreak) \ + _(lui, 0) \ + _(auipc, 0) \ + _(jal, 1) \ + _(jalr, 1) \ + _(beq, 1) \ + _(bne, 1) \ + _(blt, 1) \ + _(bge, 1) \ + _(bltu, 1) \ + _(bgeu, 1) \ + _(lb, 0) \ + _(lh, 0) \ + _(lw, 0) \ + _(lbu, 0) \ + _(lhu, 0) \ + _(sb, 0) \ + _(sh, 0) \ + _(sw, 0) \ + _(addi, 0) \ + _(slti, 0) \ + _(sltiu, 0) \ + _(xori, 0) \ + _(ori, 0) \ + _(andi, 0) \ + _(slli, 0) \ + _(srli, 0) \ + _(srai, 0) \ + _(add, 0) \ + _(sub, 0) \ + _(sll, 0) \ + _(slt, 0) \ + _(sltu, 0) \ + _(xor, 0) \ + _(srl, 0) \ + _(sra, 0) \ + _(or, 0) \ + _(and, 0) \ + _(ecall, 1) \ + _(ebreak, 1) \ /* RISC-V Privileged Instruction */ \ - _(wfi) \ - _(uret) \ - _(sret) \ - _(hret) \ - _(mret) \ + _(wfi, 0) \ + _(uret, 0) \ + _(sret, 0) \ + _(hret, 0) \ + _(mret, 1) \ /* RV32 Zifencei Standard Extension */ \ IIF(RV32_HAS(Zifencei))( \ - _(fencei) \ + _(fencei, 0) \ ) \ /* RV32 Zicsr Standard Extension */ \ IIF(RV32_HAS(Zicsr))( \ - _(csrrw) \ - _(csrrs) \ - _(csrrc) \ - _(csrrwi) \ - _(csrrsi) \ - _(csrrci) \ + _(csrrw, 0) \ + _(csrrs, 0) \ + _(csrrc, 0) \ + _(csrrwi, 0) \ + _(csrrsi, 0) \ + _(csrrci, 0) \ ) \ /* RV32M Standard Extension */ \ IIF(RV32_HAS(EXT_M))( \ - _(mul) \ - _(mulh) \ - _(mulhsu) \ - _(mulhu) \ - _(div) \ - _(divu) \ - _(rem) \ - _(remu) \ + _(mul, 0) \ + _(mulh, 0) \ + _(mulhsu, 0) \ + _(mulhu, 0) \ + _(div, 0) \ + _(divu, 0) \ 
+ _(rem, 0) \ + _(remu, 0) \ ) \ /* RV32A Standard Extension */ \ IIF(RV32_HAS(EXT_A))( \ - _(lrw) \ - _(scw) \ - _(amoswapw) \ - _(amoaddw) \ - _(amoxorw) \ - _(amoandw) \ - _(amoorw) \ - _(amominw) \ - _(amomaxw) \ - _(amominuw) \ - _(amomaxuw) \ + _(lrw, 0) \ + _(scw, 0) \ + _(amoswapw, 0) \ + _(amoaddw, 0) \ + _(amoxorw, 0) \ + _(amoandw, 0) \ + _(amoorw, 0) \ + _(amominw, 0) \ + _(amomaxw, 0) \ + _(amominuw, 0) \ + _(amomaxuw, 0) \ ) \ /* RV32F Standard Extension */ \ IIF(RV32_HAS(EXT_F))( \ - _(flw) \ - _(fsw) \ - _(fmadds) \ - _(fmsubs) \ - _(fnmsubs) \ - _(fnmadds) \ - _(fadds) \ - _(fsubs) \ - _(fmuls) \ - _(fdivs) \ - _(fsqrts) \ - _(fsgnjs) \ - _(fsgnjns) \ - _(fsgnjxs) \ - _(fmins) \ - _(fmaxs) \ - _(fcvtws) \ - _(fcvtwus) \ - _(fmvxw) \ - _(feqs) \ - _(flts) \ - _(fles) \ - _(fclasss) \ - _(fcvtsw) \ - _(fcvtswu) \ - _(fmvwx) \ + _(flw, 0) \ + _(fsw, 0) \ + _(fmadds, 0) \ + _(fmsubs, 0) \ + _(fnmsubs, 0) \ + _(fnmadds, 0) \ + _(fadds, 0) \ + _(fsubs, 0) \ + _(fmuls, 0) \ + _(fdivs, 0) \ + _(fsqrts, 0) \ + _(fsgnjs, 0) \ + _(fsgnjns, 0) \ + _(fsgnjxs, 0) \ + _(fmins, 0) \ + _(fmaxs, 0) \ + _(fcvtws, 0) \ + _(fcvtwus, 0) \ + _(fmvxw, 0) \ + _(feqs, 0) \ + _(flts, 0) \ + _(fles, 0) \ + _(fclasss, 0) \ + _(fcvtsw, 0) \ + _(fcvtswu, 0) \ + _(fmvwx, 0) \ ) \ /* RV32C Standard Extension */ \ IIF(RV32_HAS(EXT_C))( \ - _(caddi4spn) \ - _(clw) \ - _(csw) \ - _(cnop) \ - _(caddi) \ - _(cjal) \ - _(cli) \ - _(caddi16sp) \ - _(clui) \ - _(csrli) \ - _(csrai) \ - _(candi) \ - _(csub) \ - _(cxor) \ - _(cor) \ - _(cand) \ - _(cj) \ - _(cbeqz) \ - _(cbnez) \ - _(cslli) \ - _(clwsp) \ - _(cjr) \ - _(cmv) \ - _(cebreak) \ - _(cjalr) \ - _(cadd) \ - _(cswsp) \ + _(caddi4spn, 0) \ + _(clw, 0) \ + _(csw, 0) \ + _(cnop, 0) \ + _(caddi, 0) \ + _(cjal, 1) \ + _(cli, 0) \ + _(caddi16sp, 0) \ + _(clui, 0) \ + _(csrli, 0) \ + _(csrai, 0) \ + _(candi, 0) \ + _(csub, 0) \ + _(cxor, 0) \ + _(cor, 0) \ + _(cand, 0) \ + _(cj, 1) \ + _(cbeqz, 1) \ + _(cbnez, 1) \ + _(cslli, 0) \ + _(clwsp, 0) \ + _(cjr, 1) \ + _(cmv, 0) \ + _(cebreak, 1) \ + _(cjalr, 1) \ + _(cadd, 0) \ + _(cswsp, 0) \ ) /* clang-format on */ /* IR list */ enum { -#define _(inst) rv_insn_##inst, +#define _(inst, can_branch) rv_insn_##inst, RISCV_INSN_LIST #undef _ }; @@ -226,7 +228,7 @@ enum { INSN_32 = 4, }; -typedef struct { +typedef struct rv_insn { union { int32_t imm; uint8_t rs3; @@ -241,6 +243,22 @@ typedef struct { /* instruction length */ uint8_t insn_len; + + /* According to tail-call optimization (TCO), if a C function ends with + * a function call to another function or itself and simply returns that + * function's result, the compiler can substitute a simple jump to the + * other function for the 'call' and 'return' instructions . The self + * -recursive function can therefore use the same function stack frame. + * + * Using member tailcall, we can tell whether an IR is the final IR in + * a basic block. Additionally, member 'impl' allows us to invoke next + * instruction emulation directly without computing the jumping address. + * In order to enable the compiler to perform TCO, we can use these two + * members to rewrite all instruction emulations into a self-recursive + * version. 
+ */ + bool tailcall; + bool (*impl)(riscv_t *, const struct rv_insn *); } rv_insn_t; /* decode the RISC-V instruction */ diff --git a/src/emulate.c b/src/emulate.c index 737fbf8e..352487f9 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -254,1119 +254,991 @@ static inline bool insn_is_misaligned(uint32_t pc) ); } -/* execute a basic block */ -static bool emulate(riscv_t *rv, const block_t *block) -{ -#if RV32_HAS(COMPUTED_GOTO) - static const void *dispatch_table[] = { -#define _(inst) [rv_insn_##inst] = &&do_##inst, - RISCV_INSN_LIST +/* can-branch information for each RISC-V instruction */ +enum { +#define _(inst, can_branch) __rv_insn_##inst##_canbranch = can_branch, + RISCV_INSN_LIST #undef _ - }; - -#define DISPATCH() \ - /* enforce zero register */ \ - rv->X[rv_reg_zero] = 0; \ - /* current IR */ \ - ir = block->ir + index++; \ - /* jump */ \ - goto *dispatch_table[ir->opcode]; - -/* clang-format off */ -#define _(inst, code) \ - do_##inst: code \ - /* step over instruction */ \ - rv->PC += ir->insn_len; \ - /* increment the cycles CSR */ \ - rv->csr_cycle++; \ - /* all instructions have executed */ \ - if (unlikely(index == n_insn)) \ - return true; \ - DISPATCH() -/* clang-format on */ -#define EPILOGUE() - -#else /* !RV32_HAS(COMPUTED_GOTO) */ -#define DISPATCH() \ - for (uint32_t i = 0; i < n_insn; i++) { \ - ir = block->ir + i; \ - /* enforce zero register */ \ - rv->X[rv_reg_zero] = 0; \ - switch (ir->opcode) { -/* clang-format off */ -#define _(inst, code) \ - case rv_insn_##inst: code \ - break; -#define EPILOGUE() \ - } \ - /* step over instruction */ \ - rv->PC += ir->insn_len; \ - /* increment the cycles csr */ \ - rv->csr_cycle++; \ - } \ - return true; -/* clang-format on */ -#endif /* RV32_HAS(COMPUTED_GOTO) */ - - const uint32_t n_insn = block->n_insn; - rv_insn_t *ir; - -#if RV32_HAS(COMPUTED_GOTO) - /* current index in block */ - uint32_t index = 0; -#endif - - /* main loop */ - DISPATCH() - - /* Internal */ - _(nop, /* no operation */) - - /* LUI (Load Upper Immediate) is used to build 32-bit constants and uses the - * U-type format. LUI places the U-immediate value in the top 20 bits of the - * destination register rd, filling in the lowest 12 bits with zeros. The - * 32-bit result is sign-extended to 64 bits. - */ - _(lui, rv->X[ir->rd] = ir->imm;) - - /* AUIPC (Add Upper Immediate to PC) is used to build pc-relative addresses - * and uses the U-type format. AUIPC forms a 32-bit offset from the 20-bit - * U-immediate, filling in the lowest 12 bits with zeros, adds this offset - * to the address of the AUIPC instruction, then places the result in - * register rd. - */ - _(auipc, rv->X[ir->rd] = ir->imm + rv->PC;) +}; - /* JAL: Jump and Link - * store successor instruction address into rd. - * add next J imm (offset) to pc. - */ - _(jal, { - const uint32_t pc = rv->PC; - /* Jump */ - rv->PC += ir->imm; - /* link with return address */ - if (ir->rd) - rv->X[ir->rd] = pc + ir->insn_len; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) - - /* JALR: Jump and Link Register - * The indirect jump instruction JALR uses the I-type encoding. The - * target address is obtained by adding the sign-extended 12-bit - * I-immediate to the register rs1, then setting the least-significant - * bit of the result to zero. 
The address of the instruction following - * the jump (pc+4) is written to register rd. Register x0 can be used as - * the destination if the result is not required. - */ - _(jalr, { - const uint32_t pc = rv->PC; - /* jump */ - rv->PC = (rv->X[ir->rs1] + ir->imm) & ~1U; - /* link */ - if (ir->rd) - rv->X[ir->rd] = pc + ir->insn_len; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) - - /* BEQ: Branch if Equal */ - _(beq, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] == rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BNE: Branch if Not Equal */ - _(bne, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] != rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BLT: Branch if Less Than */ - _(blt, { - const uint32_t pc = rv->PC; - if ((int32_t) rv->X[ir->rs1] < (int32_t) rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BGE: Branch if Greater Than */ - _(bge, { - const uint32_t pc = rv->PC; - if ((int32_t) rv->X[ir->rs1] >= (int32_t) rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BLTU: Branch if Less Than Unsigned */ - _(bltu, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] < rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* BGEU: Branch if Greater Than Unsigned */ - _(bgeu, { - const uint32_t pc = rv->PC; - if (rv->X[ir->rs1] >= rv->X[ir->rs2]) { - rv->PC += ir->imm; - /* check instruction misaligned */ - if (unlikely(insn_is_misaligned(rv->PC))) { - rv->compressed = false; - rv_except_insn_misaligned(rv, pc); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - } - }) - - /* LB: Load Byte */ - _(lb, { - rv->X[ir->rd] = - sign_extend_b(rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm)); - }) - - /* LH: Load Halfword */ - _(lh, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { - rv->compressed = false; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(rv, addr)); - }) - - /* LW: Load Word */ - _(lw, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 3)) 
{ - rv->compressed = false; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); - }) - - /* LBU: Load Byte Unsigned */ - _(lbu, rv->X[ir->rd] = rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm);) - - /* LHU: Load Halfword Unsigned */ - _(lhu, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { - rv->compressed = false; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = rv->io.mem_read_s(rv, addr); - }) - - /* SB: Store Byte */ - _(sb, rv->io.mem_write_b(rv, rv->X[ir->rs1] + ir->imm, rv->X[ir->rs2]);) - - /* SH: Store Halfword */ - _(sh, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 1)) { - rv->compressed = false; - rv_except_store_misaligned(rv, addr); - return false; - } - rv->io.mem_write_s(rv, addr, rv->X[ir->rs2]); - }) - - /* SW: Store Word */ - _(sw, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - if (unlikely(addr & 3)) { - rv->compressed = false; - rv_except_store_misaligned(rv, addr); - return false; - } - rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); - }) +#define RVOP(inst, code) \ + static bool do_##inst(riscv_t *rv UNUSED, const rv_insn_t *ir UNUSED) \ + { \ + rv->X[rv_reg_zero] = 0; \ + code; \ + if (__rv_insn_##inst##_canbranch) { \ + /* can branch */ \ + rv->csr_cycle++; \ + return true; \ + } \ + nextop: \ + rv->PC += ir->insn_len; \ + if (ir->tailcall) \ + return true; \ + const rv_insn_t *next = ir + 1; \ + MUST_TAIL return next->impl(rv, next); \ + } - /* ADDI (Add Immediate) adds the sign-extended 12-bit immediate to register - * rs1. Arithmetic overflow is ignored and the result is simply the low XLEN - * bits of the result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 - * assembler pseudo-instruction. - */ - _(addi, rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm;) +/* RV32I Base Instruction Set */ - /* SLTI (Set on Less Than Immediate) places the value 1 in register rd if - * register rs1 is less than the signextended immediate when both are - * treated as signed numbers, else 0 is written to rd. - */ - _(slti, rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0;) +/* Internal */ +RVOP(nop, {/* no operation */}); - /* SLTIU (Set on Less Than Immediate Unsigned) places the value 1 in - * register rd if register rs1 is less than the immediate when both are - * treated as unsigned numbers, else 0 is written to rd. - */ - _(sltiu, rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 1 : 0;) +/* LUI is used to build 32-bit constants and uses the U-type format. LUI + * places the U-immediate value in the top 20 bits of the destination + * register rd, filling in the lowest 12 bits with zeros. The 32-bit + * result is sign-extended to 64 bits. + */ +RVOP(lui, { rv->X[ir->rd] = ir->imm; }) - /* XORI: Exclusive OR Immediate */ - _(xori, rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm;) +/* AUIPC is used to build pc-relative addresses and uses the U-type + * format. AUIPC forms a 32-bit offset from the 20-bit U-immediate, + * filling in the lowest 12 bits with zeros, adds this offset to the + * address of the AUIPC instruction, then places the result in register + * rd. + */ +RVOP(auipc, { rv->X[ir->rd] = ir->imm + rv->PC; }) - /* ORI: OR Immediate */ - _(ori, rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm;) +/* JAL: Jump and Link + * store successor instruction address into rd. + * add next J imm (offset) to pc. 
+ */ +RVOP(jal, { + const uint32_t pc = rv->PC; + /* Jump */ + rv->PC += ir->imm; + /* link with return address */ + if (ir->rd) + rv->X[ir->rd] = pc + ir->insn_len; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/*The indirect jump instruction JALR uses the I-type encoding. The + * target address is obtained by adding the sign-extended 12-bit + * I-immediate to the register rs1, then setting the least-significant + * bit of the result to zero. The address of the instruction following + * the jump (pc+4) is written to register rd. Register x0 can be used as + * the destination if the result is not required. + */ +RVOP(jalr, { + const uint32_t pc = rv->PC; + /* jump */ + rv->PC = (rv->X[ir->rs1] + ir->imm) & ~1U; + /* link */ + if (ir->rd) + rv->X[ir->rd] = pc + ir->insn_len; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BEQ: Branch if Equal */ +RVOP(beq, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] != rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BNE: Branch if Not Equal */ +RVOP(bne, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] == rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BLT: Branch if Less Than */ +RVOP(blt, { + const uint32_t pc = rv->PC; + if ((int32_t) rv->X[ir->rs1] >= (int32_t) rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BGE: Branch if Greater Than */ +RVOP(bge, { + const uint32_t pc = rv->PC; + if ((int32_t) rv->X[ir->rs1] < (int32_t) rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BLTU: Branch if Less Than Unsigned */ +RVOP(bltu, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] >= rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* BGEU: Branch if Greater Than Unsigned */ +RVOP(bgeu, { + const uint32_t pc = rv->PC; + if (rv->X[ir->rs1] < rv->X[ir->rs2]) + goto nextop; + rv->PC += ir->imm; + /* check instruction misaligned */ + if (unlikely(insn_is_misaligned(rv->PC))) { + rv->compressed = false; + rv_except_insn_misaligned(rv, pc); + return false; + } +}) + +/* LB: Load Byte */ +RVOP(lb, { + rv->X[ir->rd] = + sign_extend_b(rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm)); +}) + +/* LH: Load Halfword */ +RVOP(lh, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(rv, addr)); +}) - /* ANDI (AND Immediate) performs bitwise AND on register rs1 and the - * 
sign-extended 12-bit immediate and place the result in rd. - */ - _(andi, rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm;) +/* LW: Load Word */ +RVOP(lw, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); +}) - /* SLLI (Shift Left Logical) performs logical left shift on the value in - * register rs1 by the shift amount held in the lower 5 bits of the - * immediate. - */ - _(slli, rv->X[ir->rd] = rv->X[ir->rs1] << (ir->imm & 0x1f);) +/* LBU: Load Byte Unsigned */ +RVOP(lbu, { rv->X[ir->rd] = rv->io.mem_read_b(rv, rv->X[ir->rs1] + ir->imm); }) - /* SRLI (Shift Right Logical) performs logical right shift on the value in - * register rs1 by the shift amount held in the lower 5 bits of the - * immediate. - */ - _(srli, rv->X[ir->rd] = rv->X[ir->rs1] >> (ir->imm & 0x1f);) +/* LHU: Load Halfword Unsigned */ +RVOP(lhu, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_s(rv, addr); +}) - /* SRAI (Shift Right Arithmetic) performs arithmetic right shift on the - * value in register rs1 by the shift amount held in the lower 5 bits of the - * immediate. - */ - _(srai, rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (ir->imm & 0x1f);) +/* SB: Store Byte */ +RVOP(sb, { rv->io.mem_write_b(rv, rv->X[ir->rs1] + ir->imm, rv->X[ir->rs2]); }) - /* ADD */ - _(add, - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]);) +/* SH: Store Halfword */ +RVOP(sh, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 1)) { + rv->compressed = false; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_s(rv, addr, rv->X[ir->rs2]); +}) - /* SUB: Substract */ - _(sub, - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]);) +/* SW: Store Word */ +RVOP(sw, { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = false; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); +}) - /* SLL: Shift Left Logical */ - _(sll, rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f);) +/* ADDI adds the sign-extended 12-bit immediate to register rs1. Arithmetic + * overflow is ignored and the result is simply the low XLEN bits of the + * result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 assembler + * pseudo-instruction. + */ +RVOP(addi, { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }) - /* SLT: Set on Less Than */ - _(slt, { - rv->X[ir->rd] = - ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; - }) +/* SLTI place the value 1 in register rd if register rs1 is less than the + * signextended immediate when both are treated as signed numbers, else + * 0 is written to rd. + */ +RVOP(slti, { rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0; }) - /* SLTU: Set on Less Than Unsigned */ - _(sltu, rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 1 : 0;) +/* SLTIU places the value 1 in register rd if register rs1 is less than the + * immediate when both are treated as unsigned numbers, else 0 is + * written to rd. + */ +RVOP(sltiu, { rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 
1 : 0; }) - /* XOR: Exclusive OR */ - _(xor, rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2];) +/* XORI: Exclusive OR Immediate */ +RVOP(xori, { rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm; }) - /* SRL: Shift Right Logical */ - _(srl, rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f);) +/* ORI: OR Immediate */ +RVOP(ori, { rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm; }) - /* SRA: Shift Right Arithmetic */ - _(sra, { - rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); - }) +/* ANDI performs bitwise AND on register rs1 and the sign-extended 12-bit + * immediate and place the result in rd. + */ +RVOP(andi, { rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm; }) - /* OR */ - _(or, rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2];) +/* SLLI performs logical left shift on the value in register rs1 by the shift + * amount held in the lower 5 bits of the immediate. + */ +RVOP(slli, { rv->X[ir->rd] = rv->X[ir->rs1] << (ir->imm & 0x1f); }) - /* AND */ - _(and, rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2];) +/* SRLI performs logical right shift on the value in register rs1 by the + * shift amount held in the lower 5 bits of the immediate. + */ +RVOP(srli, { rv->X[ir->rd] = rv->X[ir->rs1] >> (ir->imm & 0x1f); }) - /* ECALL: Environment Call */ - _(ecall, { - rv->compressed = false; - rv->io.on_ecall(rv); /* increment the cycles csr */ - rv->csr_cycle++; - return true; - }) +/* SRAI performs arithmetic right shift on the value in register rs1 by + * the shift amount held in the lower 5 bits of the immediate. + */ +RVOP(srai, { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (ir->imm & 0x1f); }) + +/* ADD */ +RVOP(add, { + rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]); +}) + +/* SUB: Substract */ +RVOP(sub, { + rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]); +}) + +/* SLL: Shift Left Logical */ +RVOP(sll, { rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f); }) + +/* SLT: Set on Less Than */ +RVOP(slt, { + rv->X[ir->rd] = + ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; +}) + +/* SLTU: Set on Less Than Unsigned */ +RVOP(sltu, { rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 
1 : 0; }) + +/* XOR: Exclusive OR */ +RVOP(xor, { + rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; +}) + +/* SRL: Shift Right Logical */ +RVOP(srl, { rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f); }) + +/* SRA: Shift Right Arithmetic */ +RVOP(sra, + { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); }) + +/* OR */ +RVOP(or, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }) + +/* AND */ +RVOP(and, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }) + +/* ECALL: Environment Call */ +RVOP(ecall, { + rv->compressed = false; + rv->io.on_ecall(rv); +}) + +/* EBREAK: Environment Break */ +RVOP(ebreak, { + rv->compressed = false; + rv->io.on_ebreak(rv); +}) + +/* WFI: Wait for Interrupt */ +RVOP(wfi, { + /* FIXME: Implement */ + return false; +}) - /* EBREAK: Environment Break */ - _(ebreak, { - rv->compressed = false; - rv->io.on_ebreak(rv); /* increment the cycles csr */ - rv->csr_cycle++; - return true; - }) +/* URET: return from traps in U-mode */ +RVOP(uret, { + /* FIXME: Implement */ + return false; +}) - /* WFI: Wait for Interrupt */ - _(wfi, return false;) +/* SRET: return from traps in S-mode */ +RVOP(sret, { + /* FIXME: Implement */ + return false; +}) - /* URET: return from traps in U-mode */ - _(uret, return false;) +/* HRET: return from traps in H-mode */ +RVOP(hret, { + /* FIXME: Implement */ + return false; +}) - /* SRET: return from traps in S-mode */ - _(sret, return false;) +/* MRET: return from traps in U-mode */ +RVOP(mret, { rv->PC = rv->csr_mepc; }) - /* HRET: return from traps in H-mode */ - _(hret, return false;) +#if RV32_HAS(Zifencei) /* RV32 Zifencei Standard Extension */ +RVOP(fencei, + { + /* FIXME: fill real implementations */ + }) +#endif - /* MRET: return from traps in U-mode */ - _(mret, { - rv->PC = rv->csr_mepc; - /* increment the cycles csr */ - rv->csr_cycle++; - /* this is a branch */ - return true; - }) +#if RV32_HAS(Zicsr) /* RV32 Zicsr Standard Extension */ +/* CSRRW: Atomic Read/Write CSR */ +RVOP(csrrw, { + uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRS: Atomic Read and Set Bits in CSR */ +RVOP(csrrs, { + uint32_t tmp = + csr_csrrs(rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRC: Atomic Read and Clear Bits in CSR */ +RVOP(csrrc, { + uint32_t tmp = + csr_csrrc(rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRWI */ +RVOP(csrrwi, { + uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRSI */ +RVOP(csrrsi, { + uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; +}) + +/* CSRRCI */ +RVOP(csrrci, { + uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? 
tmp : rv->X[ir->rd]; +}) +#endif - /* RV32 Zifencei Standard Extension */ -#if RV32_HAS(Zifencei) - _(fencei, /* FIXME: fill real implementations */); +#if RV32_HAS(EXT_M) /* RV32M Standard Extension */ +/* MUL: Multiply */ +RVOP(mul, + { rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2]; }) + +/* MULH: Multiply High Signed Signed */ +RVOP(mulh, { + const int64_t a = (int32_t) rv->X[ir->rs1]; + const int64_t b = (int32_t) rv->X[ir->rs2]; + rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; +}) + +/* MULHSU: Multiply High Signed Unsigned */ +RVOP(mulhsu, { + const int64_t a = (int32_t) rv->X[ir->rs1]; + const uint64_t b = rv->X[ir->rs2]; + rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; +}) + +/* MULHU: Multiply High Unsigned Unsigned */ +RVOP(mulhu, { + rv->X[ir->rd] = + ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; +}) + +/* DIV: Divide Signed */ +RVOP(div, { + const int32_t dividend = (int32_t) rv->X[ir->rs1]; + const int32_t divisor = (int32_t) rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? ~0U + : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) + ? rv->X[ir->rs1] /* overflow */ + : (unsigned int) (dividend / divisor); +}) + +/* DIVU: Divide Unsigned */ +RVOP(divu, { + const uint32_t dividend = rv->X[ir->rs1]; + const uint32_t divisor = rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? ~0U : dividend / divisor; +}) + +/* REM: Remainder Signed */ +RVOP(rem, { + const int32_t dividend = rv->X[ir->rs1]; + const int32_t divisor = rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? dividend + : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) + ? 0 /* overflow */ + : (dividend % divisor); +}) + +/* REMU: Remainder Unsigned */ +RVOP(remu, { + const uint32_t dividend = rv->X[ir->rs1]; + const uint32_t divisor = rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? dividend : dividend % divisor; +}) #endif - /* RV32 Zicsr Standard Extension */ -#if RV32_HAS(Zicsr) - /* CSRRW: Atomic Read/Write CSR */ - _(csrrw, { - uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRS: Atomic Read and Set Bits in CSR */ - _(csrrs, { - uint32_t tmp = csr_csrrs( - rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRC: Atomic Read and Clear Bits in CSR */ - _(csrrc, { - uint32_t tmp = csr_csrrc( - rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRWI */ - _(csrrwi, { - uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRSI */ - _(csrrsi, { - uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; - }) - - /* CSRRCI */ - _(csrrci, { - uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? 
tmp : rv->X[ir->rd]; - }) -#endif /* RV32_HAS(Zicsr) */ - - /* RV32M Standard Extension */ -#if RV32_HAS(EXT_M) - /* MUL: Multiply */ - _(mul, rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2];) - - /* MULH: Multiply High Signed Signed */ - _(mulh, { - const int64_t a = (int32_t) rv->X[ir->rs1]; - const int64_t b = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; - }) - - /* MULHSU: Multiply High Signed Unsigned */ - _(mulhsu, { - const int64_t a = (int32_t) rv->X[ir->rs1]; - const uint64_t b = rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; - }) - - /* MULHU: Multiply High Unsigned Unsigned */ - _(mulhu, { - rv->X[ir->rd] = - ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; - }) - - /* DIV: Divide Signed */ - _(div, { - const int32_t dividend = (int32_t) rv->X[ir->rs1]; - const int32_t divisor = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? rv->X[ir->rs1] /* overflow */ - : (unsigned int) (dividend / divisor); - }) - - /* DIVU: Divide Unsigned */ - _(divu, { - const uint32_t dividend = rv->X[ir->rs1]; - const uint32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U : dividend / divisor; - }) - - /* REM: Remainder Signed */ - _(rem, { - const int32_t dividend = rv->X[ir->rs1]; - const int32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? dividend - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? 0 /* overflow */ - : (dividend % divisor); - }) - - /* REMU: Remainder Unsigned */ - _(remu, { - const uint32_t dividend = rv->X[ir->rs1]; - const uint32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? dividend : dividend % divisor; - }) -#endif /* RV32_HAS(EXT_M) */ - - /* RV32A Standard Extension - * At present, AMO is not implemented atomically because the emulated - * RISC-V core just runs on single thread, and no out-of-order execution - * happens. In addition, rl/aq are not handled. +#if RV32_HAS(EXT_A) /* RV32A Standard Extension */ +/* At present, AMO is not implemented atomically because the rvop_jump_table[(ir + * + 1)->opcode]d RISC-V core just runs on single thread, and no out-of-order + * execution happens. In addition, rl/aq are not handled. 
+ */ + +/* LR.W: Load Reserved */ +RVOP(lrw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, rv->X[ir->rs1]); + /* skip registration of the 'reservation set' + * FIXME: uimplemented */ -#if RV32_HAS(EXT_A) - /* LR.W: Load Reserved */ - _(lrw, { - /* skip registration of the 'reservation set' - * FIXME: uimplemented - */ - rv->X[ir->rd] = rv->io.mem_read_w(rv, rv->X[ir->rs1]); - }) +}) - /* SC.W: Store Conditional */ - _(scw, { - /* assume the 'reservation set' is valid - * FIXME: unimplemented - */ - rv->io.mem_write_w(rv, rv->X[ir->rs1], rv->X[ir->rs2]); - rv->X[ir->rd] = 0; - }) - - /* AMOSWAP.W: Atomic Swap */ - _(amoswapw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - rv->io.mem_write_s(rv, ir->rs1, rv->X[ir->rs2]); - }) - - /* AMOADD.W: Atomic ADD */ - _(amoaddw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOXOR.W: Atomix XOR */ - _(amoxorw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOAND.W: Atomic AND */ - _(amoandw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] & rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOOR.W: Atomic OR */ - _(amoorw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMIN.W: Atomic MIN */ - _(amominw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t a = rv->X[ir->rd]; - const int32_t b = rv->X[ir->rs2]; - const int32_t res = a < b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMAX.W: Atomic MAX */ - _(amomaxw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const int32_t a = rv->X[ir->rd]; - const int32_t b = rv->X[ir->rs2]; - const int32_t res = a > b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMINU.W */ - _(amominuw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const uint32_t a = rv->X[ir->rd]; - const uint32_t b = rv->X[ir->rs2]; - const uint32_t res = a < b ? a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) - - /* AMOMAXU.W */ - _(amomaxuw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); - const uint32_t a = rv->X[ir->rd]; - const uint32_t b = rv->X[ir->rs2]; - const uint32_t res = a > b ? 
a : b; - rv->io.mem_write_s(rv, ir->rs1, res); - }) +/* SC.W: Store Conditional */ +RVOP(scw, { + /* assume the 'reservation set' is valid + * FIXME: unimplemented + */ + rv->io.mem_write_w(rv, rv->X[ir->rs1], rv->X[ir->rs2]); + rv->X[ir->rd] = 0; +}) + +/* AMOSWAP.W: Atomic Swap */ +RVOP(amoswapw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + rv->io.mem_write_s(rv, ir->rs1, rv->X[ir->rs2]); +}) + +/* AMOADD.W: Atomic ADD */ +RVOP(amoaddw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOXOR.W: Atomix XOR */ +RVOP(amoxorw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOAND.W: Atomic AND */ +RVOP(amoandw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] & rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOOR.W: Atomic OR */ +RVOP(amoorw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMIN.W: Atomic MIN */ +RVOP(amominw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t a = rv->X[ir->rd]; + const int32_t b = rv->X[ir->rs2]; + const int32_t res = a < b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMAX.W: Atomic MAX */ +RVOP(amomaxw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const int32_t a = rv->X[ir->rd]; + const int32_t b = rv->X[ir->rs2]; + const int32_t res = a > b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMINU.W */ +RVOP(amominuw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const uint32_t a = rv->X[ir->rd]; + const uint32_t b = rv->X[ir->rs2]; + const uint32_t res = a < b ? a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) + +/* AMOMAXU.W */ +RVOP(amomaxuw, { + rv->X[ir->rd] = rv->io.mem_read_w(rv, ir->rs1); + const uint32_t a = rv->X[ir->rd]; + const uint32_t b = rv->X[ir->rs2]; + const uint32_t res = a > b ? 
a : b; + rv->io.mem_write_s(rv, ir->rs1, res); +}) #endif /* RV32_HAS(EXT_A) */ - /* RV32F Standard Extension */ -#if RV32_HAS(EXT_F) - /* FLW */ - _(flw, { - /* copy into the float register */ - const uint32_t data = rv->io.mem_read_w(rv, rv->X[ir->rs1] + ir->imm); - memcpy(rv->F + ir->rd, &data, 4); - }) - - /* FSW */ - _(fsw, { - /* copy from float registers */ - uint32_t data; - memcpy(&data, (const void *) (rv->F + ir->rs2), 4); - rv->io.mem_write_w(rv, rv->X[ir->rs1] + ir->imm, data); - }) - - /* FMADD.S */ - _(fmadds, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] + rv->F[ir->rs3];) - - /* FMSUB.S */ - _(fmsubs, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] - rv->F[ir->rs3];) - - /* FNMSUB.S */ - _(fnmsubs, - rv->F[ir->rd] = rv->F[ir->rs3] - (rv->F[ir->rs1] * rv->F[ir->rs2]);) - - /* FNMADD.S */ - _(fnmadds, - rv->F[ir->rd] = -(rv->F[ir->rs1] * rv->F[ir->rs2]) - rv->F[ir->rs3];) - - /* FADD.S */ - _(fadds, { - if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2]) || - isnanf(rv->F[ir->rs1] + rv->F[ir->rs2])) { - /* raise invalid operation */ - rv->F_int[ir->rd] = RV_NAN; - /* F_int is the integer shortcut of F */ +#if RV32_HAS(EXT_F) /* RV32F Standard Extension */ +/* FLW */ +RVOP(flw, { + /* copy into the float register */ + const uint32_t data = rv->io.mem_read_w(rv, rv->X[ir->rs1] + ir->imm); + memcpy(rv->F + ir->rd, &data, 4); +}) + +/* FSW */ +RVOP(fsw, { + /* copy from float registers */ + uint32_t data; + memcpy(&data, (const void *) (rv->F + ir->rs2), 4); + rv->io.mem_write_w(rv, rv->X[ir->rs1] + ir->imm, data); +}) + +/* FMADD.S */ +RVOP(fmadds, + { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] + rv->F[ir->rs3]; }) + +/* FMSUB.S */ +RVOP(fmsubs, + { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2] - rv->F[ir->rs3]; }) + +/* FNMSUB.S */ +RVOP(fnmsubs, + { rv->F[ir->rd] = rv->F[ir->rs3] - (rv->F[ir->rs1] * rv->F[ir->rs2]); }) + +/* FNMADD.S */ +RVOP(fnmadds, + { rv->F[ir->rd] = -(rv->F[ir->rs1] * rv->F[ir->rs2]) - rv->F[ir->rs3]; }) + +/* FADD.S */ +RVOP(fadds, { + if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2]) || + isnanf(rv->F[ir->rs1] + rv->F[ir->rs2])) { + /* raise invalid operation */ + rv->F_int[ir->rd] = RV_NAN; /* F_int is the integer shortcut of F */ + rv->csr_fcsr |= FFLAG_INVALID_OP; + } else { + rv->F[ir->rd] = rv->F[ir->rs1] + rv->F[ir->rs2]; + } + if (isinff(rv->F[ir->rd])) { + rv->csr_fcsr |= FFLAG_OVERFLOW; + rv->csr_fcsr |= FFLAG_INEXACT; + } +}) + +/* FSUB.S */ +RVOP(fsubs, { + if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2])) { + rv->F_int[ir->rd] = RV_NAN; + } else { + rv->F[ir->rd] = rv->F[ir->rs1] - rv->F[ir->rs2]; + } +}) + +/* FMUL.S */ +RVOP(fmuls, { rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2]; }) + +/* FDIV.S */ +RVOP(fdivs, { rv->F[ir->rd] = rv->F[ir->rs1] / rv->F[ir->rs2]; }) + +/* FSQRT.S */ +RVOP(fsqrts, { rv->F[ir->rd] = sqrtf(rv->F[ir->rs1]); }) + +/* FSGNJ.S */ +RVOP(fsgnjs, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = (f1 & ~FMASK_SIGN) | (f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); +}) + +/* FSGNJN.S */ +RVOP(fsgnjns, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = (f1 & ~FMASK_SIGN) | (~f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); +}) + +/* FSGNJX.S */ +RVOP(fsgnjxs, { + uint32_t f1; + uint32_t f2; + uint32_t res; + memcpy(&f1, rv->F + ir->rs1, 4); + memcpy(&f2, rv->F + ir->rs2, 4); + res = f1 ^ (f2 & FMASK_SIGN); + memcpy(rv->F + ir->rd, &res, 4); 
+}) + +/* FMIN.S + * In IEEE754-201x, fmin(x, y) return + * - min(x,y) if both numbers are not NaN + * - if one is NaN and another is a number, return the number + * - if both are NaN, return NaN + * When input is signaling NaN, raise invalid operation + */ +RVOP(fmins, { + uint32_t x; + uint32_t y; + memcpy(&x, rv->F + ir->rs1, 4); + memcpy(&y, rv->F + ir->rs2, 4); + if (is_nan(x) || is_nan(y)) { + if (is_snan(x) || is_snan(y)) rv->csr_fcsr |= FFLAG_INVALID_OP; + if (is_nan(x) && !is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs2]; + } else if (!is_nan(x) && is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs1]; } else { - rv->F[ir->rd] = rv->F[ir->rs1] + rv->F[ir->rs2]; - } - if (isinff(rv->F[ir->rd])) { - rv->csr_fcsr |= FFLAG_OVERFLOW; - rv->csr_fcsr |= FFLAG_INEXACT; - } - }) - - /* FSUB.S */ - _(fsubs, { - if (isnanf(rv->F[ir->rs1]) || isnanf(rv->F[ir->rs2])) { rv->F_int[ir->rd] = RV_NAN; + } + } else { + uint32_t a_sign; + uint32_t b_sign; + a_sign = x & FMASK_SIGN; + b_sign = y & FMASK_SIGN; + if (a_sign != b_sign) { + rv->F[ir->rd] = a_sign ? rv->F[ir->rs1] : rv->F[ir->rs2]; } else { - rv->F[ir->rd] = rv->F[ir->rs1] - rv->F[ir->rs2]; + rv->F[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? rv->F[ir->rs1] + : rv->F[ir->rs2]; } - }) - - /* FMUL.S */ - _(fmuls, rv->F[ir->rd] = rv->F[ir->rs1] * rv->F[ir->rs2];) - - /* FDIV.S */ - _(fdivs, rv->F[ir->rd] = rv->F[ir->rs1] / rv->F[ir->rs2];) - - /* FSQRT.S */ - _(fsqrts, rv->F[ir->rd] = sqrtf(rv->F[ir->rs1]);) - - /* FSGNJ.S */ - _(fsgnjs, { - uint32_t f1; - uint32_t f2; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - uint32_t res = (f1 & ~FMASK_SIGN) | (f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FSGNJN.S */ - _(fsgnjns, { - uint32_t f1; - uint32_t f2; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - uint32_t res = (f1 & ~FMASK_SIGN) | (~f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FSGNJX.S */ - _(fsgnjxs, { - uint32_t f1; - uint32_t f2; - uint32_t res; - memcpy(&f1, rv->F + ir->rs1, 4); - memcpy(&f2, rv->F + ir->rs2, 4); - res = f1 ^ (f2 & FMASK_SIGN); - memcpy(rv->F + ir->rd, &res, 4); - }) - - /* FMIN.S */ - _(fmins, { - /* In IEEE754-201x, fmin(x, y) return - * - min(x,y) if both numbers are not NaN - * - if one is NaN and another is a number, return the number - * - if both are NaN, return NaN - * When input is signaling NaN, raise invalid operation - */ - uint32_t x; - uint32_t y; - memcpy(&x, rv->F + ir->rs1, 4); - memcpy(&y, rv->F + ir->rs2, 4); - if (is_nan(x) || is_nan(y)) { - if (is_snan(x) || is_snan(y)) - rv->csr_fcsr |= FFLAG_INVALID_OP; - if (is_nan(x) && !is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs2]; - } else if (!is_nan(x) && is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs1]; - } else { - rv->F_int[ir->rd] = RV_NAN; - } + } +}) + +/* FMAX.S */ +RVOP(fmaxs, { + uint32_t x; + uint32_t y; + memcpy(&x, rv->F + ir->rs1, 4); + memcpy(&y, rv->F + ir->rs2, 4); + if (is_nan(x) || is_nan(y)) { + if (is_snan(x) || is_snan(y)) + rv->csr_fcsr |= FFLAG_INVALID_OP; + if (is_nan(x) && !is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs2]; + } else if (!is_nan(x) && is_nan(y)) { + rv->F[ir->rd] = rv->F[ir->rs1]; } else { - uint32_t a_sign; - uint32_t b_sign; - a_sign = x & FMASK_SIGN; - b_sign = y & FMASK_SIGN; - if (a_sign != b_sign) { - rv->F[ir->rd] = a_sign ? rv->F[ir->rs1] : rv->F[ir->rs2]; - } else { - rv->F[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) - ? 
rv->F[ir->rs1] - : rv->F[ir->rs2]; - } + rv->F_int[ir->rd] = RV_NAN; } - }) - - /* FMAX.S */ - _(fmaxs, { - uint32_t x; - uint32_t y; - memcpy(&x, rv->F + ir->rs1, 4); - memcpy(&y, rv->F + ir->rs2, 4); - if (is_nan(x) || is_nan(y)) { - if (is_snan(x) || is_snan(y)) - rv->csr_fcsr |= FFLAG_INVALID_OP; - if (is_nan(x) && !is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs2]; - } else if (!is_nan(x) && is_nan(y)) { - rv->F[ir->rd] = rv->F[ir->rs1]; - } else { - rv->F_int[ir->rd] = RV_NAN; - } + } else { + uint32_t a_sign; + uint32_t b_sign; + a_sign = x & FMASK_SIGN; + b_sign = y & FMASK_SIGN; + if (a_sign != b_sign) { + rv->F[ir->rd] = a_sign ? rv->F[ir->rs2] : rv->F[ir->rs1]; } else { - uint32_t a_sign; - uint32_t b_sign; - a_sign = x & FMASK_SIGN; - b_sign = y & FMASK_SIGN; - if (a_sign != b_sign) { - rv->F[ir->rd] = a_sign ? rv->F[ir->rs2] : rv->F[ir->rs1]; - } else { - rv->F[ir->rd] = (rv->F[ir->rs1] > rv->F[ir->rs2]) - ? rv->F[ir->rs1] - : rv->F[ir->rs2]; - } + rv->F[ir->rd] = (rv->F[ir->rs1] > rv->F[ir->rs2]) ? rv->F[ir->rs1] + : rv->F[ir->rs2]; } - }) + } +}) - /* FCVT.W.S */ - _(fcvtws, rv->X[ir->rd] = (int32_t) rv->F[ir->rs1];) +/* FCVT.W.S */ +RVOP(fcvtws, { rv->X[ir->rd] = (int32_t) rv->F[ir->rs1]; }) - /* FCVT.WU.S */ - _(fcvtwus, rv->X[ir->rd] = (uint32_t) rv->F[ir->rs1];) +/* FCVT.WU.S */ +RVOP(fcvtwus, { rv->X[ir->rd] = (uint32_t) rv->F[ir->rs1]; }) - /* FMV.X.W */ - _(fmvxw, memcpy(rv->X + ir->rd, rv->F + ir->rs1, 4);) +/* FMV.X.W */ +RVOP(fmvxw, { memcpy(rv->X + ir->rd, rv->F + ir->rs1, 4); }) - /* FEQ.S performs a quiet comparison: it only sets the invalid - * operation exception flag if either input is a signaling NaN. - */ - _(feqs, rv->X[ir->rd] = (rv->F[ir->rs1] == rv->F[ir->rs2]) ? 1 : 0;) +/* FEQ.S performs a quiet comparison: it only sets the invalid + * operation exception flag if either input is a signaling NaN. + */ +RVOP(feqs, { rv->X[ir->rd] = (rv->F[ir->rs1] == rv->F[ir->rs2]) ? 1 : 0; }) - /* FLT.S and FLE.S perform what the IEEE 754-2008 standard refers - * to as signaling comparisons: that is, they set the invalid - * operation exception flag if either input is NaN. - */ - _(flts, rv->X[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? 1 : 0;) +/* FLT.S and FLE.S perform what the IEEE 754-2008 standard refers + * to as signaling comparisons: that is, they set the invalid + * operation exception flag if either input is NaN. + */ +RVOP(flts, { rv->X[ir->rd] = (rv->F[ir->rs1] < rv->F[ir->rs2]) ? 1 : 0; }) - /* FLE.S */ - _(fles, rv->X[ir->rd] = (rv->F[ir->rs1] <= rv->F[ir->rs2]) ? 1 : 0;) +RVOP(fles, { rv->X[ir->rd] = (rv->F[ir->rs1] <= rv->F[ir->rs2]) ? 
1 : 0; }) - /* FCLASS.S */ - _(fclasss, { - uint32_t bits; - memcpy(&bits, rv->F + ir->rs1, 4); - rv->X[ir->rd] = calc_fclass(bits); - }) +/* FCLASS.S */ +RVOP(fclasss, { + uint32_t bits; + memcpy(&bits, rv->F + ir->rs1, 4); + rv->X[ir->rd] = calc_fclass(bits); +}) - /* FCVT.S.W */ - _(fcvtsw, rv->F[ir->rd] = (float) (int32_t) rv->X[ir->rs1];) +/* FCVT.S.W */ +RVOP(fcvtsw, { rv->F[ir->rd] = (float) (int32_t) rv->X[ir->rs1]; }) - /* FCVT.S.WU */ - _(fcvtswu, rv->F[ir->rd] = (float) (uint32_t) rv->X[ir->rs1];) +/* FCVT.S.WU */ +RVOP(fcvtswu, { rv->F[ir->rd] = (float) (uint32_t) rv->X[ir->rs1]; }) - /* FMV.W.X */ - _(fmvwx, memcpy(rv->F + ir->rd, rv->X + ir->rs1, 4);) -#endif /* RV32_HAS(EXT_F) */ +/* FMV.W.X */ +RVOP(fmvwx, { memcpy(rv->F + ir->rd, rv->X + ir->rs1, 4); }) +#endif - /* RV32C Standard Extension */ -#if RV32_HAS(EXT_C) - /* C.ADDI4SPN is a CIW-format instruction that adds a zero-extended - * non-zero immediate, scaledby 4, to the stack pointer, x2, and - * writes the result to rd'. This instruction is used to generate - * pointers to stack-allocated variables, and expands to addi rd', - * x2, nzuimm[9:2]. - */ - _(caddi4spn, rv->X[ir->rd] = rv->X[2] + (uint16_t) ir->imm;) +#if RV32_HAS(EXT_C) /* RV32C Standard Extension */ +/* C.ADDI4SPN is a CIW-format instruction that adds a zero-extended + * non-zero immediate, scaledby 4, to the stack pointer, x2, and writes + * the result to rd'. This instruction is used to generate pointers to + * stack-allocated variables, and expands to addi rd', x2, nzuimm[9:2]. + */ +RVOP(caddi4spn, { rv->X[ir->rd] = rv->X[2] + (uint16_t) ir->imm; }) - /* C.LW loads a 32-bit value from memory into register rd'. It - * computes an ffective address by adding the zero-extended offset, - * scaled by 4, to the base address in register rs1'. It expands to - * # lw rd', offset[6:2](rs1'). - */ - _(clw, { - const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; - if (addr & 3) { - rv->compressed = true; - rv_except_load_misaligned(rv, addr); - return false; - } - rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); - }) +/* C.LW loads a 32-bit value from memory into register rd'. It computes + * an ffective address by adding the zero-extended offset, scaled by 4, + * to the base address in register rs1'. It expands to # lw rd', + * offset[6:2](rs1'). + */ +RVOP(clw, { + const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = true; + rv_except_load_misaligned(rv, addr); + return false; + } + rv->X[ir->rd] = rv->io.mem_read_w(rv, addr); +}) - /* C.SW stores a 32-bit value in register rs2' to memory. It computes - * an effective address by adding the zero-extended offset, scaled by - * 4, to the base address in register rs1'. - * It expands to sw rs2', offset[6:2](rs1') - */ - _(csw, { - const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; - if (addr & 3) { - rv->compressed = true; - rv_except_store_misaligned(rv, addr); - return false; - } - rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); - }) - - /* C.NOP */ - _(cnop, /* nothing */) - - /* C.ADDI adds the non-zero sign-extended 6-bit immediate to the - * value in register rd then writes the result to rd. C.ADDI expands - * into addi rd, rd, nzimm[5:0]. C.ADDI is only valid when rd̸=x0. - * The code point with both rd=x0 and nzimm=0 encodes the C.NOP - * instruction; the remaining code points with either rd=x0 or - * nzimm=0 encode HINTs. 
- */ - _(caddi, rv->X[ir->rd] += (int16_t) ir->imm;) - - /* C.JAL */ - _(cjal, { - rv->X[1] = rv->PC + ir->insn_len; - rv->PC += ir->imm; - if (rv->PC & 0x1) { - rv->compressed = true; - rv_except_insn_misaligned(rv, rv->PC); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +/* C.SW stores a 32-bit value in register rs2' to memory. It computes an + * effective address by adding the zero-extended offset, scaled by 4, to + * the base address in register rs1'. + * It expands to sw rs2', offset[6:2](rs1') + */ +RVOP(csw, { + const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; + if (unlikely(addr & 3)) { + rv->compressed = true; + rv_except_store_misaligned(rv, addr); + return false; + } + rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]); +}) - /* C.LI loads the sign-extended 6-bit immediate, imm, into - * register rd. - * C.LI expands into addi rd, x0, imm[5:0]. - * C.LI is only valid when rd=x0; the code points with rd=x0 encode - * HINTs. - */ - _(cli, rv->X[ir->rd] = ir->imm;) +/* C.NOP */ +RVOP(cnop, {/* no operation */}) - /* C.ADDI16SP is used to adjust the stack pointer in procedure - * prologues and epilogues. - * It expands into addi x2, x2, nzimm[9:4]. - * C.ADDI16SP is only valid when nzimm̸=0; the code point with - * nzimm=0 is reserved. - */ - _(caddi16sp, rv->X[ir->rd] += ir->imm;) - - /* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of - * the destination register, clears the bottom 12 bits, and - * sign-extends bit 17 into all higher bits of the destination. - * C.LUI expands into lui rd, nzimm[17:12]. - * C.LUI is only valid when rd̸={x0, x2}, and when the immediate is - * not equal to zero. - */ - _(clui, rv->X[ir->rd] = ir->imm;) +/* C.ADDI adds the non-zero sign-extended 6-bit immediate to the value + * in register rd then writes the result to rd. C.ADDI expands into addi + * rd, rd, nzimm[5:0]. C.ADDI is only valid when rd̸=x0. The code point + * with both rd=x0 and nzimm=0 encodes the C.NOP instruction; the + * remaining code points with either rd=x0 or nzimm=0 encode HINTs. + */ +RVOP(caddi, { rv->X[ir->rd] += (int16_t) ir->imm; }) - /* C.SRLI is a CB-format instruction that performs a logical right - * shift of the value in register rd' then writes the result to rd'. - * The shift amount is encoded in the shamt field. C.SRLI expands - * into srli rd', rd', shamt[5:0]. - */ - _(csrli, rv->X[ir->rs1] >>= ir->shamt;) +/* C.JAL */ +RVOP(cjal, { + rv->X[1] = rv->PC + ir->insn_len; + rv->PC += ir->imm; + if (unlikely(rv->PC & 0x1)) { + rv->compressed = true; + rv_except_insn_misaligned(rv, rv->PC); + return false; + } +}) - /* C.SRAI is defined analogously to C.SRLI, but instead performs an - * arithmetic right shift. - * C.SRAI expands to srai rd', rd', shamt[5:0]. - */ - _(csrai, { - const uint32_t mask = 0x80000000 & rv->X[ir->rs1]; - rv->X[ir->rs1] >>= ir->shamt; - for (unsigned int i = 0; i < ir->shamt; ++i) - rv->X[ir->rs1] |= mask >> i; - }) - - /* C.ANDI is a CB-format instruction that computes the bitwise AND of - * the value in register rd' and the sign-extended 6-bit immediate, - * then writes the result to rd'. - * C.ANDI expands to andi rd', rd', imm[5:0]. - */ - _(candi, rv->X[ir->rs1] &= ir->imm;) +/* C.LI loads the sign-extended 6-bit immediate, imm, into register rd. + * C.LI expands into addi rd, x0, imm[5:0]. + * C.LI is only valid when rd=x0; the code points with rd=x0 encode + * HINTs. 
+ */ +RVOP(cli, { rv->X[ir->rd] = ir->imm; }) - /* C.SUB */ - _(csub, rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2];) +/* C.ADDI16SP is used to adjust the stack pointer in procedure + * prologues and epilogues. + * It expands into addi x2, x2, nzimm[9:4]. + * C.ADDI16SP is only valid when nzimm̸=0; the code point with nzimm=0 + * is reserved. + */ +RVOP(caddi16sp, { rv->X[ir->rd] += ir->imm; }) + +/* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of the + * destination register, clears the bottom 12 bits, and sign-extends bit + * 17 into all higher bits of the destination. + * C.LUI expands into lui rd, nzimm[17:12]. + * C.LUI is only valid when rd̸={x0, x2}, and when the immediate is not + * equal to zero. + */ +RVOP(clui, { rv->X[ir->rd] = ir->imm; }) - /* C.XOR */ - _(cxor, rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2];) +/* C.SRLI is a CB-format instruction that performs a logical right shift + * of the value in register rd' then writes the result to rd'. The shift + * amount is encoded in the shamt field. C.SRLI expands into srli rd', + * rd', shamt[5:0]. + */ +RVOP(csrli, { rv->X[ir->rs1] >>= ir->shamt; }) - /* C.OR */ - _(cor, rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2];) +/* C.SRAI is defined analogously to C.SRLI, but instead performs an + * arithmetic right shift. C.SRAI expands to srai rd', rd', shamt[5:0]. + */ +RVOP(csrai, { + const uint32_t mask = 0x80000000 & rv->X[ir->rs1]; + rv->X[ir->rs1] >>= ir->shamt; + for (unsigned int i = 0; i < ir->shamt; ++i) + rv->X[ir->rs1] |= mask >> i; +}) + +/* C.ANDI is a CB-format instruction that computes the bitwise AND of + * the value in register rd' and the sign-extended 6-bit immediate, then + * writes the result to rd'. C.ANDI expands to andi rd', rd', imm[5:0]. + */ +RVOP(candi, { rv->X[ir->rs1] &= ir->imm; }) - /* C.AND */ - _(cand, rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2];) +/* C.SUB */ +RVOP(csub, { rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2]; }) - /* C.J performs an unconditional control transfer. The offset is - * sign-extended and added to the pc to form the jump target address. - * C.J can therefore target a ±2 KiB range. - * C.J expands to jal x0, offset[11:1]. - */ - _(cj, { - rv->PC += ir->imm; - if (rv->PC & 0x1) { - rv->compressed = true; - rv_except_insn_misaligned(rv, rv->PC); - return false; - } - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +/* C.XOR */ +RVOP(cxor, { rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }) - /* C.BEQZ performs conditional control transfers. The offset is - * sign-extended and added to the pc to form the branch target - * address. It can therefore target a ±256 B range. C.BEQZ takes the - * branch if the value in register rs1' is zero. - * It expands to beq rs1', x0, offset[8:1]. - */ - _(cbeqz, { - rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +RVOP(cor, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }) - _(cbnez, { - rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; - /* increment the cycles csr */ - rv->csr_cycle++; - /* can branch */ - return true; - }) +RVOP(cand, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }) - /* C.SLLI is a CI-format instruction that performs a logical left - * shift of the value in register rd then writes the result to rd. - * The shift amount is encoded in the shamt field. - * C.SLLI expands into slli rd, rd, shamt[5:0]. 
- */
- _(cslli, rv->X[ir->rd] <<= (uint8_t) ir->imm;)
-
- /* C.LWSP */
- _(clwsp, {
- const uint32_t addr = rv->X[rv_reg_sp] + ir->imm;
- if (addr & 3) {
- rv->compressed = true;
- rv_except_load_misaligned(rv, addr);
- return false;
- }
- rv->X[ir->rd] = rv->io.mem_read_w(rv, addr);
- })
-
- /* C.JR */
- _(cjr, {
- rv->PC = rv->X[ir->rs1];
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
+/* C.J performs an unconditional control transfer. The offset is
+ * sign-extended and added to the pc to form the jump target address.
+ * C.J can therefore target a ±2 KiB range.
+ * C.J expands to jal x0, offset[11:1].
+ */
+RVOP(cj, {
+    rv->PC += ir->imm;
+    if (unlikely(rv->PC & 0x1)) {
+        rv->compressed = true;
+        rv_except_insn_misaligned(rv, rv->PC);
+        return false;
+    }
+})
+
+/* C.BEQZ performs conditional control transfers. The offset is
+ * sign-extended and added to the pc to form the branch target address.
+ * It can therefore target a ±256 B range. C.BEQZ takes the branch if
+ * the value in register rs1' is zero. It expands to beq rs1', x0,
+ * offset[8:1].
+ */
+RVOP(cbeqz,
+    { rv->PC += (!rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; })

- /* C.MV */
- _(cmv, rv->X[ir->rd] = rv->X[ir->rs2];)
+/* C.BNEZ */
+RVOP(cbnez, { rv->PC += (rv->X[ir->rs1]) ? (uint32_t) ir->imm : ir->insn_len; })
+
+/* C.SLLI is a CI-format instruction that performs a logical left shift
+ * of the value in register rd then writes the result to rd. The shift
+ * amount is encoded in the shamt field. C.SLLI expands into slli rd,
+ * rd, shamt[5:0].
+ */
+RVOP(cslli, { rv->X[ir->rd] <<= (uint8_t) ir->imm; })

- /* C.EBREAK */
- _(cebreak, {
+/* C.LWSP */
+RVOP(clwsp, {
+    const uint32_t addr = rv->X[rv_reg_sp] + ir->imm;
+    if (unlikely(addr & 3)) {
         rv->compressed = true;
- rv->io.on_ebreak(rv);
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
-
- /* C.JALR */
- _(cjalr, {
- /* Unconditional jump and store PC+2 to ra */
- const int32_t jump_to = rv->X[ir->rs1];
- rv->X[rv_reg_ra] = rv->PC + ir->insn_len;
- rv->PC = jump_to;
- if (rv->PC & 0x1) {
- rv->compressed = true;
- rv_except_insn_misaligned(rv, rv->PC);
- return false;
- }
- /* increment the cycles csr */
- rv->csr_cycle++;
- /* can branch */
- return true;
- })
-
- /* C.ADD adds the values in registers rd and rs2 and writes the
- * result to register rd.
- * C.ADD expands into add rd, rd, rs2.
- * C.ADD is only valid when rs2=x0; the code points with rs2=x0
- * correspond to the C.JALR and C.EBREAK instructions. The code
- * points with rs2=x0 and rd=x0 are HINTs.
- */
- _(cadd, rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2];)
-
- /* C.SWSP */
- _(cswsp, {
- const uint32_t addr = rv->X[2] + ir->imm;
- if (addr & 3) {
- rv->compressed = true;
- rv_except_store_misaligned(rv, addr);
- return false;
- }
- rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]);
- })
-#endif /* RV32_HAS(EXT_C) */
+        rv_except_load_misaligned(rv, addr);
+        return false;
+    }
+    rv->X[ir->rd] = rv->io.mem_read_w(rv, addr);
+})
+
+/* C.JR */
+RVOP(cjr, { rv->PC = rv->X[ir->rs1]; })
+
+/* C.MV */
+RVOP(cmv, { rv->X[ir->rd] = rv->X[ir->rs2]; })
+
+/* C.EBREAK */
+RVOP(cebreak, {
+    rv->compressed = true;
+    rv->io.on_ebreak(rv);
+})
+
+/* C.JALR */
+RVOP(cjalr, {
+    /* Unconditional jump and store PC+2 to ra */
+    const int32_t jump_to = rv->X[ir->rs1];
+    rv->X[rv_reg_ra] = rv->PC + ir->insn_len;
+    rv->PC = jump_to;
+    if (unlikely(rv->PC & 0x1)) {
+        rv->compressed = true;
+        rv_except_insn_misaligned(rv, rv->PC);
+        return false;
+    }
+})
+
+/* C.ADD adds the values in registers rd and rs2 and writes the
+ * result to register rd.
+ * C.ADD expands into add rd, rd, rs2.
+ * C.ADD is only valid when rs2̸=x0; the code points with rs2=x0
+ * correspond to the C.JALR and C.EBREAK instructions. The code
+ * points with rs2̸=x0 and rd=x0 are HINTs.
+ */
+RVOP(cadd, { rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2]; })

-#undef _
+/* C.SWSP */
+RVOP(cswsp, {
+    const uint32_t addr = rv->X[2] + ir->imm;
+    if (unlikely(addr & 3)) {
+        rv->compressed = true;
+        rv_except_store_misaligned(rv, addr);
+        return false;
+    }
+    rv->io.mem_write_w(rv, addr, rv->X[ir->rs2]);
+})
+#endif

- EPILOGUE()
-}
+static const void *dispatch_table[] = {
+#define _(inst, can_branch) [rv_insn_##inst] = do_##inst,
+    RISCV_INSN_LIST
+#undef _
+};

 static bool insn_is_branch(uint8_t opcode)
 {
     switch (opcode) {
- case rv_insn_jal:
- case rv_insn_jalr:
- case rv_insn_beq:
- case rv_insn_bne:
- case rv_insn_blt:
- case rv_insn_bge:
- case rv_insn_bltu:
- case rv_insn_bgeu:
- case rv_insn_ecall:
- case rv_insn_ebreak:
- case rv_insn_mret:
-#if RV32_HAS(EXT_C)
- case rv_insn_cj:
- case rv_insn_cjr:
- case rv_insn_cjal:
- case rv_insn_cjalr:
- case rv_insn_cbeqz:
- case rv_insn_cbnez:
- case rv_insn_cebreak:
-#endif
-#if RV32_HAS(Zifencei)
- case rv_insn_fencei:
-#endif
+#define _(inst, can_branch) IIF(can_branch)(case rv_insn_##inst:, )
+    RISCV_INSN_LIST
+#undef _
         return true;
     }
     return false;
@@ -1449,7 +1321,7 @@ static void block_translate(riscv_t *rv, block_t *block)
             rv_except_illegal_insn(rv, insn);
             break;
         }
-
+        ir->impl = dispatch_table[ir->opcode];
         /* compute the end of pc */
         block->pc_end += ir->insn_len;
         block->n_insn++;
@@ -1458,6 +1330,7 @@ static void block_translate(riscv_t *rv, block_t *block)
         if (insn_is_branch(ir->opcode))
             break;
     }
+    block->ir[block->n_insn - 1].tailcall = true;
 }

 static block_t *block_find_or_translate(riscv_t *rv, block_t *prev)
@@ -1520,7 +1393,8 @@ void rv_step(riscv_t *rv, int32_t cycles)
         assert(block);

         /* execute the block */
- if (!emulate(rv, block))
+        const rv_insn_t *ir = block->ir;
+        if (unlikely(!ir->impl(rv, ir)))
             break;

         prev = block;
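The hunks above replace the old switch-based emulate() loop with one handler per instruction: block_translate() resolves ir->impl from dispatch_table, flags the last instruction of every block with tailcall, and rv_step() then enters the block through a single indirect call. The sketch below is a minimal, self-contained illustration of that dispatch shape, under the assumption that each handler simply chains to the adjacent IR entry; the names vm_t, insn_t, impl_t, OP, do_addi, and do_print are hypothetical stand-ins, not the RVOP definition used by this patch.

/* tailcall_sketch.c - illustrative only; build: cc -std=gnu99 -O2 tailcall_sketch.c */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Request a guaranteed tail call where the musttail attribute exists
 * (e.g. recent clang); otherwise fall back to the optimizer's own
 * sibling-call elimination.
 */
#if defined(__has_attribute)
#if __has_attribute(musttail)
#define MUST_TAIL __attribute__((musttail))
#endif
#endif
#ifndef MUST_TAIL
#define MUST_TAIL
#endif

typedef struct vm vm_t;     /* hypothetical stand-in for riscv_t   */
typedef struct insn insn_t; /* hypothetical stand-in for rv_insn_t */
typedef bool (*impl_t)(vm_t *, const insn_t *);

struct insn {
    impl_t impl;   /* handler resolved at translate time, like ir->impl */
    int32_t imm;   /* a single operand is enough for the sketch         */
    bool tailcall; /* set on the last instruction of a block            */
};

struct vm {
    int32_t acc; /* one accumulator stands in for the register file */
};

/* Each handler runs its body, then either returns to the interpreter
 * (end of block) or tail-calls the next handler of the same block, so
 * the whole chain can reuse one stack frame.
 */
#define OP(name, body)                                \
    static bool do_##name(vm_t *vm, const insn_t *ir) \
    {                                                 \
        body;                                         \
        if (ir->tailcall)                             \
            return true;                              \
        MUST_TAIL return ir[1].impl(vm, ir + 1);      \
    }

OP(addi, { vm->acc += ir->imm; })
OP(print, { printf("acc = %d\n", (int) vm->acc); })

int main(void)
{
    /* A "translated block": impl pointers filled in up front and the
     * last entry flagged, mirroring what block_translate() does above.
     */
    insn_t block[] = {
        {.impl = do_addi, .imm = 40, .tailcall = false},
        {.impl = do_addi, .imm = 2, .tailcall = false},
        {.impl = do_print, .imm = 0, .tailcall = true},
    };
    vm_t vm = {0};
    return block[0].impl(&vm, block) ? 0 : 1; /* prints "acc = 42" */
}

When the attribute is honored, the chained handler calls become jumps and block length no longer affects stack depth; without it, correctness is unchanged but the fallback relies on the compiler eliminating the tail calls on its own, which is why each handler stays small and the chain always ends at the tailcall flag.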