cpu/esp8266: add LoadStoreError exception handler

Normally, the IROM (flash) memory can only be accessed with 32-bit word aligned reads. Attempts to read data of less than 32 bits in size from the IROM (flash) memory trigger a LoadStoreError exception. With the exception handler from esp-open-rtos, it becomes possible to access data in IROM (flash) with a size of less than 32 bits and thus to place .rodata sections in the IROM (flash).
RIOT-OS · Apr 15, 2019 · 633887e · 633887e
1 parent 7bbe94c
commit 633887e
Showing 1 changed file with 312 additions and 0 deletions.
diff --git a/cpu/esp_common/vendor/xtensa/xtensa_vectors.S b/cpu/esp_common/vendor/xtensa/xtensa_vectors.S
@@ -485,12 +485,324 @@ User Exception (including Level 1 Interrupt from user mode).
 
 _UserExceptionVector:
 
+#ifdef MCU_ESP8266
+    wsr     a0, EXCSAVE_1                   /* preserve a0 in EXCSAVE_1 */
+    j       _UserExceptionTrampoline        /* plain jump, a0 stays intact */
+#else
     wsr     a0, EXCSAVE_1                   /* preserve a0 */
     call0   _xt_user_exc                    /* user exception handler */
     /* never returns here - call0 is used as a jump (see note at top) */
+#endif
 
     .end        literal_prefix
 
+#ifdef MCU_ESP8266
+/*************************** LoadStoreError Handler BEGIN ********************/
+/*
+ * PLEASE NOTE: The code between "LoadStoreError Handler BEGIN" and
+ * "LoadStoreError Handler END" markers was extracted from esp-open-rtos. It is
+ * under the following copyright:
+ *
+ * Original vector contents Copyright (C) 2014-2015 Espressif Systems
+ * Additions Copyright (C) Superhouse Automation Pty Ltd and Angus Gratton
+ * BSD Licensed as described in the file LICENSE
+ *
+ * Normally, the IROM (flash) memory can only be accessed with 32-bit word
+ * aligned reads. An attempt to read less than 32 bits from the IROM (flash)
+ * memory triggers a LoadStoreError exception. Without further measures it is
+ * therefore not possible to place .rodata sections in IROM (flash); they
+ * have to be placed in RAM instead. The exception handler from esp-open-rtos
+ * emulates reads of less than 32 bits from IROM (flash) and thus makes it
+ * possible to place .rodata sections in the IROM (flash).
+ */
+
+#define CAUSE_LOADSTORE         3   /* EXCCAUSE value of LoadStoreError */
+#define fatal_exception_handler _xt_user_exc    /* used if emulation fails */
+
+/* LoadStoreError handler stack (save slot offset = register number * 4) */
+
+    .section .bss
+    .balign 16
+
+_LoadStoreErrorHandlerStack:
+    .word   0       # a0
+    .word   0       # (unused, a1 is kept in excsave1 instead)
+    .word   0       # a2
+    .word   0       # a3
+    .word   0       # a4
+
+/* LoadStoreError Trampoline */
+
+    .section .UserExceptionTrampoline.text, "x"
+    .literal_position
+    .balign 4
+
+_UserExceptionTrampoline:
+
+    /* Entered by jump from _UserExceptionVector with a0 saved in EXCSAVE_1.
+     * a1 serves as scratch for the EXCCAUSE test and is parked in EXCSAVE_2. */
+    wsr     a1, EXCSAVE_2                   /* preserve a1 */
+    rsr     a1, exccause
+    beqi    a1, CAUSE_LOADSTORE, _LoadStoreErrorHandler
+    rsr     a1, EXCSAVE_2                   /* other cause: restore a1 */
+    call0   _xt_user_exc                    /* user exception handler */
+    /* never returns here - call0 is used as a jump (see note at top) */
+
+    /*
+     * Xtensa "Load/Store Exception" handler:
+     * Completes L8UI/L16UI/L16SI load instructions from Instruction address
+     * space, for which the architecture only supports 32-bit reads.
+     *
+     * Entered from _UserExceptionTrampoline if EXCCAUSE is CAUSE_LOADSTORE,
+     * with the interrupted a1 in EXCSAVE_2; a0, a2..a15 and SAR still live.
+     * (Fast path (no branches) is for L8UI)
+     */
+    .literal_position
+    .balign 4
+    .type   _LoadStoreErrorHandler, @function
+
+_LoadStoreErrorHandler:
+
+    rsr     a1, EXCSAVE_2                   /* restore a1 */
+    wsr     a1, EXCSAVE_1                   /* park it in EXCSAVE_1 from now on */
+    /* Registers are saved in the address corresponding to their register
+     * number times 4.  This allows a quick and easy mapping later on when
+     * needing to store the value to a particular register number. */
+    movi    sp, _LoadStoreErrorHandlerStack     # sp (a1) = save area base
+    s32i    a0, sp, 0
+    s32i    a2, sp, 0x08
+    s32i    a3, sp, 0x0c
+    s32i    a4, sp, 0x10
+    rsr     a0, sar     # Save SAR in a0 to restore later
+
+    /* Examine the opcode which generated the exception */
+    /* Note: Instructions are in this order to avoid pipeline stalls. */
+    rsr     a2, epc1
+    movi    a3, ~3
+    ssa8l   a2          # sar is now correct shift for aligned read
+    and     a2, a2, a3      # a2 now 4-byte aligned address of instruction
+    l32i    a4, a2, 0       # word holding the first instruction byte
+    l32i    a2, a2, 4       # next word, the instruction may span both
+    movi    a3, 0x00700F    # opcode mask for l8ui/l16si/l16ui
+    src     a2, a2, a4      # a2 now instruction that failed
+    and     a3, a2, a3      # a3 is masked instruction
+    bnei    a3, 0x000002, .LSE_check_l16
+
+    /* Note: At this point, opcode could technically be one of two things:
+     *   xx0xx2 (L8UI)
+     *   xx8xx2 (Reserved (invalid) opcode)
+     * It is assumed that we'll never get to this point from an illegal
+     * opcode, so we don't bother to check for that case and presume this
+     * is always an L8UI. */
+
+    movi    a4, ~3
+    rsr     a3, excvaddr    # read faulting address
+    and     a4, a3, a4      # a4 now word aligned read address
+
+    l32i    a4, a4, 0       # perform the actual read
+    ssa8l   a3          # sar is now shift to extract the byte a3 points at
+    srl     a3, a4      # shift right correct distance
+    extui   a4, a3, 0, 8    # mask off bits we need for an l8
+
+.LSE_post_fetch:
+    /* Common tail: reached by fall-through from the L8UI path and by jump
+     * from the L16*I path once the value has been read from memory.
+     * At this point, a2 holds the faulting instruction and a4 holds the
+     * correctly read value.
+     *
+     * Restore original SAR value (saved in a0) and update EPC so we'll
+     * return back to the instruction following the one we just emulated */
+
+    /* Note: Instructions are in this order to avoid pipeline stalls */
+    rsr     a3, epc1
+    wsr     a0, sar
+    addi    a3, a3, 0x3     # step over the emulated instruction (3 bytes)
+    wsr     a3, epc1
+
+    /* Stupid opcode tricks: The jumptable we use later on needs 16 bytes
+     * per entry (so we can avoid a second jump by just doing a RFE inside
+     * each entry).  Unfortunately, however, Xtensa doesn't have an addx16
+     * operation to make that easy for us.  Luckily, all of the faulting
+     * opcodes we're processing are guaranteed to have bit 3 be zero, which
+     * means if we just shift the register bits of the opcode down by 3
+     * instead of 4, we will get the register number multiplied by 2.  This
+     * combined with an addx8 will give us an effective addx16 without
+     * needing any extra shift operations. */
+    extui   a2, a2, 3, 5    # a2 is now destination register 0-15 times 2
+
+    bgei    a2, 10, .LSE_assign_reg     # a5..a15 use jumptable
+    beqi    a2, 2, .LSE_assign_a1       # a1 uses a special routine
+
+    /* We're storing into a0 or a2..a4, which are all saved in our "stack"
+     * area.  Calculate the correct address and stick the value in there,
+     * then just do our normal restore and RFE (no jumps required, which
+     * actually makes a0..a4 substantially faster). */
+    addx2   a2, a2, sp      # a2 = sp + register number * 4 (its save slot)
+    s32i    a4, a2, 0       # replace the saved value with the loaded one
+
+    /* Restore all regs and return */
+    l32i    a0, sp, 0
+    l32i    a2, sp, 0x08
+    l32i    a3, sp, 0x0c
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1    # restore a1 (parked in excsave1 on handler entry)
+    rfe
+
+.LSE_assign_reg:
+    /* At this point, a2 contains the register number times 2, a4 is the
+     * read value. */
+
+    /* Calculate the jumptable address and restore a0 and a3 right here;
+     * a1, a2 and a4 are restored by the jumptable entry after the jump. */
+    /* Note: Instructions are in this order to avoid pipeline stalls. */
+    movi    a3, .LSE_jumptable_base
+    l32i    a0, sp, 0       # a0 is final from here on
+    addx8   a2, a2, a3      # a2 is now the address to jump to
+    l32i    a3, sp, 0x0c    # a3 no longer needed as table base
+
+    jx      a2
+
+    .balign 4
+.LSE_check_l16:
+    /* At this point, a2 contains the opcode, a3 is masked opcode */
+    movi    a4, 0x001002    # l16si or l16ui opcode after masking
+    bne     a3, a4, .LSE_wrong_opcode
+
+    /* Note: At this point, the opcode could be one of two things:
+     *   xx1xx2 (L16UI)
+     *   xx9xx2 (L16SI)
+     * Both of these we can handle. */
+
+    movi    a4, ~3
+    rsr     a3, excvaddr    # read faulting address
+    and     a4, a3, a4      # a4 now word aligned read address
+
+    l32i    a4, a4, 0       # perform the actual read
+    ssa8l   a3          # sar is now shift to extract the bytes a3 points at
+    srl     a3, a4      # shift right correct distance
+    extui   a4, a3, 0, 16   # mask off bits we need for an l16
+
+    bbci    a2, 15, .LSE_post_fetch  # Not a signed op (opcode bit 15 clear)
+    bbci    a4, 15, .LSE_post_fetch  # Value does not need sign-extension
+
+    movi    a3, 0xFFFF0000  # a3 is scratch here, post_fetch reloads it
+    or      a4, a3, a4      # set 32-bit sign bits
+    j       .LSE_post_fetch
+
+.LSE_wrong_opcode:
+    /* Not an opcode we can emulate, so hand over to the regular user
+     * exception handler. It is entered like from _UserExceptionVector: all
+     * registers as they were at exception time, except a0 in EXCSAVE_1. */
+    wsr     a0, sar         # SAR was kept in a0 while decoding
+    l32i    a0, sp, 0
+    l32i    a2, sp, 0x08
+    l32i    a3, sp, 0x0c
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1    # a1 was parked in excsave1 on handler entry
+    wsr     a0, excsave1    # the handler expects the interrupted a0 there
+    call0   fatal_exception_handler
+    /* never returns here - call0 is used as a jump */
+
+    .balign 4
+.LSE_assign_a1:
+    /* a1 is saved in excsave1, so just update that with the value, */
+    wsr     a4, excsave1    # loaded value replaces the parked a1
+    /* Then restore all regs and return */
+    l32i    a0, sp, 0
+    l32i    a2, sp, 0x08
+    l32i    a3, sp, 0x0c
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1    # a1 now receives the loaded value
+    rfe
+
+    .balign 4
+.LSE_jumptable:
+    /* One 16-byte slot per destination register a5..a15, entered with the
+     * loaded value in a4 and a0/a3 already restored.  Slots for a0..a4
+     * (80 bytes) would never be used, so they are not emitted: the table
+     * base is simply declared 80 bytes below the first real slot. */
+    .set    .LSE_jumptable_base, .LSE_jumptable - (16 * 5)
+
+    .org    .LSE_jumptable_base + (16 * 5)
+    mov     a5, a4          # deliver the loaded value
+    l32i    a2, sp, 0x08    # restore a2 (held the jump address)
+    l32i    a4, sp, 0x10    # restore a4 (held the loaded value)
+    rsr     a1, excsave1    # restore a1, sp was the save area base
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 6)
+    mov     a6, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 7)
+    mov     a7, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 8)
+    mov     a8, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 9)
+    mov     a9, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 10)
+    mov     a10, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 11)
+    mov     a11, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 12)
+    mov     a12, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 13)
+    mov     a13, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 14)
+    mov     a14, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+    .org    .LSE_jumptable_base + (16 * 15)
+    mov     a15, a4
+    l32i    a2, sp, 0x08
+    l32i    a4, sp, 0x10
+    rsr     a1, excsave1
+    rfe
+
+/*************************** LoadStoreError Handler END **********************/
+#endif
+
 /*
 --------------------------------------------------------------------------------
   Insert some waypoints for jumping beyond the signed 8-bit range of