From 7fd8a6d4bced2fcad11b30189e1fba4c32eaafc5 Mon Sep 17 00:00:00 2001
From: Angus Gratton
Date: Tue, 20 Feb 2024 12:14:44 +1100
Subject: [PATCH] stm32/dma: Add D-cache protection for DMA RX operations,
 including SPI.

This new DMA API corrects possible cache coherency issues on chips with
D-Cache, when working with buffers at arbitrary memory locations (i.e.
supplied by Python code).

The API is used by SPI to fix an issue with corrupt data when reading
from SPI using DMA in certain cases.

A regression test is included (it depends on external hardware
connection).

Explanation:

1) It's necessary to invalidate D-Cache after a DMA RX operation
completes in case the CPU reads (or speculatively reads) from the DMA RX
region during the operation. This seems to have been the root cause of
issue #13471 (only when src==dest for this case).

2) More generally, it is also necessary to temporarily mark the first
and last cache lines of a DMA RX operation as "uncached", in case the
DMA buffer shares this cache line with unrelated data. The CPU could
otherwise write the other data at any time during the DMA operation (for
example from an interrupt handler), creating a dirty cache line that's
inconsistent with the DMA result.

Fixes issue #13471.

This work was funded through GitHub Sponsors.
Signed-off-by: Angus Gratton
---
 ports/stm32/dma.c                             | 53 +++++++++++++++++++
 ports/stm32/dma.h                             | 38 +++++++++++++
 ports/stm32/mpu.h                             | 16 ++++++
 ports/stm32/spi.c                             |  8 ++-
 tests/ports/stm32_hardware/dma_alignment.py   | 43 +++++++++++++++
 .../ports/stm32_hardware/dma_alignment.py.exp |  2 +
 6 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100644 tests/ports/stm32_hardware/dma_alignment.py
 create mode 100644 tests/ports/stm32_hardware/dma_alignment.py.exp

diff --git a/ports/stm32/dma.c b/ports/stm32/dma.c
index af56d81236..df8a408cbf 100644
--- a/ports/stm32/dma.c
+++ b/ports/stm32/dma.c
@@ -33,6 +33,7 @@
 #include "systick.h"
 #include "dma.h"
 #include "irq.h"
+#include "mpu.h"
 
 // When this option is enabled, the DMA will turn off automatically after
 // a period of inactivity.
@@ -1852,3 +1853,55 @@ void dma_external_acquire(uint32_t controller, uint32_t stream) {
 void dma_external_release(uint32_t controller, uint32_t stream) {
     dma_disable_clock(DMA_ID_FROM_CONTROLLER_STREAM(controller, stream));
 }
+
+#if __DCACHE_PRESENT
+
+void dma_protect_rx_region(void *dest, size_t len) {
+    #if __DCACHE_PRESENT
+    uint32_t start_addr = (uint32_t)dest;
+    uint32_t start_aligned = start_addr & ~(__SCB_DCACHE_LINE_SIZE - 1U);
+    uint32_t end_addr = start_addr + len - 1; // Address of last byte in the buffer
+    uint32_t end_aligned = end_addr & ~(__SCB_DCACHE_LINE_SIZE - 1U);
+
+    uint32_t irq_state = mpu_config_start();
+
+    // Clean (write back) any cached memory in this region, so there's no dirty
+    // cache entries that might be written back later after DMA RX is done.
+    MP_HAL_CLEAN_DCACHE(dest, len);
+
+    // The way we protect the whole region is to mark the first and last cache
+    // line as UNCACHED using the MPU. This means any unrelated reads/writes in
+    // these cache lines will bypass the cache, and can coexist with DMA also
+    // writing to parts of these cache lines.
+    //
+    // This is redundant sometimes (because the DMA region fills the entire cache line, or because
+    // the region fits in a single cache line.) However, the implementation is only 3 register writes so
+    // it's more efficient to call it every time.
+    mpu_config_region(MPU_REGION_DMA_UNCACHED_1, start_aligned, MPU_CONFIG_UNCACHED(MPU_REGION_SIZE_32B));
+    mpu_config_region(MPU_REGION_DMA_UNCACHED_2, end_aligned, MPU_CONFIG_UNCACHED(MPU_REGION_SIZE_32B));
+
+    mpu_config_end(irq_state);
+    #endif
+}
+
+void dma_unprotect_rx_region(void *dest, size_t len) {
+    #if __DCACHE_PRESENT
+    uint32_t irq_state = mpu_config_start();
+
+    // Disabling these regions removes them from the MPU
+    mpu_config_region(MPU_REGION_DMA_UNCACHED_1, 0, MPU_CONFIG_DISABLE);
+    mpu_config_region(MPU_REGION_DMA_UNCACHED_2, 0, MPU_CONFIG_DISABLE);
+
+    // Invalidate the whole region in the cache. This may seem redundant, but it
+    // is possible that during the DMA operation the CPU read inside this region
+    // (excluding the first & last cache lines), and cache lines were filled.
+    //
+    // (This can happen in SPI if src==dest, for example, possibly due to speculative
+    // cache line fills.)
+    MP_HAL_CLEANINVALIDATE_DCACHE(dest, len);
+
+    mpu_config_end(irq_state);
+    #endif
+}
+
+#endif
diff --git a/ports/stm32/dma.h b/ports/stm32/dma.h
index fa3238413e..7ba04baf52 100644
--- a/ports/stm32/dma.h
+++ b/ports/stm32/dma.h
@@ -26,6 +26,8 @@
 #ifndef MICROPY_INCLUDED_STM32_DMA_H
 #define MICROPY_INCLUDED_STM32_DMA_H
 
+#include "py/mpconfig.h"
+
 typedef struct _dma_descr_t dma_descr_t;
 
 #if defined(STM32H5)
@@ -164,4 +166,40 @@ void dma_nohal_start(const dma_descr_t *descr, uint32_t src_addr, uint32_t dst_a
 void dma_external_acquire(uint32_t controller, uint32_t stream);
 void dma_external_release(uint32_t controller, uint32_t stream);
 
+#if __DCACHE_PRESENT
+// On chips with D-Cache, it's necessary to call this function before using DMA to read
+// into a user-supplied buffer. It does two things:
+//
+// 1) Cleans this region in the cache.
+//
+// 2) Temporarily disables caching for the first and last cache lines of the
+// buffer. This prevents cache coherency issues if the start or end of the
+// buffer don't align to a cache line. (For example, this can happen if the CPU
+// accesses unrelated memory adjacent to the buffer and in the same cache
+// line.)
+//
+// Only one region can be protected in this way at a time.
+void dma_protect_rx_region(void *dest, size_t len);
+
+// Release the region protected by dma_protect_rx_region().
+//
+// Call this after the DMA operation completes, before the CPU reads any of the
+// data that was written.
+//
+// As well as restoring data caching, this function invalidates D-cache for the
+// entire specified region.
+void dma_unprotect_rx_region(void *dest, size_t len);
+
+#else
+
+inline static void dma_protect_rx_region(void *dest, size_t len) {
+    // No-ops on targets without D-Cache.
+}
+
+inline static void dma_unprotect_rx_region(void *dest, size_t len) {
+
+}
+
+#endif // __DCACHE_PRESENT
+
 #endif // MICROPY_INCLUDED_STM32_DMA_H
diff --git a/ports/stm32/mpu.h b/ports/stm32/mpu.h
index 62a428198e..64880a85db 100644
--- a/ports/stm32/mpu.h
+++ b/ports/stm32/mpu.h
@@ -37,6 +37,10 @@
 #define MPU_REGION_SDRAM1 (MPU_REGION_NUMBER4)
 #define MPU_REGION_SDRAM2 (MPU_REGION_NUMBER5)
 
+// Only relevant on CPUs with D-Cache, must be higher priority than SDRAM
+#define MPU_REGION_DMA_UNCACHED_1 (MPU_REGION_NUMBER6)
+#define MPU_REGION_DMA_UNCACHED_2 (MPU_REGION_NUMBER7)
+
 // Attribute value to disable a region entirely, remove it from the MPU
 // (i.e. the MPU_REGION_ENABLE bit is unset.)
#define MPU_CONFIG_DISABLE 0 @@ -78,6 +82,18 @@ | MPU_REGION_ENABLE << MPU_RASR_ENABLE_Pos \ ) +#define MPU_CONFIG_UNCACHED(size) ( \ + MPU_INSTRUCTION_ACCESS_DISABLE << MPU_RASR_XN_Pos \ + | MPU_REGION_FULL_ACCESS << MPU_RASR_AP_Pos \ + | MPU_TEX_LEVEL1 << MPU_RASR_TEX_Pos \ + | MPU_ACCESS_NOT_SHAREABLE << MPU_RASR_S_Pos \ + | MPU_ACCESS_NOT_CACHEABLE << MPU_RASR_C_Pos \ + | MPU_ACCESS_NOT_BUFFERABLE << MPU_RASR_B_Pos \ + | 0x00 << MPU_RASR_SRD_Pos \ + | (size) << MPU_RASR_SIZE_Pos \ + | MPU_REGION_ENABLE << MPU_RASR_ENABLE_Pos \ + ) + static inline void mpu_init(void) { MPU->CTRL = MPU_PRIVILEGED_DEFAULT | MPU_CTRL_ENABLE_Msk; SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk; diff --git a/ports/stm32/spi.c b/ports/stm32/spi.c index 5aace8e9a3..607b3fe685 100644 --- a/ports/stm32/spi.c +++ b/ports/stm32/spi.c @@ -571,6 +571,8 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de // Note: DMA transfers are limited to 65535 bytes at a time. HAL_StatusTypeDef status; + void *odest = dest; // Original values of dest & len + size_t olen = len; if (dest == NULL) { // send only @@ -613,7 +615,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de } dma_init(&rx_dma, self->rx_dma_descr, DMA_PERIPH_TO_MEMORY, self->spi); self->spi->hdmarx = &rx_dma; - MP_HAL_CLEANINVALIDATE_DCACHE(dest, len); + dma_protect_rx_region(dest, len); uint32_t t_start = HAL_GetTick(); do { uint32_t l = MIN(len, 65535); @@ -632,6 +634,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de dma_deinit(self->tx_dma_descr); } dma_deinit(self->rx_dma_descr); + dma_unprotect_rx_region(odest, olen); } } else { // send and receive @@ -644,7 +647,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de dma_init(&rx_dma, self->rx_dma_descr, DMA_PERIPH_TO_MEMORY, self->spi); self->spi->hdmarx = &rx_dma; MP_HAL_CLEAN_DCACHE(src, len); - MP_HAL_CLEANINVALIDATE_DCACHE(dest, len); + 
dma_protect_rx_region(dest, len); uint32_t t_start = HAL_GetTick(); do { uint32_t l = MIN(len, 65535); @@ -662,6 +665,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de } while (len); dma_deinit(self->tx_dma_descr); dma_deinit(self->rx_dma_descr); + dma_unprotect_rx_region(odest, olen); } } diff --git a/tests/ports/stm32_hardware/dma_alignment.py b/tests/ports/stm32_hardware/dma_alignment.py new file mode 100644 index 0000000000..1836b25d86 --- /dev/null +++ b/tests/ports/stm32_hardware/dma_alignment.py @@ -0,0 +1,43 @@ +from machine import SPI +# Regression test for DMA for DCache coherency bugs with cache line +# written originally for https://github.com/micropython/micropython/issues/13471 + +# IMPORTANT: This test requires SPI2 MISO (pin Y8 on Pyboard D) to be connected to GND + +SPI_NUM = 2 + +spi = SPI(SPI_NUM, baudrate=5_000_000) +buf = bytearray(1024) +ok = True + +for offs in range(0, len(buf)): + v = memoryview(buf)[offs : offs + 128] + spi.readinto(v, 0xFF) + if not all(b == 0x00 for b in v): + print(offs, v.hex()) + ok = False + +print("Variable offset fixed length " + ("OK" if ok else "FAIL")) + +# this takes around 30s to run, so skipped if already failing +if ok: + for op_len in range(1, 66): + wr = b"\xFF" * op_len + for offs in range(1, len(buf) - op_len - 1): + # Place some "sentinel" values before and after the DMA buffer + before = offs & 0xFF + after = (~offs) & 0xFF + buf[offs - 1] = before + buf[offs + op_len] = after + v = memoryview(buf)[offs : offs + op_len] + spi.write_readinto(wr, v) + if ( + not all(b == 0x00 for b in v) + or buf[offs - 1] != before + or buf[offs + op_len] != after + ): + print(v.hex()) + print(hex(op_len), hex(offs), hex(buf[offs - 1]), hex(buf[offs + op_len])) + ok = False + + print("Variable offset and lengths " + ("OK" if ok else "FAIL")) diff --git a/tests/ports/stm32_hardware/dma_alignment.py.exp b/tests/ports/stm32_hardware/dma_alignment.py.exp new file mode 100644 index 
0000000000..e890e0081c --- /dev/null +++ b/tests/ports/stm32_hardware/dma_alignment.py.exp @@ -0,0 +1,2 @@ +Variable offset fixed length OK +Variable offset and lengths OK