stm32/dma: Add D-cache protection for DMA RX operations, including SPI.

This new DMA API corrects possible cache coherency issues on chips with
D-Cache, when working with buffers at arbitrary memory locations (i.e.
supplied by Python code).

The API is used by SPI to fix an issue with corrupt data when reading from
SPI using DMA in certain cases.  A regression test is included (it depends
on an external hardware connection).

Explanation:

1) It's necessary to invalidate D-Cache after a DMA RX operation completes
   in case the CPU reads (or speculatively reads) from the DMA RX region
   during the operation.  This seems to have been the root cause of issue
   #13471 (only when src==dest for this case).

2) More generally, it is also necessary to temporarily mark the first and
   last cache lines of a DMA RX operation as "uncached", in case the DMA
   buffer shares this cache line with unrelated data.  The CPU could
   otherwise write the other data at any time during the DMA operation (for
   example from an interrupt handler), creating a dirty cache line that's
   inconsistent with the DMA result.

Fixes issue #13471.

This work was funded through GitHub Sponsors.

Signed-off-by: Angus Gratton <angus@redyak.com.au>
pull/13717/head
Angus Gratton 2024-02-20 12:14:44 +11:00 zatwierdzone przez Damien George
rodzic 2345c1a04e
commit 7fd8a6d4bc
6 zmienionych plików z 158 dodań i 2 usunięć

Wyświetl plik

@ -33,6 +33,7 @@
#include "systick.h"
#include "dma.h"
#include "irq.h"
#include "mpu.h"
// When this option is enabled, the DMA will turn off automatically after
// a period of inactivity.
@ -1852,3 +1853,55 @@ void dma_external_acquire(uint32_t controller, uint32_t stream) {
// Release a DMA stream previously taken with dma_external_acquire().
// Disables the clock for the stream's controller (dma_disable_clock may
// refcount/defer this internally — see its implementation).
void dma_external_release(uint32_t controller, uint32_t stream) {
    dma_disable_clock(DMA_ID_FROM_CONTROLLER_STREAM(controller, stream));
}
#if __DCACHE_PRESENT
// Prepare a user-supplied buffer to be the destination of a DMA RX
// (peripheral-to-memory) transfer, guarding against D-cache coherency issues.
// Must be paired with a call to dma_unprotect_rx_region() with the same
// dest/len once the transfer has completed.
// Only one region can be protected at a time (uses two fixed MPU regions).
void dma_protect_rx_region(void *dest, size_t len) {
    // NOTE(review): this inner guard is redundant if the enclosing
    // #if __DCACHE_PRESENT block is present — confirm and keep only one.
    #if __DCACHE_PRESENT
    uint32_t start_addr = (uint32_t)dest;
    uint32_t start_aligned = start_addr & ~(__SCB_DCACHE_LINE_SIZE - 1U);
    uint32_t end_addr = start_addr + len - 1; // Address of last byte in the buffer
    uint32_t end_aligned = end_addr & ~(__SCB_DCACHE_LINE_SIZE - 1U);

    // MPU reconfiguration must be done with IRQs disabled; irq_state is
    // restored by mpu_config_end() below.
    uint32_t irq_state = mpu_config_start();

    // Clean (write back) any cached memory in this region, so there's no dirty
    // cache entries that might be written back later after DMA RX is done.
    MP_HAL_CLEAN_DCACHE(dest, len);

    // The way we protect the whole region is to mark the first and last cache
    // line as UNCACHED using the MPU. This means any unrelated reads/writes in
    // these cache lines will bypass the cache, and can coexist with DMA also
    // writing to parts of these cache lines.
    //
    // This is redundant sometimes (because the DMA region fills the entire cache line, or because
    // the region fits in a single cache line.) However, the implementation is only 3 register writes so
    // it's more efficient to call it every time.
    //
    // NOTE(review): MPU_REGION_SIZE_32B assumes __SCB_DCACHE_LINE_SIZE == 32;
    // confirm this holds on every target that enables the D-cache.
    mpu_config_region(MPU_REGION_DMA_UNCACHED_1, start_aligned, MPU_CONFIG_UNCACHED(MPU_REGION_SIZE_32B));
    mpu_config_region(MPU_REGION_DMA_UNCACHED_2, end_aligned, MPU_CONFIG_UNCACHED(MPU_REGION_SIZE_32B));

    mpu_config_end(irq_state);
    #endif
}
// Undo dma_protect_rx_region() after the DMA RX transfer has completed, and
// make the received data visible to the CPU. Must be called with the same
// dest/len values that were passed to dma_protect_rx_region(), and before the
// CPU reads any of the received data.
void dma_unprotect_rx_region(void *dest, size_t len) {
    #if __DCACHE_PRESENT
    uint32_t irq_state = mpu_config_start();

    // Disabling these regions removes them from the MPU
    mpu_config_region(MPU_REGION_DMA_UNCACHED_1, 0, MPU_CONFIG_DISABLE);
    mpu_config_region(MPU_REGION_DMA_UNCACHED_2, 0, MPU_CONFIG_DISABLE);

    // Invalidate the whole region in the cache. This may seem redundant, but it
    // is possible that during the DMA operation the CPU read inside this region
    // (excluding the first & last cache lines), and cache lines were filled.
    //
    // (This can happen in SPI if src==dest, for example, possibly due to speculative
    // cache line fills.)
    MP_HAL_CLEANINVALIDATE_DCACHE(dest, len);

    mpu_config_end(irq_state);
    #endif
}
#endif

Wyświetl plik

@ -26,6 +26,8 @@
#ifndef MICROPY_INCLUDED_STM32_DMA_H
#define MICROPY_INCLUDED_STM32_DMA_H
#include "py/mpconfig.h"
typedef struct _dma_descr_t dma_descr_t;
#if defined(STM32H5)
@ -164,4 +166,40 @@ void dma_nohal_start(const dma_descr_t *descr, uint32_t src_addr, uint32_t dst_a
void dma_external_acquire(uint32_t controller, uint32_t stream);
void dma_external_release(uint32_t controller, uint32_t stream);
#if __DCACHE_PRESENT
// On chips with D-Cache, it's necessary to call this function before using DMA to read
// into a user-supplied buffer. It does two things:
//
// 1) Cleans this region in the cache.
//
// 2) Temporarily disables caching for the first and last cache lines of the
// buffer. This prevents cache coherency issues if the start or end of the
// buffer don't align to a cache line. (For example, this can happen if the CPU
// accesses unrelated memory adjacent to the buffer and in the same cache
// line.)
//
// Only one region can be protected in this way at a time.
void dma_protect_rx_region(void *dest, size_t len);
// Release the region protected by dma_protect_rx_region().
//
// Call this after the DMA operation completes, before the CPU reads any of the
// data that was written.
//
// As well as restoring data caching, this function invalidates D-cache for the
// entire specified region.
void dma_unprotect_rx_region(void *dest, size_t len);
#else
// No-op on targets without a D-Cache: RAM is always coherent with DMA, so no
// cache maintenance or MPU configuration is needed.
// Takes `void *dest` to match the __DCACHE_PRESENT prototype exactly (the
// original used `uint8_t *`, an inconsistent — though implicitly
// convertible — signature).
inline static void dma_protect_rx_region(void *dest, size_t len) {
    (void)dest; // Explicitly discard to avoid unused-parameter warnings.
    (void)len;
}
// No-op on targets without a D-Cache: there is no protection to undo and no
// cache lines to invalidate.
inline static void dma_unprotect_rx_region(void *dest, size_t len) {
    (void)dest; // Explicitly discard to avoid unused-parameter warnings.
    (void)len;
}
#endif // __DCACHE_PRESENT
#endif // MICROPY_INCLUDED_STM32_DMA_H

Wyświetl plik

@ -37,6 +37,10 @@
#define MPU_REGION_SDRAM1 (MPU_REGION_NUMBER4)
#define MPU_REGION_SDRAM2 (MPU_REGION_NUMBER5)
// MPU regions used by dma_protect_rx_region() to temporarily mark the first
// and last cache lines of a DMA RX buffer as uncached.
// Only relevant on CPUs with D-Cache; must be higher-numbered (and therefore
// higher priority) than the SDRAM regions so they override their attributes.
#define MPU_REGION_DMA_UNCACHED_1 (MPU_REGION_NUMBER6)
#define MPU_REGION_DMA_UNCACHED_2 (MPU_REGION_NUMBER7)
// Attribute value to disable a region entirely, remove it from the MPU
// (i.e. the MPU_REGION_ENABLE bit is unset.)
#define MPU_CONFIG_DISABLE 0
@ -78,6 +82,18 @@
| MPU_REGION_ENABLE << MPU_RASR_ENABLE_Pos \
)
// MPU RASR attribute word for a non-cacheable, non-bufferable region
// (TEX=1, C=0, B=0): full read/write access, instruction fetch disabled, no
// subregions disabled. `size` is one of the MPU_REGION_SIZE_* constants.
// Used to make DMA RX boundary cache lines bypass the D-cache.
#define MPU_CONFIG_UNCACHED(size) ( \
    MPU_INSTRUCTION_ACCESS_DISABLE << MPU_RASR_XN_Pos \
        | MPU_REGION_FULL_ACCESS << MPU_RASR_AP_Pos \
        | MPU_TEX_LEVEL1 << MPU_RASR_TEX_Pos \
        | MPU_ACCESS_NOT_SHAREABLE << MPU_RASR_S_Pos \
        | MPU_ACCESS_NOT_CACHEABLE << MPU_RASR_C_Pos \
        | MPU_ACCESS_NOT_BUFFERABLE << MPU_RASR_B_Pos \
        | 0x00 << MPU_RASR_SRD_Pos \
        | (size) << MPU_RASR_SIZE_Pos \
        | MPU_REGION_ENABLE << MPU_RASR_ENABLE_Pos \
    )
static inline void mpu_init(void) {
MPU->CTRL = MPU_PRIVILEGED_DEFAULT | MPU_CTRL_ENABLE_Msk;
SCB->SHCSR |= SCB_SHCSR_MEMFAULTENA_Msk;

Wyświetl plik

@ -571,6 +571,8 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
// Note: DMA transfers are limited to 65535 bytes at a time.
HAL_StatusTypeDef status;
void *odest = dest; // Original values of dest & len
size_t olen = len;
if (dest == NULL) {
// send only
@ -613,7 +615,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
}
dma_init(&rx_dma, self->rx_dma_descr, DMA_PERIPH_TO_MEMORY, self->spi);
self->spi->hdmarx = &rx_dma;
MP_HAL_CLEANINVALIDATE_DCACHE(dest, len);
dma_protect_rx_region(dest, len);
uint32_t t_start = HAL_GetTick();
do {
uint32_t l = MIN(len, 65535);
@ -632,6 +634,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
dma_deinit(self->tx_dma_descr);
}
dma_deinit(self->rx_dma_descr);
dma_unprotect_rx_region(odest, olen);
}
} else {
// send and receive
@ -644,7 +647,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
dma_init(&rx_dma, self->rx_dma_descr, DMA_PERIPH_TO_MEMORY, self->spi);
self->spi->hdmarx = &rx_dma;
MP_HAL_CLEAN_DCACHE(src, len);
MP_HAL_CLEANINVALIDATE_DCACHE(dest, len);
dma_protect_rx_region(dest, len);
uint32_t t_start = HAL_GetTick();
do {
uint32_t l = MIN(len, 65535);
@ -662,6 +665,7 @@ void spi_transfer(const spi_t *self, size_t len, const uint8_t *src, uint8_t *de
} while (len);
dma_deinit(self->tx_dma_descr);
dma_deinit(self->rx_dma_descr);
dma_unprotect_rx_region(odest, olen);
}
}

Wyświetl plik

@ -0,0 +1,43 @@
from machine import SPI

# Regression test for DMA D-cache coherency bugs when DMA RX buffers share
# cache lines with other data, written originally for
# https://github.com/micropython/micropython/issues/13471
#
# IMPORTANT: This test requires SPI2 MISO (pin Y8 on Pyboard D) to be connected
# to GND, so that every byte read back over SPI is 0x00.
SPI_NUM = 2

spi = SPI(SPI_NUM, baudrate=5_000_000)
buf = bytearray(1024)

ok = True
# Phase 1: read a fixed-length (up to 128 byte) window at every offset in buf.
# With MISO grounded, every received byte must be 0x00.
for offs in range(0, len(buf)):
    v = memoryview(buf)[offs : offs + 128]
    spi.readinto(v, 0xFF)
    if not all(b == 0x00 for b in v):
        print(offs, v.hex())
        ok = False

print("Variable offset fixed length " + ("OK" if ok else "FAIL"))

# Phase 2: vary both offset and length, checking that bytes adjacent to the
# DMA buffer survive the transfer.
# this takes around 30s to run, so skipped if already failing
if ok:
    for op_len in range(1, 66):
        wr = b"\xFF" * op_len
        for offs in range(1, len(buf) - op_len - 1):
            # Place some "sentinel" values before and after the DMA buffer
            before = offs & 0xFF
            after = (~offs) & 0xFF
            buf[offs - 1] = before
            buf[offs + op_len] = after
            v = memoryview(buf)[offs : offs + op_len]
            spi.write_readinto(wr, v)
            # The window must read back all zeroes, and the sentinels must be
            # untouched (a cache coherency bug can corrupt either).
            if (
                not all(b == 0x00 for b in v)
                or buf[offs - 1] != before
                or buf[offs + op_len] != after
            ):
                print(v.hex())
                print(hex(op_len), hex(offs), hex(buf[offs - 1]), hex(buf[offs + op_len]))
                ok = False

print("Variable offset and lengths " + ("OK" if ok else "FAIL"))

Wyświetl plik

@ -0,0 +1,2 @@
Variable offset fixed length OK
Variable offset and lengths OK