Add palette encoding to libdvi, plus example (#8)

* Add palette encoder (cherry picked from commit c30692ee75b8a2e3aaf4d7d5b809c267f9066aed) * Add full resolution Mandelbrot as example for palette encoding * Switch to 2 symbols per word, this is performance neutral on the encode, but saves memory bandwidth (cherry picked from commit 70834bfa1953a29d95e0d0f5ae16c86d2feb7242) * Marginally faster palette encode * Up to 256 colour palettes (cherry picked from commit 86e0e5d7dd11020d01f167b8cad571391de56aee) * Apply parity alternation to palette and other full res case (cherry picked from commit e9971155ff08977275612e4d22d37f0f416ef13e) * Use 256 colour palette, generate on both cores * Clear up magic numbers (cherry picked from commit 6180d210e59f25c7c4b4855920acdaa973447228) * Use PICO_DEFAULT_LED_PIN (cherry picked from commit 383c6eb4b6ea79b617d785e3736ea744746f57af)
2021-03-08 00:23:26 +00:00 · 2021-03-08 00:23:26 +00:00 · a607ff5afa
commit a607ff5afa
--- a/software/.gitignore
+++ b/software/.gitignore
@ -1 +1,2 @@
 build
+*.swp
--- a/software/apps/CMakeLists.txt
+++ b/software/apps/CMakeLists.txt
@ -7,3 +7,4 @@ add_subdirectory(moon)
 add_subdirectory(sprite_bounce)
 add_subdirectory(terminal)
 add_subdirectory(vista)
+add_subdirectory(mandel-full)
--- a/software/apps/mandel-full/CMakeLists.txt
+++ b/software/apps/mandel-full/CMakeLists.txt
@ -0,0 +1,26 @@
+# Optional debug switches (uncomment to enable).
+# Replace TMDS with 10 bit UART (same baud rate):
+# add_definitions(-DDVI_SERIAL_DEBUG=1)
+# add_definitions(-DRUN_FROM_CRYSTAL)
+
+add_executable(mandel-full
+	main.c
+	mandelbrot.c
+)
+
+# Compile-time configuration for this app only.
+target_compile_definitions(mandel-full PRIVATE
+	DVI_VERTICAL_REPEAT=1
+	DVI_N_TMDS_BUFFERS=8
+	DVI_SYMBOLS_PER_WORD=2
+	PICO_STACK_SIZE=0x400
+)
+
+target_link_libraries(mandel-full
+	pico_stdlib
+	pico_multicore
+	libdvi
+)
+
+# Create map/bin/hex file etc.
+pico_add_extra_outputs(mandel-full)
--- a/software/apps/mandel-full/main.c
+++ b/software/apps/mandel-full/main.c
@ -0,0 +1,207 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hardware/clocks.h"
+#include "hardware/dma.h"
+#include "hardware/gpio.h"
+#include "hardware/irq.h"
+#include "hardware/pll.h"
+#include "hardware/sync.h"
+#include "hardware/structs/bus_ctrl.h"
+#include "hardware/structs/ssi.h"
+#include "hardware/vreg.h"
+#include "pico/multicore.h"
+#include "pico/sem.h"
+#include "pico/stdlib.h"
+
+#include "tmds_encode.h"
+
+#include "dvi.h"
+#include "dvi_serialiser.h"
+#include "common_dvi_pin_configs.h"
+
+#include "mandelbrot.h"
+
+// TMDS bit clock 252 MHz
+// DVDD is nominally 1.2V, but 1.1V (what VREG_VSEL selects below) seems ok too
+#define FRAME_WIDTH 640
+#define FRAME_HEIGHT 480
+#define VREG_VSEL VREG_VOLTAGE_1_10
+#define DVI_TIMING dvi_timing_640x480p_60hz
+
+#define N_IMAGES 3
+#define FRAMES_PER_IMAGE 300
+
+uint8_t mandel[FRAME_WIDTH * (FRAME_HEIGHT / 2)];
+
+#define PALETTE_BITS 8
+#define PALETTE_SIZE (1 << PALETTE_BITS)
+uint16_t palette[PALETTE_SIZE];
+
+uint32_t tmds_palette[PALETTE_SIZE * 6];
+
+struct dvi_inst dvi0;
+struct semaphore dvi_start_sem;
+
+FractalBuffer fractal;
+
+static uint8_t palette_offset = 0;
+
+// Fill the RGB565 palette with seven 32-step colour ramps (blue, green, red,
+// yellow, cyan, magenta, white) followed by a black band, rotated by
+// palette_offset, then rebuild the TMDS symbol palette from it.
+// Entry 0 is always black.  Each call advances palette_offset by one.
+void init_palette() {
+  // Amount added to the colour per ramp step, selected by the top three bits
+  // of the rotated index.  The last band stays black.
+  static const uint16_t ramp_step[8] = {
+    0x0001, 0x0040, 0x0800, 0x0840, 0x0041, 0x0801, 0x0841, 0x0000
+  };
+
+  palette[0] = 0;
+  for (int idx = 1; idx < PALETTE_SIZE; ++idx) {
+    // Deliberately wraps at 8 bits so the ramps rotate through the palette.
+    uint8_t colour = idx + palette_offset;
+    palette[idx] = (colour & 0x1f) * ramp_step[colour >> 5];
+  }
+  ++palette_offset;
+
+  tmds_setup_palette_symbols(palette, tmds_palette, PALETTE_SIZE);
+}
+
+// Pre-fill the half-height pixel buffer with a diagonal placeholder pattern,
+// then describe the initial view and reset the generator state.
+void init_mandel() {
+  uint8_t* px = mandel;
+  for (int row = 0; row < (FRAME_HEIGHT / 2); ++row) {
+    for (int col = 0; col < FRAME_WIDTH; ++col) {
+      *px++ = ((col + row) & 0x3f);
+    }
+  }
+
+  // Output buffer geometry
+  fractal.buff = mandel;
+  fractal.rows = FRAME_HEIGHT / 2;
+  fractal.cols = FRAME_WIDTH;
+
+  // Iteration settings
+  fractal.max_iter = PALETTE_SIZE;
+  fractal.iter_offset = 0;
+  fractal.use_cycle_check = true;
+
+  // Initial view.  Only y <= 0 is generated; the last row sits half a row
+  // below the real axis.
+  fractal.minx = -2.25f;
+  fractal.maxx = 0.75f;
+  fractal.miny = -1.6f;
+  fractal.maxy = 0.f - (1.6f / FRAME_HEIGHT); // Half a row
+
+  init_fractal(&fractal);
+}
+
+#define NUM_ZOOMS 64
+static uint32_t zoom_count = 0;
+
+// Advance to the next zoom step and restart generation.
+// After NUM_ZOOMS steps the view is reset to the initial one via init_mandel().
+// Otherwise the window shrinks to 96% of its current size and its horizontal
+// centre moves from -0.75 towards -1.45 as zoom_count grows.  The view always
+// stays in the y <= 0 half plane.
+void zoom_mandel() {
+  if (++zoom_count == NUM_ZOOMS)
+  {
+    init_mandel();
+    zoom_count = 0;
+    return;
+  }
+
+  // zoom_count is a uint32_t: print it through an explicit unsigned long so
+  // the format specifier matches the argument type.
+  printf("Zoom: %lu\n", (unsigned long)zoom_count);
+
+  float zoomx = -.75f - .7f * ((float)zoom_count / (float)NUM_ZOOMS);
+  float sizex = fractal.maxx - fractal.minx;
+  // miny is the distance from the real axis to the top edge, so the full
+  // (mirrored) height is twice that.
+  float sizey = fractal.miny * -2.f;
+  // Half of the new extent: 96% of the old one, split either side of centre.
+  float zoomr = 0.96f * 0.5f;
+  fractal.minx = zoomx - zoomr * sizex;
+  fractal.maxx = zoomx + zoomr * sizex;
+  fractal.miny = -zoomr * sizey;
+  fractal.maxy = 0.f + fractal.miny / FRAME_HEIGHT;
+  init_fractal(&fractal);
+}
+
+// Core 1 entry point: services the DVI DMA IRQs, TMDS-encodes the scanlines core 0 hands
+// over through the inter-core FIFO, and uses spare time to generate fractal pixels.
+void __not_in_flash("core1_main") core1_main() {
+  dvi_register_irqs_this_core(&dvi0, DMA_IRQ_0);
+  sem_acquire_blocking(&dvi_start_sem);  // Wait until core 0 releases the start semaphore
+  dvi_start(&dvi0);
+  // Each request is two FIFO words: the palette-index scanline, then the TMDS output buffer.
+  while (1) {
+    const uint32_t *colourbuf = (const uint32_t*)multicore_fifo_pop_blocking();
+    uint32_t *tmdsbuf = (uint32_t*)multicore_fifo_pop_blocking();
+    tmds_encode_palette_data((const uint32_t*)colourbuf, tmds_palette, tmdsbuf, FRAME_WIDTH, PALETTE_BITS);
+    multicore_fifo_push_blocking(0);  // Signal completion to core 0 (the value itself is not used)
+    while (!fractal.done && queue_get_level(&dvi0.q_tmds_valid) >= 5) generate_steal_one(&fractal);  // Only while at least 5 encoded lines are queued
+  }
+  __builtin_unreachable();
+}
+
+// Core 0 entry point.
+// Sets the core voltage and system clock for the DVI bit clock, initialises
+// the palette, fractal and DVI instance, launches core 1, then loops forever
+// producing frames.  Each frame is the half-height fractal buffer scanned out
+// top to bottom and then bottom to top (a mirror image).  Scanlines are
+// encoded in pairs, one on each core.  Never returns.
+int __not_in_flash("main") main() {
+  vreg_set_voltage(VREG_VSEL);
+  sleep_ms(10);
+  set_sys_clock_khz(DVI_TIMING.bit_clk_khz, true);
+
+  setup_default_uart();
+
+  gpio_init(PICO_DEFAULT_LED_PIN);
+  gpio_set_dir(PICO_DEFAULT_LED_PIN, GPIO_OUT);
+
+  init_palette();
+  init_mandel();
+
+  printf("Configuring DVI\n");
+
+  dvi0.timing = &DVI_TIMING;
+  dvi0.ser_cfg = DEFAULT_DVI_SERIAL_CONFIG;
+  dvi_init(&dvi0, next_striped_spin_lock_num(), next_striped_spin_lock_num());
+
+  printf("Core 1 start\n");
+  sem_init(&dvi_start_sem, 0, 1);
+  hw_set_bits(&bus_ctrl_hw->priority, BUSCTRL_BUS_PRIORITY_PROC1_BITS);
+  multicore_launch_core1(core1_main);
+
+  uint heartbeat = 0;
+  uint32_t encode_time = 0;
+
+  sem_release(&dvi_start_sem);
+  while (1) {
+    // Every 30 frames: toggle the LED and report core 0's accumulated encode time.
+    if (++heartbeat >= 30) {
+      heartbeat = 0;
+      gpio_xor_mask(1u << PICO_DEFAULT_LED_PIN);
+
+      // encode_time is a uint32_t: print it through an explicit unsigned long
+      // so the format specifier matches the argument type.
+      printf("Encode total time: %luus\n", (unsigned long)encode_time);
+      encode_time = 0;
+    }
+    if (fractal.done) zoom_mandel();
+    //if (heartbeat & 1) init_palette();
+
+    // Top half of the frame: buffer rows in ascending order.
+    // Core 1 encodes row y, this core encodes row y + 1.
+    for (int y = 0; y < FRAME_HEIGHT / 2; y += 2) {
+      uint32_t *our_tmds_buf, *their_tmds_buf;
+      queue_remove_blocking_u32(&dvi0.q_tmds_free, &their_tmds_buf);
+      multicore_fifo_push_blocking((uint32_t)(&mandel[y*FRAME_WIDTH]));
+      multicore_fifo_push_blocking((uint32_t)their_tmds_buf);
+
+      queue_remove_blocking_u32(&dvi0.q_tmds_free, &our_tmds_buf);
+      absolute_time_t start_time = get_absolute_time();
+      tmds_encode_palette_data((const uint32_t*)(&mandel[(y+1)*FRAME_WIDTH]), tmds_palette, our_tmds_buf, FRAME_WIDTH, PALETTE_BITS);
+      encode_time += absolute_time_diff_us(start_time, get_absolute_time());
+
+      // Wait for core 1 to finish its scanline.
+      multicore_fifo_pop_blocking();
+
+      // Generate fractal pixels only while at least 5 encoded lines are queued.
+      while (!fractal.done && queue_get_level(&dvi0.q_tmds_valid) >= 5) generate_one_forward(&fractal);
+
+      // Queue in display order: row y (core 1) first, then row y + 1.
+      queue_add_blocking_u32(&dvi0.q_tmds_valid, &their_tmds_buf);
+      queue_add_blocking_u32(&dvi0.q_tmds_valid, &our_tmds_buf);
+    }
+
+    // Bottom half of the frame: the same rows in descending order (mirror).
+    // Core 1 encodes row y + 1, this core encodes row y.
+    for (int y = FRAME_HEIGHT / 2 - 2; y >= 0; y -= 2) {
+      uint32_t *our_tmds_buf, *their_tmds_buf;
+      queue_remove_blocking_u32(&dvi0.q_tmds_free, &their_tmds_buf);
+      multicore_fifo_push_blocking((uint32_t)(&mandel[(y+1)*FRAME_WIDTH]));
+      multicore_fifo_push_blocking((uint32_t)their_tmds_buf);
+
+      queue_remove_blocking_u32(&dvi0.q_tmds_free, &our_tmds_buf);
+      absolute_time_t start_time = get_absolute_time();
+      tmds_encode_palette_data((const uint32_t*)(&mandel[y*FRAME_WIDTH]), tmds_palette, our_tmds_buf, FRAME_WIDTH, PALETTE_BITS);
+      encode_time += absolute_time_diff_us(start_time, get_absolute_time());
+
+      // Wait for core 1 to finish its scanline.
+      multicore_fifo_pop_blocking();
+
+      while (!fractal.done && queue_get_level(&dvi0.q_tmds_valid) >= 5) generate_one_forward(&fractal);
+
+      // Queue in display order: row y + 1 (core 1) first, then row y.
+      queue_add_blocking_u32(&dvi0.q_tmds_valid, &their_tmds_buf);
+      queue_add_blocking_u32(&dvi0.q_tmds_valid, &our_tmds_buf);
+    }
+  }
+  __builtin_unreachable();
+}
+  
--- a/software/apps/mandel-full/mandelbrot.c
+++ b/software/apps/mandel-full/mandelbrot.c
@ -0,0 +1,232 @@
+// Copyright (C) Michael Bell 2021
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "pico/stdlib.h"
+#include "hardware/interp.h"
+#include "hardware/dma.h"
+
+#include "mandelbrot.h"
+
+// Cycle checking parameters
+#define MAX_CYCLE_LEN 8          // Must be power of 2
+#define MIN_CYCLE_CHECK_ITER 32  // Must be multiple of max cycle len
+#define CYCLE_TOLERANCE (1<<18)
+
+// Fixed point with 6 bits to the left of the point.
+// Range [-32,32) with precision 2^-26
+typedef int32_t fixed_pt_t;
+
+#define ESCAPE_SQUARE (4<<26)
+
+// Multiply two fixed point values using only 32-bit arithmetic.
+// Each operand is split into a signed high part (value >> 13) and an unsigned
+// 13-bit low part.
+static inline fixed_pt_t mul(fixed_pt_t a, fixed_pt_t b)
+{
+  const int32_t a_hi = a >> 13, a_lo = a & 0x1fff;
+  const int32_t b_hi = b >> 13, b_lo = b & 0x1fff;
+
+  // The lo * lo term is dropped: it would only contribute a carry.
+  const fixed_pt_t cross = (a_hi * b_lo) + (a_lo * b_hi);
+  return (cross >> 13) + a_hi * b_hi;
+}
+
+// Returns a * b * 2 in fixed point.
+// The doubling is folded into how `a` is split: its high part keeps one more
+// bit (>> 12 rather than >> 13) and its low part is shifted left by one.
+static inline fixed_pt_t mul2(fixed_pt_t a, fixed_pt_t b)
+{
+#if 0
+  // Disabled variant that uses interp0 to combine the partial products.
+  const int32_t a_hi = a >> 12;
+  const int32_t a_lo = a & 0xfff;
+  const int32_t b_hi = b >> 13;
+  const int32_t b_lo = b & 0x1fff;
+
+  interp0->accum[0] = a_hi * b_lo;
+  interp0->accum[1] = a_lo * b_hi;
+  interp0->base[2] = a_hi * b_hi;
+  return interp0->peek[2];
+#else
+  const int32_t a_hi = a >> 12;
+  const int32_t a_lo = (a & 0xfff) << 1;
+  const int32_t b_hi = b >> 13;
+  const int32_t b_lo = b & 0x1fff;
+
+  const fixed_pt_t cross = (a_hi * b_lo) + (a_lo * b_hi);
+  return (cross >> 13) + a_hi * b_hi;
+#endif
+}
+
+// Returns a * a in fixed point, using the same high/low split as mul().
+static inline fixed_pt_t square(fixed_pt_t a) {
+  const int32_t hi = a >> 13;
+  const int32_t lo = a & 0x1fff;
+
+  // 2 * hi * lo >> 13, written as a single shift by 12.
+  const fixed_pt_t cross = (hi * lo) >> 12;
+  return cross + (hi * hi);
+}
+
+// Convert an integer to fixed point (26 fractional bits).
+// x must lie in [-32, 32) for the result to be representable.
+fixed_pt_t make_fixed(int32_t x) {
+  // Multiply rather than shift: left-shifting a negative signed value is
+  // undefined behaviour in C, while this product is well defined over the
+  // whole representable range.
+  return x * ((fixed_pt_t)1 << 26);
+}
+
+// Convert a float to fixed point by scaling with 2^26 and truncating.
+fixed_pt_t make_fixedf(float x) {
+  const float scale = 67108864.f;  // 2^26
+  return (int32_t)(x * scale);
+}
+// Configure interp0 lanes 0 and 1 for the interpolator-based variant of mul2().
+void mandel_init()
+{
+  // Not currently used: the interpolator path in mul2() is compiled out (#if 0)
+  interp_config cfg = interp_default_config();
+  interp_config_set_add_raw(&cfg, false);
+  interp_config_set_shift(&cfg, 13);         // Lane 0: shift right by 13
+  interp_config_set_mask(&cfg, 0, 31 - 13);
+  interp_config_set_signed(&cfg, true);
+  interp_set_config(interp0, 0, &cfg);
+  interp_config_set_shift(&cfg, 12);         // Lane 1: same settings, but shift right by 12
+  interp_config_set_mask(&cfg, 0, 31 - 12);
+  interp_set_config(interp0, 1, &cfg);
+}
+
+// Reset the generator state of `f` from its configuration fields
+// (rows, cols, max_iter, minx/maxx/miny/maxy) and mark it as not done.
+// The caller must have filled in the configuration fields first.
+void init_fractal(FractalBuffer* f)
+{
+  f->min_iter = f->max_iter - 1;
+  f->iminx = make_fixedf(f->minx);
+  f->imaxx = make_fixedf(f->maxx);
+  f->iminy = make_fixedf(f->miny);
+  f->imaxy = make_fixedf(f->maxy);
+  // A single row or column has no step; avoid dividing by zero.
+  f->incx = (f->cols > 1) ? (f->imaxx - f->iminx) / (f->cols - 1) : 0;
+  f->incy = (f->rows > 1) ? (f->imaxy - f->iminy) / (f->rows - 1) : 0;
+  f->count_inside = 0;
+  f->ipos = 0;
+  f->jpos = 0;
+  f->iend = f->rows - 1;
+  f->jend = f->cols - 1;
+
+  // Publish "not done" last.  The generate_* functions return early while
+  // done is set, so another core polling the flag cannot start working on a
+  // half-initialised state.  The compiler barrier stops the non-volatile
+  // stores above from being moved after the flag update.
+  __asm__ volatile ("" ::: "memory");
+  f->done = false;
+}
+
+// Iterate z -> z^2 + c for the point c = (x0, y0) and store the result in
+// *buffptr: 0 if the point did not escape within max_iter iterations, else
+// the escape iteration minus iter_offset (clamped to at least 1).
+// Also updates f->count_inside and f->min_iter.
+static inline void generate_one(FractalBuffer* f, fixed_pt_t x0, fixed_pt_t y0, uint8_t* buffptr)
+{
+  fixed_pt_t zr = x0;
+  fixed_pt_t zi = y0;
+
+  uint16_t iter = 1;
+  while (iter < f->max_iter) {
+    const fixed_pt_t zr_sq = square(zr);
+    const fixed_pt_t zi_sq = square(zi);
+    if (zr_sq + zi_sq > ESCAPE_SQUARE) break;
+
+    // The imaginary part must be computed from the old real part.
+    zi = mul2(zr, zi) + y0;
+    zr = zr_sq - zi_sq + x0;
+    ++iter;
+  }
+
+  if (iter == f->max_iter) {
+    // Never escaped: treat as inside the set.
+    *buffptr = 0;
+    f->count_inside++;
+    return;
+  }
+
+  if (iter > f->iter_offset) iter -= f->iter_offset;
+  else iter = 1;
+  *buffptr = iter;
+  if (f->min_iter > iter) f->min_iter = iter;
+}
+// Same as generate_one(), but also stops early (treating the point as inside) when the orbit revisits a recent position.
+static inline void generate_one_cycle_check(FractalBuffer* f, fixed_pt_t x0, fixed_pt_t y0, uint8_t* buffptr)
+{
+  fixed_pt_t x = x0;
+  fixed_pt_t y = y0;
+  fixed_pt_t oldx = 0, oldy = 0;  // Reference point for cycle detection, stored offset by -CYCLE_TOLERANCE
+
+  uint16_t k = 1;
+  for (; k < f->max_iter; ++k) {
+    fixed_pt_t x_square = square(x);
+    fixed_pt_t y_square = square(y);
+    if (x_square + y_square > ESCAPE_SQUARE) break;  // Escaped: k is the escape iteration
+
+    if (k >= MIN_CYCLE_CHECK_ITER) {
+      if ((k & (MAX_CYCLE_LEN - 1)) == 0) {  // Every MAX_CYCLE_LEN iterations take a new reference point
+        oldx = x - CYCLE_TOLERANCE;
+        oldy = y - CYCLE_TOLERANCE;
+      }
+      else
+      {
+        if ((uint32_t)(x - oldx) < (2*CYCLE_TOLERANCE) && (uint32_t)(y - oldy) < (2*CYCLE_TOLERANCE)) {  // Unsigned compare: true when x and y are each within CYCLE_TOLERANCE of the reference
+          // Found a cycle, so the orbit cannot escape: report the point as inside
+          k = f->max_iter;
+          break;
+        }
+      }
+    }
+
+    fixed_pt_t nextx = x_square - y_square + x0;
+    y = mul2(x,y) + y0;  // Uses the old x, which is why nextx is held separately
+    x = nextx;
+  }
+  if (k == f->max_iter) {  // Did not escape, or a cycle was found
+    *buffptr = 0;
+    f->count_inside++;
+  } else {
+    if (k > f->iter_offset) k -= f->iter_offset;
+    else k = 1;  // Clamp so that escaped points never store 0
+    *buffptr = k;
+    if (f->min_iter > k) f->min_iter = k;
+  }
+}
+
+// Generate, in one blocking call, every pixel from the start of the buffer
+// up to and including (iend, jend), then mark the fractal as done.
+// (iend, jend) is the next pixel generate_steal_one() would take, so it is
+// inclusive.  With freshly initialised state this covers the whole buffer.
+void generate_fractal(FractalBuffer* f)
+{
+  uint8_t* buffptr = f->buff;
+
+  // Complete rows above the end row.
+  fixed_pt_t y0 = f->iminy;
+  int16_t i = 0;
+  for (; i < f->iend; ++i, y0 += f->incy) {
+    fixed_pt_t x0 = f->iminx;
+    for (int16_t j = 0; j < f->cols; ++j, x0 += f->incx) {
+      if (f->use_cycle_check) generate_one_cycle_check(f, x0, y0, buffptr++);
+      else generate_one(f, x0, y0, buffptr++);
+    }
+  }
+
+  // Partial end row.  The bound is inclusive: a strict `<` skipped pixel
+  // (iend, jend), i.e. the very last pixel when nothing had been stolen.
+  // iend is re-read each time because it may move while stealing is active.
+  fixed_pt_t x0 = f->iminx;
+  for (int16_t j = 0; j <= f->jend && i == f->iend; ++j, x0 += f->incx) {
+    if (f->use_cycle_check) generate_one_cycle_check(f, x0, y0, buffptr++);
+    else generate_one(f, x0, y0, buffptr++);
+  }
+
+  f->done = true;
+}
+
+// Generate the pixel at (ipos, jpos) and advance that position by one in
+// row-major order.  Sets done once ipos has moved past iend.
+// Does nothing if the fractal is already done.
+void generate_one_forward(FractalBuffer* f)
+{
+  if (f->done) return;
+
+  uint8_t* dst = f->buff + f->ipos * f->cols + f->jpos;
+  const fixed_pt_t y0 = f->iminy + f->ipos * f->incy;
+  const fixed_pt_t x0 = f->iminx + f->jpos * f->incx;
+
+  if (f->use_cycle_check) {
+    generate_one_cycle_check(f, x0, y0, dst);
+  } else {
+    generate_one(f, x0, y0, dst);
+  }
+
+  // Still inside the current row: nothing more to do.
+  if (++f->jpos != f->cols) return;
+
+  // Wrapped to the start of the next row.
+  f->jpos = 0;
+  if (++f->ipos > f->iend) f->done = true;
+}
+
+// Generate the pixel at (iend, jend) and move that end position back by one
+// in row-major order.  Sets done once iend has moved above ipos.
+// Does nothing if the fractal is already done.
+void generate_steal_one(FractalBuffer* f)
+{
+  if (f->done) return;
+
+  uint8_t* dst = f->buff + f->iend * f->cols + f->jend;
+  const fixed_pt_t y0 = f->iminy + f->iend * f->incy;
+  const fixed_pt_t x0 = f->iminx + f->jend * f->incx;
+
+  if (f->use_cycle_check) {
+    generate_one_cycle_check(f, x0, y0, dst);
+  } else {
+    generate_one(f, x0, y0, dst);
+  }
+
+  // Still inside the current row: nothing more to do.
+  if (f->jend-- != 0) return;
+
+  // Wrapped to the end of the previous row.
+  f->jend = f->cols - 1;
+  if (--f->iend < f->ipos) f->done = true;
+}
--- a/software/apps/mandel-full/mandelbrot.h
+++ b/software/apps/mandel-full/mandelbrot.h
@ -0,0 +1,41 @@
+// Init pico resources used for generation
+void mandel_init();
+
+// Fixed point with 6 bits to the left of the point.
+// Range [-32,32) with precision 2^-26
+typedef int32_t fixed_pt_t;
+
+typedef struct {
+  // Configuration
+  uint8_t* buff;
+  int16_t rows;
+  int16_t cols;
+
+  uint16_t max_iter;
+  uint16_t iter_offset;
+  float minx, miny, maxx, maxy;
+  bool use_cycle_check;
+
+  // State
+  volatile bool done;
+  volatile uint16_t min_iter;
+  fixed_pt_t iminx, iminy, imaxx, imaxy;
+  fixed_pt_t incx, incy;
+  volatile uint32_t count_inside;
+
+  int16_t ipos, jpos;
+  // Tracks work stealing on core 0
+  volatile int16_t iend, jend;
+} FractalBuffer;
+
+// Make a fixed_pt_t from an int or float.
+fixed_pt_t make_fixed(int32_t x);
+fixed_pt_t make_fixedf(float x);
+
+// Generate a section of the fractal into buff
+// Result written to buff is 0 for inside Mandelbrot set
+// Otherwise iteration of escape minus min_iter (clamped to 1)
+void init_fractal(FractalBuffer* fractal);
+void generate_fractal(FractalBuffer* fractal);
+void generate_one_forward(FractalBuffer* f);
+void generate_steal_one(FractalBuffer* f);
--- a/software/libdvi/tmds_encode.S
+++ b/software/libdvi/tmds_encode.S
@ -395,6 +395,10 @@ tmds_2bpp_table:
 	// DC balance defined to be 0 at start of scanline:
 	movs r4, #0
 	str r4, [r2, #ACCUM1_OFFS]
+#if TMDS_FULLRES_NO_DC_BALANCE
+	// Alternate parity between odd/even symbols if there's no balance feedback
+	mvns r4, r4
+#endif
 	str r4, [r2, #ACCUM1_OFFS + INTERP1]

 	// Keep loop start pointer in r8 so we can get a longer backward branch
@ -529,3 +533,118 @@ decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
 	tmds_fullres_encode_loop_16bpp_leftshift
 decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
 	tmds_fullres_encode_loop_16bpp_leftshift
+
+
+// Variant of tmds_fullres_encode_loop_16bpp that reads
+// 8-bit wide pixels packed 4 per word.  The interpolator
+// base is set to a reordered list of TMDS symbols based
+// on a user colour palette.  Two symbols are packed into each output word.
+
+.macro tmds_palette_encode_loop
+	push {r4-r7, lr}
+	mov r4, r8 // r8 cannot be pushed directly here, so save it via r4
+	push {r4}
+	// r0: pixel read pointer, r1: symbol write pointer, r2: pixel count on entry
+	// ip = r1 + 2 * r2: end of output (two bytes of output per pixel)
+	lsls r2, #1
+	add r2, r1
+	mov ip, r2
+	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) // From here on r2 is the interpolator register base
+	// DC balance defined to be 0 at start of scanline:
+	movs r4, #0
+	str r4, [r2, #ACCUM1_OFFS]
+#if TMDS_FULLRES_NO_DC_BALANCE
+	// Alternate parity between odd/even symbols if there's no balance feedback
+	mvns r4, r4
+#endif
+	str r4, [r2, #ACCUM1_OFFS + INTERP1]
+
+	// Keep loop start pointer in r8 so we can get a longer backward branch
+	adr r4, 1f
+	adds r4, #1 // Set bit 0 (the Thumb bit) so that bx r8 stays in Thumb state
+	mov r8, r4
+	b 2f
+	.align 2
+1:
+.rept 8 // Each repetition reads two words (8 pixels) and writes four words (8 symbols)
+	ldmia r0!, {r3, r5}
+	lsrs r4, r3, #14 // r4: upper two pixels of the first word, moved down to bit 2
+	lsls r3, #2 // r3: lower two pixels of the first word, scaled by 4 (word-sized table entries)
+	lsrs r6, r5, #14 // r6 and r5: the same split for the second word
+	lsls r5, #2
+	str r3, [r2, #ACCUM0_OFFS + INTERP1]
+	str r3, [r2, #ACCUM0_OFFS]
+	ldr r3, [r2, #PEEK2_OFFS] // Address of the palette entry for the first pixel of the pair
+	ldr r3, [r3]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r3, [r2, #ACCUM1_ADD_OFFS]
+#endif
+	ldr r7, [r2, #PEEK2_OFFS + INTERP1] // Same for the second pixel of the pair, via the second interpolator
+	ldr r7, [r7]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+#endif
+	lsls r7, #10 // Second symbol goes above the first one in the output word
+	orrs r3, r7
+	// Same sequence for the upper two pixels of the first word
+	str r4, [r2, #ACCUM0_OFFS + INTERP1]
+	str r4, [r2, #ACCUM0_OFFS]
+	ldr r4, [r2, #PEEK2_OFFS]
+	ldr r4, [r4]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r4, [r2, #ACCUM1_ADD_OFFS]
+#endif
+	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
+	ldr r7, [r7]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+#endif
+	lsls r7, #10
+	orrs r4, r7
+	// Same sequence for the lower two pixels of the second word
+	str r5, [r2, #ACCUM0_OFFS + INTERP1]
+	str r5, [r2, #ACCUM0_OFFS]
+	ldr r5, [r2, #PEEK2_OFFS]
+	ldr r5, [r5]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r5, [r2, #ACCUM1_ADD_OFFS]
+#endif
+	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
+	ldr r7, [r7]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+#endif
+	lsls r7, #10
+	orrs r5, r7
+	// Same sequence for the upper two pixels of the second word
+	str r6, [r2, #ACCUM0_OFFS + INTERP1]
+	str r6, [r2, #ACCUM0_OFFS]
+	ldr r6, [r2, #PEEK2_OFFS]
+	ldr r6, [r6]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r6, [r2, #ACCUM1_ADD_OFFS]
+#endif
+	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
+	ldr r7, [r7]
+#if !TMDS_FULLRES_NO_DC_BALANCE
+	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
+#endif
+	lsls r7, #10
+	orrs r6, r7
+	// Write out the four packed symbol pairs
+	stmia r1!, {r3, r4, r5, r6}
+.endr
+2:
+	cmp r1, ip // Finished once the write pointer reaches the end of the output
+	beq 1f
+	bx r8 // Otherwise jump back to the start of the unrolled loop
+1:
+	pop {r4}
+	mov r8, r4
+	pop {r4-r7, pc}
+.endm
+
+decl_func_x tmds_palette_encode_loop_x
+	tmds_palette_encode_loop
+decl_func_y tmds_palette_encode_loop_y
+	tmds_palette_encode_loop
--- a/software/libdvi/tmds_encode.c
+++ b/software/libdvi/tmds_encode.c
@ -162,3 +162,85 @@ void __not_in_flash_func(tmds_encode_data_channel_fullres_16bpp)(const uint32_t
 	interp_restore(interp1_hw, &interp1_save);
 #endif
 }
+
+// This takes a 16-bit (RGB 565) colour palette and makes palettes of TMDS symbols suitable
+// for performing fullres encode.
+// The TMDS palette buffer should be 6 * n_palette words long: for each of blue, green and
+// red (in that order) it holds n_palette symbols taken from the first 64 entries of
+// tmds_table_fullres_x, followed by n_palette symbols taken from the next 64 entries.
+// n_palette must be a power of 2 <= 256.
+void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *tmds_palette, size_t n_palette) {
+	// size_t index: n_palette is unsigned, so a signed index would mix signedness in the comparison.
+	for (size_t i = 0; i < n_palette; ++i) {
+		const uint16_t colour = palette[i];
+		// 6-bit table index per channel, in output order blue, green, red.
+		// Blue and red only have 5 bits in RGB 565, so their lowest index bit is zero.
+		const uint16_t level[3] = {
+			(uint16_t)((colour << 1) & 0x3e),
+			(uint16_t)((colour >> 5) & 0x3f),
+			(uint16_t)((colour >> 10) & 0x3e),
+		};
+		for (size_t chan = 0; chan < 3; ++chan) {
+			uint32_t *symbols = tmds_palette + 2 * chan * n_palette;
+			symbols[i] = tmds_table_fullres_x[level[chan]];
+			symbols[i + n_palette] = tmds_table_fullres_x[64 + level[chan]];
+		}
+	}
+}
+
+// Encode palette data for all 3 channels (blue, then green, then red).
+// pixbuf is an array of n_pix 8-bit wide pixels containing palette values (32-bit word aligned)
+// tmds_palette is a palette of TMDS symbols produced by tmds_setup_palette_symbols
+// symbuf receives 3 * n_pix / 2 32-bit words (two symbols per word, n_pix / 2 words per channel).
+void __not_in_flash_func(tmds_encode_palette_data)(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits) {
+	uint core = get_core_num();
+#if !TMDS_FULLRES_NO_INTERP_SAVE
+	interp_hw_save_t interp0_save, interp1_save;
+	interp_save(interp0_hw, &interp0_save);
+	interp_save(interp1_hw, &interp1_save);
+#endif
+	// Start with the blue symbols, which sit at the beginning of the TMDS palette.
+	interp0_hw->base[2] = (uint32_t)tmds_palette;
+	interp1_hw->base[2] = (uint32_t)tmds_palette;
+
+	// Lane 0 on both interpolators masks the palette bits, starting at bit 2,
+	// The second interpolator also shifts to read the 2nd or 4th byte of the word.
+	interp0_hw->ctrl[0] =
+		(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
+		((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
+	interp1_hw->ctrl[0] =
+		(8 << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
+		(2 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) |
+		((palette_bits + 1) << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB);
+
+	// Lane 1 shifts and masks the sign bit into the right position to add to the symbol
+	// table index to choose the negative disparity symbols if the sign is negative.
+	const uint32_t ctrl_lane_1 =
+		((31 - (palette_bits + 2)) << SIO_INTERP0_CTRL_LANE0_SHIFT_LSB) |
+		(palette_bits + 2) * ((1 << SIO_INTERP0_CTRL_LANE0_MASK_LSB_LSB) | (1 << SIO_INTERP0_CTRL_LANE0_MASK_MSB_LSB));
+	interp0_hw->ctrl[1] = ctrl_lane_1;
+	interp1_hw->ctrl[1] = ctrl_lane_1;
+	// Core 1 runs the _x copy of the encode loop and core 0 the _y copy; both do the same work.
+	if (core) {
+		tmds_palette_encode_loop_x(pixbuf, symbuf, n_pix);
+		// Green symbols start 2 * palette size words into the TMDS palette.
+		interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
+		interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
+		tmds_palette_encode_loop_x(pixbuf, symbuf + (n_pix >> 1), n_pix);
+		// Red symbols start 4 * palette size words into the TMDS palette.
+		interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
+		interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
+		tmds_palette_encode_loop_x(pixbuf, symbuf + n_pix, n_pix);
+	} else {
+		tmds_palette_encode_loop_y(pixbuf, symbuf, n_pix);
+		// Green
+		interp0_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
+		interp1_hw->base[2] = (uint32_t)(tmds_palette + (2 << palette_bits));
+		tmds_palette_encode_loop_y(pixbuf, symbuf + (n_pix >> 1), n_pix);
+		// Red
+		interp0_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
+		interp1_hw->base[2] = (uint32_t)(tmds_palette + (4 << palette_bits));
+		tmds_palette_encode_loop_y(pixbuf, symbuf + n_pix, n_pix);
+	}
+	// Put both interpolators back the way the caller had them, unless saving is compiled out.
+#if !TMDS_FULLRES_NO_INTERP_SAVE
+	interp_restore(interp0_hw, &interp0_save);
+	interp_restore(interp1_hw, &interp1_save);
+#endif
+}
--- a/software/libdvi/tmds_encode.h
+++ b/software/libdvi/tmds_encode.h
@ -8,6 +8,8 @@
 void tmds_encode_data_channel_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
 void tmds_encode_data_channel_8bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
 void tmds_encode_data_channel_fullres_16bpp(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint channel_msb, uint channel_lsb);
+// Build a TMDS symbol palette (6 * n_palette words) from an RGB 565 colour palette.
+void tmds_setup_palette_symbols(const uint16_t *palette, uint32_t *tmds_palette, size_t n_palette);
+// Encode n_pix 8-bit palette indices from pixbuf into TMDS symbols for all three channels.
+void tmds_encode_palette_data(const uint32_t *pixbuf, const uint32_t *tmds_palette, uint32_t *symbuf, size_t n_pix, uint32_t palette_bits);

 // Functions from tmds_encode.S

@ -28,5 +30,7 @@ void tmds_fullres_encode_loop_16bpp_x(const uint32_t *pixbuf, uint32_t *symbuf,
 void tmds_fullres_encode_loop_16bpp_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
 void tmds_fullres_encode_loop_16bpp_leftshift_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
 void tmds_fullres_encode_loop_16bpp_leftshift_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix, uint leftshift);
+void tmds_palette_encode_loop_x(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);
+void tmds_palette_encode_loop_y(const uint32_t *pixbuf, uint32_t *symbuf, size_t n_pix);

 #endif