Update 1bpp encode: 35% cycle reduction. Also optimise pixel-doubled 16bpp to use all low registers again.

two-pixels-per-word
Luke Wren 2021-03-01 09:46:15 +00:00
rodzic e39f516913
commit b20f4ef88f
2 zmienionych plików z 70 dodań i 63 usunięć

Wyświetl plik

@ -42,6 +42,14 @@
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// do_channel_16bpp: push one input word through the register block at r_ibase
// and fetch the two words it points at.
//   r_ibase:   base register for the ACCUM0_OFFS / PEEK0_OFFS / PEEK1_OFFS accesses
//              (presumably an interpolator register block -- TODO confirm)
//   r_inout0:  in:  word to write at ACCUM0_OFFS
//              out: word loaded through the pointer read at PEEK0_OFFS
//   r_out1:    out: word loaded through the pointer read at PEEK1_OFFS
// The input value in r_inout0 is overwritten. No other registers are written.
.macro do_channel_16bpp r_ibase r_inout0 r_out1
// Write the input word to the ACCUM0 slot.
str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
// Read a pointer from the PEEK0 slot, then load the word it points to.
ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
ldr \r_inout0, [\r_inout0]
// Same again for the PEEK1 slot, into the second output register.
ldr \r_out1, [\r_ibase, #PEEK1_OFFS]
ldr \r_out1, [\r_out1]
.endm
decl_func tmds_encode_loop_16bpp
push {r4, r5, r6, r7, lr}
lsls r2, #2
@ -52,15 +60,10 @@ decl_func tmds_encode_loop_16bpp
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4}
str r4, [r2, #ACCUM0_OFFS]
ldr r4, [r2, #PEEK0_OFFS]
ldr r4, [r4]
ldr r6, [r2, #PEEK1_OFFS]
ldr r6, [r6]
// TODO our pixels are now 2 per word instead of 1 per word, so this store is
// now 2 words instead of 4; reexpand it.
stmia r1!, {r4, r6}
ldmia r0!, {r4, r6}
do_channel_16bpp r2, r4, r5
do_channel_16bpp r2, r6, r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
cmp r1, ip
@ -85,14 +88,12 @@ decl_func tmds_encode_loop_16bpp_leftshift
.align 2
1:
.rept TMDS_ENCODE_UNROLL
ldmia r0!, {r4}
ldmia r0!, {r4, r6}
lsls r4, r3
str r4, [r2, #ACCUM0_OFFS]
ldr r4, [r2, #PEEK0_OFFS]
ldr r4, [r4]
ldr r6, [r2, #PEEK1_OFFS]
ldr r6, [r6]
stmia r1!, {r4, r6}
do_channel_16bpp r2, r4, r5
lsls r6, r3
do_channel_16bpp r2, r6, r7
stmia r1!, {r4, r5, r6, r7}
.endr
2:
cmp r1, ip
@ -181,27 +182,24 @@ decl_func tmds_encode_loop_8bpp_leftshift
// - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
// output symbol of 0x1ff or 0x2ff
//
// So we can transform a black and white image to TMDS symbols with the
// following table:
//
// x % 2 | Colour | Output
// ------+--------+--------
// 0 | 0 | 0x9aaaa
// 0 | 1 | 0x6aaaa
// 1 | 0 | 0x95555
// 1 | 1 | 0x65555
// So we can do 1bpp encode with a lookup of x coordinate LSB, and input
// colour bit. If we process pixels in even-sized blocks, only the colour
// lookup is needed.
// Encode 4 pixels @ 1bpp (using one table lookup)
// Encode 8 pixels @ 1bpp (using two table lookups)
// r3 contains lookup mask (preshifted)
// r8 contains pointer to encode table
// 3.25 cyc/pix
.macro tmds_encode_1bpp_body shift_instr shamt
\shift_instr r4, r2, #\shamt
// 2.125 cyc/pix
.macro tmds_encode_1bpp_body shift_instr0 shamt0 shift_instr1 shamt1
\shift_instr0 r4, r2, #\shamt0
ands r4, r3
add r4, r8
ldmia r4, {r4-r7}
stmia r1!, {r4-r7}
ldmia r4, {r4, r5}
\shift_instr1 r6, r2, #\shamt1
ands r6, r3
add r6, r8
ldmia r6, {r6, r7}
stmia r1!, {r4, r5, r6, r7}
.endm
// r0: input buffer (word-aligned)
@ -211,24 +209,20 @@ decl_func tmds_encode_1bpp
push {r4-r7, lr}
mov r7, r8
push {r7}
lsls r2, #2
lsls r2, #1
add r2, r1
mov ip, r2
adr r4, tmds_1bpp_table
mov r8, r4
// Mask: 4 bit index, 16 bytes per entry
movs r3, #0xf0
// Mask: 4 bit index, 8 bytes per entry
movs r3, #0x78
b 2f
1:
ldmia r0!, {r2}
tmds_encode_1bpp_body lsls 4
tmds_encode_1bpp_body lsls 0
tmds_encode_1bpp_body lsrs 4
tmds_encode_1bpp_body lsrs 8
tmds_encode_1bpp_body lsrs 12
tmds_encode_1bpp_body lsrs 16
tmds_encode_1bpp_body lsrs 20
tmds_encode_1bpp_body lsrs 24
tmds_encode_1bpp_body lsls 3 lsrs 1
tmds_encode_1bpp_body lsrs 5 lsrs 9
tmds_encode_1bpp_body lsrs 13 lsrs 17
tmds_encode_1bpp_body lsrs 21 lsrs 25
2:
cmp r1, ip
blo 1b
@ -239,22 +233,22 @@ decl_func tmds_encode_1bpp
.align 2
tmds_1bpp_table:
.word 0x9aaaa, 0x95555, 0x9aaaa, 0x95555 // 0000
.word 0x6aaaa, 0x95555, 0x9aaaa, 0x95555 // 0001
.word 0x9aaaa, 0x65555, 0x9aaaa, 0x95555 // 0010
.word 0x6aaaa, 0x65555, 0x9aaaa, 0x95555 // 0011
.word 0x9aaaa, 0x95555, 0x6aaaa, 0x95555 // 0100
.word 0x6aaaa, 0x95555, 0x6aaaa, 0x95555 // 0101
.word 0x9aaaa, 0x65555, 0x6aaaa, 0x95555 // 0110
.word 0x6aaaa, 0x65555, 0x6aaaa, 0x95555 // 0111
.word 0x9aaaa, 0x95555, 0x9aaaa, 0x65555 // 1000
.word 0x6aaaa, 0x95555, 0x9aaaa, 0x65555 // 1001
.word 0x9aaaa, 0x65555, 0x9aaaa, 0x65555 // 1010
.word 0x6aaaa, 0x65555, 0x9aaaa, 0x65555 // 1011
.word 0x9aaaa, 0x95555, 0x6aaaa, 0x65555 // 1100
.word 0x6aaaa, 0x95555, 0x6aaaa, 0x65555 // 1101
.word 0x9aaaa, 0x65555, 0x6aaaa, 0x65555 // 1110
.word 0x6aaaa, 0x65555, 0x6aaaa, 0x65555 // 1111
.word 0x7fd00, 0x7fd00 // 0000
.word 0x7fe00, 0x7fd00 // 0001
.word 0xbfd00, 0x7fd00 // 0010
.word 0xbfe00, 0x7fd00 // 0011
.word 0x7fd00, 0x7fe00 // 0100
.word 0x7fe00, 0x7fe00 // 0101
.word 0xbfd00, 0x7fe00 // 0110
.word 0xbfe00, 0x7fe00 // 0111
.word 0x7fd00, 0xbfd00 // 1000
.word 0x7fe00, 0xbfd00 // 1001
.word 0xbfd00, 0xbfd00 // 1010
.word 0xbfe00, 0xbfd00 // 1011
.word 0x7fd00, 0xbfe00 // 1100
.word 0x7fe00, 0xbfe00 // 1101
.word 0xbfd00, 0xbfe00 // 1110
.word 0xbfe00, 0xbfe00 // 1111
// ----------------------------------------------------------------------------
// Full-resolution RGB encode (not very practical)

Wyświetl plik

@ -84,11 +84,24 @@ enc = TMDSEncode()
###
# Pixel-doubled table:
for i in range(0, 256, 4):
sym0 = enc.encode(i, 0, 1)
sym1 = enc.encode(i ^ 1, 0, 1)
# for i in range(0, 256, 4):
# sym0 = enc.encode(i, 0, 1)
# sym1 = enc.encode(i ^ 1, 0, 1)
# assert(enc.imbalance == 0)
# print(f"0x{sym0 | (sym1 << 10):05x}u,")
###
# Fullres 1bpp table: (each entry is 2 words, 4 pixels)
# Note: the trick here is that encoding 0x00 or 0xff sets imbalance to -8, and
# encoding 0x01 or 0xfe returns imbalance to 0, so we alternate between these
# two pairs of dark/light colours. This creates some fairly subtle vertical
# banding, but it's cheap.
for i in range(1 << 4):
syms = list(enc.encode((0xff if i & 1 << j else 0) ^ j & 0x01, 0, 1) for j in range(4))
print(f"0x{syms[0] | syms[1] << 10:05x}, 0x{syms[2] | syms[3] << 10:05x}")
assert(enc.imbalance == 0)
print(f"0x{sym0 | (sym1 << 10):05x}u,")
###
# Fullres table stuff: