Microbenchmarks on ARM Cortex-M
After the last post we’ve got some cool hybrid C/asm code. It must be faster than the naive solution, but how do we know? Happily, there is dedicated debug hardware that can count execution cycles without disturbing them! Let’s start over with the basic code from the last post and add this in.
int dot(short * a, short * b, int len) {
    int r=0;
    while(--len!=-1) {
        r+=(*a++)*(*b++);
    }
    return r;
}
#define array_len(a) (sizeof(a)/sizeof(a[0]))
int main(int argc, char * argv[]) {
    short a[512] = {1};
    int r = dot( a, a, array_len(a));
    return r;
}
Now let’s add some cycle counting in main.
#include "stdio.h"
int dot(short * a, short * b, int len) {
int r=0;
while(--len!=-1) {
r+=(*a++)*(*b++);
}
return r;
}
#define array_len(a) (sizeof(a)/sizeof(a[0]))
int main(char * argc, char * argv[]) {
short a[512] = {1}
// These addresses are defined by ARM
volatile unsigned int *DWT_CYCCNT = (volatile unsigned int *)0xE0001004;
volatile unsigned int *DWT_CONTROL = (volatile unsigned int *)0xE0001000;
volatile unsigned int *SCB_DEMCR = (volatile unsigned int *)0xE000EDFC;
//Turn on the cycle counter and reset it to 0
*SCB_DEMCR = *SCB_DEMCR | 0x01000000;
*DWT_CYCCNT = 0;
*DWT_CONTROL = *DWT_CONTROL | 1;
int r = dot( a, a, array_len(a));
//Turn it off
*DWT_CONTROL = *DWT_CONTROL | 0;
printf("%d cycles\n", *DWT_CYCCNT );
return r;
}
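If you end up benchmarking more than a couple of variants it’s tidier to pull the counter handling into a pair of helpers. This is just a sketch using the same registers as above (the helper names are mine); note that reading CYCCNT itself costs a cycle or two, so for very short regions you may want to time an empty region once and subtract that overhead.
#include <stdint.h>
// Same DWT/DEMCR registers as above, as lvalue macros (sketch only).
#define DWT_CYCCNT_REG  (*(volatile uint32_t *)0xE0001004)
#define DWT_CTRL_REG    (*(volatile uint32_t *)0xE0001000)
#define SCB_DEMCR_REG   (*(volatile uint32_t *)0xE000EDFC)
// Enable the DWT cycle counter and reset it to zero.
static inline void cycles_start(void) {
    SCB_DEMCR_REG |= 0x01000000;  // TRCENA: power up the DWT/ITM block
    DWT_CYCCNT_REG = 0;
    DWT_CTRL_REG  |= 1;           // CYCCNTENA: start counting
}
// Read the current cycle count.
static inline uint32_t cycles_now(void) {
    return DWT_CYCCNT_REG;
}
A measurement then collapses to a cycles_start() before the call and a cycles_now() read in the printf.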
The naive version comes in at 6185 cycles. Let’s see how we did with SMLAD.
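SMLAD is one of the DSP-extension instructions (ARMv7E-M, so Cortex-M4/M7 class cores): it multiplies the signed low 16-bit halves of its two operands, multiplies the signed high halves, and adds both products to a 32-bit accumulator, all in one instruction. As a reference, here is a plain C model of what it computes (just an illustrative sketch, not the code we’ll benchmark):
#include <stdint.h>
// Reference model of SMLAD: acc + a.lo*b.lo + a.hi*b.hi, halves treated as signed 16-bit.
static int32_t smlad_model(uint32_t a, uint32_t b, int32_t acc)
{
    int16_t al = (int16_t)a, ah = (int16_t)(a >> 16);
    int16_t bl = (int16_t)b, bh = (int16_t)(b >> 16);
    return acc + (int32_t)al * bl + (int32_t)ah * bh;
}
To emit the real instruction we wrap it in a bit of inline asm (CMSIS also provides an __SMLAD() intrinsic if you would rather not write the asm yourself):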
#include "stdint.h"
__attribute__((always_inline)) static inline int32_t SMLAD(uint32_t a, uint32_t b, int32_t acc)
{
int32_t r;
asm ( "SMLAD %0, %1, %2, %3"
: "=r"(r)
: "r"(a), "r"(b), "r"(acc));
return r;
}
int dot(short * a, short * b, int len) {
int32_t r=0;
uint32_t *pa=a, *pb=b;
len>>=1;
while(--len!=-1) {
r = SMLAD(*pa++,*pb++,r);
}
return r;
}
Using O3 it’s 3362 cycles. Using Os it’s 4142. How about unrolling?
int dot(short * a, short * b, int len) {
    int32_t r=0;
    // Each 64-bit load pulls in four shorts: the low word feeds one SMLAD
    // and the high word (>>32) feeds the next.
    uint64_t *pa=(uint64_t*)a, *pb=(uint64_t*)b;
    len>>=2;
    while(--len!=-1) {
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
    }
    return r;
}
Gets us 2470 cycles with O3 and 2349 with Os. Let’s unroll further.
int dot(short * a, short * b, int len) {
    int32_t r=0;
    uint64_t *pa=(uint64_t*)a, *pb=(uint64_t*)b;
    len>>=3;
    while(--len!=-1) {
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
    }
    return r;
}
1703 with O3, 2026 with Os.
int dot(short * a, short * b, int len) {
    int32_t r=0;
    uint64_t *pa=(uint64_t*)a, *pb=(uint64_t*)b;
    len>>=4;
    while(--len!=-1) {
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
    }
    return r;
}
1746 with O3, 1610 with Os.
int dot(short * a, short * b, int len) {
    int32_t r=0;
    uint64_t *pa=(uint64_t*)a, *pb=(uint64_t*)b;
    len>>=5;
    while(--len!=-1) {
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
        r = SMLAD(*pa,*pb,r);
        r = SMLAD(*pa++>>32, *pb++>>32, r);
    }
    return r;
}
Cycles by unroll and optimization level
Unroll | O3 | Os |
---|---|---|
1 | 6185 | |
2 | 3362 | 4142 |
4 | 1703 | 2026 |
8 | 1746 | 1610 |
16 | 1918 | 1338 |
32 | 2016 | 1170 |
64 | 2214 | 1217 |
There’s a catch with these unrolls: the input length needs to be a multiple of the unroll length. But we have one more trick.
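Before that, it’s worth noting the boring workaround: run the unrolled kernel over the largest multiple of the unroll length and mop up the last few elements with the plain loop. A sketch, where dot_unrolled and UNROLL are placeholder names for whichever variant above you pick (not code we’ve written):
#define UNROLL 16
int dot_any_len(short *a, short *b, int len) {
    int head = len & ~(UNROLL - 1);      // largest multiple of UNROLL that fits
    int r = dot_unrolled(a, b, head);    // fast unrolled path
    for (int i = head; i < len; i++)     // scalar tail, at most UNROLL-1 elements
        r += a[i] * b[i];
    return r;
}
That works, but it means a second loop and a second pass over the pointers. Duff’s device folds the remainder handling into the unrolled loop itself.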
Duff’s device
With Duff’s device we can relax the requirement from a multiple of the full unroll length down to a multiple of the smallest internal unroll. In this case the smallest internal unroll is 8, to match the load-multiple instruction.
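As a reminder of the shape, here is the canonical device (a memcpy-style reference sketch, not our dot product): a switch interleaved with a do-while, so execution can jump part-way into the unrolled body to handle the remainder and then fall into full iterations.
// Canonical Duff's device, shown for reference (assumes count > 0).
void copy8(short *to, const short *from, int count)
{
    int n = (count + 7) / 8;        // number of do-while passes, rounded up
    switch (count % 8) {            // jump into the body for the remainder
    case 0: do { *to++ = *from++;
    case 7:      *to++ = *from++;
    case 6:      *to++ = *from++;
    case 5:      *to++ = *from++;
    case 4:      *to++ = *from++;
    case 3:      *to++ = *from++;
    case 2:      *to++ = *from++;
    case 1:      *to++ = *from++;
            } while (--n > 0);
    }
}
Adapted to the SMLAD kernel it looks like this: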
int dot(short * a, short * b, int len) {
    int32_t r=0;
    uint64_t *pa=(uint64_t*)a, *pb=(uint64_t*)b;
    // rm is the leftover after the 16-wide iterations; len becomes the number
    // of do-while passes, rounded up so a partial first pass is counted.
    int rm = len & 0xf;
    len = (len + 0xf) >> 4;
    switch(rm >> 2) {   // jump part-way in for the remaining 64-bit blocks
        do {
    case 0:
            r = SMLAD(*pa,*pb,r);
            r = SMLAD(*pa++>>32, *pb++>>32, r);
    case 3:
            r = SMLAD(*pa,*pb,r);
            r = SMLAD(*pa++>>32, *pb++>>32, r);
    case 2:
            r = SMLAD(*pa,*pb,r);
            r = SMLAD(*pa++>>32, *pb++>>32, r);
    case 1:
            r = SMLAD(*pa,*pb,r);
            r = SMLAD(*pa++>>32, *pb++>>32, r);
        } while(--len!=0);
    }
    return r;
}
But it costs us a few cycles, and this time O3 comes out ahead of Os.
Unroll | Os | O3 |
---|---|---|
16 | 2133 | 1813 |
32 | 1753 | 1443 |
64 | 1982 | 1499 |
It’s clear the Duff version still takes advantage of the inner unroll.
Fastest Duff (unroll by 32, O3):
<dot>:
b4f0 push {r4, r5, r6, r7}
f002 0407 and.w r4, r2, #7
3c01 subs r4, #1
1152 asrs r2, r2, #5
2c06 cmp r4, #6
d85f bhi.n 3386e <dot+0xce>
e8df f004 tbb [pc, r4]
6561 .short 0x6561
04716d69 .word 0x04716d69
5a .byte 0x5a
00 .byte 0x00
4603 mov r3, r0
f04f 0c00 mov.w ip, #0
681d ldr r5, [r3, #0]
6858 ldr r0, [r3, #4]
680e ldr r6, [r1, #0]
684c ldr r4, [r1, #4]
fb25 cc06 smlad ip, r5, r6, ip
3308 adds r3, #8
3108 adds r1, #8
fb20 cc04 smlad ip, r0, r4, ip
e893 0021 ldmia.w r3, {r0, r5}
e891 0050 ldmia.w r1, {r4, r6}
fb20 cc04 smlad ip, r0, r4, ip
fb25 cc06 smlad ip, r5, r6, ip
3308 adds r3, #8
3108 adds r1, #8
681d ldr r5, [r3, #0]
6858 ldr r0, [r3, #4]
680e ldr r6, [r1, #0]
684c ldr r4, [r1, #4]
fb25 cc06 smlad ip, r5, r6, ip
3308 adds r3, #8
3108 adds r1, #8
fb20 cc04 smlad ip, r0, r4, ip
e893 0041 ldmia.w r3, {r0, r6}
e891 0030 ldmia.w r1, {r4, r5}
fb20 cc04 smlad ip, r0, r4, ip
fb26 cc05 smlad ip, r6, r5, ip
3308 adds r3, #8
3108 adds r1, #8
6858 ldr r0, [r3, #4]
684c ldr r4, [r1, #4]
681e ldr r6, [r3, #0]
680d ldr r5, [r1, #0]
fb26 cc05 smlad ip, r6, r5, ip
3308 adds r3, #8
3108 adds r1, #8
fb20 cc04 smlad ip, r0, r4, ip
3a01 subs r2, #1
681d ldr r5, [r3, #0]
685c ldr r4, [r3, #4]
680e ldr r6, [r1, #0]
6848 ldr r0, [r1, #4]
fb25 c506 smlad r5, r5, r6, ip
fb24 5000 smlad r0, r4, r0, r5
d031 beq.n 3389c <dot+0xfc>
3308 adds r3, #8
3108 adds r1, #8
685e ldr r6, [r3, #4]
680d ldr r5, [r1, #0]
684c ldr r4, [r1, #4]
681f ldr r7, [r3, #0]
fb27 0005 smlad r0, r7, r5, r0
3308 adds r3, #8
3108 adds r1, #8
fb26 0c04 smlad ip, r6, r4, r0
685c ldr r4, [r3, #4]
684e ldr r6, [r1, #4]
6818 ldr r0, [r3, #0]
680d ldr r5, [r1, #0]
fb20 cc05 smlad ip, r0, r5, ip
3308 adds r3, #8
3108 adds r1, #8
fb24 cc06 smlad ip, r4, r6, ip
e7ac b.n 337c0 <dot+0x20>
4603 mov r3, r0
f04f 0c00 mov.w ip, #0
e7f0 b.n 33850 <dot+0xb0>
4603 mov r3, r0
2000 movs r0, #0
e7e3 b.n 3383c <dot+0x9c>
4603 mov r3, r0
f04f 0c00 mov.w ip, #0
e7d3 b.n 33824 <dot+0x84>
4603 mov r3, r0
f04f 0c00 mov.w ip, #0
e7c5 b.n 33810 <dot+0x70>
4603 mov r3, r0
f04f 0c00 mov.w ip, #0
e7b7 b.n 337fc <dot+0x5c>
4603 mov r3, r0
f04f 0c00 mov.w ip, #0
e7a9 b.n 337e8 <dot+0x48>
4603 mov r3, r0
f04f 0c00 mov.w ip, #0
e79b b.n 337d4 <dot+0x34>
bcf0 pop {r4, r5, r6, r7}
4770 bx lr
Fastest overall (Unroll by 32 Os):
<dot>:
b5f0 push {r4, r5, r6, r7, lr}
4603 mov r3, r0
1192 asrs r2, r2, #6
2000 movs r0, #0
f112 32ff adds.w r2, r2, #4294967295 ; 0xffffffff
f0c0 8083 bcc.w 338b6 <dot+0x116>
685c ldr r4, [r3, #4]
680f ldr r7, [r1, #0]
681e ldr r6, [r3, #0]
684d ldr r5, [r1, #4]
fb26 0607 smlad r6, r6, r7, r0
6898 ldr r0, [r3, #8]
fb24 6505 smlad r5, r4, r5, r6
699f ldr r7, [r3, #24]
688c ldr r4, [r1, #8]
69de ldr r6, [r3, #28]
fb20 5504 smlad r5, r0, r4, r5
68d8 ldr r0, [r3, #12]
68cc ldr r4, [r1, #12]
fb20 5504 smlad r5, r0, r4, r5
6918 ldr r0, [r3, #16]
690c ldr r4, [r1, #16]
fb20 5504 smlad r5, r0, r4, r5
6958 ldr r0, [r3, #20]
694c ldr r4, [r1, #20]
fb20 5504 smlad r5, r0, r4, r5
69cc ldr r4, [r1, #28]
6988 ldr r0, [r1, #24]
fb27 5000 smlad r0, r7, r0, r5
6a1d ldr r5, [r3, #32]
fb26 0604 smlad r6, r6, r4, r0
6b1f ldr r7, [r3, #48] ; 0x30
6a4c ldr r4, [r1, #36] ; 0x24
6a08 ldr r0, [r1, #32]
fb25 6000 smlad r0, r5, r0, r6
6a5d ldr r5, [r3, #36] ; 0x24
6a9e ldr r6, [r3, #40] ; 0x28
fb25 0c04 smlad ip, r5, r4, r0
6acc ldr r4, [r1, #44] ; 0x2c
6add ldr r5, [r3, #44] ; 0x2c
6a88 ldr r0, [r1, #40] ; 0x28
fb26 c000 smlad r0, r6, r0, ip
6b5e ldr r6, [r3, #52] ; 0x34
fb25 0504 smlad r5, r5, r4, r0
6b4c ldr r4, [r1, #52] ; 0x34
6b08 ldr r0, [r1, #48] ; 0x30
fb27 5000 smlad r0, r7, r0, r5
6b9d ldr r5, [r3, #56] ; 0x38
fb26 0604 smlad r6, r6, r4, r0
6c9f ldr r7, [r3, #72] ; 0x48
6bcc ldr r4, [r1, #60] ; 0x3c
6b88 ldr r0, [r1, #56] ; 0x38
fb25 6000 smlad r0, r5, r0, r6
6bdd ldr r5, [r3, #60] ; 0x3c
6c1e ldr r6, [r3, #64] ; 0x40
fb25 0c04 smlad ip, r5, r4, r0
6c4c ldr r4, [r1, #68] ; 0x44
6c5d ldr r5, [r3, #68] ; 0x44
6c08 ldr r0, [r1, #64] ; 0x40
fb26 c000 smlad r0, r6, r0, ip
6cde ldr r6, [r3, #76] ; 0x4c
fb25 0504 smlad r5, r5, r4, r0
6ccc ldr r4, [r1, #76] ; 0x4c
6c88 ldr r0, [r1, #72] ; 0x48
fb27 5000 smlad r0, r7, r0, r5
6d1d ldr r5, [r3, #80] ; 0x50
fb26 0604 smlad r6, r6, r4, r0
6e1f ldr r7, [r3, #96] ; 0x60
6d4c ldr r4, [r1, #84] ; 0x54
6d08 ldr r0, [r1, #80] ; 0x50
fb25 6000 smlad r0, r5, r0, r6
6d5d ldr r5, [r3, #84] ; 0x54
6d9e ldr r6, [r3, #88] ; 0x58
fb25 0c04 smlad ip, r5, r4, r0
6dcc ldr r4, [r1, #92] ; 0x5c
6ddd ldr r5, [r3, #92] ; 0x5c
6d88 ldr r0, [r1, #88] ; 0x58
fb26 c000 smlad r0, r6, r0, ip
6e5e ldr r6, [r3, #100] ; 0x64
fb25 0504 smlad r5, r5, r4, r0
6e4c ldr r4, [r1, #100] ; 0x64
6e08 ldr r0, [r1, #96] ; 0x60
fb27 5000 smlad r0, r7, r0, r5
6e9d ldr r5, [r3, #104] ; 0x68
fb26 0604 smlad r6, r6, r4, r0
6f8f ldr r7, [r1, #120] ; 0x78
6ecc ldr r4, [r1, #108] ; 0x6c
6e88 ldr r0, [r1, #104] ; 0x68
fb25 6000 smlad r0, r5, r0, r6
6edd ldr r5, [r3, #108] ; 0x6c
6f1e ldr r6, [r3, #112] ; 0x70
fb25 0c04 smlad ip, r5, r4, r0
6f4d ldr r5, [r1, #116] ; 0x74
6f5c ldr r4, [r3, #116] ; 0x74
6f08 ldr r0, [r1, #112] ; 0x70
fb26 c000 smlad r0, r6, r0, ip
6fde ldr r6, [r3, #124] ; 0x7c
fb24 0405 smlad r4, r4, r5, r0
6fc8 ldr r0, [r1, #124] ; 0x7c
6f9d ldr r5, [r3, #120] ; 0x78
fb25 4407 smlad r4, r5, r7, r4
3380 adds r3, #128 ; 0x80
3180 adds r1, #128 ; 0x80
fb26 4000 smlad r0, r6, r0, r4
e778 b.n 337a8 <dot+0x8>
bdf0 pop {r4, r5, r6, r7, pc}
Try playing with it yourself on Compiler Explorer.