Quick hand-optimized ASM
Profiling is the first step, and sometimes it shows that almost all the time is spent in some basic math function. So what then? If the design is otherwise sound, we need to optimize that function.
But doing asm from scratch is a pain. Let’s not do that.
This post takes a middle path: let the compiler do most of the work, and reach for inline assembly only for the one instruction that matters.
Step 1: Get a toolchain
I’ll use the Cortex-M4 as an example, so let’s get a bare-metal toolchain for it from ARM. Here’s the latest for all architectures.
So:
tar xf gcc-linaro-6.3.1-2017.02-x86_64_arm-eabi.tar.xz
cd gcc-linaro-6.3.1-2017.02-x86_64_arm-eabi/bin
Step 2: Write some code
Let’s say almost all the time is spent in the dot product.
int dot(short * a, short * b, int len) {
    int r = 0;
    while (--len != -1) {
        r += (*a++) * (*b++);
    }
    return r;
}
Putting this in example.c allows us to run
./arm-eabi-gcc -nostdlib -mcpu=cortex-m4 -O3 example.c
./arm-eabi-objdump -d a.out
and have a look at what the compiler thinks.
00008024 <dot>:
8024: 2a01 cmp r2, #1
8026: dd11 ble.n 804c <dot+0x28>
8028: b430 push {r4, r5}
802a: f102 4500 add.w r5, r2, #2147483648 ; 0x80000000
802e: 3d01 subs r5, #1
8030: 4603 mov r3, r0
8032: eb00 0545 add.w r5, r0, r5, lsl #1
8036: 2000 movs r0, #0
8038: f833 4b02 ldrh.w r4, [r3], #2
803c: f831 2b02 ldrh.w r2, [r1], #2
8040: 42ab cmp r3, r5
8042: fb14 0002 smlabb r0, r4, r2, r0
8046: d1f7 bne.n 8038 <dot+0x14>
8048: bc30 pop {r4, r5}
804a: 4770 bx lr
804c: 2000 movs r0, #0
804e: 4770 bx lr
42 instructions. Not bad, but we can do better with a few assumptions the compiler isn’t allowed to make.
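Before touching anything, it’s worth pinning down the reference behaviour with a quick host-side check; every optimized variant below should keep producing the same answers. A minimal sketch — the vectors and expected values here are made up for illustration:

```c
#include <assert.h>

/* Same reference dot product as above, compiled for the host. */
static int dot_ref(short *a, short *b, int len)
{
    int r = 0;
    while (--len != -1) {
        r += (*a++) * (*b++);
    }
    return r;
}

/* e.g. dot_ref((short[]){1,2,3,4}, (short[]){5,6,7,8}, 4)
   gives 5 + 12 + 21 + 32 = 70, and negative elements work too. */
```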
Step 3: Pick an instruction
In this case the dual 16-bit signed multiply-accumulate into 32 bits instruction, SMLAD, looks about right.
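SMLAD treats each 32-bit operand as two packed signed 16-bit halves: low is multiplied with low, high with high, and both products are added to the accumulator. A portable model of that behaviour (a host-testable sketch, not the ARM instruction itself) shows why two shorts per register halve the loop count:

```c
#include <stdint.h>

/* Portable model of SMLAD: each 32-bit operand holds two packed
   signed 16-bit values; multiply low*low and high*high, then add
   both products to the 32-bit accumulator. */
static int32_t smlad_model(uint32_t a, uint32_t b, int32_t acc)
{
    int16_t a_lo = (int16_t)(a & 0xFFFF), a_hi = (int16_t)(a >> 16);
    int16_t b_lo = (int16_t)(b & 0xFFFF), b_hi = (int16_t)(b >> 16);
    return acc + a_lo * b_lo + a_hi * b_hi;
}
```

With little-endian packing, two consecutive shorts in memory land in the low and high halves of one word, which is exactly what the word loads below rely on.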
#include <stdint.h>

/* Wrap SMLAD: dual signed 16-bit multiply, both products added
   to the 32-bit accumulator. */
__attribute__((always_inline)) static inline int32_t SMLAD(uint32_t a, uint32_t b, int32_t acc)
{
    int32_t r;
    asm ("SMLAD %0, %1, %2, %3"
         : "=r"(r)
         : "r"(a), "r"(b), "r"(acc));
    return r;
}
int dot(short * a, short * b, int len) {
    int32_t r = 0;
    /* Reinterpret the halfword arrays as words: two shorts per load. */
    uint32_t *pa = (uint32_t *)a, *pb = (uint32_t *)b;
    len >>= 1;                      /* two elements per iteration */
    while (--len != -1) {
        r = SMLAD(*pa++, *pb++, r);
    }
    return r;
}
With this in m4.c, running
./arm-eabi-gcc -nostdlib -mcpu=cortex-m4 -O3 m4.c
./arm-eabi-objdump -d a.out
gives
00008024 <dot>:
8024: 3a01 subs r2, #1
8026: d00c beq.n 8042 <dot+0x1e>
8028: b430 push {r4, r5}
802a: 4603 mov r3, r0
802c: 2000 movs r0, #0
802e: 3a01 subs r2, #1
8030: f853 4b04 ldr.w r4, [r3], #4
8034: f851 5b04 ldr.w r5, [r1], #4
8038: fb24 0005 smlad r0, r4, r5, r0
803c: d1f7 bne.n 802e <dot+0xa>
803e: bc30 pop {r4, r5}
8040: 4770 bx lr
8042: 4610 mov r0, r2
8044: 4770 bx lr
8046: bf00 nop
34 instructions, 17 per item, thanks to SMLAD. We’ve unrolled the loop once, so len must be a multiple of two. We can unroll further, by four, using 64-bit loads:
int dot(short * a, short * b, int len) {
    int32_t r = 0;
    /* Four shorts per 64-bit element: the low and high words each feed one SMLAD. */
    uint64_t *pa = (uint64_t *)a, *pb = (uint64_t *)b;
    len >>= 2;                      /* four elements per iteration */
    while (--len != -1) {
        r = SMLAD(*pa, *pb, r);                     /* low 32 bits */
        r = SMLAD(*pa++ >> 32, *pb++ >> 32, r);     /* high 32 bits */
    }
    return r;
}
00008024 <dot>:
8024: 3a01 subs r2, #1
8026: d012 beq.n 804e <dot+0x2a>
8028: b470 push {r4, r5, r6}
802a: f1a0 0308 sub.w r3, r0, #8
802e: 2000 movs r0, #0
8030: f853 5f08 ldr.w r5, [r3, #8]!
8034: 680e ldr r6, [r1, #0]
8036: 685c ldr r4, [r3, #4]
8038: fb25 0006 smlad r0, r5, r6, r0
803c: 3a01 subs r2, #1
803e: 684d ldr r5, [r1, #4]
8040: fb24 0005 smlad r0, r4, r5, r0
8044: f101 0108 add.w r1, r1, #8
8048: d1f2 bne.n 8030 <dot+0xc>
804a: bc70 pop {r4, r5, r6}
804c: 4770 bx lr
804e: 4610 mov r0, r2
8050: 4770 bx lr
8052: bf00 nop
46 instructions, 11.5 per item. We can unroll further. Where to stop? Just double the unroll factor until you hit diminishing returns.
Unroll | Instructions | Per item
---|---|---
1 | 42 | 42
2 | 34 | 17
4 | 46 | 11.5
8 | 82 | 10.25
16 | 130 | 8.125
32 | 274 | 8.5625
At 32 it gets worse, as the cost of spilling to the stack overtakes the unrolling advantage. But the best unroll factor likely depends on how big the instruction cache is on your M4.
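One loose end: every unrolled version requires len to be a multiple of the unroll factor. The usual fix is a scalar tail loop for the leftovers. A hedged, portable sketch (on the M4 build, the paired expression in the main loop is where the SMLAD intrinsic would go):

```c
/* Dot product tolerating any length: unrolled-by-two main loop,
   plus a scalar tail for an odd leftover element. */
int dot_any(short *a, short *b, int len)
{
    int r = 0;
    int pairs = len >> 1;                /* two elements per iteration */
    while (--pairs != -1) {
        r += a[0] * b[0] + a[1] * b[1];  /* SMLAD on the M4 build */
        a += 2;
        b += 2;
    }
    if (len & 1)                         /* odd element left over */
        r += (*a) * (*b);
    return r;
}
```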