ARM VFP Vector Programming, Part 3: Source
Here are the source code and Make files for the two examples discussed in Part 2.
The code demonstrating simple vector addition is at the top; complex value multiplication is farther down.
To use them, copy out the files for the example you want to try (best on a Raspberry Pi or some other BCM2835-based system with GCC installed), then type “make”. The code will build automatically. To run a timed test of the code, become root, then type “make test”. You will get a table of microseconds of time, tabulated by command and iteration count, suitable for importing to a spreadsheet.
Simple vector addition
Makefile
E = DEBUG = CFLAGS = -O3 -fPIC -march=native -mtune=native -mfloat-abi=softfp -ftree-vectorize -funroll-all-loops -mfpu=vfp $(E) $(DEBUG) all: vec-add-float vec-add-int vec-add-double vec-add-float-asm vec-add-double-asm vec-add-int-asm -rm -f *.o vec-add-float: vec-add.c read_usec.o gcc $(CFLAGS) -DTYPE=float -c -o vec-add-float.o vec-add.c gcc $(CFLAGS) -o vec-add-float vec-add-float.o read_usec.o vec-add-double: vec-add.c read_usec.o gcc $(CFLAGS) -DTYPE=double -c -o vec-add-double.o vec-add.c gcc $(CFLAGS) -o vec-add-double vec-add-double.o read_usec.o vec-add-int: vec-add.c read_usec.o gcc $(CFLAGS) -DTYPE=int -c -o vec-add-int.o vec-add.c gcc $(CFLAGS) -o vec-add-int vec-add-int.o read_usec.o vec-add-float-asm: reconfig.s vec-add.c vec-add-float.s read_usec.o as $(DEBUG) -o vafa.o vec-add-float.s gcc $(CFLAGS) -DTYPE=float -DASM -c -o vaf.o vec-add.c gcc $(CFLAGS) -o vec-add-float-asm vaf.o vafa.o read_usec.o vec-add-int-asm: vec-add.c vec-add-int.s read_usec.o as $(DEBUG) -o vaia.o vec-add-int.s gcc $(CFLAGS) -DTYPE=int -DASM -c -o vai.o vec-add.c gcc $(CFLAGS) -o vec-add-int-asm vai.o vaia.o read_usec.o vec-add-double-asm: reconfig.s vec-add.c vec-add-double.s read_usec.o as $(DEBUG) -o vada.o vec-add-double.s gcc $(CFLAGS) -DTYPE=double -DASM -c -o vad.o vec-add.c gcc $(CFLAGS) -o vec-add-double-asm vad.o vada.o read_usec.o read_usec.o: read_usec.c gcc $(CFLAGS) -c -o read_usec.o read_usec.c clean: -rm -f vec-add-{int,float,double}{,-asm} *.o # you need to be root for the following target to work test: /bin/sh ./runtest.sh
read_usec.c
#include <stdio.h> #include <stdlib.h> #include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include "read_usec.h" #define ST_BASE (0x20003000) #define TIMER_OFFSET (4) unsigned long long int read_usec(void) { static int fd; static void *st_base; if (!fd) { if (-1 == (fd = open("/dev/mem", O_RDONLY))) { fprintf(stderr, "open() failed in read_usec().\n"); exit(255); } if (MAP_FAILED == (st_base = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, ST_BASE))) { fprintf(stderr, "mmap() failed in read_usec().\n"); exit(254); } } return *(long long int *)((unsigned char *)st_base + TIMER_OFFSET); }
read_usec.h
unsigned long long int read_usec(void);
reconfig.s
.text .align 2 .local reconfig .type reconfig, %function .local deconfig .type deconfig, %function .fpu vfp // inputs: // r0: desired stride (1 or 2) // r1: desired length (min. 1, max. 8) // outputs: (none) // modified: r0, r1 // notes: // r0 and r1 will be truncated before fitting into FPSCR reconfig: push {r0-r2} // adjust/truncate r0 to 2 bits and r0, r0, #3 // but stride is either 1(01->00) or 2(10->11) eor r0, r0, #1 // adjust/truncate r1 sub r1, r1, #1 and r1, r1, #7 // shift/accumulate them in r0 mov r0, r0, lsl #20 orr r0, r1, lsl #16 // get SysCtlReg fmrx r2, fpscr // clear stride and length fields bic r2, #55*65536 // set them how we want them (stride=0x00, (length-1)=0x07) orr r2, r2, r0 // this is the new SysCtlReg fmxr fpscr, r0 // restore registers pop {r0-r2} bx lr // inputs: (none) // outputs: (none) // clears the stride and length fields of the FPSCR, // returning it to "scalar mode" (no vectors). This is // what the GNU EABI expects. deconfig: push {r0,r1,lr} // set stride and length to 1 mov r0, #1 mov r1, #1 // and call "reconfig" bl reconfig // then return pop {r0,r1,pc}
runtest.sh
#!/bin/sh if [ "$#" != "1" ] ; then COUNT=20 else COUNT=$1 fi echo "command,`seq -s ',' $COUNT`" for i in ./vec-add-{int,float,double}{,-asm} ; do echo -n ${i} eval $i 1 > /dev/null for count in `seq $COUNT` ; do rrr=`$i $count` echo -n ,${rrr} done echo "" done
vec-add-double.s
.include "reconfig.s" .text .align 2 .global doit .type doit, %function .fpu vfp // inputs: // r0 = destination array // r1 = source array 1 // r2 = source array 2 // r3 = count // // no return (void function) doit: push {r0,r1,r4,lr} // first thing to do is change the Stride and Length fields // in the VFP System Control Register (SCR). Linux keeps these // as "pure scalar", so that all operations specify single registers. // We want to enable vector mode, to add 4 pairs of doubles at a time. push {r0,r1} mov r0, #1 mov r1, #4 bl reconfig pop {r0,r1} // divide count by 4, since we're doing 4 at a time asr r3, #2 .loop: // fetch 8 floats from source array 1 fldmiad r1!,{d4-d7} // and 8 floats from source array 2 fldmiad r2!,{d8-d11} // here's where the magic happens: // d[12..15] = d[4..7] + d[8..11] faddd d12, d4, d8 // and store the result in the destination array fstmiad r0!,{d12-d15} // one less block subs r3, r3, #1 // loop if we're not done bne .loop // restore the SCR bl deconfig // and we're done pop {r0,r1,r4,pc} .size doit, .-doit
vec-add-float.s
.include "reconfig.s" .text .align 2 .global doit .type doit, %function .fpu vfp // inputs: // r0 = destination array // r1 = source array 1 // r2 = source array 2 // r3 = count // // no return (void function) doit: push {r0,r1,r4,lr} // first thing to do is change the Stride and Length fields // in the VFP System Control Register (SCR). Linux keeps these // as "pure scalar", so that all operations specify single registers. // We want to enable vector mode, to add 8 pairs of floats at a time. push {r0,r1} // stride length 1 mov r0, #1 // vector length 8 mov r1, #8 bl reconfig pop {r0,r1} // divide count by 8, since we're doing 8 at a time asr r3, #3 .loop: // fetch 8 floats from source array 1 fldmias r1!,{s8-s15} // and 8 floats from source array 2 fldmias r2!,{s16-s23} // here's where the magic happens: // s[24..31] = s[8..15] + s[16..23] fadds s24, s8, s16 // and store the result in the destination array fstmias r0!,{s24-s31} // one less block subs r3, r3, #1 // loop if we're not done bne .loop // restore the FPSCR bl deconfig // and we're done pop {r0,r1,r4,pc} .size doit, .-doit
vec-add-int.s
.text .align 2 .global doit .type doit, %function .fpu vfp // inputs: // r0 = destination array // r1 = source array 1 // r2 = source array 2 // r3 = count // // no return (void function) doit: push {r0-r11, lr} // move parameters to higher registers mov r11, r3, asr #2 // looping, 4 at a time, so shifted by 2 mov r8, r0 mov r9, r1 mov r10, r2 .loop: // ldmia and stmia have been replaced with // discrete instructions, to minimize overhead // of instruction restart on data abort // fetch 8 ints from source array 1 ldmia r9!,{r0-r3} //ldr r0, [r9] //ldr r1, [r9, #4] //ldr r2, [r9, #8] //ldr r3, [r9, #12] // and 8 ints from source array 2 ldmia r10!,{r4-r7} //ldr r4, [r10] //ldr r5, [r10, #4] //ldr r6, [r10, #8] //ldr r7, [r10, #12] // add the ints add r0,r4,r0 add r1,r5,r1 add r2,r6,r2 add r3,r7,r3 // and store the result in the destination array stmia r8!,{r0-r3} //str r0, [r8] //add r9, #16 //str r1, [r8, #4] //add r10, #16 //str r2, [r8, #8] //str r3, [r8, #12] //add r8, #16 // one less block subs r11, r11, #1 // loop if we're not done bne .loop // and we're done pop {r0-r11, pc} .size doit, .-doit
vec-add.c
#include <stdio.h> #include <stdlib.h> #include "read_usec.h" #ifndef TYPE #error The TYPE macro must be defined as a scalar numeric on the command line. #endif #define LEN ((3<<23) / sizeof(TYPE)) #ifndef ASM void doit(TYPE *sum, TYPE *a1, TYPE *a2, int count) { int i; for (i=0; i<count; i++) *(sum+i) = *(a1+i) + *(a2+i); } #else extern void doit(TYPE *sum, TYPE *a1, TYPE *a2, int count); #endif void usage(void) { fprintf(stderr, "Must include an integer argument.\n\n"); exit(255); } int main(int argc, char *argv[]) { static TYPE a[LEN], b[LEN], c[LEN]; int i, iters; // generic counter long long int start, end; // start and end uSec counters if ((argc != 2) || 0 == (iters = atoi(argv[1]))) usage(); for (i=0; i<LEN; i++) { b[i]=(TYPE)(a[i]=(TYPE)i)+1; c[i]=(TYPE)0; } start = read_usec(); for (i=0; i<iters; i++) /* magnify everything */ doit(c, a, b, LEN); end = read_usec(); #ifdef VERIFY for (i=0; i<LEN; i++) if ((int)c[i]!=2*i+1) { fprintf(stderr, "Error at %d: expected %d, got %d\n", i, 2*i+1, (int)c[i]); return 1; } #endif printf("%llu", end - start); return 0; /* shell: 0 is success */ }
Complex value multiplication
Makefile
E = DEBUG = -g CFLAGS = -O3 -fPIC -march=native -mtune=native -mfloat-abi=softfp -mfpu=vfp $(E) $(DEBUG) all: cmult-float cmult-double cmult-float-asm cmult-double-asm -rm -f *.o cmult-float: cmult.c read_usec.o gcc $(CFLAGS) -DTYPE=float -c -o cmult-float.o cmult.c gcc $(CFLAGS) -o cmult-float cmult-float.o read_usec.o cmult-double: cmult.c read_usec.o gcc $(CFLAGS) -DTYPE=double -c -o cmult-double.o cmult.c gcc $(CFLAGS) -o cmult-double cmult-double.o read_usec.o cmult-float-asm: reconfig.s cmult.c cmult-float.s read_usec.o as $(DEBUG) -o cmfa.o cmult-float.s gcc $(CFLAGS) -DTYPE=float -DASM -c -o cmf.o cmult.c gcc $(CFLAGS) -o cmult-float-asm cmf.o cmfa.o read_usec.o cmult-double-asm: reconfig.s cmult.c cmult-double.s read_usec.o as $(DEBUG) -o cmda.o cmult-double.s gcc $(CFLAGS) -DTYPE=double -DASM -c -o cmd.o cmult.c gcc $(CFLAGS) -o cmult-double-asm cmd.o cmda.o read_usec.o read_usec.o: read_usec.c gcc $(CFLAGS) -c -o read_usec.o read_usec.c clean: -rm -f cmult-{float,double}{,-asm} *.o # you need to be root for the following target to work test: /bin/sh ./runtest.sh
cmult-double.s
.include "reconfig.s" .text .align 2 .global doit .type doit, %function .fpu vfp // inputs: // r0 = destination array // r1 = source array 1 // r2 = source array 2 // r3 = count // // no return (void function) doit: push {r0,r1,r4,lr} // first thing to do is change the Stride and Length fields // in the VFP System Control Register (SCR). Linux keeps these // as "pure scalar", so that all operations specify single registers. // We want to enable vector mode, to add 4 pairs of doubles at a time. push {r0,r1} // stride length 2 mov r0, #2 // vector length 2 mov r1, #2 bl reconfig pop {r0,r1} // divide count by 2, since we're doing 2 // complex vectors at a time asr r3, #1 .loop: // fetch 2 complexes from source array 1 fldmiad r1!,{d4-d7} // and 2 complexes from source array 2 fldmiad r2!,{d8-d11} // here's where the magic happens: // real coeff prod - imag coeff prod fmuld d12,d4,d8 fnmacd d12,d5,d9 // cross-product sum fmuld d13,d4,d9 fmacd d13,d5,d8 // and store the result in the destination array fstmiad r0!,{d12-d15} // one less block subs r3, r3, #1 // loop if we're not done bne .loop // restore the SCR bl deconfig // and we're done pop {r0,r1,r4,pc} .size doit, .-doit
cmult-float.s
.include "reconfig.s" .text .align 2 .global doit .type doit, %function .fpu vfp // inputs: // r0 = destination array // r1 = source array 1 // r2 = source array 2 // r3 = count // // no return (void function) doit: push {r0,r1,r4,lr} // first thing to do is change the Stride and Length fields // in the VFP System Control Register (SCR). Linux keeps these // as "pure scalar", so that all operations specify single registers. // We want to enable vector mode, to add 8 pairs of floats at a time. push {r0,r1} // stride length 2 mov r0, #2 // vector length 4 mov r1, #4 bl reconfig pop {r0,r1} // divide count by 4, since we're doing 4 // complex vectors at a time asr r3, #2 .loop: // fetch 8 floats from source array 1 fldmias r1!,{s8-s15} // and 8 floats from source array 2 fldmias r2!,{s16-s23} // here's where the magic happens: // real coeff prod - imag coeff prod fmuls s24,s8,s16 fnmacs s24,s9,s17 // cross-product sum fmuls s25,s8,s17 fmacs s25,s9,s16 // and store the result in the destination array fstmias r0!,{s24-s31} // one less block subs r3, r3, #1 // loop if we're not done bne .loop // restore the FPSCR bl deconfig // and we're done pop {r0,r1,r4,pc} .size doit, .-doit
cmult.c
#include <stdio.h> #include <stdlib.h> #include "read_usec.h" #ifndef TYPE #error The TYPE macro must be defined as a scalar numeric on the command line. #endif // our own Complex type typedef struct _cplx { TYPE real; TYPE imag; } _Cplx; // Complex types have 2 of each TYPE, so double the // sizeof() in the divisor #define LEN ((3<<23) / (sizeof(TYPE)*2)) #ifndef ASM void doit(_Cplx *res, _Cplx *a, _Cplx *b, int count) { int i; _Cplx *rind=res, *aind=a, *bind=b; for (i=0; i<count; i++,aind++,bind++,rind++) { // the real portion (rind)->real = (aind)->real * (bind)->real - (aind)->imag * (bind)->imag; // the imaginary portion (rind)->imag = (aind)->real * (bind)->imag + (bind)->real * (aind)->imag; // While the above looks like a lot of pointer // math and dereferencing, gcc -O3 will generate // temporary symbols to reduce the number of // actual calculations performed. } } #else extern void doit(_Cplx *res, _Cplx *a, _Cplx *b, int count); #endif void usage(void) { fprintf(stderr, "Must include an integer argument.\n\n"); exit(255); } int main(int argc, char *argv[]) { static _Cplx a[LEN], b[LEN], c[LEN]; int i, iters; // generic counter long long int start, end; // start and end uSec counters if ((argc != 2) || 0 == (iters = atoi(argv[1]))) usage(); // initialize the arrays for (i=0; i<LEN; i++) { // doing identical complex values, to check // results independently of iteration correctness a[i].real = (TYPE)2; a[i].imag = (TYPE)1; b[i].real = (TYPE)2; b[i].imag = (TYPE)3; c[i].real = c[i].imag = (TYPE)0; } start = read_usec(); for (i=0; i<iters; i++) /* magnify everything */ doit(c, a, b, LEN); end = read_usec(); #ifdef VERIFY for (i=0; i<LEN; i++) if ((int)c[i]!=2*i+1) { fprintf(stderr, "Error at %d: expected %d, got %d\n", i, 2*i+1, (int)c[i]); return 1; } #endif printf("%llu", end - start); return 0; /* shell: 0 is success */ }
read_usec.c
#include <stdio.h> #include <stdlib.h> #include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include "read_usec.h" #define ST_BASE (0x20003000) #define TIMER_OFFSET (4) unsigned long long int read_usec(void) { static int fd; static void *st_base; if (!fd) { if (-1 == (fd = open("/dev/mem", O_RDONLY))) { fprintf(stderr, "open() failed in read_usec().\n"); exit(255); } if (MAP_FAILED == (st_base = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, ST_BASE))) { fprintf(stderr, "mmap() failed in read_usec().\n"); exit(254); } } return *(long long int *)((unsigned char *)st_base + TIMER_OFFSET); }
read_usec.h
unsigned long long int read_usec(void);
reconfig.s
.text .align 2 .local reconfig .type reconfig, %function .local deconfig .type deconfig, %function .fpu vfp // inputs: // r0: desired stride (1 or 2) // r1: desired length (min. 1, max. 8) // outputs: (none) // modified: r0, r1 // notes: // r0 and r1 will be truncated before fitting into FPSCR reconfig: push {r0-r2} // adjust/truncate r0 to 2 bits and r0, r0, #3 // but stride is either 1(01->00) or 2(10-quo11) eor r0, r0, #1 // adjust/truncate r1 sub r1, r1, #1 and r1, r1, #7 // shift/accumulate them in r0 mov r0, r0, lsl #20 orr r0, r1, lsl #16 // get SysCtlReg fmrx r2, fpscr // clear stride and length fields bic r2, #55*65536 // set them how we want them (stride=0x00, (length-1)=0x07) orr r2, r2, r0 // this is the new SysCtlReg fmxr fpscr, r0 // restore registers pop {r0-r2} bx lr // inputs: (none) // outputs: (none) // clears the stride and length fields of the FPSCR, // returning it to "scalar mode" (no vectors). This is // what the GNU EABI expects. deconfig: push {r0,r1,lr} // set stride and length to 1 mov r0, #1 mov r1, #1 // and call "reconfig" bl reconfig // then return pop {r0,r1,pc}
runtest.sh
#!/bin/sh if [ "$#" != "1" ] ; then COUNT=20 else COUNT=$1 fi echo "command,`seq -s ',' $COUNT`" for i in ./cmult-{float,double}{,-asm} ; do echo -n ${i} eval $i 1 > /dev/null for count in `seq $COUNT` ; do rrr=`$i $count` echo -n ,${rrr} done echo "" done
Leave a comment