July 1, 2013 / gus3

ARM VFP Vector Programming, Part 3: Source

Here are the source code and Make files for the two examples discussed in Part 2.

The code demonstrating simple vector addition is at the top; complex value multiplication is farther down.

To use them, copy out the files for the example you want to try (best on a Raspberry Pi or some other BCM2835-based system with GCC installed), then type “make”. The code will build automatically. To run a timed test of the code, become root, then type “make test”. You will get a table of microseconds of time, tabulated by command and iteration count, suitable for importing to a spreadsheet.

Simple vector addition

Makefile

E =
DEBUG =

CFLAGS = -O3 -fPIC -march=native -mtune=native -mfloat-abi=softfp -ftree-vectorize -funroll-all-loops -mfpu=vfp $(E) $(DEBUG)

all: vec-add-float vec-add-int vec-add-double vec-add-float-asm vec-add-double-asm vec-add-int-asm
	-rm -f *.o

vec-add-float: vec-add.c read_usec.o
	gcc $(CFLAGS) -DTYPE=float -c -o vec-add-float.o vec-add.c
	gcc $(CFLAGS) -o vec-add-float vec-add-float.o read_usec.o

vec-add-double: vec-add.c read_usec.o
	gcc $(CFLAGS) -DTYPE=double -c -o vec-add-double.o vec-add.c
	gcc $(CFLAGS) -o vec-add-double vec-add-double.o read_usec.o

vec-add-int: vec-add.c read_usec.o
	gcc $(CFLAGS) -DTYPE=int -c -o vec-add-int.o vec-add.c
	gcc $(CFLAGS) -o vec-add-int vec-add-int.o read_usec.o

vec-add-float-asm: reconfig.s vec-add.c vec-add-float.s read_usec.o
	as $(DEBUG) -o vafa.o vec-add-float.s
	gcc $(CFLAGS) -DTYPE=float -DASM -c -o vaf.o vec-add.c
	gcc $(CFLAGS) -o vec-add-float-asm vaf.o vafa.o read_usec.o

vec-add-int-asm: vec-add.c vec-add-int.s read_usec.o
	as $(DEBUG) -o vaia.o vec-add-int.s
	gcc $(CFLAGS) -DTYPE=int -DASM -c -o vai.o vec-add.c
	gcc $(CFLAGS) -o vec-add-int-asm vai.o vaia.o read_usec.o

vec-add-double-asm: reconfig.s vec-add.c vec-add-double.s read_usec.o
	as $(DEBUG) -o vada.o vec-add-double.s
	gcc $(CFLAGS) -DTYPE=double -DASM -c -o vad.o vec-add.c
	gcc $(CFLAGS) -o vec-add-double-asm vad.o vada.o read_usec.o

read_usec.o: read_usec.c
	gcc $(CFLAGS) -c -o read_usec.o read_usec.c

clean:
	-rm -f vec-add-{int,float,double}{,-asm} *.o

# you need to be root for the following target to work
test:
	/bin/sh ./runtest.sh

read_usec.c

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "read_usec.h"

#define ST_BASE (0x20003000)
#define TIMER_OFFSET (4)

unsigned long long int read_usec(void) {
static int fd;
static void *st_base;

if (!fd) {
if (-1 == (fd = open("/dev/mem", O_RDONLY))) {
fprintf(stderr, "open() failed in read_usec().\n");
exit(255);
}
if (MAP_FAILED == (st_base = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, ST_BASE))) {
fprintf(stderr, "mmap() failed in read_usec().\n");
exit(254);
}
}

return *(long long int *)((unsigned char *)st_base + TIMER_OFFSET);
}

read_usec.h

unsigned long long int read_usec(void);

reconfig.s

.text
.align	2
.local	reconfig
.type	reconfig, %function
.local	deconfig
.type	deconfig, %function
.fpu	vfp

// inputs:
//	r0: desired stride (1 or 2)
//	r1: desired length (min. 1, max. 8)
// outputs: (none)
// modified: r0, r1
// notes:
//	r0 and r1 will be truncated before fitting into FPSCR

reconfig:
	push {r0-r2}
	// adjust/truncate r0 to 2 bits
	and r0, r0, #3
	// but stride is either 1(01->00) or 2(10->11)
	eor r0, r0, #1
	// adjust/truncate r1
	sub r1, r1, #1
	and r1, r1, #7
	// shift/accumulate them in r0
	mov r0, r0, lsl #20
	orr r0, r1, lsl #16
	// get SysCtlReg
	fmrx r2, fpscr
	// clear stride and length fields
	bic r2, #55*65536
	// set them how we want them (stride=0x00, (length-1)=0x07)
	orr r2, r2, r0
	// this is the new SysCtlReg
	fmxr fpscr, r0
	// restore registers
	pop {r0-r2}
	bx lr

// inputs: (none)
// outputs: (none)
// clears the stride and length fields of the FPSCR,
// returning it to "scalar mode" (no vectors). This is
// what the GNU EABI expects.

deconfig:
	push {r0,r1,lr}
	// set stride and length to 1
	mov r0, #1
	mov r1, #1
	// and call "reconfig"
	bl reconfig
	// then return
	pop {r0,r1,pc}

runtest.sh

#!/bin/sh
if [ "$#" != "1" ] ; then
COUNT=20
else
COUNT=$1
fi

echo "command,`seq -s ',' $COUNT`"
for i in ./vec-add-{int,float,double}{,-asm} ; do
echo -n ${i}
eval $i 1 > /dev/null
for count in `seq $COUNT` ; do
rrr=`$i $count`
echo -n ,${rrr}
done
echo ""
done

vec-add-double.s

.include "reconfig.s"

.text
.align 2
.global doit
.type doit, %function
.fpu vfp

// inputs:
// r0 = destination array
// r1 = source array 1
// r2 = source array 2
// r3 = count
//
// no return (void function)

doit:

push {r0,r1,r4,lr}

// first thing to do is change the Stride and Length fields
// in the VFP System Control Register (SCR). Linux keeps these
// as "pure scalar", so that all operations specify single registers.
// We want to enable vector mode, to add 4 pairs of doubles at a time.

push {r0,r1}
mov r0, #1
mov r1, #4
bl reconfig
pop {r0,r1}

// divide count by 4, since we're doing 4 at a time
asr r3, #2

.loop:

// fetch 8 floats from source array 1
fldmiad r1!,{d4-d7}
// and 8 floats from source array 2
fldmiad r2!,{d8-d11}
// here's where the magic happens:
// d[12..15] = d[4..7] + d[8..11]
faddd d12, d4, d8
// and store the result in the destination array
fstmiad r0!,{d12-d15}
// one less block
subs r3, r3, #1
// loop if we're not done
bne .loop

// restore the SCR
bl deconfig
// and we're done
pop {r0,r1,r4,pc}
.size	doit, .-doit

vec-add-float.s

.include "reconfig.s"

.text
.align 2
.global doit
.type doit, %function
.fpu vfp

// inputs:
// r0 = destination array
// r1 = source array 1
// r2 = source array 2
// r3 = count
//
// no return (void function)

doit:

push {r0,r1,r4,lr}

// first thing to do is change the Stride and Length fields
// in the VFP System Control Register (SCR). Linux keeps these
// as "pure scalar", so that all operations specify single registers.
// We want to enable vector mode, to add 8 pairs of floats at a time.

push {r0,r1}
// stride length 1
mov r0, #1
// vector length 8
mov r1, #8
bl reconfig
pop {r0,r1}

// divide count by 8, since we're doing 8 at a time
asr r3, #3

.loop:

// fetch 8 floats from source array 1
fldmias r1!,{s8-s15}
//   and 8 floats from source array 2
fldmias r2!,{s16-s23}
// here's where the magic happens:
// s[24..31] = s[8..15] + s[16..23]
fadds s24, s8, s16
// and store the result in the destination array
fstmias r0!,{s24-s31}
// one less block
subs r3, r3, #1
// loop if we're not done
bne .loop

// restore the FPSCR
bl deconfig
// and we're done
pop {r0,r1,r4,pc}
.size	doit, .-doit

vec-add-int.s

.text
.align	2
.global	doit
.type	doit, %function
.fpu	vfp

// inputs:
// r0 = destination array
// r1 = source array 1
// r2 = source array 2
// r3 = count
//
// no return (void function)

doit:

push {r0-r11, lr}

// move parameters to higher registers
mov r11, r3, asr #2 // looping, 4 at a time, so shifted by 2
mov r8, r0
mov r9, r1
mov r10, r2

.loop:

// ldmia and stmia have been replaced with
// discrete instructions, to minimize overhead
// of instruction restart on data abort

// fetch 8 ints from source array 1
ldmia r9!,{r0-r3}
//ldr r0, [r9]
//ldr r1, [r9, #4]
//ldr r2, [r9, #8]
//ldr r3, [r9, #12]
// and 8 ints from source array 2
ldmia r10!,{r4-r7}
//ldr r4, [r10]
//ldr r5, [r10, #4]
//ldr r6, [r10, #8]
//ldr r7, [r10, #12]
// add the ints
add r0,r4,r0
add r1,r5,r1
add r2,r6,r2
add r3,r7,r3
// and store the result in the destination array
stmia r8!,{r0-r3}
//str r0, [r8]
//add r9, #16
//str r1, [r8, #4]
//add r10, #16
//str r2, [r8, #8]
//str r3, [r8, #12]
//add r8, #16
// one less block
subs r11, r11, #1
// loop if we're not done
bne .loop
// and we're done
pop {r0-r11, pc}
.size	doit, .-doit

vec-add.c

#include <stdio.h>
#include <stdlib.h>
#include "read_usec.h"

#ifndef TYPE
#error The TYPE macro must be defined as a scalar numeric on the command line.
#endif

#define LEN ((3<<23) / sizeof(TYPE))

#ifndef ASM
void doit(TYPE *sum, TYPE *a1, TYPE *a2, int count) {
int i;

for (i=0; i<count; i++)
*(sum+i) = *(a1+i) + *(a2+i);
}
#else
extern void doit(TYPE *sum, TYPE *a1, TYPE *a2, int count);
#endif

void usage(void) {
fprintf(stderr, "Must include an integer argument.\n\n");
exit(255);
}

int main(int argc, char *argv[]) {
static TYPE a[LEN], b[LEN], c[LEN];
int i, iters;    // generic counter
long long int start, end;    // start and end uSec counters

if ((argc != 2) || 0 == (iters = atoi(argv[1])))
usage();

for (i=0; i<LEN; i++) {
b[i]=(TYPE)(a[i]=(TYPE)i)+1;
c[i]=(TYPE)0;
}

start = read_usec();

for (i=0; i<iters; i++)    /* magnify everything */
doit(c, a, b, LEN);

end = read_usec();

#ifdef VERIFY
for (i=0; i<LEN; i++)
if ((int)c[i]!=2*i+1) {
fprintf(stderr, "Error at %d: expected %d, got %d\n",
i, 2*i+1, (int)c[i]);
return 1;
}
#endif
printf("%llu", end - start);
return 0;    /* shell: 0 is success */
}

Complex value multiplication

Makefile

E =
DEBUG = -g

CFLAGS = -O3 -fPIC -march=native -mtune=native -mfloat-abi=softfp -mfpu=vfp $(E) $(DEBUG)

all: cmult-float cmult-double cmult-float-asm cmult-double-asm
	-rm -f *.o

cmult-float: cmult.c read_usec.o
	gcc $(CFLAGS) -DTYPE=float -c -o cmult-float.o cmult.c
	gcc $(CFLAGS) -o cmult-float cmult-float.o read_usec.o

cmult-double: cmult.c read_usec.o
	gcc $(CFLAGS) -DTYPE=double -c -o cmult-double.o cmult.c
	gcc $(CFLAGS) -o cmult-double cmult-double.o read_usec.o

cmult-float-asm: reconfig.s cmult.c cmult-float.s read_usec.o
	as $(DEBUG) -o cmfa.o cmult-float.s
	gcc $(CFLAGS) -DTYPE=float -DASM -c -o cmf.o cmult.c
	gcc $(CFLAGS) -o cmult-float-asm cmf.o cmfa.o read_usec.o

cmult-double-asm: reconfig.s cmult.c cmult-double.s read_usec.o
	as $(DEBUG) -o cmda.o cmult-double.s
	gcc $(CFLAGS) -DTYPE=double -DASM -c -o cmd.o cmult.c
	gcc $(CFLAGS) -o cmult-double-asm cmd.o cmda.o read_usec.o

read_usec.o: read_usec.c
	gcc $(CFLAGS) -c -o read_usec.o read_usec.c

clean:
	-rm -f cmult-{float,double}{,-asm} *.o

# you need to be root for the following target to work
test:
	/bin/sh ./runtest.sh

cmult-double.s

.include "reconfig.s"

.text
.align 2
.global doit
.type doit, %function
.fpu vfp

// inputs:
// r0 = destination array
// r1 = source array 1
// r2 = source array 2
// r3 = count
//
// no return (void function)

doit:

push {r0,r1,r4,lr}

// first thing to do is change the Stride and Length fields
// in the VFP System Control Register (SCR). Linux keeps these
// as "pure scalar", so that all operations specify single registers.
// We want to enable vector mode, to add 4 pairs of doubles at a time.

push {r0,r1}
// stride length 2
mov r0, #2
// vector length 2
mov r1, #2
bl reconfig
pop {r0,r1}

// divide count by 2, since we're doing 2
// complex vectors at a time
asr r3, #1

.loop:

// fetch 2 complexes from source array 1
fldmiad r1!,{d4-d7}
// and 2 complexes from source array 2
fldmiad r2!,{d8-d11}
// here's where the magic happens:
// real coeff prod - imag coeff prod
fmuld d12,d4,d8
fnmacd d12,d5,d9
// cross-product sum
fmuld d13,d4,d9
fmacd d13,d5,d8
// and store the result in the destination array
fstmiad r0!,{d12-d15}
// one less block
subs r3, r3, #1
// loop if we're not done
bne .loop

// restore the SCR
bl deconfig
// and we're done
pop {r0,r1,r4,pc}
.size doit, .-doit

cmult-float.s

.include "reconfig.s"

.text
.align 2
.global doit
.type doit, %function
.fpu vfp

// inputs:
// r0 = destination array
// r1 = source array 1
// r2 = source array 2
// r3 = count
//
// no return (void function)

doit:

push {r0,r1,r4,lr}

// first thing to do is change the Stride and Length fields
// in the VFP System Control Register (SCR). Linux keeps these
// as "pure scalar", so that all operations specify single registers.
// We want to enable vector mode, to add 8 pairs of floats at a time.

push {r0,r1}
// stride length 2
mov r0, #2
// vector length 4
mov r1, #4
bl reconfig
pop {r0,r1}

// divide count by 4, since we're doing 4
// complex vectors at a time
asr r3, #2

.loop:

// fetch 8 floats from source array 1
fldmias r1!,{s8-s15}
//   and 8 floats from source array 2
fldmias r2!,{s16-s23}
// here's where the magic happens:
// real coeff prod - imag coeff prod
fmuls s24,s8,s16
fnmacs s24,s9,s17
// cross-product sum
fmuls s25,s8,s17
fmacs s25,s9,s16
// and store the result in the destination array
fstmias r0!,{s24-s31}
// one less block
subs r3, r3, #1
// loop if we're not done
bne .loop

// restore the FPSCR
bl deconfig
// and we're done
pop {r0,r1,r4,pc}
.size	doit, .-doit

cmult.c

#include <stdio.h>
#include <stdlib.h>
#include "read_usec.h"

#ifndef TYPE
#error The TYPE macro must be defined as a scalar numeric on the command line.
#endif

// our own Complex type
typedef struct _cplx { TYPE real; TYPE imag; } _Cplx;

// Complex types have 2 of each TYPE, so double the
// sizeof() in the divisor
#define LEN ((3<<23) / (sizeof(TYPE)*2))

#ifndef ASM
void doit(_Cplx *res, _Cplx *a, _Cplx *b, int count) {
int i;
_Cplx *rind=res, *aind=a, *bind=b;

for (i=0; i<count; i++,aind++,bind++,rind++) {
// the real portion
(rind)->real = (aind)->real * (bind)->real -
(aind)->imag * (bind)->imag;
// the imaginary portion
(rind)->imag = (aind)->real * (bind)->imag +
(bind)->real * (aind)->imag;
// While the above looks like a lot of pointer
// math and dereferencing, gcc -O3 will generate
// temporary symbols to reduce the number of
// actual calculations performed.
}
}
#else
extern void doit(_Cplx *res, _Cplx *a, _Cplx *b, int count);
#endif

void usage(void) {
fprintf(stderr, "Must include an integer argument.\n\n");
exit(255);
}

int main(int argc, char *argv[]) {
static _Cplx a[LEN], b[LEN], c[LEN];
int i, iters;    // generic counter
long long int start, end;    // start and end uSec counters

if ((argc != 2) || 0 == (iters = atoi(argv[1])))
usage();

// initialize the arrays
for (i=0; i<LEN; i++) {
// doing identical complex values, to check
// results independently of iteration correctness
a[i].real = (TYPE)2; a[i].imag = (TYPE)1;
b[i].real = (TYPE)2; b[i].imag = (TYPE)3;
c[i].real = c[i].imag = (TYPE)0;
}

start = read_usec();

for (i=0; i<iters; i++)    /* magnify everything */
doit(c, a, b, LEN);

end = read_usec();

#ifdef VERIFY
for (i=0; i<LEN; i++)
if ((int)c[i]!=2*i+1) {
fprintf(stderr, "Error at %d: expected %d, got %d\n",
i, 2*i+1, (int)c[i]);
return 1;
}
#endif
printf("%llu", end - start);
return 0;    /* shell: 0 is success */
}

read_usec.c

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "read_usec.h"

#define ST_BASE (0x20003000)
#define TIMER_OFFSET (4)

unsigned long long int read_usec(void) {
static int fd;
static void *st_base;

if (!fd) {
if (-1 == (fd = open("/dev/mem", O_RDONLY))) {
fprintf(stderr, "open() failed in read_usec().\n");
exit(255);
}
if (MAP_FAILED == (st_base = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, ST_BASE))) {
fprintf(stderr, "mmap() failed in read_usec().\n");
exit(254);
}
}

return *(long long int *)((unsigned char *)st_base + TIMER_OFFSET);
}

read_usec.h

unsigned long long int read_usec(void);

reconfig.s

.text
.align	2
.local	reconfig
.type	reconfig, %function
.local	deconfig
.type	deconfig, %function
.fpu	vfp

// inputs:
//	r0: desired stride (1 or 2)
//	r1: desired length (min. 1, max. 8)
// outputs: (none)
// modified: r0, r1
// notes:
//	r0 and r1 will be truncated before fitting into FPSCR

reconfig:
	push {r0-r2}
	// adjust/truncate r0 to 2 bits
	and r0, r0, #3
	// but stride is either 1(01->00) or 2(10-quo11)
	eor r0, r0, #1
	// adjust/truncate r1
	sub r1, r1, #1
	and r1, r1, #7
	// shift/accumulate them in r0
	mov r0, r0, lsl #20
	orr r0, r1, lsl #16
	// get SysCtlReg
	fmrx r2, fpscr
	// clear stride and length fields
	bic r2, #55*65536
	// set them how we want them (stride=0x00, (length-1)=0x07)
	orr r2, r2, r0
	// this is the new SysCtlReg
	fmxr fpscr, r0
	// restore registers
	pop {r0-r2}
	bx lr

// inputs: (none)
// outputs: (none)
// clears the stride and length fields of the FPSCR,
// returning it to "scalar mode" (no vectors). This is
// what the GNU EABI expects.

deconfig:
	push {r0,r1,lr}
	// set stride and length to 1
	mov r0, #1
	mov r1, #1
	// and call "reconfig"
	bl reconfig
	// then return
	pop {r0,r1,pc}

runtest.sh

#!/bin/sh
if [ "$#" != "1" ] ; then
COUNT=20
else
COUNT=$1
fi

echo "command,`seq -s ',' $COUNT`"
for i in ./cmult-{float,double}{,-asm} ; do
echo -n ${i}
eval $i 1 > /dev/null
for count in `seq $COUNT` ; do
rrr=`$i $count`
echo -n ,${rrr}
done
echo ""
done

← ARM VFP Vector Programming, Part 2: Examples

Accessing the Raspberry Pi’s 1MHz timer, via kernel driver →

ARM VFP Vector Programming, Part 3: Source

Simple vector addition

Makefile

read_usec.c

read_usec.h

reconfig.s

runtest.sh

vec-add-double.s

vec-add-float.s

vec-add-int.s

vec-add.c

Complex value multiplication

Makefile

cmult-double.s

cmult-float.s

cmult.c

read_usec.c

read_usec.h

reconfig.s

runtest.sh

Leave a comment Cancel reply

Search

Contact

Recent Posts

-- Buy a Raspberry Pi! --

Eastern Orthodoxy

Free/Libre Software

GNU/Linux

Humor

News

Archives

ARM VFP Vector Programming, Part 3: Source

Simple vector addition

Makefile

read_usec.c

read_usec.h

reconfig.s

runtest.sh

vec-add-double.s

vec-add-float.s

vec-add-int.s

vec-add.c

Complex value multiplication

Makefile

cmult-double.s

cmult-float.s

cmult.c

read_usec.c

read_usec.h

reconfig.s

runtest.sh

Rate this:

Share this:

Related

Leave a comment Cancel reply

Search

Contact

Recent Posts

-- Buy a Raspberry Pi! --

Eastern Orthodoxy

Free/Libre Software

GNU/Linux

Humor

News

Archives