An OpenCL Matrix Multiplication Recipe for Epiphany

Abstract:

The examples shows how to use OpenCL to implement an efficient matrix multiplication routine on the Epiphany architecture. To reduce off-chip data movements, our OpenCL SDK was agumented with a few key extension calls to allow for more efficient core to core communication. The example was implemented by David Richie at Brown Deer Technology. The code is provided as a proof of concept and is not ready for release, and as such may not be completely bug free. If you find problem, just let us know. The code should be self explanatory for people familiar with OpenCL and matrix multiplication, but a white paper explaining the code will follow soon.

Host Code:

/*Written in July 2012 by David Richie
  Placed in the public domain August 2012: no copyright is claimed, and you may use it for any purpose you like.
  No warranty for any purpose is expressed or implied by the author, Brown Deer Technology,
  or Adapteva Inc. Report bugs and send enhancements to Adapteva, Inc.
*/
#include 
#include 
#include 

#include "Timer.h"

#define EPS (0.00001 * 4)

int main()
{
	Setup(0);
	Reset(0);

	srand(123456);

   cl_uint n = 128;

	/* use default context for device type accelerator */
	CLCONTEXT* cp = stdacc; /* use default context for accelerator device */

   unsigned int devnum = 0;

	clopen(cp,0,CLLD_NOW);

   cl_kernel krn = clsym(cp,0,"ideal_kern",0);

   /* allocate OpenCL device-sharable memory */
   cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   cl_float* bb = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   cl_float* cc = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   float* cc2 = (float*)malloc(n*n*sizeof(float));

   /* initialize vectors a[] and b[], zero c[] */
   int i,j,k; 
   for(i=0;i EPS ) {
			count++;
			printf("%d %d %f %f %f %f\n",i,j,aa[i*n+j],bb[i*n+j],cc[i*n+j], cc2[i*n+j]);
		}
	}
	printf("error count = %d\n",count);

	printf("time = %f sec (%f GFLOPS)\n",time,gflops);

   clfree(aa);
   clfree(bb);
   clfree(cc);
	free(cc2);

#ifdef __FreeBSD__
	clclose(cp,clh);
#endif

}

Kernel Code:

/*Written in July 2012 by David Richie
  Placed in the public domain August 2012: no copyright is claimed, and you may use it for any purpose you like.
  No warranty for any purpose is expressed or implied by the author, Brown Deer Technology,
  or Adapteva Inc. Report bugs and send enhancements to Adapteva, Inc.
*/

/*
 * call with NDR { 0,N,4, 0,N,4 } where N = size_of_matrix / 32
 */


#define even_parity(n) ( ((n)&1) == 0 )

__kernel void ideal_kern( 
	int size,
	__global float* aa, 
	__global float* bb, 
	__global float* cc
)
{

	int i,j,k;
	int it,ib;

	unsigned int gid0 = get_group_id(0);
	unsigned int gid1 = get_group_id(1);

	unsigned int ltid0 = get_local_id(0);
	unsigned int ltid1 = get_local_id(1);

	int ltsz = get_local_size(0); /* assumed to be square */

	__private float aa_block[32*32];
	__private float bb_block[32*32];
	__private float cc_block[32*32];
	__private float tmp_block[32*32];

	/* copy block */

	int I = ltid0;
	int J = ltid1;
	int K = (ltid0+ltid1)%ltsz;

	size_t aa_offset = (gid0*ltsz+I)*32*size + (gid1*ltsz+K)*32;
	size_t bb_offset = (gid0*ltsz+K)*32*size + (gid1*ltsz+J)*32;
	size_t cc_offset = (gid0*ltsz+I)*32*size + (gid1*ltsz+J)*32;

	for(i=0;i<32;i++) 
		memcopy(&aa_block[i*32],aa+aa_offset+i*size,32*sizeof(float),XCL_MEM_DMA);

	for(i=0;i<32;i++) 
		memcopy(&bb_block[i*32],bb+bb_offset+i*size,32*sizeof(float),XCL_MEM_DMA);

	for(i=0;i<32;i++) for(j=0;j<32;j++) {
			cc_block[i*32+j] = 0.0f;
	}

	for(ib=0;ib