Skip to content

Commit

Permalink
Update exercise 6
Browse files Browse the repository at this point in the history
  • Loading branch information
tomdeakin committed Nov 24, 2014
1 parent 4547961 commit cd3fc29
Show file tree
Hide file tree
Showing 12 changed files with 288 additions and 348 deletions.
24 changes: 6 additions & 18 deletions Exercises/Exercise06/C/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# History: Written by Tim Mattson, August 2010
# Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
# Modified by Tom Deakin, July 2013
# Modified by Tom Deakin, October 2014
#

ifndef CC
Expand All @@ -16,42 +17,29 @@ LIBS = -lm -lOpenCL -fopenmp

COMMON_DIR = ../../C_common

MMUL_OBJS = matmul.o matrix_lib.o wtime.o err_code.o
MMUL_OBJS = wtime.o
EXEC = mult

# Change this variable to specify the device type
# to the OpenCL device type of choice. You can also
# edit the variable in the source.
ifndef DEVICE
DEVICE = CL_DEVICE_TYPE_DEFAULT
endif

CCFLAGS += -D DEVICE=$(DEVICE)

# Check our platform and make sure we define the APPLE variable
# and set up the right compiler flags and libraries
PLATFORM = $(shell uname -s)
ifeq ($(PLATFORM), Darwin)
LIBS = -lm -framework OpenCL
LIBS = -lm -framework OpenCL
endif


all: $(EXEC)

mult: $(MMUL_OBJS)
$(CC) $(MMUL_OBJS) $(CCFLAGS) $(LIBS) -o $(EXEC)
mult: $(MMUL_OBJS) matmul.c matrix_lib.c
$(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $(EXEC)

wtime.o: $(COMMON_DIR)/wtime.c
$(CC) -c $^ $(CCFLAGS) -o $@

err_code.o: $(COMMON_DIR)/err_code.c
$(CC) -c $^ $(CCFLAGS) -o $@

.c.o:
$(CC) -c $< $(CCFLAGS) -o $@

matmul.o: matmul.h matrix_lib.h

matrix_lib.o: matmul.h

clean:
rm -f $(MMUL_OBJS) $(EXEC)
215 changes: 83 additions & 132 deletions Exercises/Exercise06/C/matmul.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//------------------------------------------------------------------------------
//
// PROGRAM: Matrix Multipliplication driver
// PROGRAM: Matrix Multiplication driver
//
// PURPOSE: This is a driver program to test various ways of computing
// the product:
Expand All @@ -17,146 +17,122 @@
// Modified by Simon McIntosh-Smith, September 2011
// Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
// Ported to C by Tom Deakin, July 2013
// Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014
//
//------------------------------------------------------------------------------

#include "matmul.h"
#include "matrix_lib.h"
#include "err_code.h"
#include "device_picker.h"

char * kernelsource = "__kernel void mmul( \n" \
" const int Mdim, \n" \
" const int Ndim, \n" \
" const int Pdim, \n" \
char * kernelsource = "__kernel void mmul( \n" \
" const int N, \n" \
" __global float* A, \n" \
" __global float* B, \n" \
" __global float* C) \n" \
"{ \n" \
"} \n" \
"\n";

int main(void)
int main(int argc, char *argv[])
{
float *h_A; // A matrix
float *h_B; // B matrix
float *h_C; // C = A*B matrix
int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M]
int szA, szB, szC; // number of elements in each matrix
int N; // A[N][N], B[N][N], C[N][N]
int size; // number of elements in each matrix

cl_mem d_a, d_b, d_c; // Matrices in device memory

double start_time; // Starting time
double run_time; // timing data

cl_int err; // error code returned from OpenCL calls
cl_device_id device_id; // compute device id
cl_device_id device; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel

Ndim = ORDER;
Pdim = ORDER;
Mdim = ORDER;
N = ORDER;
size = N * N;

szA = Ndim * Pdim;
szB = Pdim * Mdim;
szC = Ndim * Mdim;
h_A = (float *)malloc(size * sizeof(float));
h_B = (float *)malloc(size * sizeof(float));
h_C = (float *)malloc(size * sizeof(float));

h_A = (float *)malloc(szA * sizeof(float));
h_B = (float *)malloc(szB * sizeof(float));
h_C = (float *)malloc(szC * sizeof(float));

initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);

printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
for(int i = 0; i < COUNT; i++)
{
zero_mat(Ndim, Mdim, h_C);
start_time = wtime();

seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C);

run_time = wtime() - start_time;
results(Mdim, Ndim, Pdim, h_C, run_time);
}

//--------------------------------------------------------------------------------
// Create a context, queue and device.
//--------------------------------------------------------------------------------

// Set up OpenCL context, queue, kernel, etc.
cl_uint numPlatforms;
// Find number of platforms
err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (err != CL_SUCCESS || numPlatforms <= 0)
{
printf("Error: Failed to find a platform!\n%s\n",err_code(err));
return EXIT_FAILURE;
}
// Get all platforms
cl_platform_id Platform[numPlatforms];
err = clGetPlatformIDs(numPlatforms, Platform, NULL);
if (err != CL_SUCCESS || numPlatforms <= 0)
{
printf("Error: Failed to get the platform!\n%s\n",err_code(err));
return EXIT_FAILURE;
}
// Secure a device
for (int i = 0; i < numPlatforms; i++)
{
err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL);
if (err == CL_SUCCESS)
break;
}
if (device_id == NULL)
cl_uint deviceIndex = 0;
parseArguments(argc, argv, &deviceIndex);

// Get list of devices
cl_device_id devices[MAX_DEVICES];
unsigned numDevices = getDeviceList(devices);

// Check device index in range
if (deviceIndex >= numDevices)
{
printf("Error: Failed to create a device group!\n%s\n",err_code(err));
return EXIT_FAILURE;
printf("Invalid device index (try '--list')\n");
return EXIT_FAILURE;
}

device = devices[deviceIndex];

char name[MAX_INFO_STRING];
getDeviceName(device, name);
printf("\nUsing OpenCL device: %s\n", name);

// Create a compute context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n%s\n", err_code(err));
return EXIT_FAILURE;
}
// Create a command queue
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (!commands)
context = clCreateContext(0, 1, &device, NULL, NULL, &err);
checkError(err, "Creating context");

// Create a command queue
commands = clCreateCommandQueue(context, device, 0, &err);
checkError(err, "Creating command queue");

//--------------------------------------------------------------------------------
// Run sequential version on the host
//--------------------------------------------------------------------------------

initmat(N, h_A, h_B, h_C);

printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER);
for(int i = 0; i < COUNT; i++)
{
printf("Error: Failed to create a command commands!\n%s\n", err_code(err));
return EXIT_FAILURE;
zero_mat(N, h_C);
start_time = wtime();

seq_mat_mul_sdot(N, h_A, h_B, h_C);

run_time = wtime() - start_time;
results(N, h_C, run_time);
}


//--------------------------------------------------------------------------------
// Setup the buffers, initialize matrices, and write them into global memory
//--------------------------------------------------------------------------------

// Reset A, B and C matrices (just to play it safe)
initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C);
initmat(N, h_A, h_B, h_C);

d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * szA, h_A, &err);
if (err != CL_SUCCESS)
{
printf("Error: failed to create buffer\n%s\n", err_code(err));
return EXIT_FAILURE;
}
sizeof(float) * size, h_A, &err);
checkError(err, "Creating buffer d_a");

d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * szB, h_B, &err);
if (err != CL_SUCCESS)
{
printf("Error: failed to create buffer\n%s\n", err_code(err));
return EXIT_FAILURE;
}
sizeof(float) * size, h_B, &err);
checkError(err, "Creating buffer d_b");

d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(float) * szC, NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error: failed to create buffer\n%s\n", err_code(err));
return EXIT_FAILURE;
}
sizeof(float) * size, NULL, &err);
checkError(err, "Creating buffer d_c");


//--------------------------------------------------------------------------------
Expand All @@ -165,91 +141,66 @@ int main(void)

// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error: could not create program\n%s\n", err_code(err));
return EXIT_FAILURE;
}
// Build the program
checkError(err, "Creating program");

// Build the program
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];

printf("Error: Failed to build program executable!\n%s\n", err_code(err));
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
return EXIT_FAILURE;
}

// Create the compute kernel from the program
kernel = clCreateKernel(program, "mmul", &err);
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n%s\n", err_code(err));
return EXIT_FAILURE;
}
checkError(err, "Creating kernel");

printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim);
printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N);

// Do the multiplication COUNT times
for (int i = 0; i < COUNT; i++)
{
zero_mat(Ndim, Mdim, h_C);

err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim);
err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim);
err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c);
zero_mat(N, h_C);

if (err != CL_SUCCESS)
{
printf("Error: Could not set kernel arguments\n");
return EXIT_FAILURE;
}
err = clSetKernelArg(kernel, 0, sizeof(int), &N);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c);
checkError(err, "Setting kernel arguments");

start_time = wtime();

// Execute the kernel over the entire range of C matrix elements ... computing
// a dot product for each element of the product matrix. The local work
// group size is set to NULL ... so I'm telling the OpenCL runtime to
// figure out a local work group size for me.
const size_t global[2] = {Ndim, Mdim};
const size_t global[2] = {N, N};
err = clEnqueueNDRangeKernel(
commands,
kernel,
2, NULL,
global, NULL,
0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to execute kernel\n%s\n", err_code(err));
return EXIT_FAILURE;
}

checkError(err, "Enqueuing kernel");

err = clFinish(commands);
if (err != CL_SUCCESS)
{
printf("Error: waiting for queue to finish failed\n%s\n", err_code(err));
return EXIT_FAILURE;
}
checkError(err, "Waiting for commands to finish");

run_time = wtime() - start_time;

err = clEnqueueReadBuffer(
commands, d_c, CL_TRUE, 0,
sizeof(float) * szC, h_C,
sizeof(float) * size, h_C,
0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to read buffer\n%s\n", err_code(err));
return EXIT_FAILURE;
}
checkError(err, "Reading back buffer d_c");

results(Mdim, Ndim, Pdim, h_C, run_time);
results(N, h_C, run_time);

} // end for loop

Expand Down
Loading

0 comments on commit cd3fc29

Please sign in to comment.