CUDA练习两则

CUDA-homework-1

Start from the provided skeleton code error-test.cu that provides some convenience macros for error checking. The macros are defined in the header file error_checks_1.h. Add the missing memory allocations and copies and the kernel launch and check that your code works.

Remember that you can use also cuda-memcheck! If you have time, you can also check what happens if you remove all error checks and do the same tests again.

What happens if you try to launch kernel with too large block size? When do you catch the error if you remove the cudaStreamSynchronize(NULL) call

过大 block size 会抢占计算资源。在 easyHPC 上面交了(vector_add<<<1, 1025>>>(dC, dA, dB, N);)之后,出现了这样的报错

Error: vector_add kernel at 0_4323.cu(86): invalid configuration argument
yhrun: error: gn07: task 1: Exited with exit code 1

删去设备同步的调用之后,报错信息如下。

Error: vector_add kernel at 0_4323.cu(86): invalid configuration argument
yhrun: error: gn07: task 0: Exited with exit code 1

What happens if you try to dereference a pointer to device memory in host code

运行时在解引用的地方报 error 了。

Error: vector_add kernel at 0_4323.cu(85): invalid configuration argument
yhrun: error: gn07: task 0: Exited with exit code 1

What if you try to access host memory from the kernel

编译时报错。

0_4323.cu(48): error: identifier "hb" is undefined in device code

1 error detected in the compilation of "/tmp/tmpxft_0000440e_00000000-11_0_4323.cpp2.i".

slurmd[gn07]: execve(): 0_4323.cu.out: No such file or directory
yhrun: error: gn07: task 0: Exited with exit code 2
rm: cannot remove ‘0_4323.cu.out’: No such file or directory

提交代码

#include <stdio.h>
#include <math.h>
//#include "error_checks.h" // Macros CUDA_CHECK and CHECK_ERROR_MSG
// This header provides two helper macros for error checking
// See the exercise skeletons and answers for usage examples.

#ifndef COURSE_UTIL_H_
#define COURSE_UTIL_H_

#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__)
#define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__)

inline void __checkErrorFunc(cudaError_t errarg, const char *file,
							 const int line)
{
	if (errarg)
	{
		fprintf(stderr, "Error at %s(%i)\n", file, line);
		exit(EXIT_FAILURE);
	}
}

inline void __checkErrMsgFunc(const char *errstr, const char *file,
							  const int line)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Error: %s at %s(%i): %s\n",
				errstr, file, line, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

#endif

__global__ void vector_add(double *C, const double *A, const double *B, int N)
{
	// Add the kernel code
	int idx = blockIdx.x * blockDim.x + threadIdx.x;

	// Do not try to access past the allocated memory
	if (idx < N)
	{
		C[idx] = A[idx] + B[idx];
	}
}

int main(void)
{
	const int N = 20;
	const int ThreadsInBlock = 128;
	double *dA, *dB, *dC;
	double hA[N], hB[N], hC[N];

	for (int i = 0; i < N; ++i)
	{
		hA[i] = (double)i;
		hB[i] = (double)i * i;
	}

	/*
       Add memory allocations and copies. Wrap your runtime function
       calls with CUDA_CHECK( ) macro
    */
	CUDA_CHECK(cudaMalloc((void **)&dA, sizeof(double) * N));
	CUDA_CHECK(cudaMalloc((void **)&dB, sizeof(double) * N));
	CUDA_CHECK(cudaMalloc((void **)&dC, sizeof(double) * N));
	CUDA_CHECK(cudaMemcpy(dA, hA, sizeof(double) * N, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(dB, hB, sizeof(double) * N, cudaMemcpyHostToDevice));
	//#error Add the remaining memory allocations and copies

	// Note the maximum size of threads in a block
	dim3 grid, threads;

	//// Add the kernel call here
	vector_add<<<1, 32>>>(dC, dA, dB, N);
	//#error Add the CUDA kernel call

	// Here we add an explicit synchronization so that we catch errors
	// as early as possible. Don't do this in production code!
	cudaStreamSynchronize(NULL);
	CHECK_ERROR_MSG("vector_add kernel");

	//// Copy back the results and free the device memory
	CUDA_CHECK(cudaMemcpy(hC, dC, sizeof(double) * N, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaFree(dA));
	CUDA_CHECK(cudaFree(dB));
	CUDA_CHECK(cudaFree(dC));
	//#error Copy back the results and free the allocated memory

	for (int i = 0; i < N; i++)
		printf("%5.1f\n", hC[i]);

	return 0;
}

CUDA-homework-2

In this exercise we will implement a Jacobi iteration which is a very simple finite-difference scheme. Familiarize yourself with the provided skeleton. Then implement following things:

Write the missing CUDA kernel sweepGPU that implements the same algorithm as the sweepCPU function. Check that the reported averate difference is in the order of the numerical accuracy.

Experiment with different grid and block sizes and compare the execution times.

代码

//err_checker.h
// This header provides two helper macros for error checking
// See the exercise skeletons and answers for usage examples.

#ifndef COURSE_UTIL_H_
#define COURSE_UTIL_H_

#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(errarg) __checkErrorFunc(errarg, __FILE__, __LINE__)
#define CHECK_ERROR_MSG(errstr) __checkErrMsgFunc(errstr, __FILE__, __LINE__)

inline void __checkErrorFunc(cudaError_t errarg, const char *file,
							 const int line)
{
	if (errarg)
	{
		fprintf(stderr, "Error at %s(%i)\n", file, line);
		exit(EXIT_FAILURE);
	}
}

inline void __checkErrMsgFunc(const char *errstr, const char *file,
							  const int line)
{
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Error: %s at %s(%i): %s\n",
				errstr, file, line, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}

#endif
//jacbi.h
#ifndef EX3_H_
#define EX3_H_

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/zip_iterator.h>

// Helper function prototypes
double compareArrays(const double *a, const double *b, int N);
double diffCPU(const double *a, const double *b, int N);
void sweepCPU(double *phi, const double *phiPrev,
			  const double *source, double h2, int N);

/* -------------------------------------------------------------------------
   EXTRACURRICULAR ACTIVITIES

   This part provides the reduction operation (in this case summation of
   difference of two arrays) using thrust library. Thrust mimics the
   syntax and design of standard template library (STL) of C++. Thrust is
   also a part of CUDA 4 SDK.
   More information can be found from thrust home page
   http://code.google.com/p/thrust/
   ----------------------------------------------------------------------- */

template <typename T>
class square_diff_thr : public thrust::unary_function<thrust::tuple<T, T>, T>
{
public:
	__host__ __device__
		T
		operator()(const thrust::tuple<T, T> &x) const
	{
		return (thrust::get<1>(x) - thrust::get<0>(x)) *
			   (thrust::get<1>(x) - thrust::get<0>(x));
	}
};

template <typename T>
class square_thr : public thrust::unary_function<T, T>
{
public:
	__host__ __device__
		T
		operator()(const T &x) const
	{
		return x * x;
	}
};

template <typename T>
T diffGPU(T *A_d, T *B_d, int N)
{
	typedef thrust::device_ptr<T> FloatIterator;
	typedef thrust::tuple<FloatIterator, FloatIterator> IteratorTuple;
	typedef thrust::zip_iterator<IteratorTuple> ZipIterator;

	thrust::device_ptr<T> A_ptr(A_d);
	thrust::device_ptr<T> B_ptr(B_d);

	ZipIterator first =
		thrust::make_zip_iterator(thrust::make_tuple(A_ptr, B_ptr));
	ZipIterator last =
		thrust::make_zip_iterator(thrust::make_tuple(A_ptr + N * N,
													 B_ptr + N * N));

	T a1 = thrust::transform_reduce(first, last, square_diff_thr<T>(),
									static_cast<T>(0), thrust::plus<T>());
	T a2 = thrust::transform_reduce(B_ptr, B_ptr + N * N,
									square_thr<T>(), static_cast<T>(0),
									thrust::plus<T>());

	return sqrt(a1 / a2);
}

#endif // EX3_H_

//jacobi.cu
#include <time.h>
#include <stdio.h>
//#include "jacobi.h"
//#include "error_checks.h"

// Change this to 0 if CPU reference result is not needed
#define COMPUTE_CPU_REFERENCE 1
#define MAX_ITERATIONS 3000

// CPU kernel
void sweepCPU(double *phi, const double *phiPrev, const double *source,
			  double h2, int N)
{
	int i, j;
	int index, i1, i2, i3, i4;

	for (j = 1; j < N - 1; j++)
	{
		for (i = 1; i < N - 1; i++)
		{
			index = i + j * N;
			i1 = (i - 1) + j * N;
			i2 = (i + 1) + j * N;
			i3 = i + (j - 1) * N;
			i4 = i + (j + 1) * N;
			phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] +
								 phiPrev[i3] + phiPrev[i4] -
								 h2 * source[index]);
		}
	}
}

// GPU kernel
__global__ void sweepGPU(double *phi, const double *phiPrev, const double *source, double h2, int N)
{
	// #error Add here the GPU version of the update routine (see sweepCPU above)
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y * blockDim.y + threadIdx.y;
	if (i > 0 && j > 0 && i < N - 1 && j < N - 1)
	{ // be careful!
		int index = i + j * N;
		int i1 = (i - 1) + j * N;
		int i2 = (i + 1) + j * N;
		int i3 = i + (j - 1) * N;
		int i4 = i + (j + 1) * N;
		phi[index] = 0.25 * (phiPrev[i1] + phiPrev[i2] + phiPrev[i3] + phiPrev[i4] - h2 * source[index]);
	}
}

double compareArrays(const double *a, const double *b, int N)
{
	double error = 0.0;
	int i;
	for (i = 0; i < N * N; i++)
	{
		error += fabs(a[i] - b[i]);
	}
	return error / (N * N);
}

double diffCPU(const double *phi, const double *phiPrev, int N)
{
	int i;
	double sum = 0;
	double diffsum = 0;

	for (i = 0; i < N * N; i++)
	{
		diffsum += (phi[i] - phiPrev[i]) * (phi[i] - phiPrev[i]);
		sum += phi[i] * phi[i];
	}

	return sqrt(diffsum / sum);
}

int main()
{
	clock_t t1, t2; // Structs for timing
	const int N = 512;
	double h = 1.0 / (N - 1);
	int iterations;
	const double tolerance = 5e-4; // Stopping condition
	int i, j, index;

	const int blocksize = 16;

	double *phi = new double[N * N];
	double *phiPrev = new double[N * N];
	double *source = new double[N * N];
	double *phi_cuda = new double[N * N];

	double *phi_d, *phiPrev_d, *source_d;
	// Size of the arrays in bytes
	const int size = N * N * sizeof(double);
	double diff;

	// Source initialization
	for (i = 0; i < N; i++)
	{
		for (j = 0; j < N; j++)
		{
			double x, y;
			x = (i - N / 2) * h;
			y = (j - N / 2) * h;
			index = j + i * N;
			if (((x - 0.25) * (x - 0.25) + y * y) < 0.1 * 0.1)
				source[index] = 1e10 * h * h;
			else if (((x + 0.25) * (x + 0.25) + y * y) < 0.1 * 0.1)
				source[index] = -1e10 * h * h;
			else
				source[index] = 0.0;
		}
	}

	CUDA_CHECK(cudaMalloc((void **)&source_d, size));
	CUDA_CHECK(cudaMemcpy(source_d, source, size, cudaMemcpyHostToDevice));

	// Reset values to zero
	for (i = 0; i < N; i++)
	{
		for (j = 0; j < N; j++)
		{
			index = j + i * N;
			phi[index] = 0.0;
			phiPrev[index] = 0.0;
		}
	}

	CUDA_CHECK(cudaMalloc((void **)&phi_d, size));
	CUDA_CHECK(cudaMalloc((void **)&phiPrev_d, size));
	CUDA_CHECK(cudaMemcpy(phi_d, phi, size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(phiPrev_d, phiPrev, size, cudaMemcpyHostToDevice));

	// CPU version
	if (COMPUTE_CPU_REFERENCE)
	{
		t1 = clock();

		// Do sweeps untill difference is under the tolerance
		diff = tolerance * 2;
		iterations = 0;
		while (diff > tolerance && iterations < MAX_ITERATIONS)
		{
			sweepCPU(phiPrev, phi, source, h * h, N);
			sweepCPU(phi, phiPrev, source, h * h, N);

			iterations += 2;
			if (iterations % 100 == 0)
			{
				diff = diffCPU(phi, phiPrev, N);
				printf("%d %g\n", iterations, diff);
			}
		}
		t2 = clock();
		printf("CPU Jacobi: %g ms, %d iterations\n",
			   t2 - t1,
			   iterations);
	}

	// GPU version

	dim3 dimBlock(blocksize, blocksize);
	dim3 dimGrid((N + blocksize - 1) / blocksize, (N + blocksize - 1) / blocksize);

	//do sweeps until diff under tolerance
	diff = tolerance * 2;
	iterations = 0;

	t1 = clock();

	while (diff > tolerance && iterations < MAX_ITERATIONS)
	{
		// See above how the CPU update kernel is called
		// and implement similar calling sequence for the GPU code

		//// Add routines here
		sweepGPU<<<dimGrid, dimBlock>>>(phiPrev_d, phi_d, source_d, h * h, N);
		sweepGPU<<<dimGrid, dimBlock>>>(phi_d, phiPrev_d, source_d, h * h, N);
		//#error Add GPU kernel calls here (see CPU version above)

		iterations += 2;

		if (iterations % 100 == 0)
		{
			// diffGPU is defined in the header file, it uses
			// Thrust library for reduction computation
			diff = diffGPU<double>(phiPrev_d, phi_d, N);
			CHECK_ERROR_MSG("Difference computation");
			printf("%d %g\n", iterations, diff);
		}
	}

	//// Add here the routine to copy back the results
	CUDA_CHECK(cudaMemcpy(phi, phi_d, size, cudaMemcpyDeviceToHost));
	CUDA_CHECK(cudaMemcpy(phiPrev, phiPrev_d, size, cudaMemcpyDeviceToHost));
	//#error Copy back the results

	t2 = clock();
	printf("GPU Jacobi: %g ms, %d iterations\n",
		   t2 - t1,
		   iterations);

	//// Add here the clean up code for all allocated CUDA resources
	CUDA_CHECK(cudaFree(phi_d));
	CUDA_CHECK(cudaFree(phiPrev_d));
	CUDA_CHECK(cudaFree(source_d));
	//#error Add here the clean up code

	if (COMPUTE_CPU_REFERENCE)
	{
		printf("Average difference is %g\n", compareArrays(phi, phi_cuda, N));
	}

	delete[] phi;
	delete[] phi_cuda;
	delete[] phiPrev;
	delete[] source;

	return EXIT_SUCCESS;
}