From e531fadbe37287e39ed490b41011bad4cf09aa6b Mon Sep 17 00:00:00 2001 From: Johannes Deselaers Date: Mon, 4 Aug 2014 13:28:17 -0700 Subject: [PATCH 1/6] Update directory paths in CMakeList --- CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 996cb1c3..f620f7af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,12 +39,12 @@ if(CUDA_FOUND) # add debugging to CUDA NVCC flags. For NVidia's NSight tools. set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G") - add_subdirectory (HW1) - add_subdirectory (HW2) - add_subdirectory (HW3) - add_subdirectory (HW4) - add_subdirectory (HW5) - add_subdirectory (HW6) + add_subdirectory (Problem\ Sets/Problem\ Set\ 1) + add_subdirectory (Problem\ Sets/Problem\ Set\ 2) + add_subdirectory (Problem\ Sets/Problem\ Set\ 3) + add_subdirectory (Problem\ Sets/Problem\ Set\ 4) + add_subdirectory (Problem\ Sets/Problem\ Set\ 5) + add_subdirectory (Problem\ Sets/Problem\ Set\ 6) else(CUDA_FOUND) message("CUDA is not installed on this system.") endif() From 96aa3b780d8bfd13e8f74cacc2484acc2ef2b942 Mon Sep 17 00:00:00 2001 From: Johannes Deselaers Date: Wed, 6 Aug 2014 19:06:02 -0700 Subject: [PATCH 2/6] added Homeworks 1-3 --- Problem Sets/Problem Set 1/Makefile | 2 +- Problem Sets/Problem Set 1/student_func.cu | 63 ++++---- Problem Sets/Problem Set 2/student_func.cu | 28 ++-- Problem Sets/Problem Set 3/student_func.cu | 162 ++++++++++++++++++--- 4 files changed, 196 insertions(+), 59 deletions(-) diff --git a/Problem Sets/Problem Set 1/Makefile b/Problem Sets/Problem Set 1/Makefile index bc0b2e5d..281df44f 100755 --- a/Problem Sets/Problem Set 1/Makefile +++ b/Problem Sets/Problem Set 1/Makefile @@ -22,7 +22,7 @@ OPENCV_INCLUDEPATH=/usr/include OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui -CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include +CUDA_INCLUDEPATH=/usr/local/cuda/include ###################################################### # On Macs the default install locations are below # diff --git a/Problem Sets/Problem Set 1/student_func.cu b/Problem Sets/Problem Set 1/student_func.cu index 452b379f..0aeb3eb2 100755 --- a/Problem Sets/Problem Set 1/student_func.cu +++ b/Problem Sets/Problem Set 1/student_func.cu @@ -2,11 +2,11 @@ // Color to Greyscale Conversion //A common way to represent color images is known as RGBA - the color -//is specified by how much Red, Grean and Blue is in it. -//The 'A' stands for Alpha and is used for transparency, it will be +//is specified by how much Red, Green, and Blue is in it. +//The 'A' stands for Alpha and is used for transparency; it will be //ignored in this homework. -//Each channel Red, Blue, Green and Alpha is represented by one byte. +//Each channel Red, Blue, Green, and Alpha is represented by one byte. //Since we are using one byte for each color there are 256 different //possible values for each color. This means we use 4 bytes per pixel. @@ -15,7 +15,7 @@ //To convert an image from color to grayscale one simple method is to //set the intensity to the average of the RGB channels. But we will -//use a more sophisticated method that takes into account how the eye +//use a more sophisticated method that takes into account how the eye //perceives color and weights the channels unequally. //The eye responds most strongly to green followed by red and then blue. @@ -24,43 +24,50 @@ //I = .299f * R + .587f * G + .114f * B -//Notice the trailing f's on the numbers which indicate that they are +//Notice the trailing f's on the numbers which indicate that they are //single precision floating point constants and not double precision //constants. //You should fill in the kernel as well as set the block and grid sizes //so that the entire image is processed. +#include "reference_calc.cpp" #include "utils.h" +#include __global__ void rgba_to_greyscale(const uchar4* const rgbaImage, - unsigned char* const greyImage, - int numRows, int numCols) +unsigned char* const greyImage, +int numRows, int numCols) { - //TODO - //Fill in the kernel to convert from color to greyscale - //the mapping from components of a uchar4 to RGBA is: - // .x -> R ; .y -> G ; .z -> B ; .w -> A - // - //The output (greyImage) at each pixel should be the result of - //applying the formula: output = .299f * R + .587f * G + .114f * B; - //Note: We will be ignoring the alpha channel for this conversion - - //First create a mapping from the 2D block and grid locations - //to an absolute 2D location in the image, then use that to - //calculate a 1D offset + //TODO + //Fill in the kernel to convert from color to greyscale + //the mapping from components of a uchar4 to RGBA is: + // .x -> R ; .y -> G ; .z -> B ; .w -> A + // + //The output (greyImage) at each pixel should be the result of + //applying the formula: output = .299f * R + .587f * G + .114f * B; + //Note: We will be ignoring the alpha channel for this conversion + + //First create a mapping from the 2D block and grid locations + //to an absolute 2D location in the image, then use that to + //calculate a 1D offset + int x = blockIdx.x; + int y = blockIdx.y; + uchar4 colorValue = rgbaImage[x * numCols + y]; + float greyValue = 0.299f * colorValue.x + 0.587f * colorValue.y + 0.114f * colorValue.z; + greyImage[x * numCols + y] = greyValue; } void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage, - unsigned char* const d_greyImage, size_t numRows, size_t numCols) +unsigned char* const d_greyImage, size_t numRows, size_t numCols) { - //You must fill in the correct sizes for the blockSize and gridSize - //currently only one block with one thread is being launched - const dim3 blockSize(1, 1, 1); //TODO - const dim3 gridSize( 1, 1, 1); //TODO - rgba_to_greyscale<<>>(d_rgbaImage, d_greyImage, numRows, numCols); - - cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); - + //You must fill in the correct sizes for the blockSize and gridSize + //currently only one block with one thread is being launched + const dim3 threadsPerBlock( 1, 1, 1); //TODO + const dim3 numberOfBlocks( numRows, numCols, 1); //TODO + rgba_to_greyscale<<>>(d_rgbaImage, d_greyImage, numRows, numCols); + + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); } + diff --git a/Problem Sets/Problem Set 2/student_func.cu b/Problem Sets/Problem Set 2/student_func.cu index 825e412b..30f5402b 100755 --- a/Problem Sets/Problem Set 2/student_func.cu +++ b/Problem Sets/Problem Set 2/student_func.cu @@ -117,11 +117,11 @@ void gaussian_blur(const unsigned char* const inputChannel, // the image. You'll want code that performs the following check before accessing // GPU memory: // - // if ( absolute_image_position_x >= numCols || - // absolute_image_position_y >= numRows ) - // { - // return; - // } + if ( absolute_image_position_x >= numCols || + absolute_image_position_y >= numRows ) + { + return; + } // NOTE: If a thread's absolute position 2D position is within the image, but some of // its neighbors are outside the image, then you will need to be extra careful. Instead @@ -147,11 +147,11 @@ void separateChannels(const uchar4* const inputImageRGBA, // the image. You'll want code that performs the following check before accessing // GPU memory: // - // if ( absolute_image_position_x >= numCols || - // absolute_image_position_y >= numRows ) - // { - // return; - // } + if ( absolute_image_position_x >= numCols || + absolute_image_position_y >= numRows ) + { + return; + } } //This kernel takes in three color channels and recombines them @@ -205,6 +205,7 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI //be sure to use checkCudaErrors like the above examples to //be able to tell if anything goes wrong //IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc + checkCudaErrors(cudaMalloc); //TODO: //Copy the filter on the host (h_filter) to the memory you just allocated @@ -221,15 +222,16 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_ const int filterWidth) { //TODO: Set reasonable block size (i.e., number of threads per block) - const dim3 blockSize; + const dim3 blockSize( 1, 1, 1); //TODO: //Compute correct grid size (i.e., number of blocks per kernel launch) //from the image size and and block size. - const dim3 gridSize; + const dim3 gridSize( 1, 1, 1); //TODO: Launch a kernel for separating the RGBA image into different color channels - + seperateChannels<<>>(...); + // Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after // launching your kernel to make sure that you didn't make any mistakes. cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); diff --git a/Problem Sets/Problem Set 3/student_func.cu b/Problem Sets/Problem Set 3/student_func.cu index 26f00a74..a7f13fdc 100755 --- a/Problem Sets/Problem Set 3/student_func.cu +++ b/Problem Sets/Problem Set 3/student_func.cu @@ -81,24 +81,152 @@ #include "utils.h" -void your_histogram_and_prefixsum(const float* const d_logLuminance, - unsigned int* const d_cdf, - float &min_logLum, - float &max_logLum, - const size_t numRows, - const size_t numCols, - const size_t numBins) +__global__ void reduceLuminance(const float* const d_logLuminance, bool isMax, float *result, int inputSize) { - //TODO - /*Here are the steps you need to implement - 1) find the minimum and maximum value in the input logLuminance channel - store in min_logLum and max_logLum - 2) subtract them to find the range - 3) generate a histogram of all the values in the logLuminance channel using - the formula: bin = (lum[i] - lumMin) / lumRange * numBins - 4) Perform an exclusive scan (prefix sum) on the histogram to get - the cumulative distribution of luminance values (this should go in the - incoming d_cdf pointer which already has been allocated for you) */ + // // sdata is allocated in the kernel call: 3rd arg to <<>> + extern __shared__ float sdata[]; + + int myId = threadIdx.x + blockDim.x * blockIdx.x; + int tid = threadIdx.x; + + // load shared mem from global mem + if(myId < inputSize) + { + sdata[tid] = d_logLuminance[myId]; + } + __syncthreads(); // make sure entire block is loaded! + + // do reduction in shared mem + for (int s = blockDim.x / 2; s > 0; s >>= 1) + { + if (tid < s && (myId+s) < inputSize) + { + float val1=sdata[tid]; + float val2=sdata[tid + s]; + sdata[tid] = isMax?(val1 > val2 ? val1 : val2) : (val1 < val2 ? val1 : val2); + } + __syncthreads(); // make sure all adds at one stage are done! + } + + // only thread 0 writes result for this block back to global mem + if (tid == 0) + { + result[blockIdx.x] = sdata[0]; + } +} + +__global__ void assignHistogram(int *d_bins, const float *d_in, int numBins, float lumRange, float lumMin, int inputSize) +{ + int myId = threadIdx.x + blockDim.x * blockIdx.x; + + if(myId>> + extern __shared__ int shareddata[]; + + int myId = threadIdx.x + blockDim.x * blockIdx.x; + int tid = threadIdx.x; + + // load shared mem from global mem + if(myId < inputSize) + { + shareddata[tid] = d_bins[myId]; + } + __syncthreads(); // make sure entire block is loaded! + + //Step hillis / Steele + for (int step = 1; step= step) + { + shareddata[tid] += shareddata[tid - step]; + } + __syncthreads(); // make sure all adds at one stage are done! + } + + // every thread writes result for this block back to global mem + if(myId < inputSize) + { + result[myId] = shareddata[tid]; + } +} + +void your_histogram_and_prefixsum(const float* const d_logLuminance, +unsigned int* const d_cdf, +float &min_logLum, +float &max_logLum, +const size_t numRows, +const size_t numCols, +const size_t numBins) +{ + //TODO + /*Here are the steps you need to implement + 1) find the minimum and maximum value in the input logLuminance channel + store in min_logLum and max_logLum + 2) subtract them to find the range + 3) generate a histogram of all the values in the logLuminance channel using + the formula: bin = (lum[i] - lumMin) / lumRange * numBins + 4) Perform an exclusive scan (prefix sum) on the histogram to get + the cumulative distribution of luminance values (this should go in the + incoming d_cdf pointer which already has been allocated for you) */ + + // Two step reduce on one dimension + const int inputSize = numRows * numCols; + const int maxThreadsPerBlock = 1024; + unsigned int threads = maxThreadsPerBlock; + const int blocks = (inputSize / maxThreadsPerBlock)+2; //more blocks to avoid int round loss + + const int sharedMemorySize_1 = threads * sizeof(float); + const int sharedMemorySize_2 = blocks * sizeof(float); + + float *d_intermediate, *d_result; + checkCudaErrors(cudaMalloc(&d_intermediate, sizeof(float) * blocks)); + checkCudaErrors(cudaMalloc(&d_result, sizeof(float))); + + // Step 1: min luminance + reduceLuminance<<>>(d_logLuminance, false, d_intermediate, inputSize); + reduceLuminance<<<1, blocks, sharedMemorySize_2>>>(d_intermediate, false, d_result, blocks); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaMemcpy(&min_logLum, d_result, sizeof(float), cudaMemcpyDeviceToHost)); + + // Step 1: max luminance + reduceLuminance<<>>(d_logLuminance, true, d_intermediate, inputSize); + reduceLuminance<<<1, blocks, sharedMemorySize_2>>>(d_intermediate, true, d_result, blocks); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaMemcpy(&max_logLum, d_result, sizeof(float), cudaMemcpyDeviceToHost)); + + // Step 2: luminance range + float range = max_logLum - min_logLum; + + // Step 3: histogram + int *d_bins; + checkCudaErrors(cudaMalloc(&d_bins, sizeof(int) * numBins)); + checkCudaErrors(cudaMemset(d_bins, 0, sizeof(int) * numBins)); + assignHistogram<<<(inputSize/threads) +1, threads>>>(d_bins, d_logLuminance, numBins, range, min_logLum, inputSize); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + // Step 4: Exclusive Scan on d_bins + threads = 2; + while(threads < numBins) + { + threads <<= 1; + } + + scanHistogram<<<1, threads, sizeof(unsigned int) * numBins>>>(d_bins, d_cdf, numBins); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + //free resources + + cudaFree(d_intermediate); + cudaFree(d_result); + cudaFree(d_bins); } From 91ab000067b1f21aa2b670e2bfc7c861b7498410 Mon Sep 17 00:00:00 2001 From: Johannes Deselaers Date: Wed, 13 Aug 2014 10:33:29 -0700 Subject: [PATCH 3/6] added solution for HW5 --- .gitignore | 1 + Problem Sets/Problem Set 4/student_func.cu | 302 ++++++++++++++++++--- Problem Sets/Problem Set 5/student.cu | 49 +++- 3 files changed, 307 insertions(+), 45 deletions(-) diff --git a/.gitignore b/.gitignore index a22963e0..3baa1c0e 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ build bin +*gpusort-ipdps09.pdf \ No newline at end of file diff --git a/Problem Sets/Problem Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu index 347d7b6e..a7b19922 100755 --- a/Problem Sets/Problem Set 4/student_func.cu +++ b/Problem Sets/Problem Set 4/student_func.cu @@ -3,45 +3,180 @@ #include "utils.h" #include +#include "device_launch_parameters.h" +#include "device_functions.h" +#include /* Red Eye Removal - =============== - - For this assignment we are implementing red eye removal. This is - accomplished by first creating a score for every pixel that tells us how - likely it is to be a red eye pixel. We have already done this for you - you - are receiving the scores and need to sort them in ascending order so that we - know which pixels to alter to remove the red eye. +=============== - Note: ascending order == smallest to largest +For this assignment we are implementing red eye removal. This is +accomplished by first creating a score for every pixel that tells us how +likely it is to be a red eye pixel. We have already done this for you - you +are receiving the scores and need to sort them in ascending order so that we +know which pixels to alter to remove the red eye. - Each score is associated with a position, when you sort the scores, you must - also move the positions accordingly. +Note: ascending order == smallest to largest - Implementing Parallel Radix Sort with CUDA - ========================================== +Each score is associated with a position, when you sort the scores, you must +also move the positions accordingly. - The basic idea is to construct a histogram on each pass of how many of each - "digit" there are. Then we scan this histogram so that we know where to put - the output of each digit. For example, the first 1 must come after all the - 0s so we have to know how many 0s there are to be able to start moving 1s - into the correct position. +Implementing Parallel Radix Sort with CUDA +========================================== - 1) Histogram of the number of occurrences of each digit - 2) Exclusive Prefix Sum of Histogram - 3) Determine relative offset of each digit - For example [0 0 1 1 0 0 1] - -> [0 1 0 1 2 3 2] - 4) Combine the results of steps 2 & 3 to determine the final - output location for each element and move it there +The basic idea is to construct a histogram on each pass of how many of each +"digit" there are. Then we scan this histogram so that we know where to put +the output of each digit. For example, the first 1 must come after all the +0s so we have to know how many 0s there are to be able to start moving 1s +into the correct position. - LSB Radix sort is an out-of-place sort and you will need to ping-pong values - between the input and output buffers we have provided. Make sure the final - sorted results end up in the output buffer! Hint: You may need to do a copy - at the end. +1) Histogram of the number of occurrences of each digit +2) Exclusive Prefix Sum of Histogram +3) Determine relative offset of each digit +For example [0 0 1 1 0 0 1] +-> [0 1 0 1 2 3 2] +4) Combine the results of steps 2 & 3 to determine the final +output location for each element and move it there - */ +LSB Radix sort is an out-of-place sort and you will need to ping-pong values +between the input and output buffers we have provided. Make sure the final +sorted results end up in the output buffer! Hint: You may need to do a copy +at the end. +*/ +#define NUM_BITS 4 +#define RADIX (1 << NUM_BITS) +#define BLOCK_SIZE 256 // no more than 254 in order to fit all relative positions into the array of bytes + +__device__ __host__ unsigned int get_digit(unsigned int num, unsigned int shift) +{ + int mask = (RADIX - 1) << shift; + return (num & mask) >> shift; +} + +// histogram of digits within one block +__global__ void hist_digits(const unsigned int* const in, + const size_t size, + unsigned int shift, + const int nBins, + unsigned int* const outBins) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) + { + return; + } + + unsigned int digit = get_digit(in[idx], shift); + + atomicAdd(&outBins[digit], 1); +} + +// # of each digit within a block. +// used to compute relative positions of each digit within the array +__global__ void num_digits_per_block(const unsigned int* const in, + const size_t size, + unsigned int shift, + const int nBins, + unsigned int* const outBins) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) + { + return; + } + + unsigned int digit = get_digit(in[idx], shift); + + atomicAdd(&outBins[blockIdx.x * RADIX + digit], 1); +} + +// add per-block cdf of digits to get a truly relative position of the digit (relative to the start of the sequence) +// and then add cdf of all digits to get the absolute position of the element in the new order +__global__ void get_absolute_positions(const unsigned int * const in, unsigned int * const relPos, size_t size, unsigned int shift, unsigned int * const maxVals, unsigned int * const digitCdf) +{ + int idx = blockDim.x * blockIdx.x + threadIdx.x; + if (idx >= size) + { + return; + } + unsigned int digit = get_digit(in[idx], shift); + + relPos[idx] += maxVals[blockIdx.x * RADIX + digit]; + relPos[idx] += digitCdf[digit]; +} + +// Build a relative positional scan of digit, shifted by "shift" +// in - array of numbers to sort +// inPos - positions of the keys that also need to move +// size - size of the array +// shift - which significant digit are we concerned about +// rel - array of relative positions of the numbers in the in array (relative to current block), based on the digit of interest. +__global__ void rel_pos_per_block(const unsigned int * const in, + const size_t size, unsigned int shift, + unsigned int * const rel) +{ + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int digit_pos = get_digit(in[idx], shift); + + int offset = digit_pos * blockDim.x; + + __shared__ unsigned int relativePosition[RADIX * BLOCK_SIZE]; + for(int i = 0; i < RADIX; i++) + { + relativePosition[threadIdx.x + i * blockDim.x] = 0; + } + __syncthreads(); + + // initialize to the "startup" positions for each digit. + // since we always set them to "1" and the scan is supposed + // to be exclusive, we will need to subtract "1" at the end + relativePosition[threadIdx.x + offset] = 1; + + // accumulate position values for each digit + for(int step = 1; step < size; step <<= 1) + { + for(int j = 0; j < RADIX; j++) + { + __syncthreads(); + // position start of the digit in the relatvePosition array + int offset = j * blockDim.x; + int tSum = relativePosition[threadIdx.x + offset]; + + if(threadIdx.x >= step) + { + tSum += relativePosition[threadIdx.x + offset - step]; + } + __syncthreads(); + relativePosition[threadIdx.x + offset] = tSum; + } + } + + __syncthreads(); + + rel[idx] = relativePosition[threadIdx.x + offset] - 1; +} + +// Convert relative postions to absolute ones by adding up to the histogram +// rel array of relative positions +// cdf - exclusive cdf of the counts histogram +__global__ void move_to_out(unsigned int * const dest, + unsigned int * const destPos, + unsigned int * const source, + unsigned int * const sourcePos, + unsigned int * const rel, + size_t size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= size) + { + return; + } + + int pos = rel[idx]; + dest[pos] = source[idx]; + destPos[pos] = sourcePos[idx]; +} void your_sort(unsigned int* const d_inputVals, unsigned int* const d_inputPos, @@ -49,6 +184,109 @@ void your_sort(unsigned int* const d_inputVals, unsigned int* const d_outputPos, const size_t numElems) { - //TODO - //PUT YOUR SORT HERE -} + + const int blockSize = BLOCK_SIZE; + int nBlocks = (numElems + blockSize) / blockSize; + int nWholeBlocks = numElems / blockSize; + int remainder = numElems % blockSize; // elements that don't fit into the blocks must be handled separately + bool swapInOut = false; //flag to show if we need to swap input & output at the end + + unsigned int *d_outBins, *d_rel, *d_maxDigitPerBlock; + + std::auto_ptr h_hist(new unsigned int[RADIX]); + std::auto_ptr cdf(new unsigned int [RADIX]); + std::auto_ptr h_rel(new unsigned int[numElems]); + std::auto_ptr h_digits_per_block(new unsigned int[RADIX * nBlocks]); + std::auto_ptr cdf_per_block(new unsigned int[RADIX * nBlocks]); + + checkCudaErrors(cudaMalloc(&d_outBins, RADIX * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_rel, numElems * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_maxDigitPerBlock, nBlocks * RADIX * sizeof(int))); + + for (unsigned int shift = 0; shift < 8 * sizeof(unsigned int); shift += NUM_BITS) + { + unsigned int * d_input, * d_output, * d_inPos, * d_outPos; + if (!swapInOut) + { + d_input = d_inputVals; + d_output = d_outputVals; + d_inPos = d_inputPos; + d_outPos = d_outputPos; + } + else + { + d_output = d_inputVals; + d_input = d_outputVals; + d_outPos = d_inputPos; + d_inPos = d_outputPos; + } + + //1. Compute histogram of digits + checkCudaErrors(cudaMemset(d_outBins, 0, RADIX * sizeof(int))); + + hist_digits<<>>(d_input, numElems, shift, RADIX, d_outBins); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + checkCudaErrors(cudaMemcpy(h_hist.get(), d_outBins, RADIX * sizeof(int), cudaMemcpyDeviceToHost)); + + //2. Compute exclusive scan of the histogram + cdf.get()[0] = 0; + + for(int digit = 1; digit < RADIX; digit++) + { + cdf.get()[digit] = cdf.get()[digit - 1] + h_hist.get()[digit - 1]; + } + + //3. Compute relative position of each digit per block. + //a. Compute # of each digit per block + checkCudaErrors(cudaMemset(d_maxDigitPerBlock, 0, RADIX * nBlocks * sizeof(int))); + num_digits_per_block<<>>(d_input, numElems, shift, RADIX, d_maxDigitPerBlock); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + checkCudaErrors(cudaMemcpy(h_digits_per_block.get(), d_maxDigitPerBlock, RADIX * sizeof(int) * nBlocks, cudaMemcpyDeviceToHost)); + + //b. Compute exclusive scan per block of numbers of digits per block + memset(cdf_per_block.get(), 0, RADIX * nBlocks * sizeof(unsigned int)); + + for(int block = 1; block < nBlocks; block++) + { + for(int digit = 0; digit < RADIX; digit++) + { + cdf_per_block.get()[block * RADIX + digit] = cdf_per_block.get()[(block - 1) * RADIX + digit] + h_digits_per_block.get()[(block - 1) * RADIX + digit]; + } + } + + //c. Compute relative position + rel_pos_per_block<<>>(d_input, blockSize, shift, d_rel); + + if(remainder > 0) + { + rel_pos_per_block<<<1, remainder>>>(&d_input[nWholeBlocks * blockSize], remainder, shift, &d_rel[nWholeBlocks * blockSize]); + } + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + // d. Add things up to get the real relative position of each digit (relative to the start of the sequence, not per block) + checkCudaErrors(cudaMemcpy(d_maxDigitPerBlock, cdf_per_block.get(), RADIX * nBlocks * sizeof(int), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_outBins, cdf.get(), RADIX * sizeof(int), cudaMemcpyHostToDevice)); + + get_absolute_positions<<>>(d_input, d_rel, numElems, shift, d_maxDigitPerBlock, d_outBins); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + //4. now we can update the output + move_to_out<<>>(d_output, d_outPos, d_input, d_inPos, d_rel, numElems); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + swapInOut = !swapInOut; + } + + // when it's all over we must make sure output values are where they ought to be. + if(!swapInOut) + { + checkCudaErrors(cudaMemcpy(d_outputVals, d_inputVals, numElems * sizeof(unsigned int), cudaMemcpyDeviceToDevice)); + checkCudaErrors(cudaMemcpy(d_outputPos, d_inputPos, numElems * sizeof(unsigned int), cudaMemcpyDeviceToDevice)); + } + + checkCudaErrors(cudaFree(d_outBins)); + checkCudaErrors(cudaFree(d_rel)); + checkCudaErrors(cudaFree(d_maxDigitPerBlock)); + diff --git a/Problem Sets/Problem Set 5/student.cu b/Problem Sets/Problem Set 5/student.cu index 00dd306f..05ceb6dd 100755 --- a/Problem Sets/Problem Set 5/student.cu +++ b/Problem Sets/Problem Set 5/student.cu @@ -24,20 +24,42 @@ */ - #include "utils.h" +#include "reference.cpp" + +#define elementsPerThread 64 __global__ void yourHisto(const unsigned int* const vals, //INPUT unsigned int* const histo, //OUPUT - int numVals) + const int numVals, + const int numBins) { - //TODO fill in this kernel to calculate the histogram - //as quickly as possible - - //Although we provide only one kernel skeleton, - //feel free to use more if it will help you - //write faster code + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + + // create sharedMemory histogram and initialize to 0 + extern __shared__ unsigned int localHistogram[]; + for (int i = tid; i < numBins; i += blockDim.x) { + localHistogram[i] = 0; + } + + __syncthreads(); + + // use offset to ensure coalescing read from global memory + // each thread reads with a stride of the offset + // parallel threads read one coalesced piece of memory + int offset = numVals / elementsPerThread; + for(int i = 0; i < elementsPerThread; ++i) + { + atomicAdd(&(localHistogram[vals[index + offset*i]]),1); + } + + __syncthreads(); + + for (int i = tid; i < numBins; i += blockDim.x) { + atomicAdd(&histo[i], localHistogram[i]); + } } void computeHistogram(const unsigned int* const d_vals, //INPUT @@ -45,10 +67,11 @@ void computeHistogram(const unsigned int* const d_vals, //INPUT const unsigned int numBins, const unsigned int numElems) { - //TODO Launch the yourHisto kernel - - //if you want to use/launch more than one kernel, - //feel free - + int powerOfTwo = 4; + int threadsPerBlock = 1024/powerOfTwo; + int blocks = (numElems + (threadsPerBlock*elementsPerThread) - 1) / (threadsPerBlock*elementsPerThread); + + yourHisto<<>>(d_vals, d_histo, numElems, numBins); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); } + From f3c28d0d9adba6e44f3f5fa5a2bbe09dc4036896 Mon Sep 17 00:00:00 2001 From: Johannes Deselaers Date: Wed, 13 Aug 2014 11:21:51 -0700 Subject: [PATCH 4/6] added missing semicolon --- Problem Sets/Problem Set 5/student.cu | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Problem Sets/Problem Set 5/student.cu b/Problem Sets/Problem Set 5/student.cu index 05ceb6dd..43ca76cd 100755 --- a/Problem Sets/Problem Set 5/student.cu +++ b/Problem Sets/Problem Set 5/student.cu @@ -27,7 +27,9 @@ #include "utils.h" #include "reference.cpp" +#define powerOfTwo 4 #define elementsPerThread 64 +#define maxThreadsPerBlock 1024 __global__ void yourHisto(const unsigned int* const vals, //INPUT @@ -67,11 +69,10 @@ void computeHistogram(const unsigned int* const d_vals, //INPUT const unsigned int numBins, const unsigned int numElems) { - int powerOfTwo = 4; - int threadsPerBlock = 1024/powerOfTwo; - int blocks = (numElems + (threadsPerBlock*elementsPerThread) - 1) / (threadsPerBlock*elementsPerThread); + int threadsPerBlock = maxThreadsPerBlock/powerOfTwo; + int elementsPerBlock = threadsPerBlock*elementsPerThread; + int blocks = (numElems + elementsPerBlock - 1) / elementsPerBlock; yourHisto<<>>(d_vals, d_histo, numElems, numBins); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); -} - +} \ No newline at end of file From 54834e0f678c14e7c12c4f70932be09cb9e194c1 Mon Sep 17 00:00:00 2001 From: Johannes Deselaers Date: Wed, 13 Aug 2014 15:18:45 -0700 Subject: [PATCH 5/6] added parts 1-4 of HW6, so far unoptimized --- Problem Sets/Problem Set 6/student_func.cu | 187 +++++++++++++++++---- 1 file changed, 153 insertions(+), 34 deletions(-) diff --git a/Problem Sets/Problem Set 6/student_func.cu b/Problem Sets/Problem Set 6/student_func.cu index 1f42ce05..9a32925e 100755 --- a/Problem Sets/Problem Set 6/student_func.cu +++ b/Problem Sets/Problem Set 6/student_func.cu @@ -62,52 +62,171 @@ In this assignment we will do 800 iterations. */ - - #include "utils.h" #include +#include "reference_calc.cpp" -void your_blend(const uchar4* const h_sourceImg, //IN - const size_t numRowsSource, const size_t numColsSource, - const uchar4* const h_destImg, //IN - uchar4* const h_blendedImg) //OUT -{ - /* To Recap here are the steps you need to implement - - 1) Compute a mask of the pixels from the source image to be copied - The pixels that shouldn't be copied are completely white, they - have R=255, G=255, B=255. Any other pixels SHOULD be copied. - 2) Compute the interior and border regions of the mask. An interior - pixel has all 4 neighbors also inside the mask. A border pixel is - in the mask itself, but has at least one neighbor that isn't. +__global__ void computeMask( + const uchar4* sourceImg, unsigned char* mask, const int numRowsSource, const int numColsSource) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index >= numRowsSource * numColsSource) return; - 3) Separate out the incoming image into three separate channels + mask[index] = ((sourceImg[index].x + sourceImg[index].y + sourceImg[index].z) < 765) ? 1: 0; +} - 4) Create two float(!) buffers for each color channel that will - act as our guesses. Initialize them to the respective color - channel of the source image since that will act as our intial guess. +__global__ void computerInteriorAndBorder( + unsigned char *mask, + unsigned char *interior, + unsigned char *border, + const int numRowsSource, + const int numColsSource) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index >= numRowsSource * numColsSource) return; + + if (index < numColsSource || index < (numColsSource*numRowsSource)-numColsSource-1) // check for access at image borders (top / bottom &? left/right) + { + interior = border = 0; + return; + } + + int neighborSum = mask[index-1] + mask[index+1] + mask[index - numRowsSource] + mask[index + numRowsSource]; + if (neighborSum == 4) + { + interior[index] = 1; + border[index] = 0; + } + else if(neighborSum == 3) + { + interior[index] = 0; + border[index] = 1; + } + else + { + interior = border = 0; + } +} - 5) For each color channel perform the Jacobi iteration described - above 800 times. +__global__ void separate_channels( + uchar4* sourceImg, + unsigned char* red, + unsigned char* green, + unsigned char* blue, + const int numRowsSource, + const int numColsSource) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= numColsSource * numRowsSource) return; - 6) Create the output image by replacing all the interior pixels - in the destination image with the result of the Jacobi iterations. - Just cast the floating point values to unsigned chars since we have - already made sure to clamp them to the correct range. + red[index] = sourceImg[index].x; + green[index] = sourceImg[index].z; + blue[index] = sourceImg[index].y; +} - Since this is final assignment we provide little boilerplate code to - help you. Notice that all the input/output pointers are HOST pointers. +__global__ void initialize( + unsigned char* source, + float* blended, + const int numRowsSource, + const int numColsSource) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= numColsSource * numRowsSource) return; - You will have to allocate all of your own GPU memory and perform your own - memcopies to get data in and out of the GPU memory. + blended[index] = static_cast(source[index]); +} - Remember to wrap all of your calls with checkCudaErrors() to catch any - thing that might go wrong. After each kernel call do: +void your_blend(const uchar4* const h_sourceImg, //IN + const size_t numRowsSource, const size_t numColsSource, + const uchar4* const h_destImg, //IN + uchar4* const h_blendedImg) //OUT +{ + uchar4 *d_sourceImg, *d_destImg, *d_blendedImg; + unsigned char *d_mask, *d_border, *d_interior; + unsigned char *d_red, *d_green, *d_blue; + + checkCudaErrors(cudaMalloc(&d_sourceImg, sizeof(uchar4) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_destImg, sizeof(uchar4) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blendedImg, sizeof(uchar4) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_mask, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_border, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_interior, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_red, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char) * numColsSource * numRowsSource)); + + checkCudaErrors(cudaMemcpy(d_sourceImg, h_sourceImg, sizeof(uchar4) * numColsSource * numRowsSource, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_destImg, h_destImg, sizeof(uchar4) * numColsSource * numRowsSource, cudaMemcpyHostToDevice)); + + // 1) Compute a mask of the pixels from the source image to be copied + // The pixels that shouldn't be copied are completely white, they + // have R=255, G=255, B=255. Any other pixels SHOULD be copied. + + const int maxThreadsPerBlock = 1024; + int threadsPerBlock = maxThreadsPerBlock; + int blocks = (numRowsSource * numColsSource + threadsPerBlock - 1) / threadsPerBlock; + + computeMask<<>>(d_sourceImg, d_mask, numRowsSource, numColsSource); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + // 2) Compute the interior and border regions of the mask. An interior + // pixel has all 4 neighbors also inside the mask. A border pixel is + // in the mask itself, but has at least one neighbor that isn't. + + computerInteriorAndBorder<<>>( + d_mask, d_interior, d_border, numRowsSource, numColsSource); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + // 3) Separate out the incoming image into three separate channels + + separate_channels<<>>(d_sourceImg, d_red, d_green, d_blue, numRowsSource, numColsSource); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + // 4) Create two float(!) buffers for each color channel that will + // act as our guesses. Initialize them to the respective color + // channel of the source image since that will act as our intial guess. + + float *d_blendedRed1; + float *d_blendedRed2; + float *d_blendedGreen1; + float *d_blendedGreen2; + float *d_blendedBlue1; + float *d_blendedBlue2; + checkCudaErrors(cudaMalloc(&d_blendedRed1, sizeof(float) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blendedRed2, sizeof(float) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blendedGreen1, sizeof(float) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blendedGreen2, sizeof(float) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blendedBlue1, sizeof(float) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blendedBlue2, sizeof(float) * numColsSource * numRowsSource)); + + initialize<<>>(d_red, d_blendedRed1, numRowsSource, numColsSource); + initialize<<>>(d_red, d_blendedRed2, numRowsSource, numColsSource); + initialize<<>>(d_green, d_blendedGreen1, numRowsSource, numColsSource); + initialize<<>>(d_green, d_blendedGreen2, numRowsSource, numColsSource); + initialize<<>>(d_blue, d_blendedBlue1, numRowsSource, numColsSource); + initialize<<>>(d_blue, d_blendedBlue2, numRowsSource, numColsSource); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + // 5) For each color channel perform the Jacobi iteration described + // above 800 times. + + // 6) Create the output image by replacing all the interior pixels + // in the destination image with the result of the Jacobi iterations. + // Just cast the floating point values to unsigned chars since we have + // already made sure to clamp them to the correct range. + + /* The reference calculation is provided below, feel free to use it + for debugging purposes. + */ - cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + /** + uchar4* h_reference = new uchar4[srcSize]; + reference_calc(h_sourceImg, numRowsSource, numColsSource, + h_destImg, h_reference); - to catch any errors that happened while executing the kernel. - */ + checkResultsEps((unsigned char *)h_reference, (unsigned char *)h_blendedImg, 4 * srcSize, 2, .01); + delete[] h_reference; + /**/ } From ed8fe428f648c6608a9787baebb7c8f127af2fec Mon Sep 17 00:00:00 2001 From: Johannes Deselaers Date: Wed, 20 Aug 2014 12:44:12 -0700 Subject: [PATCH 6/6] solution for ps6 --- Problem Sets/Problem Set 6/student_func.cu | 132 ++++++++++++++++++--- 1 file changed, 117 insertions(+), 15 deletions(-) diff --git a/Problem Sets/Problem Set 6/student_func.cu b/Problem Sets/Problem Set 6/student_func.cu index 9a32925e..544cca1d 100755 --- a/Problem Sets/Problem Set 6/student_func.cu +++ b/Problem Sets/Problem Set 6/student_func.cu @@ -64,9 +64,6 @@ #include "utils.h" #include -#include "reference_calc.cpp" - - __global__ void computeMask( const uchar4* sourceImg, unsigned char* mask, const int numRowsSource, const int numColsSource) @@ -110,7 +107,7 @@ __global__ void computerInteriorAndBorder( } } -__global__ void separate_channels( +__global__ void separateChannels( uchar4* sourceImg, unsigned char* red, unsigned char* green, @@ -138,6 +135,55 @@ __global__ void initialize( blended[index] = static_cast(source[index]); } +__global__ void performIteration( + float * currentGuess, + float * nextGuess, + unsigned char * sourceChannel, + unsigned char * destChannel, + unsigned char * interiorMask, + unsigned char * borderMask, + const int numRowsSource, + const int numColsSource) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= numColsSource * numRowsSource) return; + if (interiorMask[index] == 0) return; + + float sum1 = 0; + sum1 += interiorMask[index-numColsSource] * currentGuess[index-numColsSource] + borderMask[index-numColsSource] * destChannel[index-numColsSource]; + sum1 += interiorMask[index+numColsSource] * currentGuess[index+numColsSource] + borderMask[index+numColsSource] * destChannel[index+numColsSource]; + sum1 += interiorMask[index-1] * currentGuess[index-1] + borderMask[index-1] * destChannel[index-1]; + sum1 += interiorMask[index+1] * currentGuess[index+1] + borderMask[index+1] * destChannel[index+1]; + + + float sum2 = 0; + sum2 += interiorMask[index] * (sourceChannel[index] - sourceChannel[index-numColsSource]); + sum2 += interiorMask[index] * (sourceChannel[index] - sourceChannel[index+numColsSource]); + sum2 += interiorMask[index] * (sourceChannel[index] - sourceChannel[index-1]); + sum2 += interiorMask[index] * (sourceChannel[index] - sourceChannel[index+1]); + + float newVal = (sum1 + sum2) / 4.f; + nextGuess[index] = min(255.f, max(0.f, newVal)); +} + +__global__ void createOutput( + uchar4 * blendedImg, + float *finalGuessRed, + float *finalGuessGreen, + float *finalGuessBlue, + unsigned char *interiorMask, + const int numRowsSource, + const int numColsSource) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= numColsSource * numRowsSource) return; + if (!(interiorMask[index])) return; + + blendedImg[index].x = static_cast(finalGuessRed[index]); + blendedImg[index].y = static_cast(finalGuessGreen[index]); + blendedImg[index].z = static_cast(finalGuessBlue[index]); +} + void your_blend(const uchar4* const h_sourceImg, //IN const size_t numRowsSource, const size_t numColsSource, const uchar4* const h_destImg, //IN @@ -145,7 +191,9 @@ void your_blend(const uchar4* const h_sourceImg, //IN { uchar4 *d_sourceImg, *d_destImg, *d_blendedImg; unsigned char *d_mask, *d_border, *d_interior; - unsigned char *d_red, *d_green, *d_blue; + unsigned char *d_redSrc, *d_greenSrc, *d_blueSrc; + unsigned char *d_redDst, *d_greenDst, *d_blueDst; + checkCudaErrors(cudaMalloc(&d_sourceImg, sizeof(uchar4) * numColsSource * numRowsSource)); checkCudaErrors(cudaMalloc(&d_destImg, sizeof(uchar4) * numColsSource * numRowsSource)); @@ -153,9 +201,12 @@ void your_blend(const uchar4* const h_sourceImg, //IN checkCudaErrors(cudaMalloc(&d_mask, sizeof(unsigned char) * numColsSource * numRowsSource)); checkCudaErrors(cudaMalloc(&d_border, sizeof(unsigned char) * numColsSource * numRowsSource)); checkCudaErrors(cudaMalloc(&d_interior, sizeof(unsigned char) * numColsSource * numRowsSource)); - checkCudaErrors(cudaMalloc(&d_red, sizeof(unsigned char) * numColsSource * numRowsSource)); - checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numColsSource * numRowsSource)); - checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_redSrc, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_greenSrc, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blueSrc, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_redDst, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_greenDst, sizeof(unsigned char) * numColsSource * numRowsSource)); + checkCudaErrors(cudaMalloc(&d_blueDst, sizeof(unsigned char) * numColsSource * numRowsSource)); checkCudaErrors(cudaMemcpy(d_sourceImg, h_sourceImg, sizeof(uchar4) * numColsSource * numRowsSource, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_destImg, h_destImg, sizeof(uchar4) * numColsSource * numRowsSource, cudaMemcpyHostToDevice)); @@ -181,7 +232,8 @@ void your_blend(const uchar4* const h_sourceImg, //IN // 3) Separate out the incoming image into three separate channels - separate_channels<<>>(d_sourceImg, d_red, d_green, d_blue, numRowsSource, numColsSource); + separateChannels<<>>(d_sourceImg, d_redSrc, d_greenSrc, d_blueSrc, numRowsSource, numColsSource); + separateChannels<<>>(d_destImg, d_redDst, d_greenDst, d_blueDst, numRowsSource, numColsSource); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); // 4) Create two float(!) buffers for each color channel that will @@ -201,21 +253,71 @@ void your_blend(const uchar4* const h_sourceImg, //IN checkCudaErrors(cudaMalloc(&d_blendedBlue1, sizeof(float) * numColsSource * numRowsSource)); checkCudaErrors(cudaMalloc(&d_blendedBlue2, sizeof(float) * numColsSource * numRowsSource)); - initialize<<>>(d_red, d_blendedRed1, numRowsSource, numColsSource); - initialize<<>>(d_red, d_blendedRed2, numRowsSource, numColsSource); - initialize<<>>(d_green, d_blendedGreen1, numRowsSource, numColsSource); - initialize<<>>(d_green, d_blendedGreen2, numRowsSource, numColsSource); - initialize<<>>(d_blue, d_blendedBlue1, numRowsSource, numColsSource); - initialize<<>>(d_blue, d_blendedBlue2, numRowsSource, numColsSource); + initialize<<>>(d_redSrc, d_blendedRed1, numRowsSource, numColsSource); + initialize<<>>(d_redDst, d_blendedRed2, numRowsSource, numColsSource); + initialize<<>>(d_greenSrc, d_blendedGreen1, numRowsSource, numColsSource); + initialize<<>>(d_greenDst, d_blendedGreen2, numRowsSource, numColsSource); + initialize<<>>(d_blueSrc, d_blendedBlue1, numRowsSource, numColsSource); + initialize<<>>(d_blueDst, d_blendedBlue2, numRowsSource, numColsSource); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); // 5) For each color channel perform the Jacobi iteration described // above 800 times. + const int iterations(800); + for(int i = 0; i < iterations; ++i) + { + performIteration<<>>( + d_blendedRed1,d_blendedRed2, d_redSrc, d_redDst, d_interior, d_border, numRowsSource, numColsSource); + performIteration<<>>( + d_blendedGreen1,d_blendedGreen2, d_greenSrc, d_greenDst, d_interior, d_border, numRowsSource, numColsSource); + performIteration<<>>( + d_blendedBlue1,d_blendedBlue2, d_blueSrc, d_blueDst, d_interior, d_border, numRowsSource, numColsSource); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + std::swap(d_blendedRed1, d_blendedRed2); + std::swap(d_blendedGreen1, d_blendedGreen2); + std::swap(d_blendedBlue1, d_blendedBlue2); + } + // even number of iterations, so swap once more + std::swap(d_blendedRed1, d_blendedRed2); + std::swap(d_blendedGreen1, d_blendedGreen2); + std::swap(d_blendedBlue1, d_blendedBlue2); // 6) Create the output image by replacing all the interior pixels // in the destination image with the result of the Jacobi iterations. // Just cast the floating point values to unsigned chars since we have // already made sure to clamp them to the correct range. + checkCudaErrors(cudaMemcpy(d_blendedImg, d_destImg, sizeof(uchar4) * numRowsSource * numColsSource, cudaMemcpyDeviceToDevice)); + + createOutput<<>>( + d_blendedImg, d_blendedRed2, d_blendedGreen2, d_blendedBlue2, d_interior, numRowsSource, numColsSource); + + checkCudaErrors(cudaMemcpy(h_blendedImg, d_blendedImg, sizeof(uchar4) * numRowsSource * numColsSource,cudaMemcpyDeviceToHost)); + cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); + + // + // DONT FORGET TO FREE ALL OF THE MEMORY !!! + // + checkCudaErrors(cudaFree(d_sourceImg)); + checkCudaErrors(cudaFree(d_destImg)); + checkCudaErrors(cudaFree(d_blendedImg)); + checkCudaErrors(cudaFree(d_mask)); + checkCudaErrors(cudaFree(d_border)); + checkCudaErrors(cudaFree(d_interior)); + checkCudaErrors(cudaFree(d_redSrc)); + checkCudaErrors(cudaFree(d_redDst)); + checkCudaErrors(cudaFree(d_greenSrc)); + checkCudaErrors(cudaFree(d_greenDst)); + checkCudaErrors(cudaFree(d_blueSrc)); + checkCudaErrors(cudaFree(d_blueDst)); + checkCudaErrors(cudaFree(d_blendedRed1)); + checkCudaErrors(cudaFree(d_blendedRed2)); + checkCudaErrors(cudaFree(d_blendedGreen1)); + checkCudaErrors(cudaFree(d_blendedGreen2)); + checkCudaErrors(cudaFree(d_blendedBlue1)); + checkCudaErrors(cudaFree(d_blendedBlue2)); + + /* The reference calculation is provided below, feel free to use it for debugging purposes.