From e531fadbe37287e39ed490b41011bad4cf09aa6b Mon Sep 17 00:00:00 2001
From: Johannes Deselaers <deselaers.johannes@gmail.com>
Date: Mon, 4 Aug 2014 13:28:17 -0700
Subject: [PATCH 1/6] Update directory paths in CMakeList

---
 CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 996cb1c3..f620f7af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,12 +39,12 @@ if(CUDA_FOUND)
   # add debugging to CUDA NVCC flags.  For NVidia's NSight tools.
   set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G")
 
-  add_subdirectory (HW1)
-  add_subdirectory (HW2)
-  add_subdirectory (HW3)
-  add_subdirectory (HW4)
-  add_subdirectory (HW5)
-  add_subdirectory (HW6)
+  add_subdirectory (Problem\ Sets/Problem\ Set\ 1)
+  add_subdirectory (Problem\ Sets/Problem\ Set\ 2)
+  add_subdirectory (Problem\ Sets/Problem\ Set\ 3)
+  add_subdirectory (Problem\ Sets/Problem\ Set\ 4)
+  add_subdirectory (Problem\ Sets/Problem\ Set\ 5)
+  add_subdirectory (Problem\ Sets/Problem\ Set\ 6)
 else(CUDA_FOUND)
   message("CUDA is not installed on this system.")
 endif()

From 96aa3b780d8bfd13e8f74cacc2484acc2ef2b942 Mon Sep 17 00:00:00 2001
From: Johannes Deselaers <deselaers.johannes@gmail.com>
Date: Wed, 6 Aug 2014 19:06:02 -0700
Subject: [PATCH 2/6] added Homeworks 1-3

---
 Problem Sets/Problem Set 1/Makefile        |   2 +-
 Problem Sets/Problem Set 1/student_func.cu |  63 ++++----
 Problem Sets/Problem Set 2/student_func.cu |  28 ++--
 Problem Sets/Problem Set 3/student_func.cu | 162 ++++++++++++++++++---
 4 files changed, 196 insertions(+), 59 deletions(-)

diff --git a/Problem Sets/Problem Set 1/Makefile b/Problem Sets/Problem Set 1/Makefile
index bc0b2e5d..281df44f 100755
--- a/Problem Sets/Problem Set 1/Makefile	
+++ b/Problem Sets/Problem Set 1/Makefile	
@@ -22,7 +22,7 @@ OPENCV_INCLUDEPATH=/usr/include
 
 OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui
 
-CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
+CUDA_INCLUDEPATH=/usr/local/cuda/include
 
 ######################################################
 # On Macs the default install locations are below    #
diff --git a/Problem Sets/Problem Set 1/student_func.cu b/Problem Sets/Problem Set 1/student_func.cu
index 452b379f..0aeb3eb2 100755
--- a/Problem Sets/Problem Set 1/student_func.cu	
+++ b/Problem Sets/Problem Set 1/student_func.cu	
@@ -2,11 +2,11 @@
 // Color to Greyscale Conversion
 
 //A common way to represent color images is known as RGBA - the color
-//is specified by how much Red, Grean and Blue is in it.
-//The 'A' stands for Alpha and is used for transparency, it will be
+//is specified by how much Red, Green, and Blue is in it.
+//The 'A' stands for Alpha and is used for transparency; it will be
 //ignored in this homework.
 
-//Each channel Red, Blue, Green and Alpha is represented by one byte.
+//Each channel Red, Blue, Green, and Alpha is represented by one byte.
 //Since we are using one byte for each color there are 256 different
 //possible values for each color.  This means we use 4 bytes per pixel.
 
@@ -15,7 +15,7 @@
 
 //To convert an image from color to grayscale one simple method is to
 //set the intensity to the average of the RGB channels.  But we will
-//use a more sophisticated method that takes into account how the eye 
+//use a more sophisticated method that takes into account how the eye
 //perceives color and weights the channels unequally.
 
 //The eye responds most strongly to green followed by red and then blue.
@@ -24,43 +24,50 @@
 
 //I = .299f * R + .587f * G + .114f * B
 
-//Notice the trailing f's on the numbers which indicate that they are 
+//Notice the trailing f's on the numbers which indicate that they are
 //single precision floating point constants and not double precision
 //constants.
 
 //You should fill in the kernel as well as set the block and grid sizes
 //so that the entire image is processed.
 
+#include "reference_calc.cpp"
 #include "utils.h"
+#include <stdio.h>
 
 __global__
 void rgba_to_greyscale(const uchar4* const rgbaImage,
-                       unsigned char* const greyImage,
-                       int numRows, int numCols)
+unsigned char* const greyImage,
+int numRows, int numCols)
 {
-  //TODO
-  //Fill in the kernel to convert from color to greyscale
-  //the mapping from components of a uchar4 to RGBA is:
-  // .x -> R ; .y -> G ; .z -> B ; .w -> A
-  //
-  //The output (greyImage) at each pixel should be the result of
-  //applying the formula: output = .299f * R + .587f * G + .114f * B;
-  //Note: We will be ignoring the alpha channel for this conversion
-
-  //First create a mapping from the 2D block and grid locations
-  //to an absolute 2D location in the image, then use that to
-  //calculate a 1D offset
+    //TODO
+    //Fill in the kernel to convert from color to greyscale
+    //the mapping from components of a uchar4 to RGBA is:
+    // .x -> R ; .y -> G ; .z -> B ; .w -> A
+    //
+    //The output (greyImage) at each pixel should be the result of
+    //applying the formula: output = .299f * R + .587f * G + .114f * B;
+    //Note: We will be ignoring the alpha channel for this conversion
+    
+    //First create a mapping from the 2D block and grid locations
+    //to an absolute 2D location in the image, then use that to
+    //calculate a 1D offset
+    int x = blockIdx.x;
+    int y = blockIdx.y;
+    uchar4 colorValue = rgbaImage[x * numCols + y];
+    float greyValue = 0.299f * colorValue.x + 0.587f * colorValue.y + 0.114f * colorValue.z;
+    greyImage[x * numCols + y] = greyValue;
 }
 
 void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
-                            unsigned char* const d_greyImage, size_t numRows, size_t numCols)
+unsigned char* const d_greyImage, size_t numRows, size_t numCols)
 {
-  //You must fill in the correct sizes for the blockSize and gridSize
-  //currently only one block with one thread is being launched
-  const dim3 blockSize(1, 1, 1);  //TODO
-  const dim3 gridSize( 1, 1, 1);  //TODO
-  rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
-  
-  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
-
+    //You must fill in the correct sizes for the blockSize and gridSize
+    //currently only one block with one thread is being launched
+    const dim3 threadsPerBlock( 1, 1, 1);  //TODO
+    const dim3 numberOfBlocks( numRows, numCols, 1);  //TODO
+    rgba_to_greyscale<<<numberOfBlocks, threadsPerBlock>>>(d_rgbaImage, d_greyImage, numRows, numCols);
+    
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 }
+
diff --git a/Problem Sets/Problem Set 2/student_func.cu b/Problem Sets/Problem Set 2/student_func.cu
index 825e412b..30f5402b 100755
--- a/Problem Sets/Problem Set 2/student_func.cu	
+++ b/Problem Sets/Problem Set 2/student_func.cu	
@@ -117,11 +117,11 @@ void gaussian_blur(const unsigned char* const inputChannel,
   // the image. You'll want code that performs the following check before accessing
   // GPU memory:
   //
-  // if ( absolute_image_position_x >= numCols ||
-  //      absolute_image_position_y >= numRows )
-  // {
-  //     return;
-  // }
+  if ( absolute_image_position_x >= numCols ||
+       absolute_image_position_y >= numRows )
+  {
+      return;
+  }
   
   // NOTE: If a thread's absolute position 2D position is within the image, but some of
   // its neighbors are outside the image, then you will need to be extra careful. Instead
@@ -147,11 +147,11 @@ void separateChannels(const uchar4* const inputImageRGBA,
   // the image. You'll want code that performs the following check before accessing
   // GPU memory:
   //
-  // if ( absolute_image_position_x >= numCols ||
-  //      absolute_image_position_y >= numRows )
-  // {
-  //     return;
-  // }
+  if ( absolute_image_position_x >= numCols ||
+        absolute_image_position_y >= numRows )
+  {
+      return;
+  }
 }
 
 //This kernel takes in three color channels and recombines them
@@ -205,6 +205,7 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
   //be sure to use checkCudaErrors like the above examples to
   //be able to tell if anything goes wrong
   //IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc
+  checkCudaErrors(cudaMalloc);
 
   //TODO:
   //Copy the filter on the host (h_filter) to the memory you just allocated
@@ -221,15 +222,16 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
                         const int filterWidth)
 {
   //TODO: Set reasonable block size (i.e., number of threads per block)
-  const dim3 blockSize;
+  const dim3 blockSize( 1, 1, 1);
 
   //TODO:
   //Compute correct grid size (i.e., number of blocks per kernel launch)
   //from the image size and and block size.
-  const dim3 gridSize;
+  const dim3 gridSize( 1, 1, 1);
 
   //TODO: Launch a kernel for separating the RGBA image into different color channels
-
+  seperateChannels<<<gridSize, blockSize>>>(...);
+  
   // Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
   // launching your kernel to make sure that you didn't make any mistakes.
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
diff --git a/Problem Sets/Problem Set 3/student_func.cu b/Problem Sets/Problem Set 3/student_func.cu
index 26f00a74..a7f13fdc 100755
--- a/Problem Sets/Problem Set 3/student_func.cu	
+++ b/Problem Sets/Problem Set 3/student_func.cu	
@@ -81,24 +81,152 @@
 
 #include "utils.h"
 
-void your_histogram_and_prefixsum(const float* const d_logLuminance,
-                                  unsigned int* const d_cdf,
-                                  float &min_logLum,
-                                  float &max_logLum,
-                                  const size_t numRows,
-                                  const size_t numCols,
-                                  const size_t numBins)
+__global__ void reduceLuminance(const float* const d_logLuminance, bool isMax, float *result, int inputSize)
 {
-  //TODO
-  /*Here are the steps you need to implement
-    1) find the minimum and maximum value in the input logLuminance channel
-       store in min_logLum and max_logLum
-    2) subtract them to find the range
-    3) generate a histogram of all the values in the logLuminance channel using
-       the formula: bin = (lum[i] - lumMin) / lumRange * numBins
-    4) Perform an exclusive scan (prefix sum) on the histogram to get
-       the cumulative distribution of luminance values (this should go in the
-       incoming d_cdf pointer which already has been allocated for you)       */
+    //  // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
+    extern __shared__ float sdata[];
+    
+    int myId = threadIdx.x + blockDim.x * blockIdx.x;
+    int tid  = threadIdx.x;
+    
+    // load shared mem from global mem
+    if(myId < inputSize)
+    {
+        sdata[tid] = d_logLuminance[myId];
+    }
+    __syncthreads();            // make sure entire block is loaded!
+    
+    // do reduction in shared mem
+    for (int s = blockDim.x / 2; s > 0; s >>= 1)
+    {
+        if (tid < s && (myId+s) < inputSize)
+        {
+            float val1=sdata[tid];
+            float val2=sdata[tid + s];
+            sdata[tid] = isMax?(val1 > val2 ? val1 : val2) : (val1 < val2 ? val1 : val2);
+        }
+        __syncthreads();        // make sure all adds at one stage are done!
+    }
+    
+    // only thread 0 writes result for this block back to global mem
+    if (tid == 0)
+    {
+        result[blockIdx.x] = sdata[0];
+    }
+}
+
+__global__ void assignHistogram(int *d_bins, const float *d_in, int numBins, float lumRange, float lumMin, int inputSize)
+{
+    int myId = threadIdx.x + blockDim.x * blockIdx.x;
+    
+    if(myId<inputSize)
+    {
+        //bin = (lum[i] - lumMin) / lumRange * numBins
+        int myBin = (d_in[myId] - lumMin) / lumRange * numBins;
+        atomicAdd(&(d_bins[myBin]), 1);
+    }
+}
 
 
+__global__ void scanHistogram(const int* const d_bins, unsigned int *result, int inputSize)
+{
+    //  // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
+    extern __shared__ int shareddata[];
+    
+    int myId = threadIdx.x + blockDim.x * blockIdx.x;
+    int tid  = threadIdx.x;
+    
+    // load shared mem from global mem
+    if(myId < inputSize)
+    {
+        shareddata[tid] = d_bins[myId];
+    }
+    __syncthreads();            // make sure entire block is loaded!
+    
+    //Step hillis / Steele
+    for (int step = 1; step<inputSize;step <<= 1)
+    {
+        if (tid >= step)
+        {
+            shareddata[tid] += shareddata[tid - step];
+        }
+        __syncthreads();        // make sure all adds at one stage are done!
+    }
+    
+    // every thread writes result for this block back to global mem
+    if(myId < inputSize)
+    {
+        result[myId] = shareddata[tid];
+    }
+}
+
+void your_histogram_and_prefixsum(const float* const d_logLuminance,
+unsigned int* const d_cdf,
+float &min_logLum,
+float &max_logLum,
+const size_t numRows,
+const size_t numCols,
+const size_t numBins)
+{
+    //TODO
+    /*Here are the steps you need to implement
+     1) find the minimum and maximum value in the input logLuminance channel
+     store in min_logLum and max_logLum
+     2) subtract them to find the range
+     3) generate a histogram of all the values in the logLuminance channel using
+     the formula: bin = (lum[i] - lumMin) / lumRange * numBins
+     4) Perform an exclusive scan (prefix sum) on the histogram to get
+     the cumulative distribution of luminance values (this should go in the
+     incoming d_cdf pointer which already has been allocated for you)       */
+    
+    // Two step reduce on one dimension
+    const int inputSize = numRows * numCols;
+    const int maxThreadsPerBlock = 1024;
+    unsigned int threads = maxThreadsPerBlock;
+    const int blocks = (inputSize / maxThreadsPerBlock)+2; //more blocks to avoid int round loss
+    
+    const int sharedMemorySize_1 = threads * sizeof(float);
+    const int sharedMemorySize_2 = blocks * sizeof(float);
+    
+    float *d_intermediate, *d_result;
+    checkCudaErrors(cudaMalloc(&d_intermediate, sizeof(float) * blocks));
+    checkCudaErrors(cudaMalloc(&d_result, sizeof(float)));
+    
+    // Step 1: min luminance
+    reduceLuminance<<<blocks, threads, sharedMemorySize_1>>>(d_logLuminance, false, d_intermediate, inputSize);
+    reduceLuminance<<<1, blocks, sharedMemorySize_2>>>(d_intermediate, false, d_result, blocks);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+    checkCudaErrors(cudaMemcpy(&min_logLum, d_result, sizeof(float), cudaMemcpyDeviceToHost));
+    
+    // Step 1: max luminance
+    reduceLuminance<<<blocks, threads, sharedMemorySize_1>>>(d_logLuminance, true, d_intermediate, inputSize);
+    reduceLuminance<<<1, blocks, sharedMemorySize_2>>>(d_intermediate, true, d_result, blocks);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+    checkCudaErrors(cudaMemcpy(&max_logLum, d_result, sizeof(float), cudaMemcpyDeviceToHost));
+    
+    // Step 2: luminance range
+    float range = max_logLum - min_logLum;
+    
+    // Step 3: histogram
+    int *d_bins;
+    checkCudaErrors(cudaMalloc(&d_bins, sizeof(int) * numBins));
+    checkCudaErrors(cudaMemset(d_bins, 0, sizeof(int) * numBins));
+    assignHistogram<<<(inputSize/threads) +1, threads>>>(d_bins, d_logLuminance, numBins, range, min_logLum, inputSize);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+    
+    // Step 4: Exclusive Scan on d_bins
+    threads = 2;
+    while(threads < numBins)
+    {
+        threads <<= 1;
+    }
+    
+    scanHistogram<<<1, threads, sizeof(unsigned int) * numBins>>>(d_bins, d_cdf, numBins);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+    
+    //free resources
+    
+    cudaFree(d_intermediate);
+    cudaFree(d_result);
+    cudaFree(d_bins);
 }

From 91ab000067b1f21aa2b670e2bfc7c861b7498410 Mon Sep 17 00:00:00 2001
From: Johannes Deselaers <deselaers.johannes@gmail.com>
Date: Wed, 13 Aug 2014 10:33:29 -0700
Subject: [PATCH 3/6] added solution for HW5

---
 .gitignore                                 |   1 +
 Problem Sets/Problem Set 4/student_func.cu | 302 ++++++++++++++++++---
 Problem Sets/Problem Set 5/student.cu      |  49 +++-
 3 files changed, 307 insertions(+), 45 deletions(-)

diff --git a/.gitignore b/.gitignore
index a22963e0..3baa1c0e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,4 @@
 build
 bin
 
+*gpusort-ipdps09.pdf
\ No newline at end of file
diff --git a/Problem Sets/Problem Set 4/student_func.cu b/Problem Sets/Problem Set 4/student_func.cu
index 347d7b6e..a7b19922 100755
--- a/Problem Sets/Problem Set 4/student_func.cu	
+++ b/Problem Sets/Problem Set 4/student_func.cu	
@@ -3,45 +3,180 @@
 
 #include "utils.h"
 #include <thrust/host_vector.h>
+#include "device_launch_parameters.h"
+#include "device_functions.h"
+#include <memory>
 
 /* Red Eye Removal
-   ===============
-   
-   For this assignment we are implementing red eye removal.  This is
-   accomplished by first creating a score for every pixel that tells us how
-   likely it is to be a red eye pixel.  We have already done this for you - you
-   are receiving the scores and need to sort them in ascending order so that we
-   know which pixels to alter to remove the red eye.
+===============
 
-   Note: ascending order == smallest to largest
+For this assignment we are implementing red eye removal.  This is
+accomplished by first creating a score for every pixel that tells us how
+likely it is to be a red eye pixel.  We have already done this for you - you
+are receiving the scores and need to sort them in ascending order so that we
+know which pixels to alter to remove the red eye.
 
-   Each score is associated with a position, when you sort the scores, you must
-   also move the positions accordingly.
+Note: ascending order == smallest to largest
 
-   Implementing Parallel Radix Sort with CUDA
-   ==========================================
+Each score is associated with a position, when you sort the scores, you must
+also move the positions accordingly.
 
-   The basic idea is to construct a histogram on each pass of how many of each
-   "digit" there are.   Then we scan this histogram so that we know where to put
-   the output of each digit.  For example, the first 1 must come after all the
-   0s so we have to know how many 0s there are to be able to start moving 1s
-   into the correct position.
+Implementing Parallel Radix Sort with CUDA
+==========================================
 
-   1) Histogram of the number of occurrences of each digit
-   2) Exclusive Prefix Sum of Histogram
-   3) Determine relative offset of each digit
-        For example [0 0 1 1 0 0 1]
-                ->  [0 1 0 1 2 3 2]
-   4) Combine the results of steps 2 & 3 to determine the final
-      output location for each element and move it there
+The basic idea is to construct a histogram on each pass of how many of each
+"digit" there are.   Then we scan this histogram so that we know where to put
+the output of each digit.  For example, the first 1 must come after all the
+0s so we have to know how many 0s there are to be able to start moving 1s
+into the correct position.
 
-   LSB Radix sort is an out-of-place sort and you will need to ping-pong values
-   between the input and output buffers we have provided.  Make sure the final
-   sorted results end up in the output buffer!  Hint: You may need to do a copy
-   at the end.
+1) Histogram of the number of occurrences of each digit
+2) Exclusive Prefix Sum of Histogram
+3) Determine relative offset of each digit
+For example [0 0 1 1 0 0 1]
+->  [0 1 0 1 2 3 2]
+4) Combine the results of steps 2 & 3 to determine the final
+output location for each element and move it there
 
- */
+LSB Radix sort is an out-of-place sort and you will need to ping-pong values
+between the input and output buffers we have provided.  Make sure the final
+sorted results end up in the output buffer!  Hint: You may need to do a copy
+at the end.
 
+*/
+#define NUM_BITS 4
+#define RADIX (1 << NUM_BITS)
+#define BLOCK_SIZE 256 // no more than 254 in order to fit all relative positions into the array of bytes
+
+__device__ __host__ unsigned int get_digit(unsigned int num, unsigned int shift)
+{
+  int mask = (RADIX - 1) << shift;
+  return (num & mask) >> shift;
+}
+
+// histogram of digits within one block
+__global__ void hist_digits(const unsigned int* const in, 
+                            const size_t size, 
+                            unsigned int shift, 
+                            const int nBins, 
+                            unsigned int* const  outBins)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= size)
+  {
+    return;
+  }
+
+  unsigned int digit = get_digit(in[idx], shift);
+
+  atomicAdd(&outBins[digit], 1);
+}
+
+// # of each digit within a block.
+// used to compute relative positions of each digit within the array
+__global__ void num_digits_per_block(const unsigned int* const in, 
+                                     const size_t size, 
+                                     unsigned int shift, 
+                                     const int nBins, 
+                                     unsigned int* const  outBins)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= size)
+  {
+    return;
+  }
+
+  unsigned int digit = get_digit(in[idx], shift);
+
+  atomicAdd(&outBins[blockIdx.x * RADIX + digit], 1);
+}
+
+// add per-block cdf of digits to get a truly relative position of the digit (relative to the start of the sequence)
+// and then add cdf of all digits to get the absolute position of the element in the new order
+__global__ void get_absolute_positions(const unsigned int * const in, unsigned int * const relPos, size_t size, unsigned int shift, unsigned int * const maxVals, unsigned int * const digitCdf)
+{
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx >= size)
+  {
+    return;
+  }
+  unsigned int digit = get_digit(in[idx], shift);
+
+  relPos[idx] += maxVals[blockIdx.x * RADIX + digit];
+  relPos[idx] += digitCdf[digit];
+}
+
+// Build a relative positional scan of digit, shifted by "shift"
+// in - array of numbers to sort
+// inPos - positions of the keys that also need to move
+// size - size of the array
+// shift - which significant digit are we concerned about
+// rel - array of relative positions of the numbers in the in array (relative to current block), based on the digit of interest.
+__global__ void rel_pos_per_block(const unsigned int * const in, 
+                                  const size_t size, unsigned int shift, 
+                                  unsigned int * const rel)
+{
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int digit_pos = get_digit(in[idx], shift);
+
+  int offset = digit_pos * blockDim.x;
+
+  __shared__ unsigned int relativePosition[RADIX * BLOCK_SIZE];
+  for(int i = 0; i < RADIX; i++)
+  {
+    relativePosition[threadIdx.x + i * blockDim.x] = 0;
+  }
+  __syncthreads();
+
+  // initialize to the "startup" positions for each digit.
+  // since we always set them to "1" and the scan is supposed
+  // to be exclusive, we will need to subtract "1" at the end
+  relativePosition[threadIdx.x + offset] = 1;
+
+  // accumulate position values for each digit
+  for(int step = 1; step < size; step <<= 1)
+  {
+    for(int j = 0; j < RADIX; j++)
+    {
+      __syncthreads();
+      // position start of the digit in the relatvePosition array
+      int offset = j * blockDim.x;
+      int tSum = relativePosition[threadIdx.x + offset];
+
+      if(threadIdx.x >= step)
+      {
+        tSum += relativePosition[threadIdx.x + offset - step];
+      }
+      __syncthreads();
+      relativePosition[threadIdx.x + offset] = tSum;
+    }
+  }
+
+  __syncthreads();
+
+  rel[idx] = relativePosition[threadIdx.x + offset] - 1;
+}
+
+// Convert relative postions to absolute ones by adding up to the histogram
+// rel array of relative positions
+// cdf - exclusive cdf of the counts histogram
+__global__ void move_to_out(unsigned int * const dest, 
+                            unsigned int * const destPos, 
+                            unsigned int * const source, 
+                            unsigned int * const sourcePos, 
+                            unsigned int * const rel, 
+                            size_t size)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if(idx >= size)
+  {
+    return;
+  }
+
+  int pos = rel[idx];
+  dest[pos] = source[idx];
+  destPos[pos] = sourcePos[idx];
+}
 
 void your_sort(unsigned int* const d_inputVals,
                unsigned int* const d_inputPos,
@@ -49,6 +184,109 @@ void your_sort(unsigned int* const d_inputVals,
                unsigned int* const d_outputPos,
                const size_t numElems)
 { 
-  //TODO
-  //PUT YOUR SORT HERE
-}
+
+  const int blockSize = BLOCK_SIZE;
+  int nBlocks = (numElems + blockSize) / blockSize;
+  int nWholeBlocks = numElems / blockSize;
+  int remainder = numElems % blockSize; // elements that don't fit into the blocks must be handled separately
+  bool swapInOut = false; //flag to show if we need to swap input & output at the end
+
+  unsigned int *d_outBins, *d_rel, *d_maxDigitPerBlock;
+
+  std::auto_ptr<unsigned int> h_hist(new unsigned int[RADIX]);
+  std::auto_ptr<unsigned int> cdf(new unsigned int [RADIX]);
+  std::auto_ptr<unsigned int> h_rel(new unsigned int[numElems]);
+  std::auto_ptr<unsigned int> h_digits_per_block(new unsigned int[RADIX * nBlocks]);
+  std::auto_ptr<unsigned int> cdf_per_block(new unsigned int[RADIX * nBlocks]);
+
+  checkCudaErrors(cudaMalloc(&d_outBins, RADIX * sizeof(int)));
+  checkCudaErrors(cudaMalloc(&d_rel, numElems * sizeof(int)));
+  checkCudaErrors(cudaMalloc(&d_maxDigitPerBlock, nBlocks * RADIX * sizeof(int)));
+
+  for (unsigned int shift = 0; shift < 8 * sizeof(unsigned int); shift += NUM_BITS)
+  {
+    unsigned int * d_input, * d_output, * d_inPos, * d_outPos;
+    if (!swapInOut)
+    {
+      d_input = d_inputVals;
+      d_output = d_outputVals;
+      d_inPos = d_inputPos;
+      d_outPos = d_outputPos;
+    }
+    else
+    {
+      d_output = d_inputVals;
+      d_input = d_outputVals;
+      d_outPos = d_inputPos;
+      d_inPos = d_outputPos;
+    }
+
+    //1. Compute histogram of digits
+    checkCudaErrors(cudaMemset(d_outBins, 0, RADIX * sizeof(int)));
+
+    hist_digits<<<nBlocks, blockSize>>>(d_input, numElems, shift, RADIX, d_outBins);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+    checkCudaErrors(cudaMemcpy(h_hist.get(), d_outBins, RADIX * sizeof(int), cudaMemcpyDeviceToHost));
+
+    //2. Compute exclusive scan of the histogram
+    cdf.get()[0] = 0;
+
+    for(int digit = 1; digit < RADIX; digit++)
+    {
+      cdf.get()[digit] = cdf.get()[digit - 1] + h_hist.get()[digit - 1];
+    }
+
+    //3. Compute relative position of each digit per block.
+    //a. Compute # of each digit per block
+    checkCudaErrors(cudaMemset(d_maxDigitPerBlock, 0, RADIX * nBlocks * sizeof(int)));
+    num_digits_per_block<<<nBlocks, blockSize>>>(d_input, numElems, shift, RADIX, d_maxDigitPerBlock);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+    checkCudaErrors(cudaMemcpy(h_digits_per_block.get(), d_maxDigitPerBlock, RADIX * sizeof(int) * nBlocks, cudaMemcpyDeviceToHost));
+
+    //b. Compute exclusive scan per block of numbers of digits per block
+    memset(cdf_per_block.get(), 0, RADIX * nBlocks * sizeof(unsigned int));
+
+    for(int block = 1; block < nBlocks; block++)
+    {
+      for(int digit = 0; digit < RADIX; digit++)
+      {
+        cdf_per_block.get()[block * RADIX + digit] = cdf_per_block.get()[(block - 1) * RADIX + digit] + h_digits_per_block.get()[(block - 1) * RADIX + digit];
+      }
+    }
+
+    //c. Compute relative position
+    rel_pos_per_block<<<nWholeBlocks, blockSize>>>(d_input, blockSize, shift, d_rel);
+
+    if(remainder > 0)
+    {
+      rel_pos_per_block<<<1, remainder>>>(&d_input[nWholeBlocks * blockSize], remainder, shift, &d_rel[nWholeBlocks * blockSize]);
+    }
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+    // d. Add things up to get the real relative position of each digit (relative to the start of the sequence, not per block)
+    checkCudaErrors(cudaMemcpy(d_maxDigitPerBlock, cdf_per_block.get(), RADIX * nBlocks * sizeof(int), cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_outBins, cdf.get(), RADIX * sizeof(int), cudaMemcpyHostToDevice));
+
+    get_absolute_positions<<<nBlocks, blockSize>>>(d_input, d_rel, numElems, shift, d_maxDigitPerBlock, d_outBins);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+    //4. now we can update the output
+    move_to_out<<<nBlocks, blockSize>>>(d_output, d_outPos, d_input, d_inPos, d_rel, numElems);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+    swapInOut = !swapInOut;
+  }
+
+  // when it's all over we must make sure output values are where they ought to be.
+  if(!swapInOut)
+  {
+    checkCudaErrors(cudaMemcpy(d_outputVals, d_inputVals, numElems * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
+    checkCudaErrors(cudaMemcpy(d_outputPos, d_inputPos, numElems * sizeof(unsigned int), cudaMemcpyDeviceToDevice));
+  }
+
+  checkCudaErrors(cudaFree(d_outBins));
+  checkCudaErrors(cudaFree(d_rel));
+  checkCudaErrors(cudaFree(d_maxDigitPerBlock));
+
diff --git a/Problem Sets/Problem Set 5/student.cu b/Problem Sets/Problem Set 5/student.cu
index 00dd306f..05ceb6dd 100755
--- a/Problem Sets/Problem Set 5/student.cu	
+++ b/Problem Sets/Problem Set 5/student.cu	
@@ -24,20 +24,42 @@
 
 */
 
-
 #include "utils.h"
+#include "reference.cpp"
+
+#define elementsPerThread 64
 
 __global__
 void yourHisto(const unsigned int* const vals, //INPUT
                unsigned int* const histo,      //OUPUT
-               int numVals)
+               const int numVals,
+               const int numBins)
 {
-  //TODO fill in this kernel to calculate the histogram
-  //as quickly as possible
-
-  //Although we provide only one kernel skeleton,
-  //feel free to use more if it will help you
-  //write faster code
+  int tid = threadIdx.x;
+  int index = blockIdx.x * blockDim.x + tid;
+    
+  // create sharedMemory histogram and initialize to 0
+  extern __shared__ unsigned int localHistogram[];
+  for (int i = tid; i < numBins; i += blockDim.x) {
+    localHistogram[i] = 0;
+  }
+ 
+  __syncthreads();
+  
+  // use offset to ensure coalescing read from global memory
+  // each thread reads with a stride of the offset
+  // parallel threads read one coalesced piece of memory
+  int offset = numVals / elementsPerThread;
+  for(int i = 0; i < elementsPerThread; ++i)
+  {
+    atomicAdd(&(localHistogram[vals[index + offset*i]]),1);
+  }
+  
+  __syncthreads();
+  
+  for (int i = tid; i < numBins; i += blockDim.x) {
+    atomicAdd(&histo[i], localHistogram[i]);
+  }
 }
 
 void computeHistogram(const unsigned int* const d_vals, //INPUT
@@ -45,10 +67,11 @@ void computeHistogram(const unsigned int* const d_vals, //INPUT
                       const unsigned int numBins,
                       const unsigned int numElems)
 {
-  //TODO Launch the yourHisto kernel
-
-  //if you want to use/launch more than one kernel,
-  //feel free
-
+  int powerOfTwo = 4;
+  int threadsPerBlock = 1024/powerOfTwo;
+  int blocks = (numElems + (threadsPerBlock*elementsPerThread) - 1) / (threadsPerBlock*elementsPerThread);
+    
+  yourHisto<<<blocks, threadsPerBlock, (numBins + 250) * sizeof(unsigned int)>>>(d_vals, d_histo, numElems, numBins);
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 }
+

From f3c28d0d9adba6e44f3f5fa5a2bbe09dc4036896 Mon Sep 17 00:00:00 2001
From: Johannes Deselaers <deselaers.johannes@gmail.com>
Date: Wed, 13 Aug 2014 11:21:51 -0700
Subject: [PATCH 4/6] added missing semicolon

---
 Problem Sets/Problem Set 5/student.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/Problem Sets/Problem Set 5/student.cu b/Problem Sets/Problem Set 5/student.cu
index 05ceb6dd..43ca76cd 100755
--- a/Problem Sets/Problem Set 5/student.cu	
+++ b/Problem Sets/Problem Set 5/student.cu	
@@ -27,7 +27,9 @@
 #include "utils.h"
 #include "reference.cpp"
 
+#define powerOfTwo 4
 #define elementsPerThread 64
+#define maxThreadsPerBlock 1024
 
 __global__
 void yourHisto(const unsigned int* const vals, //INPUT
@@ -67,11 +69,10 @@ void computeHistogram(const unsigned int* const d_vals, //INPUT
                       const unsigned int numBins,
                       const unsigned int numElems)
 {
-  int powerOfTwo = 4;
-  int threadsPerBlock = 1024/powerOfTwo;
-  int blocks = (numElems + (threadsPerBlock*elementsPerThread) - 1) / (threadsPerBlock*elementsPerThread);
+  int threadsPerBlock = maxThreadsPerBlock/powerOfTwo;
+  int elementsPerBlock = threadsPerBlock*elementsPerThread;
+  int blocks = (numElems + elementsPerBlock - 1) / elementsPerBlock;
     
   yourHisto<<<blocks, threadsPerBlock, (numBins + 250) * sizeof(unsigned int)>>>(d_vals, d_histo, numElems, numBins);
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
-}
-
+}
\ No newline at end of file

From 54834e0f678c14e7c12c4f70932be09cb9e194c1 Mon Sep 17 00:00:00 2001
From: Johannes Deselaers <deselaers.johannes@gmail.com>
Date: Wed, 13 Aug 2014 15:18:45 -0700
Subject: [PATCH 5/6] added parts 1-4 of HW6, so far unoptimized

---
 Problem Sets/Problem Set 6/student_func.cu | 187 +++++++++++++++++----
 1 file changed, 153 insertions(+), 34 deletions(-)

diff --git a/Problem Sets/Problem Set 6/student_func.cu b/Problem Sets/Problem Set 6/student_func.cu
index 1f42ce05..9a32925e 100755
--- a/Problem Sets/Problem Set 6/student_func.cu	
+++ b/Problem Sets/Problem Set 6/student_func.cu	
@@ -62,52 +62,171 @@
     In this assignment we will do 800 iterations.
    */
 
-
-
 #include "utils.h"
 #include <thrust/host_vector.h>
+#include "reference_calc.cpp"
 
-void your_blend(const uchar4* const h_sourceImg,  //IN
-                const size_t numRowsSource, const size_t numColsSource,
-                const uchar4* const h_destImg, //IN
-                uchar4* const h_blendedImg) //OUT
-{
 
-  /* To Recap here are the steps you need to implement
-  
-     1) Compute a mask of the pixels from the source image to be copied
-        The pixels that shouldn't be copied are completely white, they
-        have R=255, G=255, B=255.  Any other pixels SHOULD be copied.
 
-     2) Compute the interior and border regions of the mask.  An interior
-        pixel has all 4 neighbors also inside the mask.  A border pixel is
-        in the mask itself, but has at least one neighbor that isn't.
+__global__ void computeMask(
+  const uchar4* sourceImg, unsigned char* mask, const int numRowsSource, const int numColsSource)
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if(index >= numRowsSource * numColsSource) return;
 
-     3) Separate out the incoming image into three separate channels
+  mask[index] = ((sourceImg[index].x + sourceImg[index].y + sourceImg[index].z) < 765) ? 1: 0;
+}
 
-     4) Create two float(!) buffers for each color channel that will
-        act as our guesses.  Initialize them to the respective color
-        channel of the source image since that will act as our intial guess.
+__global__ void  computerInteriorAndBorder(
+    unsigned char *mask,
+    unsigned char *interior,
+    unsigned char *border,
+    const int numRowsSource,
+    const int numColsSource)
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if(index >= numRowsSource * numColsSource) return;
+
+  if (index < numColsSource || index < (numColsSource*numRowsSource)-numColsSource-1) // check for access at image borders (top / bottom &? left/right)
+  {
+    interior = border = 0;
+    return;
+  }
+
+  int neighborSum = mask[index-1] + mask[index+1] + mask[index - numRowsSource] + mask[index + numRowsSource];
+  if (neighborSum == 4)
+  {
+    interior[index] = 1;
+    border[index] = 0;
+  } 
+  else if(neighborSum == 3)
+  {
+    interior[index] = 0;
+    border[index] = 1;
+  }
+  else
+  {
+    interior = border = 0;
+  }
+}
 
-     5) For each color channel perform the Jacobi iteration described 
-        above 800 times.
+__global__ void separate_channels(
+  uchar4* sourceImg,
+  unsigned char* red,
+  unsigned char* green,
+  unsigned char* blue,
+  const int numRowsSource,
+  const int numColsSource)
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index >= numColsSource * numRowsSource) return;
 
-     6) Create the output image by replacing all the interior pixels
-        in the destination image with the result of the Jacobi iterations.
-        Just cast the floating point values to unsigned chars since we have
-        already made sure to clamp them to the correct range.
+  red[index] = sourceImg[index].x;
+  green[index] = sourceImg[index].z;
+  blue[index] = sourceImg[index].y;
+}
 
-      Since this is final assignment we provide little boilerplate code to
-      help you.  Notice that all the input/output pointers are HOST pointers.
+__global__ void initialize(
+  unsigned char* source,
+  float* blended,
+  const int numRowsSource,
+  const int numColsSource)
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index >= numColsSource * numRowsSource) return;
 
-      You will have to allocate all of your own GPU memory and perform your own
-      memcopies to get data in and out of the GPU memory.
+  blended[index] = static_cast<float>(source[index]);
+}
 
-      Remember to wrap all of your calls with checkCudaErrors() to catch any
-      thing that might go wrong.  After each kernel call do:
+void your_blend(const uchar4* const h_sourceImg,  //IN
+                const size_t numRowsSource, const size_t numColsSource,
+                const uchar4* const h_destImg, //IN
+                uchar4* const h_blendedImg) //OUT
+{  
+  uchar4        *d_sourceImg, *d_destImg, *d_blendedImg;
+  unsigned char *d_mask, *d_border, *d_interior;
+  unsigned char *d_red, *d_green, *d_blue;
+
+  checkCudaErrors(cudaMalloc(&d_sourceImg,  sizeof(uchar4) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_destImg,    sizeof(uchar4) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blendedImg, sizeof(uchar4) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_mask,       sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_border,     sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_interior,   sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_red,        sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_green,      sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blue,       sizeof(unsigned char) * numColsSource * numRowsSource));
+
+  checkCudaErrors(cudaMemcpy(d_sourceImg, h_sourceImg,  sizeof(uchar4) * numColsSource * numRowsSource, cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(d_destImg,   h_destImg,    sizeof(uchar4) * numColsSource * numRowsSource, cudaMemcpyHostToDevice));
+
+  // 1) Compute a mask of the pixels from the source image to be copied
+  //    The pixels that shouldn't be copied are completely white, they
+  //    have R=255, G=255, B=255.  Any other pixels SHOULD be copied.
+
+  const int maxThreadsPerBlock = 1024;
+  int threadsPerBlock = maxThreadsPerBlock;
+  int blocks = (numRowsSource * numColsSource + threadsPerBlock - 1) / threadsPerBlock;
+
+  computeMask<<<blocks, threadsPerBlock>>>(d_sourceImg, d_mask, numRowsSource, numColsSource);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  // 2) Compute the interior and border regions of the mask.  An interior
+  //    pixel has all 4 neighbors also inside the mask.  A border pixel is
+  //    in the mask itself, but has at least one neighbor that isn't.
+
+  computerInteriorAndBorder<<<blocks, threadsPerBlock>>>(
+    d_mask, d_interior, d_border, numRowsSource, numColsSource);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  // 3) Separate out the incoming image into three separate channels
+
+  separate_channels<<<blocks, threadsPerBlock>>>(d_sourceImg, d_red, d_green, d_blue, numRowsSource, numColsSource);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  // 4) Create two float(!) buffers for each color channel that will
+  //    act as our guesses.  Initialize them to the respective color
+  //    channel of the source image since that will act as our intial guess.
+
+  float *d_blendedRed1;
+  float *d_blendedRed2;
+  float *d_blendedGreen1;
+  float *d_blendedGreen2;
+  float *d_blendedBlue1;
+  float *d_blendedBlue2;
+  checkCudaErrors(cudaMalloc(&d_blendedRed1,    sizeof(float) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blendedRed2,    sizeof(float) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blendedGreen1,  sizeof(float) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blendedGreen2,  sizeof(float) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blendedBlue1,   sizeof(float) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blendedBlue2,   sizeof(float) * numColsSource * numRowsSource));
+
+  initialize<<<blocks, threadsPerBlock>>>(d_red,    d_blendedRed1,    numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_red,    d_blendedRed2,    numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_green,  d_blendedGreen1,  numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_green,  d_blendedGreen2,  numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_blue,   d_blendedBlue1,   numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_blue,   d_blendedBlue2,   numRowsSource, numColsSource);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  // 5) For each color channel perform the Jacobi iteration described 
+  //    above 800 times.
+
+  // 6) Create the output image by replacing all the interior pixels
+  //    in the destination image with the result of the Jacobi iterations.
+  //    Just cast the floating point values to unsigned chars since we have
+  //    already made sure to clamp them to the correct range.
+
+  /* The reference calculation is provided below, feel free to use it
+     for debugging purposes. 
+  */
 
-      cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  /**
+  uchar4* h_reference = new uchar4[srcSize];
+  reference_calc(h_sourceImg, numRowsSource, numColsSource,
+                 h_destImg, h_reference);
 
-      to catch any errors that happened while executing the kernel.
-  */
+  checkResultsEps((unsigned char *)h_reference, (unsigned char *)h_blendedImg, 4 * srcSize, 2, .01);
+  delete[] h_reference;
+  /**/
 }

From ed8fe428f648c6608a9787baebb7c8f127af2fec Mon Sep 17 00:00:00 2001
From: Johannes Deselaers <deselaers.johannes@gmail.com>
Date: Wed, 20 Aug 2014 12:44:12 -0700
Subject: [PATCH 6/6] solution for ps6

---
 Problem Sets/Problem Set 6/student_func.cu | 132 ++++++++++++++++++---
 1 file changed, 117 insertions(+), 15 deletions(-)

diff --git a/Problem Sets/Problem Set 6/student_func.cu b/Problem Sets/Problem Set 6/student_func.cu
index 9a32925e..544cca1d 100755
--- a/Problem Sets/Problem Set 6/student_func.cu	
+++ b/Problem Sets/Problem Set 6/student_func.cu	
@@ -64,9 +64,6 @@
 
 #include "utils.h"
 #include <thrust/host_vector.h>
-#include "reference_calc.cpp"
-
-
 
 __global__ void computeMask(
   const uchar4* sourceImg, unsigned char* mask, const int numRowsSource, const int numColsSource)
@@ -110,7 +107,7 @@ __global__ void  computerInteriorAndBorder(
   }
 }
 
-__global__ void separate_channels(
+__global__ void separateChannels(
   uchar4* sourceImg,
   unsigned char* red,
   unsigned char* green,
@@ -138,6 +135,55 @@ __global__ void initialize(
   blended[index] = static_cast<float>(source[index]);
 }
 
+__global__ void performIteration(
+  float * currentGuess,
+  float * nextGuess,
+  unsigned char * sourceChannel,
+  unsigned char * destChannel,
+  unsigned char * interiorMask,
+  unsigned char * borderMask,
+  const int numRowsSource,
+  const int numColsSource)
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index >= numColsSource * numRowsSource) return;
+  if (interiorMask[index] == 0) return;
+
+  float sum1 = 0;
+  sum1 += interiorMask[index-numColsSource] * currentGuess[index-numColsSource] + borderMask[index-numColsSource] * destChannel[index-numColsSource];
+  sum1 += interiorMask[index+numColsSource] * currentGuess[index+numColsSource] + borderMask[index+numColsSource] * destChannel[index+numColsSource];
+  sum1 += interiorMask[index-1] * currentGuess[index-1] + borderMask[index-1] * destChannel[index-1];
+  sum1 += interiorMask[index+1] * currentGuess[index+1] + borderMask[index+1] * destChannel[index+1];
+
+
+  float sum2 = 0;
+  sum2 += interiorMask[index] * (sourceChannel[index] - sourceChannel[index-numColsSource]);
+  sum2 += interiorMask[index] * (sourceChannel[index] - sourceChannel[index+numColsSource]); 
+  sum2 += interiorMask[index] * (sourceChannel[index] - sourceChannel[index-1]);
+  sum2 += interiorMask[index] * (sourceChannel[index] - sourceChannel[index+1]);
+
+  float newVal = (sum1 + sum2) / 4.f;
+  nextGuess[index] = min(255.f, max(0.f, newVal));
+}
+
+__global__ void createOutput(
+    uchar4 * blendedImg,
+    float *finalGuessRed,
+    float *finalGuessGreen,
+    float *finalGuessBlue,
+    unsigned char *interiorMask,
+    const int numRowsSource,
+    const int numColsSource)
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index >= numColsSource * numRowsSource) return;
+  if (!(interiorMask[index]))  return;
+  
+  blendedImg[index].x = static_cast<unsigned char>(finalGuessRed[index]);
+  blendedImg[index].y = static_cast<unsigned char>(finalGuessGreen[index]);
+  blendedImg[index].z = static_cast<unsigned char>(finalGuessBlue[index]);
+}
+
 void your_blend(const uchar4* const h_sourceImg,  //IN
                 const size_t numRowsSource, const size_t numColsSource,
                 const uchar4* const h_destImg, //IN
@@ -145,7 +191,9 @@ void your_blend(const uchar4* const h_sourceImg,  //IN
 {  
   uchar4        *d_sourceImg, *d_destImg, *d_blendedImg;
   unsigned char *d_mask, *d_border, *d_interior;
-  unsigned char *d_red, *d_green, *d_blue;
+  unsigned char *d_redSrc, *d_greenSrc, *d_blueSrc;
+  unsigned char *d_redDst, *d_greenDst, *d_blueDst;
+
 
   checkCudaErrors(cudaMalloc(&d_sourceImg,  sizeof(uchar4) * numColsSource * numRowsSource));
   checkCudaErrors(cudaMalloc(&d_destImg,    sizeof(uchar4) * numColsSource * numRowsSource));
@@ -153,9 +201,12 @@ void your_blend(const uchar4* const h_sourceImg,  //IN
   checkCudaErrors(cudaMalloc(&d_mask,       sizeof(unsigned char) * numColsSource * numRowsSource));
   checkCudaErrors(cudaMalloc(&d_border,     sizeof(unsigned char) * numColsSource * numRowsSource));
   checkCudaErrors(cudaMalloc(&d_interior,   sizeof(unsigned char) * numColsSource * numRowsSource));
-  checkCudaErrors(cudaMalloc(&d_red,        sizeof(unsigned char) * numColsSource * numRowsSource));
-  checkCudaErrors(cudaMalloc(&d_green,      sizeof(unsigned char) * numColsSource * numRowsSource));
-  checkCudaErrors(cudaMalloc(&d_blue,       sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_redSrc,     sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_greenSrc,   sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blueSrc,    sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_redDst,     sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_greenDst,   sizeof(unsigned char) * numColsSource * numRowsSource));
+  checkCudaErrors(cudaMalloc(&d_blueDst,    sizeof(unsigned char) * numColsSource * numRowsSource));
 
   checkCudaErrors(cudaMemcpy(d_sourceImg, h_sourceImg,  sizeof(uchar4) * numColsSource * numRowsSource, cudaMemcpyHostToDevice));
   checkCudaErrors(cudaMemcpy(d_destImg,   h_destImg,    sizeof(uchar4) * numColsSource * numRowsSource, cudaMemcpyHostToDevice));
@@ -181,7 +232,8 @@ void your_blend(const uchar4* const h_sourceImg,  //IN
 
   // 3) Separate out the incoming image into three separate channels
 
-  separate_channels<<<blocks, threadsPerBlock>>>(d_sourceImg, d_red, d_green, d_blue, numRowsSource, numColsSource);
+  separateChannels<<<blocks, threadsPerBlock>>>(d_sourceImg, d_redSrc, d_greenSrc, d_blueSrc, numRowsSource, numColsSource);
+  separateChannels<<<blocks, threadsPerBlock>>>(d_destImg, d_redDst, d_greenDst, d_blueDst, numRowsSource, numColsSource);
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 
   // 4) Create two float(!) buffers for each color channel that will
@@ -201,21 +253,71 @@ void your_blend(const uchar4* const h_sourceImg,  //IN
   checkCudaErrors(cudaMalloc(&d_blendedBlue1,   sizeof(float) * numColsSource * numRowsSource));
   checkCudaErrors(cudaMalloc(&d_blendedBlue2,   sizeof(float) * numColsSource * numRowsSource));
 
-  initialize<<<blocks, threadsPerBlock>>>(d_red,    d_blendedRed1,    numRowsSource, numColsSource);
-  initialize<<<blocks, threadsPerBlock>>>(d_red,    d_blendedRed2,    numRowsSource, numColsSource);
-  initialize<<<blocks, threadsPerBlock>>>(d_green,  d_blendedGreen1,  numRowsSource, numColsSource);
-  initialize<<<blocks, threadsPerBlock>>>(d_green,  d_blendedGreen2,  numRowsSource, numColsSource);
-  initialize<<<blocks, threadsPerBlock>>>(d_blue,   d_blendedBlue1,   numRowsSource, numColsSource);
-  initialize<<<blocks, threadsPerBlock>>>(d_blue,   d_blendedBlue2,   numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_redSrc,    d_blendedRed1,    numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_redDst,    d_blendedRed2,    numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_greenSrc,  d_blendedGreen1,  numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_greenDst,  d_blendedGreen2,  numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_blueSrc,   d_blendedBlue1,   numRowsSource, numColsSource);
+  initialize<<<blocks, threadsPerBlock>>>(d_blueDst,   d_blendedBlue2,   numRowsSource, numColsSource);
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 
   // 5) For each color channel perform the Jacobi iteration described 
   //    above 800 times.
+  const int iterations(800);
+  for(int i = 0; i < iterations; ++i)
+  {
+    performIteration<<<blocks, threadsPerBlock>>>(
+      d_blendedRed1,d_blendedRed2, d_redSrc, d_redDst, d_interior, d_border, numRowsSource, numColsSource);
+    performIteration<<<blocks, threadsPerBlock>>>(
+      d_blendedGreen1,d_blendedGreen2, d_greenSrc, d_greenDst, d_interior, d_border, numRowsSource, numColsSource);
+    performIteration<<<blocks, threadsPerBlock>>>(
+      d_blendedBlue1,d_blendedBlue2, d_blueSrc, d_blueDst, d_interior, d_border, numRowsSource, numColsSource);
+    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+    std::swap(d_blendedRed1, d_blendedRed2);
+    std::swap(d_blendedGreen1, d_blendedGreen2);
+    std::swap(d_blendedBlue1, d_blendedBlue2);
+  }
+  // even number of iterations, so swap once more
+  std::swap(d_blendedRed1, d_blendedRed2);
+  std::swap(d_blendedGreen1, d_blendedGreen2);
+  std::swap(d_blendedBlue1, d_blendedBlue2);
 
   // 6) Create the output image by replacing all the interior pixels
   //    in the destination image with the result of the Jacobi iterations.
   //    Just cast the floating point values to unsigned chars since we have
   //    already made sure to clamp them to the correct range.
+  checkCudaErrors(cudaMemcpy(d_blendedImg, d_destImg, sizeof(uchar4) * numRowsSource * numColsSource, cudaMemcpyDeviceToDevice));
+
+  createOutput<<<blocks, threadsPerBlock>>>(
+    d_blendedImg, d_blendedRed2, d_blendedGreen2, d_blendedBlue2, d_interior, numRowsSource, numColsSource);
+
+  checkCudaErrors(cudaMemcpy(h_blendedImg, d_blendedImg, sizeof(uchar4) * numRowsSource * numColsSource,cudaMemcpyDeviceToHost));
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+  //
+  // DONT FORGET TO FREE ALL OF THE MEMORY !!!
+  //
+  checkCudaErrors(cudaFree(d_sourceImg));
+  checkCudaErrors(cudaFree(d_destImg));
+  checkCudaErrors(cudaFree(d_blendedImg));
+  checkCudaErrors(cudaFree(d_mask));
+  checkCudaErrors(cudaFree(d_border));
+  checkCudaErrors(cudaFree(d_interior));
+  checkCudaErrors(cudaFree(d_redSrc));
+  checkCudaErrors(cudaFree(d_redDst));
+  checkCudaErrors(cudaFree(d_greenSrc));
+  checkCudaErrors(cudaFree(d_greenDst));
+  checkCudaErrors(cudaFree(d_blueSrc));
+  checkCudaErrors(cudaFree(d_blueDst));
+  checkCudaErrors(cudaFree(d_blendedRed1));
+  checkCudaErrors(cudaFree(d_blendedRed2));
+  checkCudaErrors(cudaFree(d_blendedGreen1));
+  checkCudaErrors(cudaFree(d_blendedGreen2));
+  checkCudaErrors(cudaFree(d_blendedBlue1));
+  checkCudaErrors(cudaFree(d_blendedBlue2));
+    
+
 
   /* The reference calculation is provided below, feel free to use it
      for debugging purposes.