diff --git a/README.md b/README.md
index 22d2f34..dcbacaa 100644
--- a/README.md
+++ b/README.md
@@ -3,346 +3,100 @@ CUDA Rasterizer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
-
-
-Instructions (delete me)
-========================
-
-This is due Sunday, October 11, evening at midnight.
-
-**Summary:** 
-In this project, you will use CUDA to implement a simplified
-rasterized graphics pipeline, similar to the OpenGL pipeline. You will
-implement vertex shading, primitive assembly, rasterization, fragment shading,
-and a framebuffer. More information about the rasterized graphics pipeline can
-be found in the class slides and in the CIS 560 lecture notes.
-
-The base code provided includes an OBJ loader and much of the I/O and
-bookkeeping code. It also includes some functions that you may find useful,
-described below. The core rasterization pipeline is left for you to implement.
-
-You are not required to use this base code if you don't want
-to. You may also change any part of the base code as you please.
-**This is YOUR project.**
-
-**Recommendation:**
-Every image you save should automatically get a different
-filename. Don't delete all of them! For the benefit of your README, keep a
-bunch of them around so you can pick a few to document your progress.
-
-
-### Contents
-
-* `src/` C++/CUDA source files.
-* `util/` C++ utility files.
-* `objs/` Example OBJ test files (# verts, # tris in buffers after loading)
-  * `tri.obj` (3v, 1t): The simplest possible geometric object.
-  * `cube.obj` (36v, 12t): A small model with low depth-complexity.
-  * `suzanne.obj` (2904 verts, 968 tris): A medium model with low depth-complexity.
-  * `suzanne_smooth.obj` (2904 verts, 968 tris): A medium model with low depth-complexity.
-    This model has normals which must be interpolated.
-  * `cow.obj` (17412 verts, 5804 tris): A large model with low depth-complexity.
-  * `cow_smooth.obj` (17412 verts, 5804 tris): A large model with low depth-complexity.
-    This model has normals which must be interpolated.
-  * `flower.obj` (1920 verts, 640 tris): A medium model with very high depth-complexity.
-  * `sponza.obj` (837,489 verts, 279,163 tris): A huge model with very high depth-complexity.
-* `renders/` Debug render of an example OBJ.
-* `external/` Includes and static libraries for 3rd party libraries.
-
-### Running the code
-
-The main function requires a scene description file. Call the program with
-one as an argument: `cis565_rasterizer objs/cow.obj`.
-(In Visual Studio, `../objs/cow.obj`.)
-
-If you are using Visual Studio, you can set this in the Debugging > Command
-Arguments section in the Project properties. Note that this value is different
-for every different configuration type. Make sure you get the path right; read
-the console for errors.
-
-## Requirements
-
-**Ask on the mailing list for any clarifications.**
-
-In this project, you are given the following code:
-
-* A library for loading standard Alias/Wavefront `.obj` format mesh
-  files and converting them to OpenGL-style buffers of index and vertex data.
-  * This library does NOT read materials, and provides all colors as white by
-    default. You can use another library if you wish.
-* Simple structs for some parts of the pipeline.
-* Depth buffer to framebuffer copy.
-* CUDA-GL interop.
-
-You will need to implement the following features/pipeline stages:
-
-* Vertex shading.
-* (Vertex shader) perspective transformation.
-* Primitive assembly with support for triangles read from buffers of index and
-  vertex data.
-* Rasterization.
-* Fragment shading.
-* A depth buffer for storing and depth testing fragments.
-* Fragment to depth buffer writing (**with** atomics for race avoidance).
-* (Fragment shader) simple lighting scheme, such as Lambert or Blinn-Phong.
-
-See below for more guidance.
-
-You are also required to implement at least "3.0" points in extra features.
-(the parenthesized numbers must add to 3.0 or more):
-
-* (1.0) Tile-based pipeline.
-* Additional pipeline stages.
-   * (1.0) Tessellation shader.
-   * (1.0) Geometry shader, able to output a variable number of primitives per
-     input primitive, optimized using stream compaction (thrust allowed).
-   * (0.5 **if not doing geometry shader**) Backface culling, optimized using
-     stream compaction (thrust allowed).
-   * (1.0) Transform feedback.
-   * (0.5) Scissor test.
-   * (0.5) Blending (when writing into framebuffer).
-* (1.0) Instancing: draw one set of vertex data multiple times, each run
-  through the vertex shader with a different ID.
-* (0.5) Correct color interpolation between points on a primitive.
-* (1.0) UV texture mapping with bilinear texture filtering and perspective
-  correct texture coordinates.
-* Support for rasterizing additional primitives:
-   * (0.5) Lines or line strips.
-   * (0.5) Points.
-* (1.0) Anti-aliasing.
-* (1.0) Occlusion queries.
-* (1.0) Order-independent translucency using a k-buffer.
-* (0.5) **Mouse**-based interactive camera support.
-
-This extra feature list is not comprehensive. If you have a particular idea
-you would like to implement, please **contact us first**.
-
-**IMPORTANT:**
-For each extra feature, please provide the following brief analysis:
-
-* Concise overview write-up of the feature.
-* Performance impact of adding the feature (slower or faster).
-* If you did something to accelerate the feature, what did you do and why?
-* How might this feature be optimized beyond your current implementation?
-
-
-## Base Code Tour
-
-You will be working primarily in two files: `rasterize.cu`, and
-`rasterizeTools.h`. Within these files, areas that you need to complete are
-marked with a `TODO` comment. Areas that are useful to and serve as hints for
-optional features are marked with `TODO (Optional)`. Functions that are useful
-for reference are marked with the comment `CHECKITOUT`. **You should look at
-all TODOs and CHECKITOUTs before starting!** There are not many.
-
-* `src/rasterize.cu` contains the core rasterization pipeline. 
-  * A few pre-made structs are included for you to use, but those marked with
-    TODO will also be needed for a simple rasterizer. As with any part of the
-    base code, you may modify or replace these as you see fit.
-
-* `src/rasterizeTools.h` contains various useful tools
-  * Includes a number of barycentric coordinate related functions that you may
-    find useful in implementing scanline based rasterization.
-
-* `util/utilityCore.hpp` serves as a kitchen-sink of useful functions.
-
-
-## Rasterization Pipeline
-
-Possible pipelines are described below. Pseudo-type-signatures are given.
-Not all of the pseudocode arrays will necessarily actually exist in practice.
-
-### First-Try Pipeline
-
-This describes a minimal version of *one possible* graphics pipeline, similar
-to modern hardware (DX/OpenGL). Yours need not match precisely.  To begin, try
-to write a minimal amount of code as described here. Verify some output after
-implementing each pipeline step. This will reduce the necessary time spent
-debugging.
-
-Start out by testing a single triangle (`tri.obj`).
-
-* Clear the depth buffer with some default value.
-* Vertex shading: 
-  * `VertexIn[n] vs_input -> VertexOut[n] vs_output`
-  * A minimal vertex shader will apply no transformations at all - it draws
-    directly in normalized device coordinates (-1 to 1 in each dimension).
-* Primitive assembly.
-  * `VertexOut[n] vs_output -> Triangle[n/3] primitives`
-  * Start by supporting ONLY triangles. For a triangle defined by indices
-    `(a, b, c)` into `VertexOut` array `vo`, simply copy the appropriate values
-    into a `Triangle` object `(vo[a], vo[b], vo[c])`.
-* Rasterization.
-  * `Triangle[n/3] primitives -> FragmentIn[m] fs_input`
-  * A scanline implementation is simpler to start with.
-* Fragment shading.
-  * `FragmentIn[m] fs_input -> FragmentOut[m] fs_output`
-  * A super-simple test fragment shader: output same color for every fragment.
-    * Also try displaying various debug views (normals, etc.)
-* Fragments to depth buffer.
-  * `FragmentOut[m] -> FragmentOut[width][height]`
-  * Results in race conditions - don't bother to fix these until it works!
-  * Can really be done inside the fragment shader, if you call the fragment
-    shader from the rasterization kernel for every fragment (including those
-    which get occluded). **OR,** this can be done before fragment shading, which
-    may be faster but means the fragment shader cannot change the depth.
-* A depth buffer for storing and depth testing fragments.
-  * `FragmentOut[width][height] depthbuffer`
-  * An array of `fragment` objects.
-  * At the end of a frame, it should contain the fragments drawn to the screen.
-* Fragment to framebuffer writing.
-  * `FragmentOut[width][height] depthbuffer -> vec3[width][height] framebuffer`
-  * Simply copies the colors out of the depth buffer into the framebuffer
-    (to be displayed on the screen).
-
-### A Useful Pipeline
-
-* Clear the depth buffer with some default value.
-* Vertex shading: 
-  * `VertexIn[n] vs_input -> VertexOut[n] vs_output`
-  * Apply some vertex transformation (e.g. model-view-projection matrix using
-    `glm::lookAt ` and `glm::perspective `).
-* Primitive assembly.
-  * `VertexOut[n] vs_output -> Triangle[n/3] primitives`
-  * As above.
-  * Other primitive types are optional.
-* Rasterization.
-  * `Triangle[n/3] primitives -> FragmentIn[m] fs_input`
-  * You may choose to do a tiled rasterization method, which should have lower
-    global memory bandwidth.
-  * A scanline optimization: when rasterizing a triangle, only scan over the
-    box around the triangle (`getAABBForTriangle`).
-* Fragment shading.
-  * `FragmentIn[m] fs_input -> FragmentOut[m] fs_output`
-  * Add a shading method, such as Lambert or Blinn-Phong. Lights can be defined
-    by kernel parameters (like GLSL uniforms).
-* Fragments to depth buffer.
-  * `FragmentOut[m] -> FragmentOut[width][height]`
-  * Can really be done inside the fragment shader, if you call the fragment
-    shader from the rasterization kernel for every fragment (including those
-    which get occluded). **OR,** this can be done before fragment shading, which
-    may be faster but means the fragment shader cannot change the depth.
-    * This result in an optimization: it allows you to do depth tests before
-     spending execution time in complex fragment shader code!
-  * Handle race conditions! Since multiple primitives write fragments to the
-    same fragment in the depth buffer, races must be avoided by using CUDA
-    atomics.
-    * *Approach 1:* Lock the location in the depth buffer during the time that
-      a thread is comparing old and new fragment depths (and possibly writing
-      a new fragment). This should work in all cases, but be slower.
-    * *Approach 2:* Convert your depth value to a fixed-point `int`, and use
-      `atomicMin` to store it into an `int`-typed depth buffer `intdepth`. After
-      that, the value which is stored at `intdepth[i]` is (usually) that of the
-      fragment which should be stored into the `fragment` depth buffer.
-      * This may result in some rare race conditions (e.g. across blocks).
-    * The `flower.obj` test file is good for testing race conditions.
-* A depth buffer for storing and depth testing fragments.
-  * `FragmentOut[width][height] depthbuffer`
-  * An array of `fragment` objects.
-  * At the end of a frame, it should contain the fragments drawn to the screen.
-* Fragment to framebuffer writing.
-  * `FragmentOut[width][height] depthbuffer -> vec3[width][height] framebuffer`
-  * Simply copies the colors out of the depth buffer into the framebuffer
-    (to be displayed on the screen).
-
-This is a suggested sequence of pipeline steps, but you may choose to alter the
-order of this sequence or merge entire kernels as you see fit.  For example, if
-you decide that doing has benefits, you can choose to merge the vertex shader
-and primitive assembly kernels, or merge the perspective transform into another
-kernel. There is not necessarily a right sequence of kernels, and you may
-choose any sequence that works.  Please document in your README what sequence
-you choose and why.
-
-
-## Resources
-
-The following resources may be useful for this project:
-
-* High-Performance Software Rasterization on GPUs:
-  * [Paper (HPG 2011)](http://www.tml.tkk.fi/~samuli/publications/laine2011hpg_paper.pdf)
-  * [Code](http://code.google.com/p/cudaraster/)
-  * Note that looking over this code for reference with regard to the paper is
-    fine, but we most likely will not grant any requests to actually
-    incorporate any of this code into your project.
-  * [Slides](http://bps11.idav.ucdavis.edu/talks/08-gpuSoftwareRasterLaineAndPantaleoni-BPS2011.pdf)
-* The Direct3D 10 System (SIGGRAPH 2006) - for those interested in doing
-  geometry shaders and transform feedback:
-  * [Paper](http://dl.acm.org/citation.cfm?id=1141947)
-  * [Paper, through Penn Libraries proxy](http://proxy.library.upenn.edu:2247/citation.cfm?id=1141947)
-* Multi-Fragment Eﬀects on the GPU using the k-Buﬀer - for those who want to do
-  order-independent transparency using a k-buffer:
-  * [Paper](http://www.inf.ufrgs.br/~comba/papers/2007/kbuffer_preprint.pdf)
-* FreePipe: A Programmable, Parallel Rendering Architecture for Efficient
-  Multi-Fragment Effects (I3D 2010):
-  * [Paper](https://sites.google.com/site/hmcen0921/cudarasterizer)
-* Writing A Software Rasterizer In Javascript:
-  * [Part 1](http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-1.html)
-  * [Part 2](http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-2.html)
-
-
-## Third-Party Code Policy
-
-* Use of any third-party code must be approved by asking on our Google Group.
-* If it is approved, all students are welcome to use it. Generally, we approve
-  use of third-party code that is not a core part of the project. For example,
-  for the path tracer, we would approve using a third-party library for loading
-  models, but would not approve copying and pasting a CUDA function for doing
-  refraction.
-* Third-party code **MUST** be credited in README.md.
-* Using third-party code without its approval, including using another
-  student's code, is an academic integrity violation, and will, at minimum,
-  result in you receiving an F for the semester.
-
-
-## README
-
-Replace the contents of this README.md in a clear manner with the following:
-
-* A brief description of the project and the specific features you implemented.
-* At least one screenshot of your project running.
-* A 30 second or longer video of your project running.
-* A performance analysis (described below).
+* Levi Cai
+* Tested on: Windows 8, i7-5500U @ 2.4GHz, 12GB, NVidia GeForce 940M 2GB
+
+Attack of the cowwwssss!
+
+![](renders/cow_instancing.PNG)
+
+### Graphics Pipeline
+
+This is an implementation of a CUDA-based graphics pipeline with the following stages/features:
+
+* Vertex shading (Apply transformations to vertices)
+* Instancing (Create multiple sets of vertices according to new transformations)
+* Primitive Assembly (Assemble vertices into triangles/primitives)
+* Rasterization with Anti-Aliasing (FSAA using fixed-pattern supersampling) (Convert triangles to fragments)
+* Fragment shading (Determine lighting for fragments)
+* Fragment to Frame buffer transfer (Fragments into pixels, AA as needed)
+* Mouse control
+* Color interpolation on primitive surfaces
+
+### Feature Demos
+
+## Short film demonstrating mouse control
+
+https://www.youtube.com/watch?v=1zS9fQGLfO8
+
+## Proper normals
+
+Simple illustration of proper handling of normals during Model-world transformations.
+
+![](renders/cows_normals.PNG)
+
+## Half-done Geometry Shader
+
+This was an initial attempt at a Geometry Shader stage for the pipeline. The input was
+triangle vertices, and for each vertex, output an additional triangle (1 prim -> 3 prim).
+However, I was unable to finish with this stage due to time contraints.
+
+![](renders/cow_initial_GS.PNG)
+
+## Depth-Testing
+
+Simple illustration of a properly depth-tested image (without race conditions) and what happens when the
+depth test has been reversed (the incorrect image).
+
+Correct:
+
+![](renders/depth_test_flower.PNG)
+
+Incorrect:
+
+![](renders/render_gone_wrong.PNG)
+
+## Color Interpolation
+
+Each vertex of each primitive was set to a different color to illustrate proper interpolation of colors, normals can be visualized in a similar manner. These were computed in the rasterization step by using the barycentric coordinate of each fragment as the weight of the color of each associated vertex, which were then summed together to get the fragment color.
+
+![](renders/cow_color_interpolation.PNG)
+
+## Instancing
+
+Instancing was done in the vertex shading stage. This required generating multiple sets of model-view-perspective transformations (one for each instance) and then generating separate sets of vertices from the original vertices according to their respective transformations. 1 thread per in-vertex was used and for-looped among the model-view-perspective transformations in order to create multiple sets of vertices, as well as duplicating the indices so that vertices could be properly assembled into primitives. An optimization may be to use 1 thread per out-vertex instead.
+
+5x cows with translation and rotations.
+
+![](renders/cow_instancing.PNG)
+
+## Anti-Aliasing
+
+Anti-Aliasing was accomplished by super sampling each pixel according to a fixed grid pattern that is variable-adjustable. This is accomplished in the rasterization portion of the pipeline. The fragments are then combined into single pixels via averaging in the Fragment to Buffer stage of the pipeline.
+
+3x super sampling results.
+
+![](renders/anti_aliasing_cow_3_v_1.PNG)
 
 ### Performance Analysis
 
-The performance analysis is where you will investigate how to make your CUDA
-programs more efficient using the skills you've learned in class. You must have
-performed at least one experiment on your code to investigate the positive or
-negative effects on performance. 
+##Comparison of size of triangles vs. FPS
 
-We encourage you to get creative with your tweaks. Consider places in your code
-that could be considered bottlenecks and try to improve them. 
+This comparison illustrates the bottleneck of the rasterization portion of the pipeline.
+As triangles get nearer to the camera (effectively, larger), then each thread must spend 
+more time rasterizing. One possible method of reducing this is to compare the number of
+primitives to be rasterized vs. the size of the primitives on screen. If the ratio of size to 
+number is large, then instead of launching 1 thread per primitive, then launch one thread per
+fragment and depth test sequentially that way.
 
-Provide summary of your optimizations (no more than one page), along with
-tables and or graphs to visually explain any performance differences.
+![](renders/trisize_vs_fps.png.png)
 
-* Include a breakdown of time spent in each pipeline stage for a few different
-  models. It is suggested that you use pie charts or 100% stacked bar charts.
-* For optimization steps (like backface culling), include a performance
-  comparison to show the effectiveness.
+##Comparison of pipeline stages
 
+![](renders/pie_chart.png)
 
-## Submit
+##Performance Effects of super-sampled Anti-Aliasing
 
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
+It is quite clear that AAing in this manner is extremely costly as the samples per pixels increases.
 
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-   * **ADDITIONALLY:**
-     In the body of the pull request, include a link to your repository.
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project N: PENNKEY`.
-   * Direct link to your pull request on GitHub.
-   * Estimate the amount of time you spent on the project.
-   * If there were any outstanding problems, or if you did any extra
-     work, *briefly* explain.
-   * Feedback on the project itself, if any.
+![](renders/aa_vs_fps.png.png)
diff --git a/renders/aa_vs_fps.png.png b/renders/aa_vs_fps.png.png
new file mode 100644
index 0000000..21edf23
Binary files /dev/null and b/renders/aa_vs_fps.png.png differ
diff --git a/renders/anti_aliasing_3_v_1.PNG b/renders/anti_aliasing_3_v_1.PNG
new file mode 100644
index 0000000..2be493b
Binary files /dev/null and b/renders/anti_aliasing_3_v_1.PNG differ
diff --git a/renders/anti_aliasing_cow_3_v_1.PNG b/renders/anti_aliasing_cow_3_v_1.PNG
new file mode 100644
index 0000000..cf2813c
Binary files /dev/null and b/renders/anti_aliasing_cow_3_v_1.PNG differ
diff --git a/renders/cow_color_interpolation.PNG b/renders/cow_color_interpolation.PNG
new file mode 100644
index 0000000..cddad62
Binary files /dev/null and b/renders/cow_color_interpolation.PNG differ
diff --git a/renders/cow_initial_GS.PNG b/renders/cow_initial_GS.PNG
new file mode 100644
index 0000000..752626d
Binary files /dev/null and b/renders/cow_initial_GS.PNG differ
diff --git a/renders/cow_instancing.PNG b/renders/cow_instancing.PNG
new file mode 100644
index 0000000..b4ca31c
Binary files /dev/null and b/renders/cow_instancing.PNG differ
diff --git a/renders/cow_normals.PNG b/renders/cow_normals.PNG
new file mode 100644
index 0000000..f985683
Binary files /dev/null and b/renders/cow_normals.PNG differ
diff --git a/renders/cows_normals.PNG b/renders/cows_normals.PNG
new file mode 100644
index 0000000..19d8acc
Binary files /dev/null and b/renders/cows_normals.PNG differ
diff --git a/renders/depth_test_flower.PNG b/renders/depth_test_flower.PNG
new file mode 100644
index 0000000..4de508e
Binary files /dev/null and b/renders/depth_test_flower.PNG differ
diff --git a/renders/pie_chart.png b/renders/pie_chart.png
new file mode 100644
index 0000000..804f478
Binary files /dev/null and b/renders/pie_chart.png differ
diff --git a/renders/render_gone_wrong.PNG b/renders/render_gone_wrong.PNG
new file mode 100644
index 0000000..627665a
Binary files /dev/null and b/renders/render_gone_wrong.PNG differ
diff --git a/renders/trisize_vs_fps.png.png b/renders/trisize_vs_fps.png.png
new file mode 100644
index 0000000..877ecc8
Binary files /dev/null and b/renders/trisize_vs_fps.png.png differ
diff --git a/src/main.cpp b/src/main.cpp
index a125d7c..0e97ac5 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -7,6 +7,13 @@
  */
 
 #include "main.hpp"
+#include <cuda.h>
+#include <glm/gtc/constants.hpp>
+#include <glm/gtx/rotate_vector.hpp>
+
+static Cam cam;
+static bool camIsMobile;
+static glm::vec2 oldCursorPos;
 
 //-------------------------------
 //-------------MAIN--------------
@@ -78,7 +85,7 @@ void runCuda() {
     dptr = NULL;
 
     cudaGLMapBufferObject((void **)&dptr, pbo);
-    rasterize(dptr);
+    rasterize(dptr, cam);
     cudaGLUnmapBufferObject(pbo);
 
     frame++;
@@ -99,6 +106,17 @@ bool init(obj *mesh) {
 
     width = 800;
     height = 800;
+
+	cam.width = width;
+	cam.height = height;
+	cam.pos = glm::vec3(0.0f, 0.0f, 5.0f);
+	cam.focus = glm::vec3(0.0f, 0.0f, 0.0f);
+	cam.up = glm::vec3(0.0f, -1.0f, 0.0f);
+	cam.fovy = 45.0f * glm::pi<float>() / 180.0f;
+	cam.zNear = 0.1f;
+	cam.zFar = 100.0f;
+	cam.aspect = 1.0f;
+
     window = glfwCreateWindow(width, height, "CIS 565 Pathtracer", NULL, NULL);
     if (!window) {
         glfwTerminate();
@@ -106,6 +124,9 @@ bool init(obj *mesh) {
     }
     glfwMakeContextCurrent(window);
     glfwSetKeyCallback(window, keyCallback);
+	glfwSetScrollCallback(window, scrollCallback);
+	glfwSetCursorPosCallback(window, cursorCallback);
+	glfwSetMouseButtonCallback(window, mouseCallback);
 
     // Set up GL context
     glewExperimental = GL_TRUE;
@@ -270,7 +291,68 @@ void errorCallback(int error, const char *description) {
 }
 
 void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods) {
+
+	glm::vec3 camView = glm::normalize(cam.focus - cam.pos);
+	glm::vec3 camUp = cam.up;
+	glm::vec3 camRight = glm::cross(camView, camUp);
+
     if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
         glfwSetWindowShouldClose(window, GL_TRUE);
-    }
+	}
+	else if (key == GLFW_KEY_DOWN && action == GLFW_PRESS){
+		cam.pos += camUp * 0.1f;
+		cam.focus += camUp * 0.1f;
+	}
+	else if (key == GLFW_KEY_UP && action == GLFW_PRESS){
+		cam.pos += camUp * -0.1f;
+		cam.focus += camUp * -0.1f;
+	}
+	else if (key == GLFW_KEY_RIGHT && action == GLFW_PRESS){
+		cam.pos += camRight * -0.1f;
+		cam.focus += camRight * -0.1f;
+	}
+	else if (key == GLFW_KEY_LEFT && action == GLFW_PRESS){
+		cam.pos += camRight * 0.1f;
+		cam.focus += camRight * 0.1f;
+	}
+	// Reset
+	else if (key == GLFW_KEY_R && action == GLFW_PRESS){
+		cam.pos = glm::vec3(0.0, 0.0, 4.0);
+		cam.focus = glm::vec3(0.0);
+	}
 }
+
+void scrollCallback(GLFWwindow *window, double x_offset, double y_offset){
+	glm::vec3 camView = glm::normalize(cam.focus - cam.pos);
+	cam.pos += camView * (float)y_offset;
+	cam.focus += camView * (float)y_offset;
+}
+
+void mouseCallback(GLFWwindow *window, int button, int action, int mods){
+	if (button == GLFW_MOUSE_BUTTON_LEFT && action == GLFW_PRESS){
+		camIsMobile = true;
+	}
+	else {
+		camIsMobile = false;
+	}
+}
+
+void cursorCallback(GLFWwindow *window, double x_pos, double y_pos){
+	glm::vec3 camView = glm::normalize(cam.focus - cam.pos);
+	glm::vec3 camUp = cam.up;
+	glm::vec3 camRight = glm::cross(camView, camUp);
+
+	glm::vec3 rotatedView;
+
+	if (camIsMobile){
+		float x_diff = x_pos - oldCursorPos[0];
+		float y_diff = y_pos - oldCursorPos[1];
+
+		rotatedView = glm::rotate(camView, y_diff/100.0f, camRight);
+		rotatedView = glm::rotate(rotatedView, x_diff/100.0f, camUp);
+		cam.up = camUp;
+		cam.focus = cam.pos + glm::normalize(rotatedView);
+	}
+	oldCursorPos[0] = x_pos;
+	oldCursorPos[1] = y_pos;
+}
\ No newline at end of file
diff --git a/src/main.hpp b/src/main.hpp
index 49d3948..10b19de 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -93,3 +93,6 @@ void deleteTexture(GLuint *tex);
 void mainLoop();
 void errorCallback(int error, const char *description);
 void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods);
+void scrollCallback(GLFWwindow *window, double x_offset, double y_offset);
+void mouseCallback(GLFWwindow *window, int button, int action, int mods);
+void cursorCallback(GLFWwindow *window, double x_pos, double y_pos);
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 53103b5..51859de 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -14,32 +14,53 @@
 #include <thrust/random.h>
 #include <util/checkCUDAError.h>
 #include "rasterizeTools.h"
+#include <glm/gtc/matrix_transform.hpp>
+#include <glm/gtc/constants.hpp>
+#include <glm/gtc/matrix_inverse.hpp>
+#include <glm/gtc/constants.hpp>
 
-struct VertexIn {
-    glm::vec3 pos;
-    glm::vec3 nor;
-    glm::vec3 col;
-    // TODO (optional) add other vertex attributes (e.g. texture coordinates)
-};
-struct VertexOut {
-    // TODO
-};
-struct Triangle {
-    VertexOut v[3];
-};
-struct Fragment {
-    glm::vec3 color;
-};
+#define MAX_THREADS 128
+
+#define AA 1
+
+//TODO: Make this into a parameter of some kind, allow setting of scale/rot/trans
+#define NUM_INSTANCES 1
+
+static int iter;
 
 static int width = 0;
 static int height = 0;
 static int *dev_bufIdx = NULL;
+static int *dev_bufIdxOut = NULL;
 static VertexIn *dev_bufVertex = NULL;
+static VertexOut *dev_bufVertexOut = NULL;
 static Triangle *dev_primitives = NULL;
 static Fragment *dev_depthbuffer = NULL;
 static glm::vec3 *dev_framebuffer = NULL;
 static int bufIdxSize = 0;
 static int vertCount = 0;
+static int vertInCount = 0;
+static int vertOutCount = 0;
+static Light light;
+
+static int fragCount;
+static int primCount;
+static int numVertBlocks;
+static int numVertInBlocks;
+static int numVertOutBlocks;
+static int numPrimBlocks;
+static int numFragBlocks;
+
+static glm::mat4 Mpvms[NUM_INSTANCES];
+static glm::mat3 Mms[NUM_INSTANCES];
+
+static glm::mat4* dev_Mpvms;
+static glm::mat3* dev_Mms;
+
+//static Cam cam; 
+static glm::mat4 Mview;
+static glm::mat4 Mmod;
+static glm::mat4 Mproj;
 
 /**
  * Kernel that writes the image to the OpenGL PBO directly.
@@ -71,22 +92,48 @@ void render(int w, int h, Fragment *depthbuffer, glm::vec3 *framebuffer) {
     int index = x + (y * w);
 
     if (x < w && y < h) {
-        framebuffer[index] = depthbuffer[index].color;
+
+		int tlx = x*AA;
+		int tly = y*AA;
+
+		glm::vec3 color(0.0);
+
+		int sx, sy;
+		for (int i = 0; i < AA; i++){
+			for (int j = 0; j < AA; j++){
+				sx = tlx + i;
+				sy = tly + j;
+				color += depthbuffer[sx+sy*w*AA].color;
+			}
+		}
+
+		color /= AA*AA;
+
+        framebuffer[index] = color;
     }
 }
 
+__global__ void initDepths(int n, Fragment* depthbuffer){
+	int index = threadIdx.x + (blockDim.x*blockIdx.x);
+
+	if (index < n){
+		depthbuffer[index].fixed_depth = INT_MAX;
+	}
+}
+
 /**
  * Called once at the beginning of the program to allocate memory.
  */
 void rasterizeInit(int w, int h) {
     width = w;
     height = h;
-    cudaFree(dev_depthbuffer);
-    cudaMalloc(&dev_depthbuffer,   width * height * sizeof(Fragment));
-    cudaMemset(dev_depthbuffer, 0, width * height * sizeof(Fragment));
+
     cudaFree(dev_framebuffer);
     cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
     cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
+
+	light.pos = glm::vec3(3.0, 3.0, 3.0);
+	iter = 0;
     checkCUDAError("rasterizeInit");
 }
 
@@ -99,6 +146,24 @@ void rasterizeSetBuffers(
     bufIdxSize = _bufIdxSize;
     vertCount = _vertCount;
 
+	// Vertex shading
+	vertInCount = _vertCount;
+	vertOutCount = vertInCount * NUM_INSTANCES;
+	fragCount = width * height * AA * AA;
+	primCount = vertOutCount / 3;
+	numVertBlocks = (vertCount - 1) / MAX_THREADS + 1;
+	numVertInBlocks = (vertInCount - 1) / MAX_THREADS + 1;
+	numVertOutBlocks = (vertOutCount - 1) / MAX_THREADS + 1;
+	numPrimBlocks = (primCount - 1) / MAX_THREADS + 1;
+	numFragBlocks = (fragCount - 1) / MAX_THREADS + 1;
+
+	printf("fragment count: %d\n", fragCount);
+	printf("vertex count: %d\n", vertCount);
+	printf("primitive count: %d\n", primCount);
+
+	//int numBlocks = (width*height - 1) / MAX_THREADS + 1;
+	//initDepths<<<numBlocks, MAX_THREADS>>>(width*height, dev_depthbuffer);
+
     cudaFree(dev_bufIdx);
     cudaMalloc(&dev_bufIdx, bufIdxSize * sizeof(int));
     cudaMemcpy(dev_bufIdx, bufIdx, bufIdxSize * sizeof(int), cudaMemcpyHostToDevice);
@@ -114,30 +179,246 @@ void rasterizeSetBuffers(
     cudaMalloc(&dev_bufVertex, vertCount * sizeof(VertexIn));
     cudaMemcpy(dev_bufVertex, bufVertex, vertCount * sizeof(VertexIn), cudaMemcpyHostToDevice);
 
-    cudaFree(dev_primitives);
-    cudaMalloc(&dev_primitives, vertCount / 3 * sizeof(Triangle));
-    cudaMemset(dev_primitives, 0, vertCount / 3 * sizeof(Triangle));
+	cudaFree(dev_bufVertexOut);
+	cudaMalloc(&dev_bufVertexOut, vertOutCount * sizeof(VertexOut));
+
+	cudaFree(dev_bufIdxOut);
+	cudaMalloc((void**)&dev_bufIdxOut, vertOutCount * sizeof(int));
+
+	cudaFree(dev_primitives);
+	cudaMalloc(&dev_primitives, primCount * sizeof(Triangle));
+	cudaMemset(dev_primitives, 0, primCount * sizeof(Triangle));
+
+	cudaFree(dev_depthbuffer);
+	cudaMalloc(&dev_depthbuffer, fragCount * sizeof(Fragment));
+	cudaMemset(dev_depthbuffer, 0, fragCount * sizeof(Fragment));
+
+	cudaFree(dev_framebuffer);
+	cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3));
 
     checkCUDAError("rasterizeSetBuffers");
 }
 
+__global__ void kernShadeVerticesInstances(int n, int num_instances, VertexOut* vs_output, int* vs_output_idx, VertexIn* vs_input, int* vs_input_idx, glm::mat4* Mpvms, glm::mat3* Mms){
+	// n is the number of in vertices
+	// TODO: Can parallelize this if we do thread per output index instead of thread per input index
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		glm::mat4 Mpvm;
+		glm::vec4 new_pos;
+		for (int i = 0; i < num_instances; i++){
+			// Model-view-perspective transform for positions
+			Mpvm = Mpvms[i];
+
+			new_pos = Mpvm * glm::vec4(vs_input[index].pos, 1.0f);
+			vs_output[index + i*n].ndc_pos = glm::vec3(new_pos / new_pos.w);
+			vs_output[index + i*n].nor = glm::normalize(vs_input[index].nor * Mms[i]);
+			vs_output[index + i*n].col = vs_input[index].col;
+			vs_output_idx[index + i*n] = vs_input_idx[index] + i*n;
+		}
+	}
+}
+
+__global__ void kernShadeVertices(int n, VertexOut* vs_output, VertexIn* vs_input, glm::mat4 Mpvm, glm::mat3 Mm){
+	// Mm is the 3x3 rotation matrix computed with intervse transpose of the Mmodel matrix, for use to rotate normal vectors
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		vs_output[index].pos = vs_input[index].pos;
+		glm::vec4 new_pos = Mpvm * glm::vec4(vs_input[index].pos, 1.0f);
+		vs_output[index].ndc_pos = glm::vec3(new_pos / new_pos.w);
+		vs_output[index].nor = vs_input[index].nor * Mm;
+		vs_output[index].col = vs_input[index].col;
+	}
+}
+
+__global__ void kernShadeGeometries(int n, VertexOut* out_vertices, int* idx, VertexOut* in_vertices){
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		VertexOut vi = in_vertices[index];
+		idx[index * 3] = 3*index;
+		idx[index * 3 + 1] = 3*index + 1;
+		idx[index * 3 + 2] = 3*index + 2;
+		out_vertices[index * 3].ndc_pos = vi.ndc_pos;
+		out_vertices[index * 3].col = vi.col;
+		out_vertices[index * 3].pos = vi.pos;
+		out_vertices[index * 3].nor = vi.nor;
+		out_vertices[index * 3 + 1].ndc_pos = vi.ndc_pos + glm::vec3(0.01,0.0,0.0);
+		out_vertices[index * 3 + 1].col = vi.col;
+		out_vertices[index * 3 + 1].pos = vi.pos;
+		out_vertices[index * 3 + 1].nor = vi.nor;
+		out_vertices[index * 3 + 2].ndc_pos = vi.ndc_pos + glm::vec3(0.0, 0.01, 0.0);
+		out_vertices[index * 3 + 2].col = vi.col;
+		out_vertices[index * 3 + 2].pos = vi.pos;
+		out_vertices[index * 3 + 2].nor = vi.nor;
+	}
+}
+
+__global__ void kernAssemblePrimitives(int n, Triangle* primitives, VertexOut* vs_output, int* idx){
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		int idx0 = idx[3 * index + 0];
+		int idx1 = idx[3 * index + 1];
+		int idx2 = idx[3 * index + 2];
+		primitives[index].v[0] = vs_output[idx0];
+		primitives[index].v[1] = vs_output[idx1];
+		primitives[index].v[2] = vs_output[idx2];
+		primitives[index].ndc_pos[0] = vs_output[idx0].ndc_pos;
+		primitives[index].ndc_pos[1] = vs_output[idx1].ndc_pos;
+		primitives[index].ndc_pos[2] = vs_output[idx2].ndc_pos;
+		primitives[index].v[0].col = glm::vec3(1.0, 0.0, 0.0);
+		primitives[index].v[1].col = glm::vec3(1.0, 0.0, 0.0);
+		primitives[index].v[2].col = glm::vec3(1.0, 0.0, 0.0);
+	}
+}
+
+// Each thread is responsible for rasterizing a single triangle
+__global__ void kernRasterize(int n, Cam cam, Fragment* fs_input, Triangle* primitives){
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		
+		Triangle prim = primitives[index];
+
+		AABB aabb = getAABBForTriangle(primitives[index].ndc_pos);
+		glm::vec3 bary;
+		glm::vec2 point;
+		glm::vec3 points;
+
+		// Snap i,j to nearest fragment coordinate
+		int frag_width = cam.width * AA;
+		int frag_height = cam.height * AA;
+		float dx = 2.0f / (float)frag_width;
+		float dy = 2.0f / (float)frag_height;
+
+		float x;
+		float y;
+
+		int mini = max((int)(aabb.min.x / dx) + frag_width / 2 - 2, 0);
+		int minj = max((int)(aabb.min.y / dy) + frag_height / 2 - 2, 0);
+		int maxi = min((int)(aabb.max.x / dx) + frag_width / 2 + 2, frag_width-1);
+		int maxj = min((int)(aabb.max.y / dy) + frag_height / 2 + 2, frag_height-1);
+
+		float depth;
+		int fixed_depth;
+		int ind;
+
+		// Iterate through fragment coordinates
+		for (int j = minj; j < maxj; j++){
+			for (int i = mini; i < maxi; i++){
+
+				ind = i + j * frag_width;
+				
+				// Get the NDC coordinate
+				x = dx*i - dx*frag_width/2.0f + dx/2.0f;
+				y = dy*j - dy*frag_height/2.0f + dx/2.0f;
+
+				point[0] = x;
+				point[1] = y;
+
+				bary = calculateBarycentricCoordinate(primitives[index].ndc_pos, point);
+
+				if (isBarycentricCoordInBounds(bary)){
+					depth = -getZAtCoordinate(bary, prim.ndc_pos);
+					fixed_depth = (int)(depth * INT_MAX);
+
+					int old = atomicMin(&fs_input[ind].fixed_depth, fixed_depth);
+
+					if (fs_input[ind].fixed_depth == fixed_depth){
+						fs_input[ind].depth = depth;
+						fs_input[ind].color = bary.x * prim.v[0].col + bary.y * prim.v[1].col + bary.z * prim.v[2].col; //glm::vec3(1.0, 0.0, 0.0);// prim.v[0].col;
+						fs_input[ind].norm = bary.x * prim.v[0].nor + bary.y * prim.v[1].nor + bary.z * prim.v[2].nor;
+						fs_input[ind].pos = bary.x * prim.v[0].pos + bary.y * prim.v[1].pos + bary.z * prim.v[2].pos;
+						fs_input[ind].ndc_pos = bary.x * prim.v[0].ndc_pos + bary.y * prim.v[1].ndc_pos + bary.z * prim.v[2].ndc_pos;
+						//fs_input[ind].color = fs_input[ind].norm;
+					}
+				}
+				
+			}
+		}
+	}
+}
+
+__global__ void kernShadeFragments(int n, Fragment* fs_input, Light light){
+	int index = (blockIdx.x*blockDim.x) + threadIdx.x;
+
+	if (index < n){
+		if (fs_input[index].color != glm::vec3(0.0)){
+			glm::vec3 light_ray = glm::normalize(fs_input[index].pos - light.pos);
+			fs_input[index].color = fs_input[index].color * abs((glm::dot(glm::normalize(fs_input[index].norm), light_ray)));
+		}
+	}
+}
+
+void resetRasterize(){
+	cudaMemset(dev_depthbuffer, 0, fragCount * sizeof(Fragment));
+	initDepths<<<numFragBlocks, MAX_THREADS>>>(fragCount, dev_depthbuffer);
+
+	cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
+	checkCUDAError("resetBuffers");
+}
+
 /**
  * Perform rasterization.
  */
-void rasterize(uchar4 *pbo) {
+void rasterize(uchar4 *pbo, Cam cam) {
     int sideLength2d = 8;
     dim3 blockSize2d(sideLength2d, sideLength2d);
     dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
                       (height - 1) / blockSize2d.y + 1);
 
-    // TODO: Execute your rasterization pipeline here
-    // (See README for rasterization pipeline outline.)
+	resetRasterize();
+
+	//Mmod = glm::mat4(1.0f);
+	Mview = glm::lookAt(cam.pos, cam.focus, cam.up);
+	Mproj = glm::perspective(cam.fovy, cam.aspect, cam.zNear, cam.zFar);
+
+
+
+	for (int i = 0; i < NUM_INSTANCES; i++){
+		Mmod = glm::mat4(1.0);
+		Mmod = glm::translate(Mmod, glm::vec3(i*0.5f,0.0f,i*-1.0f));
+		Mmod = glm::rotate(Mmod, i*3.14f/8.0f, glm::vec3(0.0,1.0,0.0));
+		Mms[i] = glm::inverseTranspose(glm::mat3(Mmod));
+		Mpvms[i] = Mproj * Mview * Mmod;
+	}
+
+
+	cudaMalloc((void**)&dev_Mpvms, NUM_INSTANCES*sizeof(glm::mat4));
+	cudaMemcpy(dev_Mpvms, Mpvms, NUM_INSTANCES*sizeof(glm::mat4), cudaMemcpyHostToDevice);
+	cudaMalloc((void**)&dev_Mms, NUM_INSTANCES*sizeof(glm::mat3));
+	cudaMemcpy(dev_Mms, Mms, NUM_INSTANCES*sizeof(glm::mat3), cudaMemcpyHostToDevice);
+
+	// Vertex Shading
+	kernShadeVerticesInstances<<<numVertBlocks, MAX_THREADS>>>(vertCount, NUM_INSTANCES, dev_bufVertexOut, dev_bufIdxOut, dev_bufVertex, dev_bufIdx, dev_Mpvms, dev_Mms);
+	//kernShadeVertices<<<numVertBlocks, MAX_THREADS>>>(vertCount, dev_bufVertexOut, dev_bufVertex, Mpvm);
+	checkCUDAError("shadeVertices");
+	cudaFree(dev_Mpvms);
+	cudaFree(dev_Mms);
+
+	// Primitive Assembly
+	kernAssemblePrimitives<<<numPrimBlocks, MAX_THREADS>>>(primCount, dev_primitives, dev_bufVertexOut, dev_bufIdxOut);
+	checkCUDAError("assemblePrimitives");
+
+	// Rasterization
+	kernRasterize<<<numPrimBlocks, MAX_THREADS>>>(primCount, cam, dev_depthbuffer, dev_primitives);
+	checkCUDAError("rasterizePrimitives");
+
+	// Fragment shading
+	kernShadeFragments<<<numFragBlocks, MAX_THREADS>>>(fragCount, dev_depthbuffer, light);
+	checkCUDAError("shadeFragments");
 
     // Copy depthbuffer colors into framebuffer
     render<<<blockCount2d, blockSize2d>>>(width, height, dev_depthbuffer, dev_framebuffer);
+
     // Copy framebuffer into OpenGL buffer for OpenGL previewing
     sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
     checkCUDAError("rasterize");
+
+	iter += 1;
 }
 
 /**
@@ -147,9 +428,15 @@ void rasterizeFree() {
     cudaFree(dev_bufIdx);
     dev_bufIdx = NULL;
 
+	cudaFree(dev_bufIdxOut);
+	dev_bufIdxOut = NULL;
+
     cudaFree(dev_bufVertex);
     dev_bufVertex = NULL;
 
+	cudaFree(dev_bufVertexOut);
+	dev_bufVertexOut = NULL;
+
     cudaFree(dev_primitives);
     dev_primitives = NULL;
 
diff --git a/src/rasterize.h b/src/rasterize.h
index a06b339..b3df9e9 100644
--- a/src/rasterize.h
+++ b/src/rasterize.h
@@ -9,10 +9,54 @@
 #pragma once
 
 #include <glm/glm.hpp>
+#include <vector_types.h>
+
+struct Light {
+	glm::vec3 pos;
+	glm::vec3 color;
+};
+
+struct Cam {
+	glm::vec3 pos;
+	glm::vec3 focus;
+	glm::vec3 up;
+	int height;
+	int width;
+	float aspect;
+	float fovy;
+	float zNear;
+	float zFar;
+};
+
+struct VertexIn {
+	glm::vec3 pos;
+	glm::vec3 nor;
+	glm::vec3 col;
+	// TODO (optional) add other vertex attributes (e.g. texture coordinates)
+};
+struct VertexOut {
+	glm::vec3 pos;
+	glm::vec3 ndc_pos;
+	glm::vec3 nor;
+	glm::vec3 col;
+};
+struct Triangle {
+	VertexOut v[3];
+	glm::vec3 ndc_pos[3];
+};
+struct Fragment {
+	glm::vec3 color;
+	glm::vec3 norm;
+	glm::vec3 pos;
+	glm::vec3 ndc_pos;
+	float depth;
+	int fixed_depth;
+	VertexOut v;
+};
 
 void rasterizeInit(int width, int height);
 void rasterizeSetBuffers(
         int bufIdxSize, int *bufIdx,
         int vertCount, float *bufPos, float *bufNor, float *bufCol);
-void rasterize(uchar4 *pbo);
+void rasterize(uchar4 *pbo, Cam cam);
 void rasterizeFree();
diff --git a/util/checkCUDAError.h b/util/checkCUDAError.h
index 6d61f9c..2deb068 100644
--- a/util/checkCUDAError.h
+++ b/util/checkCUDAError.h
@@ -2,10 +2,11 @@
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) _checkCUDAErrorHelper(msg, FILENAME, __LINE__)
+#define DEBUG
 
 static
 void _checkCUDAErrorHelper(const char *msg, const char *filename, int line) {
-#if !defined(NDEBUG)
+#ifdef DEBUG
     cudaDeviceSynchronize();
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess == err) {