diff --git a/1.avi b/1.avi
new file mode 100644
index 0000000..f0765b6
Binary files /dev/null and b/1.avi differ
diff --git a/1.png b/1.png
new file mode 100644
index 0000000..befd030
Binary files /dev/null and b/1.png differ
diff --git a/2.avi b/2.avi
new file mode 100644
index 0000000..3f1ee13
Binary files /dev/null and b/2.avi differ
diff --git a/2.png b/2.png
new file mode 100644
index 0000000..7bb7cb3
Binary files /dev/null and b/2.png differ
diff --git a/3.avi b/3.avi
new file mode 100644
index 0000000..2656bf1
Binary files /dev/null and b/3.avi differ
diff --git a/4.avi b/4.avi
new file mode 100644
index 0000000..1e13731
Binary files /dev/null and b/4.avi differ
diff --git a/5.avi b/5.avi
new file mode 100644
index 0000000..8ba7576
Binary files /dev/null and b/5.avi differ
diff --git a/README.md b/README.md
index 22d2f34..48ee52e 100644
--- a/README.md
+++ b/README.md
@@ -3,346 +3,29 @@ CUDA Rasterizer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Xinyue Zhu
+* Tested on: Windows 10, i7- @ 2.22GHz 22GB, GTX 960M
 
-### (TODO: Your README)
-
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
-
-
-Instructions (delete me)
 ========================
-
-This is due Sunday, October 11, evening at midnight.
-
-**Summary:** 
-In this project, you will use CUDA to implement a simplified
-rasterized graphics pipeline, similar to the OpenGL pipeline. You will
-implement vertex shading, primitive assembly, rasterization, fragment shading,
-and a framebuffer. More information about the rasterized graphics pipeline can
-be found in the class slides and in the CIS 560 lecture notes.
-
-The base code provided includes an OBJ loader and much of the I/O and
-bookkeeping code. It also includes some functions that you may find useful,
-described below. The core rasterization pipeline is left for you to implement.
-
-You are not required to use this base code if you don't want
-to. You may also change any part of the base code as you please.
-**This is YOUR project.**
-
-**Recommendation:**
-Every image you save should automatically get a different
-filename. Don't delete all of them! For the benefit of your README, keep a
-bunch of them around so you can pick a few to document your progress.
-
-
-### Contents
-
-* `src/` C++/CUDA source files.
-* `util/` C++ utility files.
-* `objs/` Example OBJ test files (# verts, # tris in buffers after loading)
-  * `tri.obj` (3v, 1t): The simplest possible geometric object.
-  * `cube.obj` (36v, 12t): A small model with low depth-complexity.
-  * `suzanne.obj` (2904 verts, 968 tris): A medium model with low depth-complexity.
-  * `suzanne_smooth.obj` (2904 verts, 968 tris): A medium model with low depth-complexity.
-    This model has normals which must be interpolated.
-  * `cow.obj` (17412 verts, 5804 tris): A large model with low depth-complexity.
-  * `cow_smooth.obj` (17412 verts, 5804 tris): A large model with low depth-complexity.
-    This model has normals which must be interpolated.
-  * `flower.obj` (1920 verts, 640 tris): A medium model with very high depth-complexity.
-  * `sponza.obj` (837,489 verts, 279,163 tris): A huge model with very high depth-complexity.
-* `renders/` Debug render of an example OBJ.
-* `external/` Includes and static libraries for 3rd party libraries.
-
-### Running the code
-
-The main function requires a scene description file. Call the program with
-one as an argument: `cis565_rasterizer objs/cow.obj`.
-(In Visual Studio, `../objs/cow.obj`.)
-
-If you are using Visual Studio, you can set this in the Debugging > Command
-Arguments section in the Project properties. Note that this value is different
-for every different configuration type. Make sure you get the path right; read
-the console for errors.
-
-## Requirements
-
-**Ask on the mailing list for any clarifications.**
-
-In this project, you are given the following code:
-
-* A library for loading standard Alias/Wavefront `.obj` format mesh
-  files and converting them to OpenGL-style buffers of index and vertex data.
-  * This library does NOT read materials, and provides all colors as white by
-    default. You can use another library if you wish.
-* Simple structs for some parts of the pipeline.
-* Depth buffer to framebuffer copy.
-* CUDA-GL interop.
-
-You will need to implement the following features/pipeline stages:
-
-* Vertex shading.
-* (Vertex shader) perspective transformation.
-* Primitive assembly with support for triangles read from buffers of index and
-  vertex data.
-* Rasterization.
-* Fragment shading.
-* A depth buffer for storing and depth testing fragments.
-* Fragment to depth buffer writing (**with** atomics for race avoidance).
-* (Fragment shader) simple lighting scheme, such as Lambert or Blinn-Phong.
-
-See below for more guidance.
-
-You are also required to implement at least "3.0" points in extra features.
-(the parenthesized numbers must add to 3.0 or more):
-
-* (1.0) Tile-based pipeline.
-* Additional pipeline stages.
-   * (1.0) Tessellation shader.
-   * (1.0) Geometry shader, able to output a variable number of primitives per
-     input primitive, optimized using stream compaction (thrust allowed).
-   * (0.5 **if not doing geometry shader**) Backface culling, optimized using
-     stream compaction (thrust allowed).
-   * (1.0) Transform feedback.
-   * (0.5) Scissor test.
-   * (0.5) Blending (when writing into framebuffer).
-* (1.0) Instancing: draw one set of vertex data multiple times, each run
-  through the vertex shader with a different ID.
-* (0.5) Correct color interpolation between points on a primitive.
-* (1.0) UV texture mapping with bilinear texture filtering and perspective
-  correct texture coordinates.
-* Support for rasterizing additional primitives:
-   * (0.5) Lines or line strips.
-   * (0.5) Points.
-* (1.0) Anti-aliasing.
-* (1.0) Occlusion queries.
-* (1.0) Order-independent translucency using a k-buffer.
-* (0.5) **Mouse**-based interactive camera support.
-
-This extra feature list is not comprehensive. If you have a particular idea
-you would like to implement, please **contact us first**.
-
-**IMPORTANT:**
-For each extra feature, please provide the following brief analysis:
-
-* Concise overview write-up of the feature.
-* Performance impact of adding the feature (slower or faster).
-* If you did something to accelerate the feature, what did you do and why?
-* How might this feature be optimized beyond your current implementation?
-
-
-## Base Code Tour
-
-You will be working primarily in two files: `rasterize.cu`, and
-`rasterizeTools.h`. Within these files, areas that you need to complete are
-marked with a `TODO` comment. Areas that are useful to and serve as hints for
-optional features are marked with `TODO (Optional)`. Functions that are useful
-for reference are marked with the comment `CHECKITOUT`. **You should look at
-all TODOs and CHECKITOUTs before starting!** There are not many.
-
-* `src/rasterize.cu` contains the core rasterization pipeline. 
-  * A few pre-made structs are included for you to use, but those marked with
-    TODO will also be needed for a simple rasterizer. As with any part of the
-    base code, you may modify or replace these as you see fit.
-
-* `src/rasterizeTools.h` contains various useful tools
-  * Includes a number of barycentric coordinate related functions that you may
-    find useful in implementing scanline based rasterization.
-
-* `util/utilityCore.hpp` serves as a kitchen-sink of useful functions.
-
-
-## Rasterization Pipeline
-
-Possible pipelines are described below. Pseudo-type-signatures are given.
-Not all of the pseudocode arrays will necessarily actually exist in practice.
-
-### First-Try Pipeline
-
-This describes a minimal version of *one possible* graphics pipeline, similar
-to modern hardware (DX/OpenGL). Yours need not match precisely.  To begin, try
-to write a minimal amount of code as described here. Verify some output after
-implementing each pipeline step. This will reduce the necessary time spent
-debugging.
-
-Start out by testing a single triangle (`tri.obj`).
-
-* Clear the depth buffer with some default value.
-* Vertex shading: 
-  * `VertexIn[n] vs_input -> VertexOut[n] vs_output`
-  * A minimal vertex shader will apply no transformations at all - it draws
-    directly in normalized device coordinates (-1 to 1 in each dimension).
-* Primitive assembly.
-  * `VertexOut[n] vs_output -> Triangle[n/3] primitives`
-  * Start by supporting ONLY triangles. For a triangle defined by indices
-    `(a, b, c)` into `VertexOut` array `vo`, simply copy the appropriate values
-    into a `Triangle` object `(vo[a], vo[b], vo[c])`.
-* Rasterization.
-  * `Triangle[n/3] primitives -> FragmentIn[m] fs_input`
-  * A scanline implementation is simpler to start with.
-* Fragment shading.
-  * `FragmentIn[m] fs_input -> FragmentOut[m] fs_output`
-  * A super-simple test fragment shader: output same color for every fragment.
-    * Also try displaying various debug views (normals, etc.)
-* Fragments to depth buffer.
-  * `FragmentOut[m] -> FragmentOut[width][height]`
-  * Results in race conditions - don't bother to fix these until it works!
-  * Can really be done inside the fragment shader, if you call the fragment
-    shader from the rasterization kernel for every fragment (including those
-    which get occluded). **OR,** this can be done before fragment shading, which
-    may be faster but means the fragment shader cannot change the depth.
-* A depth buffer for storing and depth testing fragments.
-  * `FragmentOut[width][height] depthbuffer`
-  * An array of `fragment` objects.
-  * At the end of a frame, it should contain the fragments drawn to the screen.
-* Fragment to framebuffer writing.
-  * `FragmentOut[width][height] depthbuffer -> vec3[width][height] framebuffer`
-  * Simply copies the colors out of the depth buffer into the framebuffer
-    (to be displayed on the screen).
-
-### A Useful Pipeline
-
-* Clear the depth buffer with some default value.
-* Vertex shading: 
-  * `VertexIn[n] vs_input -> VertexOut[n] vs_output`
-  * Apply some vertex transformation (e.g. model-view-projection matrix using
-    `glm::lookAt ` and `glm::perspective `).
-* Primitive assembly.
-  * `VertexOut[n] vs_output -> Triangle[n/3] primitives`
-  * As above.
-  * Other primitive types are optional.
-* Rasterization.
-  * `Triangle[n/3] primitives -> FragmentIn[m] fs_input`
-  * You may choose to do a tiled rasterization method, which should have lower
-    global memory bandwidth.
-  * A scanline optimization: when rasterizing a triangle, only scan over the
-    box around the triangle (`getAABBForTriangle`).
-* Fragment shading.
-  * `FragmentIn[m] fs_input -> FragmentOut[m] fs_output`
-  * Add a shading method, such as Lambert or Blinn-Phong. Lights can be defined
-    by kernel parameters (like GLSL uniforms).
-* Fragments to depth buffer.
-  * `FragmentOut[m] -> FragmentOut[width][height]`
-  * Can really be done inside the fragment shader, if you call the fragment
-    shader from the rasterization kernel for every fragment (including those
-    which get occluded). **OR,** this can be done before fragment shading, which
-    may be faster but means the fragment shader cannot change the depth.
-    * This result in an optimization: it allows you to do depth tests before
-     spending execution time in complex fragment shader code!
-  * Handle race conditions! Since multiple primitives write fragments to the
-    same fragment in the depth buffer, races must be avoided by using CUDA
-    atomics.
-    * *Approach 1:* Lock the location in the depth buffer during the time that
-      a thread is comparing old and new fragment depths (and possibly writing
-      a new fragment). This should work in all cases, but be slower.
-    * *Approach 2:* Convert your depth value to a fixed-point `int`, and use
-      `atomicMin` to store it into an `int`-typed depth buffer `intdepth`. After
-      that, the value which is stored at `intdepth[i]` is (usually) that of the
-      fragment which should be stored into the `fragment` depth buffer.
-      * This may result in some rare race conditions (e.g. across blocks).
-    * The `flower.obj` test file is good for testing race conditions.
-* A depth buffer for storing and depth testing fragments.
-  * `FragmentOut[width][height] depthbuffer`
-  * An array of `fragment` objects.
-  * At the end of a frame, it should contain the fragments drawn to the screen.
-* Fragment to framebuffer writing.
-  * `FragmentOut[width][height] depthbuffer -> vec3[width][height] framebuffer`
-  * Simply copies the colors out of the depth buffer into the framebuffer
-    (to be displayed on the screen).
-
-This is a suggested sequence of pipeline steps, but you may choose to alter the
-order of this sequence or merge entire kernels as you see fit.  For example, if
-you decide that doing has benefits, you can choose to merge the vertex shader
-and primitive assembly kernels, or merge the perspective transform into another
-kernel. There is not necessarily a right sequence of kernels, and you may
-choose any sequence that works.  Please document in your README what sequence
-you choose and why.
-
-
-## Resources
-
-The following resources may be useful for this project:
-
-* High-Performance Software Rasterization on GPUs:
-  * [Paper (HPG 2011)](http://www.tml.tkk.fi/~samuli/publications/laine2011hpg_paper.pdf)
-  * [Code](http://code.google.com/p/cudaraster/)
-  * Note that looking over this code for reference with regard to the paper is
-    fine, but we most likely will not grant any requests to actually
-    incorporate any of this code into your project.
-  * [Slides](http://bps11.idav.ucdavis.edu/talks/08-gpuSoftwareRasterLaineAndPantaleoni-BPS2011.pdf)
-* The Direct3D 10 System (SIGGRAPH 2006) - for those interested in doing
-  geometry shaders and transform feedback:
-  * [Paper](http://dl.acm.org/citation.cfm?id=1141947)
-  * [Paper, through Penn Libraries proxy](http://proxy.library.upenn.edu:2247/citation.cfm?id=1141947)
-* Multi-Fragment Eﬀects on the GPU using the k-Buﬀer - for those who want to do
-  order-independent transparency using a k-buffer:
-  * [Paper](http://www.inf.ufrgs.br/~comba/papers/2007/kbuffer_preprint.pdf)
-* FreePipe: A Programmable, Parallel Rendering Architecture for Efficient
-  Multi-Fragment Effects (I3D 2010):
-  * [Paper](https://sites.google.com/site/hmcen0921/cudarasterizer)
-* Writing A Software Rasterizer In Javascript:
-  * [Part 1](http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-1.html)
-  * [Part 2](http://simonstechblog.blogspot.com/2012/04/software-rasterizer-part-2.html)
-
-
-## Third-Party Code Policy
-
-* Use of any third-party code must be approved by asking on our Google Group.
-* If it is approved, all students are welcome to use it. Generally, we approve
-  use of third-party code that is not a core part of the project. For example,
-  for the path tracer, we would approve using a third-party library for loading
-  models, but would not approve copying and pasting a CUDA function for doing
-  refraction.
-* Third-party code **MUST** be credited in README.md.
-* Using third-party code without its approval, including using another
-  student's code, is an academic integrity violation, and will, at minimum,
-  result in you receiving an F for the semester.
-
-
-## README
-
-Replace the contents of this README.md in a clear manner with the following:
-
-* A brief description of the project and the specific features you implemented.
-* At least one screenshot of your project running.
-* A 30 second or longer video of your project running.
-* A performance analysis (described below).
+## Description:
+ <p>This is a rasterizer.</p>
+It includes the following pipeline:<br/>
+1)vertex transformation with camera movement</br>
+2)Tessellation shader</br>
+3)Bling-phong shader and blending</br>
+4)rasterization brute force scan line->boundingbox scan line</br>
+5)depth test</br>
+6)anti-aliansing:Super sampling - random  </br>
+the base color is normal map</br>
+<img src="1.png"  width="380" height="400">  <img src="2.png"  width="400" height="380"></br>
+
+https://www.youtube.com/watch?v=IY66btfDzQ0&feature=youtu.be
 
 ### Performance Analysis
+<img src="p1.png"  width="600" height="150">
+<p>The CPU time is spent almost entirely in OpenGL-related functions; most of the computation happens on the GPU.</p>
+<p>A breakdown of the time spent in each pipeline stage:</p>
+<img src="pipe_line.png"  width="600" height="280">
 
-The performance analysis is where you will investigate how to make your CUDA
-programs more efficient using the skills you've learned in class. You must have
-performed at least one experiment on your code to investigate the positive or
-negative effects on performance. 
-
-We encourage you to get creative with your tweaks. Consider places in your code
-that could be considered bottlenecks and try to improve them. 
-
-Provide summary of your optimizations (no more than one page), along with
-tables and or graphs to visually explain any performance differences.
-
-* Include a breakdown of time spent in each pipeline stage for a few different
-  models. It is suggested that you use pie charts or 100% stacked bar charts.
-* For optimization steps (like backface culling), include a performance
-  comparison to show the effectiveness.
-
-
-## Submit
 
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
 
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-   * **ADDITIONALLY:**
-     In the body of the pull request, include a link to your repository.
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project N: PENNKEY`.
-   * Direct link to your pull request on GitHub.
-   * Estimate the amount of time you spent on the project.
-   * If there were any outstanding problems, or if you did any extra
-     work, *briefly* explain.
-   * Feedback on the project itself, if any.
diff --git a/p1.png b/p1.png
new file mode 100644
index 0000000..f4390ae
Binary files /dev/null and b/p1.png differ
diff --git a/pipe_line.png b/pipe_line.png
new file mode 100644
index 0000000..31674ec
Binary files /dev/null and b/pipe_line.png differ
diff --git a/src/main.cpp b/src/main.cpp
index a125d7c..78810f1 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -7,82 +7,158 @@
  */
 
 #include "main.hpp"
+#include "GLFW/glfw3.h" // forward slash: a backslash in the header name only resolves on Windows toolchains
+float up_rot=0;      // rotation accumulated from vertical mouse motion (see mouse_pos_callback)
+float right_rot =0 ; // rotation accumulated from horizontal mouse motion (see mouse_pos_callback)
+glm::vec2 all_amt = glm::vec2(0,0);//translate along x,y
+float posx1;         // not referenced in the visible part of this file
+float posy1;         // not referenced in the visible part of this file
+float posx;          // reference cursor x subtracted in mouse_pos_callback
+float posy;          // reference cursor y subtracted in mouse_pos_callback
+
+bool click = false;      // raised by mouse_button_callback, cleared every frame in runCuda
+float previous_t;        // glfwGetTime() value recorded at the previous cursor-move event
+bool rotation = false;   // true while the right mouse button is held
+bool translate = false;  // true while the left mouse button is held
 
 //-------------------------------
 //-------------MAIN--------------
 //-------------------------------
-
 int main(int argc, char **argv) {
-    if (argc != 2) {
-        cout << "Usage: [obj file]" << endl;
-        return 0;
-    }
+	if (argc != 2) { // exactly one argument, the OBJ file path, is required
+		cout << "Usage: [obj file]" << endl;
+
+		return 0;
+	}
 
-    obj *mesh = new obj();
+	obj *mesh = new obj(); // mesh container filled by the loader below
 
-    {
-        objLoader loader(argv[1], mesh);
-        mesh->buildBufPoss();
-    }
+	{ // scoped so the loader is destroyed as soon as the mesh buffers are built
+		objLoader loader(argv[1], mesh);
+		mesh->buildBufPoss();
+	}
 
-    frame = 0;
-    seconds = time(NULL);
-    fpstracker = 0;
+	frame = 0;
+	seconds = time(NULL); // start of the current FPS measurement window
+	fpstracker = 0;
 
-    // Launch CUDA/GL
-    if (init(mesh)) {
-        // GLFW main loop
-        mainLoop();
-    }
+	// Launch CUDA/GL
+	if (init(mesh)) {
 
-    return 0;
+		// GLFW main loop
+		mainLoop();
+	}
+	getchar(); // waits for a character on stdin before returning; reached after mainLoop() ends or when init() fails
+	return 0;
 }
+//mouse function
 
 void mainLoop() {
-    while (!glfwWindowShouldClose(window)) {
-        glfwPollEvents();
-        runCuda();
-
-        time_t seconds2 = time (NULL);
-
-        if (seconds2 - seconds >= 1) {
-
-            fps = fpstracker / (seconds2 - seconds);
-            fpstracker = 0;
-            seconds = seconds2;
-        }
-
-        string title = "CIS565 Rasterizer | " + utilityCore::convertIntToString((int)fps) + " FPS";
-        glfwSetWindowTitle(window, title.c_str());
-
-        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
-        glBindTexture(GL_TEXTURE_2D, displayImage);
-        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-        glClear(GL_COLOR_BUFFER_BIT);
-
-        // VAO, shader program, and texture already bound
-        glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
-        glfwSwapBuffers(window);
-    }
-    glfwDestroyWindow(window);
-    glfwTerminate();
+	while (!glfwWindowShouldClose(window)) {
+		glfwPollEvents();
+		
+		runCuda();
+
+		time_t seconds2 = time(NULL);
+
+		if (seconds2 - seconds >= 1) {
+
+			fps = fpstracker / (seconds2 - seconds);
+			fpstracker = 0;
+			seconds = seconds2;
+		}
+
+		string title = "CIS565 Rasterizer | " + utilityCore::convertIntToString((int)fps) + " FPS";
+		glfwSetWindowTitle(window, title.c_str());
+
+		glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
+		glBindTexture(GL_TEXTURE_2D, displayImage);
+		glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+		glClear(GL_COLOR_BUFFER_BIT);
+
+		// VAO, shader program, and texture already bound
+		glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, 0);
+		glfwSwapBuffers(window);
+	}
+	glfwDestroyWindow(window);
+	glfwTerminate();
 }
 
 //-------------------------------
 //---------RUNTIME STUFF---------
 //-------------------------------
-
+// Cursor-move handler. The camera movement follows the CIS-560 base code (MyGL::keyPressEvent(QKeyEvent *e){..}).
+void mouse_pos_callback(GLFWwindow *window, double x_current, double y_current)
+{
+	float amount = 0.03; // sensitivity factor applied to every update
+	float deltaTime = glfwGetTime() -previous_t; // time elapsed since the previous cursor-move event
+
+	if (rotation)//right button held: accumulate into the rotation amounts
+	{
+		right_rot += amount * deltaTime * float(x_current - posx); // NOTE(review): posx/posy are never assigned in the visible code,
+		up_rot += amount * deltaTime * float(y_current - posy);    // so these offsets are measured from their initial value - confirm intent
+	}
+	else if (translate)//left button held: overwrite (not accumulate) the translation amount
+	{
+		all_amt.x = amount * deltaTime * float(x_current - posx);
+		all_amt.y = amount * deltaTime * float(y_current - posy);
+ 	}
+	previous_t = glfwGetTime(); // remember this event's time for the next deltaTime
+}
+// Mouse-button handler: selects the drag mode that mouse_pos_callback acts on.
+//   left press  -> translate, right press -> rotation,
+//   release of either button -> neither mode active.
+void mouse_button_callback(GLFWwindow* window, int button, int action, int mods)
+{
+	// Record that a button event happened; runCuda() clears this every frame.
+	click = true;
+
+	const bool left = (button == GLFW_MOUSE_BUTTON_LEFT);
+	const bool right = (button == GLFW_MOUSE_BUTTON_RIGHT);
+	if (!left && !right)
+	{
+		return; // other buttons do not affect the drag mode
+	}
+
+	if (action == GLFW_PRESS)
+	{
+		// The two modes are mutually exclusive: the button just pressed wins.
+		translate = left;
+		rotation = right;
+	}
+	else if (action == GLFW_RELEASE)
+	{
+		// Releasing either button ends whichever drag was in progress.
+		translate = false;
+		rotation = false;
+	}
+	// Any other action value leaves the current mode untouched.
+}
+void mouse_scroll_callback(GLFWwindow* window,double front,double back)
+{
+	// Intentionally empty placeholder; its registration in init() is commented out.
+}
 void runCuda() {
-    // Map OpenGL buffer object for writing from CUDA on a single GPU
-    // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
-    dptr = NULL;
-
-    cudaGLMapBufferObject((void **)&dptr, pbo);
-    rasterize(dptr);
-    cudaGLUnmapBufferObject(pbo);
-
-    frame++;
-    fpstracker++;
+	// Map OpenGL buffer object for writing from CUDA on a single GPU
+	// No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
+	dptr = NULL;
+	cudaGLMapBufferObject((void **)&dptr, pbo);
+
+	// Pass the camera state maintained by the mouse callbacks to the rasterizer.
+	// Fix: all_amt.x used to be passed twice, so all_amt.y (computed in
+	// mouse_pos_callback) never reached the renderer.
+	// NOTE(review): assumes the third parameter of rasterize() is the y translation - confirm in rasterize.h
+	rasterize(dptr, all_amt.x, all_amt.y, up_rot, right_rot);
+
+	// Unmap the buffer now that CUDA has finished writing to it.
+	cudaGLUnmapBufferObject(pbo);
+
+	// Per-frame bookkeeping: frame counter and the FPS counter read by mainLoop().
+	frame++;
+	fpstracker++;
+
+	// Reset the flag raised by mouse_button_callback.
+	click=false;
 
 }
 
@@ -91,135 +167,138 @@ void runCuda() {
 //-------------------------------
 
 bool init(obj *mesh) {
-    glfwSetErrorCallback(errorCallback);
-
-    if (!glfwInit()) {
-        return false;
-    }
-
-    width = 800;
-    height = 800;
-    window = glfwCreateWindow(width, height, "CIS 565 Pathtracer", NULL, NULL);
-    if (!window) {
-        glfwTerminate();
-        return false;
-    }
-    glfwMakeContextCurrent(window);
-    glfwSetKeyCallback(window, keyCallback);
-
-    // Set up GL context
-    glewExperimental = GL_TRUE;
-    if (glewInit() != GLEW_OK) {
-        return false;
-    }
-
-    // Initialize other stuff
-    initVAO();
-    initTextures();
-    initCuda();
-    initPBO();
-
-    float cbo[] = {
-        0.0, 1.0, 0.0,
-        0.0, 0.0, 1.0,
-        1.0, 0.0, 0.0
-    };
-    rasterizeSetBuffers(mesh->getBufIdxsize(), mesh->getBufIdx(),
-            mesh->getBufPossize() / 3,
-            mesh->getBufPos(), mesh->getBufNor(), mesh->getBufCol());
-
-    GLuint passthroughProgram;
-    passthroughProgram = initShader();
-
-    glUseProgram(passthroughProgram);
-    glActiveTexture(GL_TEXTURE0);
-
-    return true;
+	glfwSetErrorCallback(errorCallback);
+
+	if (!glfwInit()) {
+		return false;
+	}
+
+	width = 800;
+	height = 800;
+	window = glfwCreateWindow(width, height, "CIS 565 Pathtracer", NULL, NULL);
+	if (!window) {
+		glfwTerminate();
+		return false;
+	}
+	glfwMakeContextCurrent(window);
+	glfwSetCursorPosCallback(window, mouse_pos_callback);
+	glfwSetMouseButtonCallback(window, mouse_button_callback);
+	//glfwSetScrollCallback(window, mouse_scroll_callback);
+	glfwSetKeyCallback(window, keyCallback);
+
+	// Set up GL context
+	glewExperimental = GL_TRUE;
+	if (glewInit() != GLEW_OK) {
+		return false;
+	}
+
+	// Initialize other stuff
+	initVAO();
+	initTextures();
+	initCuda();
+	initPBO();
+
+	float cbo[] = {
+		0.0, 1.0, 0.0,
+		0.0, 0.0, 1.0,
+		1.0, 0.0, 0.0
+	};
+	rasterizeSetBuffers(mesh->getBufIdxsize(), mesh->getBufIdx(),
+		mesh->getBufPossize() / 3,
+		mesh->getBufPos(), mesh->getBufNor(), mesh->getBufCol(),1);
+
+	GLuint passthroughProgram;
+	passthroughProgram = initShader();
+
+	glUseProgram(passthroughProgram);
+	glActiveTexture(GL_TEXTURE0);
+
+	return true;
 }
 
 void initPBO() {
-    // set up vertex data parameter
-    int num_texels = width * height;
-    int num_values = num_texels * 4;
-    int size_tex_data = sizeof(GLubyte) * num_values;
+	// set up vertex data parameter
+	int num_texels = width * height;
+	int num_values = num_texels * 4;
+	int size_tex_data = sizeof(GLubyte)* num_values;
 
-    // Generate a buffer ID called a PBO (Pixel Buffer Object)
-    glGenBuffers(1, &pbo);
+	// Generate a buffer ID called a PBO (Pixel Buffer Object)
+	glGenBuffers(1, &pbo);
 
-    // Make this the current UNPACK buffer (OpenGL is state-based)
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
+	// Make this the current UNPACK buffer (OpenGL is state-based)
+	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
 
-    // Allocate data for the buffer. 4-channel 8-bit image
-    glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
-    cudaGLRegisterBufferObject(pbo);
+	// Allocate data for the buffer. 4-channel 8-bit image
+	glBufferData(GL_PIXEL_UNPACK_BUFFER, size_tex_data, NULL, GL_DYNAMIC_COPY);
+	cudaGLRegisterBufferObject(pbo);
 
 }
 
 void initCuda() {
-    // Use device with highest Gflops/s
-    cudaGLSetGLDevice(0);
+	// Use device with highest Gflops/s
+	cudaGLSetGLDevice(0);
 
-    rasterizeInit(width, height);
+	rasterizeInit(width, height);
 
-    // Clean up on program exit
-    atexit(cleanupCuda);
+	// Clean up on program exit
+	atexit(cleanupCuda);
 }
 
 void initTextures() {
-    glGenTextures(1, &displayImage);
-    glBindTexture(GL_TEXTURE_2D, displayImage);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
-                  GL_UNSIGNED_BYTE, NULL);
+	glGenTextures(1, &displayImage);
+	glBindTexture(GL_TEXTURE_2D, displayImage);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
+		GL_UNSIGNED_BYTE, NULL);
 }
 
 void initVAO(void) {
-    GLfloat vertices[] = {
-        -1.0f, -1.0f,
-        1.0f, -1.0f,
-        1.0f,  1.0f,
-        -1.0f,  1.0f,
-    };
-
-    GLfloat texcoords[] = {
-        1.0f, 1.0f,
-        0.0f, 1.0f,
-        0.0f, 0.0f,
-        1.0f, 0.0f
-    };
-
-    GLushort indices[] = { 0, 1, 3, 3, 1, 2 };
-
-    GLuint vertexBufferObjID[3];
-    glGenBuffers(3, vertexBufferObjID);
-
-    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
-    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
-    glEnableVertexAttribArray(positionLocation);
-
-    glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
-    glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
-    glEnableVertexAttribArray(texcoordsLocation);
-
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
+	GLfloat vertices[] = {
+		-1.0f, -1.0f,
+		1.0f, -1.0f,
+		1.0f, 1.0f,
+		-1.0f, 1.0f,
+	};
+
+	GLfloat texcoords[] = {
+		1.0f, 1.0f,
+		0.0f, 1.0f,
+		0.0f, 0.0f,
+		1.0f, 0.0f
+	};
+
+	GLushort indices[] = { 0, 1, 3, 3, 1, 2 };
+
+	GLuint vertexBufferObjID[3];
+	glGenBuffers(3, vertexBufferObjID);
+
+	glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
+	glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
+	glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
+	glEnableVertexAttribArray(positionLocation);
+
+	glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
+	glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
+	glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
+	glEnableVertexAttribArray(texcoordsLocation);
+
+	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
+	glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
 }
 
 
 GLuint initShader() {
-    const char *attribLocations[] = { "Position", "Tex" };
-    GLuint program = glslUtility::createDefaultProgram(attribLocations, 2);
-    GLint location;
+	const char *attribLocations[] = { "Position", "Tex" };
+	GLuint program = glslUtility::createDefaultProgram(attribLocations, 2);
+	GLint location;
 
-    glUseProgram(program);
-    if ((location = glGetUniformLocation(program, "u_image")) != -1) {
-        glUniform1i(location, 0);
-    }
+	glUseProgram(program);
+	if ((location = glGetUniformLocation(program, "u_image")) != -1) {
+		glUniform1i(location, 0);
+	}
 
-    return program;
+	return program;
 }
 
 //-------------------------------
@@ -227,38 +306,39 @@ GLuint initShader() {
 //-------------------------------
 
 void cleanupCuda() {
-    if (pbo) {
-        deletePBO(&pbo);
-    }
-    if (displayImage) {
-        deleteTexture(&displayImage);
-    }
+	if (pbo) {
+		deletePBO(&pbo);
+	}
+	if (displayImage) {
+		deleteTexture(&displayImage);
+	}
 }
 
 void deletePBO(GLuint *pbo) {
-    if (pbo) {
-        // unregister this buffer object with CUDA
-        cudaGLUnregisterBufferObject(*pbo);
+	if (pbo) {
+		// unregister this buffer object with CUDA
+		cudaGLUnregisterBufferObject(*pbo);
 
-        glBindBuffer(GL_ARRAY_BUFFER, *pbo);
-        glDeleteBuffers(1, pbo);
+		glBindBuffer(GL_ARRAY_BUFFER, *pbo);
+		glDeleteBuffers(1, pbo);
 
-        *pbo = (GLuint)NULL;
-    }
+		*pbo = (GLuint)NULL;
+	}
 }
 
+
 void deleteTexture(GLuint *tex) {
-    glDeleteTextures(1, tex);
-    *tex = (GLuint)NULL;
+	glDeleteTextures(1, tex);
+	*tex = (GLuint)NULL;
 }
 
 void shut_down(int return_code) {
-    rasterizeFree();
-    cudaDeviceReset();
+	rasterizeFree();
+	cudaDeviceReset();
 #ifdef __APPLE__
-    glfwTerminate();
+	glfwTerminate();
 #endif
-    exit(return_code);
+	exit(return_code);
 }
 
 //------------------------------
@@ -266,11 +346,12 @@ void shut_down(int return_code) {
 //------------------------------
 
 void errorCallback(int error, const char *description) {
-    fputs(description, stderr);
+	fputs(description, stderr);
 }
 
 void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods) {
-    if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
-        glfwSetWindowShouldClose(window, GL_TRUE);
-    }
+	if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
+		glfwSetWindowShouldClose(window, GL_TRUE);
+	}
 }
+
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 53103b5..3bbe89a 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -7,157 +7,797 @@
  */
 
 #include "rasterize.h"
-
+//<seqan / parallel.h>
+#include <thrust/random.h>
 #include <cmath>
+#include <vector>
 #include <cstdio>
 #include <cuda.h>
 #include <thrust/random.h>
 #include <util/checkCUDAError.h>
 #include "rasterizeTools.h"
-
+#include <glm/gtc/matrix_transform.hpp>
+#include <glm/gtc/type_ptr.hpp>
+#define DEG2RAD  PI/180.f
+#define Tess 0
+#define Blending 0
 struct VertexIn {
-    glm::vec3 pos;
-    glm::vec3 nor;
-    glm::vec3 col;
-    // TODO (optional) add other vertex attributes (e.g. texture coordinates)
+	glm::vec3 pos;
+	glm::vec3 nor;
+	glm::vec3 col;
+	// TODO (optional) add other vertex attributes (e.g. texture coordinates)
 };
 struct VertexOut {
-    // TODO
+	// TODO
+	glm::vec3 pos;
+	glm::vec3 nor;
+	glm::vec3 col;
 };
 struct Triangle {
-    VertexOut v[3];
+	VertexOut v[3];
 };
 struct Fragment {
-    glm::vec3 color;
+	int dis;
+	glm::vec3 color;
+	glm::vec3 normal;
+	glm::vec3 pos;
+	glm::vec3 subcolor[4];
+	int subdis[4];
 };
-
+int N = 0;
+int M = 0;
+int mat = 0;
+int dev = 0;
 static int width = 0;
 static int height = 0;
 static int *dev_bufIdx = NULL;
 static VertexIn *dev_bufVertex = NULL;
+static VertexOut *dev_vsOutput = NULL;
 static Triangle *dev_primitives = NULL;
 static Fragment *dev_depthbuffer = NULL;
+static Fragment *dev_fmInput = NULL;
+static Fragment *dev_fmOutput = NULL;
 static glm::vec3 *dev_framebuffer = NULL;
 static int bufIdxSize = 0;
 static int vertCount = 0;
 
+__host__ __device__ inline unsigned int utilhash(unsigned int a) {
+	a = (a + 0x7ed55d16) + (a << 12);
+	a = (a ^ 0xc761c23c) ^ (a >> 19);
+	a = (a + 0x165667b1) + (a << 5);
+	a = (a + 0xd3a2646c) ^ (a << 9);
+	a = (a + 0xfd7046c5) + (a << 3);
+	a = (a ^ 0xb55a4f09) ^ (a >> 16);
+	return a;
+}
+__host__ __device__
+thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
+	int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
+	return thrust::default_random_engine(h);
+}
 /**
  * Kernel that writes the image to the OpenGL PBO directly.
  */
-__global__
-void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
-
-    if (x < w && y < h) {
-        glm::vec3 color;
-        color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
-        color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
-        color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
-}
+__global__ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * w);
 
+	if (x < w && y < h) {
+		glm::vec3 color;
+		color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
+		color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
+		color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
+		// Each thread writes one pixel location in the texture (textel)
+		pbo[index].w = 0;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
+	}
+}
+// Resets one pixel of both fragment buffers per thread; x/y come from blockIdx.x/.y, so a 2D launch covering w x h is required.
+__global__ void cleanDepth(Fragment* dev_depthbuffer, Fragment* dev_fmInput, int w, int h)
+{
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * w);
+	const int farthest = 0x7fffffff;	// largest int: Fragment::dis is an int, and converting float INFINITY to int is undefined
+	if (x < w && y < h)
+	{
+		dev_depthbuffer[index].color = glm::vec3(1, 0, 0);
+		dev_depthbuffer[index].dis = farthest;
+		dev_depthbuffer[index].normal = glm::vec3(0, 1, 0);
+		dev_fmInput[index].dis = farthest;
+		dev_fmInput[index].color = glm::vec3(1, 1, 1);
+		dev_fmInput[index].normal = glm::vec3(0, 1, 0);
+	}
+}
 // Writes fragment colors to the framebuffer
-__global__
-void render(int w, int h, Fragment *depthbuffer, glm::vec3 *framebuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-    int index = x + (y * w);
+__global__ void render(int w, int h, Fragment *depthbuffer, glm::vec3 *framebuffer) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+	int index = x + (y * w);
 
-    if (x < w && y < h) {
-        framebuffer[index] = depthbuffer[index].color;
-    }
+	if (x < w && y < h) {
+		framebuffer[index] = depthbuffer[index].color;
+	}
 }
 
 /**
  * Called once at the beginning of the program to allocate memory.
  */
 void rasterizeInit(int w, int h) {
-    width = w;
-    height = h;
-    cudaFree(dev_depthbuffer);
-    cudaMalloc(&dev_depthbuffer,   width * height * sizeof(Fragment));
-    cudaMemset(dev_depthbuffer, 0, width * height * sizeof(Fragment));
-    cudaFree(dev_framebuffer);
-    cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
-    cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
-    checkCUDAError("rasterizeInit");
+	width = w;
+	height = h;
+	cudaFree(dev_depthbuffer);
+	cudaMalloc(&dev_depthbuffer, width * height * sizeof(Fragment));
+	cudaMemset(dev_depthbuffer, 0, width * height * sizeof(Fragment));
+	cudaFree(dev_framebuffer);
+	cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3));
+	cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
+
+	cudaFree(dev_fmInput);
+	cudaMalloc(&dev_fmInput, 4 * width * height * sizeof(Fragment));
+	cudaMemset(dev_fmInput, 0, 4 * width * height * sizeof(Fragment));
+
+	cudaFree(dev_fmOutput);
+	cudaMalloc(&dev_fmOutput, width * height * sizeof(Fragment));
+	cudaMemset(dev_fmOutput, 0, width * height * sizeof(Fragment));
+	checkCUDAError("rasterizeInit");
 }
 
 /**
  * Set all of the buffers necessary for rasterization.
  */
+
 void rasterizeSetBuffers(
-        int _bufIdxSize, int *bufIdx,
-        int _vertCount, float *bufPos, float *bufNor, float *bufCol) {
-    bufIdxSize = _bufIdxSize;
-    vertCount = _vertCount;
+	int _bufIdxSize, int *bufIdx,
+	int _vertCount, float *bufPos, float *bufNor, float *bufCol, bool resselation) {
+	//NOTE: the caller's flag is overridden here; the compile-time Tess switch decides the primitive buffer size
+	resselation = Tess;
+	//********************
+	bufIdxSize = _bufIdxSize;
+	vertCount = _vertCount;
+
+	cudaFree(dev_bufIdx);
+	cudaMalloc(&dev_bufIdx, bufIdxSize * sizeof(int));
+	cudaMemcpy(dev_bufIdx, bufIdx, bufIdxSize * sizeof(int), cudaMemcpyHostToDevice);
+
 
-    cudaFree(dev_bufIdx);
-    cudaMalloc(&dev_bufIdx, bufIdxSize * sizeof(int));
-    cudaMemcpy(dev_bufIdx, bufIdx, bufIdxSize * sizeof(int), cudaMemcpyHostToDevice);
+	VertexIn *bufVertex = new VertexIn[_vertCount];
+	float maxv = -1.f;
 
-    VertexIn *bufVertex = new VertexIn[_vertCount];
-    for (int i = 0; i < vertCount; i++) {
-        int j = i * 3;
-        bufVertex[i].pos = glm::vec3(bufPos[j + 0], bufPos[j + 1], bufPos[j + 2]);
-        bufVertex[i].nor = glm::vec3(bufNor[j + 0], bufNor[j + 1], bufNor[j + 2]);
-        bufVertex[i].col = glm::vec3(bufCol[j + 0], bufCol[j + 1], bufCol[j + 2]);
-    }
-    cudaFree(dev_bufVertex);
-    cudaMalloc(&dev_bufVertex, vertCount * sizeof(VertexIn));
-    cudaMemcpy(dev_bufVertex, bufVertex, vertCount * sizeof(VertexIn), cudaMemcpyHostToDevice);
+	for (int i = 0; i < vertCount; i++) {
+		int j = i * 3;
+		bufVertex[i].pos = glm::vec3(bufPos[j + 0], bufPos[j + 1], bufPos[j + 2]);
+		bufVertex[i].nor = glm::vec3(bufNor[j + 0], bufNor[j + 1], bufNor[j + 2]);
+		bufVertex[i].col = glm::vec3(bufCol[j + 0], bufCol[j + 1], bufCol[j + 2]);
+		//largest coordinate over all three axes; N below is derived from it
+		float temp = std::max(bufVertex[i].pos.x, std::max(bufVertex[i].pos.y, bufVertex[i].pos.z));
+		if (temp>maxv){ maxv = temp; }
+	}
+	N = (int)maxv + 1;
+	cudaFree(dev_bufVertex);
+	cudaMalloc(&dev_bufVertex, vertCount * sizeof(VertexIn));
+	cudaMemcpy(dev_bufVertex, bufVertex, vertCount * sizeof(VertexIn), cudaMemcpyHostToDevice);
 
-    cudaFree(dev_primitives);
-    cudaMalloc(&dev_primitives, vertCount / 3 * sizeof(Triangle));
-    cudaMemset(dev_primitives, 0, vertCount / 3 * sizeof(Triangle));
+	cudaFree(dev_vsOutput);
+	cudaMalloc(&dev_vsOutput, vertCount * sizeof(VertexOut));
+	delete[] bufVertex;	// host staging copy; its contents were already uploaded by the cudaMemcpy above
+	if (!resselation)
+	{
+		cudaFree(dev_primitives);
+		cudaMalloc(&dev_primitives, vertCount / 3 * sizeof(Triangle));
+		cudaMemset(dev_primitives, 0, vertCount / 3 * sizeof(Triangle));
+		checkCUDAError("rasterizeSetBuffers");
+	}
+	else
+	{
+		cudaFree(dev_primitives);
+		cudaMalloc(&dev_primitives, vertCount / 3 * 4 * sizeof(Triangle));
+		cudaMemset(dev_primitives, 0, vertCount / 3 * 4 * sizeof(Triangle));
+		checkCUDAError("rasterizeSetBuffers");
+	}
 
-    checkCUDAError("rasterizeSetBuffers");
 }
 
-/**
+// One thread per vertex: writes the ViewProj-transformed position, the transformed and re-normalized normal, and a constant red colour.
+__global__ void vertexShader(VertexIn *dev_bufVertex, VertexOut *dev_vsOutput, int vertexCount, glm::mat4 ViewProj){
+
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (id < vertexCount){
+		//simple orthographic projection (disabled):
+		//dev_vsOutput[id].pos = dev_bufVertex[id].pos;
+		//dev_vsOutput[id].nor = dev_bufVertex[id].nor;
+
+		dev_vsOutput[id].pos = multiplyMV(ViewProj, glm::vec4(dev_bufVertex[id].pos, 1));	// w = 1 for a position
+		dev_vsOutput[id].nor = multiplyMV(ViewProj, glm::vec4(dev_bufVertex[id].nor, 0));	// w = 0 for a direction
+		dev_vsOutput[id].nor = glm::normalize(dev_vsOutput[id].nor);
+		dev_vsOutput[id].col = glm::vec3(1, 0, 0);	// the input colour is ignored; see the disabled line below
+		//dev_vsOutput[id].col = dev_bufVertex[id].col;
+		//interpolate the normal:smooth normal color??
+	}
+
+}
+// One thread per triangle: copies position, normal and colour of three consecutive shaded vertices into dev_primitives[id].
+__global__ void PrimitiveAssembly(VertexOut *dev_vsOutput, Triangle * dev_primitives, int verCount)
+{
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (id < verCount / 3){
+		//triangle id takes vertices 3*id, 3*id+1, 3*id+2 straight from the vertex array (no index buffer is read here)
+		dev_primitives[id].v[0].pos = dev_vsOutput[3 * id].pos;//012,345,678
+		dev_primitives[id].v[1].pos = dev_vsOutput[3 * id + 1].pos;
+		dev_primitives[id].v[2].pos = dev_vsOutput[3 * id + 2].pos;
+		dev_primitives[id].v[0].nor = dev_vsOutput[3 * id].nor;//012,345,678
+		dev_primitives[id].v[1].nor = dev_vsOutput[3 * id + 1].nor;
+		dev_primitives[id].v[2].nor = dev_vsOutput[3 * id + 2].nor;
+		dev_primitives[id].v[0].col = dev_vsOutput[3 * id].col;//012,345,678
+		dev_primitives[id].v[1].col = dev_vsOutput[3 * id + 1].col;
+		dev_primitives[id].v[2].col = dev_vsOutput[3 * id + 2].col;
+	}
+}
+// Approximate float equality: true when a lies strictly within 0.000001 of b.
+__host__ __device__  bool fequal(float a, float b){
+	if (a > b - 0.000001&&a < b + 0.000001){ return true; }
+	else return false;
+}
+// Atomic minimum built on atomicCAS; returns the value seen at *addr before this call stored anything.
+__device__ int _atomicMin(int *addr, int val)
+{
+	int old = *addr, assumed;
+	do{
+		if (old <= val) return old;	// re-tested every round: another thread may have stored a smaller value since the last read
+		assumed = old;
+		old = atomicCAS(addr, assumed, val);
+	} while (old != assumed);
+	return old;
+}
+/*{
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (id < vertexcount / 3.f)
+	{
+	glm::vec3 tri[3];
+	for (int i = 0; i < 3; i++){
+	tri[i] = dev_primitives[id].v[i].pos;
+	tri[i].x += N;
+	tri[i].y += N;
+	tri[i].z += N;
+	tri[i].x *= w / (float)(2.f*N);
+	tri[i].y *= h / (float)(2.f*N);
+	tri[i].z *= w / (float)(2.f*N);
+	//because the image is cube anyway...I think multiply should have better result than devide...
+	}
+	AABB aabb;
+	aabb = getAABBForTriangle(tri);
+
+	for (int i = aabb.min.x - 1; i < aabb.max.x + 1; i += 0.5){
+	for (int j = aabb.min.y - 1; j < aabb.max.y + 1; j += 0.5){
+	if (tri[0].x > w || tri[0].x < 0 || tri[0].y>h || tri[0].x < 0)
+	{
+	//color[i*w + j].color = glm::vec3(0, 0, 0);//black
+	}	//anti-aliansing..multisampling the patern 4 sample every pixel
+	glm::vec2 point(i, j);
+
+	glm::vec3 baryc = calculateBarycentricCoordinate(tri, point);
+	if (isBarycentricCoordInBounds(baryc))
+	{
+	int intdepth = getZAtCoordinate(baryc, tri);
+	int dis;
+	_atomicMin(&dis, intdepth);
+	if (intdepth == dis){
+	dev_fmInput[i*w + j].subcolor[k] = dev_primitives[id].v[0].nor;
+	}
+	dev_fmInput[i*w + j].pos = dev_primitives[id].v[0].pos;
+	dev_fmInput[i*w + j].normal = dev_primitives[id].v[0].nor;
+	}
+	}
+	}
+	//else //pixel have more than 1 color
+	//{
+	/*	glm::vec3 baryc_p[4];
+	int intdepth_s[4];
+	for (int p = 0; p < 4; p++)
+	{
+	baryc_p[p] = calculateBarycentricCoordinate(tri, random_point[p]);
+	if (isBarycentricCoordInBounds(baryc_p[p])){
+	intdepth_s[p] = getZAtCoordinate(baryc_p[p], tri);
+	_atomicMin(&dev_fmInput[i*w + j].subdis[p], intdepth_s[p]);
+	if (intdepth_s[p] == dev_fmInput[i*w + j].subdis[p]){
+	dev_fmInput[i*w + j].subcolor[p] = dev_primitives[id].v[0].nor;;
+	}
+	}
+	}
+	dev_fmInput[i*w + j].pos = dev_primitives[id].v[0].pos;
+	dev_fmInput[i*w + j].normal = dev_primitives[id].v[0].nor;
+	//	}
+	}
+	}
+	}
+	}
+	//dev_primitives, dev_fmInput*4, dev_fmOutput
+	/*__global__ void rasterization(Triangle * dev_primitives, Fragment *dev_fmInput, int vertexcount, int w, int h, int N)
+	{
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (id < vertexcount / 3.f)
+	{
+	//potimized boundingbox;
+	glm::vec3 tri[3];
+	for (int i = 0; i < 3; i++){//(-1,1)+1*w/2
+	//(-10,10)+10*w/20
+	tri[i] = dev_primitives[id].v[i].pos;
+	tri[i].x += N;
+	tri[i].y += N;
+	tri[i].z += N;
+	tri[i].x *= w / (float)(2.f*N);
+	tri[i].y *= h / (float)(2.f*N);
+	tri[i].z *= w / (float)(2.f*N);
+	//because the image is cube anyway...I think multiply should have better result than devide...
+	}
+	AABB aabb;
+	aabb = getAABBForTriangle(tri);
+
+	for (int i = aabb.min.x - 1; i < aabb.max.x + 1; i++){
+	for (int j = aabb.min.y - 1; j < aabb.max.y + 1; j++){
+	if (tri[0].x > w || tri[0].x < 0 || tri[0].y>h || tri[0].x < 0)
+	{
+	//color[i*w + j].color = glm::vec3(0, 0, 0);//black
+	}	//anti-aliansing..multisampling the patern 4 sample every pixel
+	glm::vec2 point(i + 0.5, j + 0.5);
+	thrust::default_random_engine rngx = makeSeededRandomEngine(i, id, 1);
+	thrust::default_random_engine rngy = makeSeededRandomEngine(j, id, 1);
+	thrust::uniform_real_distribution<float> u1(0, 0.5);
+	thrust::uniform_real_distribution<float> u2(0.5, 0.999);
+	glm::vec2 random_point[4];
+	int number = 0;
+	//random_point[0].x = i  + u1(rngx);//-1,1
+	//random_point[0].y = j  + u1(rngy);
+
+	//random_point[1].x = i + u2(rngx);//-1,-1
+	//random_point[1].y = j + u1(rngy);
+
+	//random_point[2].x = i + u1(rngx);//1,1
+	//random_point[2].y = j + u2(rngy);
+
+	//random_point[3].x = i + u2(rngx);//i+0+0.22,i+
+	//random_point[3].y = j + u2(rngy);
+	random_point[0].x = i + 0.25;//-1,1
+	random_point[0].y = j + 0.25;
+
+	random_point[1].x = i + 0.25;//-1,-1
+	random_point[1].y = j + 0.75;
+
+	random_point[2].x = i + 0.75;//1,1
+	random_point[2].y = j + 0.25;
+
+	random_point[3].x = i + 0.75;//i+0+0.22,i+
+	random_point[3].y = j + 0.75;
+	for (int t = 0; t < 4;t++){
+	glm::vec3 baryc_sub = calculateBarycentricCoordinate(tri, random_point[t]);
+	if (isBarycentricCoordInBounds(baryc_sub))
+	{
+	number++;
+	}
+	}
+	/*	if (number == 4)//all in
+	{
+	glm::vec3 baryc = calculateBarycentricCoordinate(tri, point);
+	if (isBarycentricCoordInBounds(baryc)){
+	int intdepth = getZAtCoordinate(baryc, tri);
+	_atomicMin(&(dev_fmInput[i*w + j].dis), intdepth);
+	if (intdepth == dev_fmInput[i*w + j].dis){
+	for (int k = 0; k < 4; k++){
+	dev_fmInput[i*w + j].subcolor[k] = dev_primitives[id].v[0].nor;
+	}
+	dev_fmInput[i*w + j].pos= dev_primitives[id].v[0].pos;
+	dev_fmInput[i*w + j].normal = dev_primitives[id].v[0].nor;
+	}
+	}
+	}*/
+//else //pixel have more than 1 color
+//{
+/*	glm::vec3 baryc_p[4];
+	int intdepth_s[4];
+	for (int p = 0; p < 4; p++)
+	{
+	baryc_p[p] = calculateBarycentricCoordinate(tri, random_point[p]);
+	if (isBarycentricCoordInBounds(baryc_p[p])){
+	intdepth_s[p] = getZAtCoordinate(baryc_p[p], tri);
+	_atomicMin(&dev_fmInput[i*w + j].subdis[p], intdepth_s[p]);
+	if (intdepth_s[p] == dev_fmInput[i*w + j].subdis[p]){
+	dev_fmInput[i*w + j].subcolor[p] = dev_primitives[id].v[0].nor;;
+	}
+	}
+	}
+	dev_fmInput[i*w + j].pos = dev_primitives[id].v[0].pos;
+	dev_fmInput[i*w + j].normal = dev_primitives[id].v[0].nor;
+	//	}
+	}
+	}
+	}
+	}*/
+
+/*__global__ void rasterization(Triangle * dev_primitives, Fragment *dev_fmInput, int vertexcount, int w, int h, int N)
+{
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (id < vertexcount / 3.f)
+	{
+		//potimized boundingbox; 
+		glm::vec3 tri[3];
+		for (int i = 0; i < 3; i++){//(-1,1)+1*w/2 
+			//(-10,10)+10*w/20 
+			tri[i] = dev_primitives[id].v[i].pos;
+			tri[i].x += N;
+			tri[i].y += N;
+			tri[i].z += N;
+			tri[i].x *= w / (float)(2.f*N);
+			tri[i].y *= h / (float)(2.f*N);
+			tri[i].z *= w / (float)(2.f*N);
+			//because the image is cube anyway...I think multiply should have better result than devide... 
+
+		}
+		AABB aabb;
+		aabb = getAABBForTriangle(tri);
+		for (int i = aabb.min.x - 1; i < aabb.max.x + 1; i++){
+			for (int j = aabb.min.y - 1; j < aabb.max.y + 1; j++){
+				glm::vec2 point(i, j);
+				glm::vec3 baryc = calculateBarycentricCoordinate(tri, point);
+				//simple clip.. 
+				if (tri[0].x > w || tri[0].x < 0 || tri[0].y>h || tri[0].x < 0)continue;
+				if (isBarycentricCoordInBounds(baryc)){
+					//these three normal should be the same since they are on the same face (checked) 
+					int intdepth = (int)getZAtCoordinate(baryc, tri);
+					//atomicMin(int* address, int val)
+					//reads word old located at the address, computes the minimum of old and val, 
+					//and stores the result back to memory at the same address. returns old
+					atomicMin(&dev_fmInput[i*w + j].dis, intdepth);
+					if (dev_fmInput[i*w + j].dis == intdepth){
+
+						dev_fmInput[i*w + j].color = dev_primitives[id].v[0].col;
+						dev_fmInput[i*w + j].normal = dev_primitives[id].v[0].nor;
+						dev_fmInput[i*w + j].pos = (dev_primitives[id].v[0].pos + dev_primitives[id].v[1].pos + dev_primitives[id].v[2].pos) / 3.f;
+					}
+				}
+
+			}
+
+		}
+	}
+}*/
+// One thread per triangle: maps it to pixel space, walks its bounding box and depth-tests covered pixels into dev_fmInput.
+__global__ void rasterization(Triangle * dev_primitives, Fragment *dev_fmInput, int vertexcount, int w, int h, int N)
+{
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (id < vertexcount / 3)	// integer division: a trailing partial triangle is not treated as a primitive
+	{
+		glm::vec3 tri[3];
+		for (int i = 0; i < 3; i++){
+			//shift [-N, N] to [0, 2N], then scale to pixel units, e.g. (-1,1)+1 then *w/2
+			tri[i] = dev_primitives[id].v[i].pos;
+			tri[i].x += N;
+			tri[i].y += N;
+			tri[i].z += N;
+			tri[i].x *= w / (float)(2.f*N);
+			tri[i].y *= h / (float)(2.f*N);
+			tri[i].z *= w / (float)(2.f*N);
+			//because the image is cube anyway...I think multiply should have better result than devide...
+		}
+		//simple clip: drop the triangle when its first vertex is off screen (same answer for every pixel, so tested once)
+		if (tri[0].x > w || tri[0].x < 0 || tri[0].y > h || tri[0].y < 0) return;
+		AABB aabb;
+		aabb = getAABBForTriangle(tri);
+		//fragments live at [i*w + j]; keeping 0 <= i < h and 0 <= j < w keeps that index inside the w*h pixels
+		for (int i = glm::max((int)(aabb.min.x - 1), 0); i < aabb.max.x + 1 && i < h; i++){
+			for (int j = glm::max((int)(aabb.min.y - 1), 0); j < aabb.max.y + 1 && j < w; j++){
+				glm::vec2 point(i, j);
+				glm::vec3 baryc = calculateBarycentricCoordinate(tri, point);
+				//random sample anti-aliansing 1-sample..
+				//NOTE(review): the jittered point below is assigned after baryc was computed and is never read again
+				thrust::default_random_engine rngx = makeSeededRandomEngine(i, id, 1);
+				thrust::default_random_engine rngy = makeSeededRandomEngine(j, id, 1);
+				thrust::uniform_real_distribution<float> u1(0, 1);
+				point =glm::vec2(i + u1(rngx), j + u1(rngy));
+				if (isBarycentricCoordInBounds(baryc)){
+					//these three normal should be the same since they are on the same face (checked)
+					int intdepth = (int)getZAtCoordinate(baryc, tri);
+					//atomicMin(int* address, int val)
+					//reads word old located at the address, computes the minimum of old and val,
+					//and stores the result back to memory at the same address. returns old
+					atomicMin(&dev_fmInput[i*w + j].dis, intdepth);
+					if (dev_fmInput[i*w + j].dis == intdepth){
+						dev_fmInput[i*w + j].color = dev_primitives[id].v[0].col;
+						dev_fmInput[i*w + j].normal = dev_primitives[id].v[0].nor;
+						dev_fmInput[i*w + j].pos = (dev_primitives[id].v[0].pos + dev_primitives[id].v[1].pos + dev_primitives[id].v[2].pos) / 3.f;
+					}
+				}
+
+			}
+
+		}
+	}
+}
+// One thread per input triangle: splits it at the edge midpoints into 4 triangles written to dev_triout[4*id .. 4*id+3].
+__global__ void Tesselation(bool active, VertexOut *dev_vertin, Triangle *dev_triout, int vercount)
+{
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (active&&id < vercount / 3)
+	{
+		//integer division above keeps 3*id+2 inside the vercount input vertices
+		glm::vec3 tri[3];
+		tri[0] = dev_vertin[3 * id].pos;
+		tri[1] = dev_vertin[3 * id + 1].pos;
+		tri[2] = dev_vertin[3 * id + 2].pos;
+		//default tesselation,generate 4 triangles automativaly
+		glm::vec3 vnew[3];
+		vnew[0] = (tri[0] + tri[1]) / 2.f;
+		vnew[1] = (tri[0] + tri[2]) / 2.f;
+		vnew[2] = (tri[2] + tri[1]) / 2.f;
+
+		dev_triout[4 * id].v[0].pos = tri[0];
+		dev_triout[4 * id].v[1].pos = vnew[0];
+		dev_triout[4 * id].v[2].pos = vnew[1];
+
+		dev_triout[4 * id + 1].v[0].pos = vnew[0];
+		dev_triout[4 * id + 1].v[1].pos = tri[1];
+		dev_triout[4 * id + 1].v[2].pos = vnew[2];
+
+		dev_triout[4 * id + 2].v[0].pos = vnew[0];
+		dev_triout[4 * id + 2].v[1].pos = vnew[2];
+		dev_triout[4 * id + 2].v[2].pos = vnew[1];
+
+		dev_triout[4 * id + 3].v[0].pos = vnew[1];
+		dev_triout[4 * id + 3].v[1].pos = vnew[2];
+		dev_triout[4 * id + 3].v[2].pos = tri[2];
+		/*for (int i = 0; i < 4; i++){
+			for (int j = 0; j < 3; j++)
+			{
+			dev_triout[4 * id + i].v[j].nor = dev_vertin[3 * id].nor;
+			}
+			}*/
+		//in order to check :change the normal a little (the output colours are not written by this kernel)
+		for (int i = 0; i < 3; i++){
+			{
+				dev_triout[4 * id].v[i].nor = glm::normalize(dev_vertin[3 * id].nor + glm::vec3(0.3, 0, 0));
+				dev_triout[4 * id + 1].v[i].nor = glm::normalize(dev_vertin[3 * id].nor + glm::vec3(0, 0.3, 0));
+				dev_triout[4 * id + 2].v[i].nor = glm::normalize(dev_vertin[3 * id].nor + glm::vec3(0, 0, 0));
+				dev_triout[4 * id + 3].v[i].nor = glm::normalize(dev_vertin[3 * id].nor + glm::vec3(0, 0, 0.3));
+			}
+		}
+	}
+
+}
+/* scan_line:brute force
+glm::vec3 tri[3];
+for (int i = 0; i < 3; i++){
+tri[i] = dev_primitives[id].v[i].pos;
+tri[i].x += 1;
+tri[i].y += 1;
+tri[i].x *= w / 2.f;
+tri[i].y *= h / 2.f;
+}
+for (int i = 0; i < w; i++){
+for (int j = 0; j < h; j++){
+glm::vec2 point(i, j);
+glm::vec3 baryc = calculateBarycentricCoordinate(tri, point);
+if (isBarycentricCoordInBounds(baryc)){
+dev_fmInput[i*w + j].color = glm::vec3(1, 0, 0);
+}
+}*/
+// Returns the fixed light position (2, 1, 2).
+glm::vec3 SetLight()
+{
+	glm::vec3 light_pos = glm::vec3(2, 1, 2);
+
+	return light_pos;
+}
+//blin phong
+/*__global__ void antialiansing(Triangle *dev_in,Triangle *dev_out,int trianglecount)
+{
+int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+if (id < trianglecount)
+{
+
+}
+}*/
+//input output depthbuffer
+// One thread per pixel: Blinn-Phong shades dev_fmInput[id] using light_pos and camera_pos and writes the colour to dev_fmOutput[id].
+__global__ void	fragmentShading(Fragment *dev_fmInput, Fragment *dev_fmOutput, int w, int h, glm::vec3 light_pos, glm::vec3 camera_pos, bool defaultbackground)
+{
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (id < w*h){
+		///glm::vec3 ccc = (dev_fmInput[id].subcolor[0] + dev_fmInput[id].subcolor[1] + dev_fmInput[id].subcolor[2] + dev_fmInput[id].subcolor[3]) / 4.f;
+		glm::vec3 ccc = dev_fmInput[id].color;
+		//specular term
+		float specular_power = 100;
+		glm::vec3 specular_color = glm::vec3(1, 1, 1);//dev_fmInput[id].color;
+		glm::vec3 lightray = light_pos - dev_fmInput[id].pos;
+		glm::vec3 inray = camera_pos - dev_fmInput[id].pos;
+		glm::vec3 H = glm::normalize(inray) + glm::normalize(lightray);
+		H = glm::vec3(H.x / 2.0, H.y / 2.0, H.z / 2.0);
+		//a half vector facing away from the normal gets no highlight (the even exponent would make a negative dot positive)
+		float hdot = glm::max(glm::dot(H, dev_fmInput[id].normal), 0.f);
+		float x = pow(hdot, specular_power);
+		glm::vec3 spec = x*specular_color;
+
+		glm::vec3 Lambert = glm::vec3(1, 1, 1);
+		glm::vec3 Ambient = ccc;
+		float diffuse = glm::clamp(glm::dot(dev_fmInput[id].normal, glm::normalize(lightray)), 0.0f, 1.0f);
+		Lambert *= diffuse;
+
+		glm::vec3 phong_color = 0.5f*spec + 0.4f*Lambert + 0.1f*Ambient;//where is ambient light?
+		phong_color = glm::clamp(phong_color, 0.f, 1.f);
+
+		//dev_fmOutput[id].color = phong_color;
+		//blending
+		//DestinationColor.rgb = (SourceColor.rgb * One) + (DestinationColor.rgb * (1 - SourceColor.a));
+		if (Blending){
+			if (defaultbackground)
+			{
+				glm::vec3 background = glm::vec3(0, 0, 1);
+				float default_a = 0.8;
+				dev_fmOutput[id].color = phong_color + (background * (1 - default_a));
+			}
+			else
+			{
+				float depth = dev_fmInput[id].dis;
+				if (depth > 0) {
+					dev_fmOutput[id].color = glm::vec3(0.8, 0.8, 0.8);
+				}
+				else dev_fmOutput[id].color = (-depth)* phong_color + (1 + depth)*glm::vec3(0.8, 0.8, 0.8);
+			}
+		}
+		else dev_fmOutput[id].color = phong_color;
+	}
+}
+
+
+/*
  * Perform rasterization.
  */
-void rasterize(uchar4 *pbo) {
-    int sideLength2d = 8;
-    dim3 blockSize2d(sideLength2d, sideLength2d);
-    dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
-                      (height - 1) / blockSize2d.y + 1);
+// Rotates the look-at point ref around eye by deg degrees about the axis right; eye itself is not changed.
+void RotateAboutRight(float deg, glm::vec3 &ref, const glm::vec3 right, const glm::vec3 eye)
+{
+	deg *= DEG2RAD;
+	glm::mat4 rotation = glm::rotate(glm::mat4(1.0f), deg, right);
+	ref = ref - eye;	// rotate relative to the eye, then move back
+	ref = glm::vec3(rotation * glm::vec4(ref, 1));
+	ref = ref + eye;
+}
+void TranslateAlongRight(float amt, glm::vec3 &ref, const glm::vec3 right, glm::vec3 &eye)	// moves eye and ref together by amt along right
+{
+	glm::vec3 translation = right * amt;
+	eye += translation;
+	ref += translation;
+}
+void RotateAboutUp(float deg, glm::vec3 &ref, const glm::vec3 right, const glm::vec3 eye, const glm::vec3 up)	// rotates ref around eye about up; the right parameter is not used
+{
+	deg *= DEG2RAD;	// deg arrives in degrees
+	glm::mat4 rotation = glm::rotate(glm::mat4(1.0f), deg, up);
+	ref = ref - eye;	// rotate relative to the eye, then move back
+	ref = glm::vec3(rotation * glm::vec4(ref, 1));
+	ref = ref + eye;
+}
+void TranslateAlongLook(float amt, const glm::vec3 look, glm::vec3 &eye, glm::vec3 & ref)	// moves eye and ref together by amt along look
+{
+	glm::vec3 translation = look * amt;
+	eye += translation;
+	ref += translation;
+}
+void TranslateAlongUp(float amt, glm::vec3 &eye, glm::vec3 & ref, const glm::vec3 up)	// moves eye and ref together by amt along up
+{
+	glm::vec3 translation = up * amt;
+	eye += translation;
+	ref += translation;
+}
+// Builds projection*view for a camera that starts at (3,0,3) looking at the origin, after the given rotations/translations; the final eye position is returned in camerapos.
+glm::mat4 camera(float x_trans_amount, float y_trans_amount, float up_angle_amount, float right_angle_amount, glm::vec3 &camerapos)
+{
+	glm::vec3 eye = glm::vec3(3, 0, 3);
+	glm::vec3 up = glm::vec3(0, 1, 0);
+	glm::vec3 ref = glm::vec3(0, 0, 0);
+
+	float near_clip = 1.0f;
+	float far_clip = 1000.f;
+	float width = 800;	// NOTE(review): these locals shadow the file-level width/height and fix the aspect ratio at 1
+	float height = 800;
+	float aspect = (float)width / (float)height;
+	float fovy = 45.f;
+	glm::vec3 world_up = glm::vec3(0, 1, 0);
+	glm::vec3 look = glm::normalize(ref - eye);
+	glm::vec3 right = glm::normalize(glm::cross(look, world_up));
+	RotateAboutRight(right_angle_amount, ref, right, eye);
+	RotateAboutUp(up_angle_amount, ref, right, eye, up);
+	TranslateAlongRight(x_trans_amount, ref, right, eye);
+	TranslateAlongUp(y_trans_amount, eye, ref, up);
+	camerapos = eye;	// captured after the translations, which move eye
+	glm::mat4 viewMatrix = glm::lookAt(eye, ref, up);
+	glm::mat4 projectionMatrix = glm::perspective(fovy, aspect, near_clip, far_clip);//fovy,aspect, zNear, zFar;
 
-    // TODO: Execute your rasterization pipeline here
-    // (See README for rasterization pipeline outline.)
+	glm::mat4 getViewProj = projectionMatrix*viewMatrix;
+	return getViewProj;
 
-    // Copy depthbuffer colors into framebuffer
-    render<<<blockCount2d, blockSize2d>>>(width, height, dev_depthbuffer, dev_framebuffer);
-    // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
-    checkCUDAError("rasterize");
+}
+// Runs one frame: clear, vertex shade, assemble or tessellate, rasterize, fragment shade, then copy into pbo; the four floats are forwarded to camera().
+void rasterize(uchar4 *pbo, float amt_x, float amt_y, float up_a, float right_a)
+{
+	int sideLength2d = 8;
+	dim3 blockSize2d(sideLength2d, sideLength2d);
+	dim3 blockCount2d((width - 1) / blockSize2d.x + 1,
+		(height - 1) / blockSize2d.y + 1);
+	//key_test:
+	//std::cout << "ss " << amt_x << "and " << amt_y << std::endl;
+	//std::cout << "dd" << up_a << "and " << right_a << std::endl;
+
+	//step1.vertex shading
+	int blockSize1d = 256;
+	int blockCount1d = (vertCount + blockSize1d - 1) / blockSize1d;
+
+	int image_blockSize1d = 256;
+	int image_blockCount1d = (width*height + image_blockSize1d - 1) / image_blockSize1d;
+	glm::vec3 camera_pos = glm::vec3(0);
+	glm::vec3 light_pos = SetLight();
+	glm::mat4 getViewProj = camera(amt_x,amt_y, up_a,right_a, camera_pos);
+	//glm::mat4 getViewProj = glm::mat4(1);
+	//clean depth buffer: cleanDepth indexes pixels with blockIdx.x/.y, so it needs the 2D grid
+	cleanDepth << < blockCount2d, blockSize2d >> >(dev_depthbuffer, dev_fmInput, width, height);
+	checkCUDAError("clean");
+	vertexShader << <blockCount1d, blockSize1d >> >(dev_bufVertex, dev_vsOutput, vertCount, getViewProj);
+	checkCUDAError("vertexShader");
+	//step2.primitive assembly
+	int blockCount1d_tri;
+	bool tesselation = Tess;
+	if (!tesselation)
+	{
+		//vertexnumber: vertcount,triangle number:vertcount/3.0
+		blockCount1d_tri = blockCount1d / 3 + 1;
+		PrimitiveAssembly << < blockCount1d_tri, blockSize1d >> >(dev_vsOutput, dev_primitives, vertCount);
+		checkCUDAError("PrimitiveAssembly");
+		rasterization << < blockCount1d_tri, blockSize1d >> >(dev_primitives, dev_fmInput, vertCount, width, height, N);
+		checkCUDAError("rasterization");
+	}
+	else
+	{
+		blockCount1d_tri = vertCount / 3 / blockSize1d + 1;	//one thread per input triangle
+		Tesselation << <blockCount1d_tri, blockSize1d >> >(1, dev_vsOutput, dev_primitives, vertCount);
+		checkCUDAError("Tesselation");
+		blockCount1d_tri = vertCount / 3 * 4 / blockSize1d + 1;	//one thread per tessellated triangle (4 per input triangle)
+		rasterization << < blockCount1d_tri, blockSize1d >> >(dev_primitives, dev_fmInput, vertCount * 4, width, height, N);
+		checkCUDAError("rasterization");
+	}
+	//blin-phong+blending
+	fragmentShading << <image_blockCount1d, image_blockSize1d >> >(dev_fmInput, dev_depthbuffer, width, height, light_pos, camera_pos, 1);
+	checkCUDAError("shading");
+	//blending << <image_blockCount1d, image_blockSize1d >> >(dev_fmOutput, dev_depthbuffer, N, 1);
+	checkCUDAError("blending");
+	render << <blockCount2d, blockSize2d >> >(width, height, dev_depthbuffer, dev_framebuffer);
+	sendImageToPBO << <blockCount2d, blockSize2d >> >(pbo, width, height, dev_framebuffer);
+	checkCUDAError("sendToPBO");
 }
 
 /**
  * Called once at the end of the program to free CUDA memory.
  */
 void rasterizeFree() {
-    cudaFree(dev_bufIdx);
-    dev_bufIdx = NULL;
+	cudaFree(dev_bufIdx);
+	dev_bufIdx = NULL;
+
+	cudaFree(dev_bufVertex);
+	dev_bufVertex = NULL;
 
-    cudaFree(dev_bufVertex);
-    dev_bufVertex = NULL;
+	cudaFree(dev_primitives);
+	dev_primitives = NULL;
 
-    cudaFree(dev_primitives);
-    dev_primitives = NULL;
+	// Each pointer is freed first and only then cleared, so none is leaked or left dangling.
+	cudaFree(dev_vsOutput); dev_vsOutput = NULL;
+	cudaFree(dev_fmInput); dev_fmInput = NULL;
+	cudaFree(dev_fmOutput); dev_fmOutput = NULL;
 
-    cudaFree(dev_depthbuffer);
-    dev_depthbuffer = NULL;
+	cudaFree(dev_depthbuffer);
+	dev_depthbuffer = NULL;
 
-    cudaFree(dev_framebuffer);
-    dev_framebuffer = NULL;
+	cudaFree(dev_framebuffer);
+	dev_framebuffer = NULL;
 
-    checkCUDAError("rasterizeFree");
+	checkCUDAError("rasterizeFree");
 }
diff --git a/src/rasterize.h b/src/rasterize.h
index a06b339..8f98945 100644
--- a/src/rasterize.h
+++ b/src/rasterize.h
@@ -13,6 +13,6 @@
 void rasterizeInit(int width, int height);
 void rasterizeSetBuffers(
         int bufIdxSize, int *bufIdx,
-        int vertCount, float *bufPos, float *bufNor, float *bufCol);
-void rasterize(uchar4 *pbo);
+        int vertCount, float *bufPos, float *bufNor, float *bufCol,bool resellation);
+void rasterize(uchar4 *pbo, float x,float y,float u,float r);
 void rasterizeFree();
diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h
index 46c701e..ed339a7 100644
--- a/src/rasterizeTools.h
+++ b/src/rasterizeTools.h
@@ -51,7 +51,9 @@ __host__ __device__ static
 float calculateSignedArea(const glm::vec3 tri[3]) {
     return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y));
 }
-
+//1)if its points are collinear: 0 
+//2)if the points are in a counterclockwise direction:positive;clockwise: negative
+//3)the absolute value is the area of the triangle
 // CHECKITOUT
 /**
  * Helper function for calculating barycentric coordinates.
@@ -68,6 +70,7 @@ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c,
 // CHECKITOUT
 /**
  * Calculate barycentric coordinates.
+ * give a point r in triangle r=tri[0]*alpha+tri[1]*beta+tri[2]*gamma, alpha+ beta+ gamma=1
  */
 __host__ __device__ static
 glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point) {