From 14de7a84dcacacf2bafd42fb157b03610caa1318 Mon Sep 17 00:00:00 2001 From: sri-koundinyan Date: Sun, 23 Nov 2025 21:09:58 -0500 Subject: [PATCH 1/2] Fix errors in 07__cuda_core__devices_streams_and_memory.ipynb Add import for ProgramOptions, and modify invocation to use the selected GPU's architecture. --- .../07__cuda_core__devices_streams_and_memory.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/accelerated-python/notebooks/fundamentals/07__cuda_core__devices_streams_and_memory.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/07__cuda_core__devices_streams_and_memory.ipynb index e7f63ff7..a2962bae 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/07__cuda_core__devices_streams_and_memory.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/07__cuda_core__devices_streams_and_memory.ipynb @@ -427,7 +427,7 @@ "outputs": [], "source": [ "import cupy as cp\n", - "from cuda.core.experimental import launch, LaunchConfig\n", + "from cuda.core.experimental import launch, LaunchConfig, ProgramOptions\n", "\n", "def execute_vector_add():\n", " # Initialize device and create a stream\n", @@ -548,7 +548,7 @@ " print(f\"Multiplying {N}x{N} matrices\")\n", "\n", " # Compile the templated matrix multiplication kernel with specific C++ compiler flags\n", - " program_options = ProgramOptions(std=\"c++17\", arch=f\"sm_{arch}\")\n", + " program_options = ProgramOptions(std=\"c++17\")\n", " program = Program(matmul_source, code_type='c++', options=program_options)\n", " compiled_program = program.compile(target_type='cubin', name_expressions=(\"matrix_multiply\",))\n", " kernel = compiled_program.get_kernel(\"matrix_multiply\")\n", @@ -889,4 +889,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From 7ef35132c4f8456b2d7d03cb6277feb7b9619eae Mon Sep 17 00:00:00 2001 From: Sri Koundinyan Date: Thu, 11 Dec 2025 17:36:03 -0500 Subject: [PATCH 2/2] Refining eight notebooks for 
DLI readiness: restructured sections and added more context --- .../01__numpy_intro__ndarray_basics.ipynb | 683 ++-- .../03__numpy_to_cupy__ndarray_basics.ipynb | 890 +++-- .../05__memory_spaces__power_iteration.ipynb | 973 ++++-- .../06__asynchrony__power_iteration.ipynb | 829 ++--- .../kernels/40__kernel_authoring__copy.ipynb | 43 +- ...41__kernel_authoring__book_histogram.ipynb | 39 +- .../20__cudf__nyc_parking_violations.ipynb | 3099 ++++------------- ...3__cuda_cccl__customizing_algorithms.ipynb | 3010 ++++++++-------- 8 files changed, 4191 insertions(+), 5375 deletions(-) diff --git a/tutorials/accelerated-python/notebooks/fundamentals/01__numpy_intro__ndarray_basics.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/01__numpy_intro__ndarray_basics.ipynb index c528a70d..fe742556 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/01__numpy_intro__ndarray_basics.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/01__numpy_intro__ndarray_basics.ipynb @@ -1,323 +1,366 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", - "metadata": { - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" - }, - "source": [ - "## Exercise - NumPy Intro - `ndarray` Basics\n", - "\n", - "Let's practice working with NumPy `ndarray`s. You may find NumPy's [reference documentation](https://numpy.org/doc/stable/reference/arrays.html) useful." - ] + "cells": [ + { + "cell_type": "markdown", + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", + "metadata": { + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" + }, + "source": [ + "# NumPy $\\text{ndarray}$ Basics" + ] + }, + { + "cell_type": "markdown", + "id": "a5ba6a1c", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "\n", + "1. [The De Facto Standard for Array Data](#1.-The-De-Facto-Standard-for-Array-Data)\n", + "2. [Anatomy of an ndarray: Structure and Memory](#2.-Anatomy-of-an-$\\text{ndarray}$:-Structure-and-Memory)\n", + "3. 
[Array Creation and Logical Views (Views vs. Copies)](#3.-Array-Creation-and-Logical-Views-(Views-vs.-Copies))\n", + "4. [Aggregations and Axes](#4.-Aggregations-and-Axes)\n", + "5. [Broadcasting: The \"Stretch\" Rule](#5.-Broadcasting:-The-\"Stretch\"-Rule)\n", + "6. [Why Vectorize? The Speed Advantage](#6.-Why-Vectorize?-The-Speed-Advantage)" + ] + }, + { + "cell_type": "markdown", + "id": "b30427de", + "metadata": {}, + "source": [ + "## 1. The De Facto Standard for Array Data\n", + "\n", + "NumPy is the foundational library for High Performance Computing (HPC) and Machine Learning (ML) in Python. Libraries like PyTorch, Pandas, and Scikit-learn are built upon or mirror the NumPy API. Learning NumPy is essential for mastering the Array Programming paradigm.\n", + "\n", + "NumPy provides the $\\text{ndarray}$ (N-dimensional array), a powerful, high-performance, and uniform container that enables highly efficient memory management, indexing, slicing, and, most importantly, vectorized arithmetic." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", + "metadata": { + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "c59fce80", + "metadata": {}, + "source": [ + "## 2. Anatomy of an $\\text{ndarray}$: Structure and Memory\n", + "\n", + "Unlike a standard Python list, an $\\text{ndarray}$ is a fixed-size, structured block of contiguous memory. 
Its efficiency comes from these four key, immutable properties:\n", + "\n", + "- **Data**: A pointer to the memory location holding the elements.\n", + "- **dtype**: The data type (e.g., $\\text{int32}, \\text{float64}$) which is uniform across all elements.\n", + "- **Shape**: A tuple defining the size along each dimension (e.g., $(100, 50)$ for 100 rows and 50 columns).\n", + "- **Strides**: The number of bytes to step in memory to reach the next element along each dimension—this is how NumPy efficiently handles different shapes and views.\n", + "\n", + "Let's explore these properties by creating a large dataset.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- `np.arange(start, stop, step)`: Returns evenly spaced values in the half-open interval $[\\text{start}, \\text{stop})$.\n", + "- `arr.nbytes`: Total bytes consumed by the array's elements (in bytes).\n", + "- `arr.ndim`: The number of array dimensions (integer).\n", + "- `arr.size`: The total number of elements in the array (integer).\n", + "- `arr.shape`: The tuple of array dimensions.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "465e35bd", + "metadata": {}, + "outputs": [], + "source": [ + "# Use a large number to clearly demonstrate the memory density of ndarrays\n", + "N = 50_000_000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3", + "metadata": { + "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3" + }, + "outputs": [], + "source": [ + "# TODO: Create the input data array with the numbers 1 to 50_000_000 (inclusive).\n", + "# Hint: np.arange generates values within a half-open interval [start, stop)\n", + "arr = ..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50530f2c-29bf-4061-8f84-bc5be00a5622", + "metadata": { + "id": "50530f2c-29bf-4061-8f84-bc5be00a5622" + }, + "outputs": [], + "source": [ + "# TODO: Calculate how large the array is in GB with nbytes.\n", + "# Hint: GB is 1e9 bytes. The .nbytes attribute returns the total bytes consumed by the elements.\n", + "# Note: This demonstrates that arrays are dense memory blocks, unlike pointer-heavy Python lists.\n", + "arr..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656", + "metadata": { + "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656" + }, + "outputs": [], + "source": [ + "# TODO: How many dimensions does the array have? (ndim)\n", + "arr..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28", + "metadata": { + "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28" + }, + "outputs": [], + "source": [ + "# TODO: How many elements does the array have? (size)\n", + "arr..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63887722-c9d7-405e-a019-e75646115541", + "metadata": { + "id": "63887722-c9d7-405e-a019-e75646115541" + }, + "outputs": [], + "source": [ + "# TODO: What is the shape of the array?\n", + "arr..." + ] + }, + { + "cell_type": "markdown", + "id": "f5e58ee4", + "metadata": {}, + "source": [ + "## 3. Array Creation and Logical Views (Views vs. Copies)\n", + "\n", + "Arrays can logically represent data in many ways (e.g., 1D signal, 2D image, 4D video batch) independent of the underlying physical memory block.\n", + "\n", + "A critical performance feature is that operations like transposing or $\\text{reshape}$ often return a **View** instead of a **Copy**. 
A View only changes the metadata ($\\text{shape}$ and $\\text{strides}$) without duplicating the physical data, making these operations nearly instantaneous.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- `np.linspace(start, stop, num)`: Returns $\\text{num}$ evenly spaced samples, calculated over the interval $[\\text{start}, \\text{stop}]$.\n", + "- `np.random.default_rng().random(size)`: Returns random floats in $[0.0, 1.0)$. $\\text{size}$ can be a tuple.\n", + "- `arr.sort()`: Sorts an array in-place (modifies the original data). Use $\\text{np.sort}(\\text{arr})$ to return a sorted copy.\n", + "- `arr.reshape(new_shape)`: Returns a View with a new shape. One dimension can be -1, instructing NumPy to calculate the size automatically.\n", + "- `np.resize(arr, new_shape)`: Returns a new array with the specified shape. If the new shape is larger, it fills the new elements by repeating the original array.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9", + "metadata": { + "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9" + }, + "outputs": [], + "source": [ + "# TODO: Create a new array with 5_000_000 elements containing equally spaced values between 0 to 1000 (inclusive).\n", + "arr = ...\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050", + "metadata": { + "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050" + }, + "outputs": [], + "source": [ + "# TODO: Create a random array that is 10_000 rows by 5_000 columns.\n", + "arr = ...\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd", + "metadata": { + "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd" + }, + "outputs": [], + "source": [ + "# TODO: Sort that array (in-place).\n", + "# Note: arr.sort() modifies the array directly, which is typically faster than creating a copy.\n", + "arr..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdde560b-5ba6-484c-a601-00b7ef71273d", + "metadata": { + "id": "cdde560b-5ba6-484c-a601-00b7ef71273d" + }, + "outputs": [], + "source": [ + "# TODO: Reshape the array to have the last dimension of length 5. \n", + "# Ensure that the operation only changes the logical view without duplicating the physical data pointer.\n", + "# Hint: You can use -1 for one dimension to let NumPy automatically calculate the size based on the total elements.\n", + "arr_new = ...\n", + "arr_new" + ] + }, + { + "cell_type": "markdown", + "id": "54982876", + "metadata": {}, + "source": [ + "## 4. Aggregations and Axes\n", + "\n", + "When performing aggregations (like $\\text{sum}$, $\\text{mean}$, $\\text{max}$), you must specify the **Axis** you want to collapse (or reduce) the array along.\n", + "\n", + "- **Axis 0**: The first dimension (often rows in 2D). Aggregating across Axis 0 produces a result for each column.\n", + "- **Axis 1**: The second dimension (often columns in 2D). Aggregating across Axis 1 produces a result for each row.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- $\\text{np.sum}(\\text{a}, \\text{axis}=\\text{None})$: Sum of array elements over a given axis.\n", + " - $\\text{axis}=0$: Collapse the rows (sum vertical columns).\n", + " - $\\text{axis}=1$: Collapse the columns (sum horizontal rows).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583", + "metadata": { + "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583" + }, + "outputs": [], + "source": [ + "# TODO: Find the sum of each row in the reshaped array (arr_new) above.\n", + "# Hint: To sum the row's content, we must reduce across the columns.\n", + "arr_sum = ...\n", + "arr_sum" + ] + }, + { + "cell_type": "markdown", + "id": "ed072cee", + "metadata": {}, + "source": [ + "## 5. 
Broadcasting: The \"Stretch\" Rule\n", + "\n", + "Broadcasting is NumPy's mechanism for performing arithmetic between arrays of different shapes. If dimensions don't match, NumPy attempts to \"stretch\" the smaller array to match the larger one.\n", + "\n", + "**The Compatibility Rule:** Two dimensions are compatible when:\n", + "1. They are equal, or\n", + "2. One of them is 1.\n", + "\n", + "If a dimension is 1, NumPy logically copies that single value across the dimension to match the other array's shape **without allocating any new memory**.\n", + "\n", + "---\n", + "\n", + "**Quick Docs**\n", + "- **Arithmetic Operators** $(/, *, +, -)$: These operate element-wise. Broadcasting occurs if shapes are different but compatible.\n", + "- $\\text{np.allclose}(\\text{a}, \\text{b})$: Returns $\\text{True}$ if two floating-point arrays are element-wise equal within a tolerance. Essential for comparisons instead of using $==$.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b15342af-2916-481a-9724-9874acf4ed24", + "metadata": { + "id": "b15342af-2916-481a-9724-9874acf4ed24" + }, + "outputs": [], + "source": [ + "# TODO: Normalize each row of the 2D array (arr_new) by dividing by the sum you just computed (arr_sum).\n", + "# Hint: 'arr_new' is (M, N) and 'arr_sum' is (M,). To successfully divide, you may need to reshape 'arr_sum' to (M, 1)\n", + "# so that broadcasting can stretch it across the N columns.\n", + "arr_normalized = ...\n", + "arr_normalized" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf", + "metadata": { + "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf" + }, + "outputs": [], + "source": [ + "# EXTRA CREDIT: Prove that your normalized array is actually normalized.\n", + "# Hint: If normalized correctly, the sum of every row should now be 1.0.\n", + "# Check if the new row sums are close to 1.0 using np.allclose." 
+ ] + }, + { + "cell_type": "markdown", + "id": "31657dd2", + "metadata": {}, + "source": [ + "## 6. Why Vectorize? The Speed Advantage\n", + "\n", + "The entire Array Programming paradigm hinges on **Vectorization**.\n", + "\n", + "Why use complex shapes and broadcasting instead of simple Python $\\text{for}$ loops?\n", + "\n", + "NumPy's array functions are implemented in highly optimized native code (C/C++, Fortran). An operation like $\\text{A} + \\text{A}^2$, where $\\text{A}$ is a massive $\\text{ndarray}$, is often $\\mathbf{100\\times}$ faster than performing the equivalent element-wise operation using explicit Python loops.\n", + "\n", + "**Always choose a vectorized NumPy function or operator over a manual Python loop.**" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (RAPIDS 25.10)", + "language": "python", + "name": "cudf-cu12-25.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", - "metadata": { - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" - }, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "id": "7535d7ab-1dd8-407a-bd30-7422e8391fc7", - "metadata": { - "id": "7535d7ab-1dd8-407a-bd30-7422e8391fc7" - }, - "source": [ - "**TODO: Create the input data array with the numbers `1` to `500_000_000`.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3", - "metadata": { - "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3" - }, - "outputs": [], - "source": [ - "arr = ...\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": 
"9b2a320b-b202-4d62-88de-981130756987", - "metadata": { - "id": "9b2a320b-b202-4d62-88de-981130756987" - }, - "source": [ - "**TODO: Calculate how large the array is in GB with `nbytes`.** _Hint: GB is `1e9`_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50530f2c-29bf-4061-8f84-bc5be00a5622", - "metadata": { - "id": "50530f2c-29bf-4061-8f84-bc5be00a5622" - }, - "outputs": [], - "source": [ - "arr..." - ] - }, - { - "cell_type": "markdown", - "id": "8dfdc34c-f616-491f-a0c6-8add195412f8", - "metadata": { - "id": "8dfdc34c-f616-491f-a0c6-8add195412f8" - }, - "source": [ - "**TODO: How many dimensions does the array have?**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656", - "metadata": { - "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656" - }, - "outputs": [], - "source": [ - "arr..." - ] - }, - { - "cell_type": "markdown", - "id": "ee24ecbe-2b39-43ea-9319-cdfa08f52fb1", - "metadata": { - "id": "ee24ecbe-2b39-43ea-9319-cdfa08f52fb1" - }, - "source": [ - "**TODO: How many elements does the array have?**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28", - "metadata": { - "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28" - }, - "outputs": [], - "source": [ - "arr..." - ] - }, - { - "cell_type": "markdown", - "id": "c6cb7de1-20ef-4edf-a4b1-3abe40e83ab8", - "metadata": { - "id": "c6cb7de1-20ef-4edf-a4b1-3abe40e83ab8" - }, - "source": [ - "**TODO: What is the shape of the array?**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63887722-c9d7-405e-a019-e75646115541", - "metadata": { - "id": "63887722-c9d7-405e-a019-e75646115541" - }, - "outputs": [], - "source": [ - "arr..." 
- ] - }, - { - "cell_type": "markdown", - "id": "35f4e58d-9cbe-4e71-8b65-42f9460531e3", - "metadata": { - "id": "35f4e58d-9cbe-4e71-8b65-42f9460531e3" - }, - "source": [ - "**TODO: Create a new array with `5_000_000` elements containing equally spaced values between `0` to `1000` (inclusive).**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9", - "metadata": { - "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9" - }, - "outputs": [], - "source": [ - "arr = ...\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": "1e0b80a2-4bb0-40a0-82e6-366a47f26b43", - "metadata": { - "id": "1e0b80a2-4bb0-40a0-82e6-366a47f26b43" - }, - "source": [ - "**TODO: Create a random array that is `10_000` by `5_000`.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050", - "metadata": { - "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050" - }, - "outputs": [], - "source": [ - "arr = ...\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": "f8ab4e20-cc42-4335-8a8a-bb695631185a", - "metadata": { - "id": "f8ab4e20-cc42-4335-8a8a-bb695631185a" - }, - "source": [ - "**TODO: Sort that array.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd", - "metadata": { - "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd" - }, - "outputs": [], - "source": [ - "arr = ...\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": "dc675e4c-66e6-4a9e-8f68-96802c7f96ad", - "metadata": { - "id": "dc675e4c-66e6-4a9e-8f68-96802c7f96ad" - }, - "source": [ - "**TODO: Reshape the array to have the last dimension of length `5`.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdde560b-5ba6-484c-a601-00b7ef71273d", - "metadata": { - "id": "cdde560b-5ba6-484c-a601-00b7ef71273d" - }, - "outputs": [], - "source": [ - "arr = ...\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": 
"70dba856-6eef-427d-871a-c6d041ac8b69", - "metadata": { - "id": "70dba856-6eef-427d-871a-c6d041ac8b69" - }, - "source": [ - "**TODO: Find the sum of each row.** _Hint: Rows are axis 0, but the sum is being applied across columns, which are axis 1._" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583", - "metadata": { - "id": "44dd3ac2-c9b7-4327-ba63-860b074c0583" - }, - "outputs": [], - "source": [ - "arr_sum = ...\n", - "arr_sum" - ] - }, - { - "cell_type": "markdown", - "id": "4445c09b-e32e-46e9-aa93-3b36fbbcdcaa", - "metadata": { - "id": "4445c09b-e32e-46e9-aa93-3b36fbbcdcaa" - }, - "source": [ - "**TODO: Normalize each row of the original random array by dividing by the sum you just computed using broadcasting.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b15342af-2916-481a-9724-9874acf4ed24", - "metadata": { - "id": "b15342af-2916-481a-9724-9874acf4ed24" - }, - "outputs": [], - "source": [ - "arr_normalized = ...\n", - "arr_normalized" - ] - }, - { - "cell_type": "markdown", - "id": "6ff7b234-2c1e-4576-a8df-62958d3f6a4a", - "metadata": { - "id": "6ff7b234-2c1e-4576-a8df-62958d3f6a4a" - }, - "source": [ - "**EXTRA CREDIT: Prove that your normalized array is actually normalized.** _Hint: Does each row sum to 1 now?_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf", - "metadata": { - "id": "b04622b8-c6de-4756-8a56-e3d2835a5eaf" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" - }, - "colab": { - "provenance": [], - "gpuType": "T4" - }, - 
"accelerator": "GPU" - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/03__numpy_to_cupy__ndarray_basics.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/03__numpy_to_cupy__ndarray_basics.ipynb index e36fd566..f44d03ae 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/03__numpy_to_cupy__ndarray_basics.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/03__numpy_to_cupy__ndarray_basics.ipynb @@ -1,385 +1,509 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", - "metadata": { - "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" - }, - "source": [ - "## Exercise - NumPy to CuPy - `ndarray` Basics\n", - "\n", - "Let's revisit our first NumPy exercise and try porting it to CuPy.\n", - "\n", - "**TODO: Add an import of CuPy, update `xp`, and rerun the cells one by one to see if there's any issues.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", - "metadata": { - "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "xp = np" - ] - }, - { - "cell_type": "markdown", - "id": "7535d7ab-1dd8-407a-bd30-7422e8391fc7", - "metadata": { - "id": "7535d7ab-1dd8-407a-bd30-7422e8391fc7" - }, - "source": [ - "Create the input data array with the numbers `1` to `500_000_000`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3", - "metadata": { - "id": "5f1a613f-bc87-4950-b195-a66bb5bc05d3" - }, - "outputs": [], - "source": [ - "arr = xp.arange(1, 500_000_001)\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": "9b2a320b-b202-4d62-88de-981130756987", - "metadata": { - "id": "9b2a320b-b202-4d62-88de-981130756987" - }, - "source": [ - "Calculate how large the array is in GB with `nbytes`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50530f2c-29bf-4061-8f84-bc5be00a5622", - "metadata": { - "id": "50530f2c-29bf-4061-8f84-bc5be00a5622" - }, - "outputs": [], - "source": [ - "arr.nbytes / 1e9" - ] - }, - { - "cell_type": "markdown", - "id": "8dfdc34c-f616-491f-a0c6-8add195412f8", - "metadata": { - "id": "8dfdc34c-f616-491f-a0c6-8add195412f8" - }, - "source": [ - "How many dimensions does the array have?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656", - "metadata": { - "id": "ffc15dad-e2fd-4b96-8b39-3496519d0656" - }, - "outputs": [], - "source": [ - "arr.ndim # `len(arr.shape)` also works, but is longer to type." - ] - }, - { - "cell_type": "markdown", - "id": "ee24ecbe-2b39-43ea-9319-cdfa08f52fb1", - "metadata": { - "id": "ee24ecbe-2b39-43ea-9319-cdfa08f52fb1" - }, - "source": [ - "How many elements does the array have?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28", - "metadata": { - "id": "b15cdf25-eb35-4926-b306-90ffd62b3d28" - }, - "outputs": [], - "source": [ - "arr.size # For 1D array, `arr.shape[0]` also works, but `arr.size` multiplies the size of all dimensions." - ] - }, - { - "cell_type": "markdown", - "id": "c6cb7de1-20ef-4edf-a4b1-3abe40e83ab8", - "metadata": { - "id": "c6cb7de1-20ef-4edf-a4b1-3abe40e83ab8" - }, - "source": [ - "What is the shape of the array?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "63887722-c9d7-405e-a019-e75646115541", - "metadata": { - "id": "63887722-c9d7-405e-a019-e75646115541" - }, - "outputs": [], - "source": [ - "arr.shape" - ] - }, - { - "cell_type": "markdown", - "id": "35f4e58d-9cbe-4e71-8b65-42f9460531e3", - "metadata": { - "id": "35f4e58d-9cbe-4e71-8b65-42f9460531e3" - }, - "source": [ - "Create a new array with `5_000_000` elements containing equally spaced values between `0` to `1000` (inclusive)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9", - "metadata": { - "id": "1527b4f6-5d75-47d4-97e0-d0e78bbc59f9" - }, - "outputs": [], - "source": [ - "arr = xp.linspace(0, 1000, 5_000_000, endpoint=True)\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": "1e0b80a2-4bb0-40a0-82e6-366a47f26b43", - "metadata": { - "id": "1e0b80a2-4bb0-40a0-82e6-366a47f26b43" - }, - "source": [ - "Create a random array that is `10_000` by `5_000`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050", - "metadata": { - "id": "2f51aa2e-b994-4a91-aed6-4a4632eb7050" - }, - "outputs": [], - "source": [ - "arr = xp.random.rand(10_000, 5_000)\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": "f8ab4e20-cc42-4335-8a8a-bb695631185a", - "metadata": { - "id": "f8ab4e20-cc42-4335-8a8a-bb695631185a" - }, - "source": [ - "Sort that array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd", - "metadata": { - "id": "4ec06270-6e08-4cce-9385-9dc8b53e95fd" - }, - "outputs": [], - "source": [ - "arr = xp.sort(arr)\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": "dc675e4c-66e6-4a9e-8f68-96802c7f96ad", - "metadata": { - "id": "dc675e4c-66e6-4a9e-8f68-96802c7f96ad" - }, - "source": [ - "Reshape the CuPy array to have the last dimension of length `5`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdde560b-5ba6-484c-a601-00b7ef71273d", - "metadata": { - "id": "cdde560b-5ba6-484c-a601-00b7ef71273d" - }, - "outputs": [], - "source": [ - "arr = arr.reshape((-1, 5))\n", - "# -1 will infer the size of that dimension from the rest. 
Would also accept: arr.reshape((10_000_000, 5))\n", - "arr" - ] - }, - { - "cell_type": "markdown", - "id": "32cf6010-c6d0-45a3-ae7c-1a8a88d0efc0", - "metadata": { - "id": "32cf6010-c6d0-45a3-ae7c-1a8a88d0efc0" - }, - "source": [ - "Find the sum of each row. Rows are axis 0, but the sum is being applied across columns, which are axis 1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e8c9ce0-c2ee-463c-a609-2da7fa0f61ec", - "metadata": { - "id": "8e8c9ce0-c2ee-463c-a609-2da7fa0f61ec" - }, - "outputs": [], - "source": [ - "arr_sum = xp.sum(arr, axis=1) # You could also write `arr.sum(axis=1)`.\n", - "arr_sum" - ] - }, - { - "cell_type": "markdown", - "id": "93a890db-ca42-456c-9813-9c22a6f15fd7", - "metadata": { - "id": "93a890db-ca42-456c-9813-9c22a6f15fd7" - }, - "source": [ - "Normalize each row of the original random array by dividing by the sum you just computed using broadcasting." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31da458f-abdd-4694-9b52-6f7495e9b6cc", - "metadata": { - "id": "31da458f-abdd-4694-9b52-6f7495e9b6cc" - }, - "outputs": [], - "source": [ - "arr_normalized = arr / arr_sum[:, xp.newaxis]\n", - "arr_normalized" - ] - }, - { - "cell_type": "markdown", - "id": "4525c0ac-dc5c-4255-a104-77fbca22bef4", - "metadata": { - "id": "4525c0ac-dc5c-4255-a104-77fbca22bef4" - }, - "source": [ - "Prove that your normalized array is actually normalized by checking that every row sums to 1.\n", - "\n", - "**TODO: Try changing `xp.testing.assert_allclose` to `np.testing.assert_allclose`. 
What happens?**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d84a0db9-9983-4c25-ae10-7a5b937afb59", - "metadata": { - "id": "d84a0db9-9983-4c25-ae10-7a5b937afb59" - }, - "outputs": [], - "source": [ - "xp.testing.assert_allclose(xp.sum(arr_normalized, axis=1), 1.0)" - ] - }, - { - "cell_type": "markdown", - "source": [ - "**TODO: Create two arrays (one NumPy, one CuPy) that discretize the sine function from 0 to 2π with `50_000_000` points. Benchmark how long it takes NumPy and CuPy to sort the array.**\n", - "\n", - "_Hint: You can use `linspace` to help generate the data - see the example in earlier cells._\n", - "\n", - "_Hint: To accurately time both NumPy and CuPy calls, use [`cupyx.profiler.benchmark`](https://docs.cupy.dev/en/stable/reference/generated/cupyx.profiler.benchmark.html). Don't go overboard with the `n_repeat` parameter._" - ], - "metadata": { - "id": "AxU_hG5M-LKS" - }, - "id": "AxU_hG5M-LKS" - }, - { - "cell_type": "code", - "source": [ - "import cupyx as cpx\n", - "\n", - "arr_np = ...\n", - "arr_cp = ...\n", - "\n", - "..." - ], - "metadata": { - "id": "EKwfS_iM9Yps" - }, - "id": "EKwfS_iM9Yps", - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "**EXTRA CREDIT: Benchmark with different array sizes and find the size at which CuPy and NumPy take the same amount of time. Try to extract the timing data from `cupyx.profiler.benchmark`'s return value and customize how the output is displayed. You could even make a graph.**" - ], - "metadata": { - "id": "qnAvEk5QFAA8" - }, - "id": "qnAvEk5QFAA8" - }, - { - "cell_type": "code", - "source": [ - "..." 
- ], - "metadata": { - "id": "42YwwyrJFTyV" - }, - "id": "42YwwyrJFTyV", - "execution_count": null, - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" - }, - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "accelerator": "GPU" - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "f966f67f", + "metadata": {}, + "source": [ + "# Accelerated Computing with CuPy\n", + "\n", + "## Table of Contents\n", + "1. [Creating Arrays: CPU vs. GPU](#1.-Creating-Arrays:-CPU-vs.-GPU)\n", + "2. [Basic Operations](#2.-Basic-Operations)\n", + " - [Sequential Operations & Memory](#Sequential-Operations-&-Memory)\n", + "3. [Complex Operations (Linear Algebra)](#3.-Complex-Operations-(Linear-Algebra))\n", + " - [Agnostic Code (NumPy Dispatch)](#Agnostic-Code-(NumPy-Dispatch))\n", + "4. [Device Management](#4.-Device-Management)\n", + "5. [Exercise - NumPy to CuPy](#Exercise---NumPy-to-CuPy)\n", + " - [Part 1](#Part-1)\n", + " - [Part 2](#Part-2)\n", + "\n", + "---\n", + "\n", + "Let's shift gears to high-level array functionality using **[CuPy](https://cupy.dev/)**.\n", + "\n", + "### What is CuPy?\n", + "CuPy is a library that implements the familiar **NumPy API** but runs on the GPU (using CUDA C++ in the backend). \n", + "\n", + "**Why use it?**\n", + "* **Zero Friction:** If you know NumPy, you already know CuPy.\n", + "* **Speed:** It provides out-of-the-box GPU acceleration for array operations.\n", + "* **Ease of use:** You can often port CPU code to GPU simply by changing `import numpy as np` to `import cupy as cp`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d369bcdc", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "\n", + "# Ensure the GPU is clean and ready\n", + "cp.cuda.Stream.null.synchronize()" + ] + }, + { + "cell_type": "markdown", + "id": "8c38845d", + "metadata": {}, + "source": [ + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "15fc304c", + "metadata": {}, + "source": [ + "## 1. Creating Arrays: CPU vs. GPU\n", + "\n", + "Let's compare the performance of creating a large 3D array (approx. 2GB in size) on the CPU versus the GPU.\n", + "\n", + "We will use `np.ones` for the CPU and `cp.ones` for the GPU.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0f8b002", + "metadata": {}, + "outputs": [], + "source": [ + "%%timeit -r 1 -n 10\n", + "# CPU creation\n", + "global x_cpu\n", + "x_cpu = np.ones((1000, 500, 500))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19309ca7", + "metadata": {}, + "outputs": [], + "source": [ + "%%timeit -n 10\n", + "# GPU creation\n", + "global x_gpu\n", + "x_gpu = cp.ones((1000, 500, 500))\n", + "\n", + "# Force the CPU to wait for the GPU to finish before stopping the timer\n", + "cp.cuda.Stream.null.synchronize()" + ] + }, + { + "cell_type": "markdown", + "id": "ae637eaf", + "metadata": {}, + "source": [ + "We can see here that creating this array on the GPU is much faster than doing so on the CPU! You also likely noticed the line `cp.cuda.Stream.null.synchronize()` in the code above. This is vital for accurate timing.\n", + "\n", + "**How CuPy works:**\n", + "1. When you call a CuPy function, the CPU places a task in the GPU's \"to-do list\" (stream).\n", + "2. The CPU immediately moves to the next line of code **without waiting** for the GPU to finish.\n", + "3. 
This is called **Asynchronous Execution**.\n", + "\n", + "If we didn't call `synchronize()`, the timer would stop as soon as the CPU issued the command. This would report a misleadingly fast time because it only measures how long it took to launch the task, not how long the GPU actually took to execute it. `synchronize()` forces the CPU to wait until the GPU has finished its work." + ] + }, + { + "cell_type": "markdown", + "id": "6d179e9b", + "metadata": {}, + "source": [ + "## 2. Basic Operations\n", + "\n", + "The syntax for mathematical operations is identical. Let's multiply every value in our arrays by `5`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de5bdefb", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# CPU Operation\n", + "x_cpu *= 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a7f32b8", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# GPU Operation\n", + "x_gpu *= 5\n", + "\n", + "cp.cuda.Stream.null.synchronize()" + ] + }, + { + "cell_type": "markdown", + "id": "bc24579f", + "metadata": {}, + "source": [ + "The GPU completes this operation notably faster, with the code staying the same." + ] + }, + { + "cell_type": "markdown", + "id": "83c69334", + "metadata": {}, + "source": [ + "### Sequential Operations & Memory\n", + "\n", + "Now let's do a couple of operations sequentially, something which would suffer from memory transfer times in Numba examples without explicit memory management." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0294dbc", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# CPU: Sequential math\n", + "x_cpu *= 5\n", + "x_cpu *= x_cpu\n", + "x_cpu += x_cpu" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acafdbe7", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# GPU: Sequential math\n", + "x_gpu *= 5\n", + "x_gpu *= x_gpu\n", + "x_gpu += x_gpu\n", + "\n", + "cp.cuda.Stream.null.synchronize()" + ] + }, + { + "cell_type": "markdown", + "id": "0f250bbb", + "metadata": {}, + "source": [ + "The GPU ran that much faster even without us explicitly managing memory. This is because CuPy is handling all of this for us transparently." + ] + }, + { + "cell_type": "markdown", + "id": "84221268", + "metadata": {}, + "source": [ + "## 3. Complex Operations (Linear Algebra)\n", + "\n", + "GPUs excel at Linear Algebra. Let's look at **Singular Value Decomposition (SVD)**, a computationally heavy $O(N^3)$ operation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "978af795", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# CPU SVD\n", + "x_cpu = np.random.random((1000, 1000))\n", + "u, s, v = np.linalg.svd(x_cpu)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0bc855b", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# GPU SVD\n", + "x_gpu = cp.random.random((1000, 1000))\n", + "u, s, v = cp.linalg.svd(x_gpu)" + ] + }, + { + "cell_type": "markdown", + "id": "e298f0ea", + "metadata": {}, + "source": [ + "The GPU outperforms the CPU again with exactly the same API!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "4a0870d0", + "metadata": {}, + "source": [ + "### Agnostic Code (NumPy Dispatch)\n", + "\n", + "A key feature of CuPy is that many **NumPy functions work on CuPy arrays without changing your code**.\n", + "\n", + "When you pass a CuPy GPU array (`x_gpu`) into a NumPy function that supports the `__array_function__` protocol (e.g., `np.linalg.svd`), NumPy detects the CuPy input and **delegates the operation to CuPy’s own implementation**, which runs on the GPU.\n", + "\n", + "This allows you to write code using standard `np.*` syntax and have it run on either CPU or GPU seamlessly - **as long as CuPy implements an override for that function.**\n", + "\n", + "CuPy also protects you from hidden performance penalties: **it forbids implicit GPU → CPU copies**, raising a `TypeError` when NumPy tries to convert a CuPy array into a NumPy array behind the scenes. This ensures all device-to-host transfers are **explicit and intentional**, never silent." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba4f2863", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "# We create the data on the GPU\n", + "x_gpu = cp.random.random((1000, 1000))\n", + "\n", + "# BUT we call the standard NumPy function\n", + "u, s, v = np.linalg.svd(x_gpu) \n", + "\n", + "cp.cuda.Stream.null.synchronize()" + ] + }, + { + "cell_type": "markdown", + "id": "6e37faae", + "metadata": {}, + "source": [ + "## 4. Device Management\n", + "\n", + "If you have multiple GPUs, CuPy uses the concept of a \"Current Device\" context. 
\n", + "\n", + "You can use a `with` statement to ensure specific arrays are created on specific cards (e.g., GPU 0 vs GPU 1).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26aa4f57", + "metadata": {}, + "outputs": [], + "source": [ + "with cp.cuda.Device(0):\n", + " x_on_gpu0 = cp.random.random((100000, 1000))\n", + "\n", + "print(f\"Array is on device: {x_on_gpu0.device}\")" + ] + }, + { + "cell_type": "markdown", + "id": "32f7226a", + "metadata": {}, + "source": [ + "**Note:** CuPy functions generally expect all input arrays to be on the **same** device. Passing an array stored on a non-current device may work depending on the hardware configuration but is generally discouraged as it may not be performant.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2e0a4a03", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013", + "metadata": { + "id": "d2e341ff-0c1e-40e8-8c33-9e3039de8013" + }, + "source": [ + "## Exercise - NumPy to CuPy\n", + "\n", + "### Part 1\n", + "Let's put the \"Drop-in Replacement\" philosophy to the test with the same data pipeline as the previous notebook. Specifically, the single block of code below performs the following steps:\n", + "1) Generate a massive dataset (50 million elements).\n", + "2) Process it using a heavy operation (Sorting).\n", + "3) Manipulate the shape and normalize the data (Broadcasting).\n", + "4) Verify the integrity of the result.\n", + "\n", + "**TODO:**\n", + "1. Run the cell below with xp = np (CPU Mode). Note the \"Sort Time\".\n", + "2. Change the setup line to xp = cp (GPU Mode). Run it again.\n", + "3. Observe how the exact same logic runs significantly faster on the GPU with CuPy while retaining the implementation properties of NumPy."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7", + "metadata": { + "id": "cc4596d8-d9ff-4c66-8822-246c0fc830c7" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "import time\n", + "\n", + "# --- 1. SETUP: CHOOSE YOUR DEVICE ---\n", + "xp = np # Toggle this to 'cp' for GPU acceleration\n", + "\n", + "print(f\"Running on: {xp.__name__.upper()}\")\n", + "\n", + "# --- 2. DATA GENERATION ---\n", + "N = 50_000_000\n", + "print(f\"Generating {N:,} random elements ({N*8/1e9:.2f} GB)...\")\n", + "arr = xp.random.rand(N)\n", + "\n", + "# --- 3. HEAVY COMPUTATION (TIMED) ---\n", + "print(\"Sorting data...\")\n", + "t0 = time.perf_counter()\n", + "\n", + "xp.sort(arr)\n", + "\n", + "# Ensure GPU finishes before stopping timer\n", + "if xp == cp:\n", + " cp.cuda.Stream.null.synchronize()\n", + "\n", + "t1 = time.perf_counter()\n", + "print(f\" -> Sort Time: {t1 - t0:.4f} seconds\")\n", + "\n", + "# --- 4. MANIPULATION & BROADCASTING ---\n", + "# Purpose: Demonstrate that CuPy supports complex reshaping and broadcasting rules exactly like NumPy.\n", + "# This shows you don't need to rewrite your data processing logic.\n", + "\n", + "# Reshape to a matrix with 5 columns\n", + "arr_new = arr.reshape((-1, 5))\n", + "\n", + "# Normalize: Divide every row by its sum using broadcasting\n", + "row_sums = arr_new.sum(axis=1)\n", + "normalized_matrix = arr_new / row_sums[:, xp.newaxis]\n", + "\n", + "# --- 5. VERIFICATION ---\n", + "# Purpose: Verify mathematical correctness/integrity of the result.\n", + "check_sums = xp.sum(normalized_matrix, axis=1)\n", + "xp.testing.assert_allclose(check_sums, 1.0)\n", + "\n", + "print(\" -> Verification: PASSED (All rows sum to 1.0)\")" + ] + }, + { + "cell_type": "markdown", + "id": "077b7589", + "metadata": {}, + "source": [ + "**TODO: When working with CuPy arrays, try changing `xp.testing.assert_allclose` to `np.testing.assert_allclose`. 
What happens and why?**" + ] + }, + { + "cell_type": "markdown", + "id": "AxU_hG5M-LKS", + "metadata": { + "id": "AxU_hG5M-LKS" + }, + "source": [ + "### Part 2\n", + "We will now create a massive dataset (50 million points) representing a sine wave and see how fast the GPU can sort it compared to the CPU. \n", + "\n", + "**TODO:** \n", + "1) **Generate Data:** Create a NumPy array (`y_cpu`) and a CuPy array (`y_gpu`) representing $\\sin(x)$ from $0$ to $2\\pi$ with `50,000,000` points.\n", + "2) **Benchmark CPU and GPU:** Use `cupyx.profiler.benchmark` to measure both `np.sort` and `cp.sort`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "EKwfS_iM9Yps", + "metadata": { + "id": "EKwfS_iM9Yps" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "import cupyx.profiler\n", + "\n", + "# --- Step 1: Generate Data ---\n", + "N = 50_000_000\n", + "print(f\"Generating {N} points...\")\n", + "\n", + "# TODO: Create x_cpu using np.linspace from 0 to 2*pi\n", + "# TODO: Create y_cpu by taking np.sin(x_cpu)\n", + "\n", + "# TODO: Create x_gpu using cp.linspace from 0 to 2*pi\n", + "# TODO: Create y_gpu by taking cp.sin(x_gpu)\n", + "\n", + "\n", + "# --- Step 2: Benchmark NumPy (CPU) ---\n", + "print(\"Benchmarking NumPy Sort (this may take a few seconds)...\")\n", + "# TODO: Use cupyx.profiler.benchmark(function, (args,), n_repeat=5)\n", + "# Hint: Pass the function `np.sort` and the argument `(y_cpu,)`\n", + "# Note: The comma in (y_cpu,) is required to make it a tuple!\n", + "\n", + "\n", + "# --- Step 3: Benchmark CuPy (GPU) ---\n", + "print(\"Benchmarking CuPy Sort...\")\n", + "# TODO: Use cupyx.profiler.benchmark(function, (args,), n_repeat=5)\n", + "# Hint: Pass the function `cp.sort` and the argument `(y_gpu,)`\n", + "# Note: The comma in (y_gpu,) is required to make it a tuple!" 
+ ] + }, + { + "cell_type": "markdown", + "id": "qnAvEk5QFAA8", + "metadata": { + "id": "qnAvEk5QFAA8" + }, + "source": [ + "**EXTRA CREDIT: Benchmark with different array sizes and find the size at which CuPy and NumPy take the same amount of time. Try to extract the timing data from `cupyx.profiler.benchmark`'s return value and customize how the output is displayed. You could even make a graph.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42YwwyrJFTyV", + "metadata": { + "id": "42YwwyrJFTyV" + }, + "outputs": [], + "source": [ + "..." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (RAPIDS 25.10)", + "language": "python", + "name": "cudf-cu12-25.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tutorials/accelerated-python/notebooks/fundamentals/05__memory_spaces__power_iteration.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/05__memory_spaces__power_iteration.ipynb index 39bea6fa..f141e491 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/05__memory_spaces__power_iteration.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/05__memory_spaces__power_iteration.ipynb @@ -1,344 +1,635 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Memory Spaces & Power Iteration\n", + "\n", + "## Table of Contents\n", + "1. [Introduction to Memory Spaces](#1-introduction-to-memory-spaces)\n", + "2. 
[The CPU Baseline (NumPy)](#2-the-cpu-baseline-numpy)\n", + "3. [The GPU Port (CuPy)](#3-the-gpu-port-cupy)\n", + "4. [Optimizing Data Generation](#4-optimizing-data-generation)\n", + "5. [Verification and Benchmarking](#5-verification-and-benchmarking)\n", + "6. [Extra Credit](#extra-credit)\n", + "\n", + "---\n", + "\n", + "## 1. Introduction to Memory Spaces\n", + "\n", + "Before we implement algorithms on the GPU, we must understand the hardware architecture. A heterogeneous system (like the one you are using) consists of two distinct memory spaces:\n", + "\n", + "1. **Host Memory (CPU):** System RAM. Accessible by the CPU.\n", + "2. **Device Memory (GPU):** High-bandwidth memory (HBM) attached to the GPU. Accessible by the GPU.\n", + "\n", + "The CPU cannot directly calculate data stored on the GPU, and the GPU cannot directly calculate data stored in System RAM. To perform work on the GPU, you must explicitly manage data movement.\n", + "\n", + "* **Host $\\to$ Device:** Move data to the GPU to compute.\n", + " * Syntax: `x_device = cp.asarray(x_host)`\n", + "* **Device $\\to$ Host:** Move results back to the CPU to save to disk, plot with Matplotlib, or print.\n", + " * Syntax: `y_host = cp.asnumpy(y_device)`\n", + "\n", + "### Implicit Transfers and Synchronization\n", + "\n", + "It is crucial to understand when CuPy interacts with the CPU implicitly. These interactions can kill performance because they force the GPU to pause (synchronize) while data moves.\n", + "\n", + "CuPy silently transfers and synchronizes when you:\n", + "1. **Print** a GPU array (`print(gpu_array)`).\n", + "2. **Convert** to a Python scalar (`float(gpu_array)` or `.item()`).\n", + "3. 
**Evaluate** a GPU scalar in a boolean context (`if gpu_scalar > 0:`).\n", + "\n", + "### The Task\n", + "To understand the implications of these concepts, let's experiment with estimating the dominant eigenvalue of a matrix using the **Power Iteration** algorithm.\n", + "\n", + "Before we dive into the code, let's understand the math behind the algorithm we are implementing.\n", + "\n", + "**Power Iteration** is a classic iterative method used to find the dominant eigenvalue (the eigenvalue with the largest absolute value) and its corresponding eigenvector of a square matrix $A$.\n", + "\n", + "#### How It Works\n", + "\n", + "The core idea is simple: if you repeatedly multiply a vector by a matrix $A$, the vector will eventually converge towards the dominant eigenvector of $A$, regardless of the initial vector you started with (provided the initial vector has some component in the direction of the dominant eigenvector).\n", + "\n", + "#### The Mathematical Steps\n", + "\n", + "Given a square matrix $A$ and a random initial vector $x_0$, the algorithm proceeds as follows for each step $k$:\n", + "\n", + "**1. Matrix-Vector Multiplication:**\n", + "\n", + "We calculate the next approximation of the vector:\n", + "\n", + "$$y = A x_k$$\n", + "\n", + "**2. Eigenvalue Estimation (Rayleigh Quotient):**\n", + "\n", + "We estimate the eigenvalue $\\lambda$ using the current vector. This is essentially projecting $y$ onto $x$:\n", + "\n", + "$$\\lambda_k = \\frac{x_k^T y}{x_k^T x_k} = \\frac{x_k^T A x_k}{x_k^T x_k}$$\n", + "\n", + "**3. Residual Calculation (Error Check):**\n", + "\n", + "We check how close we are to the true definition of an eigenvector ($Ax = \\lambda x$) by calculating the \"residual\" (error):\n", + "\n", + "$$r = ||y - \\lambda_k x_k||$$\n", + "\n", + "If $r$ is close to 0, we have converged.\n", + "\n", + "**4. 
Normalization:**\n", + "\n", + "To prevent the numbers from exploding (overflow) or vanishing (underflow), we normalize the vector for the next iteration:\n", + "\n", + "$$x_{k+1} = \\frac{y}{||y||}$$\n", + "\n", + "We will start with a standard CPU implementation, port it to the GPU using CuPy, and analyze the performance impact of memory transfers.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "import time\n", + "from dataclasses import dataclass\n", + "\n", + "# Configuration for the algorithm\n", + "@dataclass\n", + "class PowerIterationConfig:\n", + " dim: int = 4096 # Matrix size (dim x dim)\n", + " dominance: float = 0.1 # How much larger the top eigenvalue is (controls convergence speed)\n", + " max_steps: int = 400 # Maximum iterations\n", + " check_frequency: int = 10 # Check for convergence every N steps\n", + " progress: bool = True # Print progress logs\n", + " residual_threshold: float = 1e-10 # Stop if error is below this" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. The CPU Baseline (NumPy)\n", + "\n", + "We generate a random dense matrix that is diagonalizable. 
This data is generated on the **Host (CPU)** and resides in **Host Memory**.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_host(cfg=PowerIterationConfig()):\n", + " \"\"\"Generates a random diagonalizable matrix on the CPU.\"\"\"\n", + " np.random.seed(42)\n", + "\n", + " # Create eigenvalues: One large one (1.0), the rest smaller\n", + " weak_lam = np.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", + " lam = np.random.permutation(np.concatenate(([1.0], weak_lam)))\n", + "\n", + " # Construct matrix A = P * D * P^-1\n", + " P = np.random.random((cfg.dim, cfg.dim)) # Random invertible matrix\n", + " D = np.diag(np.random.permutation(lam)) # Diagonal matrix of eigenvalues\n", + " A = ((P @ D) @ np.linalg.inv(P)) # The final matrix\n", + " return A\n", + "\n", + "# Generate the data on Host\n", + "print(\"Generating Host Data...\")\n", + "A_host = generate_host()\n", + "print(f\"Host Matrix Shape: {A_host.shape}\")\n", + "print(f\"Data Type: {A_host.dtype}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Implementing Power Iteration (CPU)\n", + "\n", + "As described above, the Power Iteration algorithm repeatedly multiplies a vector $x$ by matrix $A$ ($y = Ax$) and normalizes the result. We initialize this algorithm with a vector of 1s ($x_0$) as our initial guess." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def estimate_host(A, cfg=PowerIterationConfig()):\n", + " \"\"\"\n", + " Performs power iteration using purely NumPy (CPU).\n", + " \"\"\"\n", + " # Initialize vector of ones on Host\n", + " x = np.ones(A.shape[0], dtype=np.float64)\n", + "\n", + " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", + " # Matrix-Vector multiplication\n", + " y = A @ x\n", + " \n", + " # Rayleigh quotient: (x . y) / (x . 
x)\n", + " lam = (x @ y) / (x @ x)\n", + " \n", + " # Calculate residual (error)\n", + " res = np.linalg.norm(y - lam * x)\n", + " \n", + " # Normalize vector for next step\n", + " x = y / np.linalg.norm(y)\n", + "\n", + " if cfg.progress:\n", + " print(f\"Step {i}: residual = {res:.3e}\")\n", + "\n", + " # Convergence check\n", + " if res < cfg.residual_threshold:\n", + " break\n", + "\n", + " # Run intermediate steps without checking residual to save compute\n", + " for _ in range(cfg.check_frequency - 1):\n", + " y = A @ x\n", + " x = y / np.linalg.norm(y)\n", + "\n", + " return (x.T @ (A @ x)) / (x.T @ x)\n", + "\n", + "# Run CPU Baseline\n", + "print(\"\\nRunning CPU Estimate...\")\n", + "start_time = time.time()\n", + "lam_est_host = estimate_host(A_host)\n", + "end_time = time.time()\n", + "\n", + "print(f\"\\nEstimated Eigenvalue (CPU): {lam_est_host}\")\n", + "print(f\"Time taken: {end_time - start_time:.4f}s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. The GPU Port (CuPy)\n", + "\n", + "### Exercise: Port the CPU Implementation to GPU\n", + "\n", + "Now it's your turn! Your task is to convert the `estimate_host` function to run on the GPU using CuPy.\n", + "\n", + "**Remember the rules of Memory Spaces:**\n", + "1. **Transfer:** Move `A_host` from CPU to GPU using `cp.asarray()`.\n", + "2. **Compute:** Perform math using `cp` functions on the GPU.\n", + "3. **Retrieve:** Move result back to CPU using `cp.asnumpy()` or `.item()` if we need to print it or use it in standard Python.\n", + "\n", + "**Hint:** CuPy tries to replicate the NumPy API. In many cases, you can simply change `np.` to `cp.`. 
However, CuPy operations *must* run on data present in Device Memory.\n", + "\n", + "**Fill in the `TODO` sections in the skeleton code below:**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def estimate_device_exercise(A, cfg=PowerIterationConfig()):\n", + " \"\"\"\n", + " TODO: Port the power iteration algorithm to the GPU using CuPy.\n", + " \n", + " Steps to complete:\n", + " 1. Transfer the input matrix A to the GPU (if it's a numpy array)\n", + " 2. Initialize the vector x on the GPU\n", + " 3. Replace np operations with cp operations\n", + " 4. Return the result as a Python scalar\n", + " \"\"\"\n", + " # ---------------------------------------------------------\n", + " # TODO 1: MEMORY TRANSFER (Host -> Device)\n", + " # Check if A is a numpy array. If so, move it to GPU using cp.asarray()\n", + " # Otherwise, assume it's already on the device.\n", + " # ---------------------------------------------------------\n", + " if isinstance(A, np.ndarray):\n", + " A_gpu = ... # TODO: Transfer to GPU\n", + " else:\n", + " A_gpu = A\n", + " \n", + " # ---------------------------------------------------------\n", + " # TODO 2: Initialize vector of ones ON THE GPU\n", + " # Hint: Use cp.ones() instead of np.ones()\n", + " # ---------------------------------------------------------\n", + " x = ... 
# TODO: Create vector of ones on GPU\n", + " \n", + " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", + " # ---------------------------------------------------------\n", + " # TODO 3: Perform GPU computations\n", + " # Replace the operations below with CuPy equivalents\n", + " # ---------------------------------------------------------\n", + " \n", + " # Matrix-Vector multiplication (this works the same with CuPy!)\n", + " y = A_gpu @ x\n", + " \n", + " # Rayleigh quotient\n", + " lam = (x @ y) / (x @ x)\n", + " \n", + " # TODO: Calculate residual using cp.linalg.norm (not np.linalg.norm)\n", + " res = ...\n", + " \n", + " # TODO: Normalize x using cp.linalg.norm\n", + " x = ...\n", + " \n", + " if cfg.progress:\n", + " print(f\"Step {i}: residual = {res:.3e}\")\n", + " \n", + " if res < cfg.residual_threshold:\n", + " break\n", + " \n", + " for _ in range(cfg.check_frequency - 1):\n", + " y = A_gpu @ x\n", + " x = y / cp.linalg.norm(y)\n", + " \n", + " # ---------------------------------------------------------\n", + " # TODO 4: MEMORY TRANSFER (Device -> Host)\n", + " # Return the eigenvalue as a Python scalar using .item()\n", + " # ---------------------------------------------------------\n", + " result = (x.T @ (A_gpu @ x)) / (x.T @ x)\n", + " return ...\n", + "\n", + "# Uncomment to test your implementation:\n", + "# lam_test = estimate_device_exercise(A_host, PowerIterationConfig(max_steps=50))\n", + "# print(f\"Your result: {lam_test}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Solution\n", + "\n", + "
\n", + "Click to reveal the solution\n", + "\n", + "The key changes from NumPy to CuPy are:\n", + "1. `cp.asarray(A)` to transfer data to GPU\n", + "2. `cp.ones()` instead of `np.ones()` to create arrays on GPU\n", + "3. `cp.linalg.norm()` instead of `np.linalg.norm()`\n", + "4. `.item()` to convert GPU scalar back to Python scalar\n", + "\n", + "
\n", + "\n", + "Run the cell below to see the complete implementation:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def estimate_device(A, cfg=PowerIterationConfig()):\n", + " \"\"\"\n", + " Performs power iteration using CuPy (GPU).\n", + " Handles memory transfer internally.\n", + " \"\"\"\n", + " # ---------------------------------------------------------\n", + " # MEMORY TRANSFER: Host -> Device\n", + " # We use cp.asarray to move the numpy array to the GPU.\n", + " # ---------------------------------------------------------\n", + " if isinstance(A, np.ndarray):\n", + " A_gpu = cp.asarray(A)\n", + " else:\n", + " A_gpu = A # Already on device\n", + "\n", + " # Initialize vector on Device\n", + " x = cp.ones(A_gpu.shape[0], dtype=cp.float64)\n", + "\n", + " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", + " # All operations below happen on the GPU\n", + " y = A_gpu @ x\n", + " lam = (x @ y) / (x @ x)\n", + " \n", + " # Note: using cp.linalg, not np.linalg\n", + " res = cp.linalg.norm(y - lam * x)\n", + " x = y / cp.linalg.norm(y)\n", + "\n", + " if cfg.progress:\n", + " # IMPLICIT TRANSFER WARNING:\n", + " # Printing a GPU scalar/array forces a download to CPU \n", + " # and a synchronization.\n", + " print(f\"Step {i}: residual = {res:.3e}\")\n", + "\n", + " # Boolean checks on GPU scalars also force synchronization\n", + " if res < cfg.residual_threshold:\n", + " break\n", + "\n", + " for _ in range(cfg.check_frequency - 1):\n", + " y = A_gpu @ x\n", + " x = y / cp.linalg.norm(y)\n", + "\n", + " # ---------------------------------------------------------\n", + " # MEMORY TRANSFER: Device -> Host\n", + " # .item() converts a 0-dim GPU array to a Python scalar\n", + " # This implicitly copies data back to the host.\n", + " # ---------------------------------------------------------\n", + " return ((x.T @ (A_gpu @ x)) / (x.T @ x)).item()\n", + "\n", + "print(\"\\nRunning GPU 
Estimate (Input is Host Array)...\")\n", + "# Note: The first run might be slower due to compilation/caching overhead\n", + "start_time = time.time()\n", + "lam_est_device = estimate_device(A_host)\n", + "cp.cuda.Stream.null.synchronize()\n", + "end_time = time.time()\n", + "\n", + "print(f\"\\nEstimated Eigenvalue (GPU): {lam_est_device}\")\n", + "print(f\"Time taken: {end_time - start_time:.4f}s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Optimizing Data Generation\n", + "\n", + "In the previous step, we generated data on the CPU and copied it to the GPU. For large datasets, the transfer time (`Host -> Device`) can be a bottleneck. \n", + "\n", + "It is almost always faster to **generate** the data directly on the GPU if possible.\n", + "\n", + "### Exercise: Generate Data Directly on the GPU\n", + "\n", + "Your task is to convert the `generate_host` function to generate the matrix directly on the GPU using CuPy's random functions.\n", + "\n", + "**Hints:**\n", + "- Use `cp.random.seed()` instead of `np.random.seed()`\n", + "- Use `cp.random.random()` instead of `np.random.random()`\n", + "- Use `cp.random.permutation()` instead of `np.random.permutation()`\n", + "- Use `cp.concatenate()`, `cp.array()`, `cp.diag()`, and `cp.linalg.inv()`\n", + "\n", + "**Fill in the `TODO` sections in the skeleton code below:**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_device_exercise(cfg=PowerIterationConfig()):\n", + " \"\"\"\n", + " TODO: Generate a random diagonalizable matrix directly on the GPU.\n", + " \n", + " This should mirror the generate_host function but use CuPy instead of NumPy.\n", + " The key benefit: no Host->Device transfer needed!\n", + " \"\"\"\n", + " # ---------------------------------------------------------\n", + " # TODO 1: Set the random seed on the GPU\n", + " # ---------------------------------------------------------\n", + 
" ... \n", + " \n", + " # ---------------------------------------------------------\n", + " # TODO 2: Create eigenvalues on the GPU\n", + " # Generate (dim-1) random values, scale them, then combine with 1.0\n", + " # ---------------------------------------------------------\n", + " # TODO: Generate weak eigenvalues using cp.random.random()\n", + " weak_lam = ...\n", + " \n", + " # TODO: Concatenate [1.0] with weak_lam using cp.concatenate and cp.array\n", + " # Then permute them using cp.random.permutation\n", + " lam = ...\n", + " \n", + " # ---------------------------------------------------------\n", + " # TODO 3: Construct the matrix A = P * D * P^-1 on the GPU\n", + " # ---------------------------------------------------------\n", + " # TODO: Generate random matrix P using cp.random.random()\n", + " P = ...\n", + " \n", + " # TODO: Create diagonal matrix D using cp.diag()\n", + " D = ...\n", + " \n", + " # TODO: Compute A = P @ D @ P^-1 using cp.linalg.inv()\n", + " A = ...\n", + " \n", + " return A\n", + "\n", + "# Uncomment to test your implementation:\n", + "# A_test = generate_device_exercise()\n", + "# print(f\"Matrix shape: {A_test.shape}\")\n", + "# print(f\"Matrix is on GPU: {isinstance(A_test, cp.ndarray)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Solution\n", + "\n", + "
\n", + "Click to reveal the solution\n", + "\n", + "The conversion is straightforward - replace all `np.` calls with `cp.` equivalents:\n", + "- `np.random.seed(42)` → `cp.random.seed(42)`\n", + "- `np.random.random()` → `cp.random.random()`\n", + "- `np.random.permutation()` → `cp.random.permutation()`\n", + "- `np.concatenate()` → `cp.concatenate()`\n", + "- `np.array()` → `cp.array()`\n", + "- `np.diag()` → `cp.diag()`\n", + "- `np.linalg.inv()` → `cp.linalg.inv()`\n", + "\n", + "
\n", + "\n", + "Run the cell below to see the complete implementation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_device(cfg=PowerIterationConfig()):\n", + " \"\"\"Generates random matrix directly on the GPU.\"\"\"\n", + " # Use cupy.random instead of numpy.random\n", + " cp.random.seed(42)\n", + "\n", + " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", + " # cp.concatenate joins arrays on the GPU\n", + " lam = cp.random.permutation(cp.concatenate((cp.array([1.0]), weak_lam)))\n", + "\n", + " P = cp.random.random((cfg.dim, cfg.dim))\n", + " D = cp.diag(cp.random.permutation(lam))\n", + " A = ((P @ D) @ cp.linalg.inv(P))\n", + " return A\n", + "\n", + "print(\"\\nGenerating Data directly on GPU...\")\n", + "start_time = time.time()\n", + "A_device = generate_device()\n", + "end_time = time.time()\n", + "print(f\"Generation time: {end_time - start_time:.4f}s\")\n", + "\n", + "print(\"Running GPU Estimate (Input is Device Array)...\")\n", + "start_time = time.time()\n", + "# No transfer overhead here because A_device is already on GPU\n", + "lam_est_device_gen = estimate_device(A_device)\n", + "cp.cuda.Stream.null.synchronize()\n", + "end_time = time.time()\n", + "print(f\"Compute time: {end_time - start_time:.4f}s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Verification and Benchmarking\n", + "\n", + "Finally, let's verify our accuracy against a reference implementation (`numpy.linalg.eigvals`) and benchmark the speedup.\n", + "\n", + "**Note on CuPy Limitations:** You might wonder why we use `np.linalg.eigvals` on the CPU instead of a CuPy equivalent. The reason is that CuPy does not yet implement `eigvals`. While CuPy covers a large portion of the NumPy API, it does not support every function. 
Always check the [CuPy documentation](https://docs.cupy.dev/en/stable/reference/comparison.html) to verify which functions are available before assuming a direct NumPy-to-CuPy conversion will work.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Calculating Reference Eigenvalue (numpy.linalg)...\")\n", + "# Note: calculating all eigenvalues is computationally expensive\n", + "lam_ref = np.linalg.eigvals(A_host).real.max()\n", + "\n", + "print(f\"\\n--- Results ---\")\n", + "print(f\"Reference: {lam_ref}\")\n", + "print(f\"CPU Est: {lam_est_host}\")\n", + "print(f\"GPU Est: {lam_est_device_gen}\")\n", + "\n", + "# Assert correctness\n", + "np.testing.assert_allclose(lam_est_host, lam_ref, rtol=1e-4)\n", + "np.testing.assert_allclose(lam_est_device_gen, lam_ref, rtol=1e-4)\n", + "print(\"\\nAccuracy verification passed!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Benchmarking with `%timeit`\n", + "\n", + "We turn off progress printing to measure raw computation speed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n--- Benchmarking ---\")\n", + "\n", + "# 1. CPU\n", + "print(\"Timing CPU...\")\n", + "t_cpu = %timeit -o -q estimate_host(A_host, PowerIterationConfig(progress=False))\n", + "\n", + "# 2. GPU (with transfer overhead)\n", + "print(\"Timing GPU (Host Input)...\")\n", + "t_gpu_transfer = %timeit -o -q estimate_device(A_host, PowerIterationConfig(progress=False))\n", + "\n", + "# 3. 
GPU (Pure device)\n", + "print(\"Timing GPU (Device Input)...\")\n", + "t_gpu_pure = %timeit -o -q estimate_device(A_device, PowerIterationConfig(progress=False))\n", + "\n", + "print(f\"\\nAverage Execution Times:\")\n", + "print(f\"CPU: {t_cpu.average:.4f} s\")\n", + "print(f\"GPU (with transfer): {t_gpu_transfer.average:.4f} s\")\n", + "print(f\"GPU (pure): {t_gpu_pure.average:.4f} s\")\n", + "\n", + "speedup = t_cpu.average / t_gpu_pure.average\n", + "print(f\"\\nSpeedup: {speedup:.1f}x\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Extra Credit\n", + "\n", + "**Explore the impact of changing the following parameters:**\n", + "\n", + "1. **Problem Size (`dim`):** How does the GPU speedup change as you increase or decrease the matrix dimensions? Try values like 1024, 2048, 4096, 8192.\n", + "\n", + "2. **Compute Workload (`max_steps` and `dominance`):** The `dominance` parameter controls how quickly the algorithm converges. A smaller dominance means eigenvalues are closer together, requiring more iterations. How does this affect the CPU vs GPU comparison?\n", + "\n", + "3. **Check Frequency (`check_frequency`):** This controls how often we check for convergence (and trigger implicit CPU synchronization via the print statement). What happens to GPU performance when you check every step (`check_frequency=1`) vs. 
less frequently (`check_frequency=50`)?\n", + "\n", + "**Experiment below:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Try different configurations here!\n", + "# Example:\n", + "# cfg_large = PowerIterationConfig(dim=8192, progress=False)\n", + "# cfg_slow_converge = PowerIterationConfig(dominance=0.01, progress=False)\n", + "# cfg_frequent_check = PowerIterationConfig(check_frequency=1, progress=True)\n", + "\n", + "# Your experiments:" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (RAPIDS 25.10)", + "language": "python", + "name": "cudf-cu12-25.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } }, - "language_info": { - "name": "python" - }, - "colab": { - "provenance": [], - "gpuType": "T4" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "## Exercise - Memory Spaces - Power Iteration\n", - "\n", - "Let's learn about memory spaces and transfers! In this exercise, we'll learn:\n", - "\n", - "- How to explicitly transfer data between host and device.\n", - " - `d = cupy.asarray(h)` to copy from a host array to a device array.\n", - " - `h = cupy.asnumpy(d)` to copy from a device array to a host array.\n", - "- What happens if we mix NumPy and CuPy code.\n", - "- Some ways in which NumPy and CuPy produce different results.\n", - "- Some of the limitations of CuPy.\n", - "- How problem size and compute workload impacts performance.\n", - "\n", - "We're going to estimate the dominant eigenvalue of a matrix with the [power iteration algorithm](https://en.wikipedia.org/wiki/Power_iteration).\n", - "First, we'll randomly generate a dense diagonalizable square matrix." 
- ], - "metadata": { - "id": "6YkNAlM91iGa" - } - }, - { - "cell_type": "code", - "source": [ - "import numpy as np" - ], - "metadata": { - "id": "ZHpg3aVXSeix" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from dataclasses import dataclass\n", - "\n", - "@dataclass\n", - "class PowerIterationConfig:\n", - " dim: int = 4096 # Number of rows and columns in the square matrix.\n", - "\n", - " # Value from 0 to 1 that controls how much greater the dominant eigenvalue is\n", - " # from the rest of the eigenvalues. A higher value means quicker convergence.\n", - " dominance: float = 0.1\n", - "\n", - " # Maximum number of steps to perform.\n", - " max_steps: int = 400\n", - "\n", - " # Every `check_frequency` steps we save a checkpoint and compute the residual.\n", - " check_frequency: int = 10\n", - "\n", - " # Whether the residual should be printed every `check_frequency` steps.\n", - " progress: bool = True\n", - "\n", - " # If the residual is below `residual_threshold`, terminate early.\n", - " residual_threshold: float = 1e-10" - ], - "metadata": { - "id": "BCToOdVdSONp" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5ak3mn2hIsKo" - }, - "outputs": [], - "source": [ - "def generate_host(cfg=PowerIterationConfig()):\n", - " np.random.seed(42)\n", - "\n", - " # Vector with a single 1 & `cfg.dim - 1` values from 0 to `1 - cfg.dominance`.\n", - " weak_lam = np.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", - " lam = np.random.permutation(np.concatenate(([1.0], weak_lam)))\n", - "\n", - " P = np.random.random((cfg.dim, cfg.dim)) # Random invertible matrix.\n", - " D = np.diag(np.random.permutation(lam)) # Diagonal matrix w/ random eigenvalues.\n", - " A = ((P @ D) @ np.linalg.inv(P)) # Diagonalizable matrix.\n", - " return A\n", - "\n", - "A_host = generate_host()\n", - "\n", - "with np.printoptions(precision=4):\n", - " 
print(A_host)" - ] - }, - { - "cell_type": "markdown", - "source": [ - "Next, we perform the power iteration with NumPy, using a vector of 1s as our initial guess.\n", - "\n", - "We'll perform at most `cfg.max_steps`. Every `config.check_frequency` steps, we'll output a checkpoint, compute the absolute residual, check whether it's below a `cfg.residual_threshold`. If it is, then we'll stop early." - ], - "metadata": { - "id": "VHv9uXHI_6e0" - } - }, - { - "cell_type": "code", - "source": [ - "def estimate_host(A, cfg=PowerIterationConfig()):\n", - " x = np.ones(A.shape[0], dtype=np.float64)\n", - "\n", - " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", - " y = A @ x\n", - " lam = (x @ y) / (x @ x) # Rayleigh quotient.\n", - " res = np.linalg.norm(y - lam * x)\n", - " x = y / np.linalg.norm(y) # Normalize for next step.\n", - "\n", - " if cfg.progress:\n", - " print(f\"step {i}: residual = {res:.3e}\")\n", - "\n", - " np.savetxt(f\"host_{i}.txt\", x) # Save a checkpoint.\n", - "\n", - " if res < cfg.residual_threshold:\n", - " break\n", - "\n", - " for _ in range(cfg.check_frequency - 1):\n", - " y = A @ x\n", - " x = y / np.linalg.norm(y) # Normalize for next step.\n", - "\n", - " return (x.T @ (A @ x)) / (x.T @ x)\n", - "\n", - "lam_est_host = estimate_host(A_host).item()\n", - "\n", - "print()\n", - "print(lam_est_host)" - ], - "metadata": { - "id": "q0x0_p4pvAdD" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "**TODO: In the next cell, port the power iteration function to CuPy. 
Try leaving some operations as NumPy and see what happens.**" - ], - "metadata": { - "id": "LMtzZzVNBexb" - } - }, - { - "cell_type": "code", - "source": [ - "def estimate_device(A, cfg=PowerIterationConfig):\n", - " x = np.ones(A.shape[0], dtype=np.float64)\n", - "\n", - " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", - " y = A @ x\n", - " lam = (x @ y) / (x @ x) # Rayleigh quotient.\n", - " res = np.linalg.norm(y - lam * x)\n", - " x = y / np.linalg.norm(y) # Normalize for next step.\n", - "\n", - " if cfg.progress:\n", - " print(f\"step {i}: residual = {res:.3e}\")\n", - "\n", - " np.savetxt(f\"device_{i}.txt\", x) # Save a checkpoint.\n", - "\n", - " if res < cfg.residual_threshold:\n", - " break\n", - "\n", - " for _ in range(cfg.check_frequency - 1):\n", - " y = A @ x\n", - " x = y / np.linalg.norm(y) # Normalize for next step.\n", - "\n", - " return (x.T @ (A @ x)) / (x.T @ x)\n", - "\n", - "lam_est_device = estimate_device(A_host).item()\n", - "\n", - "print()\n", - "print(lam_est_device)" - ], - "metadata": { - "id": "sulx6gabBd1w" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "**TODO: Now port the matrix generation function to CuPy, and run the power iteration with it. 
What do you notice about the result?**" - ], - "metadata": { - "id": "ySwpt6ro00pz" - } - }, - { - "cell_type": "code", - "source": [ - "def generate_device(cfg=PowerIterationConfig):\n", - " np.random.seed(42)\n", - "\n", - " # Vector with a single 1 & `cfg.dim - 1` values from 0 to `1 - cfg.dominance`.\n", - " weak_lam = np.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", - " lam = np.random.permutation(np.concatenate(([1.0], weak_lam)))\n", - "\n", - " P = np.random.random((cfg.dim, cfg.dim)) # Random invertible matrix.\n", - " D = np.diag(np.random.permutation(lam)) # Diagonal matrix with random eigenvalues.\n", - " A = ((P @ D) @ np.linalg.inv(P)) # Diagonalizable matrix.\n", - " return A\n", - "\n", - "A_device = generate_device()\n", - "\n", - "with np.printoptions(precision=4):\n", - " print(\"A_host:\")\n", - " print(A_host)\n", - " print()\n", - " print(\"A_device:\")\n", - " print(A_device)\n", - " print()\n", - "\n", - "lam_est_device_generation = estimate_device(A_device).item()\n", - "\n", - "print()\n", - "print(lam_est_device_generation)" - ], - "metadata": { - "id": "7R96PJqp0zkI" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Next, let's compute the eigenvalues of the matrix with `numpy.linalg.eigvals`. This may take a little while.\n", - "\n", - "**TODO: What happens if we port this to CuPy?**" - ], - "metadata": { - "id": "BXXawDcfHSol" - } - }, - { - "cell_type": "code", - "source": [ - "lam_ref = np.linalg.eigvals(A_host).real.max()" - ], - "metadata": { - "id": "fL7QYIVesehd" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Now we can check whether our power iteration estimation is correct." 
- ], - "metadata": { - "id": "BF1HHlkOHtw9" - } - }, - { - "cell_type": "code", - "source": [ - "print(f\"Solution\")\n", - "print()\n", - "print(f\"Power iteration (host) = {lam_est_host:.6e}\")\n", - "print(f\"Power iteration (device) = {lam_est_device:.6e}\")\n", - "print(f\"`eigvals` reference = {lam_ref:.6e}\")\n", - "\n", - "rel_err_host = abs(lam_est_host - lam_ref) / abs(lam_ref)\n", - "rel_err_device = abs(lam_est_device - lam_ref) / abs(lam_ref)\n", - "print()\n", - "print(f\"Relative error (host) = {rel_err_host:.3e}\")\n", - "print(f\"Relative error (device) = {rel_err_device:.3e}\")\n", - "\n", - "np.testing.assert_allclose(lam_est_host, lam_ref, rtol=1e-4)\n", - "np.testing.assert_allclose(lam_est_device, lam_ref, rtol=1e-4)" - ], - "metadata": { - "id": "7E7MUYsYsjwO" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "Finally, let's benchmark all three solutions." - ], - "metadata": { - "id": "2UXGSFs2H70q" - } - }, - { - "cell_type": "code", - "source": [ - "print(f\"Execution Time\")\n", - "print()\n", - "\n", - "time_host = %timeit -q -o estimate_host(A_host, PowerIterationConfig(progress=False)).item()\n", - "print(f\"Power iteration (host) = {time_host}\")\n", - "\n", - "time_device = %timeit -q -o estimate_device(A_host, PowerIterationConfig(progress=False)).item()\n", - "print(f\"Power iteration (device) = {time_device}\")\n", - "\n", - "time_ref = %timeit -q -o -r 1 -n 1 np.linalg.eigvals(A_host).real.max()\n", - "print(f\"`eigvals` reference = {time_ref}\")" - ], - "metadata": { - "id": "v_2HmcBFERhE" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "**EXTRA CREDIT: Explore the impact of changing the problem size (`dim`), the compute workload (`max_steps` and `dominance`), and the check frequency (`check_frequency`).**" - ], - "metadata": { - "id": "8prYSJJprj-Q" - } - } - ] + "nbformat": 4, + "nbformat_minor": 2 } diff --git 
a/tutorials/accelerated-python/notebooks/fundamentals/06__asynchrony__power_iteration.ipynb b/tutorials/accelerated-python/notebooks/fundamentals/06__asynchrony__power_iteration.ipynb index b21e03d1..4ca12f97 100644 --- a/tutorials/accelerated-python/notebooks/fundamentals/06__asynchrony__power_iteration.ipynb +++ b/tutorials/accelerated-python/notebooks/fundamentals/06__asynchrony__power_iteration.ipynb @@ -1,397 +1,438 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "8KP1pYqmtXdr" - }, - "source": [ - "## Exercise - Asynchrony - Power Iteration\n", - "\n", - "GPU programming is inherently asynchronous - in this exercise, we'll learn the implications that has when using CuPy, and how we can understand and analyze the flow of execution in our code.\n", - "\n", - "We'll revisit our power iteration example from earlier for this exercise.\n", - "\n", - "First, we need to make sure the Nsight Systems profiler, Nsightful, and NVTX are available in our notebook:" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "8KP1pYqmtXdr" + }, + "source": [ + "# Asynchrony and Power Iteration\n", + "\n", + "## Table of Contents\n", + "1. [Introduction and Setup](#1-Introduction-and-Setup)\n", + " - [1.1 Environment Setup](#11-Environment-Setup)\n", + "2. [Theory: Streams and Synchronization](#2-Theory:-Streams-and-Synchronization)\n", + "3. [The Baseline Implementation](#3-The-Baseline-Implementation)\n", + "4. [Profiling the Baseline](#4-Profiling-the-Baseline)\n", + "5. [Better Visibility with NVTX](#5-Better-Visibility-with-NVTX)\n", + "6. [Implementing Asynchrony](#6-Implementing-Asynchrony)\n", + "7. [Performance Analysis](#7-Performance-Analysis)\n", + "\n", + "## 1. Introduction and Setup\n", + "\n", + "GPU programming is inherently asynchronous. 
In this exercise, we will explore the implications of this behavior when using CuPy and learn how to analyze the flow of execution using profiling tools.\n", + "\n", + "We will revisit the Power Iteration algorithm. Our goal is to take a standard implementation, profile it to identify bottlenecks caused by implicit synchronization, and then optimize it using CUDA streams and asynchronous memory transfers.\n", + "\n", + "### 1.1 Environment Setup\n", + "\n", + "First, we need to ensure the Nsight Systems profiler (nsys), Nsightful, and NVTX are installed and available." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rO4kOPuP_0JG" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Install necessary tools if running in Google Colab\n", + "if os.getenv(\"COLAB_RELEASE_TAG\"):\n", + " !curl -s -L -O https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb\n", + " !sudo dpkg -i NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb > /dev/null\n", + " !pip install \"nvtx\" \"nsightful[notebook] @ git+https://github.com/brycelelbach/nsightful.git\" > /dev/null 2>&1\n", + "\n", + "print(\"Environment setup complete.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6sUvjAtMxI3h" + }, + "source": [ + "## 2. Theory: Streams and Synchronization\n", + "\n", + "All GPU work is launched asynchronously on a stream. The work items in a stream are executed in order. If you launch `f` on a stream and later launch `g` on that same stream, then `f` will be executed before `g`. 
But if `f` and `g` are launched on different streams, then their execution might overlap.\n", + "\n", + "**How CuPy handles this:**\n", + "\n", + "- **Default Stream:** Unless specified, CuPy launches work on the default CUDA stream.\n", + "\n", + "- **Sequential Device Execution:** By default, CuPy work executes sequentially on the GPU.\n", + "\n", + "- **Asynchronous Host Execution:** From the Python (Host) perspective, the code often returns immediately after launching the GPU kernel, before the work is actually finished.\n", + "\n", + "**TODO:** Even though CuPy is asynchronous, certain operations force the CPU to wait for the GPU to finish. What operations do you think implicitly synchronize the host and device?\n", + "\n", + "## 3. The Baseline Implementation\n", + "\n", + "We will start with a baseline implementation of the Power Iteration algorithm.\n", + "\n", + "**Note:** The cell below writes the code to a file named `power_iteration__baseline.py`. We do this because we must run the code through the Nsight Systems profiler via the command line." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sEicxhLO_9G9" + }, + "outputs": [], + "source": [ + "%%writefile power_iteration__baseline.py\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import cupyx as cpx\n", + "import nvtx\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class PowerIterationConfig:\n", + " dim: int = 8192\n", + " dominance: float = 0.05\n", + " max_steps: int = 1000\n", + " check_frequency: int = 10\n", + " progress: bool = True\n", + " residual_threshold: float = 1e-10\n", + "\n", + "def generate_device(cfg=PowerIterationConfig()):\n", + " cp.random.seed(42)\n", + " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", + " lam = cp.random.permutation(cp.concatenate((cp.asarray([1.0]), weak_lam)))\n", + " P = cp.random.random((cfg.dim, cfg.dim))\n", + " D = cp.diag(cp.random.permutation(lam))\n", + " A = ((P @ D) @ cp.linalg.inv(P))\n", + " return A\n", + "\n", + "def estimate_device(A, cfg=PowerIterationConfig()):\n", + " # If `A` is on the host, copy from host to device. 
Otherwise, does nothing.\n", + " A_gpu = cp.asarray(A)\n", + "\n", + " x = cp.ones(A_gpu.shape[0], dtype=np.float64)\n", + "\n", + " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", + " y = A_gpu @ x\n", + " lam = (x @ y) / (x @ x) # Rayleigh quotient.\n", + " res = cp.linalg.norm(y - lam * x)\n", + " x = y / cp.linalg.norm(y) # Normalize for next step.\n", + "\n", + " if cfg.progress:\n", + " print(f\"step {i}: residual = {res:.3e}\")\n", + "\n", + " # Copy from device to host and save a checkpoint.\n", + " np.savetxt(f\"device_{i}.txt\", cp.asnumpy(x))\n", + "\n", + " if res < cfg.residual_threshold:\n", + " break\n", + "\n", + " for _ in range(cfg.check_frequency - 1):\n", + " y = A_gpu @ x # We have to use `A_gpu` here as well.\n", + " x = y / cp.linalg.norm(y) # Normalize for next step.\n", + "\n", + " # Copy from device to host.\n", + " return cp.asnumpy((x.T @ (A_gpu @ x)) / (x.T @ x))\n", + "\n", + "A_device = generate_device()\n", + "\n", + "# Warmup to ensure modules are loaded and code is JIT compiled before timing.\n", + "estimate_device(A_device, cfg=PowerIterationConfig(progress=False))\n", + "\n", + "start = cp.cuda.get_current_stream().record()\n", + "lam_est_device = estimate_device(A_device).item()\n", + "stop = cp.cuda.get_current_stream().record()\n", + "\n", + "duration = cp.cuda.get_elapsed_time(start, stop) / 1e3\n", + "\n", + "print()\n", + "print(f\"GPU Execution Time: {duration:.3f} s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1lyhHnzrdXzI" + }, + "source": [ + "## 4. Profiling the Baseline\n", + "\n", + "Now let's profile our code by running it under the Nsight Systems `nsys` tool. The syntax for this is `nsys `. It will run your program while collecting a birdseye view of everything going on in your program." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1HU5p1IhAkTA" + }, + "outputs": [], + "source": [ + "!nsys profile --cuda-event-trace=false --force-overwrite true -o power_iteration__baseline python power_iteration__baseline.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IlGIAIEPe3SV" + }, + "source": [ + "Now let's view our report and explore what's going on in our program.\n", + "\n", + "**TODO:** Run the next cell, which will generate the report and create a button that when clicked will open it up in Perfetto, a web-based no-install visual profiler.\n", + "\n", + "**EXTRA CREDIT:** Download the Nsight Systems GUI and open the report in it to see even more information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "s6VVOnGQR3Ph" + }, + "outputs": [], + "source": [ + "import nsightful\n", + "\n", + "!nsys export --type sqlite --quiet true --force-overwrite true power_iteration__baseline.nsys-rep\n", + "nsightful.display_nsys_sqlite_file_in_notebook(\"power_iteration__baseline.sqlite\", title=\"Power Iteration - Baseline\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bGxz6-spplcU" + }, + "source": [ + "## 5. Better Visibility with NVTX\n", + "\n", + "Nsight Systems shows us a lot of information - sometimes it's too much and not all relevant.\n", + "\n", + "There are two ways that we can filter and annotate what we see in Nsight Systems.\n", + "\n", + "The first is to limit when we start and stop profiling in the program. In Python, we can do this with `cupyx.profiler.profile()`, which gives us a Python context manager. 
Any CUDA code used during its scope will be included in the profile.\n", + "\n", + "```\n", + "not_in_the_profile()\n", + "with cpx.profiler.profile():\n", + " in_the_profile()\n", + "not_in_the_profile()\n", + "```\n", + "\n", + "For this to work, we have to pass `--capture-range=cudaProfilerApi --capture-range-end=stop` as flags to `nsys`.\n", + "\n", + "We can also annotate specific regions of our code, which will show up in the profiler. We can even add categories, domains, and colors to these regions, and they can be nested. To add these annotations, we use `nvtx.annotate()`, another Python context manager, this time from a library called NVTX.\n", + "\n", + "```\n", + "with nvtx.annotate(\"Loop\"):\n", + " for i in range(20):\n", + " with nvtx.annotate(f\"Step {i}\"):\n", + " pass\n", + "```\n", + "\n", + "**TODO:** Go back to the earlier cells and improve the profile results by adding:\n", + "\n", + "- `nvtx.annotate()` regions. Remember, you can nest them.\n", + "\n", + "- A `cpx.profiler.profile()` around the `start =`/`stop =` lines that run the solver.\n", + "\n", + "- `--capture-range=cudaProfilerApi --capture-range-end=stop` to the `nsys` flags.\n", + "\n", + "Then, capture another profile and see if you can identify how we can improve the code. Specifically, think about how we could add more asynchrony." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PF7PUALVfX3A" + }, + "source": [ + "## 6. Implementing Asynchrony\n", + "\n", + "Remember what we've learned about streams and how to use them with CuPy:\n", + "\n", + "- By default, all CuPy operations within a single thread run on the same stream. You can access this stream with `cp.cuda.get_current_stream()`.\n", + "\n", + "- You can create a new stream with `cp.cuda.Stream(non_blocking=True)`. 
Use `with` statements to use the stream for all CuPy operations within a block.\n", + "\n", + "- You can record an event on a stream by calling `.record()` on it.\n", + "\n", + "- You can synchronize on an event (or an entire stream) by calling `.synchronize()` on it.\n", + "\n", + "- Memory transfers will block by default. You can launch them asynchronously with `cp.asarray(..., blocking=False)` (for host to device transfers) and `cp.asnumpy(..., blocking=False)` (for device to host transfers).\n", + "\n", + "**TODO:** Copy the kernel from the earlier cell with your NVTX and CuPy profiler regions into the cell below. Then, try to improve performance by adding asynchrony. Make sure that you don't copy and paste the `%%writefile` directive." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile power_iteration__async.py\n", + "\n", + "import numpy as np\n", + "import cupy as cp\n", + "import cupyx as cpx\n", + "import nvtx\n", + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class PowerIterationConfig:\n", + " dim: int = 8192\n", + " dominance: float = 0.05\n", + " max_steps: int = 1000\n", + " check_frequency: int = 10\n", + " progress: bool = True\n", + " residual_threshold: float = 1e-10\n", + "\n", + "def generate_device(cfg=PowerIterationConfig()):\n", + " cp.random.seed(42)\n", + " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", + " lam = cp.random.permutation(cp.concatenate((cp.asarray([1.0]), weak_lam)))\n", + " P = cp.random.random((cfg.dim, cfg.dim))\n", + " D = cp.diag(cp.random.permutation(lam))\n", + " A = ((P @ D) @ cp.linalg.inv(P))\n", + " return A\n", + "\n", + "def estimate_device(A, cfg=PowerIterationConfig()):\n", + " raise NotImplementedError(\"TODO: You need to implement this kernel!\")\n", + "\n", + "A_device = generate_device()\n", + "\n", + "# Warmup to ensure modules are loaded and code is JIT compiled before timing.\n", + 
"estimate_device(A_device, cfg=PowerIterationConfig(progress=False))\n", + "\n", + "start = cp.cuda.get_current_stream().record()\n", + "lam_est_device = estimate_device(A_device).item()\n", + "stop = cp.cuda.get_current_stream().record()\n", + "\n", + "duration = cp.cuda.get_elapsed_time(start, stop) / 1e3\n", + "\n", + "print()\n", + "print(f\"GPU Execution Time: {duration:.3f} s\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's make sure it works:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pszz-k8cDfqy" + }, + "outputs": [], + "source": [ + "!python power_iteration__async.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VFCYIqwaKYqy" + }, + "source": [ + "## 7. Performance Analysis\n", + "\n", + "Before we profile the improved code, let's compare the execution times of both." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uSPFNIb9KcPb" + }, + "outputs": [], + "source": [ + "power_iteration_baseline_output = !python power_iteration__baseline.py\n", + "power_iteration_baseline_duration = float(power_iteration_baseline_output[-1].split()[-2])\n", + "power_iteration_async_output = !python power_iteration__async.py\n", + "power_iteration_async_duration = float(power_iteration_async_output[-1].split()[-2])\n", + "speedup = power_iteration_baseline_duration / power_iteration_async_duration\n", + "\n", + "print(f\"GPU Execution Time\")\n", + "print()\n", + "print(f\"power_iteration_baseline: {power_iteration_baseline_duration:.3f} s\")\n", + "print(f\"power_iteration_async: {power_iteration_async_duration:.3f} s\")\n", + "print(f\"power_iteration_async speedup over power_iteration_baseline: {speedup:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o4WJVFBkkRaN" + }, + "source": [ + "Next, let's capture a profile report of our improved code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BtQR4CHikWFK" + }, + "outputs": [], + "source": [ + "!nsys profile --cuda-event-trace=false --capture-range=cudaProfilerApi --capture-range-end=stop --force-overwrite true -o power_iteration__async python power_iteration__async.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pnvne_F4jYTh" + }, + "source": [ + "Finally, let's look at the profile in Perfetto and confirm we've gotten rid of the idling." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mWXBvi-hFGhU" + }, + "outputs": [], + "source": [ + "!nsys export --type sqlite --quiet true --force-overwrite true power_iteration__async.nsys-rep\n", + "nsightful.display_nsys_sqlite_file_in_notebook(\"power_iteration__async.sqlite\", title=\"Power Iteration - Async Event\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (RAPIDS 25.10)", + "language": "python", + "name": "cudf-cu12-25.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rO4kOPuP_0JG" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\"): # If running in Google Colab\n", - " !curl -s -L -O https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_3/NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb\n", - " !sudo dpkg -i NsightSystems-linux-cli-public-2025.3.1.90-3582212.deb > /dev/null\n", - " !pip install \"nvtx\" \"nsightful[notebook] @ git+https://github.com/brycelelbach/nsightful.git\" > /dev/null 2>&1" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "id": "6sUvjAtMxI3h" - }, - "source": [ - "All GPU work is launched asynchronously on a stream. The work items in a stream are executed in order. If you launch `f` on a stream and later launch `g` on that same stream, then `f` will be executed before `g`. But if `f` and `g` are launched on different streams, then their execution might overlap.\n", - "\n", - "With CuPy, much of this is hidden from us. Unless you specify otherwise, CuPy launches work on the default CUDA stream. That means that by default, all CuPy work is executed sequentially on the device, but with respect to the host, it's all happening asynchronously.\n", - "\n", - "CuPy supports explicitly synchronizing, creating, and manipulating streams.\n", - "\n", - "**TODO: However, there are also a few common operations in CuPy that will implicitly synchronize with the device. What operations do you think these are?**\n", - "\n", - "Let's run our code throught Nsight Systems profiler, which will help us visualize what's going on.\n", - "\n", - "**NOTE: The next cell won't actually run any code, it will just write its contents to a file. 
This is necessary because we have to run the code with the Nsight Systems profiler.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sEicxhLO_9G9" - }, - "outputs": [], - "source": [ - "%%writefile power_iteration__baseline.py\n", - "\n", - "import numpy as np\n", - "import cupy as cp\n", - "import cupyx as cpx\n", - "import nvtx\n", - "from dataclasses import dataclass\n", - "\n", - "@dataclass\n", - "class PowerIterationConfig:\n", - " dim: int = 8192\n", - " dominance: float = 0.05\n", - " max_steps: int = 1000\n", - " check_frequency: int = 10\n", - " progress: bool = True\n", - " residual_threshold: float = 1e-10\n", - "\n", - "def generate_device(cfg=PowerIterationConfig()):\n", - " cp.random.seed(42)\n", - " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", - " lam = cp.random.permutation(cp.concatenate((cp.asarray([1.0]), weak_lam)))\n", - " P = cp.random.random((cfg.dim, cfg.dim))\n", - " D = cp.diag(cp.random.permutation(lam))\n", - " A = ((P @ D) @ cp.linalg.inv(P))\n", - " return A\n", - "\n", - "def estimate_device(A, cfg=PowerIterationConfig()):\n", - " A_gpu = cp.asarray(A) # If `A` is on the host, copy from host to device.\n", - " # Otherwise, does nothing.\n", - "\n", - " x = cp.ones(A_gpu.shape[0], dtype=np.float64)\n", - "\n", - " for i in range(0, cfg.max_steps, cfg.check_frequency):\n", - " y = A_gpu @ x\n", - " lam = (x @ y) / (x @ x) # Rayleigh quotient.\n", - " res = cp.linalg.norm(y - lam * x)\n", - " x = y / cp.linalg.norm(y) # Normalize for next step.\n", - "\n", - " if cfg.progress:\n", - " print(f\"step {i}: residual = {res:.3e}\")\n", - "\n", - " np.savetxt(f\"device_{i}.txt\", cp.asnumpy(x)) # Copy from device to host\n", - " # and save a checkpoint.\n", - "\n", - " if res < cfg.residual_threshold:\n", - " break\n", - "\n", - " for _ in range(cfg.check_frequency - 1):\n", - " y = A_gpu @ x # We have to use `A_gpu` here as well.\n", - " x = y / cp.linalg.norm(y) # Normalize 
for next step.\n", - "\n", - " return cp.asnumpy((x.T @ (A_gpu @ x)) / (x.T @ x)) # Copy from device to host.\n", - "\n", - "A_device = generate_device()\n", - "\n", - "# Warmup to ensure modules are loaded and code is JIT compiled before timing.\n", - "estimate_device(A_device, cfg=PowerIterationConfig(progress=False))\n", - "\n", - "start = cp.cuda.get_current_stream().record()\n", - "lam_est_device = estimate_device(A_device).item()\n", - "stop = cp.cuda.get_current_stream().record()\n", - "\n", - "duration = cp.cuda.get_elapsed_time(start, stop) / 1e3\n", - "\n", - "print()\n", - "print(f\"GPU Execution Time: {duration:.3f} s\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1lyhHnzrdXzI" - }, - "source": [ - "Now let's profile our code by running it under the Nsight Systems `nsys` tool. The syntax for this is `nsys `. It will run your program while collecting a birdseye view of everything going on in your program." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1HU5p1IhAkTA" - }, - "outputs": [], - "source": [ - "!nsys profile --cuda-event-trace=false --force-overwrite true -o power_iteration__baseline python power_iteration__baseline.py" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IlGIAIEPe3SV" - }, - "source": [ - "Now let's view our report and explore what's going on in our program.\n", - "\n", - "**TODO: Run the next cell, which will generate the report and create a button that when clicked will open it up in [Perfetto](https://ui.perfetto.dev/), a web-based no-install visual profiler.**\n", - "\n", - "**EXTRA CREDIT: Download the [Nsight Systems GUI](https://developer.nvidia.com/nsight-systems) and open the report in it to see even more information.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "s6VVOnGQR3Ph" - }, - "outputs": [], - "source": [ - "import nsightful\n", - "\n", - "!nsys export --type sqlite --quiet true 
--force-overwrite true power_iteration__baseline.nsys-rep\n", - "nsightful.display_nsys_sqlite_file_in_notebook(\"power_iteration__baseline.sqlite\", title=\"Power Iteration - Baseline\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bGxz6-spplcU" - }, - "source": [ - "Nsight Systems shows us a lot of information - sometimes it's too much and not all relevant.\n", - "\n", - "There's two ways that we can filter and annotate what we see in Nsight systems.\n", - "\n", - "The first is to limit when we start and stop profiling in the program. In Python, we can do this with `cupyx.profiler.profile()`, which give us a Python context manager. Any CUDA code used during scope will be included in the profile.\n", - "\n", - "```\n", - "not_in_the profile()\n", - "with cpx.profiler.profile():\n", - " in_the_profile()\n", - "not_in_the_profile()\n", - "```\n", - "\n", - "For this to work, we have to pass `--capture-range=cudaProfilerApi --capture-range-end=stop` as flags to `nsys`.\n", - "\n", - "We can also annotate specific regions of our code, which will show up in the profiler. We can even add categories, domains, and colors to these regions, and they can be nested. To add these annotations, we use `nvtx.annnotate()`, another Python context manager, this time from a library called [NVTX](http://nvtx.readthedocs.io/en/latest/reference.html).\n", - "\n", - "```\n", - "with nvtx.annotate(\"Loop\")\n", - " for i in range(20):\n", - " with nvtx.annotate(f\"Step {i}\"):\n", - " pass\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PF7PUALVfX3A" - }, - "source": [ - "**TODO: Go back to the earlier cells and improve the profile results by adding:**\n", - "- **`nvtx.annotate()` regions. 
Remember, you can nest them.**\n", - "- **A `cpx.profiler.profile()` around the `start =`/`stop =` lines that run the solver.**\n", - "- **`--capture-range=cudaProfilerApi --capture-range-end=stop` to the `nsys` flags.**\n", - "\n", - "**Then, capture another profile and see if you can identify how we can improve the code. Specifically, think about how we could add more asynchrony.**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Remember what we've learned about streams and how to use them with CuPy:\n", - "\n", - "- By default, all CuPy operations within a single thread run on the same stream. You can access this stream with `cp.cuda.get_current_stream()`. \n", - "- You can create a new stream with `cp.cuda.Stream(non_blocking=True)`. Use `with` statements to use the stream for all CuPy operations within a block.\n", - "- You can record an event on a stream by calling `.record()` on it.\n", - "- You can synchronize on an event (or an entire stream) by calling `.synchronize()` on it.\n", - "- Memory transfers will block by default. You can launch them asynchronously with `cp.asarray(..., blocking=False)` (for host to device transfers) and `cp.asnumpy(..., blocking=False)` (for device to host transfers).\n", - "\n", - "**TODO: Copy the kernel from the earlier cell with your NVTX and CuPy profiler regions into the cell below. Then, try to improve performance by adding asynchrony. 
Make sure that you don't copy and paste the `%%writefile` directive.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pszz-k8cDfqy" - }, - "outputs": [], - "source": [ - "%%writefile power_iteration__async.py\n", - "\n", - "import numpy as np\n", - "import cupy as cp\n", - "import cupyx as cpx\n", - "import nvtx\n", - "from dataclasses import dataclass\n", - "\n", - "@dataclass\n", - "class PowerIterationConfig:\n", - " dim: int = 8192\n", - " dominance: float = 0.05\n", - " max_steps: int = 1000\n", - " check_frequency: int = 10\n", - " progress: bool = True\n", - " residual_threshold: float = 1e-10\n", - "\n", - "def generate_device(cfg=PowerIterationConfig()):\n", - " cp.random.seed(42)\n", - " weak_lam = cp.random.random(cfg.dim - 1) * (1.0 - cfg.dominance)\n", - " lam = cp.random.permutation(cp.concatenate((cp.asarray([1.0]), weak_lam)))\n", - " P = cp.random.random((cfg.dim, cfg.dim))\n", - " D = cp.diag(cp.random.permutation(lam))\n", - " A = ((P @ D) @ cp.linalg.inv(P))\n", - " return A\n", - "\n", - "def estimate_device(A, cfg=PowerIterationConfig()):\n", - " raise NotImplementedError(\"TODO: You need to implement this kernel!\")\n", - "\n", - "A_device = generate_device()\n", - "\n", - "# Warmup to ensure modules are loaded and code is JIT compiled before timing.\n", - "estimate_device(A_device, cfg=PowerIterationConfig(progress=False))\n", - "\n", - "start = cp.cuda.get_current_stream().record()\n", - "lam_est_device = estimate_device(A_device).item()\n", - "stop = cp.cuda.get_current_stream().record()\n", - "\n", - "duration = cp.cuda.get_elapsed_time(start, stop) / 1e3\n", - "\n", - "print()\n", - "print(f\"GPU Execution Time: {duration:.3f} s\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VFCYIqwaKYqy" - }, - "source": [ - "Now let's make sure it works:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uSPFNIb9KcPb" - }, - "outputs": [], - "source": 
[ - "!python power_iteration__async.py" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "o4WJVFBkkRaN" - }, - "source": [ - "Before we profile the improved code, let's compare the execution times of both." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BtQR4CHikWFK" - }, - "outputs": [], - "source": [ - "power_iteration_baseline_output = !python power_iteration__baseline.py\n", - "power_iteration_baseline_duration = float(power_iteration_baseline_output[-1].split()[-2])\n", - "power_iteration_async_output = !python power_iteration__async.py\n", - "power_iteration_async_duration = float(power_iteration_async_output[-1].split()[-2])\n", - "speedup = power_iteration_baseline_duration / power_iteration_async_duration\n", - "\n", - "print(f\"GPU Execution Time\")\n", - "print()\n", - "print(f\"power_iteration_baseline: {power_iteration_baseline_duration:.3f} s\")\n", - "print(f\"power_iteration_async: {power_iteration_async_duration:.3f} s\")\n", - "print(f\"power_iteration_async speedup over power_iteration_baseline: {speedup:.2f}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Pnvne_F4jYTh" - }, - "source": [ - "Next, let's capture a profile report of our improved code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mWXBvi-hFGhU" - }, - "outputs": [], - "source": [ - "!nsys profile --cuda-event-trace=false --capture-range=cudaProfilerApi --capture-range-end=stop --force-overwrite true -o power_iteration__async python power_iteration__async.py" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b1sI3RPojxP1" - }, - "source": [ - "Finally, let's look at the profile in Perfetto and confirm we've gotten rid of the idling." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KANrefszlkmF" - }, - "outputs": [], - "source": [ - "!nsys export --type sqlite --quiet true --force-overwrite true power_iteration__async.nsys-rep\n", - "nsightful.display_nsys_sqlite_file_in_notebook(\"power_iteration__async.sqlite\", title=\"Power Iteration - Async Event\")" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tutorials/accelerated-python/notebooks/kernels/40__kernel_authoring__copy.ipynb b/tutorials/accelerated-python/notebooks/kernels/40__kernel_authoring__copy.ipynb index f792275a..b70d6251 100644 --- a/tutorials/accelerated-python/notebooks/kernels/40__kernel_authoring__copy.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/40__kernel_authoring__copy.ipynb @@ -6,11 +6,22 @@ "id": "-JpGaP7-D_5W" }, "source": [ - "## Exercise - Kernel Authoring - Copy\n", + "## Kernel Authoring - Copy\n", "\n", - "In this exercise, we'll learn how to analyze and reason about the performance of CUDA kernels using the NVIDIA Nsight Compute profiler.\n", + "### Table of Contents\n", + "1. [Environment Setup](#1-environment-setup)\n", + "2. [The Baseline Kernel: Blocked Copy](#2-the-baseline-kernel-blocked-copy)\n", + "3. [Profiling the Baseline](#3-profiling-the-baseline)\n", + "4. [Optimization Challenge: Improved Memory Access](#4-optimization-challenge-improved-memory-access)\n", + "5. [Verification & Benchmarking](#5-verification--benchmarking)\n", + "6. [Profiling the Optimized Kernel](#6-profiling-the-optimized-kernel)\n", + "7. 
[Further Exploration](#7-further-exploration)\n", "\n", - "We'll look at a few different ways of writing a simple kernel that copies items from one array to another.\n", + "---\n", + "\n", + "## 1. Environment Setup\n", + "\n", + "In this exercise, we'll learn how to analyze and reason about the performance of CUDA kernels using the NVIDIA Nsight Compute profiler. We'll look at a few different ways of writing a simple kernel that copies items from one array to another.\n", "\n", "First, we need to make sure the Nsight Compute profiler, Nsightful, Numba CUDA, and CuPy are available in our notebook:" ] @@ -44,6 +55,8 @@ "id": "A1SfTQk0EwUl" }, "source": [ + "## 2. The Baseline Kernel: Blocked Copy\n", + "\n", "Now, we'll write our first kernel. Each thread will copy `items_per_thread` items from the `src` array to the `dst` array. We'll set the number of threads per block to a constant, `threads_per_block`. We'll calculate how many blocks to launch based on `items_per_thread` and `threads_per_block`. We use `cuda.grid(1)` to get the unique global 1D index of each thread.\n", "\n", "Each thread will copy a contiguous set of items, e.g. the items with indices `[base, base + items_per_thread)`:\n", @@ -102,11 +115,11 @@ "id": "TuR4yDV4H6IB" }, "source": [ - "Next, we'll actually run the code, by invoking the Nsight Compute `ncu` command line tool. The basic syntax for this tool is `ncu `, which will run ` ` while gathering a profile on how your kernels are performing. We're passing it some flags that describe what data it should collect and where it should save the results.\n", + "## 3. Profiling the Baseline\n", "\n", - "There is an overhead to running code under the profiler. Your program may execute noticably slower.\n", + "Next, we'll actually run the code by invoking the Nsight Compute `ncu` command line tool. The basic syntax for this tool is `ncu `, which will run ` ` while gathering a profile on how your kernels are performing. 
We're passing it some flags that describe what data it should collect and where it should save the results.\n", "\n", - "When profiling and benchmarking, we need to run with a sufficient workload to get meaningful and representative results. If your runtime is too short, the profiler may not be able to report some metrics or the results may be inaccurate.\n", + "There is an overhead to running code under the profiler. Your program may execute noticeably slower.\n", "\n", "**NOTE: To modify and rerun the above code, you must execute the previous cell to write the file and this one to execute it.**" ] @@ -131,9 +144,9 @@ "source": [ "Let's take a look at the profiling report on the kernel. When you run the next cell, a number of tabs will be displayed. The first tab will have a summary of all of the Nsight recommendations and advisories. Subsequent tabs will have more detailed information on a particular area.\n", "\n", - "**TODO: Spend a few minutes reviewing the report. What stands out to you? Based on the information in the report, how can the kernel be improved?**\n", + "**TODO:** Spend a few minutes reviewing the report. What stands out to you? Based on the information in the report, how can the kernel be improved?\n", "\n", - "**EXTRA CREDIT: Download the [Nsight Compute GUI](https://developer.nvidia.com/nsight-compute) and open the report in it to see even more information.**" + "**EXTRA CREDIT:** Download the [Nsight Compute GUI](https://developer.nvidia.com/nsight-compute) and open the report in it to see even more information." ] }, { @@ -155,11 +168,13 @@ "id": "mL_9xT44qbMA" }, "source": [ - "**TODO: Now try to write a better version of our copy kernel.**\n", + "## 4. 
Optimization Challenge: Improved Memory Access\n", + "\n", + "**TODO:** Now try to write a better version of our copy kernel.\n", "\n", "As a hint, given that this kernel does no compute and just moves data, our memory access patterns are probably important!\n", "\n", - "Instead of using the `cuda.grid` utility, you may want to use the hierachical coordinates of our thread to calculate the index:\n", + "Instead of using the `cuda.grid` utility, you may want to use the hierarchical coordinates of our thread to calculate the index:\n", "\n", "- `cuda.blockDim.x`: The number of threads per block.\n", "- `cuda.blockIdx.x`: The global index of the current thread block.\n", @@ -213,6 +228,8 @@ "id": "qco9XOsTkPEJ" }, "source": [ + "## 5. Verification & Benchmarking\n", + "\n", "Now, let's make sure our code works:" ] }, @@ -259,6 +276,8 @@ "id": "mfrqUdzozGeU" }, "source": [ + "## 6. Profiling the Optimized Kernel\n", + "\n", "Hopefully you see quite a speedup! Now let's profile the optimized variant:" ] }, @@ -300,7 +319,9 @@ "id": "Xn6IPpuxD_kz" }, "source": [ - "**EXTRA CREDIT: Experiment with different problem sizes, threads per block, and items per thread. You can pass them as command line arguments to the Python scripts. If you're feeling really ambitious, do a parameter sweep to study the impact these knobs have on performance.**" + "## 7. Further Exploration\n", + "\n", + "**EXTRA CREDIT:** Experiment with different problem sizes, threads per block, and items per thread. You can pass them as command line arguments to the Python scripts. If you're feeling really ambitious, do a parameter sweep to study the impact these knobs have on performance." 
] } ], diff --git a/tutorials/accelerated-python/notebooks/kernels/41__kernel_authoring__book_histogram.ipynb b/tutorials/accelerated-python/notebooks/kernels/41__kernel_authoring__book_histogram.ipynb index 1839b1ea..17678c1b 100644 --- a/tutorials/accelerated-python/notebooks/kernels/41__kernel_authoring__book_histogram.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/41__kernel_authoring__book_histogram.ipynb @@ -7,11 +7,22 @@ "id": "f1a8560a-c91b-48db-af1c-18fcd4892448" }, "source": [ - "## Exercise - Kernel Authoring - Book Histogram\n", + "# Kernel Authoring - Book Histogram\n", + "\n", + "## Table of Contents\n", + "\n", + "1. [Environment Setup & Data Download](#1.-Environment-Setup-&-Data-Download)\n", + "2. [First Attempt: Global Memory Histogram](#2.-First-Attempt:-Global-Memory-Histogram)\n", + "3. [Fixing Data Races with Atomics](#3.-Fixing-Data-Races-with-Atomics)\n", + "4. [Profiling the Naive Solution](#4.-Profiling-the-Naive-Solution)\n", + "5. [Optimization: Shared Memory & Cooperative Groups](#5.-Optimization:-Shared-Memory-&-Cooperative-Groups)\n", + "6. [Performance Comparison](#6.-Performance-Comparison)\n", + "\n", + "## 1. Environment Setup & Data Download\n", "\n", "Let's learn to use some advanced CUDA features like shared memory, atomics, and [cuda.cooperative](https://nvidia.github.io/cccl/python/cooperative.html) to write an efficient histogram kernel to determine the most frequent characters in a collection of books.\n", "\n", - "First, let's download our dataset." + "First, let's download our dataset and install the necessary tools." ] }, { @@ -63,6 +74,8 @@ "id": "9109d3c0-e276-44cc-9f36-f8c79eb48b31" }, "source": [ + "## 2. First Attempt: Global Memory Histogram\n", + "\n", "A histogram kernel counts the number of times a value occurs in a dataset. To implement this, we create an array that is large enough to store all possible values (in the case of counting 1-byte ASCII characters, 256 elements). 
Then for the value of each element in the dataset, we increment its location in the array.\n", "\n", "Let's try a simple way to implement this:" @@ -168,11 +181,13 @@ "id": "b14fa522-b41b-4538-8c34-ecc355e55116" }, "source": [ + "## 3. Fixing Data Races with Atomics\n", + "\n", "It looks like something is wrong - our counts are very low, and the most common characters don't make a lot of sense. Many of our increments seem to get lost!\n", "\n", "What's happening here is called a data race. Many different threads are trying to access the bins of the histogram at the same time.\n", "\n", - "Imagine that two threads are trying to update the same bin.\n", + "Imagine that two threads are trying to update the same bin:\n", "\n", "- Thread 0 reads the count of the bin, which is 0, and stores it in its local variable `old_count`.\n", "- Thread 0 adds 1 to its `old_count`, producing a `new_count` of 1.\n", @@ -193,6 +208,8 @@ "id": "08f4dded-26a7-4ef8-b981-e00c569ca4d0" }, "source": [ + "## 4. Profiling the Naive Solution\n", + "\n", "Now let's profile our code." ] }, @@ -228,9 +245,11 @@ "id": "e1f72831-780f-4cf5-8ff1-2092ecb193d9" }, "source": [ + "## 5. Optimization: Shared Memory & Cooperative Groups\n", + "\n", "Looking at the profile trace, it seems like our code is quite slow - look at the memory workload tab and see how low the throughput is!\n", "\n", - "One improvement we should make is to separate loading from values from the histogram update and to perform striped loads. We'll use [cuda.cooperative](https://nvidia.github.io/cccl/python/cooperative.html)'s block load instead of writing this by hand.\n", "\n", + "One improvement we should make is to separate loading values from the histogram update and to perform striped loads (which enable coalesced memory access). 
We'll use [cuda.cooperative](https://nvidia.github.io/cccl/python/cooperative.html)'s block load instead of writing this by hand.\n", "\n", "**TODO: Rewrite the code below to use `cuda.cooperative` to load from `values` into local memory.**\n", "- **Create a `coop.block.load(dtype, threads_per_block, items_per_thread, algorithm)` object outside of the kernel.**\n", @@ -308,7 +327,7 @@ "id": "fd090ee6-a5d3-46f6-a58e-d34e077a99c0" }, "source": [ - "Now let's profile our code." + "Now let's run the code and profile it." ] }, { @@ -366,6 +385,16 @@ "nsightful.display_ncu_csv_in_notebook(histogram_localized_csv)" ] }, + { + "cell_type": "markdown", + "id": "23df6c7a", + "metadata": {}, + "source": [ + "## 6. Performance Comparison\n", + "\n", + "Let's compare the execution time of our naive global memory implementation against our optimized shared memory implementation." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tutorials/accelerated-python/notebooks/libraries/20__cudf__nyc_parking_violations.ipynb b/tutorials/accelerated-python/notebooks/libraries/20__cudf__nyc_parking_violations.ipynb index b983bd08..3ef6f86c 100644 --- a/tutorials/accelerated-python/notebooks/libraries/20__cudf__nyc_parking_violations.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/20__cudf__nyc_parking_violations.ipynb @@ -1,2436 +1,667 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "c1032e41-4396-4ef7-b58a-8c592ad9226f", - "metadata": {}, - "source": [ - "## Exercise - cuDF - NYC Parking Violations" - ] - }, - { - "cell_type": "markdown", - "id": "b2ca1821-ab7d-4f35-8c00-4452e7384901", - "metadata": {}, - "source": [ - "We've learned how to work with numeric data using CuPy. But many applications, in data science and machine learning involve other kinds of data, like dates and strings. \n", - "\n", - "[cuDF](https://docs.rapids.ai/api/cudf/stable/) is a DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. 
It offers both a [Pandas](https://docs.rapids.ai/api/cudf/stable/cudf_pandas/) and a [Polars](https://docs.rapids.ai/api/cudf/stable/cudf_polars/) API." - ] - }, - { - "cell_type": "markdown", - "id": "6db517aa-5cb9-415f-b1b1-fd9cd9b2f97f", - "metadata": {}, - "source": [ - "### A quick Pandas introduction" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "68edd7ee-1724-46c5-b301-217665bde5de", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "id": "2ca65219-e77e-4513-9074-89ed031fb9dc", - "metadata": {}, - "source": [ - "#### Series and DataFrame objects" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a3bbb246-324a-4385-a3b8-fb4cb12161e2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 1\n", - "1 2\n", - "2 3\n", - "dtype: int64" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s = pd.Series([1, 2, 3])\n", - "s" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9bb1ccdc-f96e-4737-ba83-44218ac4ab94", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Max value: 3\n", - "Mean value: 2.0\n" - ] - } - ], - "source": [ - "print(\"Max value: \", s.max())\n", - "print(\"Mean value: \", s.mean())" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "ffed09f3-e90a-4bcf-a673-5460c9e51936", - "metadata": {}, - "outputs": [], - "source": [ - "s = pd.Series([\"one\", \"two\", \"three\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "ea45fd58-4450-4679-85b9-eaa0a8434c13", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Max value: two\n" - ] - } - ], - "source": [ - "print(\"Max value: \", s.max())" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "02d9e821-89c5-494b-a700-16effae5f80d", - "metadata": {}, - 
"outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1113
2243
3173
4324
5205
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "1 1 1 3\n", - "2 2 4 3\n", - "3 1 7 3\n", - "4 3 2 4\n", - "5 2 0 5" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.DataFrame({\n", - " \"a\" : [1, 2, 1, 3, 2],\n", - " \"b\" : [1, 4, 7, 2, 0],\n", - " \"c\" : [3, 3, 3, 4, 5]\n", - "}, index = [1, 2, 3, 4, 5])\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "26089526-5486-48ed-a2e3-d1771dcb5647", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index([1, 2, 3, 4, 5], dtype='int64')" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.index" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "ce752fbe-20bd-4f76-89cb-1e31d8fbd758", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['a', 'b', 'c'], dtype='object')" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.columns" - ] - }, - { - "cell_type": "markdown", - "id": "cbea2a91-09da-448f-82bb-fb037c6de9b0", - "metadata": {}, - "source": [ - "#### Selecting and filtering data" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "039116b6-8f7a-4ca8-a388-6b39ecc437ff", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1113
2243
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "1 1 1 3\n", - "2 2 4 3" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "16b8f18f-ef86-4481-827e-bcfcac16a794", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
4324
5205
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "4 3 2 4\n", - "5 2 0 5" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.tail(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "1e427550-57bf-4125-a0a0-5204f6311744", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1 1\n", - "2 2\n", - "3 1\n", - "4 3\n", - "5 2\n", - "Name: a, dtype: int64" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"a\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "d1be2de2-7468-4e5f-a95e-a086dd6feed4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bc
113
243
373
424
505
\n", - "
" - ], - "text/plain": [ - " b c\n", - "1 1 3\n", - "2 4 3\n", - "3 7 3\n", - "4 2 4\n", - "5 0 5" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[[\"b\", \"c\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "f86e8a29-a718-4d1d-b2fc-f818a52fef63", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1113
2243
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "1 1 1 3\n", - "2 2 4 3" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.iloc[0:2]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "e699a355-37cc-49d4-b673-8ef178d490d7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "b 1\n", - "c 3\n", - "Name: 1, dtype: int64" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.iloc[0, 1:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "638b4090-a650-4447-8227-1d659a96178f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bc
243
373
\n", - "
" - ], - "text/plain": [ - " b c\n", - "2 4 3\n", - "3 7 3" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.loc[2:3, \"b\":\"c\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "08e001b8-47ec-4609-9b49-d887abaaa14d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
2243
4324
5205
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "2 2 4 3\n", - "4 3 2 4\n", - "5 2 0 5" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['a'] > 1]" - ] - }, - { - "cell_type": "markdown", - "id": "b3d72acc-7c32-4e6e-aab7-ec6b3d5e71c5", - "metadata": {}, - "source": [ - "#### Sorting" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "30f331f7-b7f6-4fc3-a0bc-fe7b07ff1f10", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1113
3173
2243
5205
4324
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "1 1 1 3\n", - "3 1 7 3\n", - "2 2 4 3\n", - "5 2 0 5\n", - "4 3 2 4" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.sort_values(\"a\")" - ] - }, - { - "cell_type": "markdown", - "id": "e308b639-7b7f-45ba-977c-66ab018d5bcf", - "metadata": {}, - "source": [ - "#### Summarizing Data" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "f3be893b-ddab-4bff-bac4-a822174aed86", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "a 9\n", - "b 14\n", - "c 18\n", - "dtype: int64" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "b0fce8b9-21ed-4bad-a416-9c8b5b323e28", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "np.float64(1.8)" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"a\"].mean()" - ] - }, - { - "cell_type": "markdown", - "id": "a09f30d8-1a81-45fa-b919-00ce75337d5c", - "metadata": {}, - "source": [ - "#### Grouped aggregations" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "d7d7a8bc-0b00-4dbe-89de-f6119b305594", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "a\n", - "1 2\n", - "2 2\n", - "3 1\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"a\"].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "81f81052-7e85-4697-a3f3-f602990f963f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "c\n", - "3 3\n", - "4 1\n", - "5 1\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"c\"].value_counts()" - ] - }, 
- { - "cell_type": "code", - "execution_count": 22, - "id": "a1f1a795-2db1-4411-8e70-9f98b096f956", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bc
a
14.03.0
22.04.0
32.04.0
\n", - "
" - ], - "text/plain": [ - " b c\n", - "a \n", - "1 4.0 3.0\n", - "2 2.0 4.0\n", - "3 2.0 4.0" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby(\"a\").mean()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "80f73c53-bcb3-48cf-b456-fba8faafe021", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ab
c
333
411
511
\n", - "
" - ], - "text/plain": [ - " a b\n", - "c \n", - "3 3 3\n", - "4 1 1\n", - "5 1 1" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby(\"c\").count()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "18a5723a-629e-424c-9d59-bc2c0c315a31", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bc
minmeanmax
a
114.03
202.05
322.04
\n", - "
" - ], - "text/plain": [ - " b c\n", - " min mean max\n", - "a \n", - "1 1 4.0 3\n", - "2 0 2.0 5\n", - "3 2 2.0 4" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby(\"a\").agg({\"b\": [\"min\", \"mean\"], \"c\": [\"max\"]})" - ] - }, - { - "cell_type": "markdown", - "id": "a5d0979a-d6e8-45a3-856b-d3d5ad52be4a", - "metadata": {}, - "source": [ - "#### String operations" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "e0707c98-9178-46f4-8ef9-fe75786b7f4b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcd
1113mario
2243luigi
3173yoshi
4324peach
5205toad
\n", - "
" - ], - "text/plain": [ - " a b c d\n", - "1 1 1 3 mario\n", - "2 2 4 3 luigi\n", - "3 1 7 3 yoshi\n", - "4 3 2 4 peach\n", - "5 2 0 5 toad" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"d\"] = [\"mario\", \"luigi\", \"yoshi\", \"peach\", \"toad\"]\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "98379444-e924-4bd1-a2d5-cfd86bd551a4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1 MARIO\n", - "2 LUIGI\n", - "3 YOSHI\n", - "4 PEACH\n", - "5 TOAD\n", - "Name: d, dtype: object" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[\"d\"].str.upper()" - ] - }, - { - "cell_type": "markdown", - "id": "36586199-7a07-406b-b92b-f17f5f6ecee9", - "metadata": {}, - "source": [ - "#### Time Series" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "4a6166f7-b6b1-453f-97da-e79a6e178f5c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datevalue
02018-11-200.453224
12018-11-210.127411
22018-11-220.439165
32018-11-230.694320
42018-11-240.979057
.........
672019-01-260.326564
682019-01-270.303860
692019-01-280.936735
702019-01-290.307780
712019-01-300.353858
\n", - "

72 rows \u00d7 2 columns

\n", - "
" - ], - "text/plain": [ - " date value\n", - "0 2018-11-20 0.453224\n", - "1 2018-11-21 0.127411\n", - "2 2018-11-22 0.439165\n", - "3 2018-11-23 0.694320\n", - "4 2018-11-24 0.979057\n", - ".. ... ...\n", - "67 2019-01-26 0.326564\n", - "68 2019-01-27 0.303860\n", - "69 2019-01-28 0.936735\n", - "70 2019-01-29 0.307780\n", - "71 2019-01-30 0.353858\n", - "\n", - "[72 rows x 2 columns]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "\n", - "date_df = pd.DataFrame()\n", - "date_df[\"date\"] = pd.date_range(\"11/20/2018\", periods=72, freq=\"D\")\n", - "date_df[\"value\"] = np.random.sample(len(date_df))\n", - "date_df" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "c8ee517e-f016-4547-8cb1-0715c456236d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datevalue
02018-11-200.453224
12018-11-210.127411
22018-11-220.439165
32018-11-230.694320
\n", - "
" - ], - "text/plain": [ - " date value\n", - "0 2018-11-20 0.453224\n", - "1 2018-11-21 0.127411\n", - "2 2018-11-22 0.439165\n", - "3 2018-11-23 0.694320" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_df[date_df[\"date\"] < \"2018-11-24\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "56ae6716-a0f0-42f2-b701-e3b86c1d6f06", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datevalueyear
02018-11-200.4532242018
12018-11-210.1274112018
22018-11-220.4391652018
32018-11-230.6943202018
42018-11-240.9790572018
............
672019-01-260.3265642019
682019-01-270.3038602019
692019-01-280.9367352019
702019-01-290.3077802019
712019-01-300.3538582019
\n", - "

72 rows \u00d7 3 columns

\n", - "
" - ], - "text/plain": [ - " date value year\n", - "0 2018-11-20 0.453224 2018\n", - "1 2018-11-21 0.127411 2018\n", - "2 2018-11-22 0.439165 2018\n", - "3 2018-11-23 0.694320 2018\n", - "4 2018-11-24 0.979057 2018\n", - ".. ... ... ...\n", - "67 2019-01-26 0.326564 2019\n", - "68 2019-01-27 0.303860 2019\n", - "69 2019-01-28 0.936735 2019\n", - "70 2019-01-29 0.307780 2019\n", - "71 2019-01-30 0.353858 2019\n", - "\n", - "[72 rows x 3 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_df[\"year\"] = date_df[\"date\"].dt.year\n", - "date_df" - ] - }, - { - "cell_type": "markdown", - "id": "70b8dec0-140c-4f1c-b9af-4aa2df6f79ca", - "metadata": {}, - "source": [ - "#### User-defined operations" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "dac6501e-fe69-4028-84e3-d77e3bb79321", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abcd
11113mario
21243luigi
31173yoshi
41324peach
51205toad
\n", - "
" - ], - "text/plain": [ - " a b c d\n", - "1 11 1 3 mario\n", - "2 12 4 3 luigi\n", - "3 11 7 3 yoshi\n", - "4 13 2 4 peach\n", - "5 12 0 5 toad" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def add_ten(x):\n", - " return x + 10\n", - "\n", - "df[\"a\"] = df[\"a\"].apply(add_ten)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "id": "d1cc177a-809f-4033-8017-74a698635663", - "metadata": {}, - "source": [ - "### Now let's do the same thing with cuDF" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "a9c1c477-4cd9-4320-8341-891385ba1ca0", - "metadata": {}, - "outputs": [], - "source": [ - "import cudf" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "b078f895-6bc9-447d-8f7e-5e414bdaa77d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
abc
1111
2241
3178
4322
5209
\n", - "
" - ], - "text/plain": [ - " a b c\n", - "1 1 1 1\n", - "2 2 4 1\n", - "3 1 7 8\n", - "4 3 2 2\n", - "5 2 0 9" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = cudf.DataFrame({\n", - " \"a\" : [1, 2, 1, 3, 2],\n", - " \"b\" : [1, 4, 7, 2, 0],\n", - " \"c\" : [1, 1, 8, 2, 9]\n", - "}, index = [1, 2, 3, 4, 5])\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "6f992143-c99a-481a-8130-b9f2106aa91c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "cudf.core.dataframe.DataFrame" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "336eaeac-49d9-4fa9-bd9b-888eec5d9f60", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bc
241
378
\n", - "
" - ], - "text/plain": [ - " b c\n", - "2 4 1\n", - "3 7 8" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.loc[2:3, \"b\":\"c\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "51f48410-e149-4014-b2f5-7969c2e70c9b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
bc
minmeanmax
a
202.09
114.08
322.02
\n", - "
" - ], - "text/plain": [ - " b c\n", - " min mean max\n", - "a \n", - "2 0 2.0 9\n", - "1 1 4.0 8\n", - "3 2 2.0 2" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.groupby(\"a\").agg({\"b\": [\"min\", \"mean\"], \"c\": [\"max\"]})" - ] - }, - { - "cell_type": "markdown", - "id": "37b94b6f-92b2-4369-831b-2d9e3347d0ca", - "metadata": {}, - "source": [ - "Some things are different though!" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "b98f6e24-6dd7-4375-b8a3-c665d15f3cf0", - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Error parsing datetime string \"11/20/2018\" at position 2", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[36]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n\u001b[32m 3\u001b[39m date_df = cudf.DataFrame()\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m date_df[\u001b[33m\"\u001b[39m\u001b[33mdate\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[43mcudf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdate_range\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m11/20/2018\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiods\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m72\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreq\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mD\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m 
date_df[\u001b[33m\"\u001b[39m\u001b[33mvalue\u001b[39m\u001b[33m\"\u001b[39m] = np.random.sample(\u001b[38;5;28mlen\u001b[39m(date_df))\n\u001b[32m 6\u001b[39m date_df\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/core/tools/datetimes.py:900\u001b[39m, in \u001b[36mdate_range\u001b[39m\u001b[34m(start, end, periods, freq, tz, normalize, name, inclusive, unit)\u001b[39m\n\u001b[32m 895\u001b[39m start = (\n\u001b[32m 896\u001b[39m pd.Timestamp(end)\n\u001b[32m 897\u001b[39m - (periods - \u001b[32m1\u001b[39m) * offset._maybe_as_fast_pandas_offset()\n\u001b[32m 898\u001b[39m ).to_numpy()\n\u001b[32m 899\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m end \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m900\u001b[39m start = \u001b[43mdtype\u001b[49m\u001b[43m.\u001b[49m\u001b[43mtype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstart\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43munit\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 901\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m periods \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 902\u001b[39m \u001b[38;5;66;03m# When `periods` is unspecified, its upper bound estimated by\u001b[39;00m\n\u001b[32m 903\u001b[39m \u001b[38;5;66;03m# dividing the number of nanoseconds between two timestamps with\u001b[39;00m\n\u001b[32m 904\u001b[39m \u001b[38;5;66;03m# the lower bound of `freq` in nanoseconds. While the final result\u001b[39;00m\n\u001b[32m 905\u001b[39m \u001b[38;5;66;03m# may contain extra elements that exceeds `end`, they are trimmed\u001b[39;00m\n\u001b[32m 906\u001b[39m \u001b[38;5;66;03m# as a post processing step. 
[1]\u001b[39;00m\n\u001b[32m 907\u001b[39m _periods_not_specified = \u001b[38;5;28;01mTrue\u001b[39;00m\n", - "\u001b[31mValueError\u001b[39m: Error parsing datetime string \"11/20/2018\" at position 2" - ] - } - ], - "source": [ - "import numpy as np\n", - "\n", - "date_df = cudf.DataFrame()\n", - "date_df[\"date\"] = cudf.date_range(\"11/20/2018\", periods=72, freq=\"D\")\n", - "date_df[\"value\"] = np.random.sample(len(date_df))\n", - "date_df" - ] - }, - { - "cell_type": "markdown", - "id": "0f8d1a62-bb1a-497e-a6dc-745548d33626", - "metadata": {}, - "source": [ - "Unlike Pandas, cuDF does not (yet) have the ability to interpret the date `\"11/20/2018\"`, instead use the more standard `\"2018-11-20\"`:" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "09bd617a-b60c-4452-8974-4d74a30cd4da", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datevalue
02018-11-200.896092
12018-11-210.524976
22018-11-220.375614
32018-11-230.950067
42018-11-240.291348
.........
672019-01-260.099531
682019-01-270.372808
692019-01-280.379842
702019-01-290.164194
712019-01-300.577193
\n", - "

72 rows \u00d7 2 columns

\n", - "
" - ], - "text/plain": [ - " date value\n", - "0 2018-11-20 0.896092\n", - "1 2018-11-21 0.524976\n", - "2 2018-11-22 0.375614\n", - "3 2018-11-23 0.950067\n", - "4 2018-11-24 0.291348\n", - ".. ... ...\n", - "67 2019-01-26 0.099531\n", - "68 2019-01-27 0.372808\n", - "69 2019-01-28 0.379842\n", - "70 2019-01-29 0.164194\n", - "71 2019-01-30 0.577193\n", - "\n", - "[72 rows x 2 columns]" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_df = cudf.DataFrame()\n", - "date_df[\"date\"] = cudf.date_range(\"2018-11-20\", periods=72, freq=\"D\")\n", - "date_df[\"value\"] = np.random.sample(len(date_df))\n", - "date_df" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "7bc1ecd6-80ef-403e-a624-737efc63fb11", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datevalue
02018-11-200.896092
12018-11-210.524976
22018-11-220.375614
32018-11-230.950067
\n", - "
" - ], - "text/plain": [ - " date value\n", - "0 2018-11-20 0.896092\n", - "1 2018-11-21 0.524976\n", - "2 2018-11-22 0.375614\n", - "3 2018-11-23 0.950067" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "date_df[date_df[\"date\"] < \"2018-11-24\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "ad359f13-f070-4e0f-a9b6-8d02451e7aab", - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "user defined function compilation failed.", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mnvJitLinkError\u001b[39m Traceback (most recent call last)", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/core/indexed_frame.py:3515\u001b[39m, in \u001b[36mIndexedFrame._apply\u001b[39m\u001b[34m(self, func, kernel_class, *args, **kwargs)\u001b[39m\n\u001b[32m 3514\u001b[39m kr = kernel_class(\u001b[38;5;28mself\u001b[39m, func, args)\n\u001b[32m-> \u001b[39m\u001b[32m3515\u001b[39m kernel, retty = \u001b[43mkr\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3516\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/core/udf/udf_kernel_base.py:167\u001b[39m, in \u001b[36mApplyKernelBase.get_kernel\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 166\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget_kernel\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m167\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_compile_or_get_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/core/udf/udf_kernel_base.py:184\u001b[39m, in \u001b[36mApplyKernelBase._compile_or_get_kernel\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 182\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m kernel, masked_or_scalar\n\u001b[32m--> \u001b[39m\u001b[32m184\u001b[39m kernel, scalar_return_type = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcompile_kernel\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 186\u001b[39m np_return_type = (\n\u001b[32m 187\u001b[39m numpy_support.as_dtype(scalar_return_type)\n\u001b[32m 188\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m scalar_return_type.is_internal\n\u001b[32m 189\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m scalar_return_type.np_dtype\n\u001b[32m 190\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/core/udf/udf_kernel_base.py:145\u001b[39m, in \u001b[36mApplyKernelBase.compile_kernel\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 144\u001b[39m kernel_string = \u001b[38;5;28mself\u001b[39m._get_kernel_string()\n\u001b[32m--> \u001b[39m\u001b[32m145\u001b[39m kernel = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mcompile_kernel_string\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 146\u001b[39m \u001b[43m \u001b[49m\u001b[43mkernel_string\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnrt\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcapture_nrt_usage\u001b[49m\u001b[43m.\u001b[49m\u001b[43muse_nrt\u001b[49m\n\u001b[32m 147\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 149\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m kernel, return_type\n", - "\u001b[36mFile 
\u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/core/udf/udf_kernel_base.py:159\u001b[39m, in \u001b[36mApplyKernelBase.compile_kernel_string\u001b[39m\u001b[34m(self, kernel_string, nrt)\u001b[39m\n\u001b[32m 158\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m ctx:\n\u001b[32m--> \u001b[39m\u001b[32m159\u001b[39m kernel = \u001b[43mcuda\u001b[49m\u001b[43m.\u001b[49m\u001b[43mjit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 160\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 161\u001b[39m \u001b[43m \u001b[49m\u001b[43mlink\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43mUDF_SHIM_FILE\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 162\u001b[39m \u001b[43m \u001b[49m\u001b[43mextensions\u001b[49m\u001b[43m=\u001b[49m\u001b[43m[\u001b[49m\u001b[43mstr_view_arg_handler\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 163\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_kernel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 164\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m kernel\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/numba_cuda/numba/cuda/decorators.py:207\u001b[39m, in \u001b[36mjit.._jit\u001b[39m\u001b[34m(func)\u001b[39m\n\u001b[32m 206\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m207\u001b[39m \u001b[43mdisp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompile\u001b[49m\u001b[43m(\u001b[49m\u001b[43margtypes\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 209\u001b[39m disp._specialized = specialized\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/numba/core/compiler_lock.py:35\u001b[39m, in \u001b[36m_CompilerLock.__call__.._acquire_compile_lock\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 34\u001b[39m 
\u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m:\n\u001b[32m---> \u001b[39m\u001b[32m35\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/numba_cuda/numba/cuda/dispatcher.py:1298\u001b[39m, in \u001b[36mCUDADispatcher.compile\u001b[39m\u001b[34m(self, sig)\u001b[39m\n\u001b[32m 1297\u001b[39m \u001b[38;5;66;03m# We call bind to force codegen, so that there is a cubin to cache\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1298\u001b[39m \u001b[43mkernel\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbind\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1299\u001b[39m \u001b[38;5;28mself\u001b[39m._cache.save_overload(sig, kernel)\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/numba_cuda/numba/cuda/dispatcher.py:331\u001b[39m, in \u001b[36m_Kernel.bind\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 328\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 329\u001b[39m \u001b[33;03mForce binding to current CUDA context\u001b[39;00m\n\u001b[32m 330\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m331\u001b[39m cufunc = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_codelibrary\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget_cufunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 333\u001b[39m \u001b[38;5;28mself\u001b[39m.initialize_once(cufunc.module)\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/numba_cuda/numba/cuda/codegen.py:339\u001b[39m, in \u001b[36mCUDACodeLibrary.get_cufunc\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 338\u001b[39m 
\u001b[38;5;28;01mreturn\u001b[39;00m cufunc\n\u001b[32m--> \u001b[39m\u001b[32m339\u001b[39m cubin = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mget_cubin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcc\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcompute_capability\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 340\u001b[39m module = ctx.create_module_image(\n\u001b[32m 341\u001b[39m cubin, \u001b[38;5;28mself\u001b[39m._setup_functions, \u001b[38;5;28mself\u001b[39m._teardown_functions\n\u001b[32m 342\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/numba_cuda/numba/cuda/codegen.py:318\u001b[39m, in \u001b[36mCUDACodeLibrary.get_cubin\u001b[39m\u001b[34m(self, cc)\u001b[39m\n\u001b[32m 317\u001b[39m \u001b[38;5;28mself\u001b[39m._link_all(linker, cc, ignore_nonlto=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m--> \u001b[39m\u001b[32m318\u001b[39m cubin = \u001b[43mlinker\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcomplete\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 320\u001b[39m \u001b[38;5;28mself\u001b[39m._cubin_cache[cc] = cubin\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/numba_cuda/numba/cuda/cudadrv/driver.py:3060\u001b[39m, in \u001b[36m_Linker.complete\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 3059\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcomplete\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m-> \u001b[39m\u001b[32m3060\u001b[39m \u001b[38;5;28mself\u001b[39m.linker = \u001b[43mLinker\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_object_codes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 3061\u001b[39m result = \u001b[38;5;28mself\u001b[39m.linker.link(\u001b[33m\"\u001b[39m\u001b[33mcubin\u001b[39m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cuda/core/experimental/_linker.py:394\u001b[39m, in \u001b[36mLinker.__init__\u001b[39m\u001b[34m(self, options, *object_codes)\u001b[39m\n\u001b[32m 393\u001b[39m assert_type(code, ObjectCode)\n\u001b[32m--> \u001b[39m\u001b[32m394\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_add_code_object\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cuda/core/experimental/_linker.py:402\u001b[39m, in \u001b[36mLinker._add_code_object\u001b[39m\u001b[34m(self, object_code)\u001b[39m\n\u001b[32m 401\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m _nvjitlink:\n\u001b[32m--> \u001b[39m\u001b[32m402\u001b[39m \u001b[43m_nvjitlink\u001b[49m\u001b[43m.\u001b[49m\u001b[43madd_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 403\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_mnff\u001b[49m\u001b[43m.\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 404\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_input_type_from_code_type\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobject_code\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_code_type\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 405\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 406\u001b[39m \u001b[43m 
\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 407\u001b[39m \u001b[43m \u001b[49m\u001b[43mname_str\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 408\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 409\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cuda/bindings/nvjitlink.pyx:124\u001b[39m, in \u001b[36mcuda.bindings.nvjitlink.add_data\u001b[39m\u001b[34m()\u001b[39m\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cuda/bindings/nvjitlink.pyx:143\u001b[39m, in \u001b[36mcuda.bindings.nvjitlink.add_data\u001b[39m\u001b[34m()\u001b[39m\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cuda/bindings/nvjitlink.pyx:75\u001b[39m, in \u001b[36mcuda.bindings.nvjitlink.check_status\u001b[39m\u001b[34m()\u001b[39m\n", - "\u001b[31mnvJitLinkError\u001b[39m: ERROR_INTERNAL (6)\nLinker error log: ERROR 4 in nvvmAddNVVMContainerToProgram, may need newer version of nvJitLink library\n\u0000", - "\nThe above exception was the direct cause of the following exception:\n", - "\u001b[31mValueError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[39]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34madd_ten\u001b[39m(x):\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m x + \u001b[32m10\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m] = 
\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43ma\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43madd_ten\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 5\u001b[39m df\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/utils/performance_tracking.py:51\u001b[39m, in \u001b[36m_performance_tracking..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 43\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m nvtx.enabled():\n\u001b[32m 44\u001b[39m stack.enter_context(\n\u001b[32m 45\u001b[39m nvtx.annotate(\n\u001b[32m 46\u001b[39m message=func.\u001b[34m__qualname__\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 49\u001b[39m )\n\u001b[32m 50\u001b[39m )\n\u001b[32m---> \u001b[39m\u001b[32m51\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/core/series.py:2566\u001b[39m, in \u001b[36mSeries.apply\u001b[39m\u001b[34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[39m\n\u001b[32m 2563\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m by_row != \u001b[33m\"\u001b[39m\u001b[33mcompat\u001b[39m\u001b[33m\"\u001b[39m:\n\u001b[32m 2564\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mby_row is currently not supported.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m2566\u001b[39m result = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mSeriesApplyKernel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2567\u001b[39m result.name = \u001b[38;5;28mself\u001b[39m.name\n\u001b[32m 2568\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/contextlib.py:85\u001b[39m, in \u001b[36mContextDecorator.__call__..inner\u001b[39m\u001b[34m(*args, **kwds)\u001b[39m\n\u001b[32m 82\u001b[39m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[32m 83\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34minner\u001b[39m(*args, **kwds):\n\u001b[32m 84\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m._recreate_cm():\n\u001b[32m---> \u001b[39m\u001b[32m85\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/utils/performance_tracking.py:51\u001b[39m, in \u001b[36m_performance_tracking..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 43\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m nvtx.enabled():\n\u001b[32m 44\u001b[39m stack.enter_context(\n\u001b[32m 45\u001b[39m nvtx.annotate(\n\u001b[32m 46\u001b[39m message=func.\u001b[34m__qualname__\u001b[39m,\n\u001b[32m (...)\u001b[39m\u001b[32m 49\u001b[39m )\n\u001b[32m 50\u001b[39m )\n\u001b[32m---> \u001b[39m\u001b[32m51\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[36mFile \u001b[39m\u001b[32m/usr/local/python/3.13.5/lib/python3.13/site-packages/cudf/core/indexed_frame.py:3517\u001b[39m, in \u001b[36mIndexedFrame._apply\u001b[39m\u001b[34m(self, func, kernel_class, *args, **kwargs)\u001b[39m\n\u001b[32m 3515\u001b[39m kernel, retty = kr.get_kernel()\n\u001b[32m 3516\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m-> \u001b[39m\u001b[32m3517\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 3518\u001b[39m \u001b[33m\"\u001b[39m\u001b[33muser defined function compilation failed.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 3519\u001b[39m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m\n\u001b[32m 3521\u001b[39m \u001b[38;5;66;03m# Mask and data column preallocated\u001b[39;00m\n\u001b[32m 3522\u001b[39m ans_col = _return_arr_from_dtype(retty, \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m))\n", - "\u001b[31mValueError\u001b[39m: user defined function compilation failed." 
- ] - } - ], - "source": [ - "def add_ten(x):\n", - " return x + 10\n", - "\n", - "df[\"a\"] = df[\"a\"].apply(add_ten)\n", - "df" - ] - }, - { - "cell_type": "markdown", - "id": "f6aee7a6-925c-46b9-bc4a-8d019416923f", - "metadata": {}, - "source": [ - "### Exercise: Working With Real Data\n", - "\n", - "In this exercise, you'll use Pandas to analyze some real-world data, and then repeat the analysis with cuDF.\n", - "\n", - "#### Download the data\n", - "\n", - "The data we'll be working with is the [Parking Violations Issued - Fiscal Year 2022](https://data.cityofnewyork.us/City-Government/Parking-Violations-Issued-Fiscal-Year-2022/7mxj-7a6y) dataset from NYC Open Data.\n", - "\n", - "We're downloading a copy of this dataset from an s3 bucket hosted by NVIDIA to provide faster download speeds. We'll start by downloading the data. This should take about 30 seconds.\n", - "\n", - "#### Data License and Terms\n", - "As this dataset originates from the NYC Open Data Portal, it's governed by their license and terms of use.\n", - "\n", - "###### Are there restrictions on how I can use Open Data?\n", - "\n", - "> Open Data belongs to all New Yorkers. There are no restrictions on the use of Open Data. Refer to Terms of Use for more information.\n", - "\n", - "##### [Terms of Use](https://opendata.cityofnewyork.us/overview/#termsofuse)\n", - "\n", - "> By accessing datasets and feeds available through NYC Open Data, the user agrees to all of the Terms of Use of NYC.gov as well as the Privacy Policy for NYC.gov. The user also agrees to any additional terms of use defined by the agencies, bureaus, and offices providing data. Public data sets made available on NYC Open Data are provided for informational purposes. 
The City does not warranty the completeness, accuracy, content, or fitness for any particular purpose or use of any public data set made available on NYC Open Data, nor are any such warranties to be implied or inferred with respect to the public data sets furnished therein.\n", - "\n", - "> The City is not liable for any deficiencies in the completeness, accuracy, content, or fitness for any particular purpose or use of any public data set, or application utilizing such data set, provided by any third party.\n", - "\n", - "> Submitting City Agencies are the authoritative source of data available on NYC Open Data. These entities are responsible for data quality and retain version control of data sets and feeds accessed on the Site. Data may be updated, corrected, or refreshed at any time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f474db87-88af-4b66-9172-f36492aa7ebc", - "metadata": {}, - "outputs": [], - "source": [ - "!wget -nc https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet -O nyc_parking_violations_2022.parquet" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea5747dc-a442-4a3e-b2ee-56fcf7a0d40b", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "data = pd.read_parquet(\"nyc_parking_violations_2022.parquet\")\n", - "data.head()" - ] - }, - { - "cell_type": "markdown", - "id": "89a143e5-03a7-4cf5-8bb7-834b58e7528f", - "metadata": {}, - "source": [ - "#### Task 1" - ] - }, - { - "cell_type": "markdown", - "id": "b08b88b2-b191-4298-b251-be4c9e8faf75", - "metadata": {}, - "source": [ - "This dataset is relatively large, with lots of columns.\n", - "\n", - "* How many columns are there?\n", - "* Extract a subset of the data with just the following columns:\n", - " * `\"Registration State\"`\n", - " * `\"Violation Description\"`\n", - " * `\"Vehicle Body Type\"`\n", - " * `\"Issue Date\"`" - ] - }, - { - "cell_type": "markdown", - "id": 
"c08782ac-662b-473d-a103-3ec82af66091", - "metadata": {}, - "source": [ - "#### Task 2" - ] - }, - { - "cell_type": "markdown", - "id": "41c3b975-0968-43fa-b11f-cc6804cc15ba", - "metadata": {}, - "source": [ - "For vehicles with body type `\"TAXI\"`, what is the number of vehicles from each state?" - ] - }, - { - "cell_type": "markdown", - "id": "c1a964b5-5f20-40b9-8989-7241f1a3c1a7", - "metadata": {}, - "source": [ - "#### Task 3\n", - "\n", - "Now, repeat the analysis (starting from `read_parquet`) using cuDF. How much faster is it compared to Pandas? To measure the execution of a cell in Jupyter Notebook, you can add the line `%%time` at the top of a cell. For example:\n", - "\n", - "```python\n", - "%%time\n", - "\n", - "import cudf\n", - "data = cudf.read_parquet(\"nyc_parking_violations_2022.parquet\")\n", - "data.head()\n", - "````" - ] - }, - { - "cell_type": "markdown", - "id": "e6922342-bb99-4cbf-8b22-412ac6249fd7", - "metadata": {}, - "source": [ - "### Resources" - ] - }, - { - "cell_type": "markdown", - "id": "4759d32d-57c2-4454-8acc-d98072fd38b3", - "metadata": {}, - "source": [ - "* `cudf.pandas` docs: https://docs.rapids.ai/api/cudf/stable/cudf_pandas/\n", - "* cuDF documentation: https://docs.rapids.ai/api/cudf/stable\n", - "* cuDF API reference: https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/" - ] - } - ], - "metadata": { - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (RAPIDS 25.10)", - "language": "python", - "name": "cudf-cu12-25.10" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DataFrames with Pandas and cuDF" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Table of Contents\n", + "\n", + "1. [Introduction](#1.-Introduction)\n", + "2. [Pandas Essentials](#2.-Pandas-Essentials)\n", + " - [2.1 Series and DataFrame Objects](#2.1-Series-and-DataFrame-Objects)\n", + " - [2.2 Selecting and Filtering Data](#2.2-Selecting-and-Filtering-Data)\n", + " - [2.3 Sorting](#2.3-Sorting)\n", + " - [2.4 Summarizing Data](#2.4-Summarizing-Data)\n", + " - [2.5 Grouped Aggregations (groupby)](#2.5-Grouped-Aggregations-(groupby))\n", + " - [2.6 String Operations](#2.6-String-Operations)\n", + " - [2.7 Time Series](#2.7-Time-Series)\n", + " - [2.8 User-Defined Operations (apply)](#2.8-User-Defined-Operations-(apply))\n", + "3. [Enter cuDF: GPU DataFrames](#3.-Enter-cuDF:-GPU-DataFrames)\n", + " - [3.1 Exercise: Date Formatting Failure](#3.1-Exercise:-Date-Formatting-Failure)\n", + " - [3.2 Exercise: Why `.apply()` Breaks Down in cuDF](#3.2-Exercise:-Why-`.apply()`-Breaks-Down-in-cuDF)\n", + "4. [Exercise: Analyzing Real Data (NYC Parking Violations)](#4.-Exercise:-Analyzing-Real-Data-(NYC-Parking-Violations))\n", + " - [Step 0: Download Data](#Step-0:-Download-Data)\n", + " - [Task 1: Data Inspection (Pandas)](#📝-Task-1:-Data-Inspection-(Pandas))\n", + " - [Task 2: Analyze Taxis (Pandas)](#📝-Task-2:-Analyze-Taxis-(Pandas))\n", + " - [Task 3: GPU Acceleration (cuDF)](#📝-Task-3:-GPU-Acceleration-(cuDF))\n", + "5. [Conclusion](#Conclusion)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Introduction\n", + "\n", + "In this notebook, we will build a foundation in data manipulation using **Pandas**, the industry standard for Python data analysis. 
Then, we will transition to **cuDF**, which allows us to run standard Pandas-like code on the GPU.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Learning Objectives:\n", + "\n", + "- **Introduce core Pandas operations:** Indexing, Filtering, Aggregating, and Time Series.\n", + "- **Learn the subtle differences** (and speed benefits) when porting code to cuDF.\n", + "- **Exercise:** Apply these skills to analyze a real-world NYC Parking Violations dataset.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** Throughout this notebook, we provide \"Quick Docs\" sections to remind you of common syntax. However, these are not exhaustive. For complete API details, parameters, and edge cases, you should always reference the official [Pandas Documentation](https://pandas.pydata.org/docs/) or the [cuDF Documentation](https://docs.rapids.ai/api/cudf/stable/).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 2. Pandas Essentials\n", + "\n", + "Before we accelerate with GPUs, let's ensure we are comfortable with the DataFrame API. 
Even if you are a Pandas pro, this refresher sets the baseline syntax we will replicate later.\n", + "\n", + "First, import the library:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Series and DataFrame Objects\n", + "\n", + "- **Series:** A one-dimensional labeled array (like a powerful list or a single column).\n", + "- **DataFrame:** A two-dimensional labeled data structure (like a spreadsheet or SQL table).\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `pd.Series(data)`: Create a Series.\n", + "- `pd.DataFrame(data, index)`: Create a DataFrame.\n", + "- `df.head(n)` / `df.tail(n)`: View the first/last n rows.\n", + "- `df.index` / `df.columns`: Access row labels and column names.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A Series acts like a single column of data\n", + "s = pd.Series([10, 20, 30])\n", + "print(f\"Max value in series: {s.max()}\")\n", + "\n", + "# A DataFrame is a collection of Series sharing an index\n", + "df = pd.DataFrame({\n", + " \"a\": [1, 2, 1, 3, 2],\n", + " \"b\": [1, 4, 7, 2, 0],\n", + " \"c\": [3, 3, 3, 4, 5]\n", + "}, index=[1, 2, 3, 4, 5])\n", + "\n", + "# View the structure\n", + "print(\"Columns:\", df.columns)\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Selecting and Filtering Data\n", + "\n", + "Selecting specific subsets of data is the most common task in analysis. 
You can select by column name, label index, or integer position.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col']`: Select a single column (returns a Series).\n", + "- `df[['col1', 'col2']]`: Select multiple columns (returns a DataFrame).\n", + "- `df.loc[label]`: Select row(s) by index label.\n", + "- `df.iloc[position]`: Select row(s) by integer position (0-based).\n", + "- `df[condition]`: Boolean indexing (filtering).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Select specific columns\n", + "subset = df[[\"b\", \"c\"]]\n", + "\n", + "# Select rows by label (loc) and position (iloc)\n", + "row_label_2 = df.loc[2] # Row with index label 2\n", + "row_pos_0 = df.iloc[0] # First row (physically)\n", + "\n", + "# Boolean Indexing: Filter rows where column 'a' is greater than 1\n", + "filtered_df = df[df['a'] > 1]\n", + "filtered_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Sorting\n", + "\n", + "Ordering data helps in ranking and visualization.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.sort_values(by='col', ascending=True/False)`: Sort by one or more columns.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sort by column 'a' in ascending order\n", + "sorted_df = df.sort_values(\"a\")\n", + "sorted_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 Summarizing Data\n", + "\n", + "It's straightforward to get a quick overview of your data's distribution.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.describe()`: Summary statistics (count, mean, std, etc.).\n", + "- `df.mean()`, `df.sum()`, `df.max()`: Aggregations across columns.\n", + "- `df['col'].value_counts()`: Count unique values (useful for histograms).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ 
+ "# Calculate the sum of every column\n", + "print(\"Sum of columns:\\n\", df.sum())\n", + "\n", + "# Count frequency of values in column 'a'\n", + "print(\"\\nValue counts for 'a':\\n\", df[\"a\"].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.5 Grouped Aggregations (groupby)\n", + "\n", + "The \"Split-Apply-Combine\" strategy. Split data into groups based on some criteria, apply a function to each group, and combine the results.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df.groupby('col')`: Group data.\n", + "- `.mean()`, `.count()`: Apply aggregation.\n", + "- `.agg({'col': ['min', 'max']})`: Apply complex, specific aggregations.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Group by 'a' and calculate the mean of 'b' and 'c' for each group\n", + "grouped_mean = df.groupby(\"a\").mean()\n", + "print(grouped_mean)\n", + "\n", + "# Complex aggregation: Get min and mean of 'b', and max of 'c'\n", + "agg_df = df.groupby(\"a\").agg({\n", + " \"b\": [\"min\", \"mean\"],\n", + " \"c\": [\"max\"]\n", + "})\n", + "agg_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.6 String Operations\n", + "\n", + "Pandas provides vectorized string functions via the `.str` accessor.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col'].str.upper()`: Convert to uppercase.\n", + "- `df['col'].str.contains('pattern')`: Boolean check for substring.\n", + "- `df['col'].str.replace('old', 'new')`: Replace text." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Add a string column\n", + "df[\"names\"] = [\"mario\", \"luigi\", \"yoshi\", \"peach\", \"toad\"]\n", + "\n", + "# Convert to uppercase\n", + "df[\"names_upper\"] = df[\"names\"].str.upper()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.7 Time Series\n", + "\n", + "Pandas was originally developed for financial time series analysis. It handles dates and times robustly via the `.dt` accessor.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `pd.to_datetime()`: Convert strings to datetime objects.\n", + "- `df['date'].dt.year`: Extract year component.\n", + "- `df['date'].dt.dayofweek`: Extract day of week.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a date range\n", + "date_df = pd.DataFrame()\n", + "date_df[\"date\"] = pd.date_range(\"2018-11-20\", periods=5, freq=\"D\")\n", + "date_df[\"value\"] = np.random.sample(len(date_df))\n", + "\n", + "# Filter by date\n", + "subset_dates = date_df[date_df[\"date\"] < \"2018-11-23\"]\n", + "\n", + "# Extract features\n", + "date_df[\"year\"] = date_df[\"date\"].dt.year\n", + "date_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.8 User-Defined Operations (apply)\n", + "\n", + "When built-in functions aren't enough, you can apply custom Python functions.\n", + "\n", + "**Quick Docs:**\n", + "\n", + "- `df['col'].apply(func)`: Apply function `func` to every element.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def add_ten(x):\n", + " return x + 10\n", + "\n", + "# Apply the custom function\n", + "df[\"a_plus_10\"] = df[\"a\"].apply(add_ten)\n", + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 3. 
Enter cuDF: GPU DataFrames\n", + "\n", + "cuDF mimics the Pandas API but runs on the GPU. The transition is often as simple as changing the import, but there are some constraints you must know.\n", + "\n", + "First, let's create a GPU DataFrame.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import cudf\n", + "\n", + "# Create a cuDF DataFrame (data resides on GPU)\n", + "gdf = cudf.DataFrame({\n", + " \"a\": [1, 2, 1, 3, 2],\n", + " \"b\": [1, 4, 7, 2, 0],\n", + " \"c\": [1, 1, 8, 2, 9]\n", + "}, index=[1, 2, 3, 4, 5])\n", + "\n", + "# Operations work exactly the same!\n", + "print(type(gdf))\n", + "gdf.groupby(\"a\").mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.1 Exercise: Date Formatting Failure\n", + "\n", + "Pandas is very forgiving with date formats. cuDF is stricter. Run the cell below to see what happens when you use a non-standard date string.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# EXECUTE THIS CELL TO SEE THE ERROR\n", + "try:\n", + " date_df = cudf.DataFrame()\n", + " # Pandas handles \"11/20/2018\" easily. Does cuDF?\n", + " date_df[\"date\"] = cudf.date_range(\"11/20/2018\", periods=72, freq=\"D\")\n", + "except Exception as e:\n", + " print(f\"Error caught: {e}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Why did this fail?** Unlike Pandas, cuDF currently requires ISO-standard date formats (Year-Month-Day) for creating date ranges.\n", + "\n", + "- **Pandas:** Guesses `11/20/2018` is Nov 20th.\n", + "- **cuDF:** Requires `2018-11-20`.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Exercise: Why `.apply()` Breaks Down in cuDF\n", + "\n", + "In Pandas, `.apply()` works because the CPU can execute your Python function one element at a time. 
On the GPU, this model does not work: a GPU cannot interpret Python bytecode. To make custom functions run on the GPU, cuDF uses Numba to compile your Python function into GPU machine code (PTX). That compilation step imposes strict rules:\n", + "\n", + "- The function must be Numba-compilable (pure math only; no Python objects).\n", + "- Types must be static, not dynamic or inferred at runtime.\n", + "- Only supported operations may be used (Numba must be able to lower them to GPU code).\n", + "\n", + "Even simple-looking Python functions often violate these rules:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# A function that looks innocent but is NOT GPU-safe\n", + "def add_ten_verbose(x):\n", + " # Python branching + dynamic typing make this un-compilable for the GPU\n", + " if isinstance(x, (int, float)):\n", + " return x + 10\n", + " else:\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is perfectly legal in Pandas. But in cuDF, Numba cannot:\n", + "\n", + "- interpret `isinstance`\n", + "- handle Python branching on object types\n", + "- JIT-compile dynamic return values\n", + "\n", + "Now try running it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Execute this cell to observe the cuDF limitation\n", + "try:\n", + " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_verbose)\n", + "except Exception as e:\n", + " print(\"cuDF apply() constraint caught:\")\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**What happens?** cuDF attempts to compile the function → compilation fails → you get a runtime error. 
This mirrors real-world failure modes: anything that is not pure numerical logic will break.\n", + "\n", + "Here is the same logic, rewritten in a way the GPU can compile:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# GPU-safe version: no Python, no branching, pure math\n", + "def add_ten_gpu(x):\n", + " return x + 10\n", + "try:\n", + " gdf[\"a\"] = gdf[\"a\"].apply(add_ten_gpu)\n", + "except Exception as e:\n", + " print(\"cuDF apply() constraint caught:\")\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is one of the few forms that Numba can translate. But even this version is not preferred. **cuDF `.apply()` is a last resort**. Even if your function compiles, `.apply()` still triggers:\n", + "\n", + "- JIT compilation overhead (slow startup)\n", + "- Kernel launch overhead\n", + "- Reduced optimization compared to built-in GPU operations\n", + "\n", + "For typical column transformations, this is simply unnecessary. \n", + "**Best practice is to always use vectorized operations:**\n", + "\n", + "```python\n", + "gdf[\"a\"] + 10\n", + "```\n", + "The vectorized version is:\n", + "- faster\n", + "- simpler\n", + "- more readable\n", + "- the intended way to use GPUs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 4. 
Exercise: Analyzing Real Data (NYC Parking Violations)\n", + "\n", + "Now you will apply what you learned to a large, real-world dataset.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 0: Download Data\n", + "\n", + "We will fetch a subset of the NYC Parking Violations dataset (Fiscal Year 2022).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget -nc https://data.rapids.ai/datasets/nyc_parking/nyc_parking_violations_2022.parquet -O nyc_parking_violations_2022.parquet\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 1: Data Inspection (Pandas)\n", + "\n", + "**Goal:** Load the data and inspect its structure.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Read the file `nyc_parking_violations_2022.parquet` into a Pandas DataFrame.\n", + "2. Print the columns.\n", + "3. Create a subset DataFrame with only: `Registration State`, `Violation Description`, `Vehicle Body Type`, `Issue Date`.\n", + "4. Display the head of this subset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# TODO: Read parquet file\n", + "# df = ...\n", + "\n", + "# TODO: Print columns\n", + "# ...\n", + "\n", + "# TODO: Select specific columns\n", + "# df_subset = ...\n", + "\n", + "# TODO: Display head\n", + "# ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 2: Analyze Taxis (Pandas)\n", + "\n", + "**Goal:** Filter, Group, and Count.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Filter the DataFrame to find rows where `Vehicle Body Type` is `\"TAXI\"`.\n", + "2. Group by `Registration State`.\n", + "3. Count the occurrences to see which states the taxis are registered in.\n", + "4. 
Sort the results descending to find the top states.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Filter for TAXI\n", + "# taxi_df = ...\n", + "\n", + "# TODO: Group by State and count\n", + "# ...\n", + "\n", + "# TODO: Sort and display top results\n", + "# ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 3: GPU Acceleration (cuDF)\n", + "\n", + "**Goal:** Measure the speedup.\n", + "\n", + "**Instructions:**\n", + "\n", + "1. Import `cudf`.\n", + "2. Use `%%time` at the top of the cell.\n", + "3. Replicate the entire pipeline (Read -> Filter columns -> Filter Rows -> Group -> Sort) using `cudf`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import cudf\n", + "\n", + "# TODO: Replicate the analysis using cuDF\n", + "# ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## Conclusion\n", + "\n", + "Compare the **Wall time** of Task 2 vs Task 3. 
You should see a significant performance improvement with cuDF, especially as data size grows!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (RAPIDS 25.10)", + "language": "python", + "name": "cudf-cu12-25.10" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/tutorials/accelerated-python/notebooks/libraries/23__cuda_cccl__customizing_algorithms.ipynb b/tutorials/accelerated-python/notebooks/libraries/23__cuda_cccl__customizing_algorithms.ipynb index 7953cf4e..145be7a7 100644 --- a/tutorials/accelerated-python/notebooks/libraries/23__cuda_cccl__customizing_algorithms.ipynb +++ b/tutorials/accelerated-python/notebooks/libraries/23__cuda_cccl__customizing_algorithms.ipynb @@ -1,1492 +1,1528 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "42b6145d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Exercise - CCCL - Customizing Algorithms" - ] - }, - { - "cell_type": "markdown", - "id": "008c05bf-5615-4b0b-8294-d66795f1f155", - "metadata": {}, - "source": [ - "### What is `cuda-cccl`?" - ] - }, - { - "cell_type": "markdown", - "id": "98385a37-45b5-4b42-8a30-22112d2292df", - "metadata": {}, - "source": [ - "The [CUDA Core Compute Libraries (CCCL)](https://nvidia.github.io/cccl/python/) provide high-quality, high-performance abstractions for CUDA development in Python. The `cuda-cccl` Python package is composed of two indepdendent subpackages:\n", - "\n", - "* `cuda.compute` is a **parallel algorithms library** containing algorithms like `reduce`, `transform`, `scan` and `sort`. 
These can be combined to implement more complex algorithms, while delivering the performance of hand-optimized CUDA kernels, portable across different GPU architectures. They are general-purpose and **designed to be used with CuPy, PyTorch and other array/tensor frameworks.**.\n", - "\n", - "* `cuda.coop` is a lower-level library containing **cooperative algorithms meant to be used within (numba) CUDA kernels**. Examples include _block-wide reduction_ and _warp-wide scan_, providing numba CUDA kernel developers with building blocks to create speed-of-light, custom kernels." - ] - }, - { - "cell_type": "markdown", - "id": "79acd5e0-640c-4b61-8528-f57904d0ca95", - "metadata": {}, - "source": [ - "### When to use it?" - ] - }, - { - "cell_type": "markdown", - "id": "14f05b64-f709-41de-8899-add8b4c3dcb3", - "metadata": {}, - "source": [ - "`cccl` provides a level of abstraction in between tensor libraries and raw CUDA kernels.\n", - "\n", - "- If you want to implement custom functionality that can not easily and efficiently be expressed using PyTorch/CuPy operations, you can reach for `cuda.compute` before resorting to writing CUDA kernels.\n", - "- If you _do_ need to write a kernel, you can often make use of the block-level and warp-level primitives offered by `cuda.coop` to write your kernel much more efficiently and concisely. " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3e2b1188-ce5e-4a10-84a1-ed6ace92922f", - "metadata": {}, - "source": [ - "" - ] - }, - { - "cell_type": "markdown", - "id": "71178c05", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Installation" - ] - }, - { - "cell_type": "markdown", - "id": "fc78704b-cc16-4f3a-8a76-a1ecb4acd8e3", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "The command below installs `cuda-cccl` along with pieces of the CUDA toolkit it needs. 
You'll only need to do this in Google Colab." - ] - }, - { - "cell_type": "markdown", - "id": "23c21a66-dcf9-44f5-8262-78aa77754523", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [], - "vscode": { - "languageId": "plaintext" + "cells": [ + { + "cell_type": "markdown", + "id": "42b6145d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Table of Contents\n", + "\n", + "1. [Introduction: CCCL & Customizing Algorithms](#1.-Introduction:-CCCL-&-Customizing-Algorithms)\n", + " - 1.1 [What is cuda-cccl?](#1.1-What-is-cuda-cccl?)\n", + " - 1.2 [When to use it?](#1.2-When-to-use-it?)\n", + " - 1.3 [Installation](#1.3-Installation)\n", + "2. [Basic Reductions](#2.-Basic-Reductions)\n", + " - 2.1 [Using `reduce_into()` to compute the sum of a sequence](#2.1-Using-`reduce_into()`-to-compute-the-sum-of-a-sequence)\n", + " - 2.2 [Exercise: computing the minimum value](#2.2-Exercise:-computing-the-minimum-value)\n", + "3. [Custom Reductions](#3.-Custom-Reductions)\n", + " - 3.1 [Example: sum of even values](#3.1-Example:-sum-of-even-values)\n", + " - 3.2 [Performance Comparison: Custom vs. Naive CuPy](#3.2-Performance-Comparison:-Custom-vs.-Naive-CuPy)\n", + "4. [Scanning Algorithms](#4.-Scanning-Algorithms)\n", + " - 4.1 [What is a Scan?](#4.1-What-is-a-Scan?)\n", + " - 4.2 [Maximum Scan Example](#4.2-Maximum-Scan-Example)\n", + "5. [Sorting Algorithms](#5.-Sorting-Algorithms)\n", + " - 5.1 [Merge Sort](#5.1-Merge-Sort)\n", + " - 5.1.1 [Exercise: sort by the last digit](#5.1.1-Exercise:-sort-by-the-last-digit)\n", + " - 5.2 [Radix Sort](#5.2-Radix-Sort)\n", + "6. 
[Transformation Algorithms](#6.-Transformation-Algorithms)\n", + " - 6.1 [Unary and Binary Transform](#6.1-Unary-and-Binary-Transform)\n", + " - 6.2 [Data Normalization with Transform](#6.2-Data-Normalization-with-Transform)\n", + " - 6.3 [Transform with Iterators for Memory Efficiency](#6.3-Transform-with-Iterators-for-Memory-Efficiency)\n", + "7. [Custom (Struct) Data Types](#7.-Custom-(Struct)-Data-Types)\n", + "8. [Working with Iterators](#8.-Working-with-Iterators)\n", + " - 8.1 [CountingIterators and ConstantIterator](#8.1-CountingIterators-and-ConstantIterator)\n", + " - 8.2 [TransformIterator](#8.2-TransformIterator)\n", + " - 8.3 [ZipIterator](#8.3-ZipIterator)\n", + "9. [Capstone Exercise: Implementing Running Average](#9.-Capstone-Exercise:-Implementing-Running-Average)" + ] + }, + { + "cell_type": "markdown", + "id": "575a8a12", + "metadata": {}, + "source": [ + "# 1. Introduction: CCCL & Customizing Algorithms\n" + ] + }, + { + "cell_type": "markdown", + "id": "008c05bf-5615-4b0b-8294-d66795f1f155", + "metadata": {}, + "source": [ + "## 1.1 What is cuda-cccl?" + ] + }, + { + "cell_type": "markdown", + "id": "98385a37-45b5-4b42-8a30-22112d2292df", + "metadata": {}, + "source": [ + "The [CUDA Core Compute Libraries (CCCL)](https://nvidia.github.io/cccl/python/) provide high-quality, high-performance abstractions for CUDA development in Python. The `cuda-cccl` Python package is composed of two indepdendent subpackages:\n", + "\n", + "* `cuda.compute` is a **parallel algorithms library** containing algorithms like `reduce`, `transform`, `scan` and `sort`. These can be combined to implement more complex algorithms, while delivering the performance of hand-optimized CUDA kernels, portable across different GPU architectures. 
They are general-purpose and **designed to be used with CuPy, PyTorch and other array/tensor frameworks**.
+ ] + }, + { + "cell_type": "markdown", + "id": "23c21a66-dcf9-44f5-8262-78aa77754523", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [], + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "import os\n", + "\n", + "if os.getenv(\"COLAB_RELEASE_TAG\") and not os.path.exists(\"/ach-installed\"): # If running in Google Colab:\n", + " !pip uninstall \"cuda-python\" --yes > /dev/null\n", + " !pip install \"numba-cuda\" \"cuda-cccl[test-cu12]\" > /dev/null 2>&1\n", + " open(\"/ach-installed\", \"a\").close()" + ] + }, + { + "cell_type": "markdown", + "id": "10355920-9bfc-4788-bfe7-ab99440a6d98", + "metadata": {}, + "source": [ + "The `[test-cu12]` extras installs CuPy, which we will use in our examples. It is not strictly a dependency of `cuda-cccl` - you can use any array-like object (like PyTorch tensors) as well." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bca5a026-db41-4079-808e-b8c3c7196c7a", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import cupy as cp\n", + "import cuda.compute as comp" + ] + }, + { + "cell_type": "markdown", + "id": "01908d54", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# 2. Basic Reductions" + ] + }, + { + "cell_type": "markdown", + "id": "2d468957-1411-4cae-a050-a75692452cc2", + "metadata": {}, + "source": [ + "A **reduction** takes many values and combines them into a single result using a binary operation.\n", + "\n", + "As a simple example, consider a sequence of values like $[2, 3, 5, 1, 7, 6, 8, 4]$. The *sum* of the values of that sequence is a reduction using _addition_ as the binary operation: $(2 + 3 + 5 + 1 + 7 + 6 + 8 + 4) = 36$. Similarly, the *maximum value* can be obtained by performing a reduction using `max(a, b)` as the binary operation." 
+ ] + }, + { + "cell_type": "markdown", + "id": "83065c06-aeab-4d26-9d18-69edb8462c2d", + "metadata": {}, + "source": [ + "A reduction can be computed in parallel. Typically this is done using a \"tree\" reduction where elements are combined in pairs across multiple levels, resembling the structure of a binary tree. At each level, the number of elements is halved as partial results are computed in parallel. This continues until a single final result is obtained at the root of the tree.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "01d0feb1-7ed0-42dc-bb83-8b017d89a2a6", + "metadata": {}, + "source": [ + "If you know some CUDA, you can quite easily write a kernel to implement this kind of parallel reduction. However, optimizing it for the specific CUDA architecture of your device, and generalizing for different data types and sizes can be difficult.\n", + "\n", + "This is where `cuda.compute` comes in. It provides optimized implementations of algorithms like reduction that give the best possible performance." 
+ ] + }, + { + "cell_type": "markdown", + "id": "23cb8fb8-8e35-4179-8a66-2c3b5e6077ae", + "metadata": {}, + "source": [ + "## 2.1 Using `reduce_into()` to compute the sum of a sequence" + ] + }, + { + "cell_type": "markdown", + "id": "9c8f6367-d7f9-4030-bc57-4b8920299b47", + "metadata": {}, + "source": [ + "`cuda.compute` provides a `reduce_into()` function to compute general reductions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adce5791", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "\"\"\"\n", + "Using `reduce_into()` to compute the sum of a sequence\n", + "\"\"\"\n", + "\n", + "# Prepare the inputs and outputs.\n", + "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32) # input sequence, a CuPy (device) array\n", + "d_output = cp.empty(1, dtype=np.int32) # array which will hold the result, a CuPy (device) array of size 1\n", + "h_init = np.array([0], dtype=np.int32) # initial value of the reduction, a NumPy (host) array of size 1\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(d_input, d_output, comp.OpKind.PLUS, len(d_input), h_init)\n", + "\n", + "print(d_input)\n", + "# Verify the result.\n", + "expected_output = 36\n", + "assert (d_output == expected_output).all()\n", + "result = d_output[0]\n", + "print(f\"Sum reduction result: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2f00121b-bfb1-4b37-9651-230386d9c256", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## 2.2 Exercise: computing the minimum value" + ] + }, + { + "cell_type": "markdown", + "id": "8f5e9ffc-89cd-4859-97ea-93abbb8b3f4b", + "metadata": {}, + "source": [ + "`reduce_into()` can be used to compute other reductions " + ] + }, + { + "cell_type": "markdown", + "id": "9241706a-b152-41b3-bba7-d281c9e43675", + "metadata": {}, + "source": [ + "Similar to the examples above, below is an 
incomplete code snippet for computing the minimum value of a sequence. Complete the section between the comments `begin TODO` and `end TODO` to use `reduce_into()` to compute the minimum." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44cb3daf-1a82-4af3-965e-b0d4be56b17e", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Using `reduce_into()` to compute the minimum value of a sequence\n", + "\"\"\"\n", + "\n", + "d_input = cp.array([-2, 3, 5, 1, 7, -6, 8, -4], dtype=np.int32)\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "\n", + "# begin TODO\n", + "\n", + "\n", + "# end TODO\n", + "\n", + "expected_output = -6\n", + "assert (d_output == expected_output).all()\n", + "result = d_output[0]\n", + "print(f\"Min reduction result: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9839aacd-7256-484c-8205-e48068f3217b", + "metadata": {}, + "source": [ + "# 3. Custom Reductions" + ] + }, + { + "cell_type": "markdown", + "id": "9b74da96-38ff-434a-87fa-bba49e37bf5a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## 3.1 Example: sum of even values" + ] + }, + { + "cell_type": "markdown", + "id": "a0395884-0786-4b75-9442-046609041439", + "metadata": {}, + "source": [ + "At this point, you might be thinking:" + ] + }, + { + "cell_type": "markdown", + "id": "f09ef69c-07d1-4ca7-af22-89f105d7c532", + "metadata": {}, + "source": [ + "> **_Umm, can't I just use CuPy or PyTorch to compute sum or max?_**" + ] + }, + { + "cell_type": "markdown", + "id": "cd42b854-3576-4cf8-9880-f1aed514a10b", + "metadata": {}, + "source": [ + "Of course, given a CuPy array, it's trivial to do simple reductions like `sum`, `min` or `max`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61248d58-dcb7-4bf2-9b8b-9669ea2cd3d6", + "metadata": {}, + "outputs": [], + "source": [ + "d_input = cp.array([-2, 3, 5, 1, 7, -6, 8, -4], dtype=np.int32)\n", + "\n", + 
"print(f\"Sum using cp.sum: {cp.sum(d_input)}\")\n", + "print(f\"Max value using cp.max: {cp.max(d_input)}\")\n", + "print(f\"Min value using cp.min: {cp.min(d_input)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b96e0729-87d3-4423-ac13-28c5d34e3786", + "metadata": {}, + "source": [ + "The benefit of `cuda-cccl` is more apparent when you want to do custom operations. For example, rather than just computing a straightforward `sum`, let's say we wanted to compute the sum of **only even values** in a sequence. Naively, here's how to do that with CuPy:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "929b31ba-ce77-4fcd-9750-5efa0c13c346", + "metadata": {}, + "outputs": [], + "source": [ + "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32)\n", + "result = (d_input[d_input % 2 == 0]).sum()\n", + "print(f\"Sum of even values with CuPy: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "48fd15a3-f616-4337-a7d7-c81c999a73f7", + "metadata": {}, + "source": [ + "Now, let's do the same thing with `parallel`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03947ed8-c6f9-49ae-b382-db7a0ef14931", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Using `reduce_into()` with a custom binary operation\n", + "\"\"\"\n", + "\n", + "# Define a custom binary operation for the reduction.\n", + "def sum_even_op(a, b):\n", + " return (a if a % 2 == 0 else 0) + (b if b % 2 == 0 else 0)\n", + "\n", + "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32)\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "# Call `reduce_into()` passing the function above for the binary operation:\n", + "comp.reduce_into(d_input, d_output, sum_even_op, len(d_input), h_init)\n", + "result = d_output.get()[0]\n", + "print(f\"Sum of even values with `cuda.compute`: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"e7c9205d-7668-4a72-a01d-f57e2d1a3bb1", + "metadata": {}, + "source": [ + "We got the same result using `cuda.compute`, but we had to write significantly more code. Is it worth it? Below is a small benchmarking script comparing timings for a range of input sizes:" + ] + }, + { + "cell_type": "markdown", + "id": "43afe1e9-c870-43e0-b884-2c7f93581869", + "metadata": {}, + "source": [ + "## 3.2 Performance Comparison: Custom vs. Naive CuPy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "729a1867-3a21-4f09-af97-f02d0a562d9f", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Compare the performance of the `parallel` implementation with a naive CuPy implementation\n", + "\"\"\"\n", + "\n", + "import timeit\n", + "\n", + "def evens_sum_cupy(d_input, d_output, h_init):\n", + " # ignore h_init\n", + " cp.sum(d_input[d_input % 2 == 0], out=d_output[0])\n", + "\n", + "def evens_sum_cccl(d_input, d_output, h_init):\n", + " # note, using `op` as the binary operation, rather than `OpKind.PLUS`:\n", + " comp.reduce_into(d_input, d_output, sum_even_op, len(d_input), h_init)\n", + "\n", + "def time_gpu_func(f, *args, **kwargs):\n", + " cp.cuda.Device().synchronize()\n", + " t1 = timeit.default_timer()\n", + " n = 1_000\n", + " for i in range(n):\n", + " f(*args, **kwargs)\n", + " cp.cuda.Device().synchronize()\n", + " t2 = timeit.default_timer()\n", + " return t2 - t1\n", + "\n", + "sizes = [10_000, 100_000, 1_000_000, 10_000_000, 100_000_000]\n", + "cccl_times = []\n", + "cp_times = []\n", + "\n", + "for n in sizes:\n", + " d_input = cp.random.randint(low=0, high=10, size=n, dtype=np.int32)\n", + " d_out = cp.empty(1, dtype=np.int32)\n", + " h_init = np.array([0], dtype=np.int32)\n", + "\n", + " cccl_times.append(time_gpu_func(evens_sum_cccl, d_input, d_out, h_init))\n", + " cp_times.append(time_gpu_func(evens_sum_cupy, d_input, d_out, h_init))\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Plotting\n", + "fig = 
plt.figure(figsize=(10, 5))\n", + "plt.loglog(sizes, cccl_times, marker='o', label='cuda.ccl')\n", + "plt.loglog(sizes, cp_times, marker='s', label='CuPy')\n", + "\n", + "# Annotate each cuda.ccl point with speedup vs CuPy\n", + "for x, t_cccl, t_cp in zip(sizes, cccl_times, cp_times):\n", + " speedup = t_cp / t_cccl\n", + " label = f\"{speedup:.1f}x faster\"\n", + " plt.annotate(label,\n", + " (x, t_cccl),\n", + " textcoords=\"offset points\",\n", + " xytext=(5, -10), # offset position\n", + " ha='left',\n", + " fontsize=9,\n", + " color='green')\n", + "\n", + "# Labels and title\n", + "plt.xlabel('Input Size')\n", + "plt.ylabel('Time (seconds)')\n", + "plt.title('Timing Comparison for evens_sum.')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.tight_layout()\n" + ] + }, + { + "cell_type": "markdown", + "id": "a0a0d085-75ab-4836-afb6-72ec1abd1d6a", + "metadata": {}, + "source": [ + "We see that using `cuda.compute` is much faster than our naive CuPy approach. This is because:\n", + "\n", + "* Operator fusion: the CuPy operation `x[x % 2 == 0]).sum()` is actually 4 separate operations (and at least 4 separate CUDA kernel invocations). With `cuda.compute`, we have a single call to `reduce_into()` that does all the computation.\n", + "* No intermediate memory allocations.\n", + "* Lesser Python overhead: `cuda.compute` is a lower-level library. You don't have to jump through multiple layers of Python before invoking device code." + ] + }, + { + "cell_type": "markdown", + "id": "a987e11b-777b-4da3-81f8-5808c0dd8836", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# 4. Scanning Algorithms\n", + "\n", + "## 4.1 What is a Scan?\n", + "\n", + "A **scan** (also called prefix sum) computes a running total of elements. 
For each position, it shows the cumulative result up to that point.\n", + "\n", + "**Two types of scans:**\n", + "* **Inclusive scan**: Includes the current element in the sum\n", + "* **Exclusive scan**: Excludes the current element (shifts results)\n", + "\n", + "**Visual example:**\n", + "\n", + "```\n", + "Input: [3, 1, 4, 1, 5]\n", + "Inclusive: [3, 4, 8, 9, 14] (3, 3+1, 3+1+4, 3+1+4+1, 3+1+4+1+5)\n", + "Exclusive: [0, 3, 4, 8, 9] (0, 3, 3+1, 3+1+4, 3+1+4+1)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fca0df7f", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "d_input = cp.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)\n", + "d_inclusive = cp.empty_like(d_input)\n", + "d_exclusive = cp.empty_like(d_input)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "def add_op(a, b):\n", + " return a + b\n", + "\n", + "comp.inclusive_scan(d_input, d_inclusive, add_op, h_init, len(d_input))\n", + "comp.exclusive_scan(d_input, d_exclusive, add_op, h_init, len(d_input))\n", + "\n", + "print(f\"Input: {d_input.get()}\")\n", + "print(f\"Inclusive scan: {d_inclusive.get()}\")\n", + "print(f\"Exclusive scan: {d_exclusive.get()}\")\n", + "\n", + "# Verify with NumPy\n", + "np_inclusive = np.cumsum(d_input.get())\n", + "np_exclusive = np.concatenate([[0], np_inclusive[:-1]])\n", + "np.testing.assert_allclose(d_inclusive.get(), np_inclusive)\n", + "np.testing.assert_allclose(d_exclusive.get(), np_exclusive)\n", + "print(f\"NumPy inclusive: {np_inclusive}\")\n", + "print(f\"NumPy exclusive: {np_exclusive}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d5c4b010", + "metadata": {}, + "source": [ + "## 4.2 Maximum Scan Example\n", + "\n", + "Scans aren't limited to addition. 
Here's an example using maximum operation to find running maximum.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "088f30c8", + "metadata": {}, + "outputs": [], + "source": [ + "# Running maximum example\n", + "d_input = cp.array([3, 7, 2, 9, 1, 8, 4, 6], dtype=np.int32)\n", + "d_output = cp.empty_like(d_input)\n", + "\n", + "def max_op(a, b):\n", + " return a if a > b else b\n", + "\n", + "# Start with a very small value\n", + "h_init = np.array([-999999], dtype=np.int32)\n", + "\n", + "# Perform inclusive scan with max operation\n", + "comp.inclusive_scan(d_input, d_output, max_op, h_init, len(d_input))\n", + "\n", + "print(f\"Input: {d_input.get()}\")\n", + "print(f\"Running max: {d_output.get()}\")\n", + "\n", + "# Verify with NumPy\n", + "np_running_max = np.maximum.accumulate(d_input.get())\n", + "print(f\"NumPy max: {np_running_max}\")\n", + "print(f\"Match: {np.array_equal(d_output.get(), np_running_max)}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "5e37c3f0-b7cb-45e6-83bf-2bbbeeab74b2", + "metadata": {}, + "source": [ + "# 5. Sorting Algorithms" + ] + }, + { + "cell_type": "markdown", + "id": "7aaa94ff", + "metadata": {}, + "source": [ + "## 5.1 Merge Sort" + ] + }, + { + "cell_type": "markdown", + "id": "e5c4043e-6dec-44f4-ab6b-15f24d0c3cfb", + "metadata": {}, + "source": [ + "The `merge_sort` function can be used to perform key-value sorting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a893a898-e74b-4ea2-b69a-4432a342fead", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input arrays.\n", + "d_in_keys = cp.asarray([-5, 0, 2, -3, 2, -3, 0, -3, -5, 2], dtype=\"int32\")\n", + "d_in_values = cp.asarray(\n", + " [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype=\"float32\"\n", + ")\n", + "\n", + "# Perform the merge sort.\n", + "comp.merge_sort(\n", + " d_in_keys,\n", + " d_in_values,\n", + " d_in_keys, # reuse input array to store output\n", + " d_in_values, # reuse input array to store output\n", + " comp.OpKind.LESS,\n", + " d_in_keys.size,\n", + ")\n", + "\n", + "print(f\"Sorted keys: {d_in_keys.get()}\")\n", + "print(f\"Sorted values: {d_in_values.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "13dafff6-7a05-43d4-bedb-1c6cc0027573", + "metadata": {}, + "source": [ + "If you just want to sort keys (with no corresponding values), just pass `None`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e90bc0c-810e-46d9-9ef9-9f3adaeecd10", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in_keys = cp.asarray([-5, 0, 2, -3, 2, -3, 0, -3, -5, 2], dtype=\"int32\")\n", + "\n", + "print(d_in_keys)\n", + "\n", + "# Perform the merge sort.\n", + "comp.merge_sort(\n", + " d_in_keys,\n", + " None, # don't specify a values array\n", + " d_in_keys, # reuse input array to store output\n", + " None, # don't specify a values array\n", + " comp.OpKind.LESS,\n", + " d_in_keys.size,\n", + ")\n", + "\n", + "print(f\"Sorted keys: {d_in_keys.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "6c6ffd71-90d1-461f-843b-96aca2990206", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### 5.1.1 Exercise: sort by the last digit" + ] + }, + { + "cell_type": "markdown", + "id": "3b91fe2a-169e-4ee6-8408-ee87cf49b481", + 
"metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "In this excercise, you'll use `merge_sort` with a custom comparator function to sort elements by the last digit.\n", + "For example, $[29, 9, 136, 1001, 72, 24, 32, 1] \\rightarrow [1001, 1, 72, 32, 24, 136, 29, 9]$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88e95ff4-013a-4ca9-bdf4-5e122ba0f66a", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in_keys = cp.asarray([29, 9, 136, 1001, 72, 24, 32, 1], dtype=\"int32\")\n", + "\n", + "# define the custom comparator.\n", + "def comparison_op(lhs, rhs):\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + "\n", + "# Perform the merge sort.\n", + "comp.merge_sort(\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + ")\n", + "\n", + "print(f\"Result: {d_in_keys}\")\n", + "expected = np.asarray([1001, 1, 72, 32, 24, 136, 29, 9], dtype=np.int32)\n", + "assert (d_in_keys.get() == expected).all()" + ] + }, + { + "cell_type": "markdown", + "id": "b0b45660-9d80-43a9-9b7b-22eaeed7df4f", + "metadata": {}, + "source": [ + "## 5.2 Radix Sort\n", + "\n", + "The `radix_sort` function provides fast sorting for numeric types using the radix sort algorithm. Unlike merge sort, radix sort doesn't use comparisons but instead processes the bits/digits of numbers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f61f655", + "metadata": {}, + "outputs": [], + "source": [ + "# Basic radix sort example (ascending order)\n", + "d_input = cp.array([64, 34, 25, 12, 22, 11, 90, 5, 77, 30], dtype=np.int32)\n", + "d_output = cp.empty_like(d_input)\n", + "\n", + "print(f\"Input: {d_input.get()}\")\n", + "\n", + "# Sort in ascending order\n", + "comp.radix_sort(\n", + " d_input, # Input keys\n", + " d_output, # Output keys\n", + " None, # Input values (none for keys-only sort)\n", + " None, # Output values (none)\n", + " comp.SortOrder.ASCENDING, # Sort order\n", + " len(d_input) # Number of elements\n", + ")\n", + "\n", + "print(f\"Sorted: {d_output.get()}\")\n", + "\n", + "# Verify sorting\n", + "is_sorted = all(d_output.get()[i] <= d_output.get()[i+1] for i in range(len(d_output.get())-1))\n", + "print(f\"Properly sorted: {is_sorted}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdaa70a2", + "metadata": {}, + "outputs": [], + "source": [ + "# Descending order sort\n", + "d_input = cp.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 3], dtype=np.int32)\n", + "d_output = cp.empty_like(d_input)\n", + "\n", + "comp.radix_sort(\n", + " d_input, d_output, None, None,\n", + " comp.SortOrder.DESCENDING, # Sort in reverse order\n", + " len(d_input)\n", + ")\n", + "\n", + "print(f\"Input: {d_input.get()}\")\n", + "print(f\"Descending sort: {d_output.get()}\")\n", + "\n", + "# Verify descending order\n", + "is_descending = all(d_output.get()[i] >= d_output.get()[i+1] for i in range(len(d_output.get())-1))\n", + "print(f\"Properly descending: {is_descending}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17ce2740", + "metadata": {}, + "outputs": [], + "source": [ + "# Key-value sorting: sort scores while keeping student IDs aligned\n", + "scores = [85, 92, 78, 96, 88, 71, 94]\n", + "student_ids = [101, 102, 103, 104, 105, 106, 107]\n", + "\n", + "d_keys = cp.array(scores, 
dtype=np.int32)\n", + "d_values = cp.array(student_ids, dtype=np.int32)\n", + "d_keys_out = cp.empty_like(d_keys)\n", + "d_values_out = cp.empty_like(d_values)\n", + "\n", + "print(\"Before sorting:\")\n", + "for score, student_id in zip(scores, student_ids):\n", + " print(f\" Student {student_id}: {score}\")\n", + "\n", + "# Sort by scores (highest first), keep student IDs aligned\n", + "comp.radix_sort(\n", + " d_keys, d_keys_out, # Input/output keys (scores)\n", + " d_values, d_values_out, # Input/output values (student IDs)\n", + " comp.SortOrder.DESCENDING, # Highest scores first\n", + " len(d_keys)\n", + ")\n", + "\n", + "sorted_scores = d_keys_out.get()\n", + "sorted_ids = d_values_out.get()\n", + "\n", + "print(\"\\nAfter sorting (by score, highest first):\")\n", + "for score, student_id in zip(sorted_scores, sorted_ids):\n", + " print(f\" Student {student_id}: {score}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "423fd14a", + "metadata": {}, + "source": [ + "# 6. Transformation Algorithms\n" + ] + }, + { + "cell_type": "markdown", + "id": "5c001e63-54ab-4943-83be-7503fea3ae0a", + "metadata": {}, + "source": [ + "## 6.1 Unary and Binary Transform\n", + "\n", + "### Unary transform" + ] + }, + { + "cell_type": "markdown", + "id": "b55614c6-6865-4bb8-941d-ae73290c7a91", + "metadata": {}, + "source": [ + "The `unary_transform` function applies a user-provided unary operation to each element of the input." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b45d79d-87a3-47c2-9681-8810535d3a80", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)\n", + "d_out = cp.empty_like(d_in)\n", + "\n", + "def double_op(a):\n", + " return a * 2\n", + "\n", + "# Perform the unary transform.\n", + "comp.unary_transform(d_in, d_out, double_op, len(d_in))\n", + "print(f\"Result of unary transform: {d_out.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4e413c39-444b-43f8-b5e5-bd1adc18417c", + "metadata": {}, + "source": [ + "### Binary transform" + ] + }, + { + "cell_type": "markdown", + "id": "fa210a46-0c41-4c3f-94bd-954bc6add058", + "metadata": {}, + "source": [ + "The `binary_transform` function applies a user-provided binary operation to pairs of elements from two inputs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "346192c1-2318-437f-aaf5-c8e5f3b21d1e", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in1 = cp.asarray([2, 8, 9, 6, 3], dtype=np.int32)\n", + "d_in2 = cp.asarray([7, 2, 1, 0, -1], dtype=np.int32)\n", + "d_out = cp.empty_like(d_in1)\n", + "\n", + "# Perform the binary transform.\n", + "comp.binary_transform(d_in1, d_in2, d_out, comp.OpKind.PLUS, len(d_in1))\n", + "print(f\"Result of binary transform: {d_out.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cd76102c-c085-4893-b047-5c22eeabb20b", + "metadata": {}, + "source": [ + "## 6.2 Data Normalization with Transform\n", + "\n", + "Transform operations are commonly used in machine learning for data preprocessing, such as normalizing features to have zero mean and unit variance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fd56eac", + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Normalize house prices for machine learning\n", + "house_prices = np.array([250000, 180000, 320000, 420000, 150000, 380000, 220000, 295000], dtype=np.float32)\n", + "d_prices = cp.array(house_prices)\n", + "d_normalized = cp.empty_like(d_prices)\n", + "\n", + "# Calculate statistics for normalization\n", + "price_mean = float(np.mean(house_prices))\n", + "price_std = float(np.std(house_prices))\n", + "\n", + "print(f\"Original prices: {house_prices}\")\n", + "print(f\"Mean: ${price_mean:,.0f}, Std: ${price_std:,.0f}\")\n", + "\n", + "def z_score_normalize(price):\n", + " \"\"\"Z-score normalization: (x - mean) / std\"\"\"\n", + " return (price - price_mean) / price_std\n", + "\n", + "# Apply normalization transformation\n", + "comp.unary_transform(d_prices, d_normalized, z_score_normalize, len(house_prices))\n", + "\n", + "normalized_result = d_normalized.get()\n", + "print(f\"Normalized prices: {normalized_result}\")\n", + "print(f\"Normalized mean: {np.mean(normalized_result):.6f}\")\n", + "print(f\"Normalized std: {np.std(normalized_result):.6f}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "bf385d3d", + "metadata": {}, + "source": [ + "## 6.3 Transform with Iterators for Memory Efficiency\n", + "\n", + "Combining transforms with iterators allows complex computations without storing intermediate arrays.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6d8d045", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate sum of squares from 1 to 1000 without storing intermediate arrays\n", + "def square_func(x):\n", + " return x * x\n", + "\n", + "def add_op(a, b):\n", + " return a + b\n", + "\n", + "# Method 1: Using iterators (memory efficient)\n", + "counting_it = comp.CountingIterator(np.int64(1)) # 1, 2, 3, ...\n", + "squares_it = comp.TransformIterator(counting_it, 
square_func) # 1², 2², 3², ...\n", + "\n", + "d_result = cp.empty(1, dtype=np.int64)\n", + "h_init = np.array([0], dtype=np.int64)\n", + "\n", + "# Sum the squares directly without storing them\n", + "comp.reduce_into(squares_it, d_result, add_op, 1000, h_init)\n", + "iterator_result = d_result.get()[0]\n", + "\n", + "# Mathematical verification: sum of squares = n(n+1)(2n+1)/6\n", + "n = 1000\n", + "formula_result = n * (n + 1) * (2 * n + 1) // 6\n", + "\n", + "print(f\"Sum of squares from 1 to {n}:\")\n", + "print(f\"Iterator result: {iterator_result:,}\")\n", + "print(f\"Formula result: {formula_result:,}\")\n", + "print(f\"Correct: {iterator_result == formula_result}\")\n", + "print(f\"Memory used: Only space for final result (~8 bytes)\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "17282ee7", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# 7. Custom (Struct) Data Types\n" + ] + }, + { + "cell_type": "markdown", + "id": "541308b1-65b3-4856-adb9-aa1f039f0396", + "metadata": {}, + "source": [ + "So far, we've seen how to use `parallel` with input arrays composed of numeric values (ints and floats). A powerful feature of `parallel` is that it can also work with \"struct\" values, i.e., values that are in turn composed of more than one value. " + ] + }, + { + "cell_type": "markdown", + "id": "df4ed2b7-a379-4ff4-941d-f0da11ac0a7a", + "metadata": {}, + "source": [ + "For example, consider a sequence of RGB values, like those used in graphics applications. Each RGB value represents a pixel's color and consists of three components: **red**, **green**, and **blue** intensity levels." + ] + }, + { + "cell_type": "markdown", + "id": "6fa5625f-a13f-468a-baa1-84b7e5abd427", + "metadata": {}, + "source": [ + "The code below shows how you can use `parallel` to find the pixel with the highest **green** intensity level." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9281737-5422-41a3-8052-7ade54a96de0", + "metadata": {}, + "outputs": [], + "source": [ + "# use `@gpu_struct` to define the data type of each value:\n", + "@comp.gpu_struct\n", + "class Pixel:\n", + " r: np.int32\n", + " g: np.int32\n", + " b: np.int32\n", + "\n", + "# Define a reduction operation that operates on two `Pixel` objects:\n", + "def max_g_value(x, y):\n", + " return x if x.g > y.g else y\n", + "\n", + "# Prepare the input and output arrays. These are just CuPy arrays:\n", + "dtype = np.dtype([(\"r\", np.int32), (\"g\", np.int32), (\"b\", np.int32)], align=True) # alternately, use `Pixel.dtype`\n", + "d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(dtype)\n", + "d_out = cp.empty(1, dtype)\n", + "\n", + "# Define the initial value for the reduction. This must be a `Pixel` object:\n", + "h_init = Pixel(0, 0, 0)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(d_rgb, d_out, max_g_value, d_rgb.size, h_init)\n", + "\n", + "# Verify the result.\n", + "print(f\"Input RGB values: \\n {d_rgb.get()}\")\n", + "result = d_out.get()\n", + "print(f\"Pixel with greatest 'g' intensity: {result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2567bc4d", + "metadata": {}, + "source": [ + "# 8. Working with Iterators\n", + "\n", + "Now you have a taste for how to use `parallel` with **custom ops** and **custom data types**. _Iterators_ are another powerful tool in your toolbox for solving more complex problems.\n", + "\n", + "Iterators represent streams of data that are computed \"on-the-fly\". Unlike arrays, iterators do not require any memory allocation, and thus can represent huge sequences without consuming valuable GPU memory. 
Iterators can be used as inputs (and sometimes outputs) to algorithms in place of arrays.\n", + "\n", + "Note that \"iterators\" in the context of the `parallel` library are distinct from the concept of [iterators](https://docs.python.org/3/glossary.html#term-iterator) in the Python language. " + ] + }, + { + "cell_type": "markdown", + "id": "31f350bc-3825-49c3-a858-647aa34177aa", + "metadata": {}, + "source": [ + "## 8.1 CountingIterator and ConstantIterator" + ] + }, + { + "cell_type": "markdown", + "id": "514e4183-fc23-4355-8ad4-7b9989c9e4f4", + "metadata": {}, + "source": [ + "A `CountingIterator` represents the sequence `a, a + 1, a + 2, a + 3, ...`. In the following example, we use a `CountingIterator` as the input to `reduce_into` to compute the sum $1 + 2 + 3 + 4 + 5 = 15$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "604e5f6b", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Prepare the inputs and outputs:\n", + "it_input = comp.CountingIterator(np.int32(1)) # represents the sequence 1, 2, 3, ....\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, 5, h_init) # compute the reduction for `5` input items\n", + "\n", + "print(f\"Sum: {d_output.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "dec7e377", + "metadata": {}, + "source": [ + "A `ConstantIterator` represents the sequence `a, a, a, ...`. In the following example, we use a `ConstantIterator` as one of the inputs to `binary_transform`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcff9cb6-ac1d-43e6-b439-ba7c89029b8c", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the input and output arrays.\n", + "d_in1 = cp.asarray([2, 8, 9, 6, 3], dtype=np.int32)\n", + "it_in2 = comp.ConstantIterator(np.int32(1))\n", + "d_out = cp.empty_like(d_in1)\n", + "\n", + "# Perform the binary transform.\n", + "comp.binary_transform(d_in1, it_in2, d_out, comp.OpKind.PLUS, len(d_in1))\n", + "print(f\"Result of binary transform: {d_out.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "84e88c3f-8325-47f5-a759-3355b73083f3", + "metadata": {}, + "source": [ + "## 8.2 TransformIterator" + ] + }, + { + "cell_type": "markdown", + "id": "f8231ca7-8daf-4208-9739-b77fd06719f9", + "metadata": {}, + "source": [ + "`TransformIterator` provides a way to compose operations by applying a function to each element as it's accessed. The following code is similar to the `CountingIterator` example above, but it wraps the iterator with a `TransformIterator` to compute the sum $1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21ce538d-260c-49a5-83f7-3988aa536803", + "metadata": {}, + "outputs": [], + "source": [ + "# Define the transform operation.\n", + "def square(a):\n", + " return a**2\n", + "\n", + "# prepare the inputs and output.\n", + "it_count = comp.CountingIterator(np.int32(1)) # represents the sequence 1, 2, 3, ....\n", + "it_input = comp.TransformIterator(it_count, square) # represents the sequence 1**2, 2**2, 3**2, ...\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, 5, h_init) # compute the reduction for `5` input items\n", + "\n", + "print(f\"Sum: {d_output.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a3c05bf4-f477-456b-8238-aa5fc8bbce18", + "metadata": {}, + "source": [ + "You can also wrap an array with a `TransformIterator`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2301f1f-5a56-4075-9c46-c001bc9bab05", + "metadata": {}, + "outputs": [], + "source": [ + "d_arr = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", + "it_input = comp.TransformIterator(d_arr, square) # represents the sequence [2**2, 3**2, ... 4**2]\n", + "d_output = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, len(d_arr), h_init)\n", + "\n", + "print(f\"Sum: {d_output.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "10c7736a-248b-4512-a262-cf452c46de8e", + "metadata": {}, + "source": [ + "Finally, you can use `TransformOutputIterator` as the output of an algorithm, to apply a function to the result as it's being written.\n", + "\n", + "⚠️ Note that when using `TransformOutputIterator`, you must currently provide explicit type annotations for the transform function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1142521-452a-4a3a-9f4d-2a1f01251a3d", + "metadata": {}, + "outputs": [], + "source": [ + "d_arr = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.float32)\n", + "it_input = comp.TransformIterator(d_arr, square) # represents the sequence [2**2, 3**2, ... 4**2]\n", + "d_out = cp.empty(1, dtype=np.float32)\n", + "\n", + "# provide type annotations when using `TransformOutputIterator`\n", + "def sqrt(a: np.float32) -> np.float32:\n", + " return a**0.5\n", + "\n", + "it_output = comp.TransformOutputIterator(d_out, sqrt)\n", + "\n", + "h_init = np.array([0], dtype=np.float32)\n", + "\n", + "# Perform the reduction.\n", + "comp.reduce_into(it_input, it_output, comp.OpKind.PLUS, len(d_arr), h_init) # compute the reduction over all `len(d_arr)` input items\n", + "\n", + "print(f\"Result (sqrt of sum of squares): {d_out.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d98c08e0-629b-4428-96c9-59cc3c132b2d", + "metadata": {}, + "source": [ + "## 8.3 ZipIterator" + ] + }, + { + "cell_type": "markdown", + "id": "8a3d7a82-a627-41dc-923d-242a23eae810", + "metadata": {}, + "source": [ + "A `ZipIterator` combines multiple iterators (or arrays) into a single iterator. 
To access the individual components of any element of a `ZipIterator`, use numeric indexing:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "090a5d75-1518-4102-8955-6a8e99e6d617", + "metadata": {}, + "outputs": [], + "source": [ + "d_in1 = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", + "d_in2 = cp.asarray([7, 7, 9, 3, 1, 2, 6, 0], dtype=np.int32)\n", + "it_in3 = comp.CountingIterator(np.int32(0))\n", + "it_input = comp.ZipIterator(d_in1, d_in2, it_in3)\n", + "\n", + "def op(x):\n", + " return x[0] + x[1] + x[2]\n", + "\n", + "d_output = cp.empty_like(d_in1)\n", + "comp.unary_transform(it_input, d_output, op, len(d_in1))\n", + "\n", + "print(f\"Result: {d_output.get()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e0cf37c9-5d7e-4b2d-a619-e4bd8dbeab46", + "metadata": {}, + "source": [ + "In the example below, we compute the `min` and `max` of a sequence within a single call to `reduce_into`, using `ZipIterator`. Note the need to define `MinMax` to specify the output type of `minmax_op`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b44215d1-8bad-454b-8a0e-a87895dc1da5", + "metadata": {}, + "outputs": [], + "source": [ + "@comp.gpu_struct\n", + "class MinMax:\n", + " min_value: np.int32\n", + " max_value: np.int32\n", + "\n", + "def minmax_op(x, y):\n", + " return MinMax(min(x[0], y[0]), max(x[1], y[1]))\n", + "\n", + "d_in = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", + "\n", + "it_input = comp.ZipIterator(d_in, d_in)\n", + "d_output = cp.empty(2, dtype=np.int32).view(MinMax.dtype)\n", + "\n", + "SMALLEST_INT = np.iinfo(np.int32).min\n", + "LARGEST_INT = np.iinfo(np.int32).max\n", + "h_init = MinMax(LARGEST_INT, SMALLEST_INT)\n", + "\n", + "comp.reduce_into(it_input, d_output, minmax_op, len(d_in), h_init)\n", + "\n", + "print(f\"Min value: {d_output.get()[0]['min_value']}\")\n", + "print(f\"Max value: {d_output.get()[0]['max_value']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "df2acfef-656b-4c07-9b29-6e8de94add5e", + "metadata": {}, + "source": [ + "### Iterator Composition\n", + "\n", + "You can chain multiple iterator types together to create sophisticated data processing pipelines without intermediate storage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82a88473", + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Sum of squares of even numbers from 1 to 20\n", + "def square_if_even(x):\n", + " \"\"\"Square the number if it's even, otherwise return 0\"\"\"\n", + " return (x * x) if (x % 2 == 0) else 0\n", + "\n", + "def add_op(a, b):\n", + " return a + b\n", + "\n", + "# Chain operations: generate numbers → filter/square evens → sum\n", + "counting_it = comp.CountingIterator(np.int32(1)) # 1, 2, 3, ..., 20\n", + "transform_it = comp.TransformIterator(counting_it, square_if_even) # 0, 4, 0, 16, 0, 36, ...\n", + "\n", + "d_result = cp.empty(1, dtype=np.int32)\n", + "h_init = np.array([0], dtype=np.int32)\n", + "\n", + "comp.reduce_into(transform_it, d_result, add_op, 20, h_init)\n", + "\n", + "# Verify: even numbers 2,4,6,8,10,12,14,16,18,20 -> squares 4,16,36,64,100,144,196,256,324,400\n", + "evens = [x for x in range(1, 21) if x % 2 == 0]\n", + "expected = sum(x * x for x in evens)\n", + "\n", + "print(f\"Numbers 1-20: even squares sum\")\n", + "print(f\"Even numbers: {evens}\")\n", + "print(f\"Their squares: {[x*x for x in evens]}\")\n", + "print(f\"Iterator result: {d_result.get()[0]}\")\n", + "print(f\"Expected result: {expected}\")\n", + "print(f\"Correct: {d_result.get()[0] == expected}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "8c859995", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# 9. Capstone Exercise: Implementing Running Average\n" + ] + }, + { + "cell_type": "markdown", + "id": "13f16d8f-76e9-40d6-b9ed-159c68379d02", + "metadata": {}, + "source": [ + "In this example, you'll implement the running average of a sequence, using a single call to the [inclusive_scan](https://nvidia.github.io/cccl/python/parallel_api.html#cuda.compute.algorithms.inclusive_scan) API. 
To do this, you'll have to piece together many of the concepts we've learned about so far." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ee1a1e5-ef83-470d-8991-a2808b8b6ab9", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "@comp.gpu_struct\n", + "class SumAndCount:\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + "\n", + "def reduce_op(x, y) -> SumAndCount:\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + "\n", + "def compute_running_average(x: SumAndCount) -> np.float32:\n", + " # begin TODO\n", + "\n", + " # end TODO\n", + "\n", + "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.float32)\n", + "d_output = cp.empty(len(d_input), dtype=np.float32)\n", + "h_init = SumAndCount(0, 0)\n", + "\n", + "it_input = comp.ZipIterator(d_input, comp.ConstantIterator(np.int32(1)))\n", + "it_output = comp.TransformOutputIterator(d_output, compute_running_average)\n", + "\n", + "# Perform the reduction.\n", + "comp.inclusive_scan(it_input, it_output, reduce_op, h_init, len(d_input))\n", + "\n", + "print(d_input)\n", + "\n", + "h_input = d_input.get()\n", + "expected = h_input.cumsum() / np.arange(1, len(h_input) + 1)\n", + "\n", + "print(f\"Running average result: {d_output}\")\n", + "np.testing.assert_allclose(d_output.get(), expected)" + ] + }, + { + "cell_type": "markdown", + "id": "55a22e1a", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Resources\n", + "\n", + "* `cuda-cccl` Documentation: https://nvidia.github.io/cccl/python/\n", + "* `parallel` API Reference: https://nvidia.github.io/cccl/python/parallel_api.html#cuda-cccl-parallel-api-reference" + ] } - }, - "source": [ - "import os\n", - "\n", - "if os.getenv(\"COLAB_RELEASE_TAG\") and not os.path.exists(\"/ach-installed\"): # If running in Google Colab:\n", - " !pip uninstall \"cuda-python\" --yes > /dev/null\n", - " !pip install 
\"numba-cuda\" \"cuda-cccl[test-cu12]\" > /dev/null 2>&1\n", - " open(\"/ach-installed\", \"a\").close()" - ] - }, - { - "cell_type": "markdown", - "id": "10355920-9bfc-4788-bfe7-ab99440a6d98", - "metadata": {}, - "source": [ - "The `[test-cu12]` extras installs CuPy, which we will use in our examples. It is not strictly a dependency of `cuda-cccl` - you can use any array-like object (like PyTorch tensors) as well." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bca5a026-db41-4079-808e-b8c3c7196c7a", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import cupy as cp\n", - "import cuda.compute as comp" - ] - }, - { - "cell_type": "markdown", - "id": "01908d54", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Hello `cccl`: Simple Reductions" - ] - }, - { - "cell_type": "markdown", - "id": "2d468957-1411-4cae-a050-a75692452cc2", - "metadata": {}, - "source": [ - "A **reduction** takes many values and combines them into a single result using a binary operation.\n", - "\n", - "As a simple example, consider a sequence of values like $[2, 3, 5, 1, 7, 6, 8, 4]$. The *sum* of the values of that sequence is a reduction using _addition_ as the binary operation: $(2 + 3 + 5 + 1 + 7 + 6 + 8 + 4) = 36$. Similarly, the *maximum value* can be obtained by performing a reduction using `max(a, b)` as the binary operation." - ] - }, - { - "cell_type": "markdown", - "id": "83065c06-aeab-4d26-9d18-69edb8462c2d", - "metadata": {}, - "source": [ - "A reduction can be computed in parallel. Typically this is done using a \"tree\" reduction where elements are combined in pairs across multiple levels, resembling the structure of a binary tree. At each level, the number of elements is halved as partial results are computed in parallel. 
This continues until a single final result is obtained at the root of the tree.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "01d0feb1-7ed0-42dc-bb83-8b017d89a2a6", - "metadata": {}, - "source": [ - "If you know some CUDA, you can quite easily write a kernel to implement this kind of parallel reduction. However, optimizing it for the specific CUDA architecture of your device, and generalizing for different data types and sizes can be difficult.\n", - "\n", - "This is where `cuda.compute` comes in. It provides optimized implementations of algorithms like reduction that give the best possible performance." - ] - }, - { - "cell_type": "markdown", - "id": "23cb8fb8-8e35-4179-8a66-2c3b5e6077ae", - "metadata": {}, - "source": [ - "### Using `reduce_into()` to compute the sum of a sequence" - ] - }, - { - "cell_type": "markdown", - "id": "9c8f6367-d7f9-4030-bc57-4b8920299b47", - "metadata": {}, - "source": [ - "`cuda.compute` provides a `reduce_into()` function to compute general reductions:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adce5791", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "\"\"\"\n", - "Using `reduce_into()` to compute the sum of a sequence\n", - "\"\"\"\n", - "\n", - "# Prepare the inputs and outputs.\n", - "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32) # input sequence, a CuPy (device) array\n", - "d_output = cp.empty(1, dtype=np.int32) # array which will hold the result, a CuPy (device) array of size 1\n", - "h_init = np.array([0], dtype=np.int32) # initial value of the reduction, a NumPy (host) array of size 1\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(d_input, d_output, comp.OpKind.PLUS, len(d_input), h_init)\n", - "\n", - "print(d_input)\n", - "# Verify the result.\n", - "expected_output = 36\n", - "assert (d_output == expected_output).all()\n", - "result = d_output[0]\n", - 
"print(f\"Sum reduction result: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "2f00121b-bfb1-4b37-9651-230386d9c256", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "### Exercise: computing the minimum value" - ] - }, - { - "cell_type": "markdown", - "id": "8f5e9ffc-89cd-4859-97ea-93abbb8b3f4b", - "metadata": {}, - "source": [ - "`reduce_into()` can be used to compute other reductions " - ] - }, - { - "cell_type": "markdown", - "id": "9241706a-b152-41b3-bba7-d281c9e43675", - "metadata": {}, - "source": [ - "Similar to the examples above, below is an incomplete code snippet for computing the minimum value of a sequence. Complete the section between the comments `begin TODO` and `end TODO` to use `reduce_into()` to compute the minimum." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "44cb3daf-1a82-4af3-965e-b0d4be56b17e", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Using `reduce_into()` to compute the minimum value of a sequence\n", - "\"\"\"\n", - "\n", - "d_input = cp.array([-2, 3, 5, 1, 7, -6, 8, -4], dtype=np.int32)\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "\n", - "# begin TODO\n", - "\n", - "\n", - "# end TODO\n", - "\n", - "expected_output = -6\n", - "assert (d_output == expected_output).all()\n", - "result = d_output[0]\n", - "print(f\"Min reduction result: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "9839aacd-7256-484c-8205-e48068f3217b", - "metadata": {}, - "source": [ - "## Custom Reductions" - ] - }, - { - "cell_type": "markdown", - "id": "9b74da96-38ff-434a-87fa-bba49e37bf5a", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "### Example: sum of even values" - ] - }, - { - "cell_type": "markdown", - "id": "a0395884-0786-4b75-9442-046609041439", - "metadata": {}, - "source": [ - "At this point, you might be thinking:" - ] - }, - { - "cell_type": 
"markdown", - "id": "f09ef69c-07d1-4ca7-af22-89f105d7c532", - "metadata": {}, - "source": [ - "> **_Umm, can't I just use CuPy or PyTorch to compute sum or max?_**" - ] - }, - { - "cell_type": "markdown", - "id": "cd42b854-3576-4cf8-9880-f1aed514a10b", - "metadata": {}, - "source": [ - "Of course, given a CuPy array, it's trivial to do simple reductions like `sum`, `min` or `max`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61248d58-dcb7-4bf2-9b8b-9669ea2cd3d6", - "metadata": {}, - "outputs": [], - "source": [ - "d_input = cp.array([-2, 3, 5, 1, 7, -6, 8, -4], dtype=np.int32)\n", - "\n", - "print(f\"Sum using cp.sum: {cp.sum(d_input)}\")\n", - "print(f\"Max value using cp.max: {cp.max(d_input)}\")\n", - "print(f\"Min value using cp.min: {cp.min(d_input)}\")" - ] - }, - { - "cell_type": "markdown", - "id": "b96e0729-87d3-4423-ac13-28c5d34e3786", - "metadata": {}, - "source": [ - "The benefit of `cuda-cccl` is more apparent when you want to do custom operations. For example, rather than just computing a straightforward `sum`, let's say we wanted to compute the sum of **only even values** in a sequence. 
Naively, here's how to do that with CuPy:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "929b31ba-ce77-4fcd-9750-5efa0c13c346", - "metadata": {}, - "outputs": [], - "source": [ - "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32)\n", - "result = (d_input[d_input % 2 == 0]).sum()\n", - "print(f\"Sum of even values with CuPy: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "48fd15a3-f616-4337-a7d7-c81c999a73f7", - "metadata": {}, - "source": [ - "Now, let's do the same thing with `parallel`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "03947ed8-c6f9-49ae-b382-db7a0ef14931", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Using `reduce_into()` with a custom binary operation\n", - "\"\"\"\n", - "\n", - "# Define a custom binary operation for the reduction.\n", - "def sum_even_op(a, b):\n", - " return (a if a % 2 == 0 else 0) + (b if b % 2 == 0 else 0)\n", - "\n", - "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.int32)\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "# Call `reduce_into()` passing the function above for the binary operation:\n", - "comp.reduce_into(d_input, d_output, sum_even_op, len(d_input), h_init)\n", - "result = d_output.get()[0]\n", - "print(f\"Sum of even values with `cuda.compute`: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "e7c9205d-7668-4a72-a01d-f57e2d1a3bb1", - "metadata": {}, - "source": [ - "We got the same result using `cuda.compute`, but we had to write significantly more code. Is it worth it? 
Below is a small benchmarking script comparing timings for a range of input sizes:" - ] - }, - { - "cell_type": "markdown", - "id": "43afe1e9-c870-43e0-b884-2c7f93581869", - "metadata": {}, - "source": [ - "### Comparing custom reduction performance with naive CuPy implementation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "729a1867-3a21-4f09-af97-f02d0a562d9f", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Compare the performance of the `parallel` implementation with a naive CuPy implementation\n", - "\"\"\"\n", - "\n", - "import timeit\n", - "\n", - "def evens_sum_cupy(d_input, d_output, h_init):\n", - " # ignore h_init\n", - " cp.sum(d_input[d_input % 2 == 0], out=d_output[0])\n", - "\n", - "def evens_sum_cccl(d_input, d_output, h_init):\n", - " # note, using `op` as the binary operation, rather than `OpKind.PLUS`:\n", - " comp.reduce_into(d_input, d_output, sum_even_op, len(d_input), h_init)\n", - "\n", - "def time_gpu_func(f, *args, **kwargs):\n", - " cp.cuda.Device().synchronize()\n", - " t1 = timeit.default_timer()\n", - " n = 1_000\n", - " for i in range(n):\n", - " f(*args, **kwargs)\n", - " cp.cuda.Device().synchronize()\n", - " t2 = timeit.default_timer()\n", - " return t2 - t1\n", - "\n", - "sizes = [10_000, 100_000, 1_000_000, 10_000_000, 100_000_000]\n", - "cccl_times = []\n", - "cp_times = []\n", - "\n", - "for n in sizes:\n", - " d_input = cp.random.randint(low=0, high=10, size=n, dtype=np.int32)\n", - " d_out = cp.empty(1, dtype=np.int32)\n", - " h_init = np.array([0], dtype=np.int32)\n", - "\n", - " cccl_times.append(time_gpu_func(evens_sum_cccl, d_input, d_out, h_init))\n", - " cp_times.append(time_gpu_func(evens_sum_cupy, d_input, d_out, h_init))\n", - "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "# Plotting\n", - "fig = plt.figure(figsize=(10, 5))\n", - "plt.loglog(sizes, cccl_times, marker='o', label='cuda.ccl')\n", - "plt.loglog(sizes, cp_times, marker='s', label='CuPy')\n", - "\n", - 
"# Annotate each cuda.ccl point with speedup vs CuPy\n", - "for x, t_cccl, t_cp in zip(sizes, cccl_times, cp_times):\n", - " speedup = t_cp / t_cccl\n", - " label = f\"{speedup:.1f}x faster\"\n", - " plt.annotate(label,\n", - " (x, t_cccl),\n", - " textcoords=\"offset points\",\n", - " xytext=(5, -10), # offset position\n", - " ha='left',\n", - " fontsize=9,\n", - " color='green')\n", - "\n", - "# Labels and title\n", - "plt.xlabel('Input Size')\n", - "plt.ylabel('Time (seconds)')\n", - "plt.title('Timing Comparison for evens_sum.')\n", - "plt.legend()\n", - "plt.grid(True)\n", - "plt.tight_layout()\n" - ] - }, - { - "cell_type": "markdown", - "id": "a0a0d085-75ab-4836-afb6-72ec1abd1d6a", - "metadata": {}, - "source": [ - "We see that using `cuda.compute` is much faster than our naive CuPy approach. This is because:\n", - "\n", - "* Operator fusion: the CuPy operation `x[x % 2 == 0]).sum()` is actually 4 separate operations (and at least 4 separate CUDA kernel invocations). With `cuda.compute`, we have a single call to `reduce_into()` that does all the computation.\n", - "* No intermediate memory allocations.\n", - "* Lesser Python overhead: `cuda.compute` is a lower-level library. You don't have to jump through multiple layers of Python before invoking device code." - ] - }, - { - "cell_type": "markdown", - "id": "a987e11b-777b-4da3-81f8-5808c0dd8836", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Scanning\n", - "\n", - "### What is a Scan?\n", - "\n", - "A **scan** (also called prefix sum) computes a running total of elements. 
For each position, it shows the cumulative result up to that point.\n", - "\n", - "**Two types of scans:**\n", - "* **Inclusive scan**: Includes the current element in the sum\n", - "* **Exclusive scan**: Excludes the current element (shifts results)\n", - "\n", - "**Visual example:**\n", - "\n", - "```\n", - "Input: [3, 1, 4, 1, 5]\n", - "Inclusive: [3, 4, 8, 9, 14] (3, 3+1, 3+1+4, 3+1+4+1, 3+1+4+1+5)\n", - "Exclusive: [0, 3, 4, 8, 9] (0, 3, 3+1, 3+1+4, 3+1+4+1)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fca0df7f", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "d_input = cp.array([3, 1, 4, 1, 5, 9, 2, 6], dtype=np.int32)\n", - "d_inclusive = cp.empty_like(d_input)\n", - "d_exclusive = cp.empty_like(d_input)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "def add_op(a, b):\n", - " return a + b\n", - "\n", - "comp.inclusive_scan(d_input, d_inclusive, add_op, h_init, len(d_input))\n", - "comp.exclusive_scan(d_input, d_exclusive, add_op, h_init, len(d_input))\n", - "\n", - "print(f\"Input: {d_input.get()}\")\n", - "print(f\"Inclusive scan: {d_inclusive.get()}\")\n", - "print(f\"Exclusive scan: {d_exclusive.get()}\")\n", - "\n", - "# Verify with NumPy\n", - "np_inclusive = np.cumsum(d_input.get())\n", - "np_exclusive = np.concatenate([[0], np_inclusive[:-1]])\n", - "np.testing.assert_allclose(d_inclusive.get(), np_inclusive)\n", - "np.testing.assert_allclose(d_exclusive.get(), np_exclusive)\n", - "print(f\"NumPy inclusive: {np_inclusive}\")\n", - "print(f\"NumPy exclusive: {np_exclusive}\")" - ] - }, - { - "cell_type": "markdown", - "id": "d5c4b010", - "metadata": {}, - "source": [ - "### Maximum Scan Example\n", - "\n", - "Scans aren't limited to addition. 
Here's an example using maximum operation to find running maximum.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "088f30c8", - "metadata": {}, - "outputs": [], - "source": [ - "# Running maximum example\n", - "d_input = cp.array([3, 7, 2, 9, 1, 8, 4, 6], dtype=np.int32)\n", - "d_output = cp.empty_like(d_input)\n", - "\n", - "def max_op(a, b):\n", - " return a if a > b else b\n", - "\n", - "# Start with a very small value\n", - "h_init = np.array([-999999], dtype=np.int32)\n", - "\n", - "# Perform inclusive scan with max operation\n", - "comp.inclusive_scan(d_input, d_output, max_op, h_init, len(d_input))\n", - "\n", - "print(f\"Input: {d_input.get()}\")\n", - "print(f\"Running max: {d_output.get()}\")\n", - "\n", - "# Verify with NumPy\n", - "np_running_max = np.maximum.accumulate(d_input.get())\n", - "print(f\"NumPy max: {np_running_max}\")\n", - "print(f\"Match: {np.array_equal(d_output.get(), np_running_max)}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "5e37c3f0-b7cb-45e6-83bf-2bbbeeab74b2", - "metadata": {}, - "source": [ - "## Sorting" - ] - }, - { - "cell_type": "markdown", - "id": "7aaa94ff", - "metadata": {}, - "source": [ - "### Merge Sort" - ] - }, - { - "cell_type": "markdown", - "id": "e5c4043e-6dec-44f4-ab6b-15f24d0c3cfb", - "metadata": {}, - "source": [ - "The `merge_sort` function can be used to perform key-value sorting." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a893a898-e74b-4ea2-b69a-4432a342fead", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input arrays.\n", - "d_in_keys = cp.asarray([-5, 0, 2, -3, 2, -3, 0, -3, -5, 2], dtype=\"int32\")\n", - "d_in_values = cp.asarray(\n", - " [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype=\"float32\"\n", - ")\n", - "\n", - "# Perform the merge sort.\n", - "comp.merge_sort(\n", - " d_in_keys,\n", - " d_in_values,\n", - " d_in_keys, # reuse input array to store output\n", - " d_in_values, # reuse input array to store output\n", - " comp.OpKind.LESS,\n", - " d_in_keys.size,\n", - ")\n", - "\n", - "print(f\"Sorted keys: {d_in_keys.get()}\")\n", - "print(f\"Sorted values: {d_in_values.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "13dafff6-7a05-43d4-bedb-1c6cc0027573", - "metadata": {}, - "source": [ - "If you just want to sort keys (with no corresponding values), just pass `None`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e90bc0c-810e-46d9-9ef9-9f3adaeecd10", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in_keys = cp.asarray([-5, 0, 2, -3, 2, -3, 0, -3, -5, 2], dtype=\"int32\")\n", - "\n", - "print(d_in_keys)\n", - "\n", - "# Perform the merge sort.\n", - "comp.merge_sort(\n", - " d_in_keys,\n", - " None, # don't specify a values array\n", - " d_in_keys, # reuse input array to store output\n", - " None, # don't specify a values array\n", - " comp.OpKind.LESS,\n", - " d_in_keys.size,\n", - ")\n", - "\n", - "print(f\"Sorted keys: {d_in_keys.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "6c6ffd71-90d1-461f-843b-96aca2990206", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "#### Exercise: sort by the last digit" - ] - }, - { - "cell_type": "markdown", - "id": "3b91fe2a-169e-4ee6-8408-ee87cf49b481", - "metadata": { 
- "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "In this excercise, you'll use `merge_sort` with a custom comparator function to sort elements by the last digit.\n", - "For example, $[29, 9, 136, 1001, 72, 24, 32, 1] \\rightarrow [1001, 1, 72, 32, 24, 136, 29, 9]$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88e95ff4-013a-4ca9-bdf4-5e122ba0f66a", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in_keys = cp.asarray([29, 9, 136, 1001, 72, 24, 32, 1], dtype=\"int32\")\n", - "\n", - "# define the custom comparator.\n", - "def comparison_op(lhs, rhs):\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - "\n", - "# Perform the merge sort.\n", - "comp.merge_sort(\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - ")\n", - "\n", - "print(f\"Result: {d_in_keys}\")\n", - "expected = np.asarray([1001, 1, 72, 32, 24, 136, 29, 9], dtype=np.int32)\n", - "assert (d_in_keys.get() == expected).all()" - ] - }, - { - "cell_type": "markdown", - "id": "b0b45660-9d80-43a9-9b7b-22eaeed7df4f", - "metadata": {}, - "source": [ - "### Radix Sort\n", - "\n", - "The `radix_sort` function provides fast sorting for numeric types using the radix sort algorithm. Unlike merge sort, radix sort doesn't use comparisons but instead processes the bits/digits of numbers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f61f655", - "metadata": {}, - "outputs": [], - "source": [ - "# Basic radix sort example (ascending order)\n", - "d_input = cp.array([64, 34, 25, 12, 22, 11, 90, 5, 77, 30], dtype=np.int32)\n", - "d_output = cp.empty_like(d_input)\n", - "\n", - "print(f\"Input: {d_input.get()}\")\n", - "\n", - "# Sort in ascending order\n", - "comp.radix_sort(\n", - " d_input, # Input keys\n", - " d_output, # Output keys\n", - " None, # Input values (none for keys-only sort)\n", - " None, # Output values (none)\n", - " comp.SortOrder.ASCENDING, # Sort order\n", - " len(d_input) # Number of elements\n", - ")\n", - "\n", - "print(f\"Sorted: {d_output.get()}\")\n", - "\n", - "# Verify sorting\n", - "is_sorted = all(d_output.get()[i] <= d_output.get()[i+1] for i in range(len(d_output.get())-1))\n", - "print(f\"Properly sorted: {is_sorted}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdaa70a2", - "metadata": {}, - "outputs": [], - "source": [ - "# Descending order sort\n", - "d_input = cp.array([3, 1, 4, 1, 5, 9, 2, 6, 5, 3], dtype=np.int32)\n", - "d_output = cp.empty_like(d_input)\n", - "\n", - "comp.radix_sort(\n", - " d_input, d_output, None, None,\n", - " comp.SortOrder.DESCENDING, # Sort in reverse order\n", - " len(d_input)\n", - ")\n", - "\n", - "print(f\"Input: {d_input.get()}\")\n", - "print(f\"Descending sort: {d_output.get()}\")\n", - "\n", - "# Verify descending order\n", - "is_descending = all(d_output.get()[i] >= d_output.get()[i+1] for i in range(len(d_output.get())-1))\n", - "print(f\"Properly descending: {is_descending}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17ce2740", - "metadata": {}, - "outputs": [], - "source": [ - "# Key-value sorting: sort scores while keeping student IDs aligned\n", - "scores = [85, 92, 78, 96, 88, 71, 94]\n", - "student_ids = [101, 102, 103, 104, 105, 106, 107]\n", - "\n", - "d_keys = cp.array(scores, 
dtype=np.int32)\n", - "d_values = cp.array(student_ids, dtype=np.int32)\n", - "d_keys_out = cp.empty_like(d_keys)\n", - "d_values_out = cp.empty_like(d_values)\n", - "\n", - "print(\"Before sorting:\")\n", - "for score, student_id in zip(scores, student_ids):\n", - " print(f\" Student {student_id}: {score}\")\n", - "\n", - "# Sort by scores (highest first), keep student IDs aligned\n", - "comp.radix_sort(\n", - " d_keys, d_keys_out, # Input/output keys (scores)\n", - " d_values, d_values_out, # Input/output values (student IDs)\n", - " comp.SortOrder.DESCENDING, # Highest scores first\n", - " len(d_keys)\n", - ")\n", - "\n", - "sorted_scores = d_keys_out.get()\n", - "sorted_ids = d_values_out.get()\n", - "\n", - "print(\"\\nAfter sorting (by score, highest first):\")\n", - "for score, student_id in zip(sorted_scores, sorted_ids):\n", - " print(f\" Student {student_id}: {score}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "423fd14a", - "metadata": {}, - "source": [ - "## Transforming\n" - ] - }, - { - "cell_type": "markdown", - "id": "5c001e63-54ab-4943-83be-7503fea3ae0a", - "metadata": {}, - "source": [ - "#### Unary transform" - ] - }, - { - "cell_type": "markdown", - "id": "b55614c6-6865-4bb8-941d-ae73290c7a91", - "metadata": {}, - "source": [ - "The `unary_transform` function applies a user-provided unary operation to each element of the input." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b45d79d-87a3-47c2-9681-8810535d3a80", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in = cp.asarray([1, 2, 3, 4, 5], dtype=np.int32)\n", - "d_out = cp.empty_like(d_in)\n", - "\n", - "def double_op(a):\n", - " return a * 2\n", - "\n", - "# Perform the unary transform.\n", - "comp.unary_transform(d_in, d_out, double_op, len(d_in))\n", - "print(f\"Result of unary transform: {d_out.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "4e413c39-444b-43f8-b5e5-bd1adc18417c", - "metadata": {}, - "source": [ - "#### Binary transform" - ] - }, - { - "cell_type": "markdown", - "id": "fa210a46-0c41-4c3f-94bd-954bc6add058", - "metadata": {}, - "source": [ - "The `binary_transform` function applies a user-provided binary operation to pairs of elements from two inputs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "346192c1-2318-437f-aaf5-c8e5f3b21d1e", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in1 = cp.asarray([2, 8, 9, 6, 3], dtype=np.int32)\n", - "d_in2 = cp.asarray([7, 2, 1, 0, -1], dtype=np.int32)\n", - "d_out = cp.empty_like(d_in1)\n", - "\n", - "# Perform the binary transform.\n", - "comp.binary_transform(d_in1, d_in2, d_out, comp.OpKind.PLUS, len(d_in1))\n", - "print(f\"Result of binary transform: {d_out.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "cd76102c-c085-4893-b047-5c22eeabb20b", - "metadata": {}, - "source": [ - "#### Data Normalization with Transform\n", - "\n", - "Transform operations are commonly used in machine learning for data preprocessing, such as normalizing features to have zero mean and unit variance." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fd56eac", - "metadata": {}, - "outputs": [], - "source": [ - "# Example: Normalize house prices for machine learning\n", - "house_prices = np.array([250000, 180000, 320000, 420000, 150000, 380000, 220000, 295000], dtype=np.float32)\n", - "d_prices = cp.array(house_prices)\n", - "d_normalized = cp.empty_like(d_prices)\n", - "\n", - "# Calculate statistics for normalization\n", - "price_mean = float(np.mean(house_prices))\n", - "price_std = float(np.std(house_prices))\n", - "\n", - "print(f\"Original prices: {house_prices}\")\n", - "print(f\"Mean: ${price_mean:,.0f}, Std: ${price_std:,.0f}\")\n", - "\n", - "def z_score_normalize(price):\n", - " \"\"\"Z-score normalization: (x - mean) / std\"\"\"\n", - " return (price - price_mean) / price_std\n", - "\n", - "# Apply normalization transformation\n", - "comp.unary_transform(d_prices, d_normalized, z_score_normalize, len(house_prices))\n", - "\n", - "normalized_result = d_normalized.get()\n", - "print(f\"Normalized prices: {normalized_result}\")\n", - "print(f\"Normalized mean: {np.mean(normalized_result):.6f}\")\n", - "print(f\"Normalized std: {np.std(normalized_result):.6f}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "bf385d3d", - "metadata": {}, - "source": [ - "#### Transform with Iterators for Memory Efficiency\n", - "\n", - "Combining transforms with iterators allows complex computations without storing intermediate arrays.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6d8d045", - "metadata": {}, - "outputs": [], - "source": [ - "# Calculate sum of squares from 1 to 1000 without storing intermediate arrays\n", - "def square_func(x):\n", - " return x * x\n", - "\n", - "def add_op(a, b):\n", - " return a + b\n", - "\n", - "# Method 1: Using iterators (memory efficient)\n", - "counting_it = comp.CountingIterator(np.int64(1)) # 1, 2, 3, ...\n", - "squares_it = comp.TransformIterator(counting_it, square_func) 
# 1², 2², 3², ...\n", - "\n", - "d_result = cp.empty(1, dtype=np.int64)\n", - "h_init = np.array([0], dtype=np.int64)\n", - "\n", - "# Sum the squares directly without storing them\n", - "comp.reduce_into(squares_it, d_result, add_op, 1000, h_init)\n", - "iterator_result = d_result.get()[0]\n", - "\n", - "# Mathematical verification: sum of squares = n(n+1)(2n+1)/6\n", - "n = 1000\n", - "formula_result = n * (n + 1) * (2 * n + 1) // 6\n", - "\n", - "print(f\"Sum of squares from 1 to {n}:\")\n", - "print(f\"Iterator result: {iterator_result:,}\")\n", - "print(f\"Formula result: {formula_result:,}\")\n", - "print(f\"Correct: {iterator_result == formula_result}\")\n", - "print(f\"Memory used: Only space for final result (~8 bytes)\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "17282ee7", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Custom (Struct) Types\n" - ] - }, - { - "cell_type": "markdown", - "id": "541308b1-65b3-4856-adb9-aa1f039f0396", - "metadata": {}, - "source": [ - "So far, we've seen how to use `parallel` with input arrays composed of numeric values (ints and floats). A powerful feature of `parallel` is that it can also work with \"struct\" values, i.e., values that are in turn composed of more than one value. " - ] - }, - { - "cell_type": "markdown", - "id": "df4ed2b7-a379-4ff4-941d-f0da11ac0a7a", - "metadata": {}, - "source": [ - "For example, consider a sequence of RGB values, like those used in graphics applications. Each RGB value represents a pixel's color and consists of three components: **red**, **green**, and **blue** intensity levels." - ] - }, - { - "cell_type": "markdown", - "id": "6fa5625f-a13f-468a-baa1-84b7e5abd427", - "metadata": {}, - "source": [ - "The code below shows how you can use `parallel` to find the pixel with the highest **green** intensity level." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9281737-5422-41a3-8052-7ade54a96de0", - "metadata": {}, - "outputs": [], - "source": [ - "# use `@gpu_struct` to define the data type of each value:\n", - "@comp.gpu_struct\n", - "class Pixel:\n", - " r: np.int32\n", - " g: np.int32\n", - " b: np.int32\n", - "\n", - "# Define a reduction operation that operates on two `Pixel` objects:\n", - "def max_g_value(x, y):\n", - " return x if x.g > y.g else y\n", - "\n", - "# Prepare the input and output arrays. These are just CuPy arrays:\n", - "dtype = np.dtype([(\"r\", np.int32), (\"g\", np.int32), (\"b\", np.int32)], align=True) # alternately, use `Pixel.dtype`\n", - "d_rgb = cp.random.randint(0, 256, (10, 3), dtype=np.int32).view(dtype)\n", - "d_out = cp.empty(1, dtype)\n", - "\n", - "# Define the initial value for the reduction. This must be a `Pixel` object:\n", - "h_init = Pixel(0, 0, 0)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(d_rgb, d_out, max_g_value, d_rgb.size, h_init)\n", - "\n", - "# Verify the result.\n", - "print(f\"Input RGB values: \\n {d_rgb.get()}\")\n", - "result = d_out.get()\n", - "print(f\"Pixel with greatest 'g' intensity: {result}\")" - ] - }, - { - "cell_type": "markdown", - "id": "2567bc4d", - "metadata": {}, - "source": [ - "## Working with Iterators\n", - "\n", - "Now you have a taste for how to use `parallel` with **custom ops** and **custom data types**. _Iterators_ are another powerful tool in your toolbox for solving more complex problems.\n", - "\n", - "Iterators represent streams of data that are computed \"on-the-fly\". Unlike arrays, iterators do not require any memory allocation, and thus can represent huge sequences without consuming valuable GPU memory. 
Iterators can be used as inputs (and sometimes outputs) to algorithms in place of arrays.\n", - "\n", - "Note that \"iterators\" in the context of the `parallel` library is distinct from the concept of [iterators](https://docs.python.org/3/glossary.html#term-iterator) in the Python language. " - ] - }, - { - "cell_type": "markdown", - "id": "31f350bc-3825-49c3-a858-647aa34177aa", - "metadata": {}, - "source": [ - "### `CountingIterators` and `ConstantIterator`" - ] - }, - { - "cell_type": "markdown", - "id": "514e4183-fc23-4355-8ad4-7b9989c9eaf4", - "metadata": {}, - "source": [ - "A `CountingIterator` represents the sequence `a, a + 1, a + 2, a + 3,.... `. In the following example, we use a `CountingIterator` as the input to `reduce_into` to compute the sum $1 + 2 + 3 + 4 + 5 = 15$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "604e5f6b", - "metadata": { - "vscode": { - "languageId": "plaintext" + ], + "metadata": { + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" } - }, - "outputs": [], - "source": [ - "# Prepare the inputs and outputs:\n", - "it_input = comp.CountingIterator(np.int32(1)) # represents the sequence 1, 2, 3, ....\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, 5, h_init) # compute the reduction for `5` input items\n", - "\n", - "print(f\"Sum: {d_output.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "dec7e377", - "metadata": {}, - "source": [ - "A `ConstantIterator` represents the sequence `a, a, a, ...`. 
In the following example, we use a `ConstantIterator` as one of the inputs to `binary_transform`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fcff9cb6-ac1d-43e6-b439-ba7c89029b8c", - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare the input and output arrays.\n", - "d_in1 = cp.asarray([2, 8, 9, 6, 3], dtype=np.int32)\n", - "it_in2 = comp.ConstantIterator(np.int32(1))\n", - "d_out = cp.empty_like(d_in1)\n", - "\n", - "# Perform the binary transform.\n", - "comp.binary_transform(d_in1, it_in2, d_out, comp.OpKind.PLUS, len(d_in1))\n", - "print(f\"Result of binary transform: {d_out.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "84e88c3f-8325-47f5-a759-3355b73083f3", - "metadata": {}, - "source": [ - "### `TransformIterator`" - ] - }, - { - "cell_type": "markdown", - "id": "f8231ca7-8daf-4208-9739-b77fd06719f9", - "metadata": {}, - "source": [ - "`TransformIterator` provides a way to compose operations by applying a function to each element as it's accessed. The following code is similar to the `CountingIterator` example above, but it wraps the iterator with a `TransformIterator` to compute the sum $1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21ce538d-260c-49a5-83f7-3988aa536803", - "metadata": {}, - "outputs": [], - "source": [ - "# Define the transform operation.\n", - "def square(a):\n", - " return a**2\n", - "\n", - "# prepare the inputs and output.\n", - "it_count = comp.CountingIterator(np.int32(1)) # represents the sequence 1, 2, 3, ....\n", - "it_input = comp.TransformIterator(it_count, square) # represents the sequence 1**2, 2**2, 3**2, ...\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, 5, h_init) # compute the reduction for `5` input items\n", - "\n", - "print(f\"Sum: {d_output.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "a3c05bf4-f477-456b-8238-aa5fc8bbce18", - "metadata": {}, - "source": [ - "You can also wrap an array with a `TransformIterator`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2301f1f-5a56-4075-9c46-c001bc9bab05", - "metadata": {}, - "outputs": [], - "source": [ - "d_arr = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", - "it_input = comp.TransformIterator(d_arr, square) # represents the sequence [2**2, 3**2, ... 4**2]\n", - "d_output = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(it_input, d_output, comp.OpKind.PLUS, len(d_arr), h_init)\n", - "\n", - "print(f\"Sum: {d_output.get()}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "10c7736a-248b-4512-a262-cf452c46de8e", - "metadata": {}, - "source": [ - "Finally, you can use `TransformOutputIterator` as the output of an algorithm, to apply a function to the result as it's being written.\n", - "\n", - "⚠️ Note that when using `TransformOutputIterator`, you must currently provide explicit type annotations for the transform function." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a1142521-452a-4a3a-9f4d-2a1f01251a3d", - "metadata": {}, - "outputs": [], - "source": [ - "d_arr = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.float32)\n", - "it_input = comp.TransformIterator(d_arr, square) # represents the sequence [2**2, 3**2, ... 4**2]\n", - "d_out = cp.empty(1, dtype=np.float32)\n", - "\n", - "# provide type annotations when using `TransformOutputIterator`\n", - "def sqrt(a: np.float32) -> np.float32:\n", - " return a**2\n", - "\n", - "it_output = comp.TransformOutputIterator(d_out, sqrt)\n", - "\n", - "h_init = np.array([0], dtype=np.float32)\n", - "\n", - "# Perform the reduction.\n", - "comp.reduce_into(it_input, it_output, comp.OpKind.PLUS, len(d_arr), h_init) # compute the reduction for `5` input items\n", - "\n", - "print(f\"Sum: {d_out.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "d98c08e0-629b-4428-96c9-59cc3c132b2d", - "metadata": {}, - "source": [ - "### `ZipIterator`" - ] - }, - { - "cell_type": "markdown", - "id": "8a3d7a82-a627-41dc-923d-242a23eae810", - "metadata": {}, - "source": [ - "A `ZipIterator` combines multiple iterators (or arrays) into a single iterator. 
To access the individual components of any element of a `ZipIterator`, use numeric indexing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "090a5d75-1518-4102-8955-6a8e99e6d617", - "metadata": {}, - "outputs": [], - "source": [ - "d_in1 = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", - "d_in2 = cp.asarray([7, 7, 9, 3, 1, 2, 6, 0], dtype=np.int32)\n", - "it_in3 = comp.CountingIterator(np.int32(0))\n", - "it_input = comp.ZipIterator(d_in1, d_in2, it_in3)\n", - "\n", - "def op(x):\n", - " return x[0] + x[1] + x[2]\n", - "\n", - "d_output = cp.empty_like(d_in1)\n", - "comp.unary_transform(it_input, d_output, op, len(d_in1))\n", - "\n", - "print(f\"Result: {d_output.get()}\")" - ] - }, - { - "cell_type": "markdown", - "id": "e0cf37c9-5d7e-4b2d-a619-e4bd8dbeab46", - "metadata": {}, - "source": [ - "In the example below, we compute the `min` and `max` of a sequence within a single call to `reduce_into`, using `ZipIterator`. Note the need to define `MinMax` to specify the output type of `minmax_op`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b44215d1-8bad-454b-8a0e-a87895dc1da5", - "metadata": {}, - "outputs": [], - "source": [ - "@comp.gpu_struct\n", - "class MinMax:\n", - " min_value: np.int32\n", - " max_value: np.int32\n", - "\n", - "def minmax_op(x, y):\n", - " return MinMax(min(x[0], y[0]), max(x[1], y[1]))\n", - "\n", - "d_in = cp.asarray([2, 3, 5, 1, 6, 7, 8, 4], dtype=np.int32)\n", - "\n", - "it_input = comp.ZipIterator(d_in, d_in)\n", - "d_output = cp.empty(2, dtype=np.int32).view(MinMax.dtype)\n", - "\n", - "SMALLEST_INT = np.iinfo(np.int32).min\n", - "LARGEST_INT = np.iinfo(np.int32).max\n", - "h_init = MinMax(LARGEST_INT, SMALLEST_INT)\n", - "\n", - "comp.reduce_into(it_input, d_output, minmax_op, len(d_in), h_init)\n", - "\n", - "print(f\"Min value: {d_output.get()[0]['min_value']}\")\n", - "print(f\"Max value: {d_output.get()[0]['max_value']}\")" - ] - }, - { - "cell_type": "markdown", - "id": "df2acfef-656b-4c07-9b29-6e8de94add5e", - "metadata": {}, - "source": [ - "#### Iterator Composition\n", - "\n", - "You can chain multiple iterator types together to create sophisticated data processing pipelines without intermediate storage." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82a88473", - "metadata": {}, - "outputs": [], - "source": [ - "# Example: Sum of squares of even numbers from 1 to 20\n", - "def square_if_even(x):\n", - " \"\"\"Square the number if it's even, otherwise return 0\"\"\"\n", - " return (x * x) if (x % 2 == 0) else 0\n", - "\n", - "def add_op(a, b):\n", - " return a + b\n", - "\n", - "# Chain operations: generate numbers → filter/square evens → sum\n", - "counting_it = comp.CountingIterator(np.int32(1)) # 1, 2, 3, ..., 20\n", - "transform_it = comp.TransformIterator(counting_it, square_if_even) # 0, 4, 0, 16, 0, 36, ...\n", - "\n", - "d_result = cp.empty(1, dtype=np.int32)\n", - "h_init = np.array([0], dtype=np.int32)\n", - "\n", - "comp.reduce_into(transform_it, d_result, add_op, 20, h_init)\n", - "\n", - "# Verify: even numbers 2,4,6,8,10,12,14,16,18,20 -> squares 4,16,36,64,100,144,196,256,324,400\n", - "evens = [x for x in range(1, 21) if x % 2 == 0]\n", - "expected = sum(x * x for x in evens)\n", - "\n", - "print(f\"Numbers 1-20: even squares sum\")\n", - "print(f\"Even numbers: {evens}\")\n", - "print(f\"Their squares: {[x*x for x in evens]}\")\n", - "print(f\"Iterator result: {d_result.get()[0]}\")\n", - "print(f\"Expected result: {expected}\")\n", - "print(f\"Correct: {d_result.get()[0] == expected}\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "8c859995", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "### Exercise: implementing running average\n" - ] - }, - { - "cell_type": "markdown", - "id": "13f16d8f-76e9-40d6-b9ed-159c68379d02", - "metadata": {}, - "source": [ - "In this example, you'll implement the running average of a sequence, using a single call to the [inclusive_scan](https://nvidia.github.io/cccl/python/parallel_api.html#cuda.compute.algorithms.inclusive_scan) API. To do this, you'll have to piece together many of the concepts we've learned about so far." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ee1a1e5-ef83-470d-8991-a2808b8b6ab9", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "@comp.gpu_struct\n", - "class SumAndCount:\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - "\n", - "def reduce_op(x, y) -> SumAndCount:\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - "\n", - "def compute_running_average(x: SumAndCount) -> np.float32:\n", - " # begin TODO\n", - "\n", - " # end TODO\n", - "\n", - "d_input = cp.array([2, 3, 5, 1, 7, 6, 8, 4], dtype=np.float32)\n", - "d_output = cp.empty(len(d_input), dtype=np.float32)\n", - "h_init = SumAndCount(0, 0)\n", - "\n", - "it_input = comp.ZipIterator(d_input, comp.ConstantIterator(np.int32(1)))\n", - "it_output = comp.TransformOutputIterator(d_output, compute_running_average)\n", - "\n", - "# Perform the reduction.\n", - "comp.inclusive_scan(it_input, it_output, reduce_op, h_init, len(d_input))\n", - "\n", - "print(d_input)\n", - "\n", - "h_input = d_input.get()\n", - "expected = h_input.cumsum() / np.arange(1, len(h_input) + 1)\n", - "\n", - "print(f\"Running average result: {d_output}\")\n", - "np.testing.assert_allclose(d_output.get(), expected)" - ] - }, - { - "cell_type": "markdown", - "id": "55a22e1a", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "## Resources\n", - "\n", - "* `cuda-cccl` Documentation: https://nvidia.github.io/cccl/python/\n", - "* `parallel` API Reference: https://nvidia.github.io/cccl/python/parallel_api.html#cuda-cccl-parallel-api-reference" - ] - } - ], - "metadata": { - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": 
"text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 }