diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3430c468..f3e602d9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,8 +1,8 @@ name: CI -on: +on: push: - branches: + branches: - '**' # Push events on all branches paths-ignore: - '.github/workflows/init.yml' @@ -12,17 +12,23 @@ on: jobs: build_docs: - runs-on: ubuntu-22.04 + runs-on: self-ubuntu-22.04 name: Build, Install, Package documentation if: "!contains(github.event.head_commit.message, 'docs skip')" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install credentials run: echo https://$GITHUB_OAUTH:@github.com > $HOME/.git-credentials env: GITHUB_OAUTH: ${{ secrets.CR_PAT_WORKFLOW }} - name: Build run: | + if [ -f /data/cemosis/spack/share/spack/setup-env.sh ]; then + source /data/cemosis/spack/share/spack/setup-env.sh + spack env activate feelpp + else + echo "Spack environment setup script not found." + fi npm install npm run antora working-directory: docs @@ -40,29 +46,30 @@ jobs: if: "!contains(github.event.head_commit.message, 'code skip')" steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: lfs: true recursive: true - - - name: Build + - + name: Build run: | + spack env activate feelpp-openmpi4-kokkos cmake --preset default cmake --build --preset default - - - name: Check + - + name: Check run: | echo "not yet enabled, need to add tests" # ctest --preset default env: - OMPI_ALLOW_RUN_AS_ROOT: 1 + OMPI_ALLOW_RUN_AS_ROOT: 1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM: 1 - - + - name: Package run: | cmake --build --preset default -t package - name: Upload Artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: # Artifact name name: parallel-programming-artifacts @@ -89,8 +96,8 @@ jobs: with: lfs: true submodules: 'recursive' - - name: Download - uses: actions/download-artifact@v3 + - name: Download + uses: actions/download-artifact@v4 with: # Artifact name name: 
parallel-programming-artifacts @@ -113,20 +120,20 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 - - + - name: Login to GitHub Container Registry uses: docker/login-action@v2 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.CR_PAT }} - - + - name: Build container image uses: docker/build-push-action@v4 - with: + with: context: artifacts push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} file: ./Dockerfile - + diff --git a/.gitignore b/.gitignore index 119186c2..7bf4d277 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,10 @@ jupyter/ auto-save-list tramp .\#* +_minted* +*.aux +*.log +*.synctex.gz +*.fls +/cpp/ +/cpp.old/ \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 2d140db4..d3178229 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,108 @@ { "C_Cpp.default.configurationProvider": "vector-of-bool.cmake-tools", - "cmake.buildDirectory": "${workspaceFolder}/build-${variant:buildType}" + "cmake.buildDirectory": "${workspaceFolder}/build-${variant:buildType}", + "files.associations": { + "*.json": "jsonc", + "*.dat": "csv (whitespace)", + "*.pgf": "tex", + "*.pdf_tex": "tex", + ".py.in": "Python", + "*.tikz": "tex", + "*.slurm": "shellscript", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "csetjmp": "cpp", + "csignal": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "any": "cpp", + "array": "cpp", + "atomic": "cpp", + "hash_map": "cpp", + "strstream": "cpp", + "barrier": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cfenv": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "cinttypes": "cpp", + "codecvt": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + 
"coroutine": "cpp", + "cstdint": "cpp", + "cuchar": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "list": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "regex": "cpp", + "source_location": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "hash_set": "cpp", + "slist": "cpp", + "fstream": "cpp", + "future": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "latch": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "ranges": "cpp", + "scoped_allocator": "cpp", + "semaphore": "cpp", + "shared_mutex": "cpp", + "span": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "syncstream": "cpp", + "thread": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "valarray": "cpp", + "variant": "cpp", + "expected": "cpp", + "spanstream": "cpp", + "stacktrace": "cpp", + "format": "cpp", + "__nullptr": "cpp" + } } \ No newline at end of file diff --git a/course-parallel-programming.code-workspace b/course-parallel-programming.code-workspace new file mode 100644 index 00000000..362d7c25 --- /dev/null +++ b/course-parallel-programming.code-workspace @@ -0,0 +1,7 @@ +{ + "folders": [ + { + "path": "." + } + ] +} \ No newline at end of file diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 45032032..dd858bcb 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -8,8 +8,8 @@ if ( NPM_EXECUTABLE ) add_custom_command(OUTPUT "${ANTORA_OUTPUT_DIR}" ALL COMMENT "Building documentation... 
" - COMMAND "${npm}" install - COMMAND "${npm}" run antora + COMMAND "${npm}" install + COMMAND "${npm}" run antora WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" ) if ( NOT TARGET doc ) @@ -21,4 +21,6 @@ endif() install(DIRECTORY "${ANTORA_OUTPUT_DIR}/" DESTINATION ${APPLICATION_DOC_DIR} OPTIONAL) else() message(STATUS "npm is not installed, can use antora ") -endif() \ No newline at end of file +endif() + +add_subdirectory( modules/kokkos/examples/src ) \ No newline at end of file diff --git a/docs/antora.yml b/docs/antora.yml index d828f9a4..1d1c1be6 100644 --- a/docs/antora.yml +++ b/docs/antora.yml @@ -27,7 +27,7 @@ ext: base: modules/ROOT/attachments/ - run: command: | - ./generate-jupyter.sh docs/modules/exos + ./generate-jupyter.sh docs/modules scan: dir: jupyter/exos/ files: '**/*.ipynb' diff --git a/docs/modules/ROOT/assets/attachments/KokkosSession1.pdf b/docs/modules/ROOT/assets/attachments/KokkosSession1.pdf new file mode 100644 index 00000000..953f72b3 Binary files /dev/null and b/docs/modules/ROOT/assets/attachments/KokkosSession1.pdf differ diff --git a/docs/modules/ROOT/assets/attachments/KokkosSession1.pptx b/docs/modules/ROOT/assets/attachments/KokkosSession1.pptx new file mode 100644 index 00000000..60c7ffc1 Binary files /dev/null and b/docs/modules/ROOT/assets/attachments/KokkosSession1.pptx differ diff --git a/docs/modules/ROOT/assets/attachments/KokkosSession2.pdf b/docs/modules/ROOT/assets/attachments/KokkosSession2.pdf new file mode 100644 index 00000000..67c6d170 Binary files /dev/null and b/docs/modules/ROOT/assets/attachments/KokkosSession2.pdf differ diff --git a/docs/modules/ROOT/assets/attachments/KokkosSession2.pptx b/docs/modules/ROOT/assets/attachments/KokkosSession2.pptx new file mode 100644 index 00000000..42f58d09 Binary files /dev/null and b/docs/modules/ROOT/assets/attachments/KokkosSession2.pptx differ diff --git a/docs/modules/ROOT/assets/attachments/KokkosSession3.pdf 
b/docs/modules/ROOT/assets/attachments/KokkosSession3.pdf new file mode 100644 index 00000000..528c2171 Binary files /dev/null and b/docs/modules/ROOT/assets/attachments/KokkosSession3.pdf differ diff --git a/docs/modules/ROOT/assets/attachments/KokkosSession3.pptx b/docs/modules/ROOT/assets/attachments/KokkosSession3.pptx new file mode 100644 index 00000000..cda4833b Binary files /dev/null and b/docs/modules/ROOT/assets/attachments/KokkosSession3.pptx differ diff --git a/docs/modules/ROOT/assets/images/QPU001.jpg b/docs/modules/ROOT/assets/images/QPU001.jpg new file mode 100644 index 00000000..1d4dddf6 Binary files /dev/null and b/docs/modules/ROOT/assets/images/QPU001.jpg differ diff --git a/docs/modules/ROOT/assets/images/QPU002.jpg b/docs/modules/ROOT/assets/images/QPU002.jpg new file mode 100644 index 00000000..0569d0b1 Binary files /dev/null and b/docs/modules/ROOT/assets/images/QPU002.jpg differ diff --git a/docs/modules/ROOT/assets/images/QPU003.jpg b/docs/modules/ROOT/assets/images/QPU003.jpg new file mode 100644 index 00000000..923395f6 Binary files /dev/null and b/docs/modules/ROOT/assets/images/QPU003.jpg differ diff --git a/docs/modules/ROOT/assets/images/QPU004.jpg b/docs/modules/ROOT/assets/images/QPU004.jpg new file mode 100644 index 00000000..ca307145 Binary files /dev/null and b/docs/modules/ROOT/assets/images/QPU004.jpg differ diff --git a/docs/modules/ROOT/assets/images/kokkos1.jpg b/docs/modules/ROOT/assets/images/kokkos1.jpg new file mode 100644 index 00000000..086206b2 Binary files /dev/null and b/docs/modules/ROOT/assets/images/kokkos1.jpg differ diff --git a/docs/modules/ROOT/assets/images/kokkos1.png b/docs/modules/ROOT/assets/images/kokkos1.png new file mode 100644 index 00000000..5db66adf Binary files /dev/null and b/docs/modules/ROOT/assets/images/kokkos1.png differ diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 7cf700f0..6217de22 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ 
-10,6 +10,7 @@ ** xref:architectures/PPChapter1_DPU.adoc[DPU Architecture] ** xref:architectures/PPChapter1_SIMD.adoc[SIMD Architecture] ** xref:architectures/PPChapter1_AMD_CUDA.adoc[AMD CUDA Architecture] +** xref:architectures/PPChapter1_QPU.adoc[QPU Architecture] * MPI/OpenMP/Hybrid ** xref:PPChapter2_MPI.adoc[MPI (Message Passing Interface)] diff --git a/docs/modules/ROOT/pages/CUDA_Coding.adoc b/docs/modules/ROOT/pages/CUDA_Coding.adoc index a8120420..ea534be1 100644 --- a/docs/modules/ROOT/pages/CUDA_Coding.adoc +++ b/docs/modules/ROOT/pages/CUDA_Coding.adoc @@ -2,7 +2,7 @@ image::CUDA_Logo.png[Img401,50,50,role="left"] -* Compiling a program for CUDA +* Compiling a program for CUDA *** For example, to compile MyProg.cu you would use a command like *** nvcc -o MyProg MyProg.cu diff --git a/docs/modules/ROOT/pages/HEAT_Coding.adoc b/docs/modules/ROOT/pages/HEAT_Coding.adoc index 0bebc54a..a3eb339b 100644 --- a/docs/modules/ROOT/pages/HEAT_Coding.adoc +++ b/docs/modules/ROOT/pages/HEAT_Coding.adoc @@ -2,11 +2,9 @@ -.Heat equation in 2D -[.examp] -**** +== Heat equation in 2D -* Theory +=== Theory Heat (or diffusion) equation is a partial differential equation that describes the variation of temperature in a given region over time @@ -42,17 +40,17 @@ u^{m+1}(i,j) = u^m(i,j) + \Delta t \alpha \nabla^2 u^m(i,j) Note: The algorithm is stable only when +[stem] +++++ \begin{align*} \Delta t < \frac{1}{2 \alpha} \frac{(\Delta x \Delta y)^2}{(\Delta x)^2 (\Delta y)^2} \end{align*} +++++ -**** +== Code -.Code -[.examp] -**** The solver carries out the time development of the 2D heat equation over the number of time steps provided by the user. The default geometry is a flat rectangle (with grid size provided by the user), but other shapes may be used via input files. The program will produce an image (PNG) of the temperature field after every 100 iterations. 
@@ -74,18 +72,10 @@ include::ROOT:example$src/Heat_Equation_ParallelPrograming_Comparison/MPI_OpenMP ---- include::ROOT:example$src/Heat_Equation_ParallelPrograming_Comparison/Cuda/core_cuda.cu[indent=0] ---- -**** -.*Performance* **** +.*Performance* ADD SOME RESULTS **** - - - - - - -... - +... \ No newline at end of file diff --git a/docs/modules/ROOT/pages/HIP_Coding.adoc b/docs/modules/ROOT/pages/HIP_Coding.adoc index 69d8cf54..4f41c4c5 100644 --- a/docs/modules/ROOT/pages/HIP_Coding.adoc +++ b/docs/modules/ROOT/pages/HIP_Coding.adoc @@ -67,7 +67,6 @@ include::ROOT:example$src/HIP/Matrix_Summation/03_matrix_summation_GPU_2D2D_2D1D -**** .*Profiling Performance* **** ADD SOME RESULTS diff --git a/docs/modules/ROOT/pages/MPI_Coding.adoc b/docs/modules/ROOT/pages/MPI_Coding.adoc index 49492630..efa29e10 100644 --- a/docs/modules/ROOT/pages/MPI_Coding.adoc +++ b/docs/modules/ROOT/pages/MPI_Coding.adoc @@ -7,87 +7,305 @@ +== List of programms + +//:cpp: C++ -.Test compile {cpp} -[%dynamic,cpp,filename="Hello.hpp",compile=true,run=true] +ifeval::[{showproof} >= 2] +.Allgather +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Allgather.c",compile="mpi",np=3] ---- -#include +include::ROOT:example$src/MPI/MPI_Allgather.c[] +---- +==== +endif::[] + -int main() -{ - std::cout << "Hello, world!" 
<< std::endl; - return 0; -} +ifeval::[{showproof} >= 2] +.Allreduce +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Allreduce.c",compile="mpi",np=4] ---- +include::ROOT:example$src/MPI/MPI_Allreduce.c[] +---- +==== +endif::[] +ifeval::[{showproof} >= 2] +.MPI_Alltoall +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Alltoall.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_Alltoall.c[] +---- +==== +endif::[] +ifeval::[{showproof} >= 2] +.MPI_Barrier +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Barrier.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Barrier.c[] +---- +==== +endif::[] -== List of programms -//* xref:MPI_Allgather.c[MPI_Allgather] +ifeval::[{showproof} >= 2] +.MPI_Bcast +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Bcast.c",compile="mpi",np=2] +---- +include::ROOT:example$src/MPI/MPI_Bcast.c[] +---- +==== +endif::[] -//:cpp: C++ -.Allgather -[.examp] -**** -.Code Allgather -[source,c] +ifeval::[{showproof} >= 2] +.MPI_Bsend +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Bsend.c",compile="mpi",np=2] ---- -include::ROOT:example$src/MPI/MPI_Allgather.c[indent=0] +include::ROOT:example$src/MPI/MPI_Bsend.c[] ---- -**** +==== +endif::[] + + + +ifeval::[{showproof} >= 2] +.MPI_Comm_spawn +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Comm_spawn.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Comm_spawn.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_Comm_split +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Comm_split.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Comm_split.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_Exscan +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Exscan.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Exscan.c[] +---- +==== +endif::[] + + + +ifeval::[{showproof} >= 2] +.MPI_File +[%collapsible.proof] +==== 
+[%dynamic,c,filename="MPI_File.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_File.c[] +---- +==== +endif::[] + + + +ifeval::[{showproof} >= 2] +.MPI_Gather +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Gather.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Gather.c[] +---- +==== +endif::[] + + + +ifeval::[{showproof} >= 2] +.MPI_Graph_get +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Graph_get.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_Graph_get.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_Graphdims_get +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Graphdims_get.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_Graphdims_get.c[] +---- +==== +endif::[] + + + +ifeval::[{showproof} >= 2] +.MPI_allgather +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Iallgather.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Iallgather.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_allreduce +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Iallreduce.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Iallreduce.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_alltoall +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Ialltoall.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_Ialltoall.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_barrier +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Ibarrier.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Ibarrier.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_bcast +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Ibcast.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Ibcast.c[] +---- +==== +endif::[] + + + +ifeval::[{showproof} >= 2] +.MPI_reduce +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Ireduce.c",compile="mpi",np=3] +---- 
+include::ROOT:example$src/MPI/MPI_Ireduce.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_reduce_scatter +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Ireduce_scatter.c",compile="mpi",np=4] +---- +include::ROOT:example$src/MPI/MPI_Ireduce_scatter.c[] +---- +==== +endif::[] + + + +ifeval::[{showproof} >= 2] +.MPI_reduce_scatter +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Ireduce_scatter.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_Ireduce_scatter.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_reduce_scatter_block +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Ireduce_scatter_block.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_Ireduce_scatter_block.c[] +---- +==== +endif::[] + + +ifeval::[{showproof} >= 2] +.MPI_scatterv +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Iscatterv.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_Iscatterv.c[] +---- +==== +endif::[] + + + +ifeval::[{showproof} >= 2] +.MPI_ssend +[%collapsible.proof] +==== +[%dynamic,c,filename="MPI_Issend.c",compile="mpi",np=3] +---- +include::ROOT:example$src/MPI/MPI_Issend.c[] +---- +==== +endif::[] + + -.Allreduce -[.examp] -**** -.Code Allreduce -[source,c] ----- -include::ROOT:example$src/MPI/MPI_Allreduce.c[indent=0] ----- -**** - - - - -.Code Demo - ----- -MPI_Allgather -MPI_Allreduce -MPI_Alltoall -MPI_Barrier -MPI_Bcast -MPI_BSend -MPI_Buffer_attach -MPI_Buffer_detach -MPI_Comm_spawn -MPI_Comm_split -MPI_Exscan -MPI_File_close -MPI_File_open -MPI_File -MPI_Gather -MPI_Graph_get -MPI_Graph_neigbors_count -MPI_Graph_neigbors -MPI_Graphdims_get -MPI_lallgather -MPI_lallreduce -MPI_lalltoall -MPI_lbarrier -MPI-lbcast -MPI_lbsend -MPI_lreduce -MPI_lreduce_scatter -MPI_lreduce_scatter_block -MPI_lresend -MPI_lscatterv -MPI_lssend - - ----- -... 
diff --git a/docs/modules/ROOT/pages/OpenMP_Coding.adoc b/docs/modules/ROOT/pages/OpenMP_Coding.adoc index fd12c346..57cf4993 100644 --- a/docs/modules/ROOT/pages/OpenMP_Coding.adoc +++ b/docs/modules/ROOT/pages/OpenMP_Coding.adoc @@ -6,87 +6,126 @@ *** g++ -c MyProg. cpp -o MyProg.o -fopenmp *** g++ MyProg.o -o MyProg -fopenmp -lpthread + .Start [.examp] **** This start example illustrates how to do a task reduction and consists in calculating the sum of all elements of an array. - +ifeval::[{showproof} >= 2] .Code Start -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Start.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Start.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Start.cpp[] ---- +==== +//endif::[] **** + + .Firstprivate [.examp] **** Specifies that each thread should have its own instance of a variable, and that the variable should be initialized with the value of the variable, because it exists before the parallel construct. .Code Firstprivate -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Firstprivate.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Firstprivate.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Firstprivate.cpp[] ---- +==== +//endif::[] **** + + .Private [.examp] **** The private clause declares the variables in the list to be private to each thread in a team. .Code Private -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Private.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Private.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Private.cpp[] ---- +==== +//endif::[] **** + + .Lastprivate [.examp] **** Specifies that the enclosing context's version of the variable is set equal to the private version of whichever thread executes the final iteration (for-loop construct) or last section (#pragma sections). 
.Code Lastprivate -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Lastprivate.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Lastprivate.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Lastprivate.cpp[] ---- +==== +//endif::[] **** + .Linear [.examp] **** The linear clause provides a superset of the functionality provided by the private clause. .Code Linear -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Linear.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Linear.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Linear.cpp[] ---- +==== +//endif::[] **** + + .Schedule [.examp] **** Scheduling is a method in OpenMP to distribute iterations to different threads in for loop. .Code Schedule -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Schedule.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Schedule.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Schedule.cpp[] ---- +==== +//endif::[] **** + .None [.examp] **** The none clause requires that each variable that is referenced in the construct, and that does not have a predetermined data-sharing attribute, must have its data-sharing attribute explicitly determined by being listed in a data-sharing attribute clause. .Code None -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_None.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_None.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_None.cpp[] ---- +==== +//endif::[] **** @@ -99,26 +138,46 @@ The task pragma can be used to explicitly define a task. Use the task pragma whe This application consists of a thread, in an OpenMP parallel region, that spawns tasks. 
.Code Task -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Task.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Task.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Task.cpp[] ---- +==== +//endif::[] +**** -This example consists in calculating the sum of all elements of an array. +.Code Reduction +[.examp] +**** +This example consists in calculating the sum of all elements of an array. .Code Task Reduction -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Task_reduction.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Task_reduction.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Task_reduction.cpp[] ---- +==== +//endif::[] +**** -Use the taskwait pragma to specify a wait for child tasks to be completed that are generated by the current task. This application consists of a thread, in an OpenMP parallel region that spawns tasks. It first spawns two tasks, then wait for these to complete before spawning a third task. The execution flow can be visualised below: +.Code Reduction +[.examp] +**** +Use the taskwait pragma to specify a wait for child tasks to be completed that are generated by the current task. This application consists of a thread, in an OpenMP parallel region that spawns tasks. It first spawns two tasks, then wait for these to complete before spawning a third task. The execution flow can be visualised below: .Code Task Wait -[source,cpp] +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Task_wait.cpp",compile="openmp"] ---- -include::ROOT:example$src/OpenMP/OpenMP_Task_wait.cpp[indent=0] +include::ROOT:example$src/OpenMP/OpenMP_Task_wait.cpp[] ---- +==== +//endif::[] **** diff --git a/docs/modules/ROOT/pages/PPChapter2.adoc b/docs/modules/ROOT/pages/PPChapter2.adoc index eb5d15c1..4baceb96 100644 --- a/docs/modules/ROOT/pages/PPChapter2.adoc +++ b/docs/modules/ROOT/pages/PPChapter2.adoc @@ -82,10 +82,10 @@ types). 
[.text-justify] In each case, there are several transfer modes , using different protocols. - + int MPI_Send( *const void* *message, *int* length, MPI_Datatype type_message, *int* rank_dest, *int* label, MPI_Comm comm) - + int MPI_Recv ( *void* *message, *int* length, MPI_Datatype type_message, *int* rank_source, *int* label, MPI_Comm comm, MPI_Status *status) @@ -103,7 +103,7 @@ Note this operation is blocking. label_message_received, MPI_Comm comm, MPI_Status *status) *Simultaneous send and receive operation* - + int MPI_Sendrecv_replace ( void * message, int length, MPI_Datatype type_message, int rank_dest, int label_message_sent, int* rank_source, int label_message_recu, MPI_Comm comm, MPI_Status *status) @@ -207,13 +207,13 @@ MPI_Reduce() followed by an MPI_Bcast()): MPI_Allreduce(). *Point-to-point sending modes* _Blocking and Non-blocking mode_ - + Standard sending MPI_Send() MPI_Isend() - + Synchronous send MPI_Ssend() MPI_Issend() - + _Buffered_ send MPI_Bsend() MPI_Ibsend() - + Receive MPI_Recv() MPI_Irecv() @@ -259,8 +259,8 @@ and MPI_Buffer_detach()). They must be allocated taking into account the memory overhead of the messages (by adding the MPI_BSEND_OVERHEAD constant for each message instance). - int MPI_Buffer_attach ( void *buf, int size_buf) - int MPI_Buffer_detach ( void *buf, int size_buf) + int MPI_Buffer_attach ( void *buf, int size_buf) + int MPI_Buffer_detach ( void *buf, int size_buf) int MPI_Bsend( const void *values, int size, MPI_Datatype type_message, int dest, int label, MPI_Comm comm) @@ -310,13 +310,13 @@ example) before using it again. 
int MPI_Isend( const void *values, int size, MPI_Datatype message_type, int dest, int label, MPI_Comm comm, MPI_Request *req) - + int MPI_Issend ( const void* values, int size, MPI_Datatype message_type, int dest, int label, MPI_Comm comm, MPI_Request *req) - + int MPI_Ibsend( const void* values, int size, MPI_Datatype message_type, int dest, int label, MPI_Comm comm, MPI_Request *req) - + int MPI_Irecv( void *values, int size, MPI_Datatype type_message, int* source, int label, MPI_Comm comm, MPI_Request *req) @@ -348,20 +348,20 @@ collectives in MPI 3.0) MPI_Wait() waits for the end of a communication. MPI_Test() is the non-blocking version. - int MPI_Wait ( MPI_Request *req, MPI_Status *status) + int MPI_Wait ( MPI_Request *req, MPI_Status *status) int MPI_Test( MPI_Request *req, int *flag, MPI_Status *status) MPI_Waitall() waits for all communications to end. MPI_Testall() is the non-blocking version. - int MPI_Waitall ( int size, MPI_Request reqs[], MPI_Status statuses[]) + int MPI_Waitall ( int size, MPI_Request reqs[], MPI_Status statuses[]) int* MPI_Testall ( int size, MPI_Request reqs[], int *flag, MPI_Status statuses[]) MPI_Waitany waits for the end of one communication among several. int MPI_Waitany ( int size, MPI_Request reqs[], int *index,MPI_Status *status) -MPI_Testany is the non-blocking version. +MPI_Testany is the non-blocking version. 
int* MPI_Testany( int size, MPI_Request reqs[], int *index, int *flag, MPI_Status *status) @@ -454,7 +454,7 @@ a| • MPI Comm size: Number of processes -• MPI Finalize: Deactivation of the MPI environment +• MPI Finalize: Deactivation of the MPI environment • MPI Abort:Stopping of an MPI program @@ -486,7 +486,7 @@ a| • MPI Alltoall: Collection and distribution -• MPI Reduce and MPI Allreduce: Reduction +• MPI Reduce and MPI Allreduce: Reduction • MPI Barrier: Global synchronization @@ -728,36 +728,36 @@ if this mode has been activated by calling the OMP_SET_NESTED subroutine or by setting the OMP_NESTED environment variable. *Examples* - + #include - - int main() - { + + int main() + { int row; - - #pragma omp parallel private(rank) num_threads(3) - { - rank=omp_get_thread_num(); + + #pragma omp parallel private(rank) num_threads(3) + { + rank=omp_get_thread_num(); printf("My rank in region 1: %d \n",rank); - - #pragma omp parallel private(rank) num_threads(2) - { - rank=omp_get_thread_num(); - printf(" My rank in region 2: %d \n",rank); + + #pragma omp parallel private(rank) num_threads(2) + { + rank=omp_get_thread_num(); + printf(" My rank in region 2: %d \n",rank); } - + } - return 0; + return 0; } - - My rank in region 1: 0 - My rank in region 2: 1 - My rank in region 2: 0 - My rank in region 1: 2 - My rank in region 2: 1 - My rank in region 2: 0 - My rank in region 1: 1 - My rank in region 2: 0 + + My rank in region 1: 0 + My rank in region 2: 1 + My rank in region 2: 0 + My rank in region 1: 2 + My rank in region 2: 1 + My rank in region 2: 0 + My rank in region 1: 1 + My rank in region 2: 0 My rank in region 2: 1 *Work sharing* @@ -983,18 +983,18 @@ block or an OpenMP Construct. A “structured block” is a single statement or a compound statement with a single entry at the top and a single exit at the bottom. - + The *parallel* construction forms To team of threads and starts parallel execution. - + *#pragma comp parallel* _[clause[ [_ *,* _]clause] ...] 
new-line structured-block_ - + _clause_ : *if(* _scalar- expression_ *)* - + *num_threads(* _integer-expression_ *) default(shared* *none) private(* _list_ *) firstprivate(* _list_ *)* - + *shared(* _list_ *) copyin(* _list_ *) reduce(* _operator_ *:* _list_ *)s* @@ -1005,9 +1005,9 @@ distributed among and executed by the encountering team of threads. *#pragma comp for* _[clause[[_ *,* _] clause] ... ] new-line for-loops_ - + _clause_ : *private(* _list_ *)* - + *firstprivate(* _list_ *) lastprivate(* _list_ *) reduce(* _operator_ *:* _list_ *) schedule(* _kind[, chunk_size]_ *) collapse(* _n_ *)* *ordered nowait* @@ -1021,16 +1021,16 @@ distributed among and executed by the meeting team of threads. *#pragma comp sections* _[clause[[_ *,* _] clause] ...] new line_ - + *{* - + _[_ *#pragma comp section* _new-line] structured-block_ - + _[_ *#pragma comp section* _new-line structured-block ]_ - + _clause_ : *private(* _list_ *)* - - *firstprivate(* _list_ *) + + *firstprivate(* _list_ *) lastprivate(* _list_ *) reduce(* _operator_ *:* _list_ *) nowait* @@ -1039,12 +1039,12 @@ a| executed by only one of the threads in the team (not necessarily the master thread), in the context of its implicit task. - + *#pragma comp single* _[clause[[_ *,* _] clause] ...] new-line structured-block_ - + _clause_ : *private(* _list_ *)* - + *firstprivate(* _list_ *) copyprivate(* _list_ *) nowait* a| @@ -1056,31 +1056,31 @@ allowed for the *parallel* and worksharing constructs. *#pragma comp parallel for* _[clause[[_ *,* _] clause] ...] new-line for-loop_ - + *#pragma comp parallel sections* _[clause[ [_ *,* _]clause] ...] new-line_ - + *{* _[_ *#pragma comp section* _new-line] structured-block_ - + _[_ *#pragma comp section* _new-line structured-block ]_ - + _..._ *#pragma comp task* _[clause[ [_ *,* _]clause] ...] 
new-line structured-block_ _clause_ : *if(* _scalar- expression_ *)* - + === untied - - + + *default(shared none) private(* _list_ *) firstprivate(* _list_ *) shared(* _list_ *)* - + *Master* construction specifies To structured block that is executed by the Master thread of the team. There is no implied barriers either on entry to, or exit from, the master construct. - - + + *#pragma comp Master* _new-line structured-block_ a| @@ -1092,12 +1092,12 @@ block to a single thread at a time. The *barriers* construction specifies year explicit barriers did the point did which the construct appears. - + *#pragma comp barriers* _new- line_ - + The *taskwait* construction specifies To wait we the completion of child tasks generated since the beginning of the current task. - + *#pragma comp you asked* _new line_ a| @@ -1107,13 +1107,13 @@ simultaneous writing threads. *#pragma comp atomic* _new-line expression-stmt_ - + _stmt-expression_ : one of the following forms: - + _x binop_ *=* _expr x_ *++* - + *++* _x x_ *- -* - + *--x* ___ @@ -1134,13 +1134,13 @@ allowing code outside the region to run in parallel. *#pragma comp ordered* _new-line structured-block_ - - + + a| *threadprivate* guideline specifies that variables are replicated, with each thread having its own copy. - - + + *#pragma comp threadprivate* _( list) new- line_ |=== @@ -1167,17 +1167,17 @@ necessary to specify it explicitly in a private clause for (i=1; i - + int main() { omp_set_dynamic(0); @@ -1225,9 +1225,9 @@ implementations that support it. If there are multiple independent loops within a parallel region, you can use the nowait clause to avoid the implied barrier at the end of the loop construct - + #include - + void nowait_example(int n, int m, float *a, float *b, float *y, float *z) { int i; @@ -1253,9 +1253,9 @@ collapse clause may appear, the effect is as if a value of one was specified for n if the collapse clause is not specified. 
void bar(float *a, int i, int j, int k); - + int kl, ku, ks, jl, ju, js, il, iu,is; - + void sub(float *a) { int i, j, k; @@ -1273,9 +1273,9 @@ the end of the execution of the loop construct, the original variable _j_ is updated with the value _N/2_ from the last iteration of the loop. #include - + #define N 100 - + int main(void) { float a[N], b[N/2]; @@ -1289,7 +1289,7 @@ _j_ is updated with the value _N/2_ from the last iteration of the loop. b[j]= a[i] * 2.0f; j++; } - + printf"%d %f %f\n", j, b[0], b[j-1] ); /* print out: 50 2.0 198.0 */ return 0; @@ -1306,36 +1306,36 @@ Since the order of execution of the two sections in this case is unspecified, it is unspecified which section prints which value. #include - + #define NT 4 - + int main( ) { - + int section_count = 0; - + *omp_set_dynamic(0);* *omp_set_num_threads(NT);* *#pragma omp parallel* *#pragma omp sections firstprivate( section_count )* { - + *#pragma omp section* { section_count++; /* may print the number one or two */ printf( "section_count %d\n", section_count ); - + } - + *#pragma omp section* { section_count++; /* may print the number one or two */ printf( "section_count %d\n", section_count ); } - + } - + return 0; } @@ -1349,14 +1349,14 @@ third single construct in this example. The user must not make any assumptions as to which thread will execute a single region. #include - + void work1() {} - + void work2() {} - + void single_example() - - + + *#pragma omp parallel* { *#pragma omp single* @@ -1369,20 +1369,20 @@ assumptions as to which thread will execute a single region. work2(); } } - - - - + + + + |The master Construct a| #include - + extern float average(float,float,float); void master_example( float* x, float* xold, int n, float tol ) { int c, i, toobig; float error, y; c = 0; - + #*pragma omp parallel* { do { @@ -1390,12 +1390,12 @@ assumptions as to which thread will execute a single region. 
for( i = 1; i < n-1; ++i ){ xold[i] = x[i]; } - + *#pragma omp single* { toobig = 0; } - + *#pragma omp for private(i,y,error) reduction(+:toobig)* for(i=1; i tol or error < -tol ) ++toobig; } - + *#pragma omp master* { ++c; @@ -1417,16 +1417,16 @@ assumptions as to which thread will execute a single region. |Parrallel Random Access Iterator Loop a| #include - + void iterator_example() - + { std::vector vec(23); std::vector::iterator it; - + *#pragma omp parallel for default(none) shared(vec)* for (it = vec.begin(); it < vec.end(); it++) - + { // do work with *it // } @@ -1440,11 +1440,11 @@ turn off the dynamic threads capability and set the number of threads explicitly to ensure portability. #include - + #include - + void do_by_16(float *x, int iam, int ipoints) {} - + void dynthreads(float *x, int npoints) { int iam, ipoints; @@ -1472,7 +1472,7 @@ visible in the construct on which the clause appears. Not all of the clauses are valid on all directives. The set of clauses that is valid we To particular guideline is described with the directive. Most of the clauses accept a comma-separated list of list items. All list items -appearing in a clause must be visible._ +appearing in a clause must be visible._ @@ -1668,7 +1668,7 @@ void omp_get_schedule (omp_sched_t *kind, int *edit)s; |Returns the schedule applied when *run-time* schedule is used. a| -int omp_get_thread_limit(void)* +int omp_get_thread_limit(void)* |Returns the maximum number of OpenMP threads available to the program. @@ -1907,7 +1907,7 @@ a| a| -| +\| |0 @@ -1924,7 +1924,7 @@ a| |1 a| -|| +\|\| |0 diff --git a/docs/modules/ROOT/pages/PPChapter2_Hybrid.adoc b/docs/modules/ROOT/pages/PPChapter2_Hybrid.adoc index 1a2ac9fc..54d4428c 100644 --- a/docs/modules/ROOT/pages/PPChapter2_Hybrid.adoc +++ b/docs/modules/ROOT/pages/PPChapter2_Hybrid.adoc @@ -3,14 +3,12 @@ [.text-justify] == MPI, OpenMP two complementary parallelization models. 
-* MPI is a multi-process model whose mode of communication between the processes is *explicit* (communication management is the responsibility of the user). MPI is generally used on multiprocessor machines with distributed memory. MPI is a library for passing messages between processes without sharing. +* *MPI* is a multi-process model whose mode of communication between the processes is *explicit* (communication management is the responsibility of the user). MPI is generally used on multiprocessor machines with distributed memory. MPI is a library for passing messages between processes without sharing. -* OpenMP is a multitasking model whose mode of communication between tasks is *implicit* (the management of communications is the responsibility of the compiler). OpenMP is used on shared-memory multiprocessor machines. It focuses on shared memory paradigms. It is a language extension for expressing data-parallel operations (usually parallelized arrays over loops). +* *OpenMP* is a multitasking model whose mode of communication between tasks is *implicit* (the management of communications is the responsibility of the compiler). OpenMP is used on shared-memory multiprocessor machines. It focuses on shared memory paradigms. It is a language extension for expressing data-parallel operations (usually parallelized arrays over loops). Note: on a cluster of independent shared-memory multiprocessor machines (nodes), the implementation of a two-level parallelization (MPI, OpenMP) in the same program can be a major advantage for the parallel performance of the code. 
-image::image7.png[xref=#fragment7,width=581,height=336] - == MPI vs OpenMP comparison diff --git a/docs/modules/ROOT/pages/PPChapter2_OpenMP.adoc b/docs/modules/ROOT/pages/PPChapter2_OpenMP.adoc index 5bb7d4e5..f09b8c8a 100644 --- a/docs/modules/ROOT/pages/PPChapter2_OpenMP.adoc +++ b/docs/modules/ROOT/pages/PPChapter2_OpenMP.adoc @@ -1,7 +1,7 @@ = OpenMP (Open Multi-Processing) -== Definition +== Definition [.text-justify] OpenMP ( Open Multi-Processing ) is a programming interface for parallel computing on shared memory architecture. @@ -122,32 +122,32 @@ subroutine or set the OMP_DYNAMIC environment variable to true. It is possible to nest (nesting) parallel regions, but this only has an effect if this mode has been activated by calling the OMP_SET_NESTED subroutine or by setting the OMP_NESTED environment variable. *Examples* - + #include - int main() - { + int main() + { int row; - #pragma omp parallel private(rank) num_threads(3) - { - rank=omp_get_thread_num(); + #pragma omp parallel private(rank) num_threads(3) + { + rank=omp_get_thread_num(); printf("My rank in region 1: %d \n",rank); - #pragma omp parallel private(rank) num_threads(2) - { - rank=omp_get_thread_num(); - printf(" My rank in region 2: %d \n",rank); + #pragma omp parallel private(rank) num_threads(2) + { + rank=omp_get_thread_num(); + printf(" My rank in region 2: %d \n",rank); } } - return 0; + return 0; } - - My rank in region 1: 0 - My rank in region 2: 1 - My rank in region 2: 0 - My rank in region 1: 2 - My rank in region 2: 1 - My rank in region 2: 0 - My rank in region 1: 1 - My rank in region 2: 0 + + My rank in region 1: 0 + My rank in region 2: 1 + My rank in region 2: 0 + My rank in region 1: 2 + My rank in region 2: 1 + My rank in region 2: 0 + My rank in region 1: 1 + My rank in region 2: 0 My rank in region 2: 1 *Work sharing* @@ -350,10 +350,10 @@ block or an OpenMP Construct. 
A “structured block” is a single statement or a compound statement with a single entry at the top and a single exit at the bottom. - + The *parallel* construction forms To team of threads and starts parallel execution. - + *#pragma comp parallel* _[clause[ [_ *,* _]clause] ...] new-line structured-block_ _clause_ : *if(* _scalar- expression_ *)* @@ -388,7 +388,7 @@ distributed among and executed by the meeting team of threads. _[_ *#pragma comp section* _new-line] structured-block_ _[_ *#pragma comp section* _new-line structured-block ]_ _clause_ : *private(* _list_ *)* - *firstprivate(* _list_ *) + *firstprivate(* _list_ *) lastprivate(* _list_ *) reduce(* _operator_ *:* _list_ *) nowait* @@ -398,7 +398,7 @@ a| executed by only one of the threads in the team (not necessarily the master thread), in the context of its implicit task. - + *#pragma comp single* _[clause[[_ *,* _] clause] ...] new-line structured-block_ _clause_ : *private(* _list_ *)* @@ -423,18 +423,18 @@ allowed for the *parallel* and worksharing constructs. *#pragma comp task* _[clause[ [_ *,* _]clause] ...] new-line structured-block_ _clause_ : *if(* _scalar- expression_ *)* - + === untied - - + + *default(shared none) private(* _list_ *) firstprivate(* _list_ *) shared(* _list_ *)* - + *Master* construction specifies To structured block that is executed by the Master thread of the team. There is no implied barriers either on entry to, or exit from, the master construct. - - + + *#pragma comp Master* _new-line structured-block_ a| @@ -447,12 +447,12 @@ block to a single thread at a time. The *barriers* construction specifies year explicit barriers did the point did which the construct appears. - + *#pragma comp barriers* _new- line_ - + The *taskwait* construction specifies To wait we the completion of child tasks generated since the beginning of the current task. - + *#pragma comp you asked* _new line_ a| @@ -488,13 +488,13 @@ allowing code outside the region to run in parallel. 
*#pragma comp ordered* _new-line structured-block_ - - + + a| *threadprivate* guideline specifies that variables are replicated, with each thread having its own copy. - - + + *#pragma comp threadprivate* _( list) new- line_ |=== @@ -521,17 +521,17 @@ The loop iteration variable is private by default, so it is not necessary to spe for (i=1; i int main() { @@ -576,7 +576,7 @@ implementations that support it. |The nowait Clause a| If there are multiple independent loops within a parallel region, you can use the nowait clause to avoid the implied barrier at the end of the loop construct - + #include void nowait_example(int n, int m, float *a, float *b, float *y, float *z) { @@ -692,8 +692,8 @@ assumptions as to which thread will execute a single region. printf("Finished work1 and beginning work2.\n"); work2(); } - - + + |The master Construct a| #include extern float average(float,float,float); @@ -781,7 +781,7 @@ visible in the construct on which the clause appears. Not all of the clauses are valid on all directives. The set of clauses that is valid we To particular guideline is described with the directive. Most of the clauses accept a comma-separated list of list items. All list items -appearing in a clause must be visible._ +appearing in a clause must be visible._ @@ -953,7 +953,7 @@ void omp_get_schedule (omp_sched_t *kind, int *edit)s; |Returns the schedule applied when *run-time* schedule is used. a| -int omp_get_thread_limit(void)* +int omp_get_thread_limit(void)* |Returns the maximum number of OpenMP threads available to the program. 
@@ -1168,7 +1168,7 @@ a| a| -| +\| |0 @@ -1185,7 +1185,7 @@ a| |1 a| -|| +\|\| |0 diff --git a/docs/modules/ROOT/pages/PPChapter3.adoc b/docs/modules/ROOT/pages/PPChapter3.adoc index 90a5d03b..d99e0268 100644 --- a/docs/modules/ROOT/pages/PPChapter3.adoc +++ b/docs/modules/ROOT/pages/PPChapter3.adoc @@ -1,34 +1,44 @@ = Star PU +== Introduction + +[.text-justify] +StarPU is a sophisticated C task scheduling library designed for hybrid architectures, enabling users to define tasks that can be executed on both CPUs and GPUs. It facilitates the specification of dependencies between these tasks and efficiently manages their scheduling across all available processing units. The primary objective of StarPU is to create systems where applications are distributed throughout the entire machine, allowing parallel tasks to run concurrently on all accessible resources. + [.text-justify] -StarPU is a C task scheduling library for hybrid architectures. It provides users with a means to define tasks that can run on both CPU and GPU, as well as the dependencies between these tasks, and takes care of scheduling them across all available processing units. The goal of StarPU is therefore to design systems in which applications are distributed across the entire machine, running parallel tasks on all available resources. It keeps track of the copies of each of the data in the different memories on board the accelerators and provides mechanisms such as data preloading. The calculation time has been significantly reduced, as well as the high efficiency in the use of different calculation resources, the different typical workloads, especially in the case of multi-core machines equipped with several acceleration machines. In doing so, StarPU abstracts the programmer from the underlying technical details: StarPU adapts to the available processing units and takes care of transferring data between them when necessary. 
StarPU effectively addresses scheduling problems, but the C language interface it offers is poor and makes programming errors easier. +The library meticulously tracks the locations of data copies across various memory units associated with the accelerators and incorporates mechanisms such as data preloading to enhance performance. This approach has led to significant reductions in computation time and improved efficiency in utilizing diverse computational resources, particularly in multi-core environments equipped with multiple accelerators. By abstracting the underlying technical complexities, StarPU allows programmers to focus on higher-level algorithmic concerns while it autonomously adapts to the available processing units and manages necessary data transfers. + +[.text-justify] +StarPU provides algorithms and constraints for task implementations, supporting CPU/GPU executions through a task graph that can be constructed using its comprehensive C/C++/Fortran/Python API or OpenMP pragmas. Internally, StarPU addresses several critical aspects: + +- Management of task dependencies. +- Optimized heterogeneous scheduling. +- Efficient data transfers and replication between main memory and discrete memory units. +- Enhanced communication within clusters. +- Optimized data transfers and replication between main memory and +discrete memories. +- Optimized cluster communications. -The app provides algorithms and constraints +The app provides algorithms and constraints : -* CPU/GPU implementations of tasks -* A task graph, using either StarPU's rich C/C++/Fortran/Python API or +- CPU/GPU implementations of tasks +- A task graph, using either StarPU's rich C/C++/Fortran/Python API or OpenMP pragmas. 
-StarPU internally deals with the following aspects: -* Task dependencies -* Optimized heterogeneous scheduling -* Optimized data transfers and replication between main memory and -discrete memories -* Optimized cluster communications image::image19.png[xref=#fragment19,width=179,height=179] -Links: +== References -https://hpc2n.github.io/Task-based-parallelism/branch/master/starpu1/#hello-world +** [1] https://hpc2n.github.io/Task-based-parallelism/branch/master/starpu1/#hello-world -https://github.com/alucas/StarPU/tree/master +** [2] https://github.com/alucas/StarPU/tree/master -https://hpc2n.github.io/Task-based-parallelism/branch/master/starpu1/#benefits-and-downsides +** [3] https://hpc2n.github.io/Task-based-parallelism/branch/master/starpu1/#benefits-and-downsides -https://indico.math.cnrs.fr/event/6415/attachments/2736/3475/2021.02.24_-_exa2pro-eocoe_workshop_-_StarPU_-_S._Thibault.pdf +** [4] https://indico.math.cnrs.fr/event/6415/attachments/2736/3475/2021.02.24_-_exa2pro-eocoe_workshop_-_StarPU_-_S._Thibault.pdf -https://gitub.u-bordeaux.fr/starpu/starpu/-/tree/master/examples +** [5] https://gitub.u-bordeaux.fr/starpu/starpu/-/tree/master/examples diff --git a/docs/modules/ROOT/pages/PPChapter4.adoc b/docs/modules/ROOT/pages/PPChapter4.adoc index b5ffd4bc..59b29c38 100644 --- a/docs/modules/ROOT/pages/PPChapter4.adoc +++ b/docs/modules/ROOT/pages/PPChapter4.adoc @@ -1,14 +1,24 @@ = Specx [.text-justify] -SPECX is a task-based execution system. It shares many similarities -with StarPU but is written in modern C++. It also supports speculative -execution, which is the ability to run tasks ahead of time if others are -unsure about changing the data. +*SPECX* is an innovative task-based programming model designed to address the challenges of modern high-performance computing (HPC). In a context where computing power and efficiency are paramount, SPECX is positioned as a leading solution to optimize the performance of complex parallel applications. 
+This open-source framework offers developers and researchers a powerful tool to fully exploit the potential of advanced computing architectures. SPECX facilitates the creation and management of asynchronous tasks, enabling more efficient use of available computing resources, whether multi-core processors, clusters or supercomputers. + + +SPECX is a task-based execution system. It shares many similarities with StarPU but is written in modern C++. It also supports speculative execution, which is the ability to run tasks ahead of time if others are unsure about changing the data. image::image21.png[xref=#fragment21,image,width=642,height=380] + +SPECX's key features include: + +* A task-based approach to simplify parallel programming. +* Automatic management of dependencies between tasks. +* Asynchronous execution to maximize resource utilization. +* Compatibility with various HPC environments. + + == Workflow [.text-justify] * *Execution interface:* Provides functionality for creating tasks, task diff --git a/docs/modules/ROOT/pages/antora.adoc b/docs/modules/ROOT/pages/antora.adoc index abb94a89..3f1a2537 100644 --- a/docs/modules/ROOT/pages/antora.adoc +++ b/docs/modules/ROOT/pages/antora.adoc @@ -1,7 +1,7 @@ = Antora environment -Antora is our static website generator. -We use it to generate the documentation of the project +Antora is our static website generator. +We use it to generate the documentation of the project It is part of the documentation of docs.feelpp.org[{feelpp} docs] website. .To generate the documentation @@ -15,4 +15,3 @@ It is part of the documentation of docs.feelpp.org[{feelpp} docs] website. 
<2> Generate the documentation <3> Serve the documentation, the url is provided in the console -include::antora-ui::partial$antora.adoc[leveloffset=+1] diff --git a/docs/modules/ROOT/pages/architectures/PPChapter1_DPU.adoc b/docs/modules/ROOT/pages/architectures/PPChapter1_DPU.adoc index 80bee071..528ead85 100644 --- a/docs/modules/ROOT/pages/architectures/PPChapter1_DPU.adoc +++ b/docs/modules/ROOT/pages/architectures/PPChapter1_DPU.adoc @@ -3,7 +3,7 @@ image::DPU1.jpg[xref=#fragment03,width=322,height=220] [.text-justify] -Data Processing Units (DPUs) are specialized hardware components designed to optimize data-centric workloads in modern computing environments, particularly in data centers. They have emerged as a crucial element alongside Central Processing Units (CPUs) and Graphics Processing Units (GPUs), forming a triad of processing units that address different computational needs. +*Data Processing Units (DPUs)* are specialized hardware components designed to optimize data-centric workloads in modern computing environments, particularly in data centers. They have emerged as a crucial element alongside Central Processing Units (CPUs) and Graphics Processing Units (GPUs), forming a triad of processing units that address different computational needs. [.text-justify] Specialists in moving data in data centers, DPUs, or data processing units, are a new class of programmable processor and will join CPUs and GPUs as one of the three pillars of computing. @@ -38,46 +38,46 @@ A DPU is a programmable processor that integrates several key features: - **High-speed Networking**: Facilitates rapid data transfer, which is essential for data-intensive applications. -The primary role of a DPU is to offload and accelerate tasks traditionally managed by CPUs, thereby improving overall data throughput and freeing the CPU to focus on more complex computations. 
This capability is particularly beneficial in environments dealing with large volumes of data, such as cloud computing, artificial intelligence, and big data analytics[1][3][4]. +The primary role of a DPU is to offload and accelerate tasks traditionally managed by CPUs, thereby improving overall data throughput and freeing the CPU to focus on more complex computations. This capability is particularly beneficial in environments dealing with large volumes of data, such as cloud computing, artificial intelligence, and big data analytics [1][3][4]. == Evolution and Importance -DPUs evolved from earlier technologies like network interface cards (NICs) and smart NICs, which were designed to offload specific tasks from the CPU. As data processing demands grew, especially with the rise of AI and complex workloads, the need for dedicated processors that could efficiently manage data movement and processing became apparent. DPUs are now seen as essential for enhancing the performance and efficiency of data centers by handling tasks such as data transfer, security, and analytics, which allows CPUs to concentrate on core application cycles[2][4][5]. +DPUs evolved from earlier technologies like network interface cards (NICs) and smart NICs, which were designed to offload specific tasks from the CPU. As data processing demands grew, especially with the rise of AI and complex workloads, the need for dedicated processors that could efficiently manage data movement and processing became apparent. DPUs are now seen as essential for enhancing the performance and efficiency of data centers by handling tasks such as data transfer, security, and analytics, which allows CPUs to concentrate on core application cycles [2][4][5]. == Key Benefits -1. **Increased Efficiency**: DPUs significantly improve performance metrics, such as performance per watt and performance per dollar, by optimizing data handling tasks[3]. +1. 
**Increased Efficiency**: DPUs significantly improve performance metrics, such as performance per watt and performance per dollar, by optimizing data handling tasks [3]. -2. **Cost Reduction**: By offloading work from CPUs, DPUs can lower the total cost of ownership (TCO) for data centers, making them more economically viable for cloud service providers and enterprises[3]. +2. **Cost Reduction**: By offloading work from CPUs, DPUs can lower the total cost of ownership (TCO) for data centers, making them more economically viable for cloud service providers and enterprises [3]. -3. **Enhanced Scalability**: DPUs support composable infrastructure architectures, allowing for more flexible and scalable data center designs[4]. +3. **Enhanced Scalability**: DPUs support composable infrastructure architectures, allowing for more flexible and scalable data center designs [4]. -4. **Specialized Processing**: They are tailored for specific data-related functions, such as encryption, data reduction, and network offloading, which enhances their effectiveness compared to general-purpose CPUs[1][3]. +4. **Specialized Processing**: They are tailored for specific data-related functions, such as encryption, data reduction, and network offloading, which enhances their effectiveness compared to general-purpose CPUs [1][3]. == Performance Comparison === **Functionality and Specialization** -- **DPU**: Primarily designed to handle data-centric workloads, DPUs excel in tasks such as data transfer, security, and network management. They integrate multiple functions, including high-performance networking and programmable accelerators, allowing them to offload tasks from CPUs and GPUs effectively. This specialization enables DPUs to manage data movement and processing more efficiently than general-purpose CPUs or even GPUs in certain contexts[8][11]. +- **DPU**: Primarily designed to handle data-centric workloads, DPUs excel in tasks such as data transfer, security, and network management. 
They integrate multiple functions, including high-performance networking and programmable accelerators, allowing them to offload tasks from CPUs and GPUs effectively. This specialization enables DPUs to manage data movement and processing more efficiently than general-purpose CPUs or even GPUs in certain contexts [8][11]. -- **GPU**: GPUs are optimized for parallel processing and are particularly effective at handling tasks that require massive parallel computations, such as graphics rendering and machine learning. However, they are not inherently designed for data management tasks, which can limit their efficiency in data-centric environments where data movement and processing are critical[9][11]. +- **GPU**: GPUs are optimized for parallel processing and are particularly effective at handling tasks that require massive parallel computations, such as graphics rendering and machine learning. However, they are not inherently designed for data management tasks, which can limit their efficiency in data-centric environments where data movement and processing are critical [9][11]. === 2. **Architecture and Design** -- **DPU Architecture**: DPUs typically feature a multi-core CPU architecture combined with hardware accelerators tailored for specific data processing tasks. This design allows them to perform operations like packet processing and encryption at high speeds, often achieving data transfer rates of 100-200 Gbps[8][10]. +- **DPU Architecture**: DPUs typically feature a multi-core CPU architecture combined with hardware accelerators tailored for specific data processing tasks. This design allows them to perform operations like packet processing and encryption at high speeds, often achieving data transfer rates of 100-200 Gbps [8][10]. -- **GPU Architecture**: GPUs consist of thousands of cores optimized for parallel execution, making them powerful for tasks that can be parallelized. 
However, their architecture is less flexible for general data processing tasks compared to DPUs, which are designed specifically for such workloads[9][11]. +- **GPU Architecture**: GPUs consist of thousands of cores optimized for parallel execution, making them powerful for tasks that can be parallelized. However, their architecture is less flexible for general data processing tasks compared to DPUs, which are designed specifically for such workloads [9][11]. === **Efficiency and Power Consumption** -- **DPU Efficiency**: DPUs are engineered to be energy-efficient by offloading tasks from CPUs and GPUs, which reduces overall power consumption in data centers. They can handle multiple workloads simultaneously while maintaining low energy usage, making them a cost-effective solution for managing data-intensive applications[9][10]. +- **DPU Efficiency**: DPUs are engineered to be energy-efficient by offloading tasks from CPUs and GPUs, which reduces overall power consumption in data centers. They can handle multiple workloads simultaneously while maintaining low energy usage, making them a cost-effective solution for managing data-intensive applications [9][10]. -- **GPU Power Consumption**: While GPUs are powerful, they tend to consume more power, especially when executing complex computations. Their design is focused on maximizing throughput for parallel tasks, which can lead to higher energy costs in data centers[9][11]. +- **GPU Power Consumption**: While GPUs are powerful, they tend to consume more power, especially when executing complex computations. Their design is focused on maximizing throughput for parallel tasks, which can lead to higher energy costs in data centers [9][11]. === **Overall Impact on Data Center Performance** -DPUs enhance data center performance by optimizing data flow and reducing latency in data processing. 
They effectively manage workloads that would otherwise burden CPUs and GPUs, leading to improved overall efficiency in data handling and processing tasks. In contrast, while GPUs excel in specific computational tasks, they do not manage data as effectively as DPUs, particularly in scenarios requiring extensive data movement and management[8][9][10]. +DPUs enhance data center performance by optimizing data flow and reducing latency in data processing. They effectively manage workloads that would otherwise burden CPUs and GPUs, leading to improved overall efficiency in data handling and processing tasks. In contrast, while GPUs excel in specific computational tasks, they do not manage data as effectively as DPUs, particularly in scenarios requiring extensive data movement and management [8][9][10]. Other points, DPUs and GPUs (Graphics Processing Units) can work together to enhance data center performance. Their collaboration leverages the strengths of each processing unit, optimizing the overall efficiency and capability of data centers. diff --git a/docs/modules/ROOT/pages/architectures/PPChapter1_NPU.adoc b/docs/modules/ROOT/pages/architectures/PPChapter1_NPU.adoc index b2de81b7..6b025a13 100644 --- a/docs/modules/ROOT/pages/architectures/PPChapter1_NPU.adoc +++ b/docs/modules/ROOT/pages/architectures/PPChapter1_NPU.adoc @@ -5,19 +5,19 @@ image::NPU001.jpg[xref=#fragment03,width=322,height=220] [.text-justify] == What’s an NPU (Neural Processing Unit)? -An NPU, or Neural Processing Unit, is a specialized hardware accelerator designed for executing artificial neural network tasks efficiently and with high throughput. NPUs deliver high performance while minimizing power consumption, making them suitable for mobile devices, edge computing, and other energy-sensitive applications. 
With the spike in GPU prices, which is a limited supply despite the increasing demand starting with crypto mining, hardware companies have invested in NPUs to position them as an alternative to GPUs. While an NPU is not a perfect substitute for a GPU, it helps run inference on mobile or embedded. NPUs use the traditional von Neumann architecture, which separates the memory and the processing units. TPUs use the systolic array architecture, which integrates the memory and the processing units into a single chip. NPUs have a higher peak performance than TPUs, but they also have a higher latency and power consumption. TPUs have a lower peak performance than NPUs, but they also have a lower latency and power consumption. +An *NPU*, or *Neural Processing Unit*, is a specialized hardware accelerator designed for executing artificial neural network tasks efficiently and with high throughput. NPUs deliver high performance while minimizing power consumption, making them suitable for mobile devices, edge computing, and other energy-sensitive applications. With the spike in GPU prices, which is a limited supply despite the increasing demand starting with crypto mining, hardware companies have invested in NPUs to position them as an alternative to GPUs. While an NPU is not a perfect substitute for a GPU, it helps run inference on mobile or embedded. NPUs use the traditional von Neumann architecture, which separates the memory and the processing units. TPUs use the systolic array architecture, which integrates the memory and the processing units into a single chip. NPUs have a higher peak performance than TPUs, but they also have a higher latency and power consumption. TPUs have a lower peak performance than NPUs, but they also have a lower latency and power consumption. [.text-justify] == What’s a TPU (Tensor Processing Unit)? 
-A TPU, or Tensor Processing Unit, is a specialized application-specific integrated circuit (ASIC) developed by Google for accelerating machine learning workloads. TPUs efficiently perform essential neural network tasks, such as matrix multiplications or other tensor operations. Since TPUs are optimized for the specific mathematical operations in neural network training and inference, they offer superior performance and energy efficiency. However, machine learning developers may prefer GPUs, especially NVIDIA GPUs, over TPUs due to the network effect. NVIDIA’s brand, mature software stack, simple documentation, and integration with major frameworks give NVIDIA a competitive advantage over other GPU manufacturers or alternatives. +A *TPU*, or *Tensor Processing Unit*, is a specialized application-specific integrated circuit (ASIC) developed by Google for accelerating machine learning workloads. TPUs efficiently perform essential neural network tasks, such as matrix multiplications or other tensor operations. Since TPUs are optimized for the specific mathematical operations in neural network training and inference, they offer superior performance and energy efficiency. However, machine learning developers may prefer GPUs, especially NVIDIA GPUs, over TPUs due to the network effect. NVIDIA’s brand, mature software stack, simple documentation, and integration with major frameworks give NVIDIA a competitive advantage over other GPU manufacturers or alternatives. [.text-justify] == What are the advantages and disadvantages of NPUs and TPUs? Based on the comparison above, we can summarize the advantages and disadvantages of NPUs and TPUs as follows: -NPUs: NPUs have the advantage of having a higher peak performance than TPUs, which means they can handle more complex and diverse neural networks. However, NPUs also have the disadvantage of having a higher latency and power consumption than TPUs, which means they are slower and more costly to run. 
+*NPUs* : NPUs have the advantage of having a higher peak performance than TPUs, which means they can handle more complex and diverse neural networks. However, NPUs also have the disadvantage of having a higher latency and power consumption than TPUs, which means they are slower and more costly to run. -TPUs: TPUs have the advantage of having a lower latency and power consumption than NPUs, which means they are faster and more efficient to run. However, TPUs also have the disadvantage of having a lower peak performance than NPUs, which means they can handle only specific and optimized neural networks. +*TPUs* : TPUs have the advantage of having a lower latency and power consumption than NPUs, which means they are faster and more efficient to run. However, TPUs also have the disadvantage of having a lower peak performance than NPUs, which means they can handle only specific and optimized neural networks. [.text-justify] == NPUs are vital for efficiency diff --git a/docs/modules/ROOT/pages/architectures/PPChapter1_QPU.adoc b/docs/modules/ROOT/pages/architectures/PPChapter1_QPU.adoc new file mode 100644 index 00000000..2de51e86 --- /dev/null +++ b/docs/modules/ROOT/pages/architectures/PPChapter1_QPU.adoc @@ -0,0 +1,74 @@ += QPU (Quantum Processing Units) + +image::QPU002.jpg[xref=#fragment03,width=322,height=220] + +[.text-justify] +A quantum processing unit (QPU) is the central component of a quantum computer. It functions as the "brain" where the qubits reside and computations are performed. Similar to the central processing unit (CPU) of classical computers, a QPU requires significant support infrastructure. However, unlike CPUs, this infrastructure can vary considerably depending on the quantum computing modality used. In reality, quantum hardware is far more diverse and variable than classical computing systems at this stage of development. + +Analogies are often made to explain QPUs, but they can sometimes be insufficient. 
For example, a Microcontroller Tips article titled "What is a Quantum Processing Unit?" compares QPUs to microcontrollers. This analogy is misleading because QPUs don't control anything; they themselves must be controlled by external systems. Furthermore, the challenges mentioned in this article, such as control, temperature regulation, and manufacturing issues, apply specifically to superconducting processors and silicon spin processors but do not apply universally to all quantum modalities. Neutral atom quantum processors, for example, do not face these limitations, highlighting the diversity of quantum computing technologies currently under development. This diversity is also highlighted in a Medium article by QuAIL Technologies titled "Quantum Processing Units (QPUs)." + +Another article by Huawei, titled "What is a Quantum Processing Unit?", compares QPUs to CPUs, which provides a closer analogy. However, the claim that QPUs compute faster than CPUs requires clarification. In reality, QPUs perform computations much more slowly than CPUs but are far more efficient for certain types of problems. This efficiency can significantly reduce computation times for certain classes of problems. As with the "Microcontroller Tips" article, Huawei's analysis of issues such as noise, scalability, and connectivity applies to certain modalities, but not to neutral atom processors or other emerging technologies. + +Perhaps the most apt analogy for QPUs regarding their future role is that of the graphics processing unit (GPU). Just as CPUs continue to handle most computation today, they will remain at the heart of future computing architectures. GPUs and other specialized processors will continue to handle specific tasks as they do today. However, QPUs are expected to join this group as an additional specialized processor to solve specific computational problems. 
Although QPUs and GPUs perform fundamentally different types of computation, their role within computing systems—as specialized tools for specific tasks—will be strikingly similar. + + +[.text-justify] +== What is a Quantum Processing Unit? +The quantum technology that goes into a quantum processor can vary significantly. The computing process can vary quite a bit, as well. Just a few examples of qubit modalities and their supporting technologies to showcase the extent of this variety include: + +* Neutral atoms in vacuum chambers, cooled and controlled by lasers +* Electronic circuits, cooled by dilution refrigerators and controlled by microwaves +* Individual electrons, trapped in vacuum chambers like ionized atoms, but cooled by dilution refrigerators and controlled by microwaves +* Photonic integrated circuits at room temperature, but with cryogenically-cooled detectors and controlled by physical hardware + +There are quite a few more modalities in development than are listed above, but these four alone already demonstrate quite a bit of variation. Some of the other modalities use similar technologies, but they differ in what actually constitutes a qubit. + + +[.text-justify] +== How do quantum processing units (QPUs) use qubits to solve problems? + +Quantum processors, or QPUs, take a radically different approach to performing calculations than classical computers. While traditional computers follow deterministic logic, performing each step of a complex calculation sequentially, quantum circuits composed of qubits use the fundamental principles of quantum mechanics to simultaneously process large amounts of data. This unique capability offers a new way to solve certain types of problems, with potentially exponential efficiency gains in specific fields such as optimization, molecular simulation, or machine learning. 
+ +*Qubits: at the heart of quantum computing:* Unlike bits in classical computers, which can only assume two states (0 or 1), qubits can exist in a superposition of states. This means that a qubit can simultaneously represent 0, 1, and all possible combinations between these two states. For example, a system composed of multiple qubits can maintain a superposition of an exponential set of values relative to the number of qubits. This property allows QPUs to simultaneously explore a much larger solution space than traditional computers. In QPUs developed by IBM, qubits are often made from superconducting or semiconductor materials. These superconducting qubits are cooled to extremely low temperatures to minimize external disturbances and maintain their quantum coherence, a state essential for performing precise calculations. + +*Entanglement: An Extra Power:* Another key property of quantum systems is entanglement. When two qubits are entangled, their states become interdependent, even if they are physically separated. This means that a measurement of one instantly affects the state of the other. This interdependence allows quantum algorithms to exploit complex correlations between data, which is impossible to reproduce with independent classical bits. Entanglement is therefore a fundamental tool for solving certain problems where complexity increases rapidly with the size of the system. + +*From superposition to binary result:* At the end of a quantum computation, the system must provide a result understandable to the classical world. The data is converted into binary code: each qubit is measured as being either in state 0 or in state 1. The probability of obtaining one or the other state depends directly on its contribution to the superposition during the computation. This measurement process is crucial but delicate, as it leads to "decoherence," i.e., the loss of the system's quantum properties. 
+ +The fundamental principles of quantum technologies +Quantum technologies are based on four key principles that differentiate these systems from classical computers: + +image::QPU003.jpg[xref=#fragment03,width=322,height=220] +* Superposition: Allows a qubit to exist in multiple states simultaneously. + +* Entanglement: Creates a dependency between entangled qubits, facilitating parallel and correlated processing. + +* Nonlocality: Allows quantum effects to act instantly over large distances. + +* Interference: Used to amplify correct solutions and reduce incorrect ones during computation. + +These properties allow QPUs to use either real particles (such as trapped ions or molecular qubits) or material systems that mimic their behavior (such as superconducting qubits). Thanks to these unique characteristics, quantum processors can perform calculations impossible with classical binary bits, paving the way for a technological revolution in diverse fields such as cryptography, computational chemistry, and artificial intelligence. + +*Types of qubits:* +* Superconducting qubits: Made from superconducting materials operating at low temperatures, these qubits are favored for their speed in performing computations and fine-tuned control. + +* Trapped ion qubits: Trapped ions can also be used as qubits and are noted for long coherence times and high-fidelity measurements. Ions are atoms with electrical charge. + +* Quantum dots: Quantum dots are small semiconductors that capture a single electron and use it as a qubit, offering promising potential for scalability and compatibility with existing semiconductor technology. + +* Photons: Photons are individual light particles used to send quantum information across long distances through optical fiber cables and are being used in quantum communication and quantum cryptography. + +* Neutral atoms: Commonly occurring neutral atoms charged with lasers are well suited for scaling and performing operations. 
+ +Certain types of qubits are better suited for certain tasks, although all known qubits are still highly sensitive. QPUs used in functional quantum computers require significant support hardware and software to maintain proper calibration and handle external noise. + + +[.text-justify] +== Why is the QPU the next GPU? +Quantum Processing Units (QPUs) represent a groundbreaking leap in computing technology. Unlike classical processors, QPUs leverage qubits and quantum circuit architectures to solve problems that are computationally intractable for traditional systems, such as molecular simulations or large-scale optimization challenges. The evolution of GPUs revolutionized classical computing by enabling parallel processing for graphics, AI, and simulations. QPUs aim to push these boundaries further through quantum mechanics principles like superposition, where qubits exist in multiple states simultaneously, and entanglement, which allows qubits to correlate instantaneously across distances. This enables QPUs to process data at scales unattainable by classical hardware. Key application areas include drug discovery, where QPUs could model molecular interactions with unprecedented accuracy, and materials science, where they might design next-generation materials like high-efficiency batteries. In finance and AI, QPUs could optimize complex portfolios or enhance machine learning algorithms beyond current limits. However, challenges remain. Qubit stability, error correction, and the development of quantum-specific programming tools are critical hurdles. Integrating QPUs into existing infrastructures will require hybrid architectures that combine quantum, classical, and GPU-based systems. From a sustainability perspective, QPUs could reduce the energy demands of data centers by offering more efficient alternatives to GPU-heavy operations.
+ +[.text-justify] +== Conclusion + +image::QPU004.jpg[xref=#fragment03,width=322,height=220] +Fig: IBM Q quantum computer \ No newline at end of file diff --git a/docs/modules/ROOT/pages/index.adoc b/docs/modules/ROOT/pages/index.adoc index 4a61b46b..872f4baf 100644 --- a/docs/modules/ROOT/pages/index.adoc +++ b/docs/modules/ROOT/pages/index.adoc @@ -7,165 +7,102 @@ endif::[] .INTRODUCTION [.examp] **** - +[.text-justify] In many applications today, software must make decisions quickly. And the best way to do so is parallel programming in C/C++ and Multithreading (multithread programming). Parallel programming is a programming method which allows you to execute several calculations or processes simultaneously. It is used to improve the performance of applications by using multi-core architectures and distributed systems. Parallel programming consists of breaking down a problem into sub-problems which can be solved simultaneously by several calculation units. This reduces the overall execution time of a program by effectively using available hardware resources. Parallel machines offer a wonderful opportunity for applications with large calculation requirements. Effective use of these machines, however, requires an in-depth understanding of their operation. Let's see more about what parallel computing and programming are...
-*What is Parallel Computing?* - -*Serial Computing* - - -Traditionally, software has been written for serial computation: - -* A problem is broken into a discrete series of instructions -* Instructions are executed sequentially one after another -* Executed on a single processor -* Only one instruction may execute at any moment in time - -image::serialProblem.gif[xref=#fragment_000_001,width=400,height=400] - -*Parallel Computing* - -In the simplest sense, parallel computing is the simultaneous use of multiple compute resources to solve a computational problem: - -* A problem is broken into discrete parts that can be solved concurrently -** Each part is further broken down to a series of instructions -** Instructions from each part execute simultaneously on different processors -** An overall control/coordination mechanism is employed - - -image::parallelProblem.gif[xref=#fragment_000_002,width=400,height=400] - -For example - -image::parallelProblem2.gif[xref=#fragment_000_003,width=400,height=400] - -* The computational problem should be able to: -** Be broken apart into discrete pieces of work that can be solved simultaneously; -** Execute multiple program instructions at any moment in time; -** Be solved in less time with multiple compute resources than with a single compute resource. -* The compute resources are typically: -** A single computer with multiple processors/cores -** An arbitrary number of such computers connected by a network - - -*Parallel Computers* - -*Virtually all stand-alone computers today are parallel from a hardware perspective: -** Multiple functional units (L1 cache, L2 cache, branch, prefetch, decode, floating-point, graphics processing (GPU), integer, etc.) -** Multiple execution units/cores -** Multiple hardware threads - -image::bgqComputeChip.jpeg[xref=#fragment_000_004,width=400,height=400] +*1. 
What is Parallel Computing?* +[.text-justify] +In the field of computing, the evolution of technologies has led to significant advances in the way we approach complex problems. Among these advances, parallel computing stands out as a revolutionary approach, offering a powerful alternative to traditional serial computing. To fully understand this concept, it is essential to compare these two computing methods and explore their fundamental characteristics. So, let’s first look at serial computing and then dive into the fascinating world of parallel computing. -* Networks connect multiple stand-alone computers (nodes) to make larger parallel computer clusters. +[.text-justify] +*Serial Computing*: Traditionally, software has been developed with a focus on serial computation. In this conventional framework, a problem is decomposed into a discrete series of instructions that are executed sequentially, one after the other, on a single processor. This approach inherently limits the system's efficiency, as only one instruction can be processed at any given moment. -image::nodesNetwork.gif[xref=#fragment_000_005,width=400,height=400] +[.text-justify] +*Parallel Computing*: In contrast, parallel computing represents a significant advancement in computational methodology. At its core, parallel computing involves the simultaneous utilization of multiple computational resources to address complex problems. In this paradigm, a problem is divided into discrete components that can be solved concurrently. Each component is further segmented into a series of instructions, which are executed simultaneously across different processors. This necessitates the implementation of an overarching control and coordination mechanism to ensure the effective integration of results.
- -* For example, the schematic below shows a typical LLNL parallel computer cluster: -** Each compute node is a multi-processor parallel computer in itself -** Multiple compute nodes are networked together with an Infiniband network -** Special purpose nodes, also multi-processor, are used for other purposes +*2. Overview of the different hardware architectures* -image::parallelComputer1.gif[xref=#fragment_000_006,width=400,height=400] +[.text-justify] +In modern computing, various hardware architectures have been developed to address specific computational needs. Each architecture is uniquely designed to optimize performance for particular tasks, ranging from general-purpose processing to specialized operations in graphics, machine learning, and natural language processing. This section explores the key architectures—CPU, GPU, GPGPU, TPU, NPU, and LPU—highlighting their distinct roles and applications in the evolving landscape of technology. -* The majority of the world's large parallel computers (supercomputers) are clusters of hardware produced by a handful of (mostly) well known vendors. +*2.1 CPU, GPU, GPGPU Architecture* +[.text-justify] +** *CPU*, *GPU*, and *GPGPU* architectures are all types of computer processing architectures, but they differ in their design and operation. -*CPU, GPU, GPGPU Architecture* +[.text-justify] +* *CPU*: A central processor (CPU) is a processing unit that is designed to perform various computing tasks including data processing, mathematical and logical calculations, communication between different components of a computer system, etc. Modern CPUs usually have multiple cores to process multiple tasks simultaneously. -* CPU, GPU, and GPGPU architectures are all types of computer processing architectures, but they differ in their design and operation. +[.text-justify] +* *GPU*: A graphics processing unit (GPU) is an architecture designed to accelerate the processing of images and graphics.
GPUs have thousands of cores that allow them to process millions of pixels simultaneously, making them an ideal choice for video games, 3D modeling, and other graphics-intensive applications. +[.text-justify] +* *GPGPU*: A General Processing Architecture (GPGPU) is a type of GPU that is designed to be used for purposes other than graphics processing. GPGPUs are used to perform computations of an intensive nature using the hundreds or thousands of cores available on the graphics card. They are particularly effective for parallel computing, machine learning, and other computationally intensive areas. -* CPU: A central processor (CPU) is a processing unit that is designed to perform various computing tasks including data processing, mathematical and logical calculations, communication between different components of a computer system, etc. Modern CPUs usually have multiple cores to process multiple tasks simultaneously. +[.text-justify] +In conclusion, the main difference between the three architectures CPU, GPU and GPGPU lies in their design and operation. While CPUs are designed for general computer processing, GPUs are designed for specialized graphics processing, and GPGPUs are a modified version of GPUs intended to be used for specialized computer processing other than graphics processing. -* GPU: A graphics processing unit (GPU) is an architecture designed to accelerate the processing of images and graphics. GPUs have thousands of cores that allow them to process millions of pixels simultaneously, making them an ideal choice for video games, 3D modeling, and other graphics-intensive applications. -* GPGPU: A General Processing Architecture (GPGPU) is a type of GPU that is designed to be used for purposes other than graphics processing. GPGPUs are used to perform computations of an intensive nature using the hundreds or thousands of cores available on the graphics card. 
They are particularly effective for parallel computing, machine learning, and other computationally intensive areas. +*2.2 TPU, NPU, LPU, DPU Architecture* -* In conclusion, the main difference between the three architectures CPU, GPU and GPGPU lies in their design and operation. While CPUs are designed for general computer processing, GPUs are designed for specialized graphics processing, and GPGPUs are a modified version of GPUs intended to be used for specialized computer processing other than graphics processing. +[.text-justify] +* *TPU*: A Tensor Processing Unit (TPU) is a specialized hardware processor developed by Google to accelerate machine learning. Unlike traditional CPUs or GPUs, TPUs are specifically designed to handle tensor operations, which account for most of the computations in deep learning models. This makes them incredibly efficient at those tasks and provides an enormous speedup compared to CPUs and GPUs. In this article, we’ll explore what a TPU is, how it works, and why they are so beneficial for machine learning applications. +[.text-justify] +* *NPU*: A Neural Processing Unit (NPU) is a specialized hardware accelerator designed for executing artificial neural network tasks efficiently and with high throughput. NPUs deliver high performance while minimizing power consumption, making them suitable for mobile devices, edge computing, and other energy-sensitive applications. NPUs use the traditional von Neumann architecture, which separates the memory and the processing units. TPUs use the systolic array architecture, which integrates the memory and the processing units into a single chip. NPUs have a higher peak performance than TPUs, but they also have a higher latency and power consumption. TPUs have a lower peak performance than NPUs, but they also have a lower latency and power consumption.
-*TPU, NPU, LPU Architecture* +[.text-justify] +* *LPU*: Language Processing Units (LPUs) are a relatively new addition, designed specifically for handling the complexities of natural language processing tasks. While CPUs, GPUs, and TPUs play significant roles in the broader field of AI, LPUs offer optimized performance for generative models that deal with text, such as GPT (Generative Pre-trained Transformer). They're good at these tasks and might be more efficient than Graphics Processing Units (GPUs). GPUs are still great for things like graphics and AI.The true power of generative AI comes from the interplay and integration of these processing units. CPUs handle the overarching control and coordination, GPUs accelerate the bulk of computational workloads, TPUs offer specialized efficiency for deep learning, and LPUs bring a new level of performance to natural language processing. Together, they form the backbone of generative AI systems, enabling the rapid development and deployment of models that can create highly realistic and complex outputs. -* TPU: A Tensor Processing Unit (TPU) is a specialized hardware processor developed by Google to accelerate machine learning. Unlike traditional CPUs or GPUs, TPUs are specifically designed to handle tensor operations, which account for most of the computations in deep learning models. This makes them incredibly efficient at those tasks and provides an enormous speedup compared to CPUs and GPUs. In this article, we’ll explore what a TPU is, how it works, and why they are so beneficial for machine learning applications. +[.text-justify] +* *DPU*: A Data Processing Unit (DPU) is a specialized processor designed to optimize data-centric workloads in modern computing environments. It combines a multi-core CPU, hardware accelerators, and high-speed networking capabilities into a single system-on-chip (SoC). 
DPUs are primarily used to offload networking, security, and storage functions from the main CPU, allowing it to focus on running operating systems and applications. This new class of programmable processors is becoming increasingly important in data centers and cloud computing, where they help improve overall system efficiency and performance. DPUs can handle tasks such as packet processing, encryption, and data compression, effectively becoming a third pillar of computing alongside CPUs and GPUs. As data-intensive applications continue to grow, DPUs are expected to play a crucial role in optimizing data movement and processing in large-scale computing environments. -* NPU: A Neural Processing Unit (NPU), is a specialized hardware accelerator designed for executing artificial neural network tasks efficiently and with high throughput. NPUs deliver high performance while minimizing power consumption, making them suitable for mobile devices, edge computing, and other energy-sensitive applications. NPUs use the traditional von Neumann architecture, which separates the memory and the processing units. TPUs use the systolic array architecture, which integrates the memory and the processing units into a single chip. NPUs have a higher peak performance than TPUs, but they also have a higher latency and power consumption. TPUs have a lower peak performance than NPUs, but they also have a lower latency and power consumption. +[.text-justify] +* *QPUs*: Quantum processing units (QPUs) process information by using qubits instead of binary bits and are designed to perform complex quantum algorithms. QPUs are best used for certain kinds of highly complicated problems, and many of today’s promising quantum algorithms provide probabilistic solutions instead of precise answers. -* LPU: Language Processing Units (LPUs) are a relatively new addition, designed specifically for handling the complexities of natural language processing tasks. 
While CPUs, GPUs, and TPUs play significant roles in the broader field of AI, LPUs offer optimized performance for generative models that deal with text, such as GPT (Generative Pre-trained Transformer). They're good at these tasks and might be more efficient than Graphics Processing Units (GPUs). GPUs are still great for things like graphics and AI.The true power of generative AI comes from the interplay and integration of these processing units. CPUs handle the overarching control and coordination, GPUs accelerate the bulk of computational workloads, TPUs offer specialized efficiency for deep learning, and LPUs bring a new level of performance to natural language processing. Together, they form the backbone of generative AI systems, enabling the rapid development and deployment of models that can create highly realistic and complex outputs. +*3. Why Use Parallel Computing?* -*Why Use Parallel Computing?* +[.text-justify] +Parallel computing is essential for modeling complex real-world phenomena that involve multiple simultaneous events. It utilizes data and task parallelism through shared or distributed memory models. Key benefits include improved performance and scalability, allowing for faster execution times and efficient adaptation to more powerful systems. While parallel computing presents challenges in development and debugging, it remains crucial for solving complex computational problems across various scientific and technological domains. Its ability to handle intricate simulations and process large datasets makes it an indispensable tool in modern computing. -The Real World Is Massively Complex -* In the natural world, many complex, interrelated events are happening at the same time, yet within a temporal sequence. -* Compared to serial computing, parallel computing is much better suited for modeling, simulating and understanding complex, real world phenomena. +Main Reasons for Employing Parallel Programming: +1.
**Time and Cost Efficiency**: Theoretically, allocating additional resources to a task can reduce its completion time, leading to potential cost savings. Furthermore, parallel computers can be constructed using inexpensive commodity components. -*Key points concerning parallel programming* +2. **Solving Large or Complex Problems**: Many problems are so vast or intricate that solving them with a serial program is impractical or impossible, particularly when accounting for limited computer memory. - * Types of parallelism +3. **Concurrency**: A single computational resource is limited to executing one task at a time. In contrast, multiple compute resources can perform numerous tasks simultaneously. For example, collaborative networks provide a global platform for individuals from around the world to meet and work together virtually. -** Data parallelism: The same operations are carried out on different data sets, often used in the processing of large amounts of data. +4. **Utilizing Non-local Resources**: Parallel computing allows for the use of computational resources across wide area networks or even the Internet when local resources are insufficient. -** Task parallelism: different independent tasks are carried out in parallel. This is often used in applications where several processes can be executed simultaneously. +5. **Maximizing Hardware Efficiency**: Modern computers, including laptops, are inherently parallel in architecture with multiple processors and cores. Parallel software is specifically designed to exploit this architecture effectively. In many cases, traditional serial programs fail to utilize the full potential of modern computing power. - * Programming models: There are several models of parallel programming, each with its own characteristics and use cases: - - ** Shared memory: Threads share the same memory, which facilitates communication between them. Libraries like OpenMP are often used in this context. 
- ** Distributed memory: each calculation unit has its own memory, and communication is made by passing messages, as with MPI (Passing Interface message). - - * Benefits +*4. Kokkos: A Modern Solution for Portable Parallel Programming* - ** Improved performance: Using several cores or machines, programs can run much faster. - ** Scalability: Applications can be designed to adapt to increasingly powerful systems by adding resources. +[.text-justify] +Parallel programming has become essential to fully exploit the capabilities of modern hardware architectures. These architectures include different types of specialized processors, each designed for specific tasks: - * Disadvantages +* *CPU (Central Processing Unit)* is the traditional general-purpose processor, capable of performing a wide variety of tasks but with a limited number of cores. +* *GPU (Graphics Processing Unit)* is optimized for massive parallel processing, particularly efficient for graphics computations and certain types of algorithms. +* *TPU (Tensor Processing Unit)* is designed specifically for machine learning and artificial intelligence operations. +* *NPU (Neural Processing Unit)* is similar to the TPU, but usually integrated into mobile devices for local AI tasks. - ** Complexity: Writing parallel programs can be more complex than writing sequential programs due to the need to manage synchronization and communication between threads or processes. +[.text-justify] +In this context of hardware diversity, *Kokkos* emerges as a powerful solution for portable parallel programming. *Kokkos* is a C++ library that allows developers to write high-performance parallel code that can run efficiently on various hardware architectures, including multi-core *CPUs* and *GPUs*. This library provides a hardware abstraction that allows expressing parallel algorithms in a way that is independent of the underlying architecture, while automatically optimizing performance for each specific platform. 
Kokkos thus greatly simplifies the process of developing portable and high-performance parallel applications, by allowing developers to focus on the algorithm rather than on the implementation details specific to each architecture. I therefore invite you to consult the Kokkos section after studying the basics of parallel programming. - ** Difficulties of debugging: errors in parallel programs, such as race conditions, can be difficult to detect and correct. -*Main Reasons for Using Parallel Programming* - -** Save time and or money - -*** In theory, throwing more resources at a task will shorten its time to completion, with potential cost savings. -*** Parallel computers can be built from cheap, commodity components - - -** Solver large/ More complex problems - -*** Many problems are so large and/or complex that it is impractical or impossible to solve them using a serial program, especially given limited computer memory. - - -** Provide concurency - -*** A single compute resource can only do one thing at a time. Multiple compute resources can do many things simultaneously. -*** Example: Collaborative Networks provide a global venue where people from around the world can meet and conduct work "virtually." - -** Take advantage of non-local resources - -*** Using compute resources on a wide area network, or even the Internet when local compute resources are scarce or insufficient. - -** Make better use of underlying parallel hardware - -*** Modern computers, even laptops, are parallel in architecture with multiple processors/cores. -*** Parallel software is specifically intended for parallel hardware with multiple cores, threads, etc. -*** In most cases, serial programs run on modern computers "waste" potential computing power. - - -*Who Is Using Parallel Computing?* +*5. Who Is Using Parallel Computing?* * *Science and Engineering* @@ -221,6 +158,12 @@ xref:ROOT:attachment$Session6_ParallelProgramming_Specx.pdf[SPECX],... 
xref:ROOT:attachment$OpenMP-API-Specification-5-2.pdf[OpenMP 5.0], xref:ROOT:attachment$OpenMP-Technical-Report -12-Version 6-0.pdf[OpenMP 6.0],... + +xref:ROOT:attachment$KokkosSession1.pdf[Kokkos Session 1], +xref:ROOT:attachment$KokkosSession2.pdf[Kokkos Session 2], +xref:ROOT:attachment$KokkosSession3.pdf[Kokkos Session 3],... + + **** diff --git a/docs/modules/ROOT/pages/quickstart.adoc b/docs/modules/ROOT/pages/quickstart.adoc index 4c1967b8..3e6c98e2 100644 --- a/docs/modules/ROOT/pages/quickstart.adoc +++ b/docs/modules/ROOT/pages/quickstart.adoc @@ -1,8 +1,8 @@ -= Quickstart Guide += Quickstart Guide -To get started with compiling and using {project_name}, you can either +To get started with compiling and using {project_name}, you can either -- [x] install https://docs.feelpp.org/user/latest/install/index.html[{feelpp} Debian or Ubuntu packages] if you use such systems or +- [x] install https://docs.feelpp.org/user/latest/install/index.html[{feelpp} Debian or Ubuntu packages] if you use such systems or - [x] use DevContainers in Visual Studio Code as described below. To get started with viewing and creating documentation, checkout the project xref:antora.adoc[Antora] page. @@ -38,7 +38,7 @@ To start using DevContainers in Visual Studio Code, follow these steps: } ---- -4. Customize the configuration based on your project requirements. In the example above, the configuration sets the name of the Dev Container as "Feel++ Dev Jammy" and specifies the Docker image to use as "ghcr.io/feelpp/feelpp-dev:jammy". Additionally, it includes a list of extensions to install inside the Dev Container. +<.>4. Customize the configuration based on your project requirements. In the example above, the configuration sets the name of the Dev Container as "Feel++ Dev Jammy" and specifies the Docker image to use as "ghcr.io/feelpp/feelpp-dev:jammy". Additionally, it includes a list of extensions to install inside the Dev Container. 
=== Opening in DevContainer diff --git a/docs/modules/kokkos/examples/README.adoc b/docs/modules/kokkos/examples/README.adoc new file mode 100644 index 00000000..b2fb1914 --- /dev/null +++ b/docs/modules/kokkos/examples/README.adoc @@ -0,0 +1,10 @@ += Configure this repository + +[source, sh] +---- +mkdir build +cd build +cmake -DKokkos_OPENMP=ON .. + +make +---- \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/00_views.cpp b/docs/modules/kokkos/examples/src/00_views.cpp new file mode 100644 index 00000000..f3752a91 --- /dev/null +++ b/docs/modules/kokkos/examples/src/00_views.cpp @@ -0,0 +1,20 @@ +#include +#include + +int main(int argc, char *argv[]) { + int N = 5, K = 10; + Kokkos::initialize(argc, argv); + { + Kokkos::View a("a", N), b("b", K); + a = b; // a gets deallocated and both a and b are points to the same thing + Kokkos::View c(b); // copy constructor + std::cout << "Label of c: " << c.label() + << std::endl; // The label of c is the same as the label of b + a(0, 2) = 1; + b(0, 2) = 2; + c(0, 2) = 3; + std::cout << "a(0, 2) = " << a(0, 2) << std::endl; + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/01_views_1D.cpp b/docs/modules/kokkos/examples/src/01_views_1D.cpp new file mode 100644 index 00000000..c26e01cd --- /dev/null +++ b/docs/modules/kokkos/examples/src/01_views_1D.cpp @@ -0,0 +1,26 @@ +#include +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const size_t N = 100000; + // Create a 1D View of doubles + Kokkos::View myView("MyView", N); + // Fill the View with data + Kokkos::parallel_for( + N, KOKKOS_LAMBDA(const int i) { myView(i) = i; }); + // Compute the sum of all elements + int sum = 0.0; + Kokkos::parallel_reduce( + N, + KOKKOS_LAMBDA(const int i, int &partial_sum) { + partial_sum += myView(i); + }, + sum); + + std::cout << "Sum: " << sum << std::endl; + } + Kokkos::finalize(); + return 0; +} \ No newline at 
end of file diff --git a/docs/modules/kokkos/examples/src/02_views_2D.cpp b/docs/modules/kokkos/examples/src/02_views_2D.cpp new file mode 100644 index 00000000..b6e6c705 --- /dev/null +++ b/docs/modules/kokkos/examples/src/02_views_2D.cpp @@ -0,0 +1,25 @@ +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + // Define a 2D view of doubles with 10 elements + Kokkos::View view("view", 10, 2); + + // Initialize the view using parallel_for + Kokkos::parallel_for( + "InitView", 10, KOKKOS_LAMBDA(const int i) { + view(i, 0) = i * 1.0; + view(i, 1) = i * 2.0; + }); + + // Print the view elements + Kokkos::parallel_for( + "PrintView", 10, KOKKOS_LAMBDA(const int i) { + printf("view(%d, 0) = %f\n", i, view(i, 0)); + printf("view(%d, 1) = %f\n", i, view(i, 1)); + }); + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/03_views_properties.cpp b/docs/modules/kokkos/examples/src/03_views_properties.cpp new file mode 100644 index 00000000..75b2eab6 --- /dev/null +++ b/docs/modules/kokkos/examples/src/03_views_properties.cpp @@ -0,0 +1,19 @@ +#include +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int N0 = 10; + Kokkos::View a("A", N0); + assert(a.extent(0) == N0); + assert(a.extent(1) == 5); + assert(a.size() == N0 * 5); + assert(a.rank() == 2); + assert(a.span() == N0 * 5); + assert(a.data() != nullptr); + assert(a.label() == "A"); + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/04_kokkos_exercise_views.cpp b/docs/modules/kokkos/examples/src/04_kokkos_exercise_views.cpp new file mode 100644 index 00000000..3b862807 --- /dev/null +++ b/docs/modules/kokkos/examples/src/04_kokkos_exercise_views.cpp @@ -0,0 +1,228 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// EXERCISE 2 Goal: +// Replace raw allocations with Kokkos Views. +// 1. Define device views. +// 2. Replace data access with view access operators. +// +// Note: Kokkos::parallel_for() initializations were removed to initialize on +// host. + +#include +#include +#include +#include +#include + +#include + +void checkSizes(int &N, int &M, int &S, int &nrepeat); + +int main(int argc, char *argv[]) { + int N = -1; // number of rows 2^12 + int M = -1; // number of columns 2^10 + int S = -1; // total size 2^22 + int nrepeat = 100; // number of repeats of the test + + // Read command line arguments. 
+ for (int i = 0; i < argc; i++) { + if ((strcmp(argv[i], "-N") == 0) || (strcmp(argv[i], "-Rows") == 0)) { + N = pow(2, atoi(argv[++i])); + printf(" User N is %d\n", N); + } else if ((strcmp(argv[i], "-M") == 0) || + (strcmp(argv[i], "-Columns") == 0)) { + M = pow(2, atof(argv[++i])); + printf(" User M is %d\n", M); + } else if ((strcmp(argv[i], "-S") == 0) || + (strcmp(argv[i], "-Size") == 0)) { + S = pow(2, atof(argv[++i])); + printf(" User S is %d\n", S); + } else if (strcmp(argv[i], "-nrepeat") == 0) { + nrepeat = atoi(argv[++i]); + } else if ((strcmp(argv[i], "-h") == 0) || + (strcmp(argv[i], "-help") == 0)) { + printf(" y^T*A*x Options:\n"); + printf(" -Rows (-N) : exponent num, determines number of rows " + "2^num (default: 2^12 = 4096)\n"); + printf(" -Columns (-M) : exponent num, determines number of " + "columns 2^num (default: 2^10 = 1024)\n"); + printf(" -Size (-S) : exponent num, determines total matrix " + "size 2^num (default: 2^22 = 4096*1024 )\n"); + printf(" -nrepeat : number of repetitions (default: 100)\n"); + printf(" -help (-h): print this message\n\n"); + exit(1); + } + } + + // Check sizes. + checkSizes(N, M, S, nrepeat); + + Kokkos::initialize(argc, argv); + { + + // EXERCISE: Create views of the right size. + + // 1. Device Views + typedef Kokkos::View ViewVectorType; + typedef Kokkos::View ViewMatrixType; + + // EXERCISE: This no longer needs allocation after views introduced... + // Hint: If arrays are not allocated, they also do not need to be + // deallocated below + // Allocate y, x vectors and Matrix A: + // double * const y = new double[ N ]; + // double * const x = new double[ M ]; + // double * const A = new double[ N * M ]; + + ViewVectorType y("y", N); + ViewVectorType x("x", M); + ViewMatrixType A("A", N, M); + + // Initialize y vector on host. 
+ // EXERCISE: Convert y to 1D View's member access API: y(i) + // for ( int i = 0; i < N; ++i ) { + // y[ i ] = 1; + // } + Kokkos::parallel_for( + N, KOKKOS_LAMBDA(int i) { y(i) = 1; }); + + // Initialize x vector on host. + // EXERCISE: Convert x to 1D View's member access API: x(i) + // for ( int i = 0; i < M; ++i ) { + // x[ i ] = 1; + // } + Kokkos::parallel_for( + M, KOKKOS_LAMBDA(int i) { x(i) = 1; }); + + // Initialize A matrix on host, note 2D indexing computation. + // EXERCISE: convert 'A' to use View's member access API: A(j,i) + // for ( int j = 0; j < N; ++j ) { + // for ( int i = 0; i < M; ++i ) { + // A[ j * M + i ] = 1; + // } + // } + Kokkos::parallel_for( + N, KOKKOS_LAMBDA(int j) { + for (int i = 0; i < M; ++i) { + A(j, i) = 1; + } + }); + + // Timer products. + Kokkos::Timer timer; + + for (int repeat = 0; repeat < nrepeat; repeat++) { + // Application: = y^T*A*x + double result = 0; + + Kokkos::parallel_reduce( + "yAx", N, + KOKKOS_LAMBDA(int j, double &update) { + double temp2 = 0; + + // EXERCISE: Replace access with view access operators. + for (int i = 0; i < M; ++i) { + // temp2 += A[ j * M + i ] * x[ i ]; + temp2 += A(j, i) * x(i); + } + + update += y(j) * temp2; + }, + result); + + // Output result. + if (repeat == (nrepeat - 1)) { + printf(" Computed result for %d x %d is %lf\n", N, M, result); + } + + const double solution = (double)N * (double)M; + + if (result != solution) { + printf(" Error: result( %lf ) != solution( %lf )\n", result, solution); + } + } + + // Calculate time. + double time = timer.seconds(); + + // Calculate bandwidth. + // Each matrix A row (each of length M) is read once. + // The x vector (of length M) is read N times. + // The y vector (of length N) is read once. + // double Gbytes = 1.0e-9 * double( sizeof(double) * ( 2 * M * N + N ) ); + double Gbytes = 1.0e-9 * double(sizeof(double) * (M + M * N + N)); + + // Print results (problem size, time and bandwidth in GB/s). 
+ printf(" N( %d ) M( %d ) nrepeat ( %d ) problem( %g MB ) time( %g s ) " + "bandwidth( %g GB/s )\n", + N, M, nrepeat, Gbytes * 1000, time, Gbytes * nrepeat / time); + + // delete [] y; //EXERCISE hint: ... + // delete [] x; //EXERCISE hint: ... + // delete [] A; //EXERCISE hint: ... + } + Kokkos::finalize(); + + return 0; +} + +void checkSizes(int &N, int &M, int &S, int &nrepeat) { + // If S is undefined and N or M is undefined, set S to 2^22 or the bigger of N + // and M. + if (S == -1 && (N == -1 || M == -1)) { + S = pow(2, 22); + if (S < N) + S = N; + if (S < M) + S = M; + } + + // If S is undefined and both N and M are defined, set S = N * M. + if (S == -1) + S = N * M; + + // If both N and M are undefined, fix row length to the smaller of S and 2^10 + // = 1024. + if (N == -1 && M == -1) { + if (S > 1024) { + M = 1024; + } else { + M = S; + } + } + + // If only M is undefined, set it. + if (M == -1) + M = S / N; + + // If N is undefined, set it. + if (N == -1) + N = S / M; + + printf(" Total size S = %d N = %d M = %d\n", S, N, M); + + // Check sizes. + if ((S < 0) || (N < 0) || (M < 0) || (nrepeat < 0)) { + printf(" Sizes must be greater than 0.\n"); + exit(1); + } + + if ((N * M) != S) { + printf(" N * M != S\n"); + exit(1); + } +} diff --git a/docs/modules/kokkos/examples/src/05_kokkos_mirrors.cpp b/docs/modules/kokkos/examples/src/05_kokkos_mirrors.cpp new file mode 100644 index 00000000..05a0536e --- /dev/null +++ b/docs/modules/kokkos/examples/src/05_kokkos_mirrors.cpp @@ -0,0 +1,225 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include +#include +#include + +#include + +void checkSizes(int &N, int &M, int &S, int &nrepeat); + +int main(int argc, char *argv[]) { + int N = -1; // number of rows 2^12 + int M = -1; // number of columns 2^10 + int S = -1; // total size 2^22 + int nrepeat = 100; // number of repeats of the test + + // Read command line arguments. + for (int i = 0; i < argc; i++) { + if ((strcmp(argv[i], "-N") == 0) || (strcmp(argv[i], "-Rows") == 0)) { + N = pow(2, atoi(argv[++i])); + printf(" User N is %d\n", N); + } else if ((strcmp(argv[i], "-M") == 0) || + (strcmp(argv[i], "-Columns") == 0)) { + M = pow(2, atof(argv[++i])); + printf(" User M is %d\n", M); + } else if ((strcmp(argv[i], "-S") == 0) || + (strcmp(argv[i], "-Size") == 0)) { + S = pow(2, atof(argv[++i])); + printf(" User S is %d\n", S); + } else if (strcmp(argv[i], "-nrepeat") == 0) { + nrepeat = atoi(argv[++i]); + } else if ((strcmp(argv[i], "-h") == 0) || + (strcmp(argv[i], "-help") == 0)) { + printf(" y^T*A*x Options:\n"); + printf(" -Rows (-N) : exponent num, determines number of rows " + "2^num (default: 2^12 = 4096)\n"); + printf(" -Columns (-M) : exponent num, determines number of " + "columns 2^num (default: 2^10 = 1024)\n"); + printf(" -Size (-S) : exponent num, determines total matrix " + "size 2^num (default: 2^22 = 4096*1024 )\n"); + printf(" -nrepeat : number of repetitions (default: 100)\n"); + printf(" -help (-h): print this message\n\n"); + exit(1); + } + } + + // Check sizes. 
+ checkSizes(N, M, S, nrepeat); + + Kokkos::initialize(argc, argv); + { + +#ifdef KOKKOS_ENABLE_CUDA + std::cout << "Kokkos::CudaSpace" << std::endl; +#define MemSpace Kokkos::CudaSpace +#endif +#ifdef KOKKOS_ENABLE_HIP + std::cout << "Kokkos::HIPSpace" << std::endl; +#define MemSpace Kokkos::Experimental::HIPSpace +#endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET + std::cout << "Kokkos::OpenMPTargetSpace" << std::endl; +#define MemSpace Kokkos::OpenMPTargetSpace +#endif + +#ifndef MemSpace + std::cout << "Kokkos::HostSpace" << std::endl; +#define MemSpace Kokkos::HostSpace +#endif + + using ExecSpace = MemSpace::execution_space; + using range_policy = Kokkos::RangePolicy; + + using Layout = Kokkos::LayoutLeft; + // using Layout = Kokkos::LayoutRight; + + // Allocate y, x vectors and Matrix A on device. + typedef Kokkos::View ViewVectorType; + typedef Kokkos::View ViewMatrixType; + ViewVectorType y("y", N); + ViewVectorType x("x", M); + ViewMatrixType A("A", N, M); + + // Create host mirrors of device views. + ViewVectorType::HostMirror h_y = Kokkos::(y); + ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view(x); + ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view(A); + + // Initialize y vector on host. + for (int i = 0; i < N; ++i) { + h_y(i) = 1; + } + + // Initialize x vector on host. + for (int i = 0; i < M; ++i) { + h_x(i) = 1; + } + + // Initialize A matrix on host. + for (int j = 0; j < N; ++j) { + for (int i = 0; i < M; ++i) { + h_A(j, i) = 1; + } + } + + // Deep copy host views to device views. + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(A, h_A); + + // Timer products. 
+ Kokkos::Timer timer; + + for (int repeat = 0; repeat < nrepeat; repeat++) { + // Application: = y^T*A*x + double result = 0; + + Kokkos::parallel_reduce( + "yAx", range_policy(0, N), + KOKKOS_LAMBDA(int j, double &update) { + double temp2 = 0; + + for (int i = 0; i < M; ++i) { + temp2 += A(j, i) * x(i); + } + + update += y(j) * temp2; + }, + result); + + // Output result. + if (repeat == (nrepeat - 1)) { + printf(" Computed result for %d x %d is %lf\n", N, M, result); + } + + const double solution = (double)N * (double)M; + + if (result != solution) { + printf(" Error: result( %lf ) != solution( %lf )\n", result, solution); + } + } + + // Calculate time. + double time = timer.seconds(); + + // Calculate bandwidth. + // Each matrix A row (each of length M) is read once. + // The x vector (of length M) is read N times. + // The y vector (of length N) is read once. + // double Gbytes = 1.0e-9 * double( sizeof(double) * ( 2 * M * N + N ) ); + double Gbytes = 1.0e-9 * double(sizeof(double) * (M + M * N + N)); + + // Print results (problem size, time and bandwidth in GB/s). + printf(" N( %d ) M( %d ) nrepeat ( %d ) problem( %g MB ) time( %g s ) " + "bandwidth( %g GB/s )\n", + N, M, nrepeat, Gbytes * 1000, time, Gbytes * nrepeat / time); + } + Kokkos::finalize(); + + return 0; +} + +void checkSizes(int &N, int &M, int &S, int &nrepeat) { + // If S is undefined and N or M is undefined, set S to 2^22 or the bigger of N + // and M. + if (S == -1 && (N == -1 || M == -1)) { + S = pow(2, 22); + if (S < N) + S = N; + if (S < M) + S = M; + } + + // If S is undefined and both N and M are defined, set S = N * M. + if (S == -1) + S = N * M; + + // If both N and M are undefined, fix row length to the smaller of S and 2^10 + // = 1024. + if (N == -1 && M == -1) { + if (S > 1024) { + M = 1024; + } else { + M = S; + } + } + + // If only M is undefined, set it. + if (M == -1) + M = S / N; + + // If N is undefined, set it. 
+ if (N == -1) + N = S / M; + + printf(" Total size S = %d N = %d M = %d\n", S, N, M); + + // Check sizes. + if ((S < 0) || (N < 0) || (M < 0) || (nrepeat < 0)) { + printf(" Sizes must be greater than 0.\n"); + exit(1); + } + + if ((N * M) != S) { + printf(" N * M != S\n"); + exit(1); + } +} diff --git a/docs/modules/kokkos/examples/src/100_none.cpp b/docs/modules/kokkos/examples/src/100_none.cpp new file mode 100644 index 00000000..db2f5f68 --- /dev/null +++ b/docs/modules/kokkos/examples/src/100_none.cpp @@ -0,0 +1,15 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + {} + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/10_kokkos_cholesky.cpp b/docs/modules/kokkos/examples/src/10_kokkos_cholesky.cpp new file mode 100644 index 00000000..8959a11e --- /dev/null +++ b/docs/modules/kokkos/examples/src/10_kokkos_cholesky.cpp @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include + +#include + +void choleskySimple(Kokkos::View A, int n) { + for (int k = 0; k < n; ++k) { + // Calculation of the diagonal element + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, 1), KOKKOS_LAMBDA(const int) { + double sum = 0.0; + for (int j = 0; j < k; ++j) { + sum += A(k, j) * A(k, j); + } + A(k, k) = std::sqrt(A(k, k) - sum); + }); + + // Calculation of elements under the diagonal + Kokkos::parallel_for( + Kokkos::RangePolicy<>(k + 1, n), KOKKOS_LAMBDA(const int i) { + double sum = 0.0; + for (int j = 0; j < k; ++j) { + sum += A(i, j) * A(k, j); + } + A(i, k) = (A(i, k) - sum) / A(k, k); + }); + } + + // Zeroing the upper triangular part + Kokkos::parallel_for( + Kokkos::RangePolicy<>(0, n), KOKKOS_LAMBDA(const int i) { + for (int j = i + 1; j < n; ++j) { + A(i, j) = 0.0; + } + }); +} + +void matrix_product(Kokkos::View A, Kokkos::View C, + int n) { + Kokkos::parallel_for( + "MatrixProduct", n, KOKKOS_LAMBDA(const int i) { + for (int 
j = 0; j < n; j++) { // optimisation + double sum = 0.0; + for (int k = 0; k <= i && k <= j; k++) { + sum += A(i, k) * A(j, k); + } + C(i, j) = sum; + } + }); +} + +void matrix_product(Kokkos::View A, Kokkos::View B, + Kokkos::View C, int n) { + Kokkos::parallel_for( + "MatrixProduct", n, KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < n; ++j) { + double sum = 0.0; + for (int k = 0; k < n; ++k) { + sum += A(i, k) * B(k, j); + } + C(i, j) = sum; + } + }); +} + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int n = 4; // Size of the matrix + // View allocation + Kokkos::View A("A", n, n); + Kokkos::View A_original("A_original", n, n); + Kokkos::View C("C", n, n); + + // Initialization of the positive defined matrix A + Kokkos::parallel_for( + "InitMatrix", n, KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < n; ++j) { + if (i == j) { + A(i, j) = (i + 1) + (j + 1) + n; + } else { + A(i, j) = A(j, i) = std::min(i, j) + 1; + } + A_original(i, j) = A(i, j); + } + }); + + // Synchronization to ensure initialization is complete + Kokkos::fence(); + + // Create mirror views for display + auto h_A = Kokkos::create_mirror_view(A); + auto h_A_original = Kokkos::create_mirror_view(A_original); + auto h_C = Kokkos::create_mirror_view(C); + + // Copy data to host + Kokkos::deep_copy(h_A, A); + + // Display the initial matrix + printf("Matrix A init:\n"); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + printf("%f ", h_A(i, j)); + } + printf("\n"); + } + + // Cholesky factorization + choleskySimple(A, n); + + // Copy data to host after Cholesky + Kokkos::deep_copy(h_A, A); + + printf("\nCholesky Matrix (L):\n"); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + printf("%f ", h_A(i, j)); + } + printf("\n"); + } + + // Calculation of the product L * L^T + matrix_product(A, C, n); + + // Copy data to host + Kokkos::deep_copy(h_C, C); + + printf("\nProduct L * L^T :\n"); + for (int i = 0; i < n; ++i) { + for (int 
j = 0; j < n; ++j) { + printf("%f ", h_C(i, j)); + } + printf("\n"); + } + + // Verification + Kokkos::deep_copy(h_A_original, A_original); + double max_diff = 0.0; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + max_diff = std::max(max_diff, std::abs(h_C(i, j) - h_A_original(i, j))); + } + } + printf("\nMaximum difference between initial A and L*L^T : %e\n", max_diff); + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/11_kokkos_memspace.cpp b/docs/modules/kokkos/examples/src/11_kokkos_memspace.cpp new file mode 100644 index 00000000..4a601539 --- /dev/null +++ b/docs/modules/kokkos/examples/src/11_kokkos_memspace.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { +#ifdef KOKKOS_ENABLE_CUDA + std::cout << "Kokkos::CudaSpace" << std::endl; +#define MemSpace Kokkos::CudaSpace +#endif +#ifdef KOKKOS_ENABLE_HIP + std::cout << "Kokkos::HIPSpace" << std::endl; +#define MemSpace Kokkos::Experimental::HIPSpace +#endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET + std::cout << "Kokkos::OpenMPTargetSpace" << std::endl; +#define MemSpace Kokkos::OpenMPTargetSpace +#endif + +#ifndef MemSpace + std::cout << "Kokkos::HostSpace" << std::endl; +#define MemSpace Kokkos::HostSpace +#endif + + const int N = 1000000; + Kokkos::View data("data", N); + + Kokkos::Timer timer; + + Kokkos::parallel_for( + "init", N, KOKKOS_LAMBDA(const int i) { data(i) = i * 0.000001; }); + + double sum = 0.0; + Kokkos::parallel_reduce( + "sum", N, + KOKKOS_LAMBDA(const int i, double &partial_sum) { + partial_sum += data(i); + }, + sum); + + double elapsed_time = timer.seconds(); + + std::cout << "Sum: " << sum << std::endl; + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/12_kokkos_simple_multi_gpus.cpp 
b/docs/modules/kokkos/examples/src/12_kokkos_simple_multi_gpus.cpp new file mode 100644 index 00000000..bcee8aec --- /dev/null +++ b/docs/modules/kokkos/examples/src/12_kokkos_simple_multi_gpus.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include + +#include + +struct VectorAddFunctor { + Kokkos::View a; + Kokkos::View b; + Kokkos::View c; + + VectorAddFunctor(Kokkos::View a_, + Kokkos::View b_, + Kokkos::View c_) + : a(a_), b(b_), c(c_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { c(i) = a(i) + b(i); } +}; + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int vector_size = 1000000; + const int num_gpus = Kokkos::HIP::detect_device_count(); + // const int num_gpus = + // Kokkos::HIP::impl_internal_space_instance()->m_device_count; //another + // approach + Kokkos::InitializationSettings settings; + std::cout << "Number of GPUs available : " << num_gpus << std::endl; + // #define MemSpace Kokkos::Experimental::HIPSpace + + Kokkos::Timer timer; + + for (int gpu = 0; gpu < num_gpus; ++gpu) { + + settings.set_device_id(gpu); + Kokkos::HIP::impl_initialize(settings); + Kokkos::fence(); + + Kokkos::View a("a", vector_size); + Kokkos::View b("b", vector_size); + Kokkos::View c("c", vector_size); + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, vector_size), + KOKKOS_LAMBDA(const int i) { + a(i) = 1.0; + b(i) = 2.0; + }); + + Kokkos::parallel_for(Kokkos::RangePolicy(0, vector_size), + VectorAddFunctor(a, b, c)); + + Kokkos::View::HostMirror h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + + bool correct = true; + for (int i = 0; i < vector_size; ++i) { + if (h_c(i) != 3.0) { + correct = false; + break; + } + } + + std::cout << "Result on GPU " << gpu << " : " + << (correct ? 
"Correct" : "Incorrect") << std::endl; + } + + double elapsed_time = timer.seconds(); + + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/13_kokkos_execution_patterns.cpp b/docs/modules/kokkos/examples/src/13_kokkos_execution_patterns.cpp new file mode 100644 index 00000000..a9b0b196 --- /dev/null +++ b/docs/modules/kokkos/examples/src/13_kokkos_execution_patterns.cpp @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include +#include + +#include + +struct VectorAdd { + // Member variables for the vectors + Kokkos::View a; + Kokkos::View b; + Kokkos::View c; + // Constructor to initialize the vectors + VectorAdd(Kokkos::View a_, Kokkos::View b_, + Kokkos::View c_) + : a(a_), b(b_), c(c_) {} + // Functor to perform vector addition + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + c(i) = a(i) + b(i); // Perform addition + } +}; + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 1000; // Size of the vectors + // Allocate and initialize vectors on the device + Kokkos::View a("A", N); + Kokkos::View b("B", N); + Kokkos::View c("C", N); + // Initialize vectors a and b on the host + Kokkos::parallel_for( + "InitializeVectors", N, KOKKOS_LAMBDA(const int i) { + a(i) = static_cast(i); // Fill vector A with values 0 to N-1 + b(i) = + static_cast(N - i); // Fill vector B with values N-1 to 0 + }); + // Perform vector addition using Kokkos parallel_for + VectorAdd vectorAdd(a, b, c); + Kokkos::parallel_for("VectorAdd", N, vectorAdd); + // Synchronize to ensure all computations are complete + Kokkos::fence(); + // Output the first 10 results for verification + std::cout << "Result of vector addition (first 10 elements):" << std::endl; + + auto h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + for (int i = 0; i < 10; ++i) { + std::cout << "c[" << i << "] = " << h_c(i) + << std::endl; 
// Print results from vector C + } + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/14_kokkos_execution_patterns_policies.cpp b/docs/modules/kokkos/examples/src/14_kokkos_execution_patterns_policies.cpp new file mode 100644 index 00000000..a75ada82 --- /dev/null +++ b/docs/modules/kokkos/examples/src/14_kokkos_execution_patterns_policies.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +#include + +struct VectorAdd { + // Member variables for the vectors + Kokkos::View a; + Kokkos::View b; + Kokkos::View c; + // Constructor to initialize the vectors + VectorAdd(Kokkos::View a_, Kokkos::View b_, + Kokkos::View c_) + : a(a_), b(b_), c(c_) {} + // Functor to perform vector addition + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + c(i) = a(i) + b(i); // Perform addition + } +}; + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 1000; // Size of the vectors + // Allocate vectors on the device + Kokkos::View a("A", N); + Kokkos::View b("B", N); + Kokkos::View c("C", N); + // Initialize vectors a and b on the host + Kokkos::parallel_for( + "InitializeVectors", N, KOKKOS_LAMBDA(const int i) { + a(i) = static_cast(i); // Fill vector A with values 0 to N-1 + b(i) = + static_cast(N - i); // Fill vector B with values N-1 to 0 + }); + // Perform vector addition using default execution policy + Kokkos::parallel_for("VectorAdd", N, VectorAdd(a, b, c)); + // Synchronize to ensure all computations are complete + Kokkos::fence(); + + auto h_c = Kokkos::create_mirror_view(c); + + // Output the first 10 results for verification + Kokkos::deep_copy(h_c, c); + std::cout << "Result of vector addition (first 10 elements):" << std::endl; + for (int i = 0; i < 10; ++i) { + std::cout << "c[" << i << "] = " << h_c(i) + << std::endl; // Print results from vector C + } + // Perform vector addition using a different execution policy 
(Dynamic + // Scheduling) + Kokkos::TeamPolicy<> teamPolicy(N, 32); // League size: N, Team size: 32 + Kokkos::parallel_for( + teamPolicy, + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type &teamMember) { + const int teamSize = teamMember.team_size(); + const int i = + teamMember.league_rank() * teamSize + teamMember.team_rank(); + if (i < N) { + c(i) = a(i) + b(i); // Perform addition within the team + } + }); + + // Synchronize again after using the team policy + Kokkos::fence(); + // Output the results after using the team policy + Kokkos::deep_copy(h_c, c); + std::cout + << "Result of vector addition using Team Policy (first 10 elements):" + << std::endl; + for (int i = 0; i < 10; ++i) { + std::cout << "c[" << i << "] = " << h_c(i) + << std::endl; // Print results from vector C + } + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/15_kokkos_dual_view.cpp b/docs/modules/kokkos/examples/src/15_kokkos_dual_view.cpp new file mode 100644 index 00000000..9a9ec1c6 --- /dev/null +++ b/docs/modules/kokkos/examples/src/15_kokkos_dual_view.cpp @@ -0,0 +1,55 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +struct DualViewExample { + // Define the dual view type + using dual_view_type = Kokkos::DualView; + + // Function to initialize device view + static void initialize(dual_view_type &dv) { + // Initialize the device view with values + Kokkos::parallel_for( + "Initialize DeviceView", dv.d_view.extent(0), + KOKKOS_LAMBDA(const int i) { + dv.d_view(i) = static_cast(i); // Assign values based on index + }); + // Synchronize to update the host mirror + dv.template sync(); + } + // Function to print values from both views + static void printValues(const dual_view_type &dv) { + std::cout << "Host View Values: "; + for (int i = 0; i < dv.h_view.extent(0); ++i) { + std::cout << dv.h_view(i) << " "; // Access host view + } + std::cout << std::endl; + std::cout << "Device 
View Values: "; + Kokkos::parallel_for( + "Print DeviceView", dv.d_view.extent(0), KOKKOS_LAMBDA(const int i) { + printf("%f ", dv.d_view(i)); // Access device view + }); + std::cout << std::endl; + } +}; + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 10; // Size of the DualView + // Create a DualView with N elements + DualViewExample::dual_view_type dv("MyDualView", N); + // Initialize the device view + DualViewExample::initialize(dv); + // Print values from both views + DualViewExample::printValues(dv); + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/16_kokkos_simd.cpp b/docs/modules/kokkos/examples/src/16_kokkos_simd.cpp new file mode 100644 index 00000000..9137b728 --- /dev/null +++ b/docs/modules/kokkos/examples/src/16_kokkos_simd.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + using simd_type = Kokkos::Experimental::native_simd; + using tag_type = Kokkos::Experimental::element_aligned_tag; + constexpr int width = int(simd_type::size()); + int n = 1000; + Kokkos::View x("x", n); + Kokkos::View y("y", n); + Kokkos::View z("z", n); + Kokkos::View r("r", n); + Kokkos::parallel_for( + "init", n, KOKKOS_LAMBDA(const int i) { + x(i) = static_cast(i); + y(i) = static_cast(i * 2); + z(i) = static_cast(i * 3); + }); + Kokkos::parallel_for( + "compute", n / width, KOKKOS_LAMBDA(const int i) { + int idx = i * width; + simd_type sx([&x, idx](std::size_t j) { return x(idx + j); }); + simd_type sy([&y, idx](std::size_t j) { return y(idx + j); }); + simd_type sz([&z, idx](std::size_t j) { return z(idx + j); }); + simd_type sr = Kokkos::sqrt(sx * sx + sy * sy + sz * sz); + sr.copy_to(r.data() + idx, tag_type()); + }); + Kokkos::fence(); + auto h_r = Kokkos::create_mirror_view(r); + Kokkos::deep_copy(h_r, r); + printf("First 5 
results:\n"); + for (int i = 0; i < 5; ++i) { + printf("r[%d] = %f\n", i, h_r(i)); + } + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/17_kokkos_polynom_jacobi_1.cpp b/docs/modules/kokkos/examples/src/17_kokkos_polynom_jacobi_1.cpp new file mode 100644 index 00000000..022f7e1e --- /dev/null +++ b/docs/modules/kokkos/examples/src/17_kokkos_polynom_jacobi_1.cpp @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include +#include + +#include + +// https://en.wikipedia.org/wiki/Jacobi_polynomials + +// Function to evaluate the Jacobi polynomial P_n^(α,β)(x) +KOKKOS_INLINE_FUNCTION +double jacobi_polynomial(int n, double x, double alpha, double beta) { + if (n == 0) + return 1.0; + if (n == 1) + return 0.5 * (alpha + beta + (alpha - beta) * x); + + double p0 = 1.0; + double p1 = 0.5 * (alpha + beta + (alpha - beta) * x); + double p_n = 0.0; + + for (int k = 2; k <= n; ++k) { + p_n = ((2 * k + alpha + beta - 1) * (p1 + (alpha - beta) * p0)) / + (k + alpha + beta); + p_n -= ((k + alpha - 1) * (k + beta - 1) * p0) / + ((k - 1) * (k + alpha + beta)); + p0 = p1; + p1 = p_n; + } + return p_n; +} + +struct JacobiKernel { + int n; + double alpha; + double beta; + Kokkos::View x_values; + Kokkos::View results; + + JacobiKernel(int n_, double alpha_, double beta_, + Kokkos::View x_vals, Kokkos::View res) + : n(n_), alpha(alpha_), beta(beta_), x_values(x_vals), results(res) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + results(i) = jacobi_polynomial(n, x_values(i), alpha, beta); + } +}; + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int num_points = 20; + Kokkos::View x_values("x_values", num_points); + Kokkos::View results("results", num_points); + + // Initialization of input values + Kokkos::parallel_for( + "InitX Values", num_points, KOKKOS_LAMBDA(const int i) { + x_values(i) = + -1.0 + + 2.0 * i / (num_points - 1); // Values ​​between -1 and 
1 + }); + + // Evaluation of Jacobi polynomials + // (degree n=3, α=2.0, β=3.0). + int n = 3; + double alpha = 2.0, beta = 3.0; + + Kokkos::parallel_for("Jacob Eval", num_points, + JacobiKernel(n, alpha, beta, x_values, results)); + + // Displaying results + auto h_results = Kokkos::create_mirror_view(results); + Kokkos::deep_copy(h_results, results); + + for (int i = 0; i < num_points; ++i) { + std::cout << "P_" << n << "^(" << alpha << ", " << beta << ")(" + << h_results(i) << ") = " << h_results(i) << "\n"; + } + + // Make a graphic representation afterwards or ... + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/18_layout_Left.cpp b/docs/modules/kokkos/examples/src/18_layout_Left.cpp new file mode 100644 index 00000000..164e7e56 --- /dev/null +++ b/docs/modules/kokkos/examples/src/18_layout_Left.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 5; + const int M = 4; + + Kokkos::Timer timer; + + // Creating a 2D View with LayoutLeft + Kokkos::View matrix("Matrix", N, M); + + // Filling the matrix + Kokkos::parallel_for( + "FillMatrix", N, KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < M; ++j) { + matrix(i, j) = i * 10 + j; + } + }); + + // Create a mirror on the host to display the results + auto h_matrix = Kokkos::create_mirror_view(matrix); + Kokkos::deep_copy(h_matrix, matrix); + + // Display the matrix + std::cout << "Matrix with LayoutLeft :" << std::endl; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < M; ++j) { + std::cout << h_matrix(i, j) << " "; + } + std::cout << std::endl; + } + + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/19_layout_Right.cpp 
b/docs/modules/kokkos/examples/src/19_layout_Right.cpp new file mode 100644 index 00000000..b1ac7bb2 --- /dev/null +++ b/docs/modules/kokkos/examples/src/19_layout_Right.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + + const int N = 5; + const int M = 4; + + Kokkos::Timer timer; + + // Creating a 2D View with LayoutRight + Kokkos::View matrix("Matrix", N, M); + + // Filling the matrix + Kokkos::parallel_for( + "FillMatrix", N, KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < M; ++j) { + matrix(i, j) = i * 10 + j; + } + }); + + // Create a mirror on the host to display the results + auto h_matrix = Kokkos::create_mirror_view(matrix); + Kokkos::deep_copy(h_matrix, matrix); + + // Display the matrix + std::cout << "Matrix with LayoutRight :" << std::endl; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < M; ++j) { + std::cout << h_matrix(i, j) << " "; + } + std::cout << std::endl; + } + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/20_reduction.cpp b/docs/modules/kokkos/examples/src/20_reduction.cpp new file mode 100644 index 00000000..e0eb7944 --- /dev/null +++ b/docs/modules/kokkos/examples/src/20_reduction.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include +#include +#include + +#include + + + +struct CustomReduction { + double max_value; + double sum; + int count; + + KOKKOS_INLINE_FUNCTION + CustomReduction() : max_value(-std::numeric_limits::max()), sum(0.0), count(0) {} + + KOKKOS_INLINE_FUNCTION + CustomReduction& operator+=(const CustomReduction& rhs) { + max_value = std::max(max_value, rhs.max_value); + sum += rhs.sum; + count += rhs.count; + return *this; + } + + KOKKOS_INLINE_FUNCTION + void operator+=(const volatile CustomReduction& rhs) volatile { + max_value = 
std::max(max_value, rhs.max_value); + sum += rhs.sum; + count += rhs.count; + } +}; + +namespace Kokkos { + template<> + struct reduction_identity { + KOKKOS_FORCEINLINE_FUNCTION static CustomReduction sum() { + return CustomReduction(); + } + }; +} + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + + Kokkos::Timer timer; + const int N = 1000000; + + Kokkos::View data("Data", N); + + // Data initialization + Kokkos::parallel_for( + "Init", N, + KOKKOS_LAMBDA(const int i) { data(i) = std::sin(i * 0.001) * 100; }); + + CustomReduction result; + + // Advanced reduction + Kokkos::parallel_reduce( + "AdvancedReduction", N, + KOKKOS_LAMBDA(const int i, CustomReduction &local) { + local.max_value = std::max(local.max_value, data(i)); + local.sum += data(i); + if (data(i) > 50) + local.count++; + }, + Kokkos::Sum(result)); + + std::cout << "Maximum value : " << result.max_value << std::endl; + std::cout << "Sum : " << result.sum << std::endl; + std::cout << "Number of values > 50 : " << result.count << std::endl; + std::cout << "Average : " << result.sum / N << std::endl; + + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/21_reduction_multiple.cpp b/docs/modules/kokkos/examples/src/21_reduction_multiple.cpp new file mode 100644 index 00000000..283d9b8a --- /dev/null +++ b/docs/modules/kokkos/examples/src/21_reduction_multiple.cpp @@ -0,0 +1,48 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + Kokkos::Timer timer; + const int N = 1000000; + + Kokkos::View data("Data", N); + + // Data initialization + Kokkos::parallel_for( + "Init", N, + KOKKOS_LAMBDA(const int i) { data(i) = std::sin(i * 0.001) * 100; }); + + double sum = 0.0; + double max_val = -std::numeric_limits::max(); + int count_positive 
= 0; + + // Multiple reductions + Kokkos::parallel_reduce( + "MultipleReductions", N, + KOKKOS_LAMBDA(const int i, double &lsum, double &lmax, int &lcount) { + lsum += data(i); + lmax = std::max(lmax, data(i)); + if (data(i) > 0) + lcount++; + }, + sum, Kokkos::Max(max_val), count_positive); + + std::cout << "Sum : " << sum << std::endl; + std::cout << "Maximum value : " << max_val << std::endl; + std::cout << "Number of positive values : " << count_positive << std::endl; + std::cout << "Average : " << sum / N << std::endl; + + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/22_1_hierarchical_parallelism.cpp b/docs/modules/kokkos/examples/src/22_1_hierarchical_parallelism.cpp new file mode 100644 index 00000000..e97ee164 --- /dev/null +++ b/docs/modules/kokkos/examples/src/22_1_hierarchical_parallelism.cpp @@ -0,0 +1,106 @@ +#include +#include +#include +#include +#include +#include + +#include + + +struct HierarchicalParallelism { + Kokkos::View matrix; + HierarchicalParallelism(int N, int M) : matrix("matrix", N, M) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const Kokkos::TeamPolicy<>::member_type& team_member) const { + const int i = team_member.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, matrix.extent(1)), + [&] (const int j) { + matrix(i, j) = i * matrix.extent(1) + j; + }); + + team_member.team_barrier(); + if (team_member.team_rank() == 0) { + double sum = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, matrix.extent(1)), + [&] (const int j, double& lsum) { + lsum += matrix(i, j); + }, sum); + + Kokkos::single(Kokkos::PerTeam(team_member), [&] () { + matrix(i, 0) = sum; + // std::cout << "Sum of row " << i << " is " << sum << std::endl; + }); + } + } +}; + + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 1000; 
+ const int M = 100; + HierarchicalParallelism functor(N, M); + Kokkos::parallel_for(Kokkos::TeamPolicy<>(N, Kokkos::AUTO), functor); + } + Kokkos::finalize(); + return 0; +} + + +// int main(int argc, char *argv[]) { +// Kokkos::initialize(argc, argv); +// { +// Kokkos::Timer timer; +// const int N = 1000000; +// const int TEAM_SIZE = 16; +// const int VECTOR_SIZE = 4; + +// Kokkos::View data("Data", N); + +// // Data initialization +// Kokkos::parallel_for( +// "Init", N, KOKKOS_LAMBDA(const int i) { data(i) = i * 0.01; }); + +// double sum = 0.0; + +// // Hierarchical parallelism +// Kokkos::parallel_reduce( +// "HierarchicalSum", +// Kokkos::TeamPolicy<>(N / (TEAM_SIZE * VECTOR_SIZE), TEAM_SIZE, +// VECTOR_SIZE), +// KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type &team_member, +// double &team_sum) { +// const int team_rank = team_member.team_rank(); +// const int team_size = team_member.team_size(); +// const int league_rank = team_member.league_rank(); + +// double thread_sum = 0.0; + +// Kokkos::parallel_reduce( +// Kokkos::ThreadVectorRange(team_member, VECTOR_SIZE), +// [&](const int vector_rank, double &vector_sum) { +// const int i = +// (league_rank * team_size + team_rank) * VECTOR_SIZE + +// vector_rank; +// if (i < N) { +// vector_sum += data(i); +// } +// }, +// thread_sum); + +// Kokkos::single(Kokkos::PerThread(team_member), +// [&]() { Kokkos::atomic_add(&team_sum, thread_sum); }); +// }, +// sum); + +// std::cout << "Total Sum : " << sum << std::endl; +// std::cout << "Average : " << sum / N << std::endl; +// double elapsed_time = timer.seconds(); +// std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; +// } +// Kokkos::finalize(); +// return 0; +// } diff --git a/docs/modules/kokkos/examples/src/22_2_scratch_memory.cpp b/docs/modules/kokkos/examples/src/22_2_scratch_memory.cpp new file mode 100644 index 00000000..13d0ab7e --- /dev/null +++ b/docs/modules/kokkos/examples/src/22_2_scratch_memory.cpp @@ -0,0 +1,56 @@ 
+#include +#include +#include +#include +#include +#include + +#include + +struct ScratchMemoryExample { + Kokkos::View data; + ScratchMemoryExample(int N) : data("data", N) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const Kokkos::TeamPolicy<>::member_type& team_member) const { + const int team_size = team_member.team_size(); + const int team_rank = team_member.team_rank(); + const int league_rank = team_member.league_rank(); + + // Allocate team scratch memory + double* team_scratch = (double*)team_member.team_shmem().get_shmem(team_size * sizeof(double)); + + // Each thread initializes its scratch memory + team_scratch[team_rank] = league_rank * team_size + team_rank; + + // Synchronize to ensure all threads have written to scratch memory + team_member.team_barrier(); + + // Perform a reduction within the team + double team_sum = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, team_size), [&](const int i, double& lsum) { + lsum += team_scratch[i]; + }, team_sum); + + // Only one thread writes the result back to global memory + if (team_rank == 0) { + data(league_rank) = team_sum; + } + } + + // Specify the amount of scratch memory needed + size_t team_shmem_size(int team_size) const { + return team_size * sizeof(double); + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 1000; + ScratchMemoryExample functor(N); + Kokkos::parallel_for(Kokkos::TeamPolicy<>(N / 10, Kokkos::AUTO).set_scratch_size(0, Kokkos::PerTeam(functor.team_shmem_size(10))), functor); + } + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/22_3_token.cpp b/docs/modules/kokkos/examples/src/22_3_token.cpp new file mode 100644 index 00000000..a648f885 --- /dev/null +++ b/docs/modules/kokkos/examples/src/22_3_token.cpp @@ -0,0 +1,37 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, 
argv); + { + // Size of the array + const int N = 100; + // Kokkos view to store the results + Kokkos::View results("results", N); + // Create a UniqueToken (based on thread execution) + Kokkos::Experimental::UniqueToken unique_token; + // Number of available threads + const int num_threads = unique_token.size(); + std::cout << "Number of threads: " << num_threads << std::endl; + Kokkos::parallel_for("UniqueTokenExample", N, KOKKOS_LAMBDA(const int i) { + // Get a unique identifier for this thread + int token = unique_token.acquire(); + results(i) = token; + unique_token.release(token); + }); + // Copy the results to the host for display + auto host_results = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), results); + std::cout << "Results: "; + for (int i = 0; i < N; ++i) { + std::cout << host_results(i) << " "; + } + std::cout << std::endl; + } + Kokkos::finalize(); +} diff --git a/docs/modules/kokkos/examples/src/23_nested_parallelism.cpp b/docs/modules/kokkos/examples/src/23_nested_parallelism.cpp new file mode 100644 index 00000000..d8b4bc27 --- /dev/null +++ b/docs/modules/kokkos/examples/src/23_nested_parallelism.cpp @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + Kokkos::Timer timer; + const int N = 1000; + const int M = 100; + + // Create a 2D Kokkos View for storing matrix data + Kokkos::View matrix("Matrix", N, M); + + // Define a TeamPolicy for parallelism + using team_policy = Kokkos::TeamPolicy<>; + using member_type = team_policy::member_type; + + // Fill the matrix using nested parallelism + Kokkos::parallel_for( + "OuterLoop", team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &team_member) { + const int i = team_member.league_rank(); // Get the "outer" index + Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, M), + [=](const int j) { matrix(i, j) = i * M + j; }); + }); + + // Compute the sum of all 
elements in the matrix + double sum = 0.0; + Kokkos::parallel_reduce( + "Sum", team_policy(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const member_type &team_member, double &lsum) { + const int i = team_member.league_rank(); // Get the "outer" index + double row_sum = 0.0; + + // Compute the row sum using nested parallelism + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team_member, M), + [=](const int j, double &thread_sum) { + thread_sum += matrix(i, j); + }, + row_sum); + + lsum += row_sum; // Accumulate row sums into lsum + }, + sum); + + // Print results + std::cout << "Total Sum : " << sum << std::endl; + std::cout << "Average : " << sum / (N * M) << std::endl; + + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/24_combined_team_thread_range_thread_vector_range.cpp b/docs/modules/kokkos/examples/src/24_combined_team_thread_range_thread_vector_range.cpp new file mode 100644 index 00000000..22d10ada --- /dev/null +++ b/docs/modules/kokkos/examples/src/24_combined_team_thread_range_thread_vector_range.cpp @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + Kokkos::Timer timer; + const int N = 100; + const int M = 64; + const int K = 16; + + Kokkos::View A("A", N, M, K); + + Kokkos::parallel_for( + "Init", N * M * K, KOKKOS_LAMBDA(const int idx) { + int i = idx / (M * K); + int j = (idx / K) % M; + int k = idx % K; + A(i, j, k) = i * M * K + j * K + k; + }); + + double total_sum = 0.0; + + Kokkos::parallel_reduce( + "CombinedRanges", Kokkos::TeamPolicy<>(N, Kokkos::AUTO), + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type &team_member, + double &team_sum) { + const int n = team_member.league_rank(); + + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team_member, M), + [&](const int m, double 
&thread_sum) { + double vector_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team_member, K), + [&](const int k, double &inner_sum) { + inner_sum += A(n, m, k); + }, + vector_sum); + thread_sum += vector_sum; + }, + team_sum); + }, + total_sum); + + std::cout << "Total Sum : " << total_sum << std::endl; + std::cout << "Average : " << total_sum / (N * M * K) << std::endl; + + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/25_asynchronicity.cpp b/docs/modules/kokkos/examples/src/25_asynchronicity.cpp new file mode 100644 index 00000000..de7fc5b7 --- /dev/null +++ b/docs/modules/kokkos/examples/src/25_asynchronicity.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + Kokkos::Timer timer; + const int N = 1000000; + + Kokkos::View a("a", N); + Kokkos::View b("b", N); + Kokkos::View c("c", N); + + // Initialize View 'a' + Kokkos::parallel_for("init_a", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) { + a(i) = i * 0.1; + }); + + // Initialize View 'b' + Kokkos::parallel_for("init_b", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) { + b(i) = i * 0.2; + }); + + // Compute View 'c' as the sum of 'a' and 'b' + Kokkos::parallel_for("compute_c", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i) { + c(i) = a(i) + b(i); + }); + + // Ensure all computations are complete + Kokkos::fence(); + + // Compute the sum of all elements in 'c' + double sum = 0.0; + Kokkos::parallel_reduce("sum_c", Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, double& lsum) { + lsum += c(i); + }, sum); + + // Ensure all reductions are complete + Kokkos::fence(); + + std::cout << "Sum c : " << sum << std::endl; + + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << 
elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/26_stream.cpp b/docs/modules/kokkos/examples/src/26_stream.cpp new file mode 100644 index 00000000..82e53184 --- /dev/null +++ b/docs/modules/kokkos/examples/src/26_stream.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + Kokkos::Timer timer; + + const int N = 1000000; + const int num_streams = 2; + + Kokkos::View a("a", N); + Kokkos::View b("b", N); + Kokkos::View c("c", N); + + std::vector streams(num_streams); + + for (int i = 0; i < num_streams; ++i) { + streams[i] = Kokkos::HIP(); + } + + Kokkos::parallel_for(Kokkos::RangePolicy(streams[0], 0, N), KOKKOS_LAMBDA(const int i) { + a(i) = i * 0.1; + }); + + Kokkos::parallel_for(Kokkos::RangePolicy(streams[1], 0, N), KOKKOS_LAMBDA(const int i) { + b(i) = i * 0.2; + }); + + Kokkos::parallel_for(Kokkos::RangePolicy(streams[0], 0, N), KOKKOS_LAMBDA(const int i) { + c(i) = a(i) + b(i); + }); + + Kokkos::fence(); + + double sum = 0.0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, N), KOKKOS_LAMBDA(const int i, double& lsum) { + lsum += c(i); + }, sum); + + std::cout << "Sum c : " << sum << std::endl; + + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/27_mpi_gpu_hip.cpp b/docs/modules/kokkos/examples/src/27_mpi_gpu_hip.cpp new file mode 100644 index 00000000..b2ea5829 --- /dev/null +++ b/docs/modules/kokkos/examples/src/27_mpi_gpu_hip.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + int provided; + int initialized; + MPI_Initialized(&initialized); + if (!initialized) { + MPI_Init_thread(nullptr, 
nullptr, MPI_THREAD_FUNNELED, &provided); + } + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + try { + Kokkos::InitializationSettings settings; + settings.set_device_id(rank % Kokkos::HIP::detect_device_count()); + if (!Kokkos::is_initialized()) { + // settings.set_num_threads(2); // if you want ... or more parameters + Kokkos::initialize(settings); + } + + { + int n = 10; + Kokkos::View data("data", n); + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(const int i) { data(i) = rank * 1.0 + i; }); + + Kokkos::fence(); + + double local_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(const int i, double &sum) { sum += data(i); }, + local_sum); + + Kokkos::fence(); + + double global_sum; + MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, + MPI_COMM_WORLD); + + std::cout << "rank[" << rank << "] Lobale Sum : " << local_sum + << std::endl; + + if (rank == 0) { + std::cout << "Globale sum : " << global_sum << std::endl; + } + } + + // Kokkos::finalize(); + + if (Kokkos::is_initialized()) { + Kokkos::finalize(); + } + } catch (std::exception &e) { + std::cerr << "Exception caught on rank " << rank << ": " << e.what() + << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MPI_Initialized(&initialized); + if (initialized) { + MPI_Finalize(); + } + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/28_multi_gpu_hip.cpp b/docs/modules/kokkos/examples/src/28_multi_gpu_hip.cpp new file mode 100644 index 00000000..1600215b --- /dev/null +++ b/docs/modules/kokkos/examples/src/28_multi_gpu_hip.cpp @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include +#include + +#include + +struct VectorAddFunctor { + Kokkos::View a; + Kokkos::View b; + Kokkos::View c; + + VectorAddFunctor(Kokkos::View a_, + Kokkos::View b_, + Kokkos::View c_) + : a(a_), b(b_), c(c_) {} + + KOKKOS_INLINE_FUNCTION + void 
operator()(const int i) const { c(i) = a(i) + b(i); } +}; + +int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + + const int vector_size = 1000000; + const int num_gpus = Kokkos::HIP::detect_device_count(); + + Kokkos::InitializationSettings settings; + std::cout << "Number of GPUs available : " << num_gpus << std::endl; + + Kokkos::Timer timer; + + for (int gpu = 0; gpu < num_gpus; ++gpu) { + + settings.set_device_id(gpu); + Kokkos::HIP::impl_initialize(settings); + Kokkos::fence(); + + Kokkos::View a("a", vector_size); + Kokkos::View b("b", vector_size); + Kokkos::View c("c", vector_size); + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, vector_size), + KOKKOS_LAMBDA(const int i) { + a(i) = 1.0; + b(i) = 2.0; + }); + + Kokkos::parallel_for(Kokkos::RangePolicy(0, vector_size), + VectorAddFunctor(a, b, c)); + + Kokkos::View::HostMirror h_c = Kokkos::create_mirror_view(c); + Kokkos::deep_copy(h_c, c); + + bool correct = true; + for (int i = 0; i < vector_size; ++i) { + if (h_c(i) != 3.0) { + correct = false; + break; + } + } + + std::cout << "Result on GPU " << gpu << " : " + << (correct ? 
"Correct" : "Incorrect") << std::endl; + } + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" << std::endl; + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/29_mpi_multi_gpu_hip.cpp b/docs/modules/kokkos/examples/src/29_mpi_multi_gpu_hip.cpp new file mode 100644 index 00000000..6efa2698 --- /dev/null +++ b/docs/modules/kokkos/examples/src/29_mpi_multi_gpu_hip.cpp @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + // Kokkos::initialize(argc, argv); + { + int provided; + int initialized; + MPI_Initialized(&initialized); + if (!initialized) { + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_FUNNELED, &provided); + } + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + Kokkos::Timer timer; + + try { + Kokkos::InitializationSettings settings; + int num_gpus = Kokkos::HIP::detect_device_count(); + int gpu_id = rank % num_gpus; + settings.set_device_id(gpu_id); + + std::cout << "rank : [" << rank << "] num gpu id : [" << gpu_id << "] " + << std::endl; + + if (!Kokkos::is_initialized()) { + Kokkos::initialize(settings); + } + + { + int n = 10; + Kokkos::View data("data", n); + + Kokkos::parallel_for( + Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(const int i) { data(i) = rank * 1.0 + i; }); + + Kokkos::fence(); + + double local_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(const int i, double &sum) { sum += data(i); }, + local_sum); + + Kokkos::fence(); + + double global_sum; + MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, + MPI_COMM_WORLD); + + std::cout << "rank[" << rank << "] Locale Sum : " << local_sum + << std::endl; + + if (rank == 0) { + std::cout << "Globale Sum : " << global_sum << std::endl; + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " 
seconds" + << std::endl; + } + } + + if (Kokkos::is_initialized()) { + Kokkos::finalize(); + } + } catch (std::exception &e) { + std::cerr << "Exception caught on rank " << rank << ": " << e.what() + << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MPI_Initialized(&initialized); + if (initialized) { + MPI_Finalize(); + } + } + // Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/30_mpi_multi_gpu_intra_connection_hip.cpp b/docs/modules/kokkos/examples/src/30_mpi_multi_gpu_intra_connection_hip.cpp new file mode 100644 index 00000000..31af5c20 --- /dev/null +++ b/docs/modules/kokkos/examples/src/30_mpi_multi_gpu_intra_connection_hip.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, char *argv[]) { + // Kokkos::initialize(argc, argv); + int provided; + int initialized; + MPI_Initialized(&initialized); + if (!initialized) { + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_FUNNELED, &provided); + } + + int rank, world_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + Kokkos::Timer timer; + + try { + Kokkos::InitializationSettings settings; + int num_gpus = Kokkos::HIP::detect_device_count(); + int gpu_id = rank % num_gpus; + settings.set_device_id(gpu_id); + + if (!Kokkos::is_initialized()) { + Kokkos::initialize(settings); + } + + { + int n = 10; + Kokkos::View data("data", n); + + // Initialize data on the GPU + Kokkos::parallel_for( + Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(const int i) { data(i) = rank * 1.0 + i; }); + + Kokkos::fence(); + + // Create a mirror view to copy data to the host + auto host_data = Kokkos::create_mirror_view(data); + Kokkos::deep_copy(host_data, data); + + // Preparing buffers for MPI exchange + std::vector send_buffer(n); + for (int i = 0; i < n; ++i) { + send_buffer[i] = host_data(i); // Copy data into the send buffer + } + + std::vector recv_buffer(n); + + // Exchange data with neighbors + 
int left_neighbor = (rank - 1 + world_size) % world_size; + int right_neighbor = (rank + 1) % world_size; + + MPI_Sendrecv(send_buffer.data(), n, MPI_DOUBLE, right_neighbor, 0, + recv_buffer.data(), n, MPI_DOUBLE, left_neighbor, 0, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + // Copy the received data to a mirror view on the host + auto received_host_data = + Kokkos::View("received_host_data", n); + for (int i = 0; i < n; ++i) { + received_host_data(i) = recv_buffer[i]; + } + + // Copy data received from host to GPU + Kokkos::View received_data( + "received_data", n); + Kokkos::deep_copy(received_data, received_host_data); + + // Calculation of the local sum including the received data + double local_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, n), + KOKKOS_LAMBDA(const int i, double &sum) { + sum += data(i) + received_data(i); + }, + local_sum); + + Kokkos::fence(); + + double global_sum; + MPI_Allreduce(&local_sum, &global_sum, 1, MPI_DOUBLE, MPI_SUM, + MPI_COMM_WORLD); + + std::cout << "rank[" << rank + << "] Local amount (including exchanged data)) : " << local_sum + << std::endl; + + if (rank == 0) { + std::cout << "Globale Sum : " << global_sum << std::endl; + double elapsed_time = timer.seconds(); + std::cout << "Elapsed time: " << elapsed_time << " seconds" + << std::endl; + } + } + + if (Kokkos::is_initialized()) { + Kokkos::finalize(); + } + } catch (std::exception &e) { + std::cerr << "Exception caught on rank " << rank << ": " << e.what() + << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MPI_Initialized(&initialized); + if (initialized) { + MPI_Finalize(); + } + // Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/31_multidim_loops.cpp b/docs/modules/kokkos/examples/src/31_multidim_loops.cpp new file mode 100644 index 00000000..5af7acae --- /dev/null +++ b/docs/modules/kokkos/examples/src/31_multidim_loops.cpp @@ -0,0 +1,26 @@ +#include +#include +#include +#include +#include +#include + +#include + 
+int main(int argc, char *argv[]) { + Kokkos::initialize(argc, argv); + { + const int N1 = 10; + const int N2 = 10; + const int N3 = 10; + + Kokkos::MDRangePolicy> policy({0, 0, 0}, {N1, N2, N3}); + + Kokkos::parallel_for("3DLoop", policy, KOKKOS_LAMBDA(const int i, const int j, const int k) { + // Example computation + printf("Processing element (%d, %d, %d)\n", i, j, k); + }); + } + Kokkos::finalize(); + return 0; +} diff --git a/docs/modules/kokkos/examples/src/CMakeLists.txt b/docs/modules/kokkos/examples/src/CMakeLists.txt new file mode 100644 index 00000000..0770caf7 --- /dev/null +++ b/docs/modules/kokkos/examples/src/CMakeLists.txt @@ -0,0 +1,78 @@ +cmake_minimum_required(VERSION 3.10) + + +option(FEELPP_ENABLE_KOKKOS "Enable Kokkos support" ON) +set(Kokkos_ENABLE_OPENMP ON ) + +if (FEELPP_ENABLE_KOKKOS) + find_package(Kokkos REQUIRED) + if ( Kokkos_FOUND ) + message(STATUS "Kokkos found") + if(Kokkos_ENABLE_OPENMP) + message(STATUS "Kokkos OpenMP backend is enabled") + endif() + if (Kokkos_ENABLE_SERIAL) + message(STATUS "Kokkos Serial backend is enabled") + endif() + if (Kokkos_ENABLE_THREADS) + message(STATUS "Kokkos Threads backend is enabled") + endif() + if(Kokkos_ENABLE_HIP) + message(STATUS "Kokkos HIP backend is enabled") + endif() + if (Kokkos_ENABLE_CUDA) + message(STATUS "Kokkos CUDA backend is enabled") + endif() + endif() + message(STATUS "[feelpp] Kokkos support enabled") + + # Add an executable + add_executable(00_views 00_views.cpp) + add_executable(01_views_1D 01_views_1D.cpp) + add_executable(02_views_2D 02_views_2D.cpp) + add_executable(03_views_properties 03_views_properties.cpp) + add_executable(04_kokkos_exercise_views 04_kokkos_exercise_views.cpp) + + # My prog + add_executable(10_kokkos_cholesky 10_kokkos_cholesky.cpp) + add_executable(11_kokkos_memspace 11_kokkos_memspace.cpp) + #add_executable(12_kokkos_simple_multi_gpus 12_kokkos_simple_multi_gpus.cpp) + #add_executable(13_kokkos_execution_patterns 
13_kokkos_execution_patterns.cpp) + #add_executable(14_kokkos_execution_patterns_policies 14_kokkos_execution_patterns_policies.cpp) + #add_executable(15_kokkos_dual_view 15_kokkos_dual_view.cpp) + #add_executable(16_kokkos_simd 16_kokkos_simd.cpp) + #add_executable(17_kokkos_polynom_jacobi_1 17_kokkos_polynom_jacobi_1.cpp) + + add_executable(22_hierarchical_parallelism 22_1_hierarchical_parallelism.cpp) + add_executable(22_scratch_memory 22_2_scratch_memory.cpp) + add_executable(22_token 22_3_token.cpp) + + add_executable(31_multidim_loops 31_multidim_loops.cpp) + + #feelpp_add_application(kokkos SRCS feelpp_kokkos.cpp) + + # Link Kokkos to the executable + target_link_libraries(00_views Kokkos::kokkos) + target_link_libraries(01_views_1D Kokkos::kokkos) + target_link_libraries(02_views_2D Kokkos::kokkos) + target_link_libraries(03_views_properties Kokkos::kokkos) + target_link_libraries(04_kokkos_exercise_views Kokkos::kokkos) + + # My prog + target_link_libraries(10_kokkos_cholesky Kokkos::kokkos) + target_link_libraries(11_kokkos_memspace Kokkos::kokkos) + #target_link_libraries(12_kokkos_simple_multi_gpus Kokkos::kokkos) + #target_link_libraries(13_kokkos_execution_patterns Kokkos::kokkos) + #target_link_libraries(14_kokkos_execution_patterns_policies Kokkos::kokkos) + #target_link_libraries(15_kokkos_dual_view Kokkos::kokkos) + #target_link_libraries(16_kokkos_simd Kokkos::kokkos) + #target_link_libraries(17_kokkos_polynom_jacobi_1 Kokkos::kokkos); + + target_link_libraries(22_hierarchical_parallelism Kokkos::kokkos) + target_link_libraries(22_scratch_memory Kokkos::kokkos) + target_link_libraries(22_token Kokkos::kokkos) + + target_link_libraries(31_multidim_loops Kokkos::kokkos) + + +endif() \ No newline at end of file diff --git a/docs/modules/kokkos/examples/src/feelpp_kokkos.cpp b/docs/modules/kokkos/examples/src/feelpp_kokkos.cpp new file mode 100644 index 00000000..3183f7a1 --- /dev/null +++ b/docs/modules/kokkos/examples/src/feelpp_kokkos.cpp @@ -0,0 
+1,39 @@ +#include +// #include +#include + +using namespace Feel; + +int main(int argc, char **argv) { + Environment env(_argc = argc, _argv = argv, _desc = feel_options(), + _about = about(_name = "mylaplacian", + _author = "Feel++ Consortium", + _email = "feelpp-devel at feelpp.org")); + + // Kokkos::initialize(argc, argv); + + // create mesh + auto mesh = unitSquare(); + + // function space + auto Vh = Pch<1>(mesh); + auto u = Vh->element(); + auto v = Vh->element(); + + // left hand side + auto a = form2(_trial = Vh, _test = Vh); + a = integrate(_range = elements(mesh), _expr = gradt(u) * trans(grad(v))); + + // right hand side + auto l = form1(_test = Vh); + l = integrate(_range = elements(mesh), _expr = id(v)); + + // boundary condition + a += on(_range = boundaryfaces(mesh), _rhs = l, _element = u, + _expr = constant(0.)); + + // solve the equation a(u,v) = l(v) + a.solve(_rhs = l, _solution = u); + + // Kokkos::finalize(); +} \ No newline at end of file diff --git a/docs/modules/kokkos/images/kokkos-DualView.png b/docs/modules/kokkos/images/kokkos-DualView.png new file mode 100644 index 00000000..6ee64bec Binary files /dev/null and b/docs/modules/kokkos/images/kokkos-DualView.png differ diff --git a/docs/modules/kokkos/images/kokkos-EcoSystem.png b/docs/modules/kokkos/images/kokkos-EcoSystem.png new file mode 100644 index 00000000..009f1bdd Binary files /dev/null and b/docs/modules/kokkos/images/kokkos-EcoSystem.png differ diff --git a/docs/modules/kokkos/images/kokkos-abstractions-doc.png b/docs/modules/kokkos/images/kokkos-abstractions-doc.png new file mode 100644 index 00000000..53682a72 Binary files /dev/null and b/docs/modules/kokkos/images/kokkos-abstractions-doc.png differ diff --git a/docs/modules/kokkos/images/kokkos-core.png b/docs/modules/kokkos/images/kokkos-core.png new file mode 100644 index 00000000..f10d38ef Binary files /dev/null and b/docs/modules/kokkos/images/kokkos-core.png differ diff --git 
a/docs/modules/kokkos/images/kokkos-layout-Left-Right.png b/docs/modules/kokkos/images/kokkos-layout-Left-Right.png new file mode 100644 index 00000000..946289f5 Binary files /dev/null and b/docs/modules/kokkos/images/kokkos-layout-Left-Right.png differ diff --git a/docs/modules/kokkos/images/kokkos-mirrors-schematic.png b/docs/modules/kokkos/images/kokkos-mirrors-schematic.png new file mode 100644 index 00000000..76282a27 Binary files /dev/null and b/docs/modules/kokkos/images/kokkos-mirrors-schematic.png differ diff --git a/docs/modules/kokkos/images/kokkos-node-doc.png b/docs/modules/kokkos/images/kokkos-node-doc.png new file mode 100644 index 00000000..6f41200b Binary files /dev/null and b/docs/modules/kokkos/images/kokkos-node-doc.png differ diff --git a/docs/modules/kokkos/images/kokkos1.jpg b/docs/modules/kokkos/images/kokkos1.jpg new file mode 100644 index 00000000..086206b2 Binary files /dev/null and b/docs/modules/kokkos/images/kokkos1.jpg differ diff --git a/docs/modules/kokkos/images/kokkos2.png b/docs/modules/kokkos/images/kokkos2.png new file mode 100644 index 00000000..e5c2f839 Binary files /dev/null and b/docs/modules/kokkos/images/kokkos2.png differ diff --git a/docs/modules/kokkos/nav.adoc b/docs/modules/kokkos/nav.adoc index 5c81020d..177d0f56 100644 --- a/docs/modules/kokkos/nav.adoc +++ b/docs/modules/kokkos/nav.adoc @@ -7,8 +7,28 @@ ** xref:basic-concepts/index.adoc[Basic Concepts] +*** xref:basic-concepts/views.adoc[Views] *** xref:basic-concepts/execution-spaces.adoc[Execution Spaces] *** xref:basic-concepts/memory-spaces.adoc[Memory Spaces] -*** xref:basic-concepts/views.adoc[Views] +*** xref:basic-concepts/mirrors.adoc[Mirrors] +**** xref:basic-concepts/mirrors_sol_code.adoc[Solution from Kokkos tutorial] +*** xref:basic-concepts/memory-access-patterns.adoc[Memory Access Patterns] + +** xref:advanced-concepts/index.adoc[Advanced Concepts] +*** xref:advanced-concepts/advanced-reductions.adoc[Advanced Reductions] +*** 
xref:advanced-concepts/hierarchical-parallelism.adoc[Hierarchical Parallelism] +*** xref:advanced-concepts/mpi.adoc[MPI] +*** xref:advanced-concepts/pgas.adoc[PGAS] +*** xref:advanced-concepts/mpi-vs-pgas.adoc[MPI vs PGAS] + + +*** xref:advanced-concepts/multidimensional-loops-and-data-structure.adoc[Multidimensional Loops and Data Structure] +*** xref:advanced-concepts/single-instruction-mutliple-data.adoc[Single Instruction Multiple Data] +*** xref:advanced-concepts/asynchronicity-and-streams.adoc[Asynchronicity and Stream] + +** xref:diagnostic-tools-algebraic-strategies/index.adoc[Diagnostic Tools Algebraic Strategies] +*** xref:diagnostic-tools-algebraic-strategies/kernels-math-library.adoc[Kernels Math library] +*** xref:diagnostic-tools-algebraic-strategies/tools-profiling-tuning-debugging.adoc[Tools Profing Tuning Debugging] +** xref:gaya.adoc[Compile on Gaya] diff --git a/docs/modules/kokkos/pages/advanced-concepts/advanced-reductions.adoc b/docs/modules/kokkos/pages/advanced-concepts/advanced-reductions.adoc new file mode 100644 index 00000000..20e43210 --- /dev/null +++ b/docs/modules/kokkos/pages/advanced-concepts/advanced-reductions.adoc @@ -0,0 +1,143 @@ += Kokkos Advanced Reductions + +== Introduction + +[.text-justify] +In Kokkos C++, a reduction is a parallel operation that combines the results of individual calculations into a single final value. [1][2] This mechanism, primarily implemented through the `Kokkos::parallel_reduce` function, offers a powerful paradigm for consolidating data distributed across different processing units. The concept of a "Reducer" in Kokkos encapsulates the logic of combining intermediate values, defining not only the merging operation but also the initialization of thread-private variables and the localization of the final result. + +[.text-justify] +Kokkos allows for multiple reductions to be performed within a single kernel, which can significantly reduce kernel launch overhead and improve overall performance. 
It also offers the ability to use Views as reduction targets, enabling asynchronous reduction operations. This capability is particularly valuable in scenarios where the reduction result is needed for further computation or when overlapping computation and communication. + +[.text-justify] +For cases where built-in reducers do not suffice, Kokkos provides mechanisms for implementing custom reductions. This extensibility allows developers to define complex reduction operations tailored to their specific computational needs. Custom reductions can be particularly useful for domain-specific algorithms or when dealing with non-standard data types [3]. + + +== Advanced Reductions + +Kokkos provides powerful tools for performing reductions in parallel computations. + +** Using Reducers for Different Reductions + +Kokkos offers various built-in reducers for common operations: + + ** `Kokkos::Sum` for summation + ** `Kokkos::Prod` for product + ** `Kokkos::Min` and `Kokkos::Max` for minimum and maximum + +.Sum with Kokkos +[%collapsible.proof] +==== +[source,c++] +---- +double result; +Kokkos::parallel_reduce("Sum", policy, +KOKKOS_LAMBDA (const int i, double& lsum) { + lsum += data[i]; +}, Kokkos::Sum(result)); +---- +==== +** Multiple Reductions in One Kernel + +Kokkos allows performing multiple reductions simultaneously: + +.Multiple Reductions +[%collapsible.proof] +==== +[source,c++] +---- +struct MultipleResults { + double sum; + int max; +}; + +MultipleResults results; +Kokkos::parallel_reduce("MultiReduce", policy, +KOKKOS_LAMBDA (const int i, MultipleResults& lresults) { + lresults.sum += data[i]; + if (data[i] > lresults.max) lresults.max = data[i]; +}, +Kokkos::Sum(results)); +---- +==== + +** Using `Kokkos::View` as Result for Asynchronicity + +For asynchronous operations, you can use xref:basic-concepts/views.adoc[Views] as the reduction target: + +.Async Reduction +[%collapsible.proof] +==== +[source,c++] +---- +Kokkos::View result("Result", 1); 
+Kokkos::parallel_reduce("AsyncReduce", policy, +KOKKOS_LAMBDA (const int i, double& lsum) { + lsum += data[i]; +}, Kokkos::Sum(result(0))); +---- +==== + +This allows the reduction to be performed asynchronously, with the result available in the view. + +** Custom Reductions: +Kokkos supports custom reduction operations: + +.Custom Reduction +[%collapsible.proof] +==== +[source,c++] +---- +struct CustomReducer { +typedef double value_type; +KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { + dest = (dest > src) ? dest : src; // Custom max operation +} +KOKKOS_INLINE_FUNCTION void init(value_type& val) const { + val = std::numeric_limits::lowest(); +} +}; + +double result; +Kokkos::parallel_reduce("CustomReduce", policy, +KOKKOS_LAMBDA (const int i, double& lval) { + lval = (lval > data[i]) ? lval : data[i]; +}, CustomReducer()); +---- +==== + + +== References + +** [1] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/Custom-Reductions.html +** [2] https://kokkos.org/kokkos-core-wiki/API/core/builtinreducers/ReducerConcept.html +** [3] https://www.nersc.gov/assets/Uploads/Kokkos-training-Day1-NewUsers-Bruno-v2.pdf + + +.*Points to keep in mind* +**** + +* *Reduction* aggregates values computed by different threads or computing units in parallel. + +* *Types of reductions*: +*** By default, Kokkos performs a "sum" reduction. +*** Custom reductions are possible for more complex operations. + +* *Reducer concept*: +*** A Reducer is a class that defines how to join (reduce) two values. +*** It also specifies the initialization of thread-private variables and the location of the final result. + +* *Usage*: +*** Reduction is usually done with the `Kokkos::parallel_reduce` function. +*** It can be used with lambdas or CPP functors. + +* *Data types*: +*** Built-in reductions work with CPP intrinsic and `Kokkos::complex` types. +*** For custom types, a specialization of `Kokkos::reduction_identity` is required. 
+ +* *Flexibility*: +*** Kokkos allows reductions on scalars, but also on more complex structures like matrices. + +* *Performance*: +*** Reductions are optimized for different hardware architectures, ensuring performance portability. + +**** \ No newline at end of file diff --git a/docs/modules/kokkos/pages/advanced-concepts/asynchronicity-and-streams.adoc b/docs/modules/kokkos/pages/advanced-concepts/asynchronicity-and-streams.adoc new file mode 100644 index 00000000..1b449304 --- /dev/null +++ b/docs/modules/kokkos/pages/advanced-concepts/asynchronicity-and-streams.adoc @@ -0,0 +1,98 @@ += Kokkos Asynchronicity, Streams, and Task Parallelism + +== Introduction + +[.text-justify] + +Kokkos C++ supports *asynchronous* execution through its parallel dispatch operations, which may return before completion while executing in sequence with other Kokkos operations [1][2]. Streams in Kokkos, particularly for CUDA, allow for overlapping kernels and can be used to create CUDA instances, enabling concurrent execution of multiple kernels. + + +== Asynchronicity and Streams + +Asynchronous execution and streams are essential for maximizing hardware utilization, especially on GPUs. Kokkos provides mechanisms for managing asynchronous operations and concurrent kernel execution. + +*Blocking and Non-blocking Operations* : Kokkos operations can be either blocking or non-blocking: + +- Blocking operations: These operations wait for completion before returning control to the caller. +- Non-blocking operations: These operations initiate work but return control immediately, allowing other computations to proceed concurrently. + +*Overlapping Work* : Non-blocking operations enable work overlap, which can improve overall performance. Types of work that can overlap include: + +1. Host-to-device data transfers +2. Device-to-host data transfers +3. Kernel executions +4. 
Host computations + +*Waiting for Completion* : To ensure that all asynchronous operations have completed, Kokkos provides synchronization mechanisms: + +1. `Kokkos::fence()`: Waits for all outstanding asynchronous operations to complete. +2. `Kokkos::wait()`: Can be used with specific futures or task policies to wait for particular operations. + +*Running Kernels Simultaneously on a GPU* : To run kernels simultaneously on a GPU, Kokkos leverages streams. While not explicitly shown in the provided search results, Kokkos supports concurrent kernel execution through its execution policies and asynchronous launch capabilities + +... + + + + +== Task Parallelism + +*Task parallelism* in Kokkos allows for fine-grained dependent execution, which is particularly useful for irregular problems and algorithms with complex dependency structures. + +*Basic Interface for Fine-grained Tasking* : + +Kokkos provides a TaskPolicy for coordinating task execution [2]. The basic interface includes: + +1. Creating tasks: `policy.create(Functor())` +2. Adding dependencies: `policy.add_dependence(task1, task2)` +3. Spawning tasks: `policy.spawn(task)` +4. Waiting for completion: `Kokkos::wait(task)` or `Kokkos::wait(policy)` + +*Expressing Dynamic Dependency Structures* : + +Dynamic dependency structures can be expressed using the `add_dependence` method, allowing for the creation of complex task graphs[2]. For example: + +[source, c++] +---- + auto fx = policy.create(Functor(x)); + auto fy = policy.create(Functor(y)); + policy.add_dependence(fx, fy); // fx is scheduled after fy +---- + +*When to Use Kokkos Tasking* : + +Kokkos tasking is particularly useful in the following scenarios: + +1. Irregular problems with complex dependencies +2. Producer-consumer patterns +3. Recursive algorithms +4. 
When fine-grained parallelism is needed within tasks + +Tasking in Kokkos allows for better locality exploitation by enabling nested data-parallelism within a task, which can be particularly beneficial for heterogeneous devices [2]. + + +... + + + +== References + +** [1] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/SIMD.html +** [2] https://trilinos.github.io/pdfs/KokkosPortableAPI.pdf +** [3] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/Machine-Model.html + + + +.*Points to keep in mind* +**** + +*Asynchronicity* in Kokkos means that parallel operations are executed in a non-blocking manner, possibly returning before they are fully completed, while maintaining sequential order relative to other Kokkos operations in the same execution or memory space. + +*Streams* are abstractions representing queues of parallel operations associated with a specific execution space instance, allowing asynchronous and ordered execution of tasks. + +*Task Parallelism* in Kokkos is a programming model enabling the asynchronous execution of interdependent tasks, organized in a directed acyclic graph (DAG), suitable for irregular and recursive problems, and providing a high-level abstraction for parallelization on heterogeneous architectures. + + +**** + + diff --git a/docs/modules/kokkos/pages/advanced-concepts/hierarchical-parallelism.adoc b/docs/modules/kokkos/pages/advanced-concepts/hierarchical-parallelism.adoc new file mode 100644 index 00000000..436e71a5 --- /dev/null +++ b/docs/modules/kokkos/pages/advanced-concepts/hierarchical-parallelism.adoc @@ -0,0 +1,289 @@ += Kokkos Hierarchical Parallelism + +== Introduction + +[.text-justify] +Kokkos' *hierarchical parallelism* is a paradigm that enables the exploitation of multiple levels of *shared-memory parallelism*, allowing developers to leverage increased parallelism in their computations for potential performance improvements. 
This framework supports various levels of parallelism, including thread teams, threads within a team, and vector lanes, which can be nested to create complex parallel structures [1][2][6]. + +[.text-justify] +The paradigm employs a two-tiered approach: an outer level, often implemented using a league of teams, which divides the overall workload into larger chunks, and an inner level, typically comprising threads within a team, which focuses on finer-grained parallelism within these chunks. *Thread teams*, a fundamental concept in Kokkos, represent collections of threads that can synchronize and share a common scratch pad memory. + + +== Hierarchical parallelism + +At the heart of Kokkos' *hierarchical parallelism* lies the ability to exploit multiple levels of *shared-memory parallelism*. +This approach allows developers to map complex algorithms to the hierarchical nature of modern hardware, from multi-core CPUs to many-core GPUs and leverage more parallelism in their computations, potentially leading to significant performance improvements. The framework supports various levels of parallelism, including thread teams, threads within a team, and vector lanes, which can be nested to create complex parallel structures . + +=== Similarities and Differences Between Outer and Inner Levels of Parallelism + + - **Outer Level (League)**: The outermost level of parallelism, often referred to as the "league," typically corresponds to coarse-grained work distribution. This level is suitable for dividing large workloads across multiple compute units or NUMA domains. + + - **Inner Level (Team)**: The inner level, or "team," represents a finer-grained parallelism within each league member. Teams often map to hardware-specific groupings like CUDA thread blocks or CPU core groups. + + - **Similarities**: Both levels support parallel patterns such as for-loops, reductions, and scans, allowing for consistent programming models across levels. 
+ + - **Differences**: Inner levels have access to fast, shared memory resources and synchronization primitives, while outer levels are more independent and lack direct communication mechanisms. + +=== Thread Teams + +Kokkos introduces the concept of *thread teams*, which organizes parallel work into a two-dimensional structure: + + - **League**: A collection of teams that can execute independently. + - **Team**: A group of threads that can synchronize and share resources. + - **Thread**: The basic unit of parallel execution within a team. + +This hierarchical structure allows for efficient mapping of algorithms to hardware: + + - On *GPUs*, *teams* often correspond to thread blocks, with threads mapping to CUDA threads or vectorized operations. + - On *CPUs*, *teams* might represent groups of cores, with threads corresponding to individual CPU threads or SIMD lanes. + +=== Performance Improvement with Well-Coordinated Teams + +Well-coordinated teams can significantly boost performance by: + + - **Optimizing Memory Access**: Teams can cooperatively load data into shared memory, reducing global memory accesses. + - **Load Balancing**: The two-level structure allows for dynamic work distribution, adapting to varying workloads across different parts of the computation. + - **Hardware Utilization**: By matching the team structure to hardware capabilities, Kokkos can achieve high occupancy and efficient resource usage [3]. 
+ + +=== Example of implementation + + +.`HierarchicalParallelism` +[source, c++] +---- +struct HierarchicalParallelism { + Kokkos::View matrix; + HierarchicalParallelism(int N, int M) : matrix("matrix", N, M) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const Kokkos::TeamPolicy<>::member_type& team_member) const { + const int i = team_member.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, matrix.extent(1)), <2> + [&] (const int j) { + matrix(i, j) = i * matrix.extent(1) + j; + }); + + team_member.team_barrier(); + if (team_member.team_rank() == 0) { + double sum = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, matrix.extent(1)), <2> + [&] (const int j, double& lsum) { + lsum += matrix(i, j); + }, sum); + + Kokkos::single(Kokkos::PerTeam(team_member), [&] () { <3> + matrix(i, 0) = sum; + }); + } + } +}; +---- + +.Execution +[source, c++] +---- +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 1000; + const int M = 100; + HierarchicalParallelism functor(N, M); + Kokkos::parallel_for(Kokkos::TeamPolicy<>(N, Kokkos::AUTO), functor); <1> + } + Kokkos::finalize(); + return 0; +} +---- + +Hierarchical parallelism is implemented as follows: + +. The top level uses `Kokkos::TeamPolicy` to parallelize on the rows of the matrix. +. `Kokkos::TeamThreadRange` is used to parallelize operations on columns within each team. +. `Kokkos::single` is used to ensure that some operations are performed only once per team. + + +== Scratch Memory + +[.text-justify] +*Scratch memory*, or *scratch pad*, in Kokkos provides a powerful mechanism for managing temporary, fast-access storage within parallel kernels. This feature is crucial for optimizing performance in memory-bound applications [4]. The scratch pad is accessible only by threads within a team and has a lifetime equal to that of the team. 
It allows algorithms to load a workset into a shared space, enabling collaborative work among team members. This approach can significantly reduce global memory accesses, as intermediate results and frequently accessed data can be stored in the faster, more local scratch memory. + +*Concept of Team and Thread Private Scratch Pads* + +Kokkos offers two levels of scratch memory: *Team Scratch*,*Thread Scratch*. These scratch pads provide a flexible way to manage temporary data without relying on slower global memory accesses. + + - **Team Scratch**: Shared among all threads in a team, analogous to CUDA shared memory. + - **Thread Scratch**: Private to individual threads, useful for thread-local computations. + +*Reducing Global Memory Accesses* + +Scratch memory significantly reduces global memory traffic by: + + - **Data Reuse**: Frequently accessed data can be loaded once into scratch memory and reused by multiple threads. + - **Intermediate Results**: Temporary computations can be stored in scratch memory, avoiding redundant global memory writes. + +*When to Use Scratch Memory* + +Scratch memory is particularly beneficial in scenarios such as: + + - **Stencil Computations**: Where neighboring data elements are repeatedly accessed. + - **Reduction Operations**: For efficient partial sum calculations within teams. + - **Data Gather-Scatter**: When reorganizing data for more efficient processing. + +*Using Scratch Memory and Necessary Barriers* + +To effectively use scratch memory: + + 1. Allocate scratch memory using `team_shmem_size()` or `thread_shmem_size()` in the execution policy. + 2. Create scratch views within kernels using `ScratchView` or `team_scratch()`/`thread_scratch()`. + 3. Use team barriers (`team.team_barrier()`) to ensure data consistency when sharing scratch memory among threads. 
+ +.Example of Scratch Memory Usage +[source, c++] +---- +struct ScratchMemoryExample { + Kokkos::View data; + ScratchMemoryExample(int N) : data("data", N) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const Kokkos::TeamPolicy<>::member_type& team_member) const { + const int team_size = team_member.team_size(); + const int team_rank = team_member.team_rank(); + const int league_rank = team_member.league_rank(); + + // Allocate team scratch memory + double* team_scratch = (double*)team_member.team_shmem().get_shmem(team_size * sizeof(double)); <1> + + // Each thread initializes its scratch memory + team_scratch[team_rank] = league_rank * team_size + team_rank; + + // Synchronize to ensure all threads have written to scratch memory + team_member.team_barrier(); <3> + + // Perform a reduction within the team + double team_sum = 0.0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team_member, team_size), [&](const int i, double& lsum) { + lsum += team_scratch[i]; + }, team_sum); + + // Only one thread writes the result back to global memory + if (team_rank == 0) { + data(league_rank) = team_sum; + } + } + + // Specify the amount of scratch memory needed + size_t team_shmem_size(int team_size) const { + return team_size * sizeof(double); + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 1000; + ScratchMemoryExample functor(N); + Kokkos::parallel_for(Kokkos::TeamPolicy<>(N / 10, Kokkos::AUTO).set_scratch_size(0, Kokkos::PerTeam(functor.team_shmem_size(10))), functor); <2> + } + Kokkos::finalize(); + return 0; +} +---- + + + + +== Unique Token + +[.text-justify] +*Unique tokens* in Kokkos provide a mechanism for thread-safe resource allocation and access in parallel environments [5]. This feature is particularly useful when multiple threads or teams need to access shared resources without conflicts. Unique tokens come in two scopes: Global and Instance. 
The Global scope provides identifiers that are unique across the entire execution space, while the Instance scope offers identifiers that are unique within a specific instance of parallel execution. This distinction allows developers to choose the appropriate level of uniqueness based on their specific requirements. + +*Unique tokens* ensure that each thread or team can acquire a distinct identifier or resource without conflicts. This mechanism is crucial for scenarios where threads need exclusive access to shared resources or need to perform thread-specific operations. + +*Acquiring Per-Team Unique IDs* + +To acquire unique IDs: + + 1. Create a `UniqueToken` object for the desired execution space. + 2. Use the `acquire()` method within parallel kernels to obtain a unique identifier. + 3. Release the token using `release()` when it's no longer needed. + +*Difference Between Global and Instance Scope* + +Kokkos offers two scopes for unique tokens: *Global Scope* and *Instance Scope*. The choice of scope depends on the required level of uniqueness and the potential for resource contention in the application. + + - **Global Scope**: Tokens are unique across all instances of `UniqueToken` in the application. + - **Instance Scope**: Tokens are unique only within a specific instance of `UniqueToken`. + + +.Tokens +[source, c++] +---- +include::example$src/22_3_token.cpp[] +---- + +Explanations: + +**UniqueToken** : + `Kokkos::Experimental::UniqueToken` is used to generate unique identifiers in a parallel context. + The `acquire()` method provides a unique identifier. + The `release()` method releases this identifier so that it can be reused. + +**Kokkos view** : + Data is stored in a view (`Kokkos::View`), which is an abstraction for managing data across different memory spaces. + +**Parallel loop** : + `Kokkos::parallel_for` executes a loop in parallel. + Each iteration gets a unique identifier via `unique_token.acquire()`. 
+
+**Copying results**:
+ Data is copied to the host using `Kokkos::create_mirror_view_and_copy` for display.
+
+
+
+
+== References
+** [1] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/HierarchicalParallelism.html
+** [2] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/ProgrammingModel.html
+** [3] https://indico.mathrice.fr/event/303/attachments/598/799/cafe_calcul_kokkos_2021.pdf
+** [4] https://github.com/kokkos/kokkos/issues/1932
+** [5] https://www.nersc.gov/assets/Uploads/Kokkos-training-Day2-NewUsers-Bruno-v2.pdf
+** [6] https://indico.math.cnrs.fr/event/12037/attachments/5040/8137/KokkosTutorial_04_HierarchicalParallelism.pdf
+
+
+
+
+
+.*Points to keep in mind*
+****
+
+* *Hierarchical Parallelism*
+
+*** Hierarchical work can be parallelized via hierarchical parallelism.
+*** Hierarchical parallelism is leveraged using thread teams launched with a TeamPolicy.
+*** Team “worksets” are processed by a team in nested parallel for (or reduce or scan) calls with a TeamThreadRange and ThreadVectorRange policy.
+*** Execution can be restricted to a subset of the team with the single pattern using either a PerTeam or PerThread policy.
+*** Teams can be used to reduce contention for global resources even in “flat” algorithms.
+
+
+* *Scratch Space*
+
+*** Scratch Memory can be used with the TeamPolicy to provide thread or team private memory.
+*** Scratch memory exposes on-chip user managed caches (e.g. on NVIDIA GPUs)
+*** The size must be determined before launching a kernel.
+*** Two levels are available: large/slow and small/fast.
+
+
+* *Token*
+*** UniqueToken provides a thread safe portable way to divide thread or team specific resources
+*** UniqueToken can be sized, such that it returns only ids within a specific range.
+*** A Global scope UniqueToken can be acquired, allowing safe ids across disjoint concurrent code sections.
+
+
+* *Unique Token*
+
+*** UniqueToken gives a thread safe portable way to divide thread specific resources
+*** UniqueToken can be sized to restrict ids to a range.
+*** A Global UniqueToken is available.
+
+****
diff --git a/docs/modules/kokkos/pages/advanced-concepts/index.adoc b/docs/modules/kokkos/pages/advanced-concepts/index.adoc
new file mode 100644
index 00000000..6cb9c025
--- /dev/null
+++ b/docs/modules/kokkos/pages/advanced-concepts/index.adoc
@@ -0,0 +1,24 @@
+= Advanced concepts
+
+
+The content of this part has been constructed from the slides from the https://indico.math.cnrs.fr/event/12037/[Kokkos training days] held in 2024.
+
+
+== Other resources
+
+- Organization Kokkos on GitHub: https://github.com/kokkos[github.com/kokkos],
+- Slides, recording and Q&A for the Lectures: https://github.com/kokkos/kokkos-tutorials/wiki/Kokkos-Lecture-Series[github.com/kokkos/kokkos-tutorials/wiki],
+- Kokkos Core Wiki, with API documentation: https://kokkos.github.io/kokkos-core-wiki[kokkos.github.io/kokkos-core-wiki],
+- Slack channel for Kokkos: https://kokkos.slack.com[kokkosteam.slack.com].
+ + +== Contents + +- xref:advanced-concepts/advanced-reductions.adoc[Advanced Reductions] +- xref:advanced-concepts/hierarchical-parallelism.adoc[Hierarchical Parallelism] +- xref:advanced-concepts/mpi.adoc[MPI] +- xref:advanced-concepts/pgas.adoc[PGAS] +- xref:advanced-concepts/mpi-vs-pgas.adoc[MPI vs PGAS] +- xref:advanced-concepts/multidimensional-loops-and-data-structure.adoc[Multidimensional Loops and Data Structure] +- xref:advanced-concepts/single-instruction-mutliple-data.adoc[Single Instruction Multiple Data] +- xref:advanced-concepts/asynchronicity-and-streams.adoc[Asynchronicity and Stream] diff --git a/docs/modules/kokkos/pages/advanced-concepts/mpi-vs-pgas.adoc b/docs/modules/kokkos/pages/advanced-concepts/mpi-vs-pgas.adoc new file mode 100644 index 00000000..f5eb5b9e --- /dev/null +++ b/docs/modules/kokkos/pages/advanced-concepts/mpi-vs-pgas.adoc @@ -0,0 +1,49 @@ += Kokkos MPI (Message Passing Interface) vs PGAS (Partitioned Global Address Space) + +== Introduction + +[.text-justify] +In terms of performance under Kokkos C++, the choice between MPI and PGAS (Partitioned Global Address Space) depends on the specific application context and the target hardware architecture. Here are the key points to consider: + +1. MPI is generally more mature and widely used for distributed programming[2]. It offers good performance for two-sided communications and is well optimized on many platforms. + +2. PGAS, on the other hand, can offer performance advantages for some use cases: + +- It better exploits data locality, which can improve efficiency and scalability compared to traditional shared memory approaches[3]. +- PGAS allows one-sided communication operations, which can be more efficient in some scenarios[3]. + +3. Kokkos proposes "remote spaces" that implement the PGAS model, including NVSHMEM, conventional SHMEM, and one-sided MPI[2][5]. These implementations can provide good performance, especially on specific architectures such as NVIDIA GPUs. + +4. 
A performance study using miniFE (a mini-benchmark for solving linear systems) showed that implementations based on SHMEM (a form of PGAS) can outperform MPI in some cases [1][4]. + +5. However, it is important to note that performance can vary significantly depending on the specific application, problem size, and hardware architecture. + + +[plantuml, format=svg, opts="inline"] +---- +legend +[Kokkos C++] + |_[MPI] + |_Used for: CPU, GPU + |_Strengths: Two-way communications, maturity + |_Best for: Traditional HPC applications + + |_[PGAS] + |_Implementations: SHMEM, NVSHMEM, ROCSHMEM + |_Used for: CPU, NVIDIA GPU, AMD GPU + |_Strengths: One-way communications, data locality + |_Best for: Applications with frequent access to remote data +end legend +---- + + + + + +== References +** [1] https://extremecomputingtraining.anl.gov/wp-content/uploads/sites/96/2019/08/ ATPESC_2019_Track-2_3_8-1_830am_Trott-Kokkos.pdf +** [2] https://www.reddit.com/r/cpp/comments/1efklad/mpi_or_gpu_parallel_computation_of_dem_code_for/ +** [3] https://en.wikipedia.org/wiki/Partitioned_global_address_space +** [4] https://oaciss.uoregon.edu/E4S-Forum19/talks/Trott-E4S.pdf +** [5] https://github.com/kokkos/kokkos-remote-spaces + diff --git a/docs/modules/kokkos/pages/advanced-concepts/mpi.adoc b/docs/modules/kokkos/pages/advanced-concepts/mpi.adoc new file mode 100644 index 00000000..b37ba903 --- /dev/null +++ b/docs/modules/kokkos/pages/advanced-concepts/mpi.adoc @@ -0,0 +1,135 @@ += Kokkos MPI (Message Passing Interface) + +== Introduction + +[.text-justify] +In the realm of high-performance computing, the integration of internode communication models with intranode parallelism frameworks has become increasingly crucial. This synergy is exemplified by the combination of *MPI* (Message Passing Interface) and *PGAS* (Partitioned Global Address Space) with Kokkos, a performance portability ecosystem for manycore architectures. 
+ +== Internode Communication + +[.text-justify] +The integration of MPI concept inside Kokkos represents a powerful approach to hybrid programming, leveraging the strengths of both paradigms. MPI excels in distributed memory parallelism, while Kokkos shines in shared memory parallelism and performance portability across diverse architectures [1]. + +[.text-justify] +When writing a hybrid MPI-Kokkos program, one of the primary considerations is data transfer between MPI ranks. Kokkos Views, the library's multidimensional array abstraction, can be seamlessly integrated with MPI communications. To send data from a Kokkos View, one simply needs to pass the View's data pointer and size to MPI functions [2]. For example: + +[source, c++] +---- + Kokkos::View myView("MyView", 1000); + MPI_Send(myView.data(), myView.size(), MPI_DOUBLE, dest, tag, comm); +---- + +[.text-justify] +This straightforward approach works because Kokkos ensures that View data is contiguous in memory, aligning perfectly with MPI's expectations [2]. + +[.text-justify] +A key optimization in hybrid MPI-Kokkos programs is the overlapping of communication and computation. This can be achieved by leveraging Kokkos' execution spaces and MPI's non-blocking communication primitives. For example: + +[source, c++] +---- + auto future = Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i) { + // Computation kernel + }); + MPI_Request request; + MPI_Isend(data, count, MPI_INT, dest, tag, comm, &request); + future.wait(); // Wait for computation to complete + MPI_Wait(&request, MPI_STATUS_IGNORE); // Wait for communication to complete +---- + +[.text-justify] +This pattern allows the computation to proceed concurrently with the MPI communication, potentially masking latency and improving overall performance [1]. + +[.text-justify] +Buffer packing strategies play a crucial role in optimizing MPI communication, especially when dealing with non-contiguous data. 
Kokkos provides efficient mechanisms for packing and unpacking data. One approach is to use Kokkos parallel_for to pack data into a contiguous buffer before sending: + +[source, c++] +---- + Kokkos::View sendBuffer("SendBuffer", count); + Kokkos::parallel_for(count, KOKKOS_LAMBDA(int i) { + sendBuffer(i) = computeValue(i); + }); + MPI_Send(sendBuffer.data(), count, MPI_DOUBLE, dest, tag, comm); +---- + +[.text-justify] +This method ensures efficient memory access patterns and can leverage the full parallelism of the underlying hardware. + +[.text-justify] +For sparse communication patterns, generating efficient index lists is crucial. Kokkos can assist in this process through its parallel algorithms. For instance, to create a list of indices for non-zero elements: + +[source, c++] +---- + Kokkos::View indexList("IndexList", n); + Kokkos::parallel_scan(n, KOKKOS_LAMBDA(int i, int& update, bool final) { + if (data(i) != 0) { + if (final) indexList(update) = i; + ++update; + } + }); +---- + +[.text-justify] +This approach efficiently generates a compact list of relevant indices, which can then be used to optimize MPI communications for sparse data structures. 
+ + +*Example* + +[source, c++] +---- + #include + #include + #include + #include + + int main(int argc, char* argv[]) { + MPI_Init(&argc, &argv); + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + Kokkos::initialize(argc, argv); + { + const int localSize = 1000; + Kokkos::View localData("localData", localSize); + Kokkos::parallel_for("fill", localSize, KOKKOS_LAMBDA(const int i) { + localData(i) = rank * localSize + i; + }); + // Calculate the local sum + double localSum = 0.0; + Kokkos::parallel_reduce("sum", localSize, KOKKOS_LAMBDA(const int i, double& sum) { + sum += localData(i); + }, localSum); + // MPI reduction to get the global sum + double globalSum; + MPI_Reduce(&localSum, &globalSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + // Display the result on process 0 + if (rank == 0) { + std::cout << "Somme globale : " << globalSum << std::endl; + } + } + Kokkos::finalize(); + MPI_Finalize(); + return 0; + } +---- + +Explanations: + + + +== References +** [1] https://kokkos.org/kokkos-core-wiki/usecases/MPI-Halo-Exchange.html +** [2] https://indico.math.cnrs.fr/event/12037/attachments/5040/8156/KokkosTutorial_07_Tools.pdf + + +.*Points to keep in mind* +**** + +*MPI* in Kokkos C++ is a standard message passing interface used in conjunction with Kokkos for inter-process communication in distributed parallel applications, enabling efficient data exchange between compute nodes while exploiting the performance portability capabilities of Kokkos for single-node computing. + + +... 
+ + +**** + diff --git a/docs/modules/kokkos/pages/advanced-concepts/multidimensional-loops-and-data-structure.adoc b/docs/modules/kokkos/pages/advanced-concepts/multidimensional-loops-and-data-structure.adoc new file mode 100644 index 00000000..63facb55 --- /dev/null +++ b/docs/modules/kokkos/pages/advanced-concepts/multidimensional-loops-and-data-structure.adoc @@ -0,0 +1,292 @@ += Kokkos Multidimensional Loops and Data Structure + +== Introduction + +[.text-justify] +Kokkos offers powerful abstractions for handling *multidimensional loops*, *data structures*, and *memory management*. Here we will look at several key aspects of Kokkos, providing an overview of its capabilities and best practices for effective parallel programming. + + + +== MultiDimensional Loops and Data Structures in Kokkos + +[.text-justify] +Kokkos provides the *`MDRangePolicy`*, a sophisticated tool for parallelizing tightly nested loops across multiple dimensions. This policy allows developers to express complex multidimensional algorithms with ease and efficiency [5]. The *`MDRangePolicy`* can handle loops of 2 to 6 dimensions, making it suitable for a wide range of scientific and engineering applications [1]. + +To utilize the *`MDRangePolicy`*, one must specify the dimensionality of the loop using the `Rank`` template parameter. The syntax for creating an `MDRangePolicy` is as follows: + +. Create an MDRangePolicy with 3 dimensions +[source,c++] +---- +Kokkos::parallel_for("Label", + Kokkos::MDRangePolicy>({0, 0, 0}, {N0, N1, N2}), <1> + KOKKOS_LAMBDA (int64_t i, int64_t j, int64_t k) { <2> + // Loop body + } +); +---- + +<1> Creation of a three-dimensional iteration space. The policy takes two required arguments: an initializer list for the beginning indices and another for the end indices [5]. Optionally, a third argument can be provided to specify tiling dimensions, which can be crucial for performance tuning [9]. 
+<2> The lambda function defines the loop body, which will be executed in parallel across the specified iteration space, with the indices `i`, `j`, and `k` ranging from the start to the end values. + + +.Example +[source, c++] +---- +include::example$src/31_multidim_loops.cpp[] +---- + + +== Subviews: Taking Slices of Views with Kokkos + +*Subviews* in Kokkos provide a powerful mechanism for creating views that reference a subset of an existing view's data. +This capability is essential for efficient data manipulation and algorithm implementation. + +The basic syntax for creating a subview is: + +[source,c++] +---- +auto subview = Kokkos::subview(view, index1, index2, ...); +---- + + +Kokkos offers flexible indexing options for *subviews*. You can use integer indices for single elements, `Kokkos::ALL` for entire dimensions, or `Kokkos::pair` for ranges [9]. + +When working with *subviews*, it's important to understand the view assignment rules. Kokkos ensures that view assignments are only allowed when the memory spaces are compatible and the shapes match. This strict checking helps prevent errors and ensures performance portability across different architectures. + + +A more elaborate example demonstrating the use of *`MDRangePolicy`* in the context of tensor operations is as follows: + +[source,c++] +---- +Kokkos::parallel_for("mdr_for_all_cells", + Kokkos::MDRangePolicy>({0, 0, 0}, {C, F, P}), + KOKKOS_LAMBDA (const int c, const int f, const int p) { + auto result = Kokkos::subview(outputField, c, f, p, Kokkos::ALL); + auto left = Kokkos::subview(inputData, c, p, Kokkos::ALL, Kokkos::ALL); + auto right = Kokkos::subview(inputField, c, f, p, Kokkos::ALL); + for (int i = 0; i < D; ++i) { + double tmp(0); + for (int j = 0; j < D; ++j) + tmp += left(i, j) * right(j); + result(i) = tmp; + } + } +); +---- + +This code snippet showcases how `MDRangePolicy` can be used in conjunction with subviews to perform complex tensor operations efficiently [9]. 
+ + + + +== Unmanaged Views: Dealing with External Memory with Kokkos + +[.text-justify] +Unmanaged views in Kokkos provide a way to work with externally allocated memory, which is particularly useful when integrating Kokkos into existing codebases or when interfacing with external libraries [6]. + +To create an unmanaged view, you can use the following syntax: + +[source,c++] +---- +Kokkos::View> unmanaged_view(external_ptr, size); +---- + +Unmanaged views are essential when you need to wrap externally allocated data into Kokkos views. This is often necessary when Kokkos is used in a library that receives pointers to data allocations as input [6]. + +When working with unmanaged views, it's crucial to ensure that the lifetime of the external memory outlives the Kokkos view. Additionally, be cautious when using unmanaged views with device memory, as the memory management becomes the responsibility of the developer. + +== Thread Safety and Atomic Operations with Kokkos + +[.text-justify] +In parallel programming, ensuring *thread safety* is paramount. Kokkos provides *atomic* operations to handle situations where multiple threads might attempt to access and modify the same memory location concurrently [7]. + +*Atomic* operations in Kokkos are particularly useful for implementing the scatter-add pattern, where multiple threads contribute to a shared result. While atomic operations provide thread safety, they can impact performance, especially under high contention. + +The performance characteristics of atomic operations can vary significantly between CPUs and GPUs, and even among different data types. On CPUs, atomic operations on integers are generally faster than on floating-point types. On GPUs, the performance impact of atomics can be more pronounced, especially for global memory operations[7]. 
+ +To use atomic operations in Kokkos, you can employ the Kokkos::atomic_* functions: + +[source,c++] +---- +Kokkos::atomic_add(&value, increment); +---- + +It's important to note that while atomics provide a solution for thread safety, they should be used judiciously, as overuse can lead to performance bottlenecks. + +Another example: + +Atomics: the portable and thread-scalable solution + +[source,c++] +---- +parallel_for(N, KOKKOS_LAMBDA(const size_t index) { const Something value = ...; + const int bucketIndex = computeBucketIndex(value); Kokkos::atomic_add(&_histogram(bucketIndex), 1); +}); +---- + +Atomics are the only scalable solution to thread safety. +Locks are not portable. +Data replication is not thread scalable. + +*Example* + +[source, c++] +---- +struct AtomicCounter { + // Shared atomic counter + Kokkos::Atomic counter; + // Constructor to initialize the counter + AtomicCounter() : counter(0) {} + // Function to increment the counter atomically + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + counter.fetch_add(1); // Atomically increment the counter + } +}; + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + const int numIterations = 1234567; // Number of increments + AtomicCounter atomicCounter; + // Launch a parallel for loop to increment the counter + Kokkos::parallel_for("IncrementCounter", numIterations, atomicCounter); + // Synchronize to ensure all increments are complete + Kokkos::fence(); + // Output the final value of the counter + std::cout << "Final Counter Value: " << atomicCounter.counter << std::endl; + } + Kokkos::finalize(); + return 0; +} +---- + +Explanations: The `AtomicCounter` structure contains an atomic integer `counter` that will be incremented by multiple threads. The `Kokkos::Atomic` type ensures that operations on the counter are thread-safe. The `operator()` function uses `fetch_add(1)` to atomically increment the `counter`. 
This operation guarantees that even if multiple threads attempt to update the counter simultaneously, each update will be executed safely without race conditions. After launching the parallel loop, `Kokkos::fence()` is called to ensure that all increments are completed before accessing the final value of the counter. + + +== DualView with Kokkos + +[.text-justify] +*DualView* is a powerful abstraction in Kokkos that manages mirrored data on both host and device. This is particularly valuable in heterogeneous computing environments where data needs to be accessed and modified on both the CPU and accelerators like GPUs. *DualView* simplifies the task of managing data movement between memory spaces, e.g., host and device. + +image::kokkos-DualView.png[xref=#fragment101,width=320,height=150] + +The primary motivation for *DualView* is to simplify data management and synchronization between host and device memory spaces. It automatically tracks which side (host or device) has been modified and needs synchronization, reducing the likelihood of errors due to out-of-sync data. + +To create a DualView, you can use the following syntax: + +[source,c++] +---- +Kokkos::DualView dual_data("label", size); +---- + +`DualView` provides methods like `sync()` and `modify()` to manage data coherency between *host* and *device*. This abstraction significantly simplifies the development of applications that need to work efficiently across different memory spaces, enhancing both productivity and performance portability. + + +*Kokkos* offers a rich set of tools and abstractions for high-performance, portable parallel programming. By leveraging features like *MDRangePolicy*, *subviews*, *unmanaged views*, *atomic operations*, and *DualView*, developers can create efficient, scalable applications that perform well across a wide range of hardware architectures. 
+ +*Example* + +[source, c++] +---- +struct DualViewExample { + // Define the dual view type + using dual_view_type = Kokkos::DualView; + + // Function to initialize device view + static void initialize(dual_view_type& dv) { + // Initialize the device view with values + Kokkos::parallel_for("Initialize DeviceView", dv.d_view.extent(0), KOKKOS_LAMBDA(const int i) { + dv.d_view(i) = static_cast(i); // Assign values based on index + }); + // Synchronize to update the host mirror + dv.template sync(); + } + // Function to print values from both views + static void printValues(const dual_view_type& dv) { + std::cout << "Host View Values: "; + for (int i = 0; i < dv.h_view.extent(0); ++i) { + std::cout << dv.h_view(i) << " "; // Access host view + } + std::cout << std::endl; + std::cout << "Device View Values: "; + Kokkos::parallel_for("Print DeviceView", dv.d_view.extent(0), KOKKOS_LAMBDA(const int i) { + printf("%f ", dv.d_view(i)); // Access device view + }); + std::cout << std::endl; + } +}; + +int main( int argc, char* argv[] ) +{ + Kokkos::initialize(argc, argv); + { + const int N = 10; // Size of the DualView + // Create a DualView with N elements + DualViewExample::dual_view_type dv("MyDualView", N); + // Initialize the device view + DualViewExample::initialize(dv); + // Print values from both views + DualViewExample::printValues(dv); + } + Kokkos::finalize(); + return 0; +} +---- + +Explanations: This example effectively demonstrates how to use *DualView* in Kokkos to manage data across different memory spaces while ensuring synchronization between them. The program starts by initializing the Kokkos runtime environment. A `DualView` is defined as `dual_view_type`, which can hold data in both host and device memory. 
+ + +== References +** [1] https://indico.math.cnrs.fr/event/12037/attachments/5040/8130/KokkosTutorial_03_MDRangeMoreViews.pdf +** [2] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/View.html +** [3] https://github.com/kokkos/kokkos/issues/549 +** [4] https://indico.math.cnrs.fr/event/12037/attachments/5040/8129/KokkosTutorial_02_ViewsAndSpaces.pdf +** [5] https://kokkos.org/kokkos-core-wiki/API/core/policies/MDRangePolicy.html +** [6] https://github.com/kokkos/kokkos-core-wiki/blob/main/docs/source/ProgrammingGuide/Interoperability.md +** [7] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/Machine-Model.html +** [8] https://extremecomputingtraining.anl.gov/wp-content/uploads/sites/96/2024/08/ATPESC-2024-Track-2d-Talk-1-Turcksin-Kokkos.pdf +** [9] https://kokkos.org/kokkos-core-wiki/usecases/MDRangePolicy.html +** [10] https://github.com/kokkos/kokkos/issues/102 +** [11] https://gensoft.pasteur.fr/docs/lammps/2020.03.03/Speed_kokkos.html + + + +.*Points to keep in mind* +**** + +* *MDRangePolicy* + +** The MDRangePolicy allows parallelization of tightly nested loops of 2 to 6 dimensions. +** It provides a more intuitive and potentially more efficient alternative to flattening multidimensional loops. + +* *Subviews: Taking Slices of Views with Kokkos* + +** Subviews in Kokkos allow you to create views that reference a subset of an existing view's data. +** Similar capability as provided by Matlab, Fortran, or Python. +** Prefer the use of auto for the type. + View v("v", N0, N1, N2); + auto sv = subview(v, i0, ALL, make_pair(start,end)); + +* *Unmanaged Views* +** Interoperability with externally allocated arrays. +** No reference counting, memory not deallocated at destruction. +** User is responsible for insuring proper dynamic and/or static extents, MemorySpace, Layout, etc. + View v_unmanaged(raw_ptr , N0, N1); + + +* *Atomic operations* +** Atomic functions available on the host or the device (e.g. Kokkos::atomic add). 
+** Use Atomic memory trait for atomic accesses on Views. + View v("v", N0); + View > v_atomic = v; +** Use ScatterView for scatter-add parallel pattern. ScatterView can transparently switch between Atomic and Data Replication based scatter algorithms. + + +* *Dual Views* +** For managing data synchronization between host and device. +** Helps in codes with no holistic view of data flow. + +**** + + diff --git a/docs/modules/kokkos/pages/advanced-concepts/pgas.adoc b/docs/modules/kokkos/pages/advanced-concepts/pgas.adoc new file mode 100644 index 00000000..455f75f6 --- /dev/null +++ b/docs/modules/kokkos/pages/advanced-concepts/pgas.adoc @@ -0,0 +1,112 @@ += Kokkos PGAS (Partitioned Global Address Space) + +== Introduction + +[.text-justify] +In the realm of high-performance computing, the integration of internode communication models with intranode parallelism frameworks has become increasingly crucial. This synergy is exemplified by the combination of *MPI* (Message Passing Interface) and *PGAS* (Partitioned Global Address Space) with Kokkos, a performance portability ecosystem for manycore architectures. + + +== Kokkos Remote Spaces: PGAS Support + +[.text-justify] + +*PGAS (Partitioned Global Address Space)* models are gaining traction, particularly with the advent of "super-node" architectures and evolving network infrastructures [1][2]. Kokkos Remote Spaces extends the Kokkos ecosystem to embrace this paradigm, offering a bridge between shared and distributed memory programming models. + +*PGAS* enables Kokkos to provide a global view of data for convenient multi-GPU, multi-node, and multi-device programming. PGAS provides a high-level abstraction for remote memory accesses, simplifying distributed programming for developers using Kokkos. Kokkos Remote Spaces supports multiple PGAS backends, including SHMEM, *NVSHMEM*, *ROCSHMEM*, and *MPI* One-side, providing flexibility for different types of systems and architectures. 
PGAS implementations are optimized for high-performance communications, which is crucial for the scientific computing applications that Kokkos targets. Using PGAS allows Kokkos to maintain its philosophy of performance portability across different architectures, from CPUs to GPUs. By using PGAS, Kokkos can offer efficient and portable distributed programming, while maintaining a consistent programming interface with the rest of the Kokkos ecosystem. + +To write a *PGAS* application with Kokkos, developers can utilize the Kokkos Remote Spaces extension. This extension introduces new memory spaces that return data handles with PGAS semantics. Creating a global View in this context is straightforward: + +[source, c++] +---- + Kokkos::View globalView("GlobalView", N, M); +---- + +[.text-justify] +This declaration creates a two-dimensional View that spans across multiple processing elements in a PGAS model. + +[.text-justify] +Accessing global data in a PGAS model requires careful consideration of data locality and communication costs. Kokkos Remote Spaces provides abstractions that simplify this process. For example, accessing an element of the global View might look like this: + +[source, c++] +---- + auto element = globalView(i, j); +---- + +[.text-justify] +Behind the scenes, Kokkos handles the necessary communication to fetch or update the data, abstracting away the complexities of the underlying PGAS implementation. + +[.text-justify] +A prime example of PGAS applications is the Sparse Matrix-Vector Multiplication (SpMV) operation, a key component of the Conjugate Gradient (CG) method. In a PGAS model using Kokkos Remote Spaces, the vector becomes distributed, while the sparse matrix stores global indices. This approach allows for efficient parallel computation across multiple nodes. + +The implementation of SpMV in this context might involve: + +1. Distributing the vector across processing elements. +2. Storing the sparse matrix with global indices. +3. 
Performing local computations using Kokkos parallel constructs. +4. Utilizing PGAS operations for necessary remote data accesses. + +This strategy can lead to significant performance improvements, especially for large-scale problems that exceed the memory capacity of a single node. + +*Example* + +[source, c++] +---- + Kokkos::initialize(argc, argv); + { + using ExecSpace = Kokkos::Cuda; + using RemoteSpace = Kokkos::Experimental::NVShmemSpace; + using RemoteView = Kokkos::View; + const int N = 1000; + RemoteView remote_data("RemoteData", N); + Kokkos::parallel_for("InitializeData", Kokkos::RangePolicy(0, N), + KOKKOS_LAMBDA(const int i) { + remote_data(i) = static_cast(i); + }); + + Kokkos::fence(); + double sum = 0.0; + Kokkos::parallel_reduce("SumData", Kokkos::RangePolicy(0, N), + KOKKOS_LAMBDA(const int i, double& lsum) { + lsum += remote_data(i); + }, sum); + Kokkos::fence(); + printf("Sum of remote data: %f\n", sum); + } + Kokkos::finalize(); +---- + +Explanations: + +Using Kokkos::Experimental::NVShmemSpace as a remote memory space. Creating a RemoteView using NVShmemSpace. Initializing the remote data using a parallel_for on the CUDA runspace. Computing the sum of the remote data with a parallel_reduce. Using Kokkos::fence() to ensure synchronization between remote operations. + +This code demonstrates how Kokkos Remote Spaces allows using NVSHMEM as a PGAS backend for simplified multi-GPU programming, providing a global view of the data while maintaining the portability of Kokkos performance + + +== References +** [1] https://extremecomputingtraining.anl.gov/wp-content/uploads/sites/96/2019/08/ATPESC_2019_Track-2_3_8-1_830am_Trott-Kokkos.pdf +** [2] https://en.wikipedia.org/wiki/Partitioned_global_address_space + +.*Points to keep in mind* +**** + +*PGAS* (Partitioned Global Address Space) is a parallel programming model where the global address space is logically partitioned, with each portion local to a process or thread. 
+
+*Kokkos Remote Spaces* is an extension of Kokkos that adds support for Distributed Shared Memory (DSM) to enable a global view of data in a multi-GPU, multi-node, multi-device environment.
+
+*NVShmemSpace* is an NVIDIA implementation of the Partitioned Global Address Space (PGAS) model that enables low-latency access to shared memory distributed across multiple GPUs in a cluster.
+
+*ROC_SHMEM* is an implementation of the Partitioned Global Address Space (PGAS) model for AMD GPUs, enabling GPU-initiated communication operations in a multi-GPU environment.
+
+* *Support for PGAS in Kokkos:*
+ *** Kokkos Remote Spaces extends Kokkos to support PGAS models.
+ *** Bridges the gap between shared and distributed memory programming.
+ *** Particularly relevant for "super-node" architectures and evolving network infrastructures.
+
+
+...
+
+
+****
+
diff --git a/docs/modules/kokkos/pages/advanced-concepts/single-instruction-mutliple-data.adoc b/docs/modules/kokkos/pages/advanced-concepts/single-instruction-mutliple-data.adoc
new file mode 100644
index 00000000..ccfd48d3
--- /dev/null
+++ b/docs/modules/kokkos/pages/advanced-concepts/single-instruction-mutliple-data.adoc
@@ -0,0 +1,103 @@
+= Kokkos Tasking Stream SIMD (Single Instruction Multiple Data)
+
+== Introduction
+
+[.text-justify]
+
+Kokkos provides powerful tools for high-performance computing, including *SIMD (Single Instruction, Multiple Data)* operations, asynchronous execution with streams, and task parallelism. These features enable developers to write efficient, portable code that can leverage the full potential of modern hardware architectures. Let's explore each of these aspects in detail.
+ +== SIMD (Single Instruction, Multiple Data) + +*SIMD* operations are a crucial component of modern high-performance computing, allowing for efficient vectorization of code. Kokkos offers portable vector intrinsic types that abstract away hardware-specific details, enabling developers to write vectorized code that can run efficiently on various architectures. + +*Portable Vector Intrinsic Types* : Kokkos provides the `Kokkos::Experimental::simd` type, which serves as an abstraction over platform-specific vector datatypes [3]. This type is designed to work across all backends, potentially falling back to scalar operations when necessary [4]. The `simd` type supports various fundamental C++ types for which the current platform supports vector intrinsics. + +*Improving Vectorization with SIMD Types* : To improve vectorization using SIMD types in Kokkos, developers can follow these steps: + +1. Include the necessary header: `#include <Kokkos_SIMD.hpp>` +2. Define the SIMD type: `using simd_type = Kokkos::Experimental::native_simd<double>;` +3. Use SIMD types in computations to ensure vectorization: + +[source, c++] +---- + simd_type sx(x + i, tag_type()); + simd_type sy(y + i, tag_type()); + simd_type sz(z + i, tag_type()); + simd_type sr = Kokkos::sqrt(sx * sx + sy * sy + sz * sz); + sr.copy_to(r + i, tag_type()); +---- + +This approach guarantees that the compiler will generate the appropriate vector instructions for the target architecture [1]. + +*SIMD Types as an Alternative to ThreadVector Loops* : *SIMD* types can be used as an alternative to ThreadVector loops, providing more explicit control over vectorization. This approach allows developers to reason more clearly about the available parallelism in their code, often leading to better performance than relying on auto-vectorization [1]. + +*Achieving Outer Loop Vectorization* : *SIMD* types enable outer loop vectorization by processing multiple elements simultaneously. 
For example, on a CPU with 256-bit vector registers, the following code can process four elements at once: + +[source, c++] +---- + constexpr int width = int(simd_type::size()); + for (int i = 0; i < n; i += width) { + // SIMD operations here + } +---- + +This approach can significantly improve performance for suitable algorithms [1]. + +*Example* + +[source, c++] +---- + Kokkos::initialize(argc, argv); + { + using simd_type = Kokkos::Experimental::native_simd; + using tag_type = Kokkos::Experimental::element_aligned_tag; + constexpr int width = int(simd_type::size()); + int n = 1000; + Kokkos::View x("x", n); + Kokkos::View y("y", n); + Kokkos::View z("z", n); + Kokkos::View r("r", n); + Kokkos::parallel_for("init", n, KOKKOS_LAMBDA(const int i) { + x(i) = static_cast(i); + y(i) = static_cast(i * 2); + z(i) = static_cast(i * 3); + }); + Kokkos::parallel_for("compute", n / width, KOKKOS_LAMBDA(const int i) { + int idx = i * width; + simd_type sx([&x, idx](std::size_t j) { return x(idx + j); }); + simd_type sy([&y, idx](std::size_t j) { return y(idx + j); }); + simd_type sz([&z, idx](std::size_t j) { return z(idx + j); }); + simd_type sr = Kokkos::sqrt(sx * sx + sy * sy + sz * sz); + sr.copy_to(r.data() + idx, tag_type()); + }); + Kokkos::fence(); + auto h_r = Kokkos::create_mirror_view( r ); + Kokkos::deep_copy( h_r, r ); + printf("First 5 results:\n"); + for (int i = 0; i < 5; ++i) { + printf("r[%d] = %f\n", i, h_r(i)); + } + } + Kokkos::finalize(); +---- + +Explanations: This program uses Kokkos with SIMD to efficiently compute the square root of the sum of squares of three vectors. 
+ + + +== References + +** [1] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/SIMD.html +** [2] https://trilinos.github.io/pdfs/KokkosPortableAPI.pdf +** [3] https://github.com/kokkos/kokkos-core-wiki/blob/main/docs/source/API/simd/simd.md +** [4] https://arxiv.org/pdf/2210.06439 + + +.*Points to keep in mind* +**** + +*SIMD* (Single Instruction, Multiple Data) in Kokkos is a C++ representation of vector registers that allows a single instruction to be applied to multiple data simultaneously, thus improving performance by parallelizing operations at the data level. + +**** + + diff --git a/docs/modules/kokkos/pages/basic-concepts/execution-spaces.adoc b/docs/modules/kokkos/pages/basic-concepts/execution-spaces.adoc index e69de29b..6b0d8d12 100644 --- a/docs/modules/kokkos/pages/basic-concepts/execution-spaces.adoc +++ b/docs/modules/kokkos/pages/basic-concepts/execution-spaces.adoc @@ -0,0 +1,207 @@ += Kokkos Execution Spaces + +image::kokkos-core.png[xref=#fragment101,width=400,height=200] + +== Introduction + +[.text-justify] +In a simple way, Kokkos introduces the concept of execution spaces as a fundamental abstraction for parallel computing. Execution spaces in Kokkos represent the logical grouping of computational units that share identical performance properties, providing a unified interface to target diverse hardware architectures [1]. + +[.text-justify] +Basically, a defined execution space where parallel operations can be executed in a heterogeneous computing environment. In modern GPU/CPU hybrid systems, for example, two main types of execution spaces emerge: GPU cores and CPU cores. This abstraction thus allows developers to write code that can seamlessly adapt to different hardware configurations without major modifications. So it is easier to do than to worry about how to do it for different configurations. 
+ +[.text-justify] +Another point, controlling the execution location of parallel bodies in Kokkos is a crucial aspect of performance optimization and hardware utilization. By default, Kokkos will execute parallel operations in the default execution space, unless otherwise specified [2]. However, developers have several methods at their disposal to fine-tune the execution location of their parallel code. + + +== Methods for Controlling Execution Spaces + +image::kokkos-abstractions-doc.png[xref=#fragment101,width=400,height=200] + +* *Specifying Execution Spaces* : One approach to control the Execution Space is by explicitly defining it in the parallel dispatch call. This can be achieved by using the RangePolicy template argument [3]. For example, In this code, *ExecutionSpace* is replaced with the desired Execution Space, such as *Kokkos::Cuda* for NVIDIA GPUs or *Kokkos::HIP* for AMD GPUs or *Kokkos::OpenMP* for multi-core CPUs. Also it is quite simple ! +[source,c++] +---- + parallel_for("Label", RangePolicy(0, numberOfIntervals), + [=] (const int64_t i) { + /* ... body ... */ + }); +---- +* *Changing the Default Execution Space* : Another method involves changing the default Execution Space at compilation time. This approach affects all parallel operations that do not explicitly specify an Execution Space. While this method provides a global solution, it may limit flexibility in scenarios where different parts of the application benefit from distinct Execution Spaces. + +* *Functor-based Control* : For more granular control, developers can define functors with an execution_space public typedef. This approach ensures that the parallel dispatch will only run the functor in the specified Execution Space, providing a robust mechanism for execution space-specific optimizations. + +* *Requirements and Considerations* : It is important to note that utilizing specific Execution Spaces comes with certain requirements. 
The desired Execution Space must be enabled during Kokkos compilation and properly initialized (and later finalized) in the application. Additionally, for non-CPU Execution Spaces, functions and lambdas may need to be annotated with specific macros to ensure portability [3]. + +* *Performance Implications* : The choice of Execution Space can significantly impact performance. Kokkos allows developers to target different parts of heterogeneous hardware architectures, enabling optimized utilization of available resources [4]. For instance, compute-intensive operations might benefit from GPU Execution Spaces, while memory-bound tasks could be more suited for CPU Execution Spaces. + +* *Advanced Concepts: Team Policies* : For more complex parallel patterns, Kokkos introduces Team Policies, which implement hierarchical parallelism [4]. Team Policies group threads into teams, allowing for sophisticated parallel structures that can better match the underlying hardware topology. This concept is particularly useful for architectures with multiple levels of parallelism, such as GPUs with their warp and block structures. + + +== Execution Patterns + +Execution Patterns are the fundamental parallel algorithms in which an application has to be expressed. Examples are + +* *parallel_for()*: execute a function in undetermined order a specified amount of times, +* *parallel_reduce()*: which combines parallel_for() execution with a reduction operation, +* *parallel_scan()*: which combines a parallel_for() operation with a prefix or postfix scan on output values of each operation, and +* *task*: which executes a single function with dependencies on other functions. + +Expressing an application in these patterns allows the underlying implementation or the used compiler to reason about valid transformations. 
+ +*Example* + +[source, c++] +---- + struct VectorAdd { + // Member variables for the vectors + Kokkos::View a; + Kokkos::View b; + Kokkos::View c; + // Constructor to initialize the vectors + VectorAdd(Kokkos::View a_, Kokkos::View b_, Kokkos::View c_) + : a(a_), b(b_), c(c_) {} + // Functor to perform vector addition + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + c(i) = a(i) + b(i); // Perform addition + } + }; + + int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 1000; // Size of the vectors + // Allocate and initialize vectors on the device + Kokkos::View a("A", N); + Kokkos::View b("B", N); + Kokkos::View c("C", N); + // Initialize vectors a and b on the host + Kokkos::parallel_for("InitializeVectors", N, KOKKOS_LAMBDA(const int i) { + a(i) = static_cast(i); // Fill vector A with values 0 to N-1 + b(i) = static_cast(N - i); // Fill vector B with values N-1 to 0 + }); + // Perform vector addition using Kokkos parallel_for + VectorAdd vectorAdd(a, b, c); + Kokkos::parallel_for("VectorAdd", N, vectorAdd); + // Synchronize to ensure all computations are complete + Kokkos::fence(); + // Output the first 10 results for verification + std::cout << "Result of vector addition (first 10 elements):" << std::endl; + for (int i = 0; i < 10; ++i) { + std::cout << "c[" << i << "] = " << c(i) << std::endl; // Print results from vector C + } + } + Kokkos::finalize(); + return 0; + } +---- + +Explanations: This example effectively demonstrates how to utilize *execution patterns* in Kokkos. + + +== Execution Policies + +In the realm of parallel computing, Execution Policies act as the conductors of a grand digital orchestra, directing how functions perform their symphonies of calculations. Among these maestros, Range Policies stand as the simplest, guiding operations through elements like a steady metronome, without concern for order or synchronization. 
An Execution Policy determines, together with an Execution Pattern, How a function is executed. + +* *Range Policies*: Simple policies for executing operations on each element in a range, without specifying order or concurrency. + +* *Team Policies* : Used for hierarchical parallelism, grouping threads into teams. Key features include: + - League size (number of teams) and team size (threads per team) + - Concurrent execution within a team + - Team synchronization via barriers + - Scratch pad memory for temporary storage + - Nested parallel operations + +The model is inspired by CUDA and OpenMP, aiming to improve performance across various hardware architectures by encouraging locality-aware programming. [5] + + +*Example* + +[source, c++] +---- + struct VectorAdd { + Kokkos::View a; + Kokkos::View b; + Kokkos::View c; + VectorAdd(Kokkos::View a_, Kokkos::View b_, Kokkos::View c_) + : a(a_), b(b_), c(c_) {} + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + c(i) = a(i) + b(i); // Perform addition + } + }; + + int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + const int N = 1000; // Size of the vectors + // Allocate vectors on the device + Kokkos::View a("A", N); + Kokkos::View b("B", N); + Kokkos::View c("C", N); + // Initialize vectors a and b on the host + Kokkos::parallel_for("InitializeVectors", N, KOKKOS_LAMBDA(const int i) { + a(i) = static_cast(i); // Fill vector A with values 0 to N-1 + b(i) = static_cast(N - i); // Fill vector B with values N-1 to 0 + }); + // Perform vector addition using default execution policy + Kokkos::parallel_for("VectorAdd", N, VectorAdd(a, b, c)); + // Synchronize to ensure all computations are complete + Kokkos::fence(); + // Output the first 10 results for verification + std::cout << "Result of vector addition (first 10 elements):" << std::endl; + for (int i = 0; i < 10; ++i) { + std::cout << "c[" << i << "] = " << c(i) << std::endl; // Print results from vector C + } + // Perform vector 
addition using a different execution policy (Dynamic Scheduling) + Kokkos::TeamPolicy<> teamPolicy(N, 32); // League size: N, Team size: 32 + Kokkos::parallel_for(teamPolicy, KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& teamMember) { + const int teamSize = teamMember.team_size(); + const int i = teamMember.league_rank() * teamSize + teamMember.team_rank(); + if (i < N) { + c(i) = a(i) + b(i); // Perform addition within the team + } + }); + + // Synchronize again after using the team policy + Kokkos::fence(); + // Output the results after using the team policy + std::cout << "Result of vector addition using Team Policy (first 10 elements):" << std::endl; + for (int i = 0; i < 10; ++i) { + std::cout << "c[" << i << "] = " << c(i) << std::endl; // Print results from vector C + } + } + Kokkos::finalize(); + return 0; + } + +---- + +Explanations: This example effectively demonstrates how to use different execution *policies* to perform computations efficiently in a parallel computing environment. + + + +== References + +** [1] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/Machine-Model.html +** [2] https://kokkos.org/kokkos-core-wiki/API/core/execution_spaces.html# +** [3] https://indico.math.cnrs.fr/event/12037/attachments/5040/8129/ +** [4] https://github.com/kokkos/kokkos-core-wiki/blob/main/docs/source/ProgrammingGuide/ProgrammingModel.md +** [5] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/ProgrammingModel.html + + + +.*Points to keep in mind* +**** + +*Execution Patterns* : The Kokkos patterns execution designates the parallelism models which allow to express calculation operations in parallel, such as parallel loops and reductions, while allowing Kokkos to manage the distribution of these tasks on the available execution resources available. 
+ +*Team Policies* : Kokkos' execution policies define how parallel operations are carried out, specifying the type of execution, the granularity of work and the resources to be used to optimize performance on different architectures. + +*Space Accessibility* : Space Accessibility is a trait that expresses the accessibility and assignability relationships between different memory and execution spaces. It makes it possible to determine: + +*** Whether an execution space can access a given memory space. +*** Whether data can be assigned or copied between different memory spaces. + +**** + diff --git a/docs/modules/kokkos/pages/basic-concepts/index.adoc b/docs/modules/kokkos/pages/basic-concepts/index.adoc index e69de29b..dfbb0ecc 100644 --- a/docs/modules/kokkos/pages/basic-concepts/index.adoc +++ b/docs/modules/kokkos/pages/basic-concepts/index.adoc @@ -0,0 +1,22 @@ += Basic concepts + + +The content of this part has been constructed from the slides from the https://indico.math.cnrs.fr/event/12037/[Kokkos training days] held in 2024. + + +== Other resources + +- Organization Kokkos on GitHub: https://github.com/kokkos[github.com/kokkos], +- Slides, recording and Q&A for the Lectures: https://github.com/kokkos/kokkos-tutorials/wiki/Kokkos-Lecture-Series[github.com/kokkos/kokkos-tutorials/wiki], +- Kokkos Core Wiki, with API documentation: https://kokkos.github.io/kokkos-core-wiki[kokkos.github.io/kokkos-core-wiki], +- Slack channel for Kokkos: https://kokkos.slack.com[kokkosteam.slack.com]. 
+ + +== Contents + +- xref:basic-concepts/views.adoc[Views] +- xref:basic-concepts/execution-spaces.adoc[Execution Spaces] +- xref:basic-concepts/memory-spaces.adoc[Memory Spaces] +- xref:basic-concepts/memory-access-patterns.adoc[Memory Access Patterns] + + diff --git a/docs/modules/kokkos/pages/basic-concepts/memory-access-patterns.adoc b/docs/modules/kokkos/pages/basic-concepts/memory-access-patterns.adoc new file mode 100644 index 00000000..8036dac5 --- /dev/null +++ b/docs/modules/kokkos/pages/basic-concepts/memory-access-patterns.adoc @@ -0,0 +1,126 @@ += Kokkos Memory Access Patterns + +== Introduction + +[.text-justify] +In the realm of high-performance computing, managing memory access patterns is crucial for achieving optimal performance across diverse hardware architectures. The View's Layout parameter plays a pivotal role in determining how data is organized in memory. Kokkos offers several layout options, including LayoutRight, LayoutLeft, and LayoutStride [1]. LayoutRight, which is typically the default for CPU architectures, organizes data in a row-major format, where elements of the rightmost dimension are contiguous in memory. Conversely, LayoutLeft, often preferred for GPU architectures, uses a column-major layout. This flexibility allows developers to tailor the data organization to the specific requirements of their target hardware, maximizing performance across different platforms. + +[.text-justify] +Memory access patterns in Kokkos are intricately linked to how parallel work indices are mapped to the layout of multidimensional array data. The library provides a sophisticated mapping mechanism that aligns the iteration space of parallel computations with the underlying memory layout. This alignment is critical for performance, as it directly impacts how efficiently the hardware can fetch and process data [2]. 
For instance, when using a LayoutRight View on a CPU, iterating over the rightmost dimension in the innermost loop of a parallel_for construct will result in cache-friendly, stride-1 memory accesses [3]. + +[.text-justify] +The significance of proper memory access patterns and layouts cannot be overstated when it comes to performance. On CPUs, well-aligned access patterns lead to efficient cache utilization, reducing memory latency and improving overall throughput. The importance of caching is particularly evident in operations like inner products, where repeated access to the same data can benefit greatly from cache locality [2]. On GPUs, coalesced memory accesses are paramount. When threads in a warp access contiguous memory locations, the GPU can combine these accesses into fewer, larger transactions, significantly boosting memory bandwidth utilization [4]. + + +== Managing Memory Access Patterns + +Memory access patterns play a pivotal role in achieving performance portability with Kokkos. The library provides mechanisms to control data layout and optimize memory access for different architectures. + +=== View's Layout Parameter and Data Layout Control + +The View's Layout parameter in Kokkos is a powerful tool for controlling data layout: + +** Kokkos provides different layout options, primarily `LayoutRight` and `LayoutLeft`. +** `LayoutRight` is typically the default for CPUs, representing row-major order. +** `LayoutLeft` is often the default for GPUs, representing column-major order. +** These layouts determine how multidimensional data is stored in memory. + +For example, to create a 2D view with a specific layout: +[source,c++] +---- +Kokkos::View view2D("view2D", 64, 64); +---- +This creates a 64x64 2D array with a row-major layout. 
+ +=== Kokkos Mapping and Memory Access Patterns + +Kokkos maps parallel work indices to the layout of multidimensional array data: + +** The mapping aims to provide efficient access if iteration indices correspond to the first index of the array. +** This mapping is crucial for performance, as it determines how threads access memory. + +Consider this example: +[source,c++] +---- +View view(...); +Kokkos::parallel_for("Label", ..., +KOKKOS_LAMBDA (int workIndex) { + view(workIndex, ..., ...) = ...; // Efficient access + view(..., workIndex, ...) = ...; // Less efficient +}); +---- + +Here, accessing the view with `workIndex` as the first parameter is more efficient due to the default layout and mapping. + +=== Performance Impact of Memory Access Patterns and Layouts + +To illustrate the performance impact of different memory configurations, consider a simple inner product computation. When implemented with a LayoutRight View on a CPU, the operation benefits from efficient cache usage as it iterates over contiguous memory. However, the same layout on a GPU may lead to uncoalesced memory accesses, potentially reducing performance by an order of magnitude or more [2]. Conversely, a LayoutLeft View would provide coalesced accesses on the GPU but might suffer from cache thrashing on the CPU. This example underscores the importance of selecting the appropriate layout for each target architecture to achieve optimal performance. + +Therefore, the significance of memory access patterns and layouts on performance cannot be overstated: + +** On CPUs, proper access patterns lead to effective caching, reducing memory latency. +** On GPUs, coalesced memory access is crucial for performance, where adjacent threads access adjacent memory locations. +** Misaligned or non-coalesced access can lead to significant performance degradation, potentially by more than 10x on GPUs. 
+ +Concrete Example of Memory Configuration Performance + +image::kokkos-layout-Left-Right.png[xref=#fragment101,width=483,height=270] + +Let's consider a simple inner product computation: +[source,c++] +---- +Kokkos::parallel_reduce("Label", +RangePolicy(0, N), +KOKKOS_LAMBDA (const size_t row, double& valueToUpdate) { + double thisRowsSum = 0; + for (size_t entry = 0; entry < M; ++entry) { + thisRowsSum += A(row, entry) * x(entry); + } + valueToUpdate += y(row) * thisRowsSum; +}, result); +---- + +Remark in this example: + +** For a CPU with `LayoutRight`, this access pattern is cache-friendly. + +** For a GPU with `LayoutLeft`, the access to `A(row, entry)` might not be coalesced, potentially leading to performance issues. + +To optimize for both architectures, you might need to transpose the data or use different layouts for different devices. That's all. + +... + +== References + +** [1] https://github.com/CExA-project/cheat-sheet-for-kokkos/blob/main/utilization.md +** [2] https://www.nersc.gov/assets/Uploads/Kokkos-training-Day1-NewUsers-Bruno-v2.pdf +** [3] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/View.html +** [4] https://aiichironakano.github.io/cs653/Edwards-Kokkos-JPDC14.pdf + + + +.*Points to keep in mind* +**** + + +* *Important concept concerning layout* + +** Every *View* has multidimensional array Layout set at compile-time +** Most-common layouts are *LayoutLeft* and *LayoutRight*. +** Layouts are extensible and flexible +** If no layout specified, default for that memory space is used. *LayoutLeft* for *CudaSpace*, *LayoutRight* for *HostSpace*. 
+** LayoutRight row-major HostSpace: cached (good),CudaSpace: uncoalesced (bad) +** LayoutLeft column-major HostSpace: uncached (bad),CudaSpace: coalesced (good) +** Kokkos architecture-dependent HostSpace: cached (good) CudaSpace: coalesced (good) + +* *Performance* + +** For performance, accesses to views in *HostSpace* must *be cached*, while access to views in *CudaSpace* must be *coalesced*. +** *Uncoalesced access* on *GPUs* and *non-cached loads* on CPUs greatly *reduces performance (can be 10X)* +** Kokkos maps parallel work indices and multidimensional array layout for *performance portable memory access patterns*. + +* *Memory spaces available* +** HostSpace, CudaSpace, CudaUVMSpace, ... more +** Remark here is no UVMSpace for HIP, In the meantime, another strategy will have to be used. + +**** diff --git a/docs/modules/kokkos/pages/basic-concepts/memory-spaces.adoc b/docs/modules/kokkos/pages/basic-concepts/memory-spaces.adoc index e69de29b..18ab91d7 100644 --- a/docs/modules/kokkos/pages/basic-concepts/memory-spaces.adoc +++ b/docs/modules/kokkos/pages/basic-concepts/memory-spaces.adoc @@ -0,0 +1,135 @@ += Kokkos Memory Spaces + + +image::kokkos-node-doc.png[xref=#fragment101,width=322,height=181] + +== Introduction + +[.text-justify] +Kokkos, a performance portability programming model, introduces the concept of _Memory Spaces_ as a fundamental abstraction to address the complexities of heterogeneous computing environments. Memory Spaces [1] in Kokkos represent distinct memory areas with specific characteristics and accessibility patterns. These abstractions enable programmers to express algorithms independently of hardware specifics while maintaining control over data placement and movement. The Kokkos machine model envisions future computing nodes as complex systems with multiple execution units and memory hierarchies. + +[.text-justify] +In heterogeneous nodes, Kokkos' space abstractions prove particularly valuable. 
Such nodes may include CPU cores, GPU accelerators, and other specialized processing units, each with access to different memory types like host memory, device memory, or high-bandwidth memory. Kokkos abstracts these hardware-specific details, allowing developers to focus on algorithm structure rather than platform intricacies [2][3]. + +[.text-justify] +To control data residence, Kokkos provides mechanisms to specify the desired Memory Space when creating Views. A View in Kokkos is a multidimensional array abstraction encapsulating both data and layout. By specifying the appropriate Memory Space template parameter, developers can dictate where data should reside, enabling optimizations based on access patterns and hardware characteristics. + +[.text-justify] +Kokkos annotation macros play a vital role in achieving portability across architectures, allowing developers to provide hints and directives to the runtime. These macros are particularly important in performance-critical code sections, enabling expression of parallelism, memory access, and execution space preferences. + +[.text-justify] +In conclusion, Kokkos' Memory Spaces, initialization/finalization procedures, and annotation macros form a cohesive framework for developing portable, high-performance code for heterogeneous computing environments, allowing efficient utilization of diverse hardware resources while maintaining a single, portable codebase. + + +== Instances of Kokkos Memory Spaces + +[.text-justify] +Memory spaces in Kokkos are dynamic and flexible, offering programmers the ability to allocate data across various memory types, including on-package memory, DRAM, and non-volatile memories. Each memory space has specific instances that enable precise data storage allocation, with flexibility for developers to strategically choose memory locations for different data structures. As follows : + +* *Memory Spaces* : + - Memory spaces, like execution spaces, have specific instances. 
+ - An instance of a memory space allows the programmer to request data storage allocations. + - Different types of memory are available, such as on-package memory, slower DRAM, and non-volatile memories. + - GPUs may have their own local memory space. + +* *Memory Allocation*: + - The programmer can choose where to allocate each data structure. + - Kokkos provides abstraction for allocation routines and memory management operations. + +* *Atomic Accesses*: + - Used to prevent race conditions when multiple threads access the same memory address. + - Atomic operations ensure that a read, simple computation, and write to memory are completed as a single unit. + +* *Memory Consistency*: + - Kokkos assumes a very weak memory consistency model. + - Programmers should not assume any specific ordering of memory operations within a kernel. + - Kokkos provides a fence operation to ensure completion of memory operations. + + +== Illustration of some memory space concepts + +* *Memory Spaces and Allocation* + +** Allocating in default memory space: +[source, c++] +---- + Kokkos::View defaultView("defaultView", 1000); +---- +** Allocating in CUDA memory space: +[source, c++] +---- + Kokkos::View cudaView("cudaView", 1000); +---- +** Allocating in host memory space: +[source, c++] +---- + Kokkos::View hostView("hostView", 1000); + +---- +* *Atomic Accesses* + +** Example of atomic addition: + +[source, c++] +---- + KOKKOS_INLINE_FUNCTION + void atomicAdd(int* addr, int val) { + Kokkos::atomic_add(addr, val); + } +---- + +* *Memory Consistency* + +** Using fence operation: +[source, c++] +---- + Kokkos::fence(); +---- + +* *Memory Space Instances* + +** Creating a memory pool for custom allocation: +[source, c++] +---- + Kokkos::MemoryPool memoryPool(cudaSpace, totalSize); + void* ptr = memoryPool.allocate(allocSize); +---- + +[.text-justify] +These examples demonstrate how Kokkos allows flexible memory allocation across different memory spaces, provides atomic operations for 
thread-safe memory access, and offers memory consistency control through fencing. The MemoryPool example shows how specific instances of memory spaces can be used for custom allocation strategies [5]. + +... + +== References + +** [1] https://kokkos.org/kokkos-core-wiki/API/core/memory_spaces.html +** [2] https://www.epj-conferences.org/articles/epjconf/pdf/2021/05/epjconf_chep2021_03034.pdf +** [3] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/HierarchicalParallelism.html +** [4] https://kokkos.org/kokkos-core-wiki/ProgrammingGuide/Machine-Model.html +** [5] https://docs.trilinos.org/dev/packages/kokkos/doc/html/ + + + + +.*Points to keep in mind* +**** + +* *Memory Spaces* + +Memory Spaces represent the places where data physically resides. They specify: + +*** The physical location of the data (e.g., CPU memory, GPU memory). +*** The access characteristics to that data. + +Each execution space in Kokkos has a default Memory Space associated with it. + + +The main difference between *Memory Spaces* and *Space Accessibility* is that: + +** Memory Spaces define **where** data is stored. +** Space Accessibility determines **how** these spaces can interact with each other. + +Space Accessibility therefore provides an additional layer of abstraction to manage the complex relationships between different spaces in a heterogeneous architecture. + +**** + diff --git a/docs/modules/kokkos/pages/basic-concepts/mirrors.adoc b/docs/modules/kokkos/pages/basic-concepts/mirrors.adoc new file mode 100644 index 00000000..b544e8dd --- /dev/null +++ b/docs/modules/kokkos/pages/basic-concepts/mirrors.adoc @@ -0,0 +1,121 @@ += Mirrors + +== Overview + +_Mirrors_ are views that reference data in a possibly different memory space. +They are used to access data in a different memory space without copying it. 
+ +.Mirroring schematic +[source, c++] +---- +Kokkos::View view(...); +auto host_mirror = Kokkos::create_mirror_view(view); +---- + +image::kokkos-mirrors-schematic.png[] + +Two views are created: `view` in the memory space `Space` and `host_mirror` in the host memory space. +The data are copied back and forth between the two views using the `Kokkos::deep_copy` function. + + +== Mirroring pattern + +. *Create* a `view` in a specific memory space. + +[source, c++] +---- +Kokkos::View view(...); +---- + +[start=2] +. *Create* a mirror view in the host memory space, `host_mirror`. + +[source, c++] +---- +auto host_mirror = Kokkos::create_mirror_view(view); +---- + +[start=3] +. *Populate* `host_mirror` with data (from file, user input, etc.). +. *Copy* data from `host_mirror` to `view` using `Kokkos::deep_copy`. +. *Perform* computations on `view`: + +[source, c++] +---- +Kokkos::parallel_for("Operation", + RangePolicy(0, view.extent(0)), + KOKKOS_LAMBDA(...) { /* use and change view */ } +); +---- + +[start=6] +. If needed, *copy* data back to `host_mirror` using `Kokkos::deep_copy`. + +[source, c++] +---- +Kokkos::deep_copy(host_mirror, view); +---- + + +NOTE: `create_mirror_view` allocates data only if the host process cannot access `view`'s data. +Otherwise, it returns a view that references the same data as `view`. +The ++command++ `create_mirror` always allocates data. + +== Example + +We present the https://github.com/kokkos/kokkos-tutorials/tree/main/Exercises/04[example `04`] from the Kokkos tutorial. + + +.Allocate the matrices and vectors on the device +[source, cpp] +---- +typedef Kokkos::View ViewVectorType; +typedef Kokkos::View ViewMatrixType; +ViewVectorType y( "y", N ); +ViewVectorType x( "x", M ); +ViewMatrixType A( "A", N, M ); + +// Create host mirrors of device views. 
+ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view( y ); +ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view( x ); +ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view( A ); +---- + +.Deep copy host views to device views. +[source, cpp] +---- +Kokkos::deep_copy( y, h_y ); +Kokkos::deep_copy( x, h_x ); +Kokkos::deep_copy( A, h_A ); +---- + +Then the code computes the quantity stem:[\left]. + +The full code is present in xref:basic-concepts/mirrors_sol_code.adoc[the file `05_kokkos_mirrors.cpp`]. + + +=== Performance comparisons + +On `gaya`: + +[source, bash] +---- +./kokkos_mirros -nrepeat 1000 + Total size S = 4194304 N = 4096 M = 1024 +Kokkos::HostSpace + Computed result for 4096 x 1024 is 4194304.000000 + N( 4096 ) M( 1024 ) nrepeat ( 1000 ) problem( 33.5954 MB ) time( 0.323193 s ) bandwidth( 103.949 GB/s ) +---- + + +On `gaya-gpu`: + +[source, bash] +---- +./kokkos_mirros -nrepeat 1000 + Total size S = 4194304 N = 4096 M = 1024 +Kokkos::HIPSpace + Computed result for 4096 x 1024 is 4194304.000000 + N( 4096 ) M( 1024 ) nrepeat ( 1000 ) problem( 33.5954 MB ) time( 0.485132 s ) bandwidth( 69.25 GB/s ) +---- + diff --git a/docs/modules/kokkos/pages/basic-concepts/mirrors_sol_code.adoc b/docs/modules/kokkos/pages/basic-concepts/mirrors_sol_code.adoc new file mode 100644 index 00000000..254c8340 --- /dev/null +++ b/docs/modules/kokkos/pages/basic-concepts/mirrors_sol_code.adoc @@ -0,0 +1,16 @@ +xref:basic-concepts/mirrors.adoc#Example[Back]. 
+ + +[%dynamic, cmake] +---- +cmake_minimum_required(VERSION 3.16) +project(KokkosTutorial) + +add_executable(kokkos_mirror 05_kokkos_mirrors.cpp) +target_link_libraries(kokkos_mirror Kokkos::kokkos) +---- + +[source, cpp, filename="05_kokkos_mirrors.cpp", compile=cmake] +---- +include::example$src/05_kokkos_mirrors.cpp[] +---- diff --git a/docs/modules/kokkos/pages/basic-concepts/views.adoc b/docs/modules/kokkos/pages/basic-concepts/views.adoc index e69de29b..7ed14158 100644 --- a/docs/modules/kokkos/pages/basic-concepts/views.adoc +++ b/docs/modules/kokkos/pages/basic-concepts/views.adoc @@ -0,0 +1,115 @@ += Views + +== What is Kokkos View ? + +[.text-justify] +Kokkos Views are a fundamental abstraction in the Kokkos programming model, designed to provide a portable and efficient way to manage multidimensional arrays across diverse computing architectures. These Views serve as the primary data structure in Kokkos, offering a powerful mechanism for handling data in high-performance computing applications. +[.text-justify] +Views can have up to eight dimensions, and these dimensions can be specified at either compile-time or run-time. The flexibility in dimension specification allows for efficient memory management and optimization across different hardware architectures. +[.text-justify] +The motivation behind the View abstraction stems from the evolving landscape of high-performance computing. The memory space, which can be explicitly specified as an additional template parameter, determines where the data resides and which execution units can access it. +[.text-justify] +The View life cycle is an essential aspect of using Kokkos effectively. When a View is constructed, it allocates memory in the specified memory space. Kokkos employs a reference-counting mechanism to manage the lifetime of this allocation. As Views are copied or assigned, the reference count is adjusted accordingly. 
The View abstraction is part of Kokkos' broader machine model, which assumes a hierarchy of execution spaces and memory spaces. This model anticipates future shared-memory computing architectures, where nodes might contain multiple types of compute engines and memory systems. By abstracting these details through Views, Kokkos allows developers to write code that can adapt to evolving hardware landscapes without significant rewrites. +[.text-justify] +In the context of high-performance computing, Kokkos Views shine in their ability to handle large, multidimensional data structures efficiently. They are particularly useful in scientific simulations, linear algebra operations, and other computationally intensive tasks. + + + +== View key concepts and template parameters + + +A _view_ is a lightweight object that provides a way to access data in a multi-dimensional array. +They behave like pointers, so they can be used in the same way as pointers in (C++). + +[source,c++] +---- +View x(...), y(...); +... + +parallel_for("DAXPY", N, [=] const int64_t i { + // Views x and y are captured by value (copy) + y(i) = a * x(i) + y(i); +}); +---- + +Some general aspects of views: + +* They are *multi-diemensional arrays* of dimension 0 (scalar), 1 (vector), 2 (matrix), etc (to 8). Their number of dimension (named _rank_) is fixed at compilation. +* They are *rectangular* arrays, i.e., all dimensions are fixed at construction, and +* the sizes of dimensions are set either at compile-time or runtime. +* The elements can easily be accessed via the operator `()`. + +.Example of a 3D array with dimension set either at compilation or at runtime +[source,c++] +---- +View data("label", N, M, K); // 3 at runtime, 0 at compilation +View data("label", N, M); // 2 at runtime, 1 at compilation +View data("label"); // 3 at compilation + +data(i, j, k) = 3.14; +---- + +NOTE: The `label` of the view is not mandatory, but it is useful for debugging and profiling. 
+ + +== Usage + + +.*First example:* build and fill-up a view +[tabs] +[%collapsible] +==== +1D example:: ++ +[source,c++] +---- +include::example$src/01_views_1D.cpp[indent=0] +---- + +2D example:: ++ +[source,c++] +---- +include::example$src/02_views_2D.cpp[indent=0] +---- + +==== + + +Views behave like `std::shared_ptr` in the sense that they are reference-counted objects. They are automatically deleted when the last reference to them is removed: + +[source, c++] +---- +include::example$src/00_views.cpp[indent=0] +---- +.Result +[source] +---- +Label of c: b +a(0, 2) = 3 +---- + + +== Properties + +Views have several properties that can be queried at runtime: + +* `label()`: the label of the view +* `rank()`: the number of dimensions of the view +* `extent(i)`: the size of the `i`-th dimension +* `span()`: the total number of elements in the view +* `data()`: a pointer to the data +* `operator()`: access to the data + + +== Exercice + +Exercice taken from the Kokkos tutorial: xref:basic-concepts/views_exercice_tutorial.adoc[Inner Product, Flat Parallelism an the CPU, with Views]. + + + +[bibliography] +== References + +. Kokkos documentation API https://kokkos.org/kokkos-core-wiki/API/core/view/view.html +. Kokkos package description https://docs.trilinos.org/dev/packages/kokkos/doc/html/classKokkos_1_1View.html \ No newline at end of file diff --git a/docs/modules/kokkos/pages/basic-concepts/views_exercice_tutorial.adoc b/docs/modules/kokkos/pages/basic-concepts/views_exercice_tutorial.adoc new file mode 100644 index 00000000..ab917b6a --- /dev/null +++ b/docs/modules/kokkos/pages/basic-concepts/views_exercice_tutorial.adoc @@ -0,0 +1,85 @@ += Inner Product, Flat Parallelism and the CPU, with Views + +Exercise 02 taken from the https://github.com/kokkos/kokkos-tutorials[Kokkos tutorial]. +The initial code file to complete can be found on https://github.com/kokkos/kokkos-tutorials/blob/main/Exercises/02/Begin/exercise_2_begin.cpp[this link]. 
+ +== Problem statement + +The code provided in the exercise is a simple matrix-vector multiplication: + +[stem] +++++ +y = A * x +++++ + +.Goals of the exercise +**** +Replace raw allocations with Kokkos Views. + +1. Define device views. +2. Replace data access with view access operators. +**** + +.Run the exercise +[source, sh] +---- +./04_kokkos_exercise_views -S 26 +---- + +CAUTION: The solution are present just below, so if you wann try it by yourself, stop reading this page ! + +== Solutions + +.Allocating memory +[source, diff] +---- +- double * const y = new double[ N ]; ++ ViewVectorType y( "y", N ); +- double * const x = new double[ M ]; ++ ViewVectorType x( "x", M ); +- double * const A = new double[ N * M ]; ++ ViewMatrixType A( "A", N, M ); +---- + +.Initialize data +[source, diff] +---- +- for ( int i = 0; i < N; ++i ) { +- y[ i ] = 1; +- } ++ Kokkos::parallel_for( N, KOKKOS_LAMBDA ( int i ) { ++ y( i ) = 1; ++ }); + +- for ( int i = 0; i < M; ++i ) { +- x[ i ] = 1; +- } ++ Kokkos::parallel_for( M, KOKKOS_LAMBDA ( int i ) { ++ x( i ) = 1; ++ }); +---- + +.Matrix-vector multiplication +[source, diff] +---- +- for ( int j = 0; j < N; ++j ) { +- for ( int i = 0; i < M; ++i ) { +- A[ j * M + i ] = 1; +- } +- } ++ Kokkos::parallel_for( N, KOKKOS_LAMBDA ( int j ) { ++ for ( int i = 0; i < M; ++i ) { ++ A( j, i ) = 1; ++ } ++ }); +---- + +Concerning deallocation, Kokkos will automatically deallocate the memory when the views go out of scope. 
+ +.Deallocate memory +[source, diff] +---- +- delete [] y; +- delete [] x; +- delete [] A; +---- \ No newline at end of file diff --git a/docs/modules/kokkos/pages/coding/Kokkos_coding.adoc b/docs/modules/kokkos/pages/coding/Kokkos_coding.adoc new file mode 100644 index 00000000..42db2e63 --- /dev/null +++ b/docs/modules/kokkos/pages/coding/Kokkos_coding.adoc @@ -0,0 +1,37 @@ += Kokkos Coding Practice + + + +.Start +[.examp] +**** +This start example illustrates how to do a task reduction and consists in calculating the sum of all elements of an array. +ifeval::[{showproof} >= 2] +.Code Start +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Start.cpp",compile="openmp"] +---- +include::ROOT:example$src/OpenMP/OpenMP_Start.cpp[] +---- +==== +//endif::[] +**** + + + +.Firstprivate +[.examp] +**** +Specifies that each thread should have its own instance of a variable, and that the variable should be initialized with the value of the variable, because it exists before the parallel construct. +.Code Firstprivate +[%collapsible.proof] +==== +[%dynamic,cpp,filename="OpenMP_Firstprivate.cpp",compile="openmp"] +---- +include::ROOT:example$src/OpenMP/OpenMP_Firstprivate.cpp[] +---- +==== +//endif::[] +**** + diff --git a/docs/modules/kokkos/pages/coding/MCQ_001.adoc b/docs/modules/kokkos/pages/coding/MCQ_001.adoc new file mode 100644 index 00000000..0bc23448 --- /dev/null +++ b/docs/modules/kokkos/pages/coding/MCQ_001.adoc @@ -0,0 +1,98 @@ + += Welcome to MCQ Kokkos + +*General question about Kokkos* + +++++ + + +
+

1. What is Kokkos ?

+ A programming language
+ A C++ programming model for performance portability
+ An operating system
+ A compiler
+ +

+
+ +
+

2. What is the main objective of Kokkos ?

+ Optimize performance on a single type of hardware
+ Replace CUDA and OpenMP
+ Ensure performance portability across different HPC architectures
+ A compiler
+ +

+
+ +
+

3. Which backends does Kokkos support ?

+ CUDA only
+ OpenMP only
+ CUDA, OpenMP, and HPX
+ Only CPUs
+ +

+
+ +
+

4. What does a “View” represent in Kokkos ?

+ A graphics window
+ A simple pointer
+ A multidimensional array with memory management
+ A tree data structure
+ +

+
+ +
+

5. How does Kokkos manage memory for Views ?

+ Manually by the programmer
+ Automatically with a reference counter
+ Only on CPU
+ Without any memory management
+ +

+
+ +
+

6. What is special about the dimensions of a Kokkos View ?

+ They are always dynamic
+ They are always static
+ They can be defined at compile time or at run time
+ They are limited to 3 dimensions maximum
+ +

+
+ +
+

7. How is a Kokkos View copied ?

+ Default deep copy
+ Shallow copy by default
+ Unable to copy a View
+ Copy only to the same type of memory
+ +

+
+ +
+

8. What tool does Kokkos provide for profiling and debugging ?

+ KokkosDebug
+ KokkosProfile
+ KokkosP
+ KokkosAnalyzer
+ +

+
+++++ \ No newline at end of file diff --git a/docs/modules/kokkos/pages/coding/MCQ_002.adoc b/docs/modules/kokkos/pages/coding/MCQ_002.adoc new file mode 100644 index 00000000..f7111ef3 --- /dev/null +++ b/docs/modules/kokkos/pages/coding/MCQ_002.adoc @@ -0,0 +1,98 @@ + += Welcome to MCQ Kokkos + +*Quiz on the concept of View* + +++++ + + +
+

1. What is a Kokkos View ?

+ A simple pointer
+ A multidimensional array with memory management
+ A tree data structure
+ A graphics window
+ +

+
+ +
+

2. How does Kokkos manage memory for Views ?

+ Manually by the programmer
+ Automatically with a reference counter
+ Only on CPU
+ Without any memory management
+ +

+
+ +
+

3. What is special about the dimensions of a Kokkos View ?

+ They are always dynamic
+ They are always static
+ They can be defined at compile time or at run time
+ They are limited to 3 dimensions maximum
+ +

+
+ +
+

4. How is a Kokkos View copied ?

+ Default deep copy
+ Shallow copy by default
+ Unable to copy a View
+ Copy only to the same type of memory
+ +

+
+ +
+

5. What is the maximum number of dimensions for a Kokkos View ?

+ 3
+ 5
+ 8
+ Unlimited
+ +

+
+ +
+

6. What does the Layout parameter represent in a Kokkos View ?

+ The size of the View
+ The organization of data in memory
+ The type of data stored
+ The execution space
+ +

+
+ +
+

7. What is special about the default layout on CUDA GPU ?

+ LayoutRight
+ LayoutLeft
+ LayoutRandom
+ No default layout
+ +

+
+ +
+

8. How do you access the elements of a Kokkos View ?

+ With brackets [ ]
+ With parentheses ( )
+ With the .at() method
+ With the .get() method
+ +

+
+++++ diff --git a/docs/modules/kokkos/pages/coding/MCQ_003.adoc b/docs/modules/kokkos/pages/coding/MCQ_003.adoc new file mode 100644 index 00000000..2c28333a --- /dev/null +++ b/docs/modules/kokkos/pages/coding/MCQ_003.adoc @@ -0,0 +1,98 @@ + += Welcome to MCQ Kokkos + +*Quiz on the concept of mirrors* + +++++ + + +
+

1. What are mirrors in Kokkos ?

+ Exact copies of the Views
+ Views of equivalent arrays potentially residing in different memory spaces
+ Pointers to GPU memory
+ Data structures for debugging
+ +

+
+ +
+

2. What Kokkos function is used to create a mirror ?

+ Kokkos::create_mirror()
+ Kokkos::create_mirror_view()
+ Kokkos::make_mirror()
+ Answers a and b are correct
+ +

+
+ +
+

3. What does the create_mirror_view() function do ?

+ It always allocates memory
+ It never allocates memory
+ It allocates memory only if the host process cannot access the view data
+ It always copies the data
+ +

+
+ +
+

4. What is the difference between create_mirror() and create_mirror_view() ?

+ create_mirror() always allocates memory, create_mirror_view() only if needed
+ create_mirror() only works on GPU, create_mirror_view() only on CPU
+ There is no difference
+ create_mirror() copies data, create_mirror_view() does not
+ +

+
+ +
+

5. Does Kokkos automatically perform a deep copy when creating a mirror ?

+ Yes, always
+ No, never
+ Only if specified
+ Only for GPU views
+ +

+
+ +
+

6. What function is used to copy data between a view and its mirror ?

+ Kokkos::copy()
+ Kokkos::deep_copy()
+ Kokkos::mirror_copy()
+ Kokkos::transfer()
+ +

+
+ +
+

7. What is the main benefit of using mirrors in Kokkos ?

+ Increase the calculation speed
+ Facilitate the transfer of data between different memory spaces
+ Reduce memory usage
+ Simplify the code syntax
+ +

+
+ +
+

8. When does a mirror reference the same data as the original view ?

+ Always
+ Never
+ When the original view is in the host's memory space
+ When the original view is in the device's memory space
+ +

+
+++++ \ No newline at end of file diff --git a/docs/modules/kokkos/pages/coding/MCQ_004.adoc b/docs/modules/kokkos/pages/coding/MCQ_004.adoc new file mode 100644 index 00000000..099c0314 --- /dev/null +++ b/docs/modules/kokkos/pages/coding/MCQ_004.adoc @@ -0,0 +1,98 @@ + += Welcome to MCQ Kokkos + +*Quiz on the concept of Execution Spaces* + +++++ + + +
+

1. What does an Execution Space represent in Kokkos?

+ A memory space
+ A logical grouping of computing units sharing identical performance properties
+ A type of View
+ A parallel algorithm
+ +

+
+ +
+

2. What is the function of an Execution Space?

+ Manage memory
+ Provide parallel execution resources
+ Compile the code
+ Optimize algorithms
+ +

+
+ +
+

3. What is the default Execution Space called in Kokkos?

+ Kokkos::DefaultExecutionSpace
+ Kokkos::MainExecutionSpace
+ Kokkos::PrimaryExecutionSpace
+ Kokkos::BaseExecutionSpace
+ +

+
+ +
+

4. What method is used to synchronize an Execution Space?

+ sync()
+ wait()
+ fence()
+ barrier()
+ +

+
+ +
+

5. What macro is used to make a function compatible with different Execution Spaces?

+ KOKKOS_FUNCTION
+ KOKKOS_INLINE_FUNCTION
+ KOKKOS_EXEC_FUNCTION
+ KOKKOS_PORTABLE_FUNCTION
+ +

+
+ +
+

6. Which Execution Space is always set, regardless of the backend used?

+ Kokkos::Cuda
+ Kokkos::OpenMP
+ Kokkos::Serial
+ Kokkos::Threads
+ +

+
+ +
+

7. How do you specify the Execution Space in an execution policy?

+ Kokkos::RangePolicy(...)
+ Kokkos::Policy(...)
+ Kokkos::ExecPolicy(...)
+ Kokkos::ParallelPolicy(...)
+ +

+
+ +
+

8. What is the relationship between an Execution Space and a Memory Space in Kokkos?

+ They are always the same
+ They are always different
+ An Execution Space has a default Memory Space associated with it
+ There is no relationship between the two
+ +

+
+++++ \ No newline at end of file diff --git a/docs/modules/kokkos/pages/coding/MCQ_005.adoc b/docs/modules/kokkos/pages/coding/MCQ_005.adoc new file mode 100644 index 00000000..cb0b2108 --- /dev/null +++ b/docs/modules/kokkos/pages/coding/MCQ_005.adoc @@ -0,0 +1,99 @@ + += Welcome to MCQ Kokkos + +*Quiz on the concept of advanced reduction* + +++++ + + + +
+

1. What is the default reducer in Kokkos?

+ Max
+ Min
+ Sum
+ Prod
+ +

+
+ +
+

2. What operation does the Kokkos::BAnd reducer perform?

+ Binary addition
+ Binary AND
+ Binary OR
+ Binary XOR
+ +

+
+ +
+

3. What reducer allows us to find both the minimum value and its index?

+ Min
+ MinLoc
+ MinMax
+ MinMaxLoc
+ +

+
+ +
+

4. How do you specify a custom reducer in a parallel reduction?

+ As a first argument
+ As a final argument
+ In the body of the lambda
+ In a separate function
+ +

+
+ +
+

5. What does the Kokkos::MinMaxLoc reducer do?

+ Find the minimum and maximum
+ Find the minimum and maximum with their indices
+ Find the minimum or maximum, whichever is closer
+ Find the average between the minimum and the maximum
+ +

+
+ +
+

6. Do Kokkos reducers work on variable length arrays?

+ Yes, always
+ No, never
+ Only for certain reducers
+ Only on CPU
+ +

+
+ +
+

7. What is special about reductions in scalar variables?

+ They are faster
+ They are guaranteed to be synchronous
+ They only work on CPU
+ They require a special reducer
+ +

+
+ +
+

8. Is it possible to make multiple reductions simultaneously in Kokkos?

+ No, never
+ Yes, but only two at a time
+ Yes, with identical data types
+ Yes, even with different data types
+ +

+
+++++ \ No newline at end of file diff --git a/docs/modules/kokkos/pages/coding/MCQ_006.adoc b/docs/modules/kokkos/pages/coding/MCQ_006.adoc new file mode 100644 index 00000000..d09e1324 --- /dev/null +++ b/docs/modules/kokkos/pages/coding/MCQ_006.adoc @@ -0,0 +1,98 @@ + += Welcome to MCQ Kokkos + +*Quiz on the concept of Hierarchical Parallelism* + +++++ + + +
+

1. What is a thread team in Kokkos?

+ A group of CPUs
+ A collection of threads that can synchronize and share scratch memory
+ A set of GPUs
+ A type of Kokkos View
+ +

+
+ +
+

2. How many levels of nested parallelism does Kokkos allow?

+ 2
+ 3
+ 4
+ Unlimited
+ +

+
+ +
+

3. What execution policy is used for hierarchical parallelism?

+ RangePolicy
+ TeamPolicy
+ HierarchyPolicy
+ NestedPolicy
+ +

+
+ +
+

4. What is the finest level of parallelism in Kokkos?

+ League
+ Team
+ Thread
+ Vector
+ +

+
+ +
+

5. What method is used to synchronize threads within a team?

+ team_sync()
+ team_barrier()
+ team_wait()
+ team_fence()
+ +

+
+ +
+

6. What type of memory is shared between threads in the same team?

+ Global memory
+ Shared memory
+ Scratch memory
+ Team memory
+ +

+
+ +
+

7. What policy is used for thread-level parallelism in a team?

+ TeamThreadRange
+ ThreadTeamRange
+ TeamParallelRange
+ ParallelTeamRange
+ +

+
+ +
+

8. What is the main advantage of hierarchical parallelism in Kokkos?

+ Simplification of the code
+ Reduction of memory consumption
+ Better exploitation of different levels of hardware parallelism
+ Increased accuracy of calculations
+ +

+
+++++ \ No newline at end of file diff --git a/docs/modules/kokkos/pages/coding/MCQ_007.adoc b/docs/modules/kokkos/pages/coding/MCQ_007.adoc new file mode 100644 index 00000000..cad57d36 --- /dev/null +++ b/docs/modules/kokkos/pages/coding/MCQ_007.adoc @@ -0,0 +1,105 @@ += Welcome to MCQ Kokkos + +*Quiz on the concept of Asynchronicity, Streams, and Task Parallelism* + +++++ + + + +
+

1. What does asynchronous execution mean in Kokkos?

+ Operations are always executed in order
+ Operations may return before they are completed
+ Operations cannot be parallelized
+ Operations are always synchronous
+ +

+
+ + +
+

2. How do you force completion of all pending operations in Kokkos?

+ With the wait() function
+ With the sync() function
+ With the fence() function
+ With the complete() function
+ +

+
+ + +
+

3. What is the main purpose of Task Parallelism in Kokkos?

+ Optimize memory usage
+ Manage dependencies between irregular tasks
+ Replace data parallelism
+ Simplify sequential programming
+ +

+
+ + +
+

4. How do you create a task in Kokkos?

+ With task.create()
+ With policy.create()
+ With Kokkos::create_task()
+ With new Task()
+ +

+
+ + +
+

5. What does the task respawn mechanism in Kokkos allow?

+ Create a new task
+ Complete a current task
+ Restart a task without waiting for its dependencies
+ Delete a task
+ +

+
+ + +
+

6. What Kokkos concept allows operations to be executed in parallel on different streams?

+ TaskPolicy
+ StreamPolicy
+ Runtime Space Instances
+ ParallelPolicy
+ +

+
+ + +
+

7. How can we implement nested parallelism in a Kokkos task?

+ It is not possible
+ Using TeamThreadRange
+ By creating a new task
+ Only with sequential loops
+ +

+
+ + +
+

8. What is the main advantage of combining Task Parallelism and Data Parallelism in Kokkos?

+ Reduce memory consumption
+ Increase the accuracy of calculations
+ Improve locality and exploit material hierarchy
+ Simplify the code
+ +

+
+++++ \ No newline at end of file diff --git a/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/index.adoc b/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/index.adoc new file mode 100644 index 00000000..bc6a3572 --- /dev/null +++ b/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/index.adoc @@ -0,0 +1,18 @@ += Diagnostic tools + + +The content of this part has ben constructed from the slides from the https://indico.math.cnrs.fr/event/12037/[Kokkos training days] held in 2024. + + +== Other ressources + +- Organization Kokkos on GitHub: https://github.com/kokkos[github.com/kokkos], +- Slides, recording and Q&A for the Lectures: https://github.com/kokkos/kokkos-tutorials/wiki/Kokkos-Lecture-Series[github.com/kokkos/kokkos-tutorials/wiki], +- Kokkos Core Wiki, with API documentation: https://kokkos.github.io/kokkos-core-wiki[kokkos.github.io/kokkos-core-wiki], +- Slack channel for Kokkos: https://kokkos.slack.com[kokkosteam.slack.com]. + + +== Contents + +- xref:diagnostic-tools-algebraic-strategies/kernels-math-library.adoc[Kernels Math library] +- xref:diagnostic-tools-algebraic-strategies/tools-profiling-tuning-debugging.adoc[Tools Profing Tuning Debugging] \ No newline at end of file diff --git a/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/kernels-math-library.adoc b/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/kernels-math-library.adoc new file mode 100644 index 00000000..5418d372 --- /dev/null +++ b/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/kernels-math-library.adoc @@ -0,0 +1,83 @@ += Kokkos Kernels Math Library + +== Introduction + +[.text-justify] +The **Kokkos Kernels Math Library** is a high-performance library specifically designed to provide computational kernels for linear algebra and graph operations. 
Built on top of the Kokkos programming model, it ensures performance portability across diverse hardware architectures, including CPUs, GPUs, and other accelerators. The library supports dense and sparse linear algebra operations, graph computations, and machine learning kernels. It can be used as a standalone library or integrated into larger frameworks like Tpetra for distributed parallelism. + +== Kokkos Ecosystem for Performance Portability +[.text-justify] +The **Kokkos Ecosystem** is a comprehensive framework aimed at achieving performance portability for high-performance computing (HPC) applications. It includes: + +- **Kokkos Core**, which provides abstractions for parallel execution and memory management. + +- **Kokkos Kernels**, offering dense and sparse linear algebra kernels as well as graph algorithms. + +- **Profiling and Debugging Tools**, enabling developers to analyze and optimize their applications. +[.text-justify] +This ecosystem allows developers to write architecture-agnostic code that performs efficiently on both current and future HPC platforms. By leveraging hierarchical parallelism (team-level, thread-level, and vector-level), Kokkos ensures scalability across heterogeneous architectures. + + +== BLAS and LAPACK +[.text-justify] +*Motivation for BLAS/LAPACK Functions* : BLAS (Basic Linear Algebra Subprograms) and LAPACK (Linear Algebra Package) are foundational libraries for numerical computing. They provide highly optimized routines for vector and matrix operations, such as matrix multiplication, eigenvalue computations, and solving linear systems. Their inclusion in Kokkos Kernels ensures that scientific applications can leverage these standard interfaces while benefiting from performance portability. + +[.text-justify] +*Algorithm Specialization for Applications* : Kokkos Kernels supports algorithm specialization to optimize performance on different architectures. 
For example, it provides multiple implementations of key BLAS/LAPACK routines tailored to specific hardware backends like CUDA or OpenMP. +[.text-justify] +*Calling BLAS/LAPACK Functions* : Developers can call BLAS/LAPACK functions using Kokkos' abstractions. These calls are typically embedded within team-level or serial execution contexts to ensure efficient resource utilization. + + +== Batched BLAS and LAPACK +[.text-justify] +*Motivation for Batched Functions* : Batched BLAS/LAPACK functions address scenarios where many small independent linear algebra problems need to be solved simultaneously. This approach minimizes synchronization overhead and improves cache efficiency, making it ideal for applications like finite element methods or particle simulations. + +[.text-justify] +*Two Namespaces with BLAS and LAPACK Functions* : Kokkos Kernels provides two namespaces for batched operations: + +- **Standard Batched BLAS Interfaces**, which mimic traditional BLAS routines. +- **Team-Level Batched Interfaces**, optimized for use within hierarchical parallelism contexts. + +[.text-justify] +*Calling Batched Functions* : Batched functions are invoked using Kokkos' execution policies, enabling parallel processing of multiple small problems in a single call. + + + +== Sparse Linear Algebra +[.text-justify] +Sparse linear algebra is a critical component of scientific computing, especially in simulations involving large but sparsely populated matrices. + +[.text-justify] +*Key Characteristics of Algorithms* : Sparse algorithms in Kokkos Kernels focus on reducing memory usage and optimizing data access patterns. They leverage compressed storage formats to minimize the footprint of sparse matrices. + +*Containers*: CrsMatrix, StaticCrsGraph, Vector + +- **CrsMatrix**: A compressed row storage matrix format. +- **StaticCrsGraph**: Represents the sparsity pattern of a matrix. +- **Vector**: A container optimized for sparse vector operations. 
+ +*Key Operations* : + +- **SpMV (Sparse Matrix-Vector Multiplication)**: Efficiently multiplies a sparse matrix with a dense vector. +- **SpADD (Sparse Matrix Addition)**: Combines two sparse matrices. +- **SpGEMM (Sparse General Matrix-Matrix Multiplication)**: Multiplies two sparse matrices. + + +== Graph Kernels +[.text-justify] +Kokkos Kernels includes graph algorithms essential for tasks like coloring and partitioning. +[.text-justify] +*Distance-1 Graph Coloring* : Assigns colors to vertices such that no two adjacent vertices share the same color. This is useful in scheduling problems or parallel preconditioners. +[.text-justify] +*Distance-2 Graph Coloring* : Ensures that vertices up to two edges apart have distinct colors. This is particularly relevant in higher-order finite element methods. +[.text-justify] +*Bipartite Graph Partial Coloring* : Focuses on bipartite graphs, assigning colors to one set of vertices while considering constraints from the other set. + +... + +== References +** [1] https://github.com/kokkos/kokkos-kernels +** [2] https://kokkos.org/kokkos-core-wiki/ +** [3] https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/link-with-blas-and-lapack-in-Intel-MKL-kit/m-p/1262137?profile.language=fr + + diff --git a/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/tools-profiling-tuning-debugging.adoc b/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/tools-profiling-tuning-debugging.adoc new file mode 100644 index 00000000..abba0b74 --- /dev/null +++ b/docs/modules/kokkos/pages/diagnostic-tools-algebraic-strategies/tools-profiling-tuning-debugging.adoc @@ -0,0 +1,104 @@ += Kokkos Tools: Profiling Tuning Debugging + +== Introduction + +[.text-justify] +Kokkos Tools represent a sophisticated suite of utilities designed to enhance the development and optimization of high-performance computing applications. 
These tools leverage Kokkos' built-in instrumentation to provide developers with powerful capabilities for profiling, debugging, and tuning their code across diverse hardware architectures. + +== Kokkos Tools and Built-in Instrumentation + +*The Need for Kokkos-aware Tools* : + +** Modern heterogeneous computing environments present complex challenges for performance analysis and optimization. +** Traditional profiling and debugging tools often lack context-specific information for Kokkos applications. +** Kokkos-aware tools bridge this gap by interfacing directly with the Kokkos runtime, providing more meaningful insights. + +*How Instrumentation Helps ?* : + +** Kokkos' built-in instrumentation allows for non-intrusive gathering of detailed execution information. +** It tracks critical events such as kernel launches and memory operations without requiring source code modifications. +** This approach minimizes impact on application behavior while still offering rich performance data. + +*Simple Profiling Tools* : + + ** KernelLogger: Helps developers localize errors and verify runtime flow by printing Kokkos operations as they occur [1]. + ** SimpleKernelTimer: Measures time spent in kernels, identifying hotspots and aiding in performance optimization [1]. + ** MemoryEvents: Tracks memory-related events, helping identify issues like excessive temporary allocations [1]. + +*Simple Debugging Tools* : + +** KernelLogger: Acts as a debugging tool by inserting fences that check for errors and printing Kokkos operations [4]. +** These tools can help pinpoint issues in kernel execution and memory management, crucial for complex parallel applications. + + + +== Vendor and Independent Profiling GUIs + +*What Connectors Provide ?* : + +** Connectors translate Kokkos instrumentation for use with vendor-specific and independent profiling tools. +** They bridge the gap between Kokkos' internal instrumentation and external profiling interfaces. 
+** This allows developers to use familiar tools while gaining Kokkos-specific insights. + +*Available Tools* : + +** *nvtx-connector*: Interfaces with NVIDIA tools like Nsight Compute, translating KokkosP hooks into NVTX instrumentation [4]. +** *vtune-focused-connector*: Enables integration with Intel's VTune profiler for detailed performance analysis on Intel architectures. +** *TAU (Tuning and Analysis Utilities)*: Offers built-in support for Kokkos without requiring a separate connector [2]. + + +== Tuning +[.text-justify] +As applications grow in complexity, the need for *tuning* becomes increasingly apparent. Kokkos recognizes this need and provides autotuning hooks to help developers optimize their code for different architectures and workloads. +[.text-justify] +The necessity for *tuning* is evident when considering the myriad of parameters that can affect performance. For instance, in a *sparse matrix-vector multiplication (SpMV)* implementation, factors such as the number of rows per team, team size, and vector length can significantly impact performance across different hardware [5]. Manually determining optimal values for these parameters across various architectures is a daunting and time-consuming task. + + +== Custom Tools + +*The KokkosP Hooks* : + +** KokkosP interface exposes hooks corresponding to various Kokkos runtime events. +** These hooks include kernel launches, memory operations, and region entries/exits. + +*Callback Registration Inside the Application* : + +** Developers implement callback functions for relevant KokkosP hooks. +** These callbacks are registered with the Kokkos runtime to be invoked at appropriate execution points. + +*Throwaway Debugging Tools* : + +** Lightweight, purpose-built tools can be quickly implemented for specific debugging scenarios. +** Example: A tool to log memory allocations exceeding a certain size to identify potential memory leaks. 
+ + +== References +** [1] https://github.com/kokkos/kokkos-tools +** [2] https://kokkos.org/blog/blog-post-09/ +** [3] https://timemory.readthedocs.io/en/develop/ +** [4] https://indico.math.cnrs.fr/event/12037/attachments/5040/8156/KokkosTutorial_07_Tools.pdf +** [5] https://indico.math.cnrs.fr/event/12037/attachments/5040/8156/KokkosTutorial_07_Tools.pdf + + + +.*Points to keep in mind* +**** + +* *Kokkos Tools* +** Kokkos Tools provide an instrumentation interface KokkosP and Tools to leverage it. + +* *Kokkos Connector Tools* +** Connectors inject Kokkos specific information into vendor and academic tools. +** Helps readability of profiles. +** Removes need to put vendor specific instrumentation in codes +** Growing list of tools support Kokkos natively. + +* *Kokkos Tuning Hooks enable more performance portability* +** Avoid figuring out the right heuristic for every platform. +** Input variables describe the problem scope. +** Output variables describe the search space. + +**** + + + diff --git a/docs/modules/kokkos/pages/gaya.adoc b/docs/modules/kokkos/pages/gaya.adoc new file mode 100644 index 00000000..9ed19b58 --- /dev/null +++ b/docs/modules/kokkos/pages/gaya.adoc @@ -0,0 +1,46 @@ += Compile on Gaya + +This page gathers the steps to compile a program with Kokkos and Feel++ on the cluster https://intranet.math.unistra.fr/intra/ressources_num%C3%A9riques/calcul/gaya/[Gaya]. + +The code can be compiled either with the ROCM support (to run on the GPU) or without it. +On this page, we present the two methods. + +== Load the spack environment + +To compile a program with Kokkos and Feel++ on Gaya, you need to load the spack environment. + +[source, bash] +---- +source /data/cemosis/spack/share/spack/setup-env.sh <1> +spacktivate feelpp-openmpi4-rocm-kokkos <2> +spacktivate feelpp-openmpi4-kokkos <3> +---- + +<1> Setup the bash environment, and load spack commands. +<2> Load the environment with ROCM support. +<3> Load the environment without ROCM support.
+ +== Set-up CMake + +[source, bash] +---- +cmake --preset feelpp-clang-cpp20-spack-rocm-kokkos-none-release <1> +cmake --preset feelpp-clang-cpp20-spack-kokkos-none-release <2> +---- + +<1> Set-up the CMake configuration with ROCM support. +<2> Set-up the CMake configuration without ROCM support. + +== Compile the code + +[source, bash] +---- +cmake --build --preset feelpp-clang-cpp20-spack-rocm-kokkos-none-release [-t <target>] +cmake --build --preset feelpp-clang-cpp20-spack-kokkos-none-release [-t <target>] +---- + +(or `make` inside the corresponding build directory) + +== Execute the code + +If you compiled the code with ROCM support, you need to execute it on a GPU node, namely `gaya-gpu`. diff --git a/docs/modules/kokkos/pages/introduction/installation.adoc b/docs/modules/kokkos/pages/introduction/installation.adoc index e69de29b..0af83fe6 100644 --- a/docs/modules/kokkos/pages/introduction/installation.adoc +++ b/docs/modules/kokkos/pages/introduction/installation.adoc @@ -0,0 +1,70 @@ += Installation + +Kokkos is available on a GitHub repository: https://github.com/kokkos/kokkos[github.com/kokkos/kokkos]. + + +== Via CMake + +To build Kokkos with CMake, you can use the following commands: + +NOTE: +[source,sh] +---- +cmake -DKokkos_ENABLE_<BACKEND>=ON +---- + +Where `<BACKEND>` is one of the following: + +[cols="1,1", options="header"] +|=== +| Backend | Description + +| SERIAL +| Enables serial execution backend + +| OPENMP +| Enables OpenMP backend + +| CUDA +| Enables CUDA backend + +| HIP +| Enables HIP backend + +| SYCL +| Enables SYCL backend + +| THREADS +| Enables POSIX threads backend +|=== + + + +== Via Spack + +https://spack.io/[Spack] is a package manager for supercomputers, Linux, and macOS. +It makes installing scientific software easy.
+To install Kokkos with Spack, you can use the following commands: + +[source,sh] +---- +spack install kokkos +---- + +Some variants are available for Kokkos, you can use the following command to see the full list of variants: + +[source,sh] +---- +spack info kokkos +---- + +[NOTE] +==== +On `gaya`, there are environments already set with spack. +You can load the environment with the following command: + +[source,sh] +---- +spacktivate feelpp-openmpi4-kokkos +---- +==== \ No newline at end of file diff --git a/docs/modules/kokkos/pages/introduction/why-kokkos.adoc b/docs/modules/kokkos/pages/introduction/why-kokkos.adoc index e69de29b..05450de3 100644 --- a/docs/modules/kokkos/pages/introduction/why-kokkos.adoc +++ b/docs/modules/kokkos/pages/introduction/why-kokkos.adoc @@ -0,0 +1,29 @@ +image::kokkos2.png[xref=#fragment100,width=284,height=72] + + +[.text-justify] +We have previously explored various hardware architectures, specialized languages, and advanced parallel programming techniques. At the heart of this technological complexity emerges Kokkos, a framework that, like a conductor, harmonizes and simplifies these diverse concepts. Kokkos offers a unified approach, allowing developers to efficiently navigate this complex ecosystem of parallel programming. + +[.text-justify] +The main appeal of Kokkos lies in its ability to abstract hardware-specific details, allowing programmers to write code once and deploy it across diverse architectures. This "write once, run anywhere" approach is particularly valuable in today's context where supercomputers and high-performance computing clusters often integrate heterogeneous accelerators. Kokkos offers powerful abstractions such as execution spaces, execution policies, and data containers like Kokkos::View, which automatically adapt to the underlying architecture. 
+ +[.text-justify] +Another major advantage of Kokkos is its rich ecosystem, which we will see in more detail later, including profiling tools like Kokkos::Tools, which allow developers to fine-tune their applications. Moreover, Kokkos integrates well with other essential technologies of high-performance computing, such as MPI for inter-node communication, thus offering a complete solution for the development of large-scale parallel applications. + +image::kokkos-EcoSystem.png[xref=#fragment102,width=442,height=237] + +[.text-justify] +Kokkos is thus a (C++) library designed to address the growing challenges of parallel programming and performance portability in the field of high-performance computing (HPC). Its unified approach to target both CPUs and GPUs from different manufacturers (NVIDIA, AMD, Intel) makes it particularly attractive in a constantly evolving technology landscape. + +[.text-justify] +Its major advantage is the **simplification of parallel programming** and its ability to offer **performance portability**. It provides a programming model based on clear and scalable parallel patterns, such as `parallel_for` and `parallel_reduce`, which allow developers to efficiently exploit hardware resources without having to directly manage underlying complexities such as threads or CUDA/AMD blocks. Kokkos allows writing a single code that can be efficiently executed on different platforms. This approach significantly reduces the learning curve compared to other models such as CUDA, AMD or OpenMP while providing comparable flexibility. + +[.text-justify] +In addition, Kokkos promotes **increased productivity** thanks to its seamless integration with the (C++) language. It is not a new language or extension, but a library that leverages modern CPP features, such as lambda functions, to write compact and high-performance computing kernels. 
This allows developers to focus on the algorithmic logic while letting Kokkos handle the architecture-specific details + +[.text-justify] +In conclusion, using Kokkos effectively addresses the challenges related to performance portability, simplifying parallel programming, and code sustainability. Its combination of advanced abstractions, a robust ecosystem, and cross-platform compatibility makes it a strategic choice for developers working in the field of scientific and technical computing. *Therefore, I invite you to use Kokkos!* + + +For more information please visit the official webpage (See: https://kokkos.org/) + diff --git a/package-lock.json b/package-lock.json index ad4cf229..0ac92002 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,7 +15,7 @@ "@asciidoctor/core": "^2.2.8", "@djencks/asciidoctor-template": "^0.0.3", "@feelpp/antora-extensions": "^1.0.0-rc.2", - "@feelpp/asciidoctor-extensions": "^1.0.0-rc.11", + "@feelpp/asciidoctor-extensions": "^1.0.0-rc.13", "asciidoctor": "^2.2.6", "asciidoctor-emoji": "^0.4.2", "asciidoctor-jupyter": "^0.7.0", @@ -24,7 +24,7 @@ "node-srv": "^3.0.3" }, "devDependencies": { - "broken-link-checker": "^0.7.8", + "broken-link-checker": "^0.6.7", "http-server": "^14.1.1", "write-good": "^0.13.1" } @@ -351,9 +351,10 @@ "integrity": "sha512-pk39PtIMdUXZOO3zurSBqqt7gx25kkz/YgTbPa1v7X61u5DU+vzZ+kdkmqP51niXRLctZHkyIjnlgffmVG0itw==" }, "node_modules/@feelpp/asciidoctor-extensions": { - "version": "1.0.0-rc.11", - "resolved": "https://registry.npmjs.org/@feelpp/asciidoctor-extensions/-/asciidoctor-extensions-1.0.0-rc.11.tgz", - "integrity": "sha512-cqqlN5Srs8hqQ1QG14fQaZJB12RVrY9TG/X4ACtAp/3ms7GlDIQxxw2zR80p7f4n2V0XmRv3cKZ6IB3cT8Zb+A==" + "version": "1.0.0-rc.13", + "resolved": "https://registry.npmjs.org/@feelpp/asciidoctor-extensions/-/asciidoctor-extensions-1.0.0-rc.13.tgz", + "integrity": "sha512-brDULLvvzMDtrmFfDXT9nZoPIpzJ+cIwsWVnJ6rQVz0HZkd1MDUm+LcsDMduwUbezFTRpuu98sYTiIpbZmEfAw==", + "license": "MIT" }, 
"node_modules/@iarna/toml": { "version": "2.2.5", @@ -392,12 +393,6 @@ "node": ">= 8" } }, - "node_modules/@types/node": { - "version": "14.14.37", - "resolved": "https://registry.npmjs.org/@types/node/-/node-14.14.37.tgz", - "integrity": "sha512-XYmBiy+ohOR4Lh5jE379fV2IU+6Jn4g5qASinhitfyO71b/sCo6MKsMLF5tc7Zf2CE8hViVQyYSobJNke8OvUw==", - "dev": true - }, "node_modules/abbrev": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", @@ -646,16 +641,10 @@ "tough-cookie": "^2.3.1" } }, - "node_modules/bhttp/node_modules/extend": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-2.0.2.tgz", - "integrity": "sha512-AgFD4VU+lVLP6vjnlNfF7OeInLTyeyckCNPEsuxz1vi786UuK/nk6ynPuhn/h+Ju9++TQyr5EpLRI14fc1QtTQ==", - "dev": true - }, "node_modules/bluebird": { "version": "2.11.0", "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz", - "integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE=", + "integrity": "sha512-UfFSr22dmHPQqPP9XWHRhq+gWnHCYguQGkXQlbyPtW5qTnhFWA8/iXg765tH0cAjy7l/zPJ1aBTO0g5XgA7kvQ==", "dev": true }, "node_modules/brace-expansion": { @@ -679,33 +668,24 @@ } }, "node_modules/broken-link-checker": { - "version": "0.7.8", - "resolved": "https://registry.npmjs.org/broken-link-checker/-/broken-link-checker-0.7.8.tgz", - "integrity": "sha512-/zH4/nLMNKDeDH5nVuf/R6WYd0Yjnar1NpcdAO2+VlwjGKzJa6y42C03UO+imBSHwe6BefSkVi82fImE2Rb7yg==", + "version": "0.6.7", + "resolved": "https://registry.npmjs.org/broken-link-checker/-/broken-link-checker-0.6.7.tgz", + "integrity": "sha512-/j/MmMaFDUDa5pVAZnPSonu/uGaFm3ccQKgagh1akgG3B54xIINcT0fKhxE2yG4k0yjXMnoMort5Bf+1APHePQ==", "dev": true, "dependencies": { "bhttp": "^1.2.1", "calmcard": "~0.1.1", - "chalk": "^1.1.3", + "chalk": "^1.1.1", "char-spinner": "^1.0.1", - "condense-whitespace": "^1.0.0", "default-user-agent": "^1.0.0", - "errno": "~0.1.4", - "extend": "^3.0.0", - "http-equiv-refresh": "^1.0.0", - "humanize-duration": "^3.9.1", - "is-stream": "^1.0.1", 
- "is-string": "^1.0.4", - "limited-request-queue": "^2.0.0", - "link-types": "^1.1.0", - "maybe-callback": "^2.1.0", + "limited-request-queue": "^1.0.1", + "maybe-callback": "^1.0.0", "nopter": "~0.3.0", - "parse5": "^3.0.2", - "robot-directives": "~0.3.0", - "robots-txt-guard": "~0.1.0", - "robots-txt-parse": "~0.0.4", - "urlcache": "~0.7.0", - "urlobj": "0.0.11" + "object-assign": "^4.0.1", + "parse5": "^1.5.0", + "urlcache": "~0.5.0", + "urlobj": "0.0.8", + "void-elements": "^2.0.1" }, "bin": { "blc": "bin/blc", @@ -747,9 +727,9 @@ } }, "node_modules/buffer-from": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", - "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==", + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", "dev": true }, "node_modules/cache-directory": { @@ -776,10 +756,40 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.1.tgz", + "integrity": "sha512-BhYE+WDaywFg2TBWYNXAE+8B1ATnThNBqXHP5nQu0jWJdVvY2hvkpyB3qOmtmDePiS5/BDQ8wASEWGMWRG148g==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/call-bound": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.3.tgz", + "integrity": "sha512-YTd+6wGlNlPxSuri7Y6X8tY2dmm12UMH66RpKMhiX6rsk5wXXnYgbUcOt8kiS31/AjfoTOvCsE+w8nZQLQnzHA==", + "dev": true, + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "get-intrinsic": "^1.2.6" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": 
"https://github.com/sponsors/ljharb" + } + }, "node_modules/caller-path": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/caller-path/-/caller-path-0.1.0.tgz", - "integrity": "sha1-lAhe9jWB7NPaqSREqP6U6CV3dR8=", + "integrity": "sha512-UJiE1otjXPF5/x+T3zTnSFiTOEmJoGTD9HmBoxnCUwho61a2eSNn/VwtwuIBDAo2SEOv1AJ7ARI5gCmohFLu/g==", "dev": true, "dependencies": { "callsites": "^0.2.0" @@ -791,7 +801,7 @@ "node_modules/callsites": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-0.2.0.tgz", - "integrity": "sha1-r6uWJikQp/M8GaV3WCXGnzTjUMo=", + "integrity": "sha512-Zv4Dns9IbXXmPkgRRUjAaJQgfN4xX5p6+RQFhWUqscdvvK2xK/ZL8b3IXIJsj+4sD+f24NwnWy2BY8AJ82JB0A==", "dev": true, "engines": { "node": ">=0.10.0" @@ -800,10 +810,19 @@ "node_modules/calmcard": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/calmcard/-/calmcard-0.1.1.tgz", - "integrity": "sha1-NawrZkkrDtOa0GqJOg/25hEk5Ek=", + "integrity": "sha512-Ol6L8HQ0n1qFAOmuXcXcOXVpcQ+x5/N8ff+i6G76a4sDjRVtCWmpsG6RorloS1vVG9O0IGq9l7sqaxBnwG/LUA==", "deprecated": "no longer maintained", "dev": true }, + "node_modules/camelcase": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-1.2.1.tgz", + "integrity": "sha512-wzLkDa4K/mzI1OSITC+DUyjgIl/ETNHE9QvYgy6J6Jvqyyz4C0Xfd+lQhb19sX2jMpZV4IssUn0VDVmglV+s4g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/chalk": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", @@ -820,19 +839,10 @@ "node": ">=0.10.0" } }, - "node_modules/chalk/node_modules/escape-string-regexp": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", - "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=", - "dev": true, - "engines": { - "node": ">=0.8.0" - } - }, "node_modules/char-spinner": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/char-spinner/-/char-spinner-1.0.1.tgz", - 
"integrity": "sha1-5upnvSR+EHESmDt6sEee02KAAIE=", + "integrity": "sha512-acv43vqJ0+N0rD+Uw3pDHSxP30FHrywu2NO6/wBaHChJIizpDeBUd6NjqhNhy9LGaEAhZAXn46QzmlAvIWd16g==", "dev": true }, "node_modules/clean-git-ref": { @@ -841,9 +851,9 @@ "integrity": "sha512-bLSptAy2P0s6hU4PzuIMKmMJJSE6gLXGH1cntDu7bWJUksvuM+7ReOK61mozULErYvP6a15rnYl0zFDef+pyPw==" }, "node_modules/cli-table": { - "version": "0.3.6", - "resolved": "https://registry.npmjs.org/cli-table/-/cli-table-0.3.6.tgz", - "integrity": "sha512-ZkNZbnZjKERTY5NwC2SeMeLeifSPq/pubeRoTpdr3WchLlnZg6hEgvHkK5zL7KNFdd9PmHN8lxrENUwI3cE8vQ==", + "version": "0.3.11", + "resolved": "https://registry.npmjs.org/cli-table/-/cli-table-0.3.11.tgz", + "integrity": "sha512-IqLQi4lO0nIB4tcdTpN4LCB9FI3uqrJZK7RC515EnhZ6qBaglkIgICb1wjeAqpdoOabm1+SuQtkXIPdYC93jhQ==", "dev": true, "dependencies": { "colors": "1.0.3" @@ -918,7 +928,7 @@ "node_modules/colors": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/colors/-/colors-1.0.3.tgz", - "integrity": "sha1-BDP0TYCWgP3rYO0mDxsMJi6CpAs=", + "integrity": "sha512-pFGrxThWcWQ2MsAz6RtgeWe4NK2kUE1WfsrvvlctdII745EW9I0yflqhe7++M5LEc7bV2c/9/5zc8sFcpL0Drw==", "dev": true, "engines": { "node": ">=0.1.90" @@ -927,7 +937,7 @@ "node_modules/combined-stream2": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/combined-stream2/-/combined-stream2-1.1.2.tgz", - "integrity": "sha1-9uFLegFWZvjHsKH6xQYkAWSsNXA=", + "integrity": "sha512-sVqUHJmbdVm+HZWy4l34BPLczxI4fltN4Bm2vcvASsqBIXW4xFb4TRkwM8bw/UUXK9/OdHdAwi2cRYVEKrxzbg==", "dev": true, "dependencies": { "bluebird": "^2.8.1", @@ -964,9 +974,9 @@ } }, "node_modules/concat-stream/node_modules/readable-stream": { - "version": "2.3.7", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz", - "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==", + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + 
"integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", "dev": true, "dependencies": { "core-util-is": "~1.0.0", @@ -993,15 +1003,6 @@ "safe-buffer": "~5.1.0" } }, - "node_modules/condense-whitespace": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/condense-whitespace/-/condense-whitespace-1.0.0.tgz", - "integrity": "sha1-g3bZjvAo5sss0kaOKM5CxcZasak=", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/convict": { "version": "6.2.4", "resolved": "https://registry.npmjs.org/convict/-/convict-6.2.4.tgz", @@ -1015,9 +1016,9 @@ } }, "node_modules/core-util-is": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", - "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=", + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", "dev": true }, "node_modules/corser": { @@ -1074,7 +1075,7 @@ "node_modules/default-user-agent": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/default-user-agent/-/default-user-agent-1.0.0.tgz", - "integrity": "sha1-FsRu/cq6PtxF8k8r1IaLAbfCrcY=", + "integrity": "sha512-bDF7bg6OSNcSwFWPu4zYKpVkJZQYVrAANMYB8bc9Szem1D0yKdm4sa/rOCs2aC9+2GMqQ7KnwtZRvDhmLF0dXw==", "dev": true, "dependencies": { "os-name": "~1.0.3" @@ -1098,7 +1099,7 @@ "node_modules/dev-null": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/dev-null/-/dev-null-0.1.1.tgz", - "integrity": "sha1-WiBc48Ky73e2I41roXnrdMag6Bg=", + "integrity": "sha512-nMNZG0zfMgmdv8S5O0TM5cpwNbGKRGPCxVsr0SmA3NZZy9CYBbuNLL0PD3Acx9e5LIUgwONXtM9kM6RlawPxEQ==", "dev": true }, "node_modules/diff3": { @@ -1106,11 +1107,20 @@ "resolved": "https://registry.npmjs.org/diff3/-/diff3-0.0.3.tgz", "integrity": 
"sha512-iSq8ngPOt0K53A6eVr4d5Kn6GNrM2nQZtC740pzIriHtn4pOQ2lyzEXQMBeVcWERN0ye7fhBsk9PbLLQOnUx/g==" }, - "node_modules/duplexer": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.2.tgz", - "integrity": "sha512-jtD6YG370ZCIi/9GTaJKQxWTZD045+4R4hTk/x1UyoqadyJ9x9CgSi1RlVDQF8U2sxLLSnFkCaMihqljHIWgMg==", - "dev": true + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } }, "node_modules/e-prime": { "version": "0.10.4", @@ -1134,30 +1144,51 @@ "node_modules/eol": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eol/-/eol-0.2.0.tgz", - "integrity": "sha1-L22whqJDpG4+Xb0OE0Ncfr6/Cd0=", + "integrity": "sha512-LCBxmDyUDh5pAXALohe9NCwyedyECwpFrcebZyW/XNTzn4WZFY3cX9MdkrJQu71ojEoHqcsciqFG7d3WQA+1Ew==", "dev": true }, - "node_modules/errno": { - "version": "0.1.8", - "resolved": "https://registry.npmjs.org/errno/-/errno-0.1.8.tgz", - "integrity": "sha512-dJ6oBr5SQ1VSd9qkk7ByRgb/1SH4JZjCHSW/mr63/QcXO9zLVxvJ6Oy13nio03rxpSnVDDjFor75SjVeZWPW/A==", - "dev": true, - "dependencies": { - "prr": "~1.0.1" - }, - "bin": { - "errno": "cli.js" - } - }, "node_modules/errors": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/errors/-/errors-0.2.0.tgz", - "integrity": "sha1-D1Hoidqj4RsZ5xhtEfEEqmbrJAM=", + "integrity": "sha512-W0w4yTo+twP/wGTF25kBGAXroAHzvxZvEDHJsCixlWS8lf8li0aZDhT+hz0mHQwsSW5esD5jyTQkaqA0ZHF83A==", "dev": true, "engines": { "node": "*" } }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": 
"sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/escalade": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz", @@ -1166,6 +1197,15 @@ "node": ">=6" } }, + "node_modules/escape-string-regexp": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", + "integrity": "sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg==", + "dev": true, + "engines": { + "node": ">=0.8.0" + } + }, "node_modules/event-target-shim": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", @@ -1189,9 +1229,9 @@ } }, "node_modules/extend": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", - "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-2.0.2.tgz", + "integrity": "sha512-AgFD4VU+lVLP6vjnlNfF7OeInLTyeyckCNPEsuxz1vi786UuK/nk6ynPuhn/h+Ju9++TQyr5EpLRI14fc1QtTQ==", "dev": 
true }, "node_modules/fast-copy": { @@ -1252,9 +1292,9 @@ } }, "node_modules/follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.9", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.9.tgz", + "integrity": "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==", "dev": true, "funding": [ { @@ -1262,6 +1302,7 @@ "url": "https://github.com/sponsors/RubenVerborgh" } ], + "license": "MIT", "engines": { "node": ">=4.0" }, @@ -1287,7 +1328,7 @@ "node_modules/form-fix-array": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/form-fix-array/-/form-fix-array-1.0.0.tgz", - "integrity": "sha1-oTR6R+UxF6t7zb8+Lz7JHGZ2m8g=", + "integrity": "sha512-f3qXI4CcvW7/6vqTKwCftcrFgfEBfWYPQTfvXrHYevHbJVfc107/SVvXvwUAYMaUAHdvu9ENQvLufJKphQI14w==", "dev": true }, "node_modules/fs.realpath": { @@ -1296,10 +1337,14 @@ "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==" }, "node_modules/function-bind": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", - "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==", - "dev": true + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } }, "node_modules/get-caller-file": { "version": "2.0.5", @@ -1310,19 +1355,44 @@ } }, "node_modules/get-intrinsic": { - "version": "1.1.1", - "resolved": 
"https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.1.1.tgz", - "integrity": "sha512-kWZrnVM42QCiEA2Ig1bG8zjoIMOgxWwYCEeNdwY6Tv/cOSeGpcoX4pXHfKUxNKVoArnrEr2e9srnAxxGIraS9Q==", + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.7.tgz", + "integrity": "sha512-VW6Pxhsrk0KAOqs3WEd0klDiF/+V7gQOpAvY1jVU/LHmaD/kQO4523aiJuikX/QAKYiW6x8Jh+RJej1almdtCA==", "dev": true, + "license": "MIT", "dependencies": { - "function-bind": "^1.1.1", - "has": "^1.0.3", - "has-symbols": "^1.0.1" + "call-bind-apply-helpers": "^1.0.1", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "function-bind": "^1.1.2", + "get-proto": "^1.0.0", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/glob": { "version": "7.1.3", "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz", @@ -1350,6 +1420,19 @@ "node": ">= 6" } }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/handlebars": { "version": "4.7.8", "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.8.tgz", @@ -1382,18 +1465,6 @@ "node": 
">=0.10.0" } }, - "node_modules/has": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", - "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==", - "dev": true, - "dependencies": { - "function-bind": "^1.1.1" - }, - "engines": { - "node": ">= 0.4.0" - } - }, "node_modules/has-ansi": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/has-ansi/-/has-ansi-2.0.0.tgz", @@ -1416,10 +1487,26 @@ } }, "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.2.tgz", - "integrity": "sha512-chXa79rL/UC2KlX17jo3vRGz0azaWEx5tGqZg5pO3NUyEJVB17dMruQlzCCOfUvElghKcm5194+BCRvi2Rv/Gw==", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", "dev": true, + "dependencies": { + "has-symbols": "^1.0.3" + }, "engines": { "node": ">= 0.4" }, @@ -1427,6 +1514,19 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/he": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", @@ -1461,15 +1561,6 @@ 
"node": ">=12" } }, - "node_modules/http-equiv-refresh": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/http-equiv-refresh/-/http-equiv-refresh-1.0.0.tgz", - "integrity": "sha1-jsU4hmBCvl8/evpzfRmNlL6xsHs=", - "dev": true, - "engines": { - "node": ">= 0.10" - } - }, "node_modules/http-proxy": { "version": "1.18.1", "resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.18.1.tgz", @@ -1554,12 +1645,6 @@ "node": ">=8" } }, - "node_modules/humanize-duration": { - "version": "3.25.1", - "resolved": "https://registry.npmjs.org/humanize-duration/-/humanize-duration-3.25.1.tgz", - "integrity": "sha512-P+dRo48gpLgc2R9tMRgiDRNULPKCmqFYgguwqOO2C0fjO35TgdURDQDANSR1Nt92iHlbHGMxOTnsB8H8xnMa2Q==", - "dev": true - }, "node_modules/iconv-lite": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", @@ -1668,20 +1753,15 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-stream": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz", - "integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/is-string": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.0.5.tgz", - "integrity": "sha512-buY6VNRjhQMiF1qWDouloZlQbRhDPCebwxSjxMjxgemYT46YMd2NR0/H+fBhEfWX4A/w9TBJ+ol+okqJKFE6vQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.1.1.tgz", + "integrity": "sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==", "dev": true, + "dependencies": { + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" + }, "engines": { "node": ">= 0.4" }, @@ -1692,18 +1772,9 @@ "node_modules/isarray": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", + "integrity": 
"sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", "dev": true }, - "node_modules/isbot": { - "version": "2.5.7", - "resolved": "https://registry.npmjs.org/isbot/-/isbot-2.5.7.tgz", - "integrity": "sha512-8P+oGrRDvuCpDdovK9oD4skHmSXu56bsK17K2ovXrkW7Ic4H9Y4AqnUUqlXqZxcqQ2358kid9Rb+fbLH5yeeUw==", - "dev": true, - "engines": { - "node": ">=6" - } - }, "node_modules/isomorphic-git": { "version": "1.25.10", "resolved": "https://registry.npmjs.org/isomorphic-git/-/isomorphic-git-1.25.10.tgz", @@ -1767,27 +1838,19 @@ } }, "node_modules/limited-request-queue": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/limited-request-queue/-/limited-request-queue-2.0.0.tgz", - "integrity": "sha1-FMfBILE4BgsZoqEDCrr2aTVyZQ0=", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/limited-request-queue/-/limited-request-queue-1.0.1.tgz", + "integrity": "sha512-D+QsNiBdTZiR6BADlzPrKYtEn9Pxj/WMFqSWjxnFFfqReKls7/DPQP/qyp6wbBoIhJUJvdmdUw/bU1SAW4kt2w==", "dev": true, "dependencies": { "is-browser": "^2.0.1", - "parse-domain": "~0.2.0" + "object-assign": "^4.0.1", + "parse-domain": "~0.1.2" }, "engines": { "node": ">= 0.10" } }, - "node_modules/link-types": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/link-types/-/link-types-1.1.0.tgz", - "integrity": "sha1-r2XlnbUucMH/sYrEw8sFa/55aDA=", - "dev": true, - "engines": { - "node": ">= 0.10" - } - }, "node_modules/lodash": { "version": "4.17.21", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", @@ -1804,20 +1867,20 @@ "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", "dev": true }, - "node_modules/lru-cache": { - "version": "4.1.5", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.5.tgz", - "integrity": "sha512-sWZlbEP2OsHNkXrMl5GYk/jKk70MBng6UU4YI/qGDYbgf6YbP4EvmqISbXCoJiRKs+1bSpFHVgQxvJ17F2li5g==", + "node_modules/math-intrinsics": { + "version": 
"1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", "dev": true, - "dependencies": { - "pseudomap": "^1.0.2", - "yallist": "^2.1.2" + "license": "MIT", + "engines": { + "node": ">= 0.4" } }, "node_modules/maybe-callback": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/maybe-callback/-/maybe-callback-2.1.0.tgz", - "integrity": "sha1-ivoLp7aRp6sSPn8S9l4yu10fgkM=", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/maybe-callback/-/maybe-callback-1.0.0.tgz", + "integrity": "sha512-7/dLp+T2Z9keggtwVEyjdyW2uuW0XQqKJopHppYZm4BFjOjpa050eo6475XHcx9uf+nDUm9vyyIKTElPD73/OQ==", "dev": true }, "node_modules/merge2": { @@ -1935,7 +1998,7 @@ "node_modules/ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", "dev": true }, "node_modules/multi-progress": { @@ -1996,7 +2059,7 @@ "node_modules/nopt": { "version": "3.0.6", "resolved": "https://registry.npmjs.org/nopt/-/nopt-3.0.6.tgz", - "integrity": "sha1-xkZdvwirzU2zWTF/eaxopkayj/k=", + "integrity": "sha512-4GUt3kSEYmk4ITxzB/b9vaIDfUVWN/Ml1Fwl11IlnIG2iaJ9O6WXZ9SrYM9NLI8OCBieN2Y8SWC2oJV0RQ7qYg==", "dev": true, "dependencies": { "abbrev": "1" @@ -2008,7 +2071,7 @@ "node_modules/nopter": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/nopter/-/nopter-0.3.0.tgz", - "integrity": "sha1-uWkOb6uPJWs35OfM0j4rOEUMxx8=", + "integrity": "sha512-gSYCd2Gxge0Ovnszf8fI9WkN7/PiuTTIpbeo2VhyQ3+NwJ7z3x6jiOFAMuU65JZz+BK/I1uzae2neQVGFHMClQ==", "deprecated": "try optionator", "dev": true, "dependencies": { @@ -2028,7 +2091,7 @@ "node_modules/nopter/node_modules/ansi-regex": { "version": "0.2.1", "resolved": 
"https://registry.npmjs.org/ansi-regex/-/ansi-regex-0.2.1.tgz", - "integrity": "sha1-DY6UaWej2BQ/k+JOKYUl/BsiNfk=", + "integrity": "sha512-sGwIGMjhYdW26/IhwK2gkWWI8DRCVO6uj3hYgHT+zD+QL1pa37tM3ujhyfcJIYSbsxp7Gxhy7zrRW/1AHm4BmA==", "dev": true, "engines": { "node": ">=0.10.0" @@ -2037,16 +2100,7 @@ "node_modules/nopter/node_modules/ansi-styles": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-1.1.0.tgz", - "integrity": "sha1-6uy/Zs1waIJ2Cy9GkVgrj1XXp94=", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/nopter/node_modules/camelcase": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-1.2.1.tgz", - "integrity": "sha1-m7UwTS4LVmmLLHWLCKPqqdqlijk=", + "integrity": "sha512-f2PKUkN5QngiSemowa6Mrk9MPCdtFiOSmibjZ+j1qhLGHHYsqZwmBMRF3IRMVXo8sybDqx2fJl2d/8OphBoWkA==", "dev": true, "engines": { "node": ">=0.10.0" @@ -2055,7 +2109,7 @@ "node_modules/nopter/node_modules/chalk": { "version": "0.5.1", "resolved": "https://registry.npmjs.org/chalk/-/chalk-0.5.1.tgz", - "integrity": "sha1-Zjs6ZItotV0EaQ1JFnqoN4WPIXQ=", + "integrity": "sha512-bIKA54hP8iZhyDT81TOsJiQvR1gW+ZYSXFaZUAvoD4wCHdbHY2actmpTE4x344ZlFqHbvoxKOaESULTZN2gstg==", "dev": true, "dependencies": { "ansi-styles": "^1.1.0", @@ -2068,19 +2122,10 @@ "node": ">=0.10.0" } }, - "node_modules/nopter/node_modules/escape-string-regexp": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", - "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=", - "dev": true, - "engines": { - "node": ">=0.8.0" - } - }, "node_modules/nopter/node_modules/has-ansi": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/has-ansi/-/has-ansi-0.1.0.tgz", - "integrity": "sha1-hPJlqujA5qiKEtcCKJS3VoiUxi4=", + "integrity": "sha512-1YsTg1fk2/6JToQhtZkArMkurq8UoWU1Qe0aR3VUHjgij4nOylSWLWAtBXoZ4/dXOmugfLGm1c+QhuD0JyedFA==", "dev": true, "dependencies": { "ansi-regex": "^0.2.0" @@ 
-2092,10 +2137,19 @@ "node": ">=0.10.0" } }, + "node_modules/nopter/node_modules/object-assign": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-2.1.1.tgz", + "integrity": "sha512-CdsOUYIh5wIiozhJ3rLQgmUTgcyzFwZZrqhkKhODMoGtPKM+wt0h0CNIoauJWMsS9822EdzPsF/6mb4nLvPN5g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/nopter/node_modules/strip-ansi": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-0.3.0.tgz", - "integrity": "sha1-JfSOoiynkYfzF0pNuHWTR7sSYiA=", + "integrity": "sha512-DerhZL7j6i6/nEnVG0qViKXI0OKouvvpsAiaj7c+LfqZZZxdwZtv8+UiA/w4VUJpT8UzX0pR1dcHOii1GbmruQ==", "dev": true, "dependencies": { "ansi-regex": "^0.2.1" @@ -2110,7 +2164,7 @@ "node_modules/nopter/node_modules/supports-color": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-0.2.0.tgz", - "integrity": "sha1-2S3iaU6z9nMjlz1649i1W0wiGQo=", + "integrity": "sha512-tdCZ28MnM7k7cJDJc7Eq80A9CsRFAAOZUy41npOZCs++qSjfIy7o5Rh46CBk+Dk5FbKJ33X3Tqg4YrV07N5RaA==", "dev": true, "bin": { "supports-color": "cli.js" @@ -2120,9 +2174,9 @@ } }, "node_modules/object-assign": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-2.1.1.tgz", - "integrity": "sha1-Q8NuXVaf+OSBbE76i+AtJpZ8GKo=", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", "dev": true, "engines": { "node": ">=0.10.0" @@ -2192,7 +2246,7 @@ "node_modules/os-name": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/os-name/-/os-name-1.0.3.tgz", - "integrity": "sha1-GzefZINa98Wn9JizV8uVIVwVnt8=", + "integrity": "sha512-f5estLO2KN8vgtTRaILIgEGBoBrMnZ3JQ7W9TMZCnOIGwHe8TRGSpcagnWDo+Dfhd/z08k9Xe75hvciJJ8Qaew==", "dev": true, "dependencies": { "osx-release": "^1.0.0", @@ -2205,19 
+2259,10 @@ "node": ">=0.10.0" } }, - "node_modules/os-tmpdir": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", - "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/osx-release": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/osx-release/-/osx-release-1.1.0.tgz", - "integrity": "sha1-8heRGigTaUmvG/kwiyQeJzfTzWw=", + "integrity": "sha512-ixCMMwnVxyHFQLQnINhmIpWqXIfS2YOXchwQrk+OFzmo6nDjQ0E4KXAyyUh0T0MZgV4bUhkRrAbVqlE4yLVq4A==", "dev": true, "dependencies": { "minimist": "^1.1.0" @@ -2235,19 +2280,16 @@ "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==" }, "node_modules/parse-domain": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/parse-domain/-/parse-domain-0.2.2.tgz", - "integrity": "sha1-GImJseLnOYv/PE9P19yhV+tR+sE=", + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/parse-domain/-/parse-domain-0.1.2.tgz", + "integrity": "sha512-3P8WcWUQY+W3jH637ozSr/+pMAv4RZDAobK0ADOTayzYf+BBnmaqmiknyNLyDCEHRNN88KFDGapWaVT9ix8VqQ==", "dev": true }, "node_modules/parse5": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz", - "integrity": "sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==", - "dev": true, - "dependencies": { - "@types/node": "*" - } + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-1.5.1.tgz", + "integrity": "sha512-w2jx/0tJzvgKwZa58sj2vAYq/S/K1QJfIB3cWYea/Iu1scFPDQQ3IQiVZTHWtRBwAjv2Yd7S/xeZf3XqLDb3bA==", + "dev": true }, "node_modules/passive-voice": { "version": "0.1.0", @@ -2432,23 +2474,17 @@ "node": ">=0.4.0" } }, - "node_modules/prr": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/prr/-/prr-1.0.1.tgz", - "integrity": "sha1-0/wRS6BplaRexok/SEzrHXj19HY=", - "dev": true - }, - 
"node_modules/pseudomap": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", - "integrity": "sha1-8FKijacOYYkX7wqKw0wa5aaChrM=", - "dev": true - }, "node_modules/psl": { - "version": "1.8.0", - "resolved": "https://registry.npmjs.org/psl/-/psl-1.8.0.tgz", - "integrity": "sha512-RIdOzyoavK+hA18OGGWDqUTsCLhtA7IcZ/6NCs4fFJaHBDab+pDDmDIByWFRQJq2Cd7r1OoQxBGKOaztq+hjIQ==", - "dev": true + "version": "1.15.0", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.15.0.tgz", + "integrity": "sha512-JZd3gMVBAVQkSs6HdNZo9Sdo0LNcQeMNP3CozBJb3JYC/QUYZTnKxP+f8oWRX4rHP5EurWxqAHTSwUCjlNKa1w==", + "dev": true, + "dependencies": { + "punycode": "^2.3.1" + }, + "funding": { + "url": "https://github.com/sponsors/lupomontero" + } }, "node_modules/pump": { "version": "3.0.2", @@ -2460,9 +2496,9 @@ } }, "node_modules/punycode": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", - "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", "dev": true, "engines": { "node": ">=6" @@ -2577,37 +2613,6 @@ "node": ">=0.10.0" } }, - "node_modules/robot-directives": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/robot-directives/-/robot-directives-0.3.0.tgz", - "integrity": "sha1-F0+x/8KpuXh3MB6HyJs5X0KdH2U=", - "dev": true, - "dependencies": { - "isbot": "^2.0.0", - "useragent": "^2.1.8" - }, - "engines": { - "node": ">= 0.10" - } - }, - "node_modules/robots-txt-guard": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/robots-txt-guard/-/robots-txt-guard-0.1.1.tgz", - "integrity": "sha512-6+nGkE6c2dI9/dmhmNcoMKVwJxlA6sgN/XNo0rm6LLdA0hnj4YkpgrZdhMPl58gJkAqeiHlf4+8tJcLM1tv1Ew==", - "dev": true - }, - 
"node_modules/robots-txt-parse": { - "version": "0.0.4", - "resolved": "https://registry.npmjs.org/robots-txt-parse/-/robots-txt-parse-0.0.4.tgz", - "integrity": "sha1-99HzI/eZIdfpxsS70lBI9umBDXE=", - "dev": true, - "dependencies": { - "bluebird": "^2.3.5", - "split": "^0.3.0", - "stream-combiner": "^0.2.1", - "through": "^2.3.4" - } - }, "node_modules/run-parallel": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", @@ -2680,9 +2685,9 @@ "integrity": "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==" }, "node_modules/semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", "dev": true, "bin": { "semver": "bin/semver" @@ -2778,18 +2783,6 @@ "node": ">=0.10.0" } }, - "node_modules/split": { - "version": "0.3.3", - "resolved": "https://registry.npmjs.org/split/-/split-0.3.3.tgz", - "integrity": "sha1-zQ7qXmOiEd//frDwkcQTPi0N0o8=", - "dev": true, - "dependencies": { - "through": "2" - }, - "engines": { - "node": "*" - } - }, "node_modules/split2": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/split2/-/split2-4.2.0.tgz", @@ -2801,23 +2794,13 @@ "node_modules/splitargs": { "version": "0.0.7", "resolved": "https://registry.npmjs.org/splitargs/-/splitargs-0.0.7.tgz", - "integrity": "sha1-/p965lc3GzOxDLgNoUPPgknPazs=", + "integrity": "sha512-UUFYD2oWbNwULH6WoVtLUOw8ch586B+HUqcsAjjjeoBQAM1bD4wZRXu01koaxyd8UeYpybWqW4h+lO1Okv40Tg==", "dev": true }, - "node_modules/stream-combiner": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/stream-combiner/-/stream-combiner-0.2.2.tgz", - 
"integrity": "sha1-rsjLrBd7Vrb0+kec7YwZEs7lKFg=", - "dev": true, - "dependencies": { - "duplexer": "~0.1.1", - "through": "~2.3.4" - } - }, "node_modules/stream-length": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/stream-length/-/stream-length-1.0.2.tgz", - "integrity": "sha1-gnfzy+5JpNqrz9tOL0qbXp8snwA=", + "integrity": "sha512-aI+qKFiwoDV4rsXiS7WRoCt+v2RX1nUj17+KJC5r2gfh5xoSJIfP6Y3Do/HtvesFcTSWthIuJ3l1cvKQY/+nZg==", "dev": true, "dependencies": { "bluebird": "^2.6.2" @@ -2932,85 +2915,46 @@ "real-require": "^0.2.0" } }, - "node_modules/through": { - "version": "2.3.8", - "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", - "integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=", - "dev": true - }, - "node_modules/through2-sink": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/through2-sink/-/through2-sink-1.0.0.tgz", - "integrity": "sha1-XxBruh1zMNrTy6XAqxhjkjJWw5k=", - "dev": true, - "dependencies": { - "through2": "~0.5.1", - "xtend": "~3.0.0" - } - }, - "node_modules/through2-sink/node_modules/isarray": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", - "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=", - "dev": true - }, - "node_modules/through2-sink/node_modules/readable-stream": { - "version": "1.0.34", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", - "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=", - "dev": true, - "dependencies": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.1", - "isarray": "0.0.1", - "string_decoder": "~0.10.x" - } - }, - "node_modules/through2-sink/node_modules/string_decoder": { - "version": "0.10.31", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", - "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", - "dev": true - }, - "node_modules/through2-sink/node_modules/through2": { + "node_modules/through2": { "version": "0.5.1", "resolved": 
"https://registry.npmjs.org/through2/-/through2-0.5.1.tgz", - "integrity": "sha1-390BLrnHAOIyP9M084rGIqs3Lac=", + "integrity": "sha512-zexCrAOTbjkBCXGyozn7hhS3aEaqdrc59mAD2E3dKYzV1vFuEGQ1hEDJN2oQMQFwy4he2zyLqPZV+AlfS8ZWJA==", "dev": true, "dependencies": { "readable-stream": "~1.0.17", "xtend": "~3.0.0" } }, - "node_modules/through2-sink/node_modules/xtend": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-3.0.0.tgz", - "integrity": "sha1-XM50B7r2Qsunvs2laBEcST9ZZlo=", + "node_modules/through2-sink": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/through2-sink/-/through2-sink-1.0.0.tgz", + "integrity": "sha512-9HvIHIEXZ5YgstQx3vsu4U/QQ/n7X5RHlXf8MsfSEnEzeUFbX9BHBWmlwdQ1b6CzDlUEDwjFnkSIxpJZ6qP+0Q==", "dev": true, - "engines": { - "node": ">=0.4" + "dependencies": { + "through2": "~0.5.1", + "xtend": "~3.0.0" } }, "node_modules/through2-spy": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/through2-spy/-/through2-spy-1.2.0.tgz", - "integrity": "sha1-nIkcqcpA4eHkzzHhrFf5TMnSSMs=", + "integrity": "sha512-QJ/32YGXA8K/PlkT/7kJOfWZREdlwoCcdVOL/wZHhV0JQJlwUQeDi4QZrEUEAdIL6Kjb9BRdHHz65zg+rzFxuA==", "dev": true, "dependencies": { "through2": "~0.5.1", "xtend": "~3.0.0" } }, - "node_modules/through2-spy/node_modules/isarray": { + "node_modules/through2/node_modules/isarray": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", - "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=", + "integrity": "sha512-D2S+3GLxWH+uhrNEcoh/fnmYeP8E8/zHl644d/jdA0g2uyXvy3sb0qxotE+ne0LtccHknQzWwZEzhak7oJ0COQ==", "dev": true }, - "node_modules/through2-spy/node_modules/readable-stream": { + "node_modules/through2/node_modules/readable-stream": { "version": "1.0.34", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", - "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=", + "integrity": 
"sha512-ok1qVCJuRkNmvebYikljxJA/UEsKwLl2nI1OmaqAu4/UE+h0wKCHok4XkL/gvi39OacXvw59RJUOFUkDib2rHg==", "dev": true, "dependencies": { "core-util-is": "~1.0.0", @@ -3019,43 +2963,12 @@ "string_decoder": "~0.10.x" } }, - "node_modules/through2-spy/node_modules/string_decoder": { + "node_modules/through2/node_modules/string_decoder": { "version": "0.10.31", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", - "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", + "integrity": "sha512-ev2QzSzWPYmy9GuqfIVildA4OdcGLeFZQrq5ys6RtiuF+RQQiZWr8TZNyAcuVXyQRYfEO+MsoB/1BuQVhOJuoQ==", "dev": true }, - "node_modules/through2-spy/node_modules/through2": { - "version": "0.5.1", - "resolved": "https://registry.npmjs.org/through2/-/through2-0.5.1.tgz", - "integrity": "sha1-390BLrnHAOIyP9M084rGIqs3Lac=", - "dev": true, - "dependencies": { - "readable-stream": "~1.0.17", - "xtend": "~3.0.0" - } - }, - "node_modules/through2-spy/node_modules/xtend": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-3.0.0.tgz", - "integrity": "sha1-XM50B7r2Qsunvs2laBEcST9ZZlo=", - "dev": true, - "engines": { - "node": ">=0.4" - } - }, - "node_modules/tmp": { - "version": "0.0.33", - "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz", - "integrity": "sha512-jRCJlojKnZ3addtTOjdIqoRuPEKBvNXcGYqzO6zWZX8KfKEpnGY5jfggJQ3EjKuu8D4bJRr0y+cYJFmYbImXGw==", - "dev": true, - "dependencies": { - "os-tmpdir": "~1.0.2" - }, - "engines": { - "node": ">=0.6.0" - } - }, "node_modules/to-regex-range": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", @@ -3093,7 +3006,7 @@ "node_modules/typedarray": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", - "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=", + "integrity": "sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA==", "dev": true }, "node_modules/typeof-article": 
{ @@ -3157,51 +3070,33 @@ "dev": true }, "node_modules/urlcache": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/urlcache/-/urlcache-0.7.0.tgz", - "integrity": "sha512-xOW4t6wJDT07+VunsHwePemyXXRidCSOZ/1RIILJi2XnB+81FA5H0MRvS63/7joTWjGLajcJJGvR5odpbkV6hw==", + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/urlcache/-/urlcache-0.5.0.tgz", + "integrity": "sha512-65SRCvjp3gX2CM0XlG47+vbJENarEWuKKJRZOLMLquZOW55DgeBS7bWaOH+QPVK/4fArJjkBZH9EtvF4iQMWJw==", "dev": true, "dependencies": { - "urlobj": "0.0.11" + "object-assign": "^4.0.1", + "urlobj": "0.0.8" }, "engines": { "node": ">= 0.10" } }, "node_modules/urlobj": { - "version": "0.0.11", - "resolved": "https://registry.npmjs.org/urlobj/-/urlobj-0.0.11.tgz", - "integrity": "sha512-Ncck0WWtuFBbZhSYwKjK1AU2V51V98P/KHUPkaEc+mFy4xkpAHFNyVQT+S5SgtsJAr94e4wiKUucJSfasV2kBw==", + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/urlobj/-/urlobj-0.0.8.tgz", + "integrity": "sha512-+3lJQv5fXSpo1gAd/FmFql66ZMqfAEU0r4IE7142VhELNKL+Hhg/BI11m6Sbcr54Vm9BAtIN4+zK4P5y2uhJiw==", "deprecated": "use universal-url, minurl, relateurl, url-relation", "dev": true, "dependencies": { "is-object": "^1.0.1", "is-string": "^1.0.4", - "object-assign": "^4.1.1" + "object-assign": "^4.0.1" }, "engines": { "node": ">= 0.10" } }, - "node_modules/urlobj/node_modules/object-assign": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", - "integrity": "sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM=", - "dev": true, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/useragent": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/useragent/-/useragent-2.3.0.tgz", - "integrity": "sha512-4AoH4pxuSvHCjqLO04sU6U/uE65BYza8l/KKBS0b0hnUPWi+cQ2BpeTEwejCSx9SPV5/U03nniDTrWx5NrmKdw==", - "dev": true, - "dependencies": { - "lru-cache": "4.1.x", - "tmp": "0.0.x" - } - }, "node_modules/util-deprecate": { "version": "1.0.2", "resolved": 
"https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", @@ -3210,7 +3105,7 @@ "node_modules/uuid": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/uuid/-/uuid-2.0.3.tgz", - "integrity": "sha1-Z+LoY3lyFVMN/zGOW/nc6/1Hsho=", + "integrity": "sha512-FULf7fayPdpASncVy4DLh3xydlXEJJpvIELjYjNeQWYUZ9pclcpvCZSr2gkmN2FrrGcI7G/cJsIEwk5/8vfXpg==", "deprecated": "Please upgrade to version 7 or higher. Older versions may use Math.random() in certain circumstances, which is known to be problematic. See https://v8.dev/blog/math-random for details.", "dev": true }, @@ -3229,6 +3124,16 @@ "node": ">=10.13.0" } }, + "node_modules/void-elements": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/void-elements/-/void-elements-2.0.1.tgz", + "integrity": "sha512-qZKX4RnBzH2ugr8Lxa7x+0V6XD9Sb/ouARtiasEQCHB1EVU4NXtmHsDDrx1dO4ne5fc3J6EW05BP1Dl0z0iung==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/weasel-words": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/weasel-words/-/weasel-words-0.1.1.tgz", @@ -3250,7 +3155,7 @@ "node_modules/win-release": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/win-release/-/win-release-1.1.1.tgz", - "integrity": "sha1-X6VeAr58qTTt/BJmVjLoSbcuUgk=", + "integrity": "sha512-iCRnKVvGxOQdsKhcQId2PXV1vV3J/sDPXKA4Oe9+Eti2nb2ESEsYHRYls/UjoUW3bIc5ZDO8dTH50A/5iVN+bw==", "dev": true, "dependencies": { "semver": "^5.0.1" @@ -3349,7 +3254,16 @@ "resolved": "https://registry.npmjs.org/xdg-basedir/-/xdg-basedir-3.0.0.tgz", "integrity": "sha512-1Dly4xqlulvPD3fZUQJLY+FUIeqN3N2MM3uqe4rCJftAvOjFa3jFGfctOgluGx4ahPbUCsZkmJILiP0Vi4T6lQ==", "engines": { - "node": ">=4" + "node": ">=4" + } + }, + "node_modules/xtend": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-3.0.0.tgz", + "integrity": "sha512-sp/sT9OALMjRW1fKDlPeuSZlDQpkqReA0pyJukniWbTGoEKefHxhGJynE3PNhUMlcM8qWIjPwecwCw4LArS5Eg==", + "dev": true, + "engines": { + "node": 
">=0.4" } }, "node_modules/y18n": { @@ -3360,12 +3274,6 @@ "node": ">=10" } }, - "node_modules/yallist": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-2.1.2.tgz", - "integrity": "sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=", - "dev": true - }, "node_modules/yargs": { "version": "16.2.0", "resolved": "https://registry.npmjs.org/yargs/-/yargs-16.2.0.tgz", @@ -3656,9 +3564,9 @@ "integrity": "sha512-pk39PtIMdUXZOO3zurSBqqt7gx25kkz/YgTbPa1v7X61u5DU+vzZ+kdkmqP51niXRLctZHkyIjnlgffmVG0itw==" }, "@feelpp/asciidoctor-extensions": { - "version": "1.0.0-rc.11", - "resolved": "https://registry.npmjs.org/@feelpp/asciidoctor-extensions/-/asciidoctor-extensions-1.0.0-rc.11.tgz", - "integrity": "sha512-cqqlN5Srs8hqQ1QG14fQaZJB12RVrY9TG/X4ACtAp/3ms7GlDIQxxw2zR80p7f4n2V0XmRv3cKZ6IB3cT8Zb+A==" + "version": "1.0.0-rc.13", + "resolved": "https://registry.npmjs.org/@feelpp/asciidoctor-extensions/-/asciidoctor-extensions-1.0.0-rc.13.tgz", + "integrity": "sha512-brDULLvvzMDtrmFfDXT9nZoPIpzJ+cIwsWVnJ6rQVz0HZkd1MDUm+LcsDMduwUbezFTRpuu98sYTiIpbZmEfAw==" }, "@iarna/toml": { "version": "2.2.5", @@ -3688,12 +3596,6 @@ "fastq": "^1.6.0" } }, - "@types/node": { - "version": "14.14.37", - "resolved": "https://registry.npmjs.org/@types/node/-/node-14.14.37.tgz", - "integrity": "sha512-XYmBiy+ohOR4Lh5jE379fV2IU+6Jn4g5qASinhitfyO71b/sCo6MKsMLF5tc7Zf2CE8hViVQyYSobJNke8OvUw==", - "dev": true - }, "abbrev": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", @@ -3877,20 +3779,12 @@ "through2-sink": "^1.0.0", "through2-spy": "^1.2.0", "tough-cookie": "^2.3.1" - }, - "dependencies": { - "extend": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-2.0.2.tgz", - "integrity": "sha512-AgFD4VU+lVLP6vjnlNfF7OeInLTyeyckCNPEsuxz1vi786UuK/nk6ynPuhn/h+Ju9++TQyr5EpLRI14fc1QtTQ==", - "dev": true - } } }, "bluebird": { "version": "2.11.0", "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz", - 
"integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE=", + "integrity": "sha512-UfFSr22dmHPQqPP9XWHRhq+gWnHCYguQGkXQlbyPtW5qTnhFWA8/iXg765tH0cAjy7l/zPJ1aBTO0g5XgA7kvQ==", "dev": true }, "brace-expansion": { @@ -3911,33 +3805,24 @@ } }, "broken-link-checker": { - "version": "0.7.8", - "resolved": "https://registry.npmjs.org/broken-link-checker/-/broken-link-checker-0.7.8.tgz", - "integrity": "sha512-/zH4/nLMNKDeDH5nVuf/R6WYd0Yjnar1NpcdAO2+VlwjGKzJa6y42C03UO+imBSHwe6BefSkVi82fImE2Rb7yg==", + "version": "0.6.7", + "resolved": "https://registry.npmjs.org/broken-link-checker/-/broken-link-checker-0.6.7.tgz", + "integrity": "sha512-/j/MmMaFDUDa5pVAZnPSonu/uGaFm3ccQKgagh1akgG3B54xIINcT0fKhxE2yG4k0yjXMnoMort5Bf+1APHePQ==", "dev": true, "requires": { "bhttp": "^1.2.1", "calmcard": "~0.1.1", - "chalk": "^1.1.3", + "chalk": "^1.1.1", "char-spinner": "^1.0.1", - "condense-whitespace": "^1.0.0", "default-user-agent": "^1.0.0", - "errno": "~0.1.4", - "extend": "^3.0.0", - "http-equiv-refresh": "^1.0.0", - "humanize-duration": "^3.9.1", - "is-stream": "^1.0.1", - "is-string": "^1.0.4", - "limited-request-queue": "^2.0.0", - "link-types": "^1.1.0", - "maybe-callback": "^2.1.0", + "limited-request-queue": "^1.0.1", + "maybe-callback": "^1.0.0", "nopter": "~0.3.0", - "parse5": "^3.0.2", - "robot-directives": "~0.3.0", - "robots-txt-guard": "~0.1.0", - "robots-txt-parse": "~0.0.4", - "urlcache": "~0.7.0", - "urlobj": "0.0.11" + "object-assign": "^4.0.1", + "parse5": "^1.5.0", + "urlcache": "~0.5.0", + "urlobj": "0.0.8", + "void-elements": "^2.0.1" } }, "buffer": { @@ -3955,9 +3840,9 @@ "integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==" }, "buffer-from": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz", - "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==", + "version": "1.1.2", + "resolved": 
"https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", "dev": true }, "cache-directory": { @@ -3978,10 +3863,30 @@ "get-intrinsic": "^1.0.2" } }, + "call-bind-apply-helpers": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.1.tgz", + "integrity": "sha512-BhYE+WDaywFg2TBWYNXAE+8B1ATnThNBqXHP5nQu0jWJdVvY2hvkpyB3qOmtmDePiS5/BDQ8wASEWGMWRG148g==", + "dev": true, + "requires": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + } + }, + "call-bound": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.3.tgz", + "integrity": "sha512-YTd+6wGlNlPxSuri7Y6X8tY2dmm12UMH66RpKMhiX6rsk5wXXnYgbUcOt8kiS31/AjfoTOvCsE+w8nZQLQnzHA==", + "dev": true, + "requires": { + "call-bind-apply-helpers": "^1.0.1", + "get-intrinsic": "^1.2.6" + } + }, "caller-path": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/caller-path/-/caller-path-0.1.0.tgz", - "integrity": "sha1-lAhe9jWB7NPaqSREqP6U6CV3dR8=", + "integrity": "sha512-UJiE1otjXPF5/x+T3zTnSFiTOEmJoGTD9HmBoxnCUwho61a2eSNn/VwtwuIBDAo2SEOv1AJ7ARI5gCmohFLu/g==", "dev": true, "requires": { "callsites": "^0.2.0" @@ -3990,13 +3895,19 @@ "callsites": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-0.2.0.tgz", - "integrity": "sha1-r6uWJikQp/M8GaV3WCXGnzTjUMo=", + "integrity": "sha512-Zv4Dns9IbXXmPkgRRUjAaJQgfN4xX5p6+RQFhWUqscdvvK2xK/ZL8b3IXIJsj+4sD+f24NwnWy2BY8AJ82JB0A==", "dev": true }, "calmcard": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/calmcard/-/calmcard-0.1.1.tgz", - "integrity": "sha1-NawrZkkrDtOa0GqJOg/25hEk5Ek=", + "integrity": "sha512-Ol6L8HQ0n1qFAOmuXcXcOXVpcQ+x5/N8ff+i6G76a4sDjRVtCWmpsG6RorloS1vVG9O0IGq9l7sqaxBnwG/LUA==", + "dev": true + }, + "camelcase": { + "version": "1.2.1", + "resolved": 
"https://registry.npmjs.org/camelcase/-/camelcase-1.2.1.tgz", + "integrity": "sha512-wzLkDa4K/mzI1OSITC+DUyjgIl/ETNHE9QvYgy6J6Jvqyyz4C0Xfd+lQhb19sX2jMpZV4IssUn0VDVmglV+s4g==", "dev": true }, "chalk": { @@ -4010,20 +3921,12 @@ "has-ansi": "^2.0.0", "strip-ansi": "^3.0.0", "supports-color": "^2.0.0" - }, - "dependencies": { - "escape-string-regexp": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", - "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=", - "dev": true - } } }, "char-spinner": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/char-spinner/-/char-spinner-1.0.1.tgz", - "integrity": "sha1-5upnvSR+EHESmDt6sEee02KAAIE=", + "integrity": "sha512-acv43vqJ0+N0rD+Uw3pDHSxP30FHrywu2NO6/wBaHChJIizpDeBUd6NjqhNhy9LGaEAhZAXn46QzmlAvIWd16g==", "dev": true }, "clean-git-ref": { @@ -4032,9 +3935,9 @@ "integrity": "sha512-bLSptAy2P0s6hU4PzuIMKmMJJSE6gLXGH1cntDu7bWJUksvuM+7ReOK61mozULErYvP6a15rnYl0zFDef+pyPw==" }, "cli-table": { - "version": "0.3.6", - "resolved": "https://registry.npmjs.org/cli-table/-/cli-table-0.3.6.tgz", - "integrity": "sha512-ZkNZbnZjKERTY5NwC2SeMeLeifSPq/pubeRoTpdr3WchLlnZg6hEgvHkK5zL7KNFdd9PmHN8lxrENUwI3cE8vQ==", + "version": "0.3.11", + "resolved": "https://registry.npmjs.org/cli-table/-/cli-table-0.3.11.tgz", + "integrity": "sha512-IqLQi4lO0nIB4tcdTpN4LCB9FI3uqrJZK7RC515EnhZ6qBaglkIgICb1wjeAqpdoOabm1+SuQtkXIPdYC93jhQ==", "dev": true, "requires": { "colors": "1.0.3" @@ -4096,13 +3999,13 @@ "colors": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/colors/-/colors-1.0.3.tgz", - "integrity": "sha1-BDP0TYCWgP3rYO0mDxsMJi6CpAs=", + "integrity": "sha512-pFGrxThWcWQ2MsAz6RtgeWe4NK2kUE1WfsrvvlctdII745EW9I0yflqhe7++M5LEc7bV2c/9/5zc8sFcpL0Drw==", "dev": true }, "combined-stream2": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/combined-stream2/-/combined-stream2-1.1.2.tgz", - "integrity": "sha1-9uFLegFWZvjHsKH6xQYkAWSsNXA=", + "integrity": 
"sha512-sVqUHJmbdVm+HZWy4l34BPLczxI4fltN4Bm2vcvASsqBIXW4xFb4TRkwM8bw/UUXK9/OdHdAwi2cRYVEKrxzbg==", "dev": true, "requires": { "bluebird": "^2.8.1", @@ -4133,9 +4036,9 @@ }, "dependencies": { "readable-stream": { - "version": "2.3.7", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz", - "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==", + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", "dev": true, "requires": { "core-util-is": "~1.0.0", @@ -4164,12 +4067,6 @@ } } }, - "condense-whitespace": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/condense-whitespace/-/condense-whitespace-1.0.0.tgz", - "integrity": "sha1-g3bZjvAo5sss0kaOKM5CxcZasak=", - "dev": true - }, "convict": { "version": "6.2.4", "resolved": "https://registry.npmjs.org/convict/-/convict-6.2.4.tgz", @@ -4180,9 +4077,9 @@ } }, "core-util-is": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", - "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=", + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", "dev": true }, "corser": { @@ -4221,7 +4118,7 @@ "default-user-agent": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/default-user-agent/-/default-user-agent-1.0.0.tgz", - "integrity": "sha1-FsRu/cq6PtxF8k8r1IaLAbfCrcY=", + "integrity": "sha512-bDF7bg6OSNcSwFWPu4zYKpVkJZQYVrAANMYB8bc9Szem1D0yKdm4sa/rOCs2aC9+2GMqQ7KnwtZRvDhmLF0dXw==", "dev": true, "requires": { "os-name": "~1.0.3" @@ -4239,7 +4136,7 @@ "dev-null": { "version": "0.1.1", "resolved": 
"https://registry.npmjs.org/dev-null/-/dev-null-0.1.1.tgz", - "integrity": "sha1-WiBc48Ky73e2I41roXnrdMag6Bg=", + "integrity": "sha512-nMNZG0zfMgmdv8S5O0TM5cpwNbGKRGPCxVsr0SmA3NZZy9CYBbuNLL0PD3Acx9e5LIUgwONXtM9kM6RlawPxEQ==", "dev": true }, "diff3": { @@ -4247,11 +4144,16 @@ "resolved": "https://registry.npmjs.org/diff3/-/diff3-0.0.3.tgz", "integrity": "sha512-iSq8ngPOt0K53A6eVr4d5Kn6GNrM2nQZtC740pzIriHtn4pOQ2lyzEXQMBeVcWERN0ye7fhBsk9PbLLQOnUx/g==" }, - "duplexer": { - "version": "0.1.2", - "resolved": "https://registry.npmjs.org/duplexer/-/duplexer-0.1.2.tgz", - "integrity": "sha512-jtD6YG370ZCIi/9GTaJKQxWTZD045+4R4hTk/x1UyoqadyJ9x9CgSi1RlVDQF8U2sxLLSnFkCaMihqljHIWgMg==", - "dev": true + "dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "dev": true, + "requires": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + } }, "e-prime": { "version": "0.10.4", @@ -4275,29 +4177,47 @@ "eol": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eol/-/eol-0.2.0.tgz", - "integrity": "sha1-L22whqJDpG4+Xb0OE0Ncfr6/Cd0=", + "integrity": "sha512-LCBxmDyUDh5pAXALohe9NCwyedyECwpFrcebZyW/XNTzn4WZFY3cX9MdkrJQu71ojEoHqcsciqFG7d3WQA+1Ew==", "dev": true }, - "errno": { - "version": "0.1.8", - "resolved": "https://registry.npmjs.org/errno/-/errno-0.1.8.tgz", - "integrity": "sha512-dJ6oBr5SQ1VSd9qkk7ByRgb/1SH4JZjCHSW/mr63/QcXO9zLVxvJ6Oy13nio03rxpSnVDDjFor75SjVeZWPW/A==", - "dev": true, - "requires": { - "prr": "~1.0.1" - } - }, "errors": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/errors/-/errors-0.2.0.tgz", - "integrity": "sha1-D1Hoidqj4RsZ5xhtEfEEqmbrJAM=", + "integrity": "sha512-W0w4yTo+twP/wGTF25kBGAXroAHzvxZvEDHJsCixlWS8lf8li0aZDhT+hz0mHQwsSW5esD5jyTQkaqA0ZHF83A==", + "dev": true + }, + "es-define-property": { + "version": "1.0.1", + 
"resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", "dev": true }, + "es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "dev": true + }, + "es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "dev": true, + "requires": { + "es-errors": "^1.3.0" + } + }, "escalade": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.1.tgz", "integrity": "sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw==" }, + "escape-string-regexp": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", + "integrity": "sha512-vbRorB5FUQWvla16U8R/qgaFIya2qGzwDrNmCZuYKrbdSUMG6I1ZCGQRefkRVhuOkIGVne7BQ35DSfo1qvJqFg==", + "dev": true + }, "event-target-shim": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", @@ -4315,9 +4235,9 @@ "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==" }, "extend": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", - "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-2.0.2.tgz", + "integrity": "sha512-AgFD4VU+lVLP6vjnlNfF7OeInLTyeyckCNPEsuxz1vi786UuK/nk6ynPuhn/h+Ju9++TQyr5EpLRI14fc1QtTQ==", "dev": true }, 
"fast-copy": { @@ -4369,9 +4289,9 @@ } }, "follow-redirects": { - "version": "1.15.2", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.2.tgz", - "integrity": "sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==", + "version": "1.15.9", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.9.tgz", + "integrity": "sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==", "dev": true }, "form-data2": { @@ -4390,7 +4310,7 @@ "form-fix-array": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/form-fix-array/-/form-fix-array-1.0.0.tgz", - "integrity": "sha1-oTR6R+UxF6t7zb8+Lz7JHGZ2m8g=", + "integrity": "sha512-f3qXI4CcvW7/6vqTKwCftcrFgfEBfWYPQTfvXrHYevHbJVfc107/SVvXvwUAYMaUAHdvu9ENQvLufJKphQI14w==", "dev": true }, "fs.realpath": { @@ -4399,9 +4319,9 @@ "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==" }, "function-bind": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", - "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==", + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", "dev": true }, "get-caller-file": { @@ -4410,14 +4330,31 @@ "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==" }, "get-intrinsic": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.1.1.tgz", - "integrity": "sha512-kWZrnVM42QCiEA2Ig1bG8zjoIMOgxWwYCEeNdwY6Tv/cOSeGpcoX4pXHfKUxNKVoArnrEr2e9srnAxxGIraS9Q==", + "version": "1.2.7", + "resolved": 
"https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.7.tgz", + "integrity": "sha512-VW6Pxhsrk0KAOqs3WEd0klDiF/+V7gQOpAvY1jVU/LHmaD/kQO4523aiJuikX/QAKYiW6x8Jh+RJej1almdtCA==", "dev": true, "requires": { - "function-bind": "^1.1.1", - "has": "^1.0.3", - "has-symbols": "^1.0.1" + "call-bind-apply-helpers": "^1.0.1", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "function-bind": "^1.1.2", + "get-proto": "^1.0.0", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + } + }, + "get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "dev": true, + "requires": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" } }, "glob": { @@ -4441,6 +4378,12 @@ "is-glob": "^4.0.1" } }, + "gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "dev": true + }, "handlebars": { "version": "4.7.8", "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.8.tgz", @@ -4462,15 +4405,6 @@ "typeof-article": "^0.1.1" } }, - "has": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", - "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==", - "dev": true, - "requires": { - "function-bind": "^1.1.1" - } - }, "has-ansi": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/has-ansi/-/has-ansi-2.0.0.tgz", @@ -4487,11 +4421,29 @@ "dev": true }, "has-symbols": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.2.tgz", - "integrity": 
"sha512-chXa79rL/UC2KlX17jo3vRGz0azaWEx5tGqZg5pO3NUyEJVB17dMruQlzCCOfUvElghKcm5194+BCRvi2Rv/Gw==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", "dev": true }, + "has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "dev": true, + "requires": { + "has-symbols": "^1.0.3" + } + }, + "hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "dev": true, + "requires": { + "function-bind": "^1.1.2" + } + }, "he": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", @@ -4517,12 +4469,6 @@ "whatwg-encoding": "^2.0.0" } }, - "http-equiv-refresh": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/http-equiv-refresh/-/http-equiv-refresh-1.0.0.tgz", - "integrity": "sha1-jsU4hmBCvl8/evpzfRmNlL6xsHs=", - "dev": true - }, "http-proxy": { "version": "1.18.1", "resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.18.1.tgz", @@ -4585,12 +4531,6 @@ } } }, - "humanize-duration": { - "version": "3.25.1", - "resolved": "https://registry.npmjs.org/humanize-duration/-/humanize-duration-3.25.1.tgz", - "integrity": "sha512-P+dRo48gpLgc2R9tMRgiDRNULPKCmqFYgguwqOO2C0fjO35TgdURDQDANSR1Nt92iHlbHGMxOTnsB8H8xnMa2Q==", - "dev": true - }, "iconv-lite": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", @@ -4664,28 +4604,20 @@ "integrity": "sha512-2rRIahhZr2UWb45fIOuvZGpFtz0TyOZLf32KxBbSoUCeZR495zCKlWUKKUByk3geS2eAs7ZAABt0Y/Rx0GiQGA==", "dev": true }, - "is-stream": { - "version": 
"1.1.0", - "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz", - "integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=", - "dev": true - }, "is-string": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.0.5.tgz", - "integrity": "sha512-buY6VNRjhQMiF1qWDouloZlQbRhDPCebwxSjxMjxgemYT46YMd2NR0/H+fBhEfWX4A/w9TBJ+ol+okqJKFE6vQ==", - "dev": true + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.1.1.tgz", + "integrity": "sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==", + "dev": true, + "requires": { + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" + } }, "isarray": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", - "dev": true - }, - "isbot": { - "version": "2.5.7", - "resolved": "https://registry.npmjs.org/isbot/-/isbot-2.5.7.tgz", - "integrity": "sha512-8P+oGrRDvuCpDdovK9oD4skHmSXu56bsK17K2ovXrkW7Ic4H9Y4AqnUUqlXqZxcqQ2358kid9Rb+fbLH5yeeUw==", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", "dev": true }, "isomorphic-git": { @@ -4730,21 +4662,16 @@ "integrity": "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw==" }, "limited-request-queue": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/limited-request-queue/-/limited-request-queue-2.0.0.tgz", - "integrity": "sha1-FMfBILE4BgsZoqEDCrr2aTVyZQ0=", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/limited-request-queue/-/limited-request-queue-1.0.1.tgz", + "integrity": "sha512-D+QsNiBdTZiR6BADlzPrKYtEn9Pxj/WMFqSWjxnFFfqReKls7/DPQP/qyp6wbBoIhJUJvdmdUw/bU1SAW4kt2w==", "dev": true, "requires": { "is-browser": "^2.0.1", - "parse-domain": "~0.2.0" + "object-assign": "^4.0.1", + "parse-domain": "~0.1.2" } }, - "link-types": { - "version": "1.1.0", - 
"resolved": "https://registry.npmjs.org/link-types/-/link-types-1.1.0.tgz", - "integrity": "sha1-r2XlnbUucMH/sYrEw8sFa/55aDA=", - "dev": true - }, "lodash": { "version": "4.17.21", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", @@ -4761,20 +4688,16 @@ "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", "dev": true }, - "lru-cache": { - "version": "4.1.5", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.5.tgz", - "integrity": "sha512-sWZlbEP2OsHNkXrMl5GYk/jKk70MBng6UU4YI/qGDYbgf6YbP4EvmqISbXCoJiRKs+1bSpFHVgQxvJ17F2li5g==", - "dev": true, - "requires": { - "pseudomap": "^1.0.2", - "yallist": "^2.1.2" - } + "math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "dev": true }, "maybe-callback": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/maybe-callback/-/maybe-callback-2.1.0.tgz", - "integrity": "sha1-ivoLp7aRp6sSPn8S9l4yu10fgkM=", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/maybe-callback/-/maybe-callback-1.0.0.tgz", + "integrity": "sha512-7/dLp+T2Z9keggtwVEyjdyW2uuW0XQqKJopHppYZm4BFjOjpa050eo6475XHcx9uf+nDUm9vyyIKTElPD73/OQ==", "dev": true }, "merge2": { @@ -4855,7 +4778,7 @@ "ms": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", "dev": true }, "multi-progress": { @@ -4901,7 +4824,7 @@ "nopt": { "version": "3.0.6", "resolved": "https://registry.npmjs.org/nopt/-/nopt-3.0.6.tgz", - "integrity": "sha1-xkZdvwirzU2zWTF/eaxopkayj/k=", + "integrity": "sha512-4GUt3kSEYmk4ITxzB/b9vaIDfUVWN/Ml1Fwl11IlnIG2iaJ9O6WXZ9SrYM9NLI8OCBieN2Y8SWC2oJV0RQ7qYg==", 
"dev": true, "requires": { "abbrev": "1" @@ -4910,7 +4833,7 @@ "nopter": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/nopter/-/nopter-0.3.0.tgz", - "integrity": "sha1-uWkOb6uPJWs35OfM0j4rOEUMxx8=", + "integrity": "sha512-gSYCd2Gxge0Ovnszf8fI9WkN7/PiuTTIpbeo2VhyQ3+NwJ7z3x6jiOFAMuU65JZz+BK/I1uzae2neQVGFHMClQ==", "dev": true, "requires": { "caller-path": "~0.1.0", @@ -4926,25 +4849,19 @@ "ansi-regex": { "version": "0.2.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-0.2.1.tgz", - "integrity": "sha1-DY6UaWej2BQ/k+JOKYUl/BsiNfk=", + "integrity": "sha512-sGwIGMjhYdW26/IhwK2gkWWI8DRCVO6uj3hYgHT+zD+QL1pa37tM3ujhyfcJIYSbsxp7Gxhy7zrRW/1AHm4BmA==", "dev": true }, "ansi-styles": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-1.1.0.tgz", - "integrity": "sha1-6uy/Zs1waIJ2Cy9GkVgrj1XXp94=", - "dev": true - }, - "camelcase": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-1.2.1.tgz", - "integrity": "sha1-m7UwTS4LVmmLLHWLCKPqqdqlijk=", + "integrity": "sha512-f2PKUkN5QngiSemowa6Mrk9MPCdtFiOSmibjZ+j1qhLGHHYsqZwmBMRF3IRMVXo8sybDqx2fJl2d/8OphBoWkA==", "dev": true }, "chalk": { "version": "0.5.1", "resolved": "https://registry.npmjs.org/chalk/-/chalk-0.5.1.tgz", - "integrity": "sha1-Zjs6ZItotV0EaQ1JFnqoN4WPIXQ=", + "integrity": "sha512-bIKA54hP8iZhyDT81TOsJiQvR1gW+ZYSXFaZUAvoD4wCHdbHY2actmpTE4x344ZlFqHbvoxKOaESULTZN2gstg==", "dev": true, "requires": { "ansi-styles": "^1.1.0", @@ -4954,25 +4871,25 @@ "supports-color": "^0.2.0" } }, - "escape-string-regexp": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", - "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=", - "dev": true - }, "has-ansi": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/has-ansi/-/has-ansi-0.1.0.tgz", - "integrity": "sha1-hPJlqujA5qiKEtcCKJS3VoiUxi4=", + "integrity": 
"sha512-1YsTg1fk2/6JToQhtZkArMkurq8UoWU1Qe0aR3VUHjgij4nOylSWLWAtBXoZ4/dXOmugfLGm1c+QhuD0JyedFA==", "dev": true, "requires": { "ansi-regex": "^0.2.0" } }, + "object-assign": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-2.1.1.tgz", + "integrity": "sha512-CdsOUYIh5wIiozhJ3rLQgmUTgcyzFwZZrqhkKhODMoGtPKM+wt0h0CNIoauJWMsS9822EdzPsF/6mb4nLvPN5g==", + "dev": true + }, "strip-ansi": { "version": "0.3.0", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-0.3.0.tgz", - "integrity": "sha1-JfSOoiynkYfzF0pNuHWTR7sSYiA=", + "integrity": "sha512-DerhZL7j6i6/nEnVG0qViKXI0OKouvvpsAiaj7c+LfqZZZxdwZtv8+UiA/w4VUJpT8UzX0pR1dcHOii1GbmruQ==", "dev": true, "requires": { "ansi-regex": "^0.2.1" @@ -4981,15 +4898,15 @@ "supports-color": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-0.2.0.tgz", - "integrity": "sha1-2S3iaU6z9nMjlz1649i1W0wiGQo=", + "integrity": "sha512-tdCZ28MnM7k7cJDJc7Eq80A9CsRFAAOZUy41npOZCs++qSjfIy7o5Rh46CBk+Dk5FbKJ33X3Tqg4YrV07N5RaA==", "dev": true } } }, "object-assign": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-2.1.1.tgz", - "integrity": "sha1-Q8NuXVaf+OSBbE76i+AtJpZ8GKo=", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", "dev": true }, "object-inspect": { @@ -5038,23 +4955,17 @@ "os-name": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/os-name/-/os-name-1.0.3.tgz", - "integrity": "sha1-GzefZINa98Wn9JizV8uVIVwVnt8=", + "integrity": "sha512-f5estLO2KN8vgtTRaILIgEGBoBrMnZ3JQ7W9TMZCnOIGwHe8TRGSpcagnWDo+Dfhd/z08k9Xe75hvciJJ8Qaew==", "dev": true, "requires": { "osx-release": "^1.0.0", "win-release": "^1.0.0" } }, - "os-tmpdir": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", - 
"integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=", - "dev": true - }, "osx-release": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/osx-release/-/osx-release-1.1.0.tgz", - "integrity": "sha1-8heRGigTaUmvG/kwiyQeJzfTzWw=", + "integrity": "sha512-ixCMMwnVxyHFQLQnINhmIpWqXIfS2YOXchwQrk+OFzmo6nDjQ0E4KXAyyUh0T0MZgV4bUhkRrAbVqlE4yLVq4A==", "dev": true, "requires": { "minimist": "^1.1.0" @@ -5066,19 +4977,16 @@ "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==" }, "parse-domain": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/parse-domain/-/parse-domain-0.2.2.tgz", - "integrity": "sha1-GImJseLnOYv/PE9P19yhV+tR+sE=", + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/parse-domain/-/parse-domain-0.1.2.tgz", + "integrity": "sha512-3P8WcWUQY+W3jH637ozSr/+pMAv4RZDAobK0ADOTayzYf+BBnmaqmiknyNLyDCEHRNN88KFDGapWaVT9ix8VqQ==", "dev": true }, "parse5": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/parse5/-/parse5-3.0.3.tgz", - "integrity": "sha512-rgO9Zg5LLLkfJF9E6CCmXlSE4UVceloys8JrFqCcHloC3usd/kJCyPDwH2SOlzix2j3xaP9sUX3e8+kvkuleAA==", - "dev": true, - "requires": { - "@types/node": "*" - } + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-1.5.1.tgz", + "integrity": "sha512-w2jx/0tJzvgKwZa58sj2vAYq/S/K1QJfIB3cWYea/Iu1scFPDQQ3IQiVZTHWtRBwAjv2Yd7S/xeZf3XqLDb3bA==", + "dev": true }, "passive-voice": { "version": "0.1.0", @@ -5236,23 +5144,14 @@ "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==" }, - "prr": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/prr/-/prr-1.0.1.tgz", - "integrity": "sha1-0/wRS6BplaRexok/SEzrHXj19HY=", - "dev": true - }, - "pseudomap": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", - "integrity": 
"sha1-8FKijacOYYkX7wqKw0wa5aaChrM=", - "dev": true - }, "psl": { - "version": "1.8.0", - "resolved": "https://registry.npmjs.org/psl/-/psl-1.8.0.tgz", - "integrity": "sha512-RIdOzyoavK+hA18OGGWDqUTsCLhtA7IcZ/6NCs4fFJaHBDab+pDDmDIByWFRQJq2Cd7r1OoQxBGKOaztq+hjIQ==", - "dev": true + "version": "1.15.0", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.15.0.tgz", + "integrity": "sha512-JZd3gMVBAVQkSs6HdNZo9Sdo0LNcQeMNP3CozBJb3JYC/QUYZTnKxP+f8oWRX4rHP5EurWxqAHTSwUCjlNKa1w==", + "dev": true, + "requires": { + "punycode": "^2.3.1" + } }, "pump": { "version": "3.0.2", @@ -5264,9 +5163,9 @@ } }, "punycode": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", - "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", "dev": true }, "qs": { @@ -5339,34 +5238,6 @@ "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==" }, - "robot-directives": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/robot-directives/-/robot-directives-0.3.0.tgz", - "integrity": "sha1-F0+x/8KpuXh3MB6HyJs5X0KdH2U=", - "dev": true, - "requires": { - "isbot": "^2.0.0", - "useragent": "^2.1.8" - } - }, - "robots-txt-guard": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/robots-txt-guard/-/robots-txt-guard-0.1.1.tgz", - "integrity": "sha512-6+nGkE6c2dI9/dmhmNcoMKVwJxlA6sgN/XNo0rm6LLdA0hnj4YkpgrZdhMPl58gJkAqeiHlf4+8tJcLM1tv1Ew==", - "dev": true - }, - "robots-txt-parse": { - "version": "0.0.4", - "resolved": "https://registry.npmjs.org/robots-txt-parse/-/robots-txt-parse-0.0.4.tgz", - "integrity": "sha1-99HzI/eZIdfpxsS70lBI9umBDXE=", - 
"dev": true, - "requires": { - "bluebird": "^2.3.5", - "split": "^0.3.0", - "stream-combiner": "^0.2.1", - "through": "^2.3.4" - } - }, "run-parallel": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", @@ -5408,9 +5279,9 @@ "integrity": "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==" }, "semver": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", - "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==", + "version": "5.7.2", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz", + "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==", "dev": true }, "sha.js": { @@ -5466,15 +5337,6 @@ "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==" }, - "split": { - "version": "0.3.3", - "resolved": "https://registry.npmjs.org/split/-/split-0.3.3.tgz", - "integrity": "sha1-zQ7qXmOiEd//frDwkcQTPi0N0o8=", - "dev": true, - "requires": { - "through": "2" - } - }, "split2": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/split2/-/split2-4.2.0.tgz", @@ -5483,23 +5345,13 @@ "splitargs": { "version": "0.0.7", "resolved": "https://registry.npmjs.org/splitargs/-/splitargs-0.0.7.tgz", - "integrity": "sha1-/p965lc3GzOxDLgNoUPPgknPazs=", + "integrity": "sha512-UUFYD2oWbNwULH6WoVtLUOw8ch586B+HUqcsAjjjeoBQAM1bD4wZRXu01koaxyd8UeYpybWqW4h+lO1Okv40Tg==", "dev": true }, - "stream-combiner": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/stream-combiner/-/stream-combiner-0.2.2.tgz", - "integrity": "sha1-rsjLrBd7Vrb0+kec7YwZEs7lKFg=", - "dev": true, - "requires": { - "duplexer": "~0.1.1", - "through": "~2.3.4" - } - }, "stream-length": { "version": "1.0.2", 
"resolved": "https://registry.npmjs.org/stream-length/-/stream-length-1.0.2.tgz", - "integrity": "sha1-gnfzy+5JpNqrz9tOL0qbXp8snwA=", + "integrity": "sha512-aI+qKFiwoDV4rsXiS7WRoCt+v2RX1nUj17+KJC5r2gfh5xoSJIfP6Y3Do/HtvesFcTSWthIuJ3l1cvKQY/+nZg==", "dev": true, "requires": { "bluebird": "^2.6.2" @@ -5593,32 +5445,26 @@ "real-require": "^0.2.0" } }, - "through": { - "version": "2.3.8", - "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", - "integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=", - "dev": true - }, - "through2-sink": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/through2-sink/-/through2-sink-1.0.0.tgz", - "integrity": "sha1-XxBruh1zMNrTy6XAqxhjkjJWw5k=", + "through2": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/through2/-/through2-0.5.1.tgz", + "integrity": "sha512-zexCrAOTbjkBCXGyozn7hhS3aEaqdrc59mAD2E3dKYzV1vFuEGQ1hEDJN2oQMQFwy4he2zyLqPZV+AlfS8ZWJA==", "dev": true, "requires": { - "through2": "~0.5.1", + "readable-stream": "~1.0.17", "xtend": "~3.0.0" }, "dependencies": { "isarray": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", - "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=", + "integrity": "sha512-D2S+3GLxWH+uhrNEcoh/fnmYeP8E8/zHl644d/jdA0g2uyXvy3sb0qxotE+ne0LtccHknQzWwZEzhak7oJ0COQ==", "dev": true }, "readable-stream": { "version": "1.0.34", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", - "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=", + "integrity": "sha512-ok1qVCJuRkNmvebYikljxJA/UEsKwLl2nI1OmaqAu4/UE+h0wKCHok4XkL/gvi39OacXvw59RJUOFUkDib2rHg==", "dev": true, "requires": { "core-util-is": "~1.0.0", @@ -5630,86 +5476,29 @@ "string_decoder": { "version": "0.10.31", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", - "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", - "dev": true - }, - "through2": { - "version": "0.5.1", - "resolved": 
"https://registry.npmjs.org/through2/-/through2-0.5.1.tgz", - "integrity": "sha1-390BLrnHAOIyP9M084rGIqs3Lac=", - "dev": true, - "requires": { - "readable-stream": "~1.0.17", - "xtend": "~3.0.0" - } - }, - "xtend": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-3.0.0.tgz", - "integrity": "sha1-XM50B7r2Qsunvs2laBEcST9ZZlo=", + "integrity": "sha512-ev2QzSzWPYmy9GuqfIVildA4OdcGLeFZQrq5ys6RtiuF+RQQiZWr8TZNyAcuVXyQRYfEO+MsoB/1BuQVhOJuoQ==", "dev": true } } }, - "through2-spy": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/through2-spy/-/through2-spy-1.2.0.tgz", - "integrity": "sha1-nIkcqcpA4eHkzzHhrFf5TMnSSMs=", + "through2-sink": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/through2-sink/-/through2-sink-1.0.0.tgz", + "integrity": "sha512-9HvIHIEXZ5YgstQx3vsu4U/QQ/n7X5RHlXf8MsfSEnEzeUFbX9BHBWmlwdQ1b6CzDlUEDwjFnkSIxpJZ6qP+0Q==", "dev": true, "requires": { "through2": "~0.5.1", "xtend": "~3.0.0" - }, - "dependencies": { - "isarray": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", - "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=", - "dev": true - }, - "readable-stream": { - "version": "1.0.34", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", - "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=", - "dev": true, - "requires": { - "core-util-is": "~1.0.0", - "inherits": "~2.0.1", - "isarray": "0.0.1", - "string_decoder": "~0.10.x" - } - }, - "string_decoder": { - "version": "0.10.31", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", - "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", - "dev": true - }, - "through2": { - "version": "0.5.1", - "resolved": "https://registry.npmjs.org/through2/-/through2-0.5.1.tgz", - "integrity": "sha1-390BLrnHAOIyP9M084rGIqs3Lac=", - "dev": true, - "requires": { - "readable-stream": "~1.0.17", - "xtend": "~3.0.0" - } - }, - "xtend": { - "version": 
"3.0.0", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-3.0.0.tgz", - "integrity": "sha1-XM50B7r2Qsunvs2laBEcST9ZZlo=", - "dev": true - } } }, - "tmp": { - "version": "0.0.33", - "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz", - "integrity": "sha512-jRCJlojKnZ3addtTOjdIqoRuPEKBvNXcGYqzO6zWZX8KfKEpnGY5jfggJQ3EjKuu8D4bJRr0y+cYJFmYbImXGw==", + "through2-spy": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/through2-spy/-/through2-spy-1.2.0.tgz", + "integrity": "sha512-QJ/32YGXA8K/PlkT/7kJOfWZREdlwoCcdVOL/wZHhV0JQJlwUQeDi4QZrEUEAdIL6Kjb9BRdHHz65zg+rzFxuA==", "dev": true, "requires": { - "os-tmpdir": "~1.0.2" + "through2": "~0.5.1", + "xtend": "~3.0.0" } }, "to-regex-range": { @@ -5739,7 +5528,7 @@ "typedarray": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", - "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=", + "integrity": "sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA==", "dev": true }, "typeof-article": { @@ -5787,41 +5576,24 @@ "dev": true }, "urlcache": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/urlcache/-/urlcache-0.7.0.tgz", - "integrity": "sha512-xOW4t6wJDT07+VunsHwePemyXXRidCSOZ/1RIILJi2XnB+81FA5H0MRvS63/7joTWjGLajcJJGvR5odpbkV6hw==", + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/urlcache/-/urlcache-0.5.0.tgz", + "integrity": "sha512-65SRCvjp3gX2CM0XlG47+vbJENarEWuKKJRZOLMLquZOW55DgeBS7bWaOH+QPVK/4fArJjkBZH9EtvF4iQMWJw==", "dev": true, "requires": { - "urlobj": "0.0.11" + "object-assign": "^4.0.1", + "urlobj": "0.0.8" } }, "urlobj": { - "version": "0.0.11", - "resolved": "https://registry.npmjs.org/urlobj/-/urlobj-0.0.11.tgz", - "integrity": "sha512-Ncck0WWtuFBbZhSYwKjK1AU2V51V98P/KHUPkaEc+mFy4xkpAHFNyVQT+S5SgtsJAr94e4wiKUucJSfasV2kBw==", + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/urlobj/-/urlobj-0.0.8.tgz", + "integrity": 
"sha512-+3lJQv5fXSpo1gAd/FmFql66ZMqfAEU0r4IE7142VhELNKL+Hhg/BI11m6Sbcr54Vm9BAtIN4+zK4P5y2uhJiw==", "dev": true, "requires": { "is-object": "^1.0.1", "is-string": "^1.0.4", - "object-assign": "^4.1.1" - }, - "dependencies": { - "object-assign": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", - "integrity": "sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM=", - "dev": true - } - } - }, - "useragent": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/useragent/-/useragent-2.3.0.tgz", - "integrity": "sha512-4AoH4pxuSvHCjqLO04sU6U/uE65BYza8l/KKBS0b0hnUPWi+cQ2BpeTEwejCSx9SPV5/U03nniDTrWx5NrmKdw==", - "dev": true, - "requires": { - "lru-cache": "4.1.x", - "tmp": "0.0.x" + "object-assign": "^4.0.1" } }, "util-deprecate": { @@ -5832,7 +5604,7 @@ "uuid": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/uuid/-/uuid-2.0.3.tgz", - "integrity": "sha1-Z+LoY3lyFVMN/zGOW/nc6/1Hsho=", + "integrity": "sha512-FULf7fayPdpASncVy4DLh3xydlXEJJpvIELjYjNeQWYUZ9pclcpvCZSr2gkmN2FrrGcI7G/cJsIEwk5/8vfXpg==", "dev": true }, "vinyl": { @@ -5847,6 +5619,12 @@ "teex": "^1.0.1" } }, + "void-elements": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/void-elements/-/void-elements-2.0.1.tgz", + "integrity": "sha512-qZKX4RnBzH2ugr8Lxa7x+0V6XD9Sb/ouARtiasEQCHB1EVU4NXtmHsDDrx1dO4ne5fc3J6EW05BP1Dl0z0iung==", + "dev": true + }, "weasel-words": { "version": "0.1.1", "resolved": "https://registry.npmjs.org/weasel-words/-/weasel-words-0.1.1.tgz", @@ -5865,7 +5643,7 @@ "win-release": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/win-release/-/win-release-1.1.1.tgz", - "integrity": "sha1-X6VeAr58qTTt/BJmVjLoSbcuUgk=", + "integrity": "sha512-iCRnKVvGxOQdsKhcQId2PXV1vV3J/sDPXKA4Oe9+Eti2nb2ESEsYHRYls/UjoUW3bIc5ZDO8dTH50A/5iVN+bw==", "dev": true, "requires": { "semver": "^5.0.1" @@ -5943,17 +5721,17 @@ "resolved": "https://registry.npmjs.org/xdg-basedir/-/xdg-basedir-3.0.0.tgz", "integrity": 
"sha512-1Dly4xqlulvPD3fZUQJLY+FUIeqN3N2MM3uqe4rCJftAvOjFa3jFGfctOgluGx4ahPbUCsZkmJILiP0Vi4T6lQ==" }, + "xtend": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-3.0.0.tgz", + "integrity": "sha512-sp/sT9OALMjRW1fKDlPeuSZlDQpkqReA0pyJukniWbTGoEKefHxhGJynE3PNhUMlcM8qWIjPwecwCw4LArS5Eg==", + "dev": true + }, "y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==" }, - "yallist": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-2.1.2.tgz", - "integrity": "sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=", - "dev": true - }, "yargs": { "version": "16.2.0", "resolved": "https://registry.npmjs.org/yargs/-/yargs-16.2.0.tgz", diff --git a/package.json b/package.json index 2bf854b3..63c00ff4 100644 --- a/package.json +++ b/package.json @@ -32,17 +32,17 @@ "@antora/site-generator-default": "^3.1.9", "@asciidoctor/core": "^2.2.8", "@djencks/asciidoctor-template": "^0.0.3", - "@feelpp/asciidoctor-extensions": "^1.0.0-rc.11", "@feelpp/antora-extensions": "^1.0.0-rc.2", + "@feelpp/asciidoctor-extensions": "^1.0.0-rc.13", "asciidoctor": "^2.2.6", "asciidoctor-emoji": "^0.4.2", - "asciidoctor-jupyter": "^0.7.0", + "asciidoctor-jupyter": "^0.7.0", "asciidoctor-kroki": "^0.18.1", "handlebars-utils": "^1.0.6", "node-srv": "^3.0.3" }, "devDependencies": { - "broken-link-checker": "^0.7.8", + "broken-link-checker": "^0.6.7", "http-server": "^14.1.1", "write-good": "^0.13.1" }, diff --git a/parallel-programming.code-workspace b/parallel-programming.code-workspace index 362d7c25..1f355dc7 100644 --- a/parallel-programming.code-workspace +++ b/parallel-programming.code-workspace @@ -3,5 +3,85 @@ { "path": "." 
} - ] + ], + "settings": { + "files.associations": { + "*.json": "jsonc", + "*.dat": "csv (whitespace)", + "*.pgf": "tex", + "*.pdf_tex": "tex", + ".py.in": "Python", + "*.tikz": "tex", + "*.slurm": "shellscript", + "sstream": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "compare": "cpp", + "concepts": "cpp", + "condition_variable": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "forward_list": "cpp", + "map": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "source_location": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "semaphore": "cpp", + "span": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "cinttypes": "cpp", + "typeindex": "cpp", + "typeinfo": "cpp", + "variant": "cpp", + "format": "cpp", + "__nullptr": "cpp", + "string.h": "c" + } + } } \ No newline at end of file diff --git a/site.yml b/site.yml index 47158a5a..209ca3cf 100644 --- a/site.yml +++ b/site.yml @@ -37,3 +37,8 @@ asciidoc: swimmer: Swimmer python: Python cpp: C++ + + # extensions: + # - '@feelpp/asciidoctor-extensions/src/make-dynamic-processor' + # - '@feelpp/asciidoctor-extensions/src/cmake-dynamic-processor' + # - 
'@feelpp/asciidoctor-extensions/src/cpp-dynamic-processor-collapsible' diff --git a/src/MPI/CMakeLists.txt b/src/MPI/CMakeLists.txt index e73a7e78..065843f6 100644 --- a/src/MPI/CMakeLists.txt +++ b/src/MPI/CMakeLists.txt @@ -1,7 +1,7 @@ #project(MPI_bcast) -######## A simple cmakelists.txt file for ... ############# +######## A simple cmakelists.txt file for ... ############# cmake_minimum_required(VERSION 3.17) #set(CMAKE_CXX_STANDARD 14) @@ -60,4 +60,4 @@ foreach (myfile ${WFOPenMP_SRC}) #endif() endforeach (file ${WFOPenMP_SRC}) -########### end #################################### +########### end #################################### diff --git a/src/MPI/MPI_Philosophers.c b/src/MPI/MPI_Philosophers.c index 1f8a09f5..d6f65312 100755 --- a/src/MPI/MPI_Philosophers.c +++ b/src/MPI/MPI_Philosophers.c @@ -4,8 +4,8 @@ #define NUM_MESSAGES 3*NUM_PHILOSOPHERS*NUM_DINNERS -const int NUM_DINNERS; -const int NUM_PHILOSOPHERS; +int NUM_DINNERS; +int NUM_PHILOSOPHERS; const int NEED_LEFT_CHOPSTICK = 0; const int NEED_RIGHT_CHOPSTICK = 1; const int RELEASE_CHOPSTICKS = 2; @@ -16,7 +16,7 @@ void philosopher(int rank) int leftChopstick; int rightChopstick; int dummy; - + if(rank == NUM_PHILOSOPHERS) { @@ -46,7 +46,7 @@ void philosopher(int rank) printf("Philosopher %d is eating meal %d.\n", rank, i); MPI_Send(&dummy, 1, MPI_INT, 0, RELEASE_CHOPSTICKS, MPI_COMM_WORLD); - } + } } void server() @@ -221,7 +221,7 @@ int main(int argc, char ** argv) if(argc != 3) exit(0); - + sscanf(argv[1], "%d", &NUM_PHILOSOPHERS); printf("The number of philosophers is %d\n", NUM_PHILOSOPHERS); sscanf(argv[2], "%d", &NUM_DINNERS); diff --git a/src/MPI/MPI_many_messages.c b/src/MPI/MPI_many_messages.c index f9313a19..e0338cfe 100755 --- a/src/MPI/MPI_many_messages.c +++ b/src/MPI/MPI_many_messages.c @@ -1,6 +1,7 @@ #include #include #include "mpi.h" +#include #define NUM_MESSAGES 20 @@ -21,13 +22,13 @@ int main (int argc, char** argv) if(my_rank == 0) { printf("Hello from the server!\n"); - + int 
i; char msg_to_send[100]; for(i = 0; i < NUM_MESSAGES; i++) { sprintf(msg_to_send, "Hi from server, this is msg %d\n", i); - mpi_error_code = MPI_Isend(msg_to_send, strlen(msg_to_send)+1, + mpi_error_code = MPI_Isend(msg_to_send, strlen(msg_to_send)+1, MPI_CHAR, 1, i, MPI_COMM_WORLD, &req[i]); } @@ -43,7 +44,7 @@ int main (int argc, char** argv) int i; int j; for(i = 0; i < NUM_MESSAGES; i++) - mpi_error_code = MPI_Irecv(str[i], 100, + mpi_error_code = MPI_Irecv(str[i], 100, MPI_CHAR, 0, i, MPI_COMM_WORLD, &req[i]); for(j = 0; j < 999; j++) diff --git a/src/MPI/MPI_many_messages2.c b/src/MPI/MPI_many_messages2.c index f8baab9a..1fa5c369 100755 --- a/src/MPI/MPI_many_messages2.c +++ b/src/MPI/MPI_many_messages2.c @@ -1,6 +1,8 @@ #include #include #include "mpi.h" +#include + #define NUM_MESSAGES 20 @@ -21,13 +23,13 @@ int main (int argc, char** argv) if(my_rank == 0) { printf("Hello from the server!\n"); - + int i; char msg_to_send[100]; for(i = 0; i < NUM_MESSAGES; i++) { sprintf(msg_to_send, "Hi from server, this is msg %d\n", i); - mpi_error_code = MPI_Isend(msg_to_send, strlen(msg_to_send)+1, + mpi_error_code = MPI_Isend(msg_to_send, strlen(msg_to_send)+1, MPI_CHAR, 1, i, MPI_COMM_WORLD, &req[i]); } @@ -43,7 +45,7 @@ int main (int argc, char** argv) int i; int j; for(i = 0; i < NUM_MESSAGES; i++) - mpi_error_code = MPI_Irecv(str[i], 100, + mpi_error_code = MPI_Irecv(str[i], 100, MPI_CHAR, 0, i, MPI_COMM_WORLD, &req[i]); for(j = 0; j < 999; j++)