From 6b1d059eda21c1bd421f3d352786fca2cab61954 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sat, 18 Jan 2025 05:18:37 +0100
Subject: [PATCH] Support ROCM builds from source distribution, and improve
 error handling (#1446)

* Always update both submodules to include them in sdist

Always update both submodules, irrespective of whether a CUDA
or a ROCM build is being done, to ensure that the necessary files
from both are present in sdist.  Otherwise, an attempt to perform a ROCM
build from sdist fails because of missing `composable_kernel` sources.

* Include `*.py` files from composable_kernel in sdist

Include the `*.py` files from `csrc` in sdist, to ensure that
the `generate.py` script is present.

* Replace the `os.system()` calls in `setup.py` with `subprocess.run()`

* Add error checking to `subprocess.run()` calls in `setup.py`

Add error checking to ensure that `setup.py` fails immediately if one
of the commands fails.  Otherwise, the failures result only in messages
to stderr that could be missed, and could lead to more confusing errors
later in the build process.

* Call git in `setup.py` only when working in a git repository

Call git commands in `setup.py` only when the `.git` directory is
present, indicating that we are working in a git checkout.  Otherwise,
just assert that the needed files are there.  With this, building
from a source distribution no longer attempts to call git
in an incorrect directory.
---
 MANIFEST.in |  1 +
 setup.py    | 24 ++++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 021b4d0f7..d3c4b4eda 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,6 +3,7 @@ recursive-include csrc *.h
 recursive-include csrc *.cuh
 recursive-include csrc *.cpp
 recursive-include csrc *.hpp
+recursive-include csrc *.py
 
 recursive-include flash_attn *.cu
 recursive-include flash_attn *.h
diff --git a/setup.py b/setup.py
index a802a7e65..264b0eed5 100644
--- a/setup.py
+++ b/setup.py
@@ -145,11 +145,19 @@ def validate_and_update_archs(archs):
 
 # We want this even if SKIP_CUDA_BUILD because when we run python setup.py sdist we want the .hpp
 # files included in the source distribution, in case the user compiles from source.
-if IS_ROCM:
-    if not USE_TRITON_ROCM:
-        subprocess.run(["git", "submodule", "update", "--init", "csrc/composable_kernel"])
+if os.path.isdir(".git"):
+    subprocess.run(["git", "submodule", "update", "--init", "csrc/composable_kernel"], check=True)
+    subprocess.run(["git", "submodule", "update", "--init", "csrc/cutlass"], check=True)
 else:
-    subprocess.run(["git", "submodule", "update", "--init", "csrc/cutlass"])
+    if IS_ROCM:
+        if not USE_TRITON_ROCM:
+            assert (
+                os.path.exists("csrc/composable_kernel/example/ck_tile/01_fmha/generate.py")
+            ), "csrc/composable_kernel is missing, please use source distribution or git clone"
+    else:
+        assert (
+            os.path.exists("csrc/cutlass/include/cutlass/cutlass.h")
+        ), "csrc/cutlass is missing, please use source distribution or git clone"
 
 if not SKIP_CUDA_BUILD and not IS_ROCM:
     print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
@@ -324,10 +332,10 @@ def validate_and_update_archs(archs):
         if not os.path.exists("./build"):
             os.makedirs("build")
 
-        os.system(f"{sys.executable} {ck_dir}/example/ck_tile/01_fmha/generate.py -d fwd --output_dir build --receipt 2")
-        os.system(f"{sys.executable} {ck_dir}/example/ck_tile/01_fmha/generate.py -d fwd_appendkv --output_dir build --receipt 2")
-        os.system(f"{sys.executable} {ck_dir}/example/ck_tile/01_fmha/generate.py -d fwd_splitkv --output_dir build --receipt 2")
-        os.system(f"{sys.executable} {ck_dir}/example/ck_tile/01_fmha/generate.py -d bwd --output_dir build --receipt 2")
+        subprocess.run([sys.executable, f"{ck_dir}/example/ck_tile/01_fmha/generate.py", "-d", "fwd", "--output_dir", "build", "--receipt", "2"], check=True)
+        subprocess.run([sys.executable, f"{ck_dir}/example/ck_tile/01_fmha/generate.py", "-d", "fwd_appendkv", "--output_dir", "build", "--receipt", "2"], check=True)
+        subprocess.run([sys.executable, f"{ck_dir}/example/ck_tile/01_fmha/generate.py", "-d", "fwd_splitkv", "--output_dir", "build", "--receipt", "2"], check=True)
+        subprocess.run([sys.executable, f"{ck_dir}/example/ck_tile/01_fmha/generate.py", "-d", "bwd", "--output_dir", "build", "--receipt", "2"], check=True)
 
         # Check, if ATen/CUDAGeneratorImpl.h is found, otherwise use ATen/cuda/CUDAGeneratorImpl.h
         # See https://github.com/pytorch/pytorch/pull/70650