optimize partial compile (dynamo + xla) #49

Open · wants to merge 4 commits into base: dev/hybridtrace
5 changes: 5 additions & 0 deletions benchmarks/benchmark.sh
@@ -29,6 +29,7 @@ FSDP_SIZE=4
declare -A BACKAND_PARAMS=(
["torchacc"]="--backend lazy"
["hybridtrace"]="--backend lazy --hybrid_trace"
["partialcompile"]="--backend eager --partial_compiles"
["cuda"]="--backend eager"
)

@@ -40,6 +41,9 @@ function run_benchmark() {
if [ "$backend" == "hybridtrace" ]; then
export USE_TORCH_XLA=1
export TORCHACC_PATCH_FA=0
elif [ "$backend" == "partialcompile"]; then
export USE_TORCH_XLA=1
export TORCHACC_PATCH_FA=0
elif [ "$backend" == "torchacc" ]; then
export USE_TORCH_XLA=1
export TORCHACC_PATCH_FA=1
@@ -75,6 +79,7 @@ function run_benchmark() {

for MODEL in "${MODELS[@]}"; do
run_benchmark "$MODEL" "hybridtrace" ${FSDP_SIZE}
run_benchmark "$MODEL" "partialcompile" ${FSDP_SIZE}
run_benchmark "$MODEL" "torchacc" ${FSDP_SIZE}
run_benchmark "$MODEL" "cuda" ${FSDP_SIZE}
done
2 changes: 2 additions & 0 deletions benchmarks/transformer.py
@@ -53,6 +53,7 @@ def _parse_args():
    parser.add_argument('--acc', action='store_true', default=False)
    parser.add_argument('--backend', type=str, default='lazy')
    parser.add_argument('--hybrid_trace', action='store_true', default=False)
    parser.add_argument('--partial_compile', action='store_true', default=False)
    parser.add_argument('--fp16', action='store_true', default=False)
    parser.add_argument('--bf16', action='store_true', default=False)
    parser.add_argument('--gc', action='store_true', default=False)
@@ -85,6 +86,7 @@ def _get_config(args):
    config = ta.Config()
    config.backend.mode = args.backend
    config.backend.hybrid_trace = args.hybrid_trace
    config.backend.partial_compile = args.partial_compile
    config.compute.fp16 = args.fp16
    config.compute.bf16 = args.bf16

22 changes: 22 additions & 0 deletions docs/source/hybridtracing/hybrid_trace.md
@@ -0,0 +1,22 @@
# Hybrid Trace
## Introduction
Hybrid Trace mainly addresses the performance degradation that arises when XLA encounters tensor evaluations. In this solution, we combine the graph-capture capabilities of Dynamo and Lazy Tensor Core (LTC): the model runs on the XLA device, following LTC's execution logic, while Dynamo is employed locally to reduce tracing overhead. This strategy retains the potential for full-graph optimization with XLA.

Note: Hybrid Trace runs on the TorchAcc lazy backend (XLA device).
## How to use
It can be enabled by setting the corresponding option on the config passed to `torchacc.accelerate`:
```Python
import torchacc as ta

config = ta.Config()
config.backend.mode = 'lazy'
config.backend.hybrid_trace = True

...

ta.accelerate(model, config=config)
```

## Scenarios
Below are the scenarios in which we suggest using Hybrid Trace (an illustrative sketch follows the list):
- Language models with tensor evaluations, such as Qwen and LLaMA.
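
To make the scenario concrete, here is a minimal, hypothetical sketch of the kind of tensor evaluation found in language-model generation loops; the model, shapes, and stop condition are placeholders, not part of TorchAcc:
```Python
import torch

def generate_step(model, input_ids, eos_token_id):
    logits = model(input_ids)                  # [batch, seq, vocab]
    next_token = logits[:, -1].argmax(dim=-1)  # [batch]
    # .item() reads the value on the host: under pure LTC this cuts the
    # lazy graph and executes it on every decoding step, re-triggering
    # tracing. Hybrid Trace applies Dynamo locally to reduce that overhead.
    if next_token.item() == eos_token_id:
        return None
    return torch.cat([input_ids, next_token.unsqueeze(-1)], dim=-1)
```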
23 changes: 23 additions & 0 deletions docs/source/hybridtracing/partial_compile.md
@@ -0,0 +1,23 @@
# Partial Compile
## Introduction
Partial compile in TorchAcc can be employed to achieve performance acceleration over native PyTorch CUDA execution in scenarios involving complex user code (e.g., extensive tensor evaluations, custom operations) that makes it hard for XLA to capture the whole graph. Specifically, we utilize Dynamo with the XLA backend for partial compilation, with enhancements and optimizations in both functionality and performance.

Note: Partial compile runs on the TorchAcc eager backend (CUDA device).
## How to use
It can be enabled by setting the corresponding option on the config passed to `torchacc.accelerate`:
```Python
import torchacc as ta

config = ta.Config()
config.backend.mode = 'eager'
config.backend.partial_compile = True

...

ta.accelerate(model, config=config)
```

## Scenarios
Below are the scenarios in which we suggest using partial compile (an illustrative sketch follows the list):
- Models with custom ops that XLA does not support.
- Models with extensive tensor evaluations.
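
As an illustration, the following hypothetical module (the class and sizes are ours, not part of TorchAcc) mixes a data-dependent branch into its forward pass. With partial compile enabled, Dynamo breaks the graph at the host-side check and hands the remaining subgraphs to the XLA backend, instead of failing to capture the whole graph:
```Python
import torch
import torch.nn as nn

class GatedNet(nn.Module):
    # Hypothetical module used only to illustrate a graph break.
    def __init__(self, dim: int = 1024):
        super().__init__()
        self.proj_in = nn.Linear(dim, dim)
        self.proj_out = nn.Linear(dim, dim)

    def forward(self, x):
        h = self.proj_in(x)            # captured and compiled via XLA
        # Reading a tensor value on the host causes a Dynamo graph break;
        # this branch runs eagerly instead of aborting whole-graph capture.
        if h.abs().max().item() > 10:
            h = h.clamp(-10, 10)
        return self.proj_out(h)        # compiled as a separate subgraph
```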
4 changes: 3 additions & 1 deletion docs/source/index.rst
@@ -26,7 +26,9 @@ Welcome to TorchAcc's documentation!
   dist/dp
   dist/fsdp
   bucketing

   hybridtracing/hybrid_trace
   hybridtracing/partial_compile

.. toctree::
   :maxdepth: 2
   :caption: Tutorials
27 changes: 27 additions & 0 deletions torchacc/accelerate.py
@@ -186,8 +186,35 @@ def accelerate(
model = torch.compile(model, backend="hybridtrace")

if config.backend.partial_compile:
        try:
            # Alias the import as `dynamo_config` so it does not shadow the
            # `config` argument of accelerate().
            import torch_xla._dynamo.config as dynamo_config
            import torch_xla._dynamo.dynamo_bridge as dynamo_bridge
        except ImportError as e:
            raise ImportError(
                "Please follow the instructions in https://torchacc.readthedocs.io/en/stable/install.html to install torch_xla"
            ) from e
        # TODO: maybe we should move the config to dynamo_bridge?
        dynamo_config.use_call_computation = False
        dynamo_config.skip_input_data_check = False
        dynamo_config.outside_on_cuda = True
        dynamo_config.mark_step_after_layer_if_early_sync = False
        dynamo_config.no_xla_graph_sync = True
        # Disable initialization for torch.empty().
        torch.utils.deterministic.fill_uninitialized_memory = False

        torch._dynamo.disallow_in_graph(
            torch.nn.functional.scaled_dot_product_attention)
        model = torch.compile(model, backend="openxla")
        # TODO: currently we can't set the XLA stream to the CUDA stream,
        # because XLA can't receive an external CUDA stream (an int) and
        # convert it to an se::Stream.
        # Instead, set the CUDA stream to the XLA stream:
        cuda_device = dist.get_rank() % torch.cuda.device_count(
        ) if dist.is_initialized() else 0
        import torch_xla
        stream = torch_xla._XLAC._get_stream_for_cuda_device(cuda_device)
        # A handle of 0 denotes the CUDA default stream; substitute 1 so we
        # do not wrap the default stream as an external stream.
        stream = 1 if stream == 0 else stream
        assert stream is None or type(stream) is int
        external_stream = torch.cuda.ExternalStream(stream)
        torch.cuda.set_stream(external_stream)

    return (model, dataloader) if dataloader else model