@@ -1,31 +1,11 @@
-#
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-#
-
 import gc
 import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple
 
 import torch
-# import vllm.envs as envs
 from torch.distributed import ProcessGroup
 from torch.distributed.distributed_c10d import PrefixStore
-# from vllm.logger import logger
-# from vllm.platforms import Platform, PlatformEnum
 
 import os
 from collections.abc import Callable
@@ -42,21 +22,6 @@
                              PlatformEnum)
 from fastvideo.utils import import_pynvml
 
-# import vllm_ascend.envs as envs_ascend
-# from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config
-# from vllm_ascend.utils import (ASCEND_QUATIZATION_METHOD,
-#                                check_torchair_cache_exist,
-#                                delete_torchair_cache_file,
-#                                update_aclgraph_sizes)
-
-# if TYPE_CHECKING:
-#     from vllm.config import ModelConfig, VllmConfig
-#     from vllm.utils import FlexibleArgumentParser
-# else:
-#     ModelConfig = None
-#     VllmConfig = None
-#     FlexibleArgumentParser = None
-
 logger = init_logger(__name__)
 
 class NPUPlatform(Platform):
@@ -115,8 +80,8 @@ def clear_npu_memory(cls):
     @classmethod
     def get_attn_backend_cls(cls, selected_backend: AttentionBackendEnum | None,
                              head_size: int, dtype: torch.dtype) -> str:
-        # TODO(will): maybe come up with a more general interface for local attention
-        # if distributed is False, we always try to use Flash attn
+        # The NPU platform only supports the Flash Attention backend.
+        # TODO(will): support for other backends will be added in subsequent updates.
 
         logger.info("Trying FASTVIDEO_ATTENTION_BACKEND=%s",
                     envs.FASTVIDEO_ATTENTION_BACKEND)
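         # (Assumed behavior: when FASTVIDEO_ATTENTION_BACKEND is set, it takes
         # precedence over the automatic backend selection.)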
@@ -216,9 +181,6 @@ def get_attn_backend_cls(cls, selected_backend: AttentionBackendEnum | None,
 
         return "fastvideo.attention.backends.flash_attn.FlashAttentionBackend"
 
-    @classmethod
-    def get_punica_wrapper(cls) -> str:
-        return "vllm_ascend.lora.punica_wrapper.punica_npu.PunicaWrapperNPU"
 
     @classmethod
     def get_current_memory_usage(cls,
@@ -235,19 +197,6 @@ def get_device_communicator_cls(cls) -> str:
     def is_pin_memory_available(cls):
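         # Pinned (page-locked) host memory is available on Ascend NPUs,
         # which speeds up host-to-device transfers.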
         return True
 
-    # @classmethod
-    # def supports_v1(cls, model_config: ModelConfig) -> bool:
-    #     """Returns whether the current platform can support v1 for the supplied
-    #     model configuration.
-    #     """
-    #     return True
-
-    # @classmethod
-    # def get_piecewise_backend_cls(cls) -> str:
-    #     """
-    #     Get piecewise backend class for piecewise graph.
-    #     """
-    #     return "vllm_ascend.compilation.piecewise_backend.NPUPiecewiseBackend"  # noqa
 
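     # Stateless process-group construction: builds an HCCL-backed ProcessGroup
     # directly from a PrefixStore rather than going through torch.distributed's
     # global initialization state.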
     @classmethod
     def stateless_init_device_torch_dist_pg(
@@ -276,12 +225,8 @@ def stateless_init_device_torch_dist_pg(
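         # ProcessGroupHCCL is torch_npu's HCCL-backed process-group
         # implementation, the Ascend counterpart of ProcessGroupNCCL.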
         backend_class = ProcessGroupHCCL(prefix_store, group_rank, group_size,
                                          backend_options)
         device = torch.device("npu")
-        # TODO(Yizhou): Like we mentioned above, _set_default_backend is not
-        # implemented in the 2.5.1 version of PyTorch. But we need to set it
-        # after the latest version is released.
-        # pg._set_default_backend(backend_type)
         backend_class._set_sequence_number_for_group()
         backend_type = ProcessGroup.BackendType.CUSTOM
 
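         # Register the HCCL backend for the "npu" device type so collectives
         # issued on NPU tensors dispatch to it.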
         pg._register_backend(device, backend_type, backend_class)
-        return pg
\ No newline at end of file
+        return pg