|
5 | 5 | # LICENSE file in the root directory of this source tree.
|
6 | 6 |
|
7 | 7 | import concurrent
|
| 8 | +import threading |
| 9 | +import time |
8 | 10 | from datetime import timedelta
|
9 | 11 | from typing import Optional
|
10 | 12 | from unittest import TestCase
|
|
14 | 16 | from torch.distributed import TCPStore
|
15 | 17 |
|
16 | 18 | from torchft._torchft import QuorumResult
|
| 19 | +from torchft.checkpointing._rwlock import RWLock |
17 | 20 | from torchft.checkpointing.transport import CheckpointTransport
|
18 | 21 | from torchft.manager import MANAGER_ADDR_KEY, REPLICA_ID_KEY, Manager, WorldSizeMode
|
19 | 22 | from torchft.process_group import ProcessGroup
|
@@ -778,3 +781,115 @@ def test_max_retries(self, client_mock: MagicMock) -> None:
|
778 | 781 | # This should succeed and reset the counter
|
779 | 782 | self.assertTrue(manager.should_commit())
|
780 | 783 | self.assertEqual(manager._commit_failures, 0)
|
| 784 | + |
@patch("torchft.manager.ManagerClient", autospec=True)
def test_state_dict_lock_allow_disallow(self, client_mock: MagicMock) -> None:
    """Toggle state-dict read permission and check the flag and the write lock.

    Each toggle is issued twice in a row; the second call must leave both
    ``_is_state_dict_read_allowed`` and ``_state_dict_lock.w_locked()``
    exactly as the first call left them.
    """
    mgr = self._create_manager()

    # A freshly created manager permits state-dict reads.
    self.assertTrue(mgr._is_state_dict_read_allowed)

    # Disallowing clears the flag and reports the write lock as held; the
    # second iteration verifies the repeated call changes nothing.
    for _ in range(2):
        mgr.disallow_state_dict_read()
        self.assertFalse(mgr._is_state_dict_read_allowed)
        self.assertTrue(mgr._state_dict_lock.w_locked())

    # Allowing sets the flag again and reports the write lock as released;
    # the second iteration again verifies the repeated call changes nothing.
    for _ in range(2):
        mgr.allow_state_dict_read()
        self.assertTrue(mgr._is_state_dict_read_allowed)
        self.assertFalse(mgr._state_dict_lock.w_locked())
| 812 | + |
@patch("torchft.manager.ManagerClient", autospec=True)
def test_state_dict_lock_concurrent_access(self, client_mock: MagicMock) -> None:
    """Check that a second thread observes the state-dict read permission flag.

    For each phase a reader thread is started, the main thread toggles the
    permission, and only then is the reader released to sample
    ``manager._is_state_dict_read_allowed``. The first phase disallows reads
    (reader must see a falsy flag), the second allows them again (reader must
    see a truthy flag).

    Every handshake step is asserted, so the test fails loudly if the reader
    never starts, is never released, or does not finish in time, rather than
    passing because no sample was ever taken.
    """
    manager: Manager = self._create_manager()
    timeout_s = 1.0

    def sample_flag(
        ready: threading.Event, release: threading.Event, samples: list[bool]
    ) -> None:
        # Tell the main thread we are running, then block until released.
        ready.set()
        # Record nothing when the release never arrives: an empty sample
        # list fails the final assertion instead of sampling at an
        # arbitrary moment.
        if release.wait(timeout=timeout_s):
            samples.append(bool(manager._is_state_dict_read_allowed))

    phases = (
        (manager.disallow_state_dict_read, False),
        (manager.allow_state_dict_read, True),
    )
    for toggle, expected in phases:
        ready = threading.Event()
        release = threading.Event()
        samples: list[bool] = []

        # Daemon thread so a stuck reader cannot block interpreter exit.
        reader = threading.Thread(
            target=sample_flag, args=(ready, release, samples), daemon=True
        )
        reader.start()

        # Change the permission while the reader is parked on `release`.
        toggle()
        self.assertEqual(bool(manager._is_state_dict_read_allowed), expected)

        # Event.wait returns False on timeout; treat that as a failure.
        self.assertTrue(
            ready.wait(timeout=timeout_s), "reader thread never started"
        )
        release.set()

        # Thread.join returns None even on timeout, so check liveness.
        reader.join(timeout=timeout_s)
        self.assertFalse(reader.is_alive(), "reader thread did not finish")

        # Exactly one sample, taken after the toggle, with the expected value.
        self.assertEqual(samples, [expected])
| 874 | + |
@patch("torchft.manager.ManagerClient", autospec=True)
def test_manager_state_dict_with_lock(self, client_mock: MagicMock) -> None:
    """Check that ``_manager_state_dict`` takes the state-dict read lock.

    The manager's ``_state_dict_lock`` is temporarily replaced with an
    autospec'd ``RWLock`` so that calls to ``r_lock()`` and the use of the
    object it returns as a context manager can be observed.
    """
    manager = self._create_manager()

    mock_lock = create_autospec(RWLock)
    # The object r_lock() hands back; MagicMock provides recordable
    # __enter__/__exit__, so no hand-written stubs are needed.
    read_guard = mock_lock.r_lock.return_value

    # patch.object restores the real lock on exit, even if the call raises.
    with patch.object(manager, "_state_dict_lock", mock_lock):
        manager._manager_state_dict()

    # The read lock must be requested once and actually entered and exited.
    mock_lock.r_lock.assert_called_once()
    read_guard.__enter__.assert_called_once()
    read_guard.__exit__.assert_called_once()
0 commit comments