I am not able to bypass this issue.
I think it may be due to some firewall setting that I cannot change :(
But I hope there is a workaround, or that it can be fixed somehow.
------ log ------
[W426 16:56:01.831041000 TCPStore.cpp:347] [c10d] TCP client failed to connect/validate to host 100.64.0.1:59782 - retrying (try=6, timeout=1800000ms, delay=5241ms): Connection reset by peer
Exception raised from recvBytes at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:680 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator>) + 52 (0x1068316cc in libc10.dylib)
frame #1: void c10d::tcputil::recvBytes(int, unsigned int*, unsigned long) + 476 (0x11b4f7004 in libtorch_cpu.dylib)
frame #2: unsigned int c10d::detail::TCPClient::receiveValue() + 40 (0x11b4f6cc4 in libtorch_cpu.dylib)
frame #3: c10d::TCPStore::ping() + 200 (0x11b4f5738 in libtorch_cpu.dylib)
frame #4: c10d::TCPStore::TCPStore(std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator>, c10d::TCPStoreOptions const&) + 1212 (0x11b4f44dc in libtorch_cpu.dylib)
frame #5: c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>::make<std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, c10d::TCPStoreOptions&>(std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, c10d::TCPStoreOptions&) + 168 (0x108f04c2c in libtorch_python.dylib)
frame #6: std::_1::enable_if<std::is_void<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>>::value, pybind11::detail::void_type>::type pybind11::detail::argument_loader<pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool>::call<void, pybind11::detail::void_type, void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object*, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, 
std::__1::optional, bool)&>(void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool)&) && + 176 (0x108f047b8 in libtorch_python.dylib)
frame #7: void pybind11::cpp_function::initialize<void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool), void, pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::_1::optional, bool, pybind11::name, pybind11::is_method, pybind11::sibling, pybind11::detail::is_new_style_constructor, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, 
pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&&, void ()(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::detail::is_new_style_constructor const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24])::'lambda'(pybind11::detail::function_call&)::__invoke(pybind11::detail::function_call&) + 92 (0x108f04668 in libtorch_python.dylib)
frame #8: pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 4804 (0x10831f8b8 in libtorch_python.dylib)
frame #9: cfunction_call + 76 (0x10191a204 in Python)
frame #10: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #11: method_vectorcall + 876 (0x1018c0a18 in Python)
frame #12: _PyVectorcall_Call + 152 (0x1018bdba0 in Python)
frame #13: slot_tp_init + 480 (0x10194f088 in Python)
frame #14: type_call + 148 (0x1019451c4 in Python)
frame #15: pybind11_meta_call + 40 (0x10831afa8 in libtorch_python.dylib)
frame #16: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #17: _PyEval_EvalFrameDefault + 16036 (0x1019e41f8 in Python)
frame #18: gen_send_ex2 + 196 (0x1018d9dec in Python)
frame #19: gen_iternext + 36 (0x1018d8950 in Python)
frame #20: builtin_next + 72 (0x1019dcbf8 in Python)
frame #21: cfunction_vectorcall_FASTCALL + 92 (0x1019197e4 in Python)
frame #22: PyObject_Vectorcall + 92 (0x1018bdd10 in Python)
frame #23: _PyEval_EvalFrameDefault + 8596 (0x1019e24e8 in Python)
frame #24: _PyObject_VectorcallDictTstate + 92 (0x1018bcfa4 in Python)
frame #25: slot_tp_init + 196 (0x10194ef6c in Python)
frame #26: type_call + 148 (0x1019451c4 in Python)
frame #27: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #28: _PyEval_EvalFrameDefault + 8596 (0x1019e24e8 in Python)
frame #29: _PyObject_VectorcallDictTstate + 200 (0x1018bd010 in Python)
frame #30: slot_tp_init + 196 (0x10194ef6c in Python)
frame #31: type_call + 148 (0x1019451c4 in Python)
frame #32: _PyObject_Call + 128 (0x1018bde90 in Python)
frame #33: _PyEval_EvalFrameDefault + 13232 (0x1019e3704 in Python)
frame #34: PyEval_EvalCode + 200 (0x1019e00e0 in Python)
frame #35: run_eval_code_obj + 104 (0x101a50828 in Python)
frame #36: run_mod + 168 (0x101a5017c in Python)
frame #37: _PyRun_StringFlagsWithName + 148 (0x101a4eb8c in Python)
frame #38: _PyRun_SimpleStringFlagsWithName + 144 (0x101a4e99c in Python)
frame #39: Py_RunMain + 808 (0x101a73de8 in Python)
frame #40: pymain_main + 304 (0x101a74488 in Python)
frame #41: Py_BytesMain + 44 (0x101a7452c in Python)
frame #42: start + 6992 (0x18edafda4 in dyld)
I am not able to bypass this issue.
I think it may be due to some firewall setting that I cannot change :(
But I hope there is a workaround, or that it can be fixed somehow.
------ log ------
[W426 16:56:01.831041000 TCPStore.cpp:347] [c10d] TCP client failed to connect/validate to host 100.64.0.1:59782 - retrying (try=6, timeout=1800000ms, delay=5241ms): Connection reset by peer
Exception raised from recvBytes at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:680 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator>) + 52 (0x1068316cc in libc10.dylib)
frame #1: void c10d::tcputil::recvBytes(int, unsigned int*, unsigned long) + 476 (0x11b4f7004 in libtorch_cpu.dylib)
frame #2: unsigned int c10d::detail::TCPClient::receiveValue() + 40 (0x11b4f6cc4 in libtorch_cpu.dylib)
frame #3: c10d::TCPStore::ping() + 200 (0x11b4f5738 in libtorch_cpu.dylib)
frame #4: c10d::TCPStore::TCPStore(std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator>, c10d::TCPStoreOptions const&) + 1212 (0x11b4f44dc in libtorch_cpu.dylib)
frame #5: c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>::make<std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, c10d::TCPStoreOptions&>(std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, c10d::TCPStoreOptions&) + 168 (0x108f04c2c in libtorch_python.dylib)
frame #6: std::_1::enable_if<std::is_void<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>>::value, pybind11::detail::void_type>::type pybind11::detail::argument_loader<pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool>::call<void, pybind11::detail::void_type, void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object*, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, 
std::__1::optional, bool)&>(void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool)&) && + 176 (0x108f047b8 in libtorch_python.dylib)
frame #7: void pybind11::cpp_function::initialize<void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool), void, pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::_1::optional, bool, pybind11::name, pybind11::is_method, pybind11::sibling, pybind11::detail::is_new_style_constructor, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, 
pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&&, void ()(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::detail::is_new_style_constructor const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24])::'lambda'(pybind11::detail::function_call&)::__invoke(pybind11::detail::function_call&) + 92 (0x108f04668 in libtorch_python.dylib)
frame #8: pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 4804 (0x10831f8b8 in libtorch_python.dylib)
frame #9: cfunction_call + 76 (0x10191a204 in Python)
frame #10: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #11: method_vectorcall + 876 (0x1018c0a18 in Python)
frame #12: _PyVectorcall_Call + 152 (0x1018bdba0 in Python)
frame #13: slot_tp_init + 480 (0x10194f088 in Python)
frame #14: type_call + 148 (0x1019451c4 in Python)
frame #15: pybind11_meta_call + 40 (0x10831afa8 in libtorch_python.dylib)
frame #16: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #17: _PyEval_EvalFrameDefault + 16036 (0x1019e41f8 in Python)
frame #18: gen_send_ex2 + 196 (0x1018d9dec in Python)
frame #19: gen_iternext + 36 (0x1018d8950 in Python)
frame #20: builtin_next + 72 (0x1019dcbf8 in Python)
frame #21: cfunction_vectorcall_FASTCALL + 92 (0x1019197e4 in Python)
frame #22: PyObject_Vectorcall + 92 (0x1018bdd10 in Python)
frame #23: _PyEval_EvalFrameDefault + 8596 (0x1019e24e8 in Python)
frame #24: _PyObject_VectorcallDictTstate + 92 (0x1018bcfa4 in Python)
frame #25: slot_tp_init + 196 (0x10194ef6c in Python)
frame #26: type_call + 148 (0x1019451c4 in Python)
frame #27: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #28: _PyEval_EvalFrameDefault + 8596 (0x1019e24e8 in Python)
frame #29: _PyObject_VectorcallDictTstate + 200 (0x1018bd010 in Python)
frame #30: slot_tp_init + 196 (0x10194ef6c in Python)
frame #31: type_call + 148 (0x1019451c4 in Python)
frame #32: _PyObject_Call + 128 (0x1018bde90 in Python)
frame #33: _PyEval_EvalFrameDefault + 13232 (0x1019e3704 in Python)
frame #34: PyEval_EvalCode + 200 (0x1019e00e0 in Python)
frame #35: run_eval_code_obj + 104 (0x101a50828 in Python)
frame #36: run_mod + 168 (0x101a5017c in Python)
frame #37: _PyRun_StringFlagsWithName + 148 (0x101a4eb8c in Python)
frame #38: _PyRun_SimpleStringFlagsWithName + 144 (0x101a4e99c in Python)
frame #39: Py_RunMain + 808 (0x101a73de8 in Python)
frame #40: pymain_main + 304 (0x101a74488 in Python)
frame #41: Py_BytesMain + 44 (0x101a7452c in Python)
frame #42: start + 6992 (0x18edafda4 in dyld)