Skip to content

TCP client failed to connect/validate to host 100.64.0.1:59782 - retrying #9

@lefromage

Description

@lefromage

I am not able to bypass this issue:

I think it may be due to some firewall setting I cannot change :(

But I hope there is a workaround, or that it can be fixed somehow.

------ log ------
[W426 16:56:01.831041000 TCPStore.cpp:347] [c10d] TCP client failed to connect/validate to host 100.64.0.1:59782 - retrying (try=6, timeout=1800000ms, delay=5241ms): Connection reset by peer
Exception raised from recvBytes at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/distributed/c10d/Utils.hpp:680 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator>) + 52 (0x1068316cc in libc10.dylib)
frame #1: void c10d::tcputil::recvBytes(int, unsigned int*, unsigned long) + 476 (0x11b4f7004 in libtorch_cpu.dylib)
frame #2: unsigned int c10d::detail::TCPClient::receiveValue() + 40 (0x11b4f6cc4 in libtorch_cpu.dylib)
frame #3: c10d::TCPStore::ping() + 200 (0x11b4f5738 in libtorch_cpu.dylib)
frame #4: c10d::TCPStore::TCPStore(std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator>, c10d::TCPStoreOptions const&) + 1212 (0x11b4f44dc in libtorch_cpu.dylib)
frame #5: c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>::make<std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, c10d::TCPStoreOptions&>(std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, c10d::TCPStoreOptions&) + 168 (0x108f04c2c in libtorch_python.dylib)
frame #6: std::_1::enable_if<std::is_void<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>>::value, pybind11::detail::void_type>::type pybind11::detail::argument_loader<pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool>::call<void, pybind11::detail::void_type, void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object*, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, 
std::__1::optional, bool)&>(void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool)&) && + 176 (0x108f047b8 in libtorch_python.dylib)
frame #7: void pybind11::cpp_function::initialize<void pybind11::detail::initimpl::factory<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object
, _object*)::$_52, pybind11::detail::void_type ()(), c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore> (std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::1::optional, bool), pybind11::detail::void_type ()>::execute<pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24]) &&::'lambda'(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool), void, pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::_1::optional, bool, pybind11::name, pybind11::is_method, pybind11::sibling, pybind11::detail::is_new_style_constructor, pybind11::arg, pybind11::arg, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, pybind11::arg_v, char [24]>(pybind11::class<c10d::TCPStore, c10::intrusive_ptr<c10d::TCPStore, 
c10::detail::intrusive_target_default_null_typec10d::TCPStore>>&&, void ()(pybind11::detail::value_and_holder&, std::__1::basic_string<char, std::__1::char_traits, std::__1::allocator> const&, unsigned short, std::__1::optional, bool, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000l>>, bool, bool, std::__1::optional, bool), pybind11::name const&, pybind11::is_method const&, pybind11::sibling const&, pybind11::detail::is_new_style_constructor const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, pybind11::arg_v const&, char const (&) [24])::'lambda'(pybind11::detail::function_call&)::__invoke(pybind11::detail::function_call&) + 92 (0x108f04668 in libtorch_python.dylib)
frame #8: pybind11::cpp_function::dispatcher(_object*, _object*, _object*) + 4804 (0x10831f8b8 in libtorch_python.dylib)
frame #9: cfunction_call + 76 (0x10191a204 in Python)
frame #10: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #11: method_vectorcall + 876 (0x1018c0a18 in Python)
frame #12: _PyVectorcall_Call + 152 (0x1018bdba0 in Python)
frame #13: slot_tp_init + 480 (0x10194f088 in Python)
frame #14: type_call + 148 (0x1019451c4 in Python)
frame #15: pybind11_meta_call + 40 (0x10831afa8 in libtorch_python.dylib)
frame #16: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #17: _PyEval_EvalFrameDefault + 16036 (0x1019e41f8 in Python)
frame #18: gen_send_ex2 + 196 (0x1018d9dec in Python)
frame #19: gen_iternext + 36 (0x1018d8950 in Python)
frame #20: builtin_next + 72 (0x1019dcbf8 in Python)
frame #21: cfunction_vectorcall_FASTCALL + 92 (0x1019197e4 in Python)
frame #22: PyObject_Vectorcall + 92 (0x1018bdd10 in Python)
frame #23: _PyEval_EvalFrameDefault + 8596 (0x1019e24e8 in Python)
frame #24: _PyObject_VectorcallDictTstate + 92 (0x1018bcfa4 in Python)
frame #25: slot_tp_init + 196 (0x10194ef6c in Python)
frame #26: type_call + 148 (0x1019451c4 in Python)
frame #27: _PyObject_MakeTpCall + 120 (0x1018bd19c in Python)
frame #28: _PyEval_EvalFrameDefault + 8596 (0x1019e24e8 in Python)
frame #29: _PyObject_VectorcallDictTstate + 200 (0x1018bd010 in Python)
frame #30: slot_tp_init + 196 (0x10194ef6c in Python)
frame #31: type_call + 148 (0x1019451c4 in Python)
frame #32: _PyObject_Call + 128 (0x1018bde90 in Python)
frame #33: _PyEval_EvalFrameDefault + 13232 (0x1019e3704 in Python)
frame #34: PyEval_EvalCode + 200 (0x1019e00e0 in Python)
frame #35: run_eval_code_obj + 104 (0x101a50828 in Python)
frame #36: run_mod + 168 (0x101a5017c in Python)
frame #37: _PyRun_StringFlagsWithName + 148 (0x101a4eb8c in Python)
frame #38: _PyRun_SimpleStringFlagsWithName + 144 (0x101a4e99c in Python)
frame #39: Py_RunMain + 808 (0x101a73de8 in Python)
frame #40: pymain_main + 304 (0x101a74488 in Python)
frame #41: Py_BytesMain + 44 (0x101a7452c in Python)
frame #42: start + 6992 (0x18edafda4 in dyld)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions