diff --git a/sycl/include/sycl/buffer.hpp b/sycl/include/sycl/buffer.hpp index 8b3a14af607f2..49fc26260a8f5 100644 --- a/sycl/include/sycl/buffer.hpp +++ b/sycl/include/sycl/buffer.hpp @@ -477,6 +477,7 @@ class buffer : public detail::buffer_plain, buffer &operator=(buffer &&rhs) = default; ~buffer() { + CPOUT << "~buffer()" << std::endl; try { buffer_plain::handleRelease(); } catch (std::exception &e) { diff --git a/sycl/include/sycl/detail/common.hpp b/sycl/include/sycl/detail/common.hpp index 19f4d85ab30d3..718d6aa5c0d0f 100644 --- a/sycl/include/sycl/detail/common.hpp +++ b/sycl/include/sycl/detail/common.hpp @@ -18,6 +18,10 @@ #include // for enable_if_t #include // for index_sequence, make_i... +// CP +//#define CPOUT std::clog +#define CPOUT std::clog.rdbuf(NULL); std::clog + // Default signature enables the passing of user code location information to // public methods as a default argument. namespace sycl { diff --git a/sycl/include/sycl/kernel_bundle.hpp b/sycl/include/sycl/kernel_bundle.hpp index a61019efdbf5d..6b9924c3aaf21 100644 --- a/sycl/include/sycl/kernel_bundle.hpp +++ b/sycl/include/sycl/kernel_bundle.hpp @@ -541,6 +541,9 @@ kernel_bundle get_kernel_bundle(const context &Ctx) { return get_kernel_bundle(Ctx, Ctx.get_devices()); } +// CP +__SYCL_EXPORT void test_release(sycl::context &Ctx, ur_native_handle_t NativeHandle); + namespace detail { // Internal non-template versions of get_kernel_bundle API which is used by diff --git a/sycl/include/sycl/property_list.hpp b/sycl/include/sycl/property_list.hpp index 8f019eb7cf22b..94aaf8011c843 100644 --- a/sycl/include/sycl/property_list.hpp +++ b/sycl/include/sycl/property_list.hpp @@ -18,6 +18,8 @@ #include // for conditional_t, enable... #include // for vector + + namespace sycl { inline namespace _V1 { namespace ext::oneapi { diff --git a/sycl/source/detail/buffer_impl.hpp b/sycl/source/detail/buffer_impl.hpp index be3a529f17718..a6ae00991f2f6 100644 --- a/sycl/source/detail/buffer_impl.hpp +++ b/sycl/source/detail/buffer_impl.hpp @@ -140,9 +140,11 @@ class buffer_impl final : public SYCLMemObjT { MemObjType getType() const override { return MemObjType::Buffer; } ~buffer_impl() { + CPOUT << "~buffer_impl" << std::endl; try { BaseT::updateHostMemory(); } catch (...) { + std::cout << "exception during updateHostMemory() called from ~buffer_impl" << std::endl; } destructorNotification(this); } diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 0dcddfa24d8e2..4736d3b45173e 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -33,6 +33,8 @@ context_impl::context_impl(const device &Device, async_handler AsyncHandler, MContext(nullptr), MPlatform(detail::getSyclObjImpl(Device.get_platform())), MPropList(PropList), MSupportBufferLocationByDevices(NotChecked) { + // CP + CPOUT << "context_impl(dev, async, plist) constructor" << std::endl; verifyProps(PropList); MKernelProgramCache.setContextPtr(this); } @@ -43,6 +45,8 @@ context_impl::context_impl(const std::vector Devices, : MOwnedByRuntime(true), MAsyncHandler(AsyncHandler), MDevices(Devices), MContext(nullptr), MPlatform(), MPropList(PropList), MSupportBufferLocationByDevices(NotChecked) { + // CP + CPOUT << "context_impl(devices, async, plist) constructor" << std::endl; verifyProps(PropList); MPlatform = detail::getSyclObjImpl(MDevices[0].get_platform()); std::vector DeviceIds; @@ -76,6 +80,8 @@ context_impl::context_impl(ur_context_handle_t UrContext, : MOwnedByRuntime(OwnedByRuntime), MAsyncHandler(AsyncHandler), MDevices(DeviceList), MContext(UrContext), MPlatform(), MSupportBufferLocationByDevices(NotChecked) { + // CP + CPOUT << "context_impl(UrContext, async, Adapter, DeviceList, OwnedByRuntime) constructor" << std::endl; if (!MDevices.empty()) { MPlatform = detail::getSyclObjImpl(MDevices[0].get_platform()); } else { @@ -126,6 +132,8 @@ cl_context context_impl::get() const { } context_impl::~context_impl() { + // CP + CPOUT << "~context_impl() called" << std::endl; try { // Free all events associated with the initialization of device globals. for (auto &DeviceGlobalInitializer : MDeviceGlobalInitializers) diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index 65e1fda0a5a7a..7baf542ed1db3 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -23,6 +23,8 @@ #include #include + + namespace sycl { inline namespace _V1 { // Forward declaration diff --git a/sycl/source/detail/error_handling/error_handling.cpp b/sycl/source/detail/error_handling/error_handling.cpp index daae5563776f1..78697a258e0a2 100644 --- a/sycl/source/detail/error_handling/error_handling.cpp +++ b/sycl/source/detail/error_handling/error_handling.cpp @@ -192,6 +192,7 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, for (size_t I = 0; I < 3; ++I) { if (MaxThreadsPerBlock[I] < NDRDesc.LocalSize[I]) { + CPOUT << "---- THROWING ---- " << std::endl; throw sycl::exception(make_error_code(errc::nd_range), "The number of work-items in each dimension of a " "work-group cannot exceed {" + diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index f6e5edfc92e74..e2b972f65cec4 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -43,6 +43,8 @@ void event_impl::initContextIfNeeded() { } event_impl::~event_impl() { + // CP + CPOUT << "~event_impl() called" << std::endl; try { auto Handle = this->getHandle(); if (Handle) @@ -145,7 +147,8 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) { event_impl::event_impl(ur_event_handle_t Event, const context &SyclContext) : MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), MState(HES_Complete) { - + // CP + CPOUT << "event_impl(ur_event_handle_t, context )" << std::endl; ur_context_handle_t TempContext; getAdapter()->call( this->getHandle(), UR_EVENT_INFO_CONTEXT, sizeof(ur_context_handle_t), @@ -163,6 +166,9 @@ event_impl::event_impl(const QueueImplPtr &Queue) : MQueue{Queue}, MIsProfilingEnabled{!Queue || Queue->MIsProfilingEnabled}, MFallbackProfiling{MIsProfilingEnabled && Queue && Queue->isProfilingFallback()} { + // CP + CPOUT << "event_impl(QueueImplPtr)" << std::endl; + if (Queue) this->setContextImpl(Queue->getContextImplPtr()); else { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 768de70826624..223e5fd0e1dc3 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -55,6 +55,8 @@ class event_impl { // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept // event methods. This ::get() call uses static vars to read and parse the // ODS env var exactly once. + // CP + CPOUT << "event_impl() constructor" << std::endl; SYCLConfig::get(); } diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index 5669fbdaacc50..3b76b9aaa4d79 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -60,10 +60,6 @@ class ObjectUsageCounter { LockGuard Guard(GlobalHandler::MSyclGlobalHandlerProtector); MCounter--; - GlobalHandler *RTGlobalObjHandler = GlobalHandler::getInstancePtr(); - if (RTGlobalObjHandler) { - RTGlobalObjHandler->prepareSchedulerToRelease(!MCounter); - } } catch (std::exception &e) { __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~ObjectUsageCounter", e); } @@ -234,6 +230,8 @@ void GlobalHandler::releaseDefaultContexts() { // Note that on Windows the destruction of the default context // races with the detaching of the DLL object that calls urLoaderTearDown. + CPOUT << "releaseDefaultContext()" << std::endl; + MPlatformToDefaultContextCache.Inst.reset(nullptr); } @@ -242,7 +240,10 @@ struct EarlyShutdownHandler { try { #ifdef _WIN32 // on Windows we keep to the existing shutdown procedure - GlobalHandler::instance().releaseDefaultContexts(); + //GlobalHandler::instance().endDeferredRelease(); + //GlobalHandler::instance().releaseDefaultContexts(); + //shutdown_early(); + //shutdown_late(); #else shutdown_early(); #endif @@ -284,13 +285,14 @@ void GlobalHandler::unloadAdapters() { } void GlobalHandler::prepareSchedulerToRelease(bool Blocking) { + // CP - fix part 1 #ifndef _WIN32 if (Blocking) drainThreadPool(); +#endif if (MScheduler.Inst) MScheduler.Inst->releaseResources(Blocking ? BlockingT::BLOCKING : BlockingT::NON_BLOCKING); -#endif } void GlobalHandler::drainThreadPool() { @@ -298,7 +300,7 @@ void GlobalHandler::drainThreadPool() { MHostTaskThreadPool.Inst->drain(); } -#ifdef _WIN32 + //#ifdef _WIN32 // because of something not-yet-understood on Windows // threads may be shutdown once the end of main() is reached // making an orderly shutdown difficult. Fortunately, Windows @@ -309,7 +311,7 @@ void shutdown_win() { GlobalHandler *&Handler = GlobalHandler::getInstancePtr(); Handler->unloadAdapters(); } -#else + //#else void shutdown_early() { const LockGuard Lock{GlobalHandler::MSyclGlobalHandlerProtector}; GlobalHandler *&Handler = GlobalHandler::getInstancePtr(); @@ -321,8 +323,10 @@ void shutdown_early() { // Ensure neither host task is working so that no default context is accessed // upon its release + CPOUT << "shutdown_early() about to prepareSchedulerToRelease" << std::endl; Handler->prepareSchedulerToRelease(true); + CPOUT << "shutdown_early() about to finishAndWait()" << std::endl; if (Handler->MHostTaskThreadPool.Inst) Handler->MHostTaskThreadPool.Inst->finishAndWait(); @@ -353,9 +357,18 @@ void shutdown_late() { delete Handler; Handler = nullptr; } -#endif + //#endif #ifdef _WIN32 +// a simple wrapper to catch and stream any exception then continue +template +void safe_call(F func) { + try { + func(); + } catch (const std::exception& e) { + std::cerr << "exception in DllMain DLL_PROCESS_DETACH " << e.what() << std::endl; + } +} extern "C" __SYCL_EXPORT BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpReserved) { @@ -374,19 +387,8 @@ extern "C" __SYCL_EXPORT BOOL WINAPI DllMain(HINSTANCE hinstDLL, if (PrintUrTrace) std::cout << "---> DLL_PROCESS_DETACH syclx.dll\n" << std::endl; -#ifdef XPTI_ENABLE_INSTRUMENTATION - if (xptiTraceEnabled()) - return TRUE; // When doing xpti tracing, we can't safely call shutdown. - // TODO: figure out what XPTI is doing that prevents - // release. -#endif - - try { - shutdown_win(); - } catch (std::exception &e) { - __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in shutdown_win", e); - return FALSE; - } + safe_call([](){ shutdown_early(); }); + safe_call([](){ shutdown_late(); }); break; case DLL_PROCESS_ATTACH: if (PrintUrTrace) diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index e538318f807b7..40f4c8469440e 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -533,6 +533,23 @@ class kernel_bundle_impl { ContextImpl->getHandleRef(), spirv.data(), spirv.size(), nullptr, &UrProgram); // program created by urProgramCreateWithIL is implicitly retained. + + // ------------------------------------- + // CP - adding to try an force an imbalance + Adapter->call(UrProgram); + + // rebalance: + // this works. + // Adapter->call(UrProgram); + + // this ALSO works. So much for my theory. + detail::UrFuncInfo programReleaseInfo; + auto programRelease = programReleaseInfo.getFuncPtrFromModule(detail::ur::getURLoaderLibrary()); + programRelease(UrProgram); + + // ------------------------------------- + + if (UrProgram == nullptr) throw sycl::exception( sycl::make_error_code(errc::invalid), diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp index f89ef979f7c9e..8b05efe5e9841 100644 --- a/sycl/source/detail/kernel_impl.cpp +++ b/sycl/source/detail/kernel_impl.cpp @@ -24,6 +24,8 @@ kernel_impl::kernel_impl(ur_kernel_handle_t Kernel, ContextImplPtr Context, Context)), MCreatedFromSource(true), MKernelBundleImpl(std::move(KernelBundleImpl)), MIsInterop(true), MKernelArgMaskPtr{ArgMask} { + // CP + CPOUT << "kernel_impl(kernel, context, bundle, argmas) constructor" << std::endl; ur_context_handle_t UrContext = nullptr; // Using the adapter from the passed ContextImpl getAdapter()->call( @@ -53,10 +55,14 @@ kernel_impl::kernel_impl(ur_kernel_handle_t Kernel, ContextImplPtr ContextImpl, MCreatedFromSource(false), MDeviceImageImpl(std::move(DeviceImageImpl)), MKernelBundleImpl(std::move(KernelBundleImpl)), MKernelArgMaskPtr{ArgMask}, MCacheMutex{CacheMutex} { + // CP + CPOUT << "kernel_impl(kernel, context, deviceimage, bundle, argmask, program, mutex) constructor" << std::endl; MIsInterop = MKernelBundleImpl->isInterop(); } kernel_impl::~kernel_impl() { + // CP + CPOUT << "~kernel_impl() called" << std::endl; try { // TODO catch an exception and put it to list of asynchronous exceptions getAdapter()->call(MKernel); diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index 968cb9b24b053..badf43222557f 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -112,14 +112,20 @@ class KernelProgramCache { struct ProgramBuildResult : public BuildResult { AdapterPtr Adapter; ProgramBuildResult(const AdapterPtr &Adapter) : Adapter(Adapter) { + // CP + CPOUT << "ProgramBuildResult(adapter)" << std::endl; Val = nullptr; } ProgramBuildResult(const AdapterPtr &Adapter, BuildState InitialState) : Adapter(Adapter) { + // CP + CPOUT << "ProgramBuildResult(adapter, state)" << std::endl; Val = nullptr; this->State.store(InitialState); } ~ProgramBuildResult() { + // CP + CPOUT << "~ProgramBuildResult()" << std::endl; try { if (Val) { ur_result_t Err = @@ -131,6 +137,9 @@ class KernelProgramCache { e); } } + ProgramBuildResult() = delete; + ProgramBuildResult(const ProgramBuildResult&) = delete; + ProgramBuildResult& operator=(const ProgramBuildResult&) = delete; }; using ProgramBuildResultPtr = std::shared_ptr; @@ -198,9 +207,13 @@ class KernelProgramCache { struct KernelBuildResult : public BuildResult { AdapterPtr Adapter; KernelBuildResult(const AdapterPtr &Adapter) : Adapter(Adapter) { + // CP + CPOUT << "KernelBuildResult(adapter)" << std::endl; Val.first = nullptr; } ~KernelBuildResult() { + // CP + CPOUT << "~KernelBuildResult()" << std::endl; try { if (Val.first) { ur_result_t Err = diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 0f99f49d1257d..273cbee31b137 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -121,6 +121,8 @@ class queue_impl { MIsProfilingEnabled(has_property()), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { + // CP + CPOUT << "queue_impl() constructor" << std::endl; verifyProps(PropList); if (has_property()) { if (has_property()) @@ -232,6 +234,7 @@ class queue_impl { MIsProfilingEnabled(has_property()), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { + CPOUT << "queue_impl() interop constructor" << std::endl; queue_impl_interop(UrQueue); } @@ -251,11 +254,14 @@ class queue_impl { MIsProfilingEnabled(has_property()), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { + CPOUT << "queue_impl() verify/interop constructor " << std::endl; verifyProps(PropList); queue_impl_interop(UrQueue); } ~queue_impl() { + // CP + CPOUT << "~queue_impl() called" << std::endl; try { #if XPTI_ENABLE_INSTRUMENTATION // The trace event created in the constructor should be active through the diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 31db161f88726..c1c55378b31fa 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -828,9 +828,11 @@ bool Command::producesPiEvent() const { return true; } bool Command::supportsPostEnqueueCleanup() const { return true; } +// CP - moar fix ( this fix and the change to the assert in graph_builder.cpp are not likely really needed. ) bool Command::readyForCleanup() const { return MLeafCounter == 0 && - MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess; + (MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess || + MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed); } Command *Command::addDep(DepDesc NewDep, std::vector &ToCleanUp) { @@ -1063,6 +1065,8 @@ AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue, : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd), MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst), MRequirement(std::move(Req)), MReleaseCmd(Queue, this) { + // CP + CPOUT << "AllocaCommandBase constructor " << MType << std::endl; MRequirement.MAccessMode = access::mode::read_write; emitInstrumentationDataProxy(); } @@ -1265,6 +1269,8 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd) : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) { + // CP + CPOUT << "ReleaseCommmand(Q, Allocacmd) constructor " << MType << std::endl; emitInstrumentationDataProxy(); } @@ -1390,6 +1396,8 @@ MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)), MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr), MMapMode(MapMode) { + // CP + CPOUT << "MapMemObject constructor " << MType << std::endl; emitInstrumentationDataProxy(); } @@ -1452,6 +1460,8 @@ UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue) : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)), MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) { + // CP + CPOUT << "UnMapMemObject constructor " << MType << std::endl; emitInstrumentationDataProxy(); } @@ -1540,6 +1550,8 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstAllocaCmd(DstAllocaCmd) { + // CP + CPOUT << "MemCpyCommand constructor " << MType << " " << this << std::endl; if (MSrcQueue) { MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); } @@ -1714,6 +1726,8 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, : Command(CommandType::COPY_MEMORY, std::move(DstQueue)), MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) { + // CP + CPOUT << "MemCpyCommandHost constructor " << MType << std::endl; if (MSrcQueue) { MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); } @@ -1788,6 +1802,8 @@ ur_result_t MemCpyCommandHost::enqueueImp() { } EmptyCommand::EmptyCommand() : Command(CommandType::EMPTY_TASK, nullptr) { + // CP + CPOUT << "EmptyCommand() " << MType << std::endl; emitInstrumentationDataProxy(); } @@ -1882,7 +1898,8 @@ UpdateHostRequirementCommand::UpdateHostRequirementCommand( void **DstPtr) : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) { - + // CP + CPOUT << "UpdateHostRequirementCommand constructor " << MType << std::endl; emitInstrumentationDataProxy(); } @@ -1982,6 +1999,8 @@ ExecCGCommand::ExecCGCommand( : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer, Dependencies), MEventNeeded(EventNeeded), MCommandGroup(std::move(CommandGroup)) { + // CP + CPOUT << "ExecCGCommand constructor " << MType << std::endl; if (MCommandGroup->getType() == detail::CGType::CodeplayHostTask) { MEvent->setSubmittedQueue( static_cast(MCommandGroup.get())->MQueue); @@ -2810,6 +2829,9 @@ void enqueueImpKernel( KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize, BinImage, KernelName); + // CP + // Error = UR_RESULT_SUCCESS; //<-- this changes the leak. + const AdapterPtr &Adapter = Queue->getAdapter(); if (!SyclKernelImpl && !MSyclKernel) { Adapter->call(Kernel); @@ -2817,11 +2839,13 @@ void enqueueImpKernel( } } if (UR_RESULT_SUCCESS != Error) { + // CP + // throwing an exception here does not help. so the problem isn't in the "handling" below + // If we have got non-success error code, let's analyze it to emit nice // exception explaining what was wrong const device_impl &DeviceImpl = *(Queue->getDeviceImplPtr()); - detail::enqueue_kernel_launch::handleErrorOrWarning(Error, DeviceImpl, - Kernel, NDRDesc); + detail::enqueue_kernel_launch::handleErrorOrWarning(Error, DeviceImpl, Kernel, NDRDesc); } } @@ -3677,7 +3701,11 @@ UpdateCommandBufferCommand::UpdateCommandBufferCommand( std::vector> Nodes) : Command(CommandType::UPDATE_CMD_BUFFER, Queue), MGraph(Graph), - MNodes(Nodes) {} + MNodes(Nodes) { + // CP + CPOUT << "Create UpdateCommandBufferCommand " << MType << std::endl; + + } ur_result_t UpdateCommandBufferCommand::enqueueImp() { waitForPreparedHostEvents(); diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 239cebf521767..dc8f4c6f36077 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -83,7 +83,26 @@ struct EnqueueResultT { struct DepDesc { DepDesc(Command *DepCommand, const Requirement *Req, AllocaCommandBase *AllocaCmd) - : MDepCommand(DepCommand), MDepRequirement(Req), MAllocaCmd(AllocaCmd) {} + : MDepCommand(DepCommand), MDepRequirement(Req), MAllocaCmd(AllocaCmd) { + CPOUT << "DepDesc() constructor(" << this << "). MDepCommand: " << MDepCommand << std::endl; + } + + ~DepDesc() { + CPOUT << "~DepDesc() destructor(" << this << "). MDepCommand: " << MDepCommand << std::endl; + } + + DepDesc() = delete; // CP + + // CP - not sure if removing the copy constructor will help identify the problem. + // will take this up if needed. + //DepDesc(const DepDesc &Other) = delete; // CP + + //copy constructor. + DepDesc(const DepDesc &Other) + : MDepCommand(Other.MDepCommand), MDepRequirement(Other.MDepRequirement), + MAllocaCmd(Other.MAllocaCmd) { + CPOUT << "DepDesc() copy constructor(" << this << "). MDepCommand: " << MDepCommand << std::endl; + } friend bool operator<(const DepDesc &Lhs, const DepDesc &Rhs) { return std::tie(Lhs.MDepRequirement, Lhs.MDepCommand) < @@ -219,7 +238,11 @@ class Command { return nullptr; } - virtual ~Command() { MEvent->cleanDepEventsThroughOneLevel(); } + virtual ~Command() { + // CP + CPOUT << "~Command() type: " << MType << " " << this << std::endl; + MEvent->cleanDepEventsThroughOneLevel(); + } const char *getBlockReason() const; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 5636309cdccc1..307954851b878 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -193,7 +193,9 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, LeavesCollection::EnqueueListT &ToEnqueue) { // Add the old leaf as a dependency for the new one by duplicating one // of the requirements for the current record + // CP - if we drop copy constructor, this will have to change DepDesc Dep = findDepForRecord(Dependant, Record); + CPOUT << "DepDesc change dependency. Before MDepCommand: " << Dep.MDepCommand << " After: " << Dependency << std::endl; Dep.MDepCommand = Dependency; std::vector ToCleanUp; Command *ConnectionCmd = Dependant->addDep(Dep, ToCleanUp); @@ -201,6 +203,7 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, ToEnqueue.push_back(ConnectionCmd); --(Dependency->MLeafCounter); + CPOUT << "reduced Dependency->MLeafCounter: " << Dependency->MLeafCounter << " cleanup? " << Dependency->readyForCleanup() << std::endl; if (Dependency->readyForCleanup()) ToCleanUp.push_back(Dependency); for (Command *Cmd : ToCleanUp) @@ -486,8 +489,11 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, std::vector ToCleanUp; for (Command *Dep : Deps) { - Command *ConnCmd = MemCpyCmd->addDep( - DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp); + // CP -- moar fix + if (Dep->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) + continue; // nothing to do + + Command *ConnCmd = MemCpyCmd->addDep(DepDesc{Dep, MemCpyCmd->getRequirement(), SrcAllocaCmd}, ToCleanUp); if (ConnCmd) ToEnqueue.push_back(ConnCmd); } @@ -626,6 +632,7 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, // A helper function for finding a command dependency on a specific memory // object +// CP - will have to change if we move from copy constructor DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd, MemObjRecord *Record) { for (const DepDesc &DD : Cmd->MDeps) { @@ -634,7 +641,9 @@ DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd, } } assert(false && "No dependency found for a leaf of the record"); - return {nullptr, nullptr, nullptr}; + // CP -- if we drop copy constructor, this will have to change + static DepDesc nullDep{nullptr, nullptr, nullptr}; + return nullDep; } // The function searches for the alloca command matching context and @@ -1157,8 +1166,10 @@ void Scheduler::GraphBuilder::cleanupCommand( if (SYCLConfig::get()) return; + // CP -- change to include failed probably not needed. assert(Cmd->MLeafCounter == 0 && - (Cmd->isSuccessfullyEnqueued() || AllowUnsubmitted)); + (Cmd->isSuccessfullyEnqueued() || AllowUnsubmitted || + (Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed))); Command::CommandType CmdT = Cmd->getType(); assert(CmdT != Command::ALLOCA && CmdT != Command::ALLOCA_SUB_BUF); @@ -1169,6 +1180,7 @@ void Scheduler::GraphBuilder::cleanupCommand( for (DepDesc &Dep : UserCmd->MDeps) { // Link the users of the command to the alloca command(s) instead if (Dep.MDepCommand == Cmd) { + CPOUT << "DepDesc changing MDepCommand. Before: " << Dep.MDepCommand; // ... unless the user is the alloca itself. if (Dep.MAllocaCmd == UserCmd) { Dep.MDepCommand = nullptr; @@ -1176,6 +1188,7 @@ void Scheduler::GraphBuilder::cleanupCommand( Dep.MDepCommand = Dep.MAllocaCmd; Dep.MDepCommand->MUsers.insert(UserCmd); } + CPOUT << " After: " << Dep.MDepCommand << std::endl; } } } @@ -1247,6 +1260,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent( // add user to Dep.MDepCommand is already performed beyond this if branch { DepDesc DepOnConnect = Dep; + CPOUT << "connect DepDesc changing MDepCommand. Before: " << DepOnConnect.MDepCommand << " After: " << ConnectCmd << std::endl; DepOnConnect.MDepCommand = ConnectCmd; // Dismiss the result here as it's not a connection now, diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index efbbb52acab73..e100bd07ef3b4 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -52,6 +52,10 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, #endif std::vector ToCleanUp; for (Command *Cmd : Record->MReadLeaves) { + // CP -- possible fix + if(Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) + continue; // nothing to do + EnqueueResultT Res; bool Enqueued = GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); @@ -65,6 +69,10 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record, GraphProcessor::waitForEvent(Cmd->getEvent(), GraphReadLock, ToCleanUp); } for (Command *Cmd : Record->MWriteLeaves) { + // CP -- possible fix + if(Cmd->MEnqueueStatus == EnqueueResultT::SyclEnqueueFailed) + continue; // nothing to do + EnqueueResultT Res; bool Enqueued = GraphProcessor::enqueueCommand(Cmd, GraphReadLock, Res, ToCleanUp, Cmd); @@ -128,7 +136,7 @@ EventImplPtr Scheduler::addCG( NewEvent->setSubmissionTime(); } - enqueueCommandForCG(NewEvent, AuxiliaryCmds); + enqueueCommandForCG(NewEvent, AuxiliaryCmds); // may throw if (!AuxiliaryResources.empty()) registerAuxiliaryResources(NewEvent, std::move(AuxiliaryResources)); @@ -149,26 +157,32 @@ void Scheduler::enqueueCommandForCG(EventImplPtr NewEvent, EnqueueResultT Res; bool Enqueued; - auto CleanUp = [&]() { + // CP + auto CleanUp = [&](Command* SomeCmd) { + + // original logic. doesn't do anything b.c. MDeps or MUsers rarely both empty if (NewCmd && (NewCmd->MDeps.size() == 0 && NewCmd->MUsers.size() == 0)) { if (NewEvent) { NewEvent->setCommand(nullptr); } delete NewCmd; } + + // CP -- latest and last fix!! + cleanupCommands(ToCleanUp); }; for (Command *Cmd : AuxiliaryCmds) { - Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd, - Blocking); - try { + try { + Enqueued = GraphProcessor::enqueueCommand(Cmd, Lock, Res, ToCleanUp, Cmd, Blocking); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw exception(make_error_code(errc::runtime), "Auxiliary enqueue process failed."); } catch (...) { // enqueueCommand() func and if statement above may throw an exception, // so destroy required resources to avoid memory leak - CleanUp(); + CleanUp(Cmd); std::rethrow_exception(std::current_exception()); } } @@ -177,19 +191,18 @@ void Scheduler::enqueueCommandForCG(EventImplPtr NewEvent, // TODO: Check if lazy mode. EnqueueResultT Res; try { - bool Enqueued = GraphProcessor::enqueueCommand( - NewCmd, Lock, Res, ToCleanUp, NewCmd, Blocking); + bool Enqueued = GraphProcessor::enqueueCommand(NewCmd, Lock, Res, ToCleanUp, NewCmd, Blocking); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) - throw exception(make_error_code(errc::runtime), - "Enqueue process failed."); + throw exception(make_error_code(errc::runtime), "Enqueue process failed."); } catch (...) { // enqueueCommand() func and if statement above may throw an exception, // so destroy required resources to avoid memory leak - CleanUp(); + CleanUp(NewCmd); std::rethrow_exception(std::current_exception()); } } } + // THIS cleanup op has no bearing on the outcome. Wihtout it the good app still has no leaks. cleanupCommands(ToCleanUp); } @@ -266,11 +279,22 @@ void Scheduler::waitForEvent(const EventImplPtr &Event, bool *Success) { bool Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj, bool StrictLock) { + CPOUT << "Scheduler::removeMemoryObject() " << StrictLock << std::endl; MemObjRecord *Record = MGraphBuilder.getMemObjRecord(MemObj); + CPOUT << "Got a Record: " << Record << std::endl; if (!Record) // No operations were performed on the mem object return true; + //CP - fix part 2. Should this be the same for linux? +#ifdef _WIN32 + bool allowWait = MemObj->hasUserDataPtr() || GlobalHandler::instance().isOkToDefer(); +#else + bool allowWait = true; +#endif + CPOUT << "allowWait: " << allowWait << std::endl; + + if(allowWait) { // This only needs a shared mutex as it only involves enqueueing and // awaiting for events @@ -389,6 +413,7 @@ void Scheduler::releaseResources(BlockingT Blocking) { cleanupCommands({}); cleanupAuxiliaryResources(Blocking); + // We need loop since sometimes we may need new objects to be added to // deferred mem objects storage during cleanup. Known example is: we cleanup // existing deferred mem objects under write lock, during this process we @@ -453,7 +478,9 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) { { ReadLockT Lock = acquireReadLock(); - std::vector Deps = Cmd->MDeps; + // CP -- not needed + //std::vector Deps = Cmd->MDeps; + // Host tasks are cleaned up upon completion rather than enqueuing. if (Cmd->MLeafCounter == 0) { ToCleanUp.push_back(Cmd); @@ -491,10 +518,8 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) { std::vector> TempStorage; { std::lock_guard LockDef{MDeferredMemReleaseMutex}; - MDeferredMemObjRelease.swap(TempStorage); + MDeferredMemObjRelease.swap(TempStorage); // it is here that host-task-failure freezes. destructors, presumably? } - // if any objects in TempStorage exist - it is leaving scope and being - // deleted } std::vector> ObjsReadyToRelease; diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index c6d2d07600d12..a3741b0d4dae6 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -201,7 +201,13 @@ struct MemObjRecord { MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, - MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} + MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} { + CPOUT << "MemObjRecord() constructor" << std::endl; + } + + ~MemObjRecord() { + CPOUT << "~MemObjRecord destructor" << std::endl; + } // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -693,6 +699,7 @@ class Scheduler { protected: /// Finds a command dependency corresponding to the record. + // CP - switch if we drop copy constructor DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record); /// Searches for suitable alloca in memory record. diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 25e092232ae7f..a2f875d2031f7 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -140,6 +140,7 @@ void SYCLMemObjT::updateHostMemory(void *const Ptr) { } void SYCLMemObjT::updateHostMemory() { + CPOUT << "updateHostMemory() entered. Have Upload functor: " << (MUploadDataFunctor != nullptr) << " NeedWriteBack: " << MNeedWriteBack << std::endl; if ((MUploadDataFunctor != nullptr) && MNeedWriteBack) MUploadDataFunctor(); diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index e19c2b9df2a75..619e133678168 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -328,6 +328,41 @@ bool is_compatible(const std::vector &KernelIDs, const device &Dev) { return true; } +// CP +#include "detail/adapter.hpp" +#include +//using ContextImplPtr = std::shared_ptr; +void test_release(sycl::context &Context, ur_native_handle_t NativeHandle) { + //detail::ProgramManager::getInstance(); + + const detail::ContextImplPtr &ContextImpl = getSyclObjImpl(Context); + const detail::AdapterPtr &Adapter = ContextImpl->getAdapter(); + ur_program_handle_t UrProgram = nullptr; + ur_program_native_properties_t Properties{}; + Properties.stype = UR_STRUCTURE_TYPE_PROGRAM_NATIVE_PROPERTIES; + Properties.isNativeHandleOwned = true; //!KeepOwnership; + + detail::UrFuncInfo programReleaseInfo; + auto programRelease = programReleaseInfo.getFuncPtrFromModule(detail::ur::getURLoaderLibrary()); + + Adapter->call( NativeHandle, ContextImpl->getHandleRef(), &Properties, &UrProgram); + + uint32_t NumDevices = 0; + Adapter->call( UrProgram, UR_PROGRAM_INFO_NUM_DEVICES, sizeof(NumDevices), &NumDevices, nullptr); + std::cout << "NumDevice: " << NumDevices << std::endl; + + // is this necessary? SHouldn't be. + Adapter->call(UrProgram); + Adapter->call(UrProgram); + + // + //Adapter->call(UrProgram); + + // or + //programRelease(UrProgram); + +} + ///////////////////////// // * kernel_compiler extension * ///////////////////////// diff --git a/sycl/test-e2e/Basic/backend_info.cpp b/sycl/test-e2e/Basic/backend_info.cpp index f61b1809a6e46..d64d3cf5b533e 100644 --- a/sycl/test-e2e/Basic/backend_info.cpp +++ b/sycl/test-e2e/Basic/backend_info.cpp @@ -1,5 +1,5 @@ // RUN: %{build} -o %t.out -// RUN: %{run} %t.out +// RUN: env SYCL_UR_TRACE=-2 %{run} %t.out // // RUN: %{build} -DTEST_ERRORS -D_GLIBCXX_USE_CXX11_ABI=0 -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=note diff --git a/sycl/test-e2e/Basic/buffer/buffer_release.cpp b/sycl/test-e2e/Basic/buffer/buffer_release.cpp index 92f48f2027cd3..c41c141938523 100644 --- a/sycl/test-e2e/Basic/buffer/buffer_release.cpp +++ b/sycl/test-e2e/Basic/buffer/buffer_release.cpp @@ -1,7 +1,7 @@ // REQUIRES: cpu // RUN: %{build} -o %t.out -// RUN: %{run} %t.out +// RUN: env SYCL_UR_TRACE=-2 %{run} %t.out #include diff --git a/sycl/test-e2e/Basic/subdevice.cpp b/sycl/test-e2e/Basic/subdevice.cpp index 6ad21afc12ad2..9aeeebb261a79 100644 --- a/sycl/test-e2e/Basic/subdevice.cpp +++ b/sycl/test-e2e/Basic/subdevice.cpp @@ -154,6 +154,7 @@ int main() { } // test exception + std::cout << "TEST EXCEPTION!! " << std::endl; try { const size_t out_of_bounds = std::numeric_limits::max(); const auto partition =