Skip to content

call to unknown function jl_f_issubtype but CUDA works #846

@AntonReinhard

Description

@AntonReinhard

I have made the following minimal example:

using QEDbase
using QEDbase.Mocks
using KernelAbstractions
using Random

# Minimal reproducer kernel: for every global index `id`, compare the first and
# second entry of `moms[id]` with `==` and write the result into `dest[id]`.
#
# - `moms`: input vector, wrapped in `@Const` (only read inside the kernel).
# - `dest`: output vector, written once per work-item.
@kernel function mwe_kernel(
        @Const(moms::AbstractVector),
        dest::AbstractVector,
    )
    # One work-item per element; `id` is this work-item's global linear index.
    id = @index(Global)
    # NOTE(review): this `==` between the two entries is the suspected trigger of
    # the AMDGPU compilation failure reported in this issue; keep it as written.
    dest[id] = moms[id][1] == moms[id][2]
end

# Fixed-seed RNG so the generated momenta are reproducible from run to run.
RNG = MersenneTwister(137137)

# works ->
using CUDA
# 128 elements, each the result of `Mocks._rand_momenta(RNG, 2, MockMomentum{Float32})`,
# uploaded to the GPU as a CuVector.
moms = CuVector([Mocks._rand_momenta(RNG, 2, MockMomentum{Float32}) for _ in 1:128])
# Bool output buffer with the same length and array type family as `moms`.
dest = similar(moms, Bool)
# Instantiate the kernel for the array's backend and launch one work-item per element.
mwe_kernel(get_backend(moms))(moms, dest; ndrange = length(moms))
# Block until the kernel has finished.
KernelAbstractions.synchronize(get_backend(moms))

# crashes ->
using AMDGPU
# Same setup as above, but with a ROCVector. Note that `RNG` has advanced, so the
# values differ from the CUDA run; only the element type matters for the failure.
moms = ROCVector([Mocks._rand_momenta(RNG, 2, MockMomentum{Float32}) for _ in 1:128])
dest = similar(moms, Bool)
# The InvalidIRError is raised here, while the kernel is being compiled for launch.
mwe_kernel(get_backend(moms))(moms, dest; ndrange = length(moms))
KernelAbstractions.synchronize(get_backend(moms))

The example uses KernelAbstractions, but as far as I can tell the problem comes from AMDGPU.jl, so I'm reporting it here. If that's not the case, the issue can of course be moved.

The output for me is

ERROR: LoadError: InvalidIRError: compiling MethodInstance for gpu_mwe_kernel(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{…}, KernelAbstractions.NDIteration.NDRange{…}}, ::AMDGPU.Device.ROCDeviceVector{Tuple{…}, 1}, ::AMDGPU.Device.ROCDeviceVector{Bool, 1}) resulted in invalid LLVM IR
Reason: unsupported call to an unknown function (call to jl_f_issubtype)
Stacktrace:
 [1] typesplit
   @ ./promotion.jl:150
 [2] multiple call sites
   @ unknown:0
Reason: unsupported dynamic function invocation (call to getproperty(x::Type, f::Symbol) @ Base Base_compiler.jl:48)
Stacktrace:
 [1] typesplit
   @ ./promotion.jl:154
 [2] multiple call sites
   @ unknown:0
Reason: unsupported call to an unknown function (call to jl_f_issubtype)
Stacktrace:
 [1] typesplit
   @ ./promotion.jl:150
 [2] typesplit
   @ ./promotion.jl:154
 [3] multiple call sites
   @ unknown:0
Reason: unsupported call to an unknown function (call to jl_f_apply_type)
Stacktrace:
 [1] typesplit
   @ ./promotion.jl:154
 [2] multiple call sites
   @ unknown:0

... very long stack trace ...

Hint: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erroneous code with Cthulhu.jl
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}, args::LLVM.Module)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Gp8bZ/src/validation.jl:167
  [2] macro expansion
    @ ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:417 [inlined]
  [3] macro expansion
    @ ~/.julia/packages/Tracy/tYwAE/src/tracepoint.jl:163 [inlined]
  [4] emit_llvm(job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:416
  [5] emit_llvm
    @ ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:182 [inlined]
  [6] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:95
  [7] compile_unhooked
    @ ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:80 [inlined]
  [8] compile(target::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:67
  [9] compile
    @ ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:55 [inlined]
 [10] #hipcompile##0
    @ ~/.julia/packages/AMDGPU/TqRG0/src/compiler/codegen.jl:211 [inlined]
 [11] JuliaContext(f::AMDGPU.Compiler.var"#hipcompile##0#hipcompile##1"{GPUCompiler.CompilerJob{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:34
 [12] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Gp8bZ/src/driver.jl:25
 [13] hipcompile(job::GPUCompiler.CompilerJob)
    @ AMDGPU.Compiler ~/.julia/packages/AMDGPU/TqRG0/src/compiler/codegen.jl:210
 [14] actual_compilation(cache::Dict{Any, AMDGPU.HIP.HIPFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}, compiler::typeof(AMDGPU.Compiler.hipcompile), linker::typeof(AMDGPU.Compiler.hiplink))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Gp8bZ/src/execution.jl:245
 [15] cached_compilation(cache::Dict{Any, AMDGPU.HIP.HIPFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.GCNCompilerTarget, AMDGPU.Compiler.HIPCompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/Gp8bZ/src/execution.jl:159
 [16] macro expansion
    @ ~/.julia/packages/AMDGPU/TqRG0/src/compiler/codegen.jl:166 [inlined]
 [17] macro expansion
    @ ./lock.jl:376 [inlined]
 [18] hipfunction(f::typeof(gpu_mwe_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{…}, KernelAbstractions.NDIteration.NDRange{…}}, AMDGPU.Device.ROCDeviceVector{Tuple{…}, 1}, AMDGPU.Device.ROCDeviceVector{Bool, 1}}}; kwargs::@Kwargs{})
    @ AMDGPU.Compiler ~/.julia/packages/AMDGPU/TqRG0/src/compiler/codegen.jl:160
 [19] hipfunction(f::typeof(gpu_mwe_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{…}, KernelAbstractions.NDIteration.NDRange{…}}, AMDGPU.Device.ROCDeviceVector{Tuple{…}, 1}, AMDGPU.Device.ROCDeviceVector{Bool, 1}}})
    @ AMDGPU.Compiler ~/.julia/packages/AMDGPU/TqRG0/src/compiler/codegen.jl:159
 [20] macro expansion
    @ ~/.julia/packages/AMDGPU/TqRG0/src/highlevel.jl:155 [inlined]
 [21] (::KernelAbstractions.Kernel{ROCBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(gpu_mwe_kernel)})(::ROCArray{Tuple{MockMomentum{Float32}, MockMomentum{Float32}}, 1, AMDGPU.Runtime.Mem.HIPBuffer}, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
    @ AMDGPU.ROCKernels ~/.julia/packages/AMDGPU/TqRG0/src/ROCKernels.jl:96
 [22] top-level scope
    @ ~/repos/QEDbase.jl/temp.jl:27
 [23] include(mapexpr::Function, mod::Module, _path::String)
    @ Base ./Base.jl:307
 [24] top-level scope
    @ REPL[4]:1
in expression starting at /home/reinha57/repos/QEDbase.jl/temp.jl:27
Some type information was truncated. Use `show(err)` to see complete types.

It looks like something in the == operator from SVectors is not properly understood by AMDGPU.jl. Since the same kernel works with CUDA.jl, it should work with AMDGPU.jl as well. A working "fix" is to implement == naively myself:

# Workaround: compare two momenta of the same concrete type component by component
# (indices 1 through 4), short-circuiting on the first mismatch, instead of relying
# on the generic `==` that fails to compile on AMDGPU.
# NOTE(review): no matching `hash` method is defined alongside this `==` — confirm whether one is needed.
Base.:(==)(mom1::MOM_T, mom2::MOM_T) where {MOM_T <: AbstractMockMomentum} = (mom1[1] == mom2[1] && mom1[2] == mom2[2] && mom1[3] == mom2[3] && mom1[4] == mom2[4])

But it would be nice not to have to rely on such workarounds. Maybe this report can help identify the underlying issue and bring AMDGPU.jl closer to feature parity with CUDA.jl.

AMDGPU.versioninfo():

[ Info: AMDGPU versioninfo
┌───────────┬──────────────────┬───────────┬─────────────────────────────────────────────────────────────────────────────────────────┐
│ Available │ Name             │ Version   │ Path                                                                                    │
├───────────┼──────────────────┼───────────┼─────────────────────────────────────────────────────────────────────────────────────────┤
│     +     │ LLD              │ -         │ /opt/rocm-6.0.2/lib/llvm/bin/ld.lld                                                     │
│     +     │ Device Libraries │ -         │ /home/reinha57/.julia/artifacts/b46ab46ef568406312e5f500efb677511199c2f9/amdgcn/bitcode │
│     +     │ HIP              │ 6.0.32831 │ /opt/rocm-6.0.2/lib/libamdhip64.so                                                      │
│     +     │ rocBLAS          │ 4.0.0     │ /opt/rocm-6.0.2/lib/librocblas.so                                                       │
│     +     │ rocSOLVER        │ 3.24.0    │ /opt/rocm-6.0.2/lib/librocsolver.so                                                     │
│     +     │ rocSPARSE        │ 3.0.2     │ /opt/rocm-6.0.2/lib/librocsparse.so                                                     │
│     +     │ rocRAND          │ 2.10.5    │ /opt/rocm-6.0.2/lib/librocrand.so                                                       │
│     +     │ rocFFT           │ 1.0.25    │ /opt/rocm-6.0.2/lib/librocfft.so                                                        │
│     -     │ MIOpen           │ -         │ -                                                                                       │
└───────────┴──────────────────┴───────────┴─────────────────────────────────────────────────────────────────────────────────────────┘

[ Info: AMDGPU devices
┌────┬────────────────────────┬──────────┬───────────┬────────────┬───────────────┐
│ Id │                   Name │ GCN arch │ Wavefront │     Memory │ Shared Memory │
├────┼────────────────────────┼──────────┼───────────┼────────────┼───────────────┤
│  1 │ AMD Radeon RX 7900 XTX │  gfx1100 │        32 │ 23.984 GiB │    64.000 KiB │
└────┴────────────────────────┴──────────┴───────────┴────────────┴───────────────┘

rocminfo output:

ROCk module version 6.14.14 is loaded
=====================    
HSA System Attributes    
=====================    
Runtime Version:         1.14
Runtime Ext Version:     1.6
System Timestamp Freq.:  1000.000000MHz
Sig. Max Wait Duration:  18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model:           LARGE                              
System Endianness:       LITTLE                             
Mwaitx:                  DISABLED
DMAbuf Support:          YES

==========               
HSA Agents               
==========               
*******                  
Agent 1                  
*******                  
  Name:                    AMD EPYC 7452 32-Core Processor    
  Uuid:                    CPU-XX                             
  Marketing Name:          AMD EPYC 7452 32-Core Processor    
  Vendor Name:             CPU                                
  Feature:                 None specified                     
  Profile:                 FULL_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        0(0x0)                             
  Queue Min Size:          0(0x0)                             
  Queue Max Size:          0(0x0)                             
  Queue Type:              MULTI                              
  Node:                    0                                  
  Device Type:             CPU                                
  Cache Info:              
    L1:                      32768(0x8000) KB                   
  Chip ID:                 0(0x0)                             
  ASIC Revision:           0(0x0)                             
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   2350                               
  BDFID:                   0                                  
  Internal Node ID:        0                                  
  Compute Unit:            64                                 
  SIMDs per CU:            0                                  
  Shader Engines:          0                                  
  Shader Arrs. per Eng.:   0                                  
  WatchPts on Addr. Ranges:1                                  
  Memory Properties:       
  Features:                None
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: FINE GRAINED        
      Size:                    263766732(0xfb8c2cc) KB            
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 2                   
      Segment:                 GLOBAL; FLAGS: KERNARG, FINE GRAINED
      Size:                    263766732(0xfb8c2cc) KB            
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 3                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    263766732(0xfb8c2cc) KB            
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
  ISA Info:                
*******                  
Agent 2                  
*******                  
  Name:                    gfx1100                            
  Uuid:                    GPU-8459fddd3785d451               
  Marketing Name:          AMD Radeon RX 7900 XTX             
  Vendor Name:             AMD                                
  Feature:                 KERNEL_DISPATCH                    
  Profile:                 BASE_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        128(0x80)                          
  Queue Min Size:          64(0x40)                           
  Queue Max Size:          131072(0x20000)                    
  Queue Type:              MULTI                              
  Node:                    1                                  
  Device Type:             GPU                                
  Cache Info:              
    L1:                      32(0x20) KB                        
    L2:                      6144(0x1800) KB                    
    L3:                      98304(0x18000) KB                  
  Chip ID:                 29772(0x744c)                      
  ASIC Revision:           0(0x0)                             
  Cacheline Size:          128(0x80)                          
  Max Clock Freq. (MHz):   2371                               
  BDFID:                   49920                              
  Internal Node ID:        1                                  
  Compute Unit:            96                                 
  SIMDs per CU:            2                                  
  Shader Engines:          6                                  
  Shader Arrs. per Eng.:   2                                  
  WatchPts on Addr. Ranges:4                                  
  Coherent Host Access:    FALSE                              
  Memory Properties:       
  Features:                KERNEL_DISPATCH 
  Fast F16 Operation:      TRUE                               
  Wavefront Size:          32(0x20)                           
  Workgroup Max Size:      1024(0x400)                        
  Workgroup Max Size per Dimension:
    x                        1024(0x400)                        
    y                        1024(0x400)                        
    z                        1024(0x400)                        
  Max Waves Per CU:        32(0x20)                           
  Max Work-item Per CU:    1024(0x400)                        
  Grid Max Size:           4294967295(0xffffffff)             
  Grid Max Size per Dimension:
    x                        4294967295(0xffffffff)             
    y                        4294967295(0xffffffff)             
    z                        4294967295(0xffffffff)             
  Max fbarriers/Workgrp:   32                                 
  Packet Processor uCode:: 542                                
  SDMA engine uCode::      24                                 
  IOMMU Support::          None                               
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    25149440(0x17fc000) KB             
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:2048KB                             
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 2                   
      Segment:                 GLOBAL; FLAGS: EXTENDED FINE GRAINED
      Size:                    25149440(0x17fc000) KB             
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:2048KB                             
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 3                   
      Segment:                 GROUP                              
      Size:                    64(0x40) KB                        
      Allocatable:             FALSE                              
      Alloc Granule:           0KB                                
      Alloc Recommended Granule:0KB                                
      Alloc Alignment:         0KB                                
      Accessible by all:       FALSE                              
  ISA Info:                
    ISA 1                    
      Name:                    amdgcn-amd-amdhsa--gfx1100         
      Machine Models:          HSA_MACHINE_MODEL_LARGE            
      Profiles:                HSA_PROFILE_BASE                   
      Default Rounding Mode:   NEAR                               
      Default Rounding Mode:   NEAR                               
      Fast f16:                TRUE                               
      Workgroup Max Size:      1024(0x400)                        
      Workgroup Max Size per Dimension:
        x                        1024(0x400)                        
        y                        1024(0x400)                        
        z                        1024(0x400)                        
      Grid Max Size:           4294967295(0xffffffff)             
      Grid Max Size per Dimension:
        x                        4294967295(0xffffffff)             
        y                        4294967295(0xffffffff)             
        z                        4294967295(0xffffffff)             
      FBarrier Max Size:       32                                 
*** Done ***             

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions