diff --git a/runtime/codert_vm/CMakeLists.txt b/runtime/codert_vm/CMakeLists.txt index 0b39a85c094..acff6fe23b3 100644 --- a/runtime/codert_vm/CMakeLists.txt +++ b/runtime/codert_vm/CMakeLists.txt @@ -49,9 +49,11 @@ target_include_directories(j9codert_vm if(OMR_ARCH_X86) j9vm_gen_asm(xnathelp.m4) + j9vm_gen_asm(xvector.m4) target_sources(j9codert_vm PRIVATE xnathelp.s + xvector.s ) elseif(OMR_ARCH_POWER) j9vm_gen_asm(pnathelp.m4) diff --git a/runtime/codert_vm/cnathelp.cpp b/runtime/codert_vm/cnathelp.cpp index b8a078ba8e7..c7d687d606b 100644 --- a/runtime/codert_vm/cnathelp.cpp +++ b/runtime/codert_vm/cnathelp.cpp @@ -71,6 +71,15 @@ samePCs(void *pc1, void *pc2) #define samePCs(pc1, pc2) (MASK_PC(pc1) == MASK_PC(pc2)) #endif /* J9ZOS390 && !J9VM_ENV_DATA64 */ +#if defined(J9HAMMER) && (JAVA_SPEC_VERSION >= 17) +#define JIT_HELPER(x) extern "C" void x() +JIT_HELPER(jitSaveVectorRegistersAVX512); +JIT_HELPER(jitRestoreVectorRegistersAVX512); + +JIT_HELPER(jitSaveVectorRegistersAVX); +JIT_HELPER(jitRestoreVectorRegistersAVX); +#endif /* defined(J9HAMMER) && (JAVA_SPEC_VERSION >= 17) */ + /** * Fix the java and decompilation stacks for cases where exceptions can be * thrown from insde a JIT synthetic exception handler. There must be a @@ -4125,6 +4134,19 @@ initPureCFunctionTable(J9JavaVM *vm) jitConfig->old_slow_jitReportInstanceFieldWrite = (void*)old_slow_jitReportInstanceFieldWrite; jitConfig->old_slow_jitReportStaticFieldRead = (void*)old_slow_jitReportStaticFieldRead; jitConfig->old_slow_jitReportStaticFieldWrite = (void*)old_slow_jitReportStaticFieldWrite; + +#if defined(J9HAMMER) && (JAVA_SPEC_VERSION >= 17) + if (J9_ARE_ANY_BITS_SET(vm->extendedRuntimeFlags3, J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512)) + { + jitConfig->saveVectorRegisters = (void *)jitSaveVectorRegistersAVX512; + jitConfig->restoreVectorRegisters = (void *)jitRestoreVectorRegistersAVX512; + } + else if (J9_ARE_ANY_BITS_SET(vm->extendedRuntimeFlags3, J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256)) + { + jitConfig->saveVectorRegisters = (void *)jitSaveVectorRegistersAVX; + jitConfig->restoreVectorRegisters = (void *)jitRestoreVectorRegistersAVX; + } +#endif /* defined(J9HAMMER) && (JAVA_SPEC_VERSION >= 17) */ } } /* extern "C" */ diff --git a/runtime/codert_vm/xnathelp.m4 b/runtime/codert_vm/xnathelp.m4 index c2b31464422..7aa914de625 100644 --- a/runtime/codert_vm/xnathelp.m4 +++ b/runtime/codert_vm/xnathelp.m4 @@ -1070,10 +1070,13 @@ START_PROC(jitReferenceArrayCopy) mov PARM_REG(2),_rcx mov PARM_REG(1),_rbp call FASTCALL_SYMBOL(impl_jitReferenceArrayCopy,2) - dnl set ZF if succeed - test _rax,_rax + dnl Save return value to check later. + dnl We don't check it now because restoring the register clobbers flags. + mov dword ptr J9TR_VMThread_floatTemp3[_rbp],eax RESTORE_C_VOLATILE_REGS SWITCH_TO_JAVA_STACK + dnl Set ZF on success. + test dword ptr J9TR_VMThread_floatTemp3[_rbp], -1 push uword ptr J9TR_VMThread_jitReturnAddress[_rbp] ret END_PROC(jitReferenceArrayCopy) diff --git a/runtime/codert_vm/xvector.m4 b/runtime/codert_vm/xvector.m4 new file mode 100644 index 00000000000..5c1dac63424 --- /dev/null +++ b/runtime/codert_vm/xvector.m4 @@ -0,0 +1,123 @@ +dnl Copyright IBM Corp. and others 2023 +dnl +dnl This program and the accompanying materials are made available under +dnl the terms of the Eclipse Public License 2.0 which accompanies this +dnl distribution and is available at https://www.eclipse.org/legal/epl-2.0/ +dnl or the Apache License, Version 2.0 which accompanies this distribution and +dnl is available at https://www.apache.org/licenses/LICENSE-2.0. +dnl +dnl This Source Code may also be made available under the following +dnl Secondary Licenses when the conditions for such availability set +dnl forth in the Eclipse Public License, v. 2.0 are satisfied: GNU +dnl General Public License, version 2 with the GNU Classpath +dnl Exception [1] and GNU General Public License, version 2 with the +dnl OpenJDK Assembly Exception [2]. +dnl +dnl [1] https://www.gnu.org/software/classpath/license.html +dnl [2] https://openjdk.org/legal/assembly-exception.html +dnl +dnl SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 OR GPL-2.0-only WITH Classpath-exception-2.0 OR GPL-2.0-only WITH OpenJDK-assembly-exception-1.0 + +include(xhelpers.m4) + + FILE_START + +dnl For all of these functions, on entry: +dnl +dnl 1) return address on the stack +dnl 2) r8 is a scratch register on 64-bit +dnl 3) eax is a scratch register on 32-bit + +START_PROC(jitSaveVectorRegistersAVX512) + lfence + + dnl save ZMM registers + +ifdef({ASM_J9VM_ENV_DATA64},{ + pop r8 + forloop({REG_CTR}, 0, 31, {SAVE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) +}, { dnl ASM_J9VM_ENV_DATA64 + pop eax + forloop({REG_CTR}, 0, 7, {SAVE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) +}) + + vzeroupper + + dnl save Opmask registers + forloop({REG_CTR}, 0, 7, {SAVE_MASK_64(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*8))}) + +ifdef({ASM_J9VM_ENV_DATA64},{ + push r8 +}, { dnl ASM_J9VM_ENV_DATA64 + push eax +}) + ret +END_PROC(jitSaveVectorRegistersAVX512) + +START_PROC(jitRestoreVectorRegistersAVX512) + lfence + + dnl restore ZMM registers +ifdef({ASM_J9VM_ENV_DATA64},{ + pop r8 + forloop({REG_CTR}, 0, 31, {RESTORE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) +}, { dnl ASM_J9VM_ENV_DATA64 + pop eax + forloop({REG_CTR}, 0, 7, {RESTORE_ZMM_REG(REG_CTR, J9TR_cframe_jitFPRs+(REG_CTR*64))}) +}) + + dnl restore Opmask registers + forloop({REG_CTR}, 0, 7, {RESTORE_MASK_64(REG_CTR, J9TR_cframe_maskRegisters+(REG_CTR*8))}) + +ifdef({ASM_J9VM_ENV_DATA64},{ + push r8 +}, { dnl ASM_J9VM_ENV_DATA64 + push eax +}) + ret +END_PROC(jitRestoreVectorRegistersAVX512) + +START_PROC(jitSaveVectorRegistersAVX) + lfence + + dnl save YMM registers + +ifdef({ASM_J9VM_ENV_DATA64},{ + pop r8 + forloop({REG_CTR}, 0, 15, {vmovdqu ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp],ymm{}REG_CTR}) +}, { dnl ASM_J9VM_ENV_DATA64 + pop eax + forloop({REG_CTR}, 0, 7, {vmovdqu ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp],ymm{}REG_CTR}) +}) + + vzeroupper + +ifdef({ASM_J9VM_ENV_DATA64},{ + push r8 +}, { dnl ASM_J9VM_ENV_DATA64 + push eax +}) + ret +END_PROC(jitSaveVectorRegistersAVX) + +START_PROC(jitRestoreVectorRegistersAVX) + lfence + + dnl restore YMM registers +ifdef({ASM_J9VM_ENV_DATA64},{ + pop r8 + forloop({REG_CTR}, 0, 15, {vmovdqu ymm{}REG_CTR,ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp]}) +}, { dnl ASM_J9VM_ENV_DATA64 + pop eax + forloop({REG_CTR}, 0, 7, {vmovdqu ymm{}REG_CTR,ymmword ptr J9TR_cframe_jitFPRs+(REG_CTR*32)[_rsp]}) +}) + +ifdef({ASM_J9VM_ENV_DATA64},{ + push r8 +}, { dnl ASM_J9VM_ENV_DATA64 + push eax +}) + ret +END_PROC(jitRestoreVectorRegistersAVX) + + FILE_END diff --git a/runtime/jilgen/jilconsts.c b/runtime/jilgen/jilconsts.c index 5b3395fedd3..8214b0c0688 100644 --- a/runtime/jilgen/jilconsts.c +++ b/runtime/jilgen/jilconsts.c @@ -399,6 +399,7 @@ writeConstants(OMRPortLibrary *OMRPORTLIB, IDATA fd) writeConstant(OMRPORTLIB, fd, "J9TR_cframe_machineBP", offsetof(J9CInterpreterStackFrame, machineBP)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_jitGPRs", offsetof(J9CInterpreterStackFrame, jitGPRs)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_jitFPRs", offsetof(J9CInterpreterStackFrame, jitFPRs)) | + writeConstant(OMRPORTLIB, fd, "J9TR_cframe_maskRegisters", offsetof(J9CInterpreterStackFrame, maskRegisters)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_rax", offsetof(J9CInterpreterStackFrame, jitGPRs.jitGPRs.named.rax)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_rbx", offsetof(J9CInterpreterStackFrame, jitGPRs.jitGPRs.named.rbx)) | writeConstant(OMRPORTLIB, fd, "J9TR_cframe_rcx", offsetof(J9CInterpreterStackFrame, jitGPRs.jitGPRs.named.rcx)) | @@ -505,6 +506,7 @@ writeConstants(OMRPortLibrary *OMRPORTLIB, IDATA fd) writeConstant(OMRPORTLIB, fd, "J9TR_JavaVM_bytecodeLoop", offsetof(J9JavaVM, bytecodeLoop)) | writeConstant(OMRPORTLIB, fd, "J9TR_JavaVM_extendedRuntimeFlags", offsetof(J9JavaVM, extendedRuntimeFlags)) | writeConstant(OMRPORTLIB, fd, "J9TR_JavaVM_extendedRuntimeFlags2", offsetof(J9JavaVM, extendedRuntimeFlags2)) | + writeConstant(OMRPORTLIB, fd, "J9TR_JavaVM_extendedRuntimeFlags3", offsetof(J9JavaVM, extendedRuntimeFlags3)) | writeConstant(OMRPORTLIB, fd, "J9TR_JavaVMInternalFunctionTable", offsetof(J9JavaVM, internalVMFunctions)) | writeConstant(OMRPORTLIB, fd, "J9TR_JavaVM_memoryManagerFunctions", offsetof(J9JavaVM, memoryManagerFunctions)) | #if defined(OMR_GC_CONCURRENT_SCAVENGER) && defined(J9VM_ARCH_S390) @@ -669,6 +671,8 @@ writeConstants(OMRPortLibrary *OMRPORTLIB, IDATA fd) writeConstant(OMRPORTLIB, fd, "J9TR_JitConfig_old_slow_jitReportInstanceFieldWrite", offsetof(J9JITConfig, old_slow_jitReportInstanceFieldWrite)) | writeConstant(OMRPORTLIB, fd, "J9TR_JitConfig_old_slow_jitReportStaticFieldRead", offsetof(J9JITConfig, old_slow_jitReportStaticFieldRead)) | writeConstant(OMRPORTLIB, fd, "J9TR_JitConfig_old_slow_jitReportStaticFieldWrite", offsetof(J9JITConfig, old_slow_jitReportStaticFieldWrite)) | + writeConstant(OMRPORTLIB, fd, "J9TR_JitConfig_saveVectorRegisters", offsetof(J9JITConfig, saveVectorRegisters)) | + writeConstant(OMRPORTLIB, fd, "J9TR_JitConfig_restoreVectorRegisters", offsetof(J9JITConfig, restoreVectorRegisters)) | writeConstant(OMRPORTLIB, fd, "J9TR_JitConfig_old_fast_jitGetFlattenableField", offsetof(J9JITConfig, old_fast_jitGetFlattenableField)) | writeConstant(OMRPORTLIB, fd, "J9TR_JitConfig_old_fast_jitCloneValueType", offsetof(J9JITConfig, old_fast_jitCloneValueType)) | @@ -777,6 +781,8 @@ writeConstants(OMRPortLibrary *OMRPORTLIB, IDATA fd) writeConstant(OMRPORTLIB, fd, "J9TR_ELSSize", sizeof(J9VMEntryLocalStorage)) | writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME_DEBUG_MODE", J9_EXTENDED_RUNTIME_DEBUG_MODE) | writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS", J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS) | + writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256", J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256) | + writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512", J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512) | writeConstant(OMRPORTLIB, fd, "J9TR_J9_EXTENDED_RUNTIME2_COMPRESS_OBJECT_REFERENCES", J9_EXTENDED_RUNTIME2_COMPRESS_OBJECT_REFERENCES) | writeConstant(OMRPORTLIB, fd, "J9TR_J9_INLINE_JNI_MAX_ARG_COUNT", J9_INLINE_JNI_MAX_ARG_COUNT) | diff --git a/runtime/oti/j9consts.h b/runtime/oti/j9consts.h index ca8def0bd91..642a1a37318 100644 --- a/runtime/oti/j9consts.h +++ b/runtime/oti/j9consts.h @@ -374,6 +374,8 @@ extern "C" { #define J9_EXTENDED_RUNTIME3_YIELD_PINNED_CONTINUATION 0x2 #define J9_EXTENDED_RUNTIME3_CACHE_MAPS 0x4 #define J9_EXTENDED_RUNTIME3_MODULE_PACKAGES_INITIALIZED 0x8 +#define J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 0x10 +#define J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512 0x20 #define J9_OBJECT_HEADER_AGE_DEFAULT 0xA /* OBJECT_HEADER_AGE_DEFAULT */ #define J9_OBJECT_HEADER_SHAPE_MASK 0xE /* OBJECT_HEADER_SHAPE_MASK */ diff --git a/runtime/oti/j9nonbuilder.h b/runtime/oti/j9nonbuilder.h index 12f6e263749..c0d71b6152f 100644 --- a/runtime/oti/j9nonbuilder.h +++ b/runtime/oti/j9nonbuilder.h @@ -4305,6 +4305,8 @@ typedef struct J9JITConfig { void *old_slow_jitReportInstanceFieldWrite; void *old_slow_jitReportStaticFieldRead; void *old_slow_jitReportStaticFieldWrite; + void *saveVectorRegisters; + void *restoreVectorRegisters; struct J9MemorySegment* codeCache; struct J9MemorySegment* dataCache; struct J9MemorySegmentList* codeCacheList; @@ -6906,8 +6908,9 @@ typedef struct J9CInterpreterStackFrame { * * Stack must be 16-byte aligned. */ - U_8 jitFPRs[6 * 16]; /* xmm0-5 128-bit OR xmm0-7 64-bit */ + U_8 jitFPRs[6 * 64]; /* zmm0-5 512-bit OR xmm0-7 64-bit */ U_8 preservedFPRs[10 * 16]; /* xmm6-15 128-bit */ + U_8 maskRegisters[8 * 8]; /* k0-k7 */ UDATA align[1]; /* r15,r14,r13,r12,rdi,rsi,rbx,rbp,return address * RSP is 16-byte aligned at this point @@ -6917,7 +6920,8 @@ typedef struct J9CInterpreterStackFrame { * * Stack must be 16-byte aligned. */ - U_8 jitFPRs[16 * 16]; /* xmm0-15 128-bit OR xmm0-7 64-bit */ + U_8 jitFPRs[32 * 64]; /* zmm0-31 512-bit OR xmm0-7 64-bit */ + U_8 maskRegisters[8 * 8]; /* k0-k7 */ UDATA align[1]; /* r15,r14,r13,r12,rbx,rbp,return address * RSP is 16-byte aligned at this point @@ -6930,7 +6934,8 @@ typedef struct J9CInterpreterStackFrame { */ J9JITGPRSpillArea jitGPRs; UDATA align1[2]; - U_8 jitFPRs[8 * 16]; /* xmm0-7 128-bit */ + U_8 jitFPRs[8 * 64]; /* zmm0-7 512-bit */ + U_8 maskRegisters[8 * 8]; /* k0-k7 */ UDATA align2[1]; /* ebx,edi,esi * ESP is forcibly 16-byte aligned at this point diff --git a/runtime/oti/jvminit.h b/runtime/oti/jvminit.h index 7b805b41ca4..b74a10f19a1 100644 --- a/runtime/oti/jvminit.h +++ b/runtime/oti/jvminit.h @@ -664,6 +664,7 @@ enum INIT_STAGE { #define VMOPT_XCONCURRENTBACKGROUND "-Xconcurrentbackground" #define VMOPT_XGCTHREADS "-Xgcthreads" #define VMOPT_XGCMAXTHREADS "-Xgcmaxthreads" +#define VMOPT_PRESERVE_VECTORS "-XPreserveExtendedRegs" #define VMOPT_XXSHOW_EXTENDED_NPE_MESSAGE "-XX:+ShowCodeDetailsInExceptionMessages" #define VMOPT_XXNOSHOW_EXTENDED_NPE_MESSAGE "-XX:-ShowCodeDetailsInExceptionMessages" diff --git a/runtime/oti/xhelpers.m4 b/runtime/oti/xhelpers.m4 index 4847b37e167..8f3a20e408e 100644 --- a/runtime/oti/xhelpers.m4 +++ b/runtime/oti/xhelpers.m4 @@ -43,8 +43,76 @@ ifdef({ASM_J9VM_ENV_DATA64},{ LABEL(skip_vzu{}VZU_COUNT): }) +dnl for(=; <= ; ++) { } +dnl $1 = symbol name +dnl $2 = starting value +dnl $3 = ending value +dnl $4 = expression +define({forloop}, + {define({$1}, {$2})$4 + ifelse({$2}, {$3}, {},{$0({$1}, incr({$2}), {$3}, {$4})})}) +define({SYM_COUNT},0) +define({INC_SYM_COUNT},{define({SYM_COUNT},incr(SYM_COUNT))}) + J9CONST({CINTERP_STACK_SIZE},J9TR_cframe_sizeof) +dnl Work arround for older versions of MASM which don't support AVX-512 +ifdef({WIN32},{ + + dnl Generate instruction of format OP , [ + rsp] + dnl $1 - prefix + dnl $2 - opcode + dnl $3 - reg number + dnl $4 - offset + dnl low 3 bits of register number are stored in modR/M[5:3] + define({INSTRUCTION}, { + dnl prefix + $1 + dnl opcode + BYTE $2 + dnl modR/M byte + BYTE 084h OR (($3 AND 7) SHL 3 ) + dnl SIB byte + BYTE 024h + dnl displacement + DWORD $4 + }) + + dnl 2 byte VEX prefix + define({VEX2},{BYTE 0c5h, 0f8h}) + + dnl 3 byte VEX prefix with W bit set + define({VEX3},{BYTE 0c4h, 0e1h, 0f8h}) + + dnl EVEX prefix + dnl $1 - register number + dnl bits 3 and 4 of the register number are stored inverted in bits 7 and 4 in the second byte of the EVEX prefix + define({EVEX},{ + BYTE 062h + BYTE 061h OR ((NOT $1 AND 8) SHL 4) OR (NOT $1 AND 010h) + BYTE 0feh, 048h + }) + + dnl $1 = register number + dnl $2 = stack displacment + define({SAVE_MASK_16}, {INSTRUCTION({VEX2}, 090h, {$1}, {$2})}) + define({RESTORE_MASK_16}, {INSTRUCTION({VEX2}, 091h, {$1}, {$2})}) + define({SAVE_MASK_64}, {INSTRUCTION({VEX3}, 090h, {$1}, {$2})}) + define({RESTORE_MASK_64}, {INSTRUCTION({VEX3}, 091h, {$1}, {$2})}) + define({SAVE_ZMM_REG}, {INSTRUCTION({EVEX({$1})}, 07fh, {$1}, {$2})}) + define({RESTORE_ZMM_REG}, {INSTRUCTION({EVEX({$1})}, 06fh, {$1}, {$2})}) + +},{ dnl WIN32 + dnl $1 = register number + dnl $2 = stack displacment + define({SAVE_MASK_16}, {kmovw word ptr $2[_rsp],k{}$1}) + define({RESTORE_MASK_16}, {kmovw k{}$1,word ptr $2[_rsp]}) + define({SAVE_MASK_64}, {kmovq qword ptr $2[_rsp],k{}$1}) + define({RESTORE_MASK_64}, {kmovq k{}$1,qword ptr $2[_rsp]}) + define({SAVE_ZMM_REG}, {vmovdqu64 zmmword ptr $2[_rsp],zmm{}$1}) + define({RESTORE_ZMM_REG}, {vmovdqu64 zmm{}$1,zmmword ptr $2[_rsp]}) + +}) dnl WIN32 ifdef({WIN32},{ define({SHORT_JMP},{short}) @@ -332,7 +400,6 @@ define({SAVE_C_VOLATILE_REGS},{ mov qword ptr J9TR_cframe_r9[_rsp],r9 mov qword ptr J9TR_cframe_r10[_rsp],r10 mov qword ptr J9TR_cframe_r11[_rsp],r11 - EMIT_VZEROUPPER_IF_AVX() ifdef({METHOD_INVOCATION},{ movq qword ptr J9TR_cframe_jitFPRs+(0*8)[_rsp],xmm0 movq qword ptr J9TR_cframe_jitFPRs+(1*8)[_rsp],xmm1 @@ -341,23 +408,37 @@ ifdef({METHOD_INVOCATION},{ movq qword ptr J9TR_cframe_jitFPRs+(4*8)[_rsp],xmm4 movq qword ptr J9TR_cframe_jitFPRs+(5*8)[_rsp],xmm5 },{ dnl METHOD_INVOCATION + mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] + mov r8d,J9TR_JavaVM_extendedRuntimeFlags3[r8] + test r8d,(J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 | J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512) + jz LABEL(L_xmm_save{}SYM_COUNT) + + dnl save YMM/ZMM registers (out-of-line) + LABEL(L_ool_save{}SYM_COUNT): + + mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] + mov r8,J9TR_JavaVMJitConfig[r8] + mov r8, J9TR_JitConfig_saveVectorRegisters[r8] + call r8 + + jmp LABEL(L_save_volatile_done{}SYM_COUNT) + + dnl save XMM registers + LABEL(L_xmm_save{}SYM_COUNT): + EMIT_VZEROUPPER_IF_AVX() movdqa J9TR_cframe_jitFPRs+(0*16)[_rsp],xmm0 movdqa J9TR_cframe_jitFPRs+(1*16)[_rsp],xmm1 movdqa J9TR_cframe_jitFPRs+(2*16)[_rsp],xmm2 movdqa J9TR_cframe_jitFPRs+(3*16)[_rsp],xmm3 movdqa J9TR_cframe_jitFPRs+(4*16)[_rsp],xmm4 movdqa J9TR_cframe_jitFPRs+(5*16)[_rsp],xmm5 + + LABEL(L_save_volatile_done{}SYM_COUNT): + INC_SYM_COUNT() }) dnl METHOD_INVOCATION }) define({RESTORE_C_VOLATILE_REGS},{ - mov rax,qword ptr J9TR_cframe_rax[_rsp] - mov rcx,qword ptr J9TR_cframe_rcx[_rsp] - mov rdx,qword ptr J9TR_cframe_rdx[_rsp] - mov r8,qword ptr J9TR_cframe_r8[_rsp] - mov r9,qword ptr J9TR_cframe_r9[_rsp] - mov r10,qword ptr J9TR_cframe_r10[_rsp] - mov r11,qword ptr J9TR_cframe_r11[_rsp] ifdef({METHOD_INVOCATION},{ movq xmm0,qword ptr J9TR_cframe_jitFPRs+(0*8)[_rsp] movq xmm1,qword ptr J9TR_cframe_jitFPRs+(1*8)[_rsp] @@ -366,13 +447,42 @@ ifdef({METHOD_INVOCATION},{ movq xmm4,qword ptr J9TR_cframe_jitFPRs+(4*8)[_rsp] movq xmm5,qword ptr J9TR_cframe_jitFPRs+(5*8)[_rsp] },{ dnl METHOD_INVOCATION + dnl J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 marks if we are using AVX-2 (eg YMM) + dnl J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512 marks if we are using AVX-512 (eg ZMM) + dnl No flags means normal SSE registers (XMM) + mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] + mov r8d,J9TR_JavaVM_extendedRuntimeFlags3[r8] + test r8d,(J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 | J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512) + jz LABEL(L_xmm_restore{}SYM_COUNT) + + dnl restore YMM/ZMM registers (out-of-line) + LABEL(L_ool_restore{}SYM_COUNT): + + mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] + mov r8,J9TR_JavaVMJitConfig[r8] + mov r8,J9TR_JitConfig_restoreVectorRegisters[r8] + call r8 + + jmp LABEL(L_restore_volatile_done{}SYM_COUNT) + + dnl restore XMM registers + LABEL(L_xmm_restore{}SYM_COUNT): movdqa xmm0,J9TR_cframe_jitFPRs+(0*16)[_rsp] movdqa xmm1,J9TR_cframe_jitFPRs+(1*16)[_rsp] movdqa xmm2,J9TR_cframe_jitFPRs+(2*16)[_rsp] movdqa xmm3,J9TR_cframe_jitFPRs+(3*16)[_rsp] movdqa xmm4,J9TR_cframe_jitFPRs+(4*16)[_rsp] movdqa xmm5,J9TR_cframe_jitFPRs+(5*16)[_rsp] + LABEL(L_restore_volatile_done{}SYM_COUNT): + INC_SYM_COUNT() }) dnl METHOD_INVOCATION + mov rax,qword ptr J9TR_cframe_rax[_rsp] + mov rcx,qword ptr J9TR_cframe_rcx[_rsp] + mov rdx,qword ptr J9TR_cframe_rdx[_rsp] + mov r8,qword ptr J9TR_cframe_r8[_rsp] + mov r9,qword ptr J9TR_cframe_r9[_rsp] + mov r10,qword ptr J9TR_cframe_r10[_rsp] + mov r11,qword ptr J9TR_cframe_r11[_rsp] }) dnl No need to save/restore xmm8-15 - the stack walker will never need to read @@ -422,7 +532,6 @@ define({SAVE_C_VOLATILE_REGS},{ mov qword ptr J9TR_cframe_r9[_rsp],r9 mov qword ptr J9TR_cframe_r10[_rsp],r10 mov qword ptr J9TR_cframe_r11[_rsp],r11 - EMIT_VZEROUPPER_IF_AVX() ifdef({METHOD_INVOCATION},{ movq qword ptr J9TR_cframe_jitFPRs+(0*8)[_rsp],xmm0 movq qword ptr J9TR_cframe_jitFPRs+(1*8)[_rsp],xmm1 @@ -433,6 +542,27 @@ ifdef({METHOD_INVOCATION},{ movq qword ptr J9TR_cframe_jitFPRs+(6*8)[_rsp],xmm6 movq qword ptr J9TR_cframe_jitFPRs+(7*8)[_rsp],xmm7 },{ dnl METHOD_INVOCATION + dnl J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 marks if we are using AVX-2 (eg YMM) + dnl J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512 marks if we are using AVX-512 (eg ZMM) + dnl No flags means normal SSE registers (XMM) + mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] + mov r8d,J9TR_JavaVM_extendedRuntimeFlags3[r8] + test r8d,(J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 | J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512) + jz LABEL(L_xmm_save{}SYM_COUNT) + + dnl save YMM/ZMM registers (out-of-line) + LABEL(L_ool_save{}SYM_COUNT): + + mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] + mov r8,J9TR_JavaVMJitConfig[r8] + mov r8,J9TR_JitConfig_saveVectorRegisters[r8] + call r8 + + jmp LABEL(L_save_volatile_done{}SYM_COUNT) + + dnl save XMM registers + LABEL(L_xmm_save{}SYM_COUNT): + EMIT_VZEROUPPER_IF_AVX() movdqa J9TR_cframe_jitFPRs+(0*16)[_rsp],xmm0 movdqa J9TR_cframe_jitFPRs+(1*16)[_rsp],xmm1 movdqa J9TR_cframe_jitFPRs+(2*16)[_rsp],xmm2 @@ -449,19 +579,13 @@ ifdef({METHOD_INVOCATION},{ movdqa J9TR_cframe_jitFPRs+(13*16)[_rsp],xmm13 movdqa J9TR_cframe_jitFPRs+(14*16)[_rsp],xmm14 movdqa J9TR_cframe_jitFPRs+(15*16)[_rsp],xmm15 + + LABEL(L_save_volatile_done{}SYM_COUNT): + INC_SYM_COUNT() }) dnl METHOD_INVOCATION }) define({RESTORE_C_VOLATILE_REGS},{ - mov rax,qword ptr J9TR_cframe_rax[_rsp] - mov rcx,qword ptr J9TR_cframe_rcx[_rsp] - mov rdx,qword ptr J9TR_cframe_rdx[_rsp] - mov rdi,qword ptr J9TR_cframe_rdi[_rsp] - mov rsi,qword ptr J9TR_cframe_rsi[_rsp] - mov r8,qword ptr J9TR_cframe_r8[_rsp] - mov r9,qword ptr J9TR_cframe_r9[_rsp] - mov r10,qword ptr J9TR_cframe_r10[_rsp] - mov r11,qword ptr J9TR_cframe_r11[_rsp] ifdef({METHOD_INVOCATION},{ movq xmm0,qword ptr J9TR_cframe_jitFPRs+(0*8)[_rsp] movq xmm1,qword ptr J9TR_cframe_jitFPRs+(1*8)[_rsp] @@ -472,6 +596,27 @@ ifdef({METHOD_INVOCATION},{ movq xmm6,qword ptr J9TR_cframe_jitFPRs+(6*8)[_rsp] movq xmm7,qword ptr J9TR_cframe_jitFPRs+(7*8)[_rsp] },{ dnl METHOD_INVOCATION + + dnl J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 marks if we are using AVX-2 (eg YMM) + dnl J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512 marks if we are using AVX-512 (eg ZMM) + dnl No flags means normal SSE registers (XMM) + mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] + mov r8d,J9TR_JavaVM_extendedRuntimeFlags3[r8] + test r8d,(J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 | J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512) + jz LABEL(L_xmm_restore{}SYM_COUNT) + + dnl restore YMM/ZMM registers (out-of-line) + LABEL(L_ool_restore{}SYM_COUNT): + + mov r8,J9TR_VMThread_javaVM[J9VMTHREAD] + mov r8,J9TR_JavaVMJitConfig[r8] + mov r8,J9TR_JitConfig_restoreVectorRegisters[r8] + call r8 + + jmp LABEL(L_restore_volatile_done{}SYM_COUNT) + + dnl restore XMM registers + LABEL(L_xmm_restore{}SYM_COUNT): movdqa xmm0,J9TR_cframe_jitFPRs+(0*16)[_rsp] movdqa xmm1,J9TR_cframe_jitFPRs+(1*16)[_rsp] movdqa xmm2,J9TR_cframe_jitFPRs+(2*16)[_rsp] @@ -488,7 +633,19 @@ ifdef({METHOD_INVOCATION},{ movdqa xmm13,J9TR_cframe_jitFPRs+(13*16)[_rsp] movdqa xmm14,J9TR_cframe_jitFPRs+(14*16)[_rsp] movdqa xmm15,J9TR_cframe_jitFPRs+(15*16)[_rsp] + + LABEL(L_restore_volatile_done{}SYM_COUNT): + INC_SYM_COUNT() }) dnl METHOD_INVOCATION + mov rax,qword ptr J9TR_cframe_rax[_rsp] + mov rcx,qword ptr J9TR_cframe_rcx[_rsp] + mov rdx,qword ptr J9TR_cframe_rdx[_rsp] + mov rdi,qword ptr J9TR_cframe_rdi[_rsp] + mov rsi,qword ptr J9TR_cframe_rsi[_rsp] + mov r8,qword ptr J9TR_cframe_r8[_rsp] + mov r9,qword ptr J9TR_cframe_r9[_rsp] + mov r10,qword ptr J9TR_cframe_r10[_rsp] + mov r11,qword ptr J9TR_cframe_r11[_rsp] }) define({SAVE_C_NONVOLATILE_REGS},{ @@ -552,6 +709,24 @@ define({SAVE_C_VOLATILE_REGS},{ ifdef({METHOD_INVOCATION},{ dnl No FP parameter registers },{ dnl METHOD_INVOCATION + mov eax,dword ptr J9TR_VMThread_javaVM[J9VMTHREAD] + mov eax,dword ptr J9TR_JavaVM_extendedRuntimeFlags3[eax] + test eax,(J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 | J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512) + jz LABEL(L_xmm_save{}SYM_COUNT) + + dnl save YMM/ZMM registers (out-of-line) + LABEL(L_ool_save{}SYM_COUNT): + + mov eax,J9TR_VMThread_javaVM[J9VMTHREAD] + mov eax,J9TR_JavaVMJitConfig[eax] + mov eax,J9TR_JitConfig_saveVectorRegisters[eax] + call eax + + jmp LABEL(L_save_volatile_done{}SYM_COUNT) + + dnl save XMM registers + LABEL(L_xmm_save{}SYM_COUNT): + EMIT_VZEROUPPER_IF_AVX() movdqa J9TR_cframe_jitFPRs+(0*16)[_rsp],xmm0 movdqa J9TR_cframe_jitFPRs+(1*16)[_rsp],xmm1 @@ -561,16 +736,34 @@ dnl No FP parameter registers movdqa J9TR_cframe_jitFPRs+(5*16)[_rsp],xmm5 movdqa J9TR_cframe_jitFPRs+(6*16)[_rsp],xmm6 movdqa J9TR_cframe_jitFPRs+(7*16)[_rsp],xmm7 + + LABEL(L_save_volatile_done{}SYM_COUNT): + INC_SYM_COUNT() + + mov eax,dword ptr J9TR_cframe_rax[_rsp] }) dnl METHOD_INVOCATION }) define({RESTORE_C_VOLATILE_REGS},{ - mov eax,dword ptr J9TR_cframe_rax[_rsp] - mov ecx,dword ptr J9TR_cframe_rcx[_rsp] - mov edx,dword ptr J9TR_cframe_rdx[_rsp] ifdef({METHOD_INVOCATION},{ dnl No FP parameter registers },{ dnl METHOD_INVOCATION + mov eax,dword ptr J9TR_VMThread_javaVM[J9VMTHREAD] + mov eax,dword ptr J9TR_JavaVM_extendedRuntimeFlags3[eax] + test eax,(J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256 | J9TR_J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512) + jz LABEL(L_xmm_restore{}SYM_COUNT) + + dnl restore YMM/ZMM registers (out-of-line) + LABEL(L_ool_restore{}SYM_COUNT): + + mov eax,J9TR_VMThread_javaVM[J9VMTHREAD] + mov eax,J9TR_JavaVMJitConfig[eax] + mov eax,J9TR_JitConfig_restoreVectorRegisters[eax] + call eax + + jmp LABEL(L_restore_volatile_done{}SYM_COUNT) + + LABEL(L_xmm_restore{}SYM_COUNT): movdqa xmm0,J9TR_cframe_jitFPRs+(0*16)[_rsp] movdqa xmm1,J9TR_cframe_jitFPRs+(1*16)[_rsp] movdqa xmm2,J9TR_cframe_jitFPRs+(2*16)[_rsp] @@ -579,7 +772,13 @@ dnl No FP parameter registers movdqa xmm5,J9TR_cframe_jitFPRs+(5*16)[_rsp] movdqa xmm6,J9TR_cframe_jitFPRs+(6*16)[_rsp] movdqa xmm7,J9TR_cframe_jitFPRs+(7*16)[_rsp] + + LABEL(L_restore_volatile_done{}SYM_COUNT): + INC_SYM_COUNT() }) dnl METHOD_INVOCATION + mov eax,dword ptr J9TR_cframe_rax[_rsp] + mov ecx,dword ptr J9TR_cframe_rcx[_rsp] + mov edx,dword ptr J9TR_cframe_rdx[_rsp] }) define({SAVE_C_NONVOLATILE_REGS},{ diff --git a/runtime/vm/jvminit.c b/runtime/vm/jvminit.c index bab38520421..6c5eeb557b3 100644 --- a/runtime/vm/jvminit.c +++ b/runtime/vm/jvminit.c @@ -1233,6 +1233,22 @@ initializeJavaVM(void * osMainThread, J9JavaVM ** vmPtr, J9CreateJavaVMParams *c ) { vm->extendedRuntimeFlags |= J9_EXTENDED_RUNTIME_USE_VECTOR_REGISTERS; } + +#if JAVA_SPEC_VERSION >= 17 + if (omrsysinfo_processor_has_feature(&desc, OMR_FEATURE_X86_AVX) + && omrsysinfo_processor_has_feature(&desc, OMR_FEATURE_X86_XSAVE_AVX) + ) { + vm->extendedRuntimeFlags3 |= J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256; + } + + if (omrsysinfo_processor_has_feature(&desc, OMR_FEATURE_X86_AVX512F) + && omrsysinfo_processor_has_feature(&desc, OMR_FEATURE_X86_AVX512BW) + && omrsysinfo_processor_has_feature(&desc, OMR_FEATURE_X86_XSAVE_AVX512) + ) { + vm->extendedRuntimeFlags3 |= J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512; + } +#endif /* JAVA_SPEC_VERSION >= 17 */ + } #endif /* defined(J9HAMMER) */ @@ -2748,6 +2764,14 @@ VMInitStages(J9JavaVM *vm, IDATA stage, void* reserved) /* Consumed here as the option is dealt with before the consumed args list exists */ FIND_AND_CONSUME_VMARG(STARTSWITH_MATCH, VMOPT_XOPTIONSFILE_EQUALS, NULL); +#if JAVA_SPEC_VERSION >= 17 + /* Extended vector register preservation (ymm/zmm/opmask registers) requires a VM option. */ + if ((argIndex = FIND_AND_CONSUME_VMARG(EXACT_MATCH, VMOPT_PRESERVE_VECTORS, NULL)) < 0) { + vm->extendedRuntimeFlags3 &= ~J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_256; + vm->extendedRuntimeFlags3 &= ~J9_EXTENDED_RUNTIME3_USE_VECTOR_LENGTH_512; + } +#endif + #ifdef J9VM_OPT_METHOD_HANDLE if ((argIndex = FIND_AND_CONSUME_VMARG(STARTSWITH_MATCH, VMOPT_XXMHCOMPILECOUNT_EQUALS, NULL)) >= 0) { UDATA mhCompileCount = 0;