From e02109d8424071d8473095e34209345e9a8c338d Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Thu, 2 May 2019 14:52:54 -0700 Subject: [PATCH 01/11] Disable the generation of AVX-512 instructions on Windows --- tools/shared/x86.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tools/shared/x86.c b/tools/shared/x86.c index 89772598d46..692917fa53c 100644 --- a/tools/shared/x86.c +++ b/tools/shared/x86.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2006-2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,12 @@ X86TYPE mach, mach_count; +#ifdef TARGET_WIN +#define DONT_GENERATE_AVX512 true /* a temporary restriction */ +#else +#define DONT_GENERATE_AVX512 false +#endif + void set_mach(X86TYPE *mach, int machtype) { @@ -181,15 +187,19 @@ set_mach(X86TYPE *mach, int machtype) break; case TP_SKYLAKE: - mach->type[MACH_INTEL_SKYLAKE] = 1; - mach->feature[FEATURE_AVX512VL] = 1; + if (! DONT_GENERATE_AVX512) { + mach->type[MACH_INTEL_SKYLAKE] = 1; + mach->feature[FEATURE_AVX512VL] = 1; + } /* ...and fall through... */ case TP_KNIGHTS_LANDING: - if (machtype == TP_KNIGHTS_LANDING) { - mach->type[MACH_INTEL_KNIGHTS_LANDING] = 1; + if (! DONT_GENERATE_AVX512) { + if (machtype == TP_KNIGHTS_LANDING) { + mach->type[MACH_INTEL_KNIGHTS_LANDING] = 1; + } + mach->feature[FEATURE_AVX512F] = 1; } - mach->feature[FEATURE_AVX512F] = 1; /* ...and fall through... */ case TP_HASWELL: From 0c6f235108d14b671b8cd1cd06264c4d24f6aaa5 Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 08:23:41 -0700 Subject: [PATCH 02/11] Increase the number of entries in the assigned goto table The assigned goto table was set to the wrong size and writes to the table went to arbitrary locations in the heap. --- tools/flang2/flang2exe/upper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flang2/flang2exe/upper.cpp b/tools/flang2/flang2exe/upper.cpp index 7daf22e6402..c53f5d2c15a 100644 --- a/tools/flang2/flang2exe/upper.cpp +++ b/tools/flang2/flang2exe/upper.cpp @@ -2724,7 +2724,7 @@ read_symbol(void) NEW(agototab, int, agotosz); agotomax = 0; } - NEED(agoto, agototab, int, agotosz, agotosz + 32); + NEED(agoto, agototab, int, agotosz, agoto + 32); agototab[agoto - 1] = newsptr; if (agoto > agotomax) agotomax = agoto; From 3af93c6501287ee1540b32551f4cb2c6342728b5 Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 09:07:05 -0700 Subject: [PATCH 03/11] Fix internal compiler error with -mp The dtype is DT_NONE when computing from NME. Get the dtype from the symbol's ILMs instead. Use the helper function "get_dtype_from_ilm". --- tools/flang2/flang2exe/expatomics.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/flang2/flang2exe/expatomics.cpp b/tools/flang2/flang2exe/expatomics.cpp index b62a4c8a0ab..97c06ee3046 100644 --- a/tools/flang2/flang2exe/expatomics.cpp +++ b/tools/flang2/flang2exe/expatomics.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -3099,6 +3099,9 @@ exp_mp_atomic_capture(ILM *ilmp) cpt.rhs[cnt] = ILI_OF(ILM_OPND(ilmp, 2)); cpt.nme[cnt] = nme[LHS_IDX] = NME_OF(ILM_OPND(ilmp, 1)); cpt.dtype[cnt] = dt_nme(nme[LHS_IDX]); + if (!cpt.dtype[cnt]) { + cpt.dtype[cnt] = get_dtype_from_ilm(ilmp); + } cpt.mem_order[cnt] = ILM_OPND(ilmp, 3); /* Don't use CSE for LHS */ From f05a8b3627e76481a339719f415089606d8c2c28 Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 09:41:34 -0700 Subject: [PATCH 04/11] Reserve xflags for induction variable replacement limitations --- tools/flang2/docs/xflag.n | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/flang2/docs/xflag.n b/tools/flang2/docs/xflag.n index b87a54eacae..fef2f21c1db 100644 --- a/tools/flang2/docs/xflag.n +++ b/tools/flang2/docs/xflag.n @@ -5274,6 +5274,11 @@ Don't demote address KMUL operations .XB 0x200000: in LLVM output, don't output the instruction info (lilix index, opcode) .XB 0x400000: +.XB 0x800000: +If the number of ACIV induction variables is too large, kill off all +but the innermost loop ones. +.XB 0x1000000: +Only find ACIV induction variables for innermost loops. reserved .XF "202:" From 8c0aa6a234b4820b09b18c7cba0dc47cba1c4c8a Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 10:30:02 -0700 Subject: [PATCH 05/11] Generate mp-specific prologues for functions correctly --- tools/flang2/flang2exe/exp_rte.cpp | 2 +- tools/flang2/flang2exe/expsmp.cpp | 4 ++-- tools/flang2/flang2exe/expsmp.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/flang2/flang2exe/exp_rte.cpp b/tools/flang2/flang2exe/exp_rte.cpp index f9989c623f4..988b514b360 100644 --- a/tools/flang2/flang2exe/exp_rte.cpp +++ b/tools/flang2/flang2exe/exp_rte.cpp @@ -2285,7 +2285,7 @@ exp_end(ILM *ilmp, int curilm, bool is_func) } /* emit any mp initialization for the function & its entries */ - exp_mp_func_prologue(); + exp_mp_func_prologue(true); if (!XBIT(121, 0x01) || /* -Mnoframe isn't specified */ (flg.debug && !XBIT(123, 0x400)) || /* -debug is set */ diff --git a/tools/flang2/flang2exe/expsmp.cpp b/tools/flang2/flang2exe/expsmp.cpp index f2aeabeb067..e857dc59cb7 100644 --- a/tools/flang2/flang2exe/expsmp.cpp +++ b/tools/flang2/flang2exe/expsmp.cpp @@ -2966,7 +2966,7 @@ clear_tplnk(void) /** \brief Generate any mp-specific prologue for a function. */ void -exp_mp_func_prologue(void) +exp_mp_func_prologue(bool process_tp) { SPTR sym; int ili, tmpthread; @@ -2980,7 +2980,7 @@ exp_mp_func_prologue(void) if (CUDAG(GBL_CURRFUNC) == CUDA_GLOBAL || CUDAG(GBL_CURRFUNC) == CUDA_DEVICE) return; #endif - if (1) { + if (process_tp) { for (sym = gbl.threadprivate; sym > NOSYM; sym = TPLNKG(sym)) { /* For each threadprivate common, must 'declare' the threads' * copies by calling: diff --git a/tools/flang2/flang2exe/expsmp.h b/tools/flang2/flang2exe/expsmp.h index e0827d69565..7d1e50db610 100644 --- a/tools/flang2/flang2exe/expsmp.h +++ b/tools/flang2/flang2exe/expsmp.h @@ -99,7 +99,7 @@ void clear_tplnk(void); /** \brief ... */ -void exp_mp_func_prologue(void); +void exp_mp_func_prologue(bool); /** \brief ... From aa7993a0b1c2cddcc11c56b10d570c5e97848d1c Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 10:50:08 -0700 Subject: [PATCH 06/11] Another fix for empty structure constructors Fixes another test case in GitHub #239. --- tools/flang1/flang1exe/dinit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/flang1/flang1exe/dinit.c b/tools/flang1/flang1exe/dinit.c index 95f1039c2aa..d3e4f62b42e 100644 --- a/tools/flang1/flang1exe/dinit.c +++ b/tools/flang1/flang1exe/dinit.c @@ -239,7 +239,7 @@ dinit_data(VAR *ivl, ACL *ict, int dtype) assert(dtype, "dinit_data: no object to initialize", 0, 2); member = DTY(dtype + 1); /* for type extension */ - if (PARENTG(DTY(dtype + 3))) { + if (PARENTG(DTY(dtype + 3)) && get_seen_contains()) { member = SYMLKG(member); } } From 23c76c57ca4c89577ee456858df3e99ffd8636da Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 11:13:47 -0700 Subject: [PATCH 07/11] Fix issue identified in large application Resolves issue with treating an array section as a simplewholearray. --- tools/flang1/flang1exe/rest.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/flang1/flang1exe/rest.c b/tools/flang1/flang1exe/rest.c index ec702298b64..7e78eff534b 100644 --- a/tools/flang1/flang1exe/rest.c +++ b/tools/flang1/flang1exe/rest.c @@ -2530,8 +2530,11 @@ handle_seq_section(int entry, int arr, int loc, int std, int *retval, } else { /* right now, no members can be distributed anyway */ arrayalign = ALIGNG(arraysptr); - if (POINTERG(arraysptr) && !arrayalign) - is_seq_pointer = TRUE; + if (POINTERG(arraysptr)) { + is_pointer = TRUE; + if (!arrayalign) + is_seq_pointer = TRUE; + } } break; default: From 04644d8c0c71804ae453d51817225ae26d7eae1a Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 11:37:47 -0700 Subject: [PATCH 08/11] Define a new ILI opcode 'IL_PKIMVX' This could be used to implement the vectorization of IL_KIMV ILIs, i.e., signed and unsigned integer*8 to signed or unsigned integer*4 type conversions with truncation. --- tools/flang2/utils/ilitp/x86_64/ilitp.n | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/flang2/utils/ilitp/x86_64/ilitp.n b/tools/flang2/utils/ilitp/x86_64/ilitp.n index 5b9e6ebd90c..5f5515057ff 100644 --- a/tools/flang2/utils/ilitp/x86_64/ilitp.n +++ b/tools/flang2/utils/ilitp/x86_64/ilitp.n @@ -1126,15 +1126,27 @@ respectively. .IL KIMV krlnk -Move an Integer64 register to a integer register with truncation. -Used also for K to UI, UK to I, and UK to UI conversions. -Bug fix - don't treat this as a move for purposes of register allocation. +Signed or unsigned integer*8 to signed or unsigned integer*4 type +conversion with truncation. Note, we mustn't treat this as a move for +the purposes of register allocation. .AT move null ir cse .CG "mov" 'l' .SI direct lat(1) .SI ld direct lat(3) .SI st direct lat(3) +.IL PKIMVX xmm xmm +Truncate 'N' packed signed or unsigned integer*8 values from 'xmm1' +(an xmm, ymm or zmm register) into 'N' packed signed or unsigned +integer*4 values in 'xmm2' (an xmm or ymm register), where 'N' is 2, 4 +or 8 if 'xmm1' is an xmm, ymm or zmm register respectively. This +instruction requires support for AVX-512F and AVX-512VL, and the +source operand must be a register (unusually for AVX instructions the +destination can be a memory operand!). +.AT other null trm +.CG terminal "vpmovqd" avx_only avx_special + + .IL IAMV irlnk Move an integer reg into an address reg. Sign extension required? From 5c8d7a9d33db6d0c627c01bd6c3ea541b905ded8 Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 12:50:18 -0700 Subject: [PATCH 09/11] Add error messages, return value for clause_errchk for future use --- include/flang/Error/errmsg-accel.n | 37 ++++++++++++++++++++++++++++++ include/flang/Error/errmsg-in.n | 3 +++ tools/flang1/flang1exe/semsmp.c | 13 +++++++---- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/include/flang/Error/errmsg-accel.n b/include/flang/Error/errmsg-accel.n index e80467d8c0e..45df545b420 100644 --- a/include/flang/Error/errmsg-accel.n +++ b/include/flang/Error/errmsg-accel.n @@ -14,11 +14,42 @@ .\" * limitations under the License. .\" * .\" */ +.MS W 972 "The directive #pragma acc mirror is deprecated; use #pragma acc declare create instead" +.MS W 973 "The directive #pragma acc reflected is deprecated; use #pragma acc declare present" +.MS W 974 "The directive #pragma acc region is deprecated; use #pragma acc kernels instead" +.MS W 975 "The directive #pragma acc data region is deprecated; use #pragma acc data instead" +.MS W 976 "The directive #pragma acc for is deprecated; use #pragma acc loop instead" +.MS W 977 "The directive #pragma acc for kernel is deprecated; use #pragma acc loop instead" +.MS W 978 "The clause parallel is deprecated; use clause gang instead" +.MS W 979 "The directive #pragma acc region for is deprecated; use #pragma acc kernels loop instead" +.MS W 980 "The directive #pragma acc region loop is deprecated; use #pragma acc kernels loop instead" +.MS W 981 "The directive #pragma acc kernels for is deprecated; use #pragma acc kernels loop instead" +.MS W 982 "The directive #pragma acc deviceptr is deprecated; use #pragma acc declare deviceptr instead" +.MS W 983 "The directive #pragma acc parallel for is deprecated; use #pragma acc parallel loop instead" +.MS W 984 "The directive #pragma acc scalar region is deprecated; use #pragma acc serial instead" +.MS W 985 "The clause local is deprecated; use clause create instead" +.MS W 986 "The clause cache is deprecated; use directive #pragma acc cache instead" +.MS W 987 "The clause update host is deprecated; use separate update host directive after the region instead" +.MS W 988 "The clause update device is deprecated; use separate update device directive before the region instead" +.MS W 989 "The clause update in is deprecated; use separate update device directeve before the region instead" +.MS W 990 "The clause update out is deprecated; use update self instead" +.MS W 991 "The clause pnot is deprecated; use no_create instead" +.MS W 992 "The clause updatein is deprecated; use update device instead" +.MS W 993 "The clause updateout is deprecated; use update self instead" +.MS W 994 "The directive #pragma acc copy is deprecated; use #pragma acc declare copy instead" +.MS W 995 "The directive #pragma acc copyin is deprecated; use #pragma acc declare copyin instead" +.MS W 996 "The directive #pragma acc copyout is deprecated; use #pragma acc declare copyout instead" +.MS W 997 "The directive #pragma acc device_resident is deprecated; use #pragma acc declare device_resident instead" +.MS W 998 "The directive #pragma acc for host is deprecated; no OpenACC equivalent" +.MS W 999 "The directive #pragma acc loop kernel is deprecated; no OpenACC equivalent" .MS S 1000 "Call in OpenACC region to procedure '$' which has no acc routine information" .MS S 1001 "All selected compute capabilities were disabled (see -Minfo)" .MS S 1002 "Reduction type not supported for this variable datatype - $" .MS W 1003 "Lambda capture by reference not supported in Accellerated region" .MS W 1004 "Lambda capture 'this' by reference not supported in Accellerated region" +.MS W 1005 "The clause unroll is deprecated; no OpenACC equivalent" +.MS W 1006 "The clause mirror is deprecated; no OpenACC equivalent" +.MS W 1007 "The clause host is deprecated; no OpenACC equivalent" .\" Skipping ahead to 1011 to keep the same numbers as the fortran front-end. .MS S 1011 "Device variable cannot be THREADPRIVATE - $" .MS S 1012 "Threadprivate variables are not supported in acc routine - $" @@ -42,8 +73,14 @@ .MS F 1028 "The variable $ doesn't have predefined policy $ available" .MS F 1029 "The variable $ using policy $ is not a structure-based type" .MS F 1030 "Policy motion $ is not allowed in $ directive" +.MS W 1031 "The directive #pragma acc create is deprecated; use #pragma acc declare create instead" +.MS W 1032 "The directive #pragma acc present is deprecated; use #pragma acc declare present instead" +.MS W 1033 "The directive #pragma acc link is deprecated; use #pragma acc declare link instead" .MS F 1034 "Only signed/unsigned 32 bits and 64 bits integer variables are allowed in bound expression. $ is is not such variable" .MS F 1035 "Only integer sibling members and global variables are allowed in bound expression. $ is is neither of them." .MS F 1036 "No global variable named $ has been defined" .MS F 1037 "Default clause can only contain include and exclude keyword." .MS W 1038 "COPY clause on a dummy variable with INTENT(IN) attribute demoted to COPYIN - $" +.MS S 1039 "OpenACC data clause expected after $" +.MS S 1040 "OpenACC $ data clause may not follow a device_type clause." +.MS S 1041 "OpenACC $ clause may not follow a device_type clause." diff --git a/include/flang/Error/errmsg-in.n b/include/flang/Error/errmsg-in.n index 4ba298964d4..64dabdb774c 100644 --- a/include/flang/Error/errmsg-in.n +++ b/include/flang/Error/errmsg-in.n @@ -1523,3 +1523,6 @@ A DO CONCURRENT variable with LOCAL_INIT locality must have a host variable of t .MS S 1210 "Parent module $ must declare a separate module procedure." .MS S 1211 "Submodule's ancestor module $ must be a nonintrinsic module." .MS S 1212 "$ was previously declared to be a module procedure." +.MS S 1213 "OpenACC $ data clause may not follow a device_type clause." +.MS S 1214 "PGI Accelerator $ data clause may not follow a device_type clause." +.MS S 1215 "OpenACC data clause expected after $." diff --git a/tools/flang1/flang1exe/semsmp.c b/tools/flang1/flang1exe/semsmp.c index f2652575130..7f76f81dd3c 100644 --- a/tools/flang1/flang1exe/semsmp.c +++ b/tools/flang1/flang1exe/semsmp.c @@ -42,7 +42,7 @@ /* contents of this file: */ static void add_clause(int, LOGICAL); -static void clause_errchk(BIGINT64, char *); +static bool clause_errchk(BIGINT64, char *); static void accel_sched_errchk(); static void accel_nosched_errchk(); static void accel_pragmagen(int, int, int); @@ -5807,14 +5807,19 @@ add_clause(int clause, LOGICAL one_only) CL_PRESENT(clause) = 1; } -static void +static bool clause_errchk(BIGINT64 bt, char *dirname) { int i; + bool any = false; for (i = 0; i < CL_MAXV; i++) - if (CL_PRESENT(i) && !(CL_STMT(i) & bt)) - error(533, 3, gbl.lineno, CL_NAME(i), dirname); + if (CL_PRESENT(i)) { + any = true; + if (!(CL_STMT(i) & bt)) + error(533, 3, gbl.lineno, CL_NAME(i), dirname); + } + return any; } static void From 01591d0382898521ea9e1f8d146395739b4b58a2 Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 13:33:19 -0700 Subject: [PATCH 10/11] Remove reference to PGI in PCAST --- tools/shared/pgifeat.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/shared/pgifeat.h b/tools/shared/pgifeat.h index 3c7038f599e..be9794acd99 100644 --- a/tools/shared/pgifeat.h +++ b/tools/shared/pgifeat.h @@ -87,6 +87,7 @@ #define ACCVERYSTRICT XBIT(186,0x200000) /* - * PGI Compiler Assisted Software Testing (PCAST) + * Compiler Assisted Software Testing (PCAST) */ + #define XBIT_PCAST XBIT(215, 0x8) From f718fa7c72871bfa0d9cedc4d8945b218b526cc1 Mon Sep 17 00:00:00 2001 From: Gary Klimowicz Date: Fri, 3 May 2019 14:00:16 -0700 Subject: [PATCH 11/11] Update the description for two vectorization flags --- tools/flang2/docs/xflag.n | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/flang2/docs/xflag.n b/tools/flang2/docs/xflag.n index fef2f21c1db..773eb541f1c 100644 --- a/tools/flang2/docs/xflag.n +++ b/tools/flang2/docs/xflag.n @@ -1317,12 +1317,12 @@ definition on only one side of the conditional. LLVM - disable extended conditional vectorization in all loops where the predicate size is different than the computational size. .XB 0x20: -LLVM - don't allow vectorization if multiple lhs data type sizes exist +LLVM - allow vectorization if multiple lhs data type sizes exist within the inner loop .XB 0x40: LLVM - don't allow vectorization of DBLE ili .XB 0x80: -LLVM - don't allow vectorization of DFLOAT ili +LLVM - don't allow vectorization of DFLOAT/IKMV ili .XF "37": .XB 0x01: