about summary refs log tree commit diff
diff options
context:
space:
mode:
author Mike Pall <mike> 2022-12-07 18:38:22 +0100
committer Mike Pall <mike> 2022-12-07 18:38:22 +0100
commit de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b (patch)
tree c6dd3a9575b77c9f019c9d8627a814a955227acf
parent 7d5d4a1b1a690d9fc87253868ba967bf25f4df6e (diff)
download luajit-de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b.tar.gz
luajit-de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b.tar.bz2
luajit-de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b.zip
Disable FMA by default. Use -Ofma or jit.opt.start("+fma") to enable.
See the discussion in #918 for the rationale.
-rw-r--r-- doc/running.html 8
-rw-r--r-- src/lj_asm_arm.h 6
-rw-r--r-- src/lj_asm_arm64.h 3
-rw-r--r-- src/lj_asm_ppc.h 3
-rw-r--r-- src/lj_jit.h 4
-rw-r--r-- src/lj_vmmath.c 13
-rw-r--r-- src/vm_arm64.dasc 4
7 files changed, 35 insertions, 6 deletions
diff --git a/doc/running.html b/doc/running.html
index 9979d223..edc049fb 100644
--- a/doc/running.html
+++ b/doc/running.html
@@ -220,6 +220,12 @@ mix the three forms, but note that setting an optimization level
220overrides all earlier flags. 220overrides all earlier flags.
221</p> 221</p>
222<p> 222<p>
223Note that <tt>-Ofma</tt> is not enabled by default at any level,
224because it affects floating-point result accuracy. Only enable this,
225if you fully understand the trade-offs of FMA for performance (higher),
226determinism (lower) and numerical accuracy (higher).
227</p>
228<p>
223Here are the available flags and at what optimization levels they 229Here are the available flags and at what optimization levels they
224are enabled: 230are enabled:
225</p> 231</p>
@@ -251,6 +257,8 @@ are enabled:
251<td class="flag_name">sink</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Allocation/Store Sinking</td></tr> 257<td class="flag_name">sink</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Allocation/Store Sinking</td></tr>
252<tr class="even"> 258<tr class="even">
253<td class="flag_name">fuse</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Fusion of operands into instructions</td></tr> 259<td class="flag_name">fuse</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Fusion of operands into instructions</td></tr>
260<tr class="odd">
261<td class="flag_name">fma </td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_desc">Fused multiply-add</td></tr>
254</table> 262</table>
255<p> 263<p>
256Here are the parameters and their default settings: 264Here are the parameters and their default settings:
diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h
index 326330f4..ba6267ec 100644
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
@@ -313,7 +313,11 @@ static void asm_fusexref(ASMState *as, ARMIns ai, Reg rd, IRRef ref,
313} 313}
314 314
315#if !LJ_SOFTFP 315#if !LJ_SOFTFP
316/* Fuse to multiply-add/sub instruction. */ 316/*
317** Fuse to multiply-add/sub instruction.
318** VMLA rounds twice (UMA, not FMA) -- no need to check for JIT_F_OPT_FMA.
319** VFMA needs VFPv4, which is uncommon on the remaining ARM32 targets.
320*/
317static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air) 321static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air)
318{ 322{
319 IRRef lref = ir->op1, rref = ir->op2; 323 IRRef lref = ir->op1, rref = ir->op2;
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
index 4e34b3be..805ea54b 100644
--- a/src/lj_asm_arm64.h
+++ b/src/lj_asm_arm64.h
@@ -337,7 +337,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air)
337{ 337{
338 IRRef lref = ir->op1, rref = ir->op2; 338 IRRef lref = ir->op1, rref = ir->op2;
339 IRIns *irm; 339 IRIns *irm;
340 if (lref != rref && 340 if ((as->flags & JIT_F_OPT_FMA) &&
341 lref != rref &&
341 ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && 342 ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
342 ra_noreg(irm->r)) || 343 ra_noreg(irm->r)) ||
343 (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && 344 (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h
index 546b8e5d..aa818745 100644
--- a/src/lj_asm_ppc.h
+++ b/src/lj_asm_ppc.h
@@ -235,7 +235,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir)
235{ 235{
236 IRRef lref = ir->op1, rref = ir->op2; 236 IRRef lref = ir->op1, rref = ir->op2;
237 IRIns *irm; 237 IRIns *irm;
238 if (lref != rref && 238 if ((as->flags & JIT_F_OPT_FMA) &&
239 lref != rref &&
239 ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && 240 ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
240 ra_noreg(irm->r)) || 241 ra_noreg(irm->r)) ||
241 (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && 242 (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
diff --git a/src/lj_jit.h b/src/lj_jit.h
index 32b3861a..7f081730 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -87,10 +87,11 @@
87#define JIT_F_OPT_ABC (JIT_F_OPT << 7) 87#define JIT_F_OPT_ABC (JIT_F_OPT << 7)
88#define JIT_F_OPT_SINK (JIT_F_OPT << 8) 88#define JIT_F_OPT_SINK (JIT_F_OPT << 8)
89#define JIT_F_OPT_FUSE (JIT_F_OPT << 9) 89#define JIT_F_OPT_FUSE (JIT_F_OPT << 9)
90#define JIT_F_OPT_FMA (JIT_F_OPT << 10)
90 91
91/* Optimizations names for -O. Must match the order above. */ 92/* Optimizations names for -O. Must match the order above. */
92#define JIT_F_OPTSTRING \ 93#define JIT_F_OPTSTRING \
93 "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse" 94 "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse\3fma"
94 95
95/* Optimization levels set a fixed combination of flags. */ 96/* Optimization levels set a fixed combination of flags. */
96#define JIT_F_OPT_0 0 97#define JIT_F_OPT_0 0
@@ -99,6 +100,7 @@
99#define JIT_F_OPT_3 (JIT_F_OPT_2|\ 100#define JIT_F_OPT_3 (JIT_F_OPT_2|\
100 JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE) 101 JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE)
101#define JIT_F_OPT_DEFAULT JIT_F_OPT_3 102#define JIT_F_OPT_DEFAULT JIT_F_OPT_3
103/* Note: FMA is not set by default. */
102 104
103/* -- JIT engine parameters ----------------------------------------------- */ 105/* -- JIT engine parameters ----------------------------------------------- */
104 106
diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c
index b6cc60ba..d0febd81 100644
--- a/src/lj_vmmath.c
+++ b/src/lj_vmmath.c
@@ -36,6 +36,17 @@ LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
36 36
37/* -- Helper functions ---------------------------------------------------- */ 37/* -- Helper functions ---------------------------------------------------- */
38 38
39/* Required to prevent the C compiler from applying FMA optimizations.
40**
41** Yes, there's -ffp-contract and the FP_CONTRACT pragma ... in theory.
42** But the current state of C compilers is a mess in this regard.
43** Also, this function is not performance sensitive at all.
44*/
45LJ_NOINLINE static double lj_vm_floormul(double x, double y)
46{
47 return lj_vm_floor(x / y) * y;
48}
49
39double lj_vm_foldarith(double x, double y, int op) 50double lj_vm_foldarith(double x, double y, int op)
40{ 51{
41 switch (op) { 52 switch (op) {
@@ -43,7 +54,7 @@ double lj_vm_foldarith(double x, double y, int op)
43 case IR_SUB - IR_ADD: return x-y; break; 54 case IR_SUB - IR_ADD: return x-y; break;
44 case IR_MUL - IR_ADD: return x*y; break; 55 case IR_MUL - IR_ADD: return x*y; break;
45 case IR_DIV - IR_ADD: return x/y; break; 56 case IR_DIV - IR_ADD: return x/y; break;
46 case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break; 57 case IR_MOD - IR_ADD: return x-lj_vm_floormul(x, y); break;
47 case IR_POW - IR_ADD: return pow(x, y); break; 58 case IR_POW - IR_ADD: return pow(x, y); break;
48 case IR_NEG - IR_ADD: return -x; break; 59 case IR_NEG - IR_ADD: return -x; break;
49 case IR_ABS - IR_ADD: return fabs(x); break; 60 case IR_ABS - IR_ADD: return fabs(x); break;
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index 3448d0d2..36a036ae 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -2636,7 +2636,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
2636 |.macro ins_arithmod, res, reg1, reg2 2636 |.macro ins_arithmod, res, reg1, reg2
2637 | fdiv d2, reg1, reg2 2637 | fdiv d2, reg1, reg2
2638 | frintm d2, d2 2638 | frintm d2, d2
2639 | fmsub res, d2, reg2, reg1 2639 | // Cannot use fmsub, because FMA is not enabled by default.
2640 | fmul d2, d2, reg2
2641 | fsub res, reg1, d2
2640 |.endmacro 2642 |.endmacro
2641 | 2643 |
2642 |.macro ins_arithdn, intins, fpins 2644 |.macro ins_arithdn, intins, fpins