diff options
author | Mike Pall <mike> | 2022-12-07 18:38:22 +0100 |
---|---|---|
committer | Mike Pall <mike> | 2022-12-07 18:38:22 +0100 |
commit | de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b (patch) | |
tree | c6dd3a9575b77c9f019c9d8627a814a955227acf | |
parent | 7d5d4a1b1a690d9fc87253868ba967bf25f4df6e (diff) | |
download | luajit-de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b.tar.gz luajit-de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b.tar.bz2 luajit-de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b.zip |
Disable FMA by default. Use -Ofma or jit.opt.start("+fma") to enable.
See the discussion in #918 for the rationale.
-rw-r--r-- | doc/running.html | 8 | ||||
-rw-r--r-- | src/lj_asm_arm.h | 6 | ||||
-rw-r--r-- | src/lj_asm_arm64.h | 3 | ||||
-rw-r--r-- | src/lj_asm_ppc.h | 3 | ||||
-rw-r--r-- | src/lj_jit.h | 4 | ||||
-rw-r--r-- | src/lj_vmmath.c | 13 | ||||
-rw-r--r-- | src/vm_arm64.dasc | 4 |
7 files changed, 35 insertions, 6 deletions
diff --git a/doc/running.html b/doc/running.html index 9979d223..edc049fb 100644 --- a/doc/running.html +++ b/doc/running.html | |||
@@ -220,6 +220,12 @@ mix the three forms, but note that setting an optimization level | |||
220 | overrides all earlier flags. | 220 | overrides all earlier flags. |
221 | </p> | 221 | </p> |
222 | <p> | 222 | <p> |
223 | Note that <tt>-Ofma</tt> is not enabled by default at any level, | ||
224 | because it affects floating-point result accuracy. Only enable this, | ||
225 | if you fully understand the trade-offs of FMA for performance (higher), | ||
226 | determinism (lower) and numerical accuracy (higher). | ||
227 | </p> | ||
228 | <p> | ||
223 | Here are the available flags and at what optimization levels they | 229 | Here are the available flags and at what optimization levels they |
224 | are enabled: | 230 | are enabled: |
225 | </p> | 231 | </p> |
@@ -251,6 +257,8 @@ are enabled: | |||
251 | <td class="flag_name">sink</td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level">•</td><td class="flag_desc">Allocation/Store Sinking</td></tr> | 257 | <td class="flag_name">sink</td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level">•</td><td class="flag_desc">Allocation/Store Sinking</td></tr> |
252 | <tr class="even"> | 258 | <tr class="even"> |
253 | <td class="flag_name">fuse</td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level">•</td><td class="flag_desc">Fusion of operands into instructions</td></tr> | 259 | <td class="flag_name">fuse</td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level">•</td><td class="flag_desc">Fusion of operands into instructions</td></tr> |
260 | <tr class="odd"> | ||
261 | <td class="flag_name">fma </td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_level"> </td><td class="flag_desc">Fused multiply-add</td></tr> | ||
254 | </table> | 262 | </table> |
255 | <p> | 263 | <p> |
256 | Here are the parameters and their default settings: | 264 | Here are the parameters and their default settings: |
diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index 326330f4..ba6267ec 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h | |||
@@ -313,7 +313,11 @@ static void asm_fusexref(ASMState *as, ARMIns ai, Reg rd, IRRef ref, | |||
313 | } | 313 | } |
314 | 314 | ||
315 | #if !LJ_SOFTFP | 315 | #if !LJ_SOFTFP |
316 | /* Fuse to multiply-add/sub instruction. */ | 316 | /* |
317 | ** Fuse to multiply-add/sub instruction. | ||
318 | ** VMLA rounds twice (UMA, not FMA) -- no need to check for JIT_F_OPT_FMA. | ||
319 | ** VFMA needs VFPv4, which is uncommon on the remaining ARM32 targets. | ||
320 | */ | ||
317 | static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air) | 321 | static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air) |
318 | { | 322 | { |
319 | IRRef lref = ir->op1, rref = ir->op2; | 323 | IRRef lref = ir->op1, rref = ir->op2; |
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 4e34b3be..805ea54b 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h | |||
@@ -337,7 +337,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air) | |||
337 | { | 337 | { |
338 | IRRef lref = ir->op1, rref = ir->op2; | 338 | IRRef lref = ir->op1, rref = ir->op2; |
339 | IRIns *irm; | 339 | IRIns *irm; |
340 | if (lref != rref && | 340 | if ((as->flags & JIT_F_OPT_FMA) && |
341 | lref != rref && | ||
341 | ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && | 342 | ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && |
342 | ra_noreg(irm->r)) || | 343 | ra_noreg(irm->r)) || |
343 | (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && | 344 | (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && |
diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h index 546b8e5d..aa818745 100644 --- a/src/lj_asm_ppc.h +++ b/src/lj_asm_ppc.h | |||
@@ -235,7 +235,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir) | |||
235 | { | 235 | { |
236 | IRRef lref = ir->op1, rref = ir->op2; | 236 | IRRef lref = ir->op1, rref = ir->op2; |
237 | IRIns *irm; | 237 | IRIns *irm; |
238 | if (lref != rref && | 238 | if ((as->flags & JIT_F_OPT_FMA) && |
239 | lref != rref && | ||
239 | ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && | 240 | ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && |
240 | ra_noreg(irm->r)) || | 241 | ra_noreg(irm->r)) || |
241 | (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && | 242 | (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && |
diff --git a/src/lj_jit.h b/src/lj_jit.h index 32b3861a..7f081730 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h | |||
@@ -87,10 +87,11 @@ | |||
87 | #define JIT_F_OPT_ABC (JIT_F_OPT << 7) | 87 | #define JIT_F_OPT_ABC (JIT_F_OPT << 7) |
88 | #define JIT_F_OPT_SINK (JIT_F_OPT << 8) | 88 | #define JIT_F_OPT_SINK (JIT_F_OPT << 8) |
89 | #define JIT_F_OPT_FUSE (JIT_F_OPT << 9) | 89 | #define JIT_F_OPT_FUSE (JIT_F_OPT << 9) |
90 | #define JIT_F_OPT_FMA (JIT_F_OPT << 10) | ||
90 | 91 | ||
91 | /* Optimizations names for -O. Must match the order above. */ | 92 | /* Optimizations names for -O. Must match the order above. */ |
92 | #define JIT_F_OPTSTRING \ | 93 | #define JIT_F_OPTSTRING \ |
93 | "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse" | 94 | "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse\3fma" |
94 | 95 | ||
95 | /* Optimization levels set a fixed combination of flags. */ | 96 | /* Optimization levels set a fixed combination of flags. */ |
96 | #define JIT_F_OPT_0 0 | 97 | #define JIT_F_OPT_0 0 |
@@ -99,6 +100,7 @@ | |||
99 | #define JIT_F_OPT_3 (JIT_F_OPT_2|\ | 100 | #define JIT_F_OPT_3 (JIT_F_OPT_2|\ |
100 | JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE) | 101 | JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE) |
101 | #define JIT_F_OPT_DEFAULT JIT_F_OPT_3 | 102 | #define JIT_F_OPT_DEFAULT JIT_F_OPT_3 |
103 | /* Note: FMA is not set by default. */ | ||
102 | 104 | ||
103 | /* -- JIT engine parameters ----------------------------------------------- */ | 105 | /* -- JIT engine parameters ----------------------------------------------- */ |
104 | 106 | ||
diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c index b6cc60ba..d0febd81 100644 --- a/src/lj_vmmath.c +++ b/src/lj_vmmath.c | |||
@@ -36,6 +36,17 @@ LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); } | |||
36 | 36 | ||
37 | /* -- Helper functions ---------------------------------------------------- */ | 37 | /* -- Helper functions ---------------------------------------------------- */ |
38 | 38 | ||
39 | /* Required to prevent the C compiler from applying FMA optimizations. | ||
40 | ** | ||
41 | ** Yes, there's -ffp-contract and the FP_CONTRACT pragma ... in theory. | ||
42 | ** But the current state of C compilers is a mess in this regard. | ||
43 | ** Also, this function is not performance sensitive at all. | ||
44 | */ | ||
45 | LJ_NOINLINE static double lj_vm_floormul(double x, double y) | ||
46 | { | ||
47 | return lj_vm_floor(x / y) * y; | ||
48 | } | ||
49 | |||
39 | double lj_vm_foldarith(double x, double y, int op) | 50 | double lj_vm_foldarith(double x, double y, int op) |
40 | { | 51 | { |
41 | switch (op) { | 52 | switch (op) { |
@@ -43,7 +54,7 @@ double lj_vm_foldarith(double x, double y, int op) | |||
43 | case IR_SUB - IR_ADD: return x-y; break; | 54 | case IR_SUB - IR_ADD: return x-y; break; |
44 | case IR_MUL - IR_ADD: return x*y; break; | 55 | case IR_MUL - IR_ADD: return x*y; break; |
45 | case IR_DIV - IR_ADD: return x/y; break; | 56 | case IR_DIV - IR_ADD: return x/y; break; |
46 | case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break; | 57 | case IR_MOD - IR_ADD: return x-lj_vm_floormul(x, y); break; |
47 | case IR_POW - IR_ADD: return pow(x, y); break; | 58 | case IR_POW - IR_ADD: return pow(x, y); break; |
48 | case IR_NEG - IR_ADD: return -x; break; | 59 | case IR_NEG - IR_ADD: return -x; break; |
49 | case IR_ABS - IR_ADD: return fabs(x); break; | 60 | case IR_ABS - IR_ADD: return fabs(x); break; |
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 3448d0d2..36a036ae 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc | |||
@@ -2636,7 +2636,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) | |||
2636 | |.macro ins_arithmod, res, reg1, reg2 | 2636 | |.macro ins_arithmod, res, reg1, reg2 |
2637 | | fdiv d2, reg1, reg2 | 2637 | | fdiv d2, reg1, reg2 |
2638 | | frintm d2, d2 | 2638 | | frintm d2, d2 |
2639 | | fmsub res, d2, reg2, reg1 | 2639 | | // Cannot use fmsub, because FMA is not enabled by default. |
2640 | | fmul d2, d2, reg2 | ||
2641 | | fsub res, reg1, d2 | ||
2640 | |.endmacro | 2642 | |.endmacro |
2641 | | | 2643 | | |
2642 | |.macro ins_arithdn, intins, fpins | 2644 | |.macro ins_arithdn, intins, fpins |