From a7a1d4a241492e81f659a920f7379c193593ebc6 Mon Sep 17 00:00:00 2001
From: Igor Pavlov <87184205+ip7z@users.noreply.github.com>
Date: Wed, 19 Jun 2024 00:00:00 +0000
Subject: 24.07

---
 Asm/x86/LzmaDecOpt.asm | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'Asm/x86/LzmaDecOpt.asm')

diff --git a/Asm/x86/LzmaDecOpt.asm b/Asm/x86/LzmaDecOpt.asm
index f2818e7..7c568df 100644
--- a/Asm/x86/LzmaDecOpt.asm
+++ b/Asm/x86/LzmaDecOpt.asm
@@ -1,5 +1,5 @@
 ; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
-; 2021-02-23: Igor Pavlov : Public domain
+; 2024-06-18: Igor Pavlov : Public domain
 ;
 ; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
 ; function for check at link time.
@@ -17,11 +17,41 @@ include 7zAsm.asm
 
 MY_ASM_START
 
-_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
+; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is     defined, we use additional SEGMENT with 64-byte alignment.
+; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is not defined, we use default SEGMENT (where default 16-byte alignment of segment is expected).
+; The performance is almost identical in our tests.
+; But the performance can depend from position of lzmadec code inside instruction cache
+; or micro-op cache line (depending from low address bits in 32-byte/64-byte cache lines).
+; And 64-byte alignment provides a more consistent speed regardless
+; of the code's position in the executable.
+; But also it's possible that code without Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT can be
+; slightly faster than 64-bytes aligned code in some cases, if offset of lzmadec
+; code in 64-byte block after compilation provides better speed by some reason.
+; Note that Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT adds an extra section to the ELF file.
+; If you don't want to get that extra section, do not define Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT.
+
+ifndef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
+if (IS_LINUX gt 0)
+  Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
+else
+  Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
+endif
+endif
 
+ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
+_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
 MY_ALIGN macro num:req
         align  num
+        ; align  16
 endm
+else
+MY_ALIGN macro num:req
+        ; We expect that ".text" is aligned for 16-bytes.
+        ; So we don't need large alignment inside out function.
+        align  16
+endm
+endif
+
 
 MY_ALIGN_16 macro
         MY_ALIGN 16
@@ -610,7 +640,11 @@ PARAM_lzma      equ REG_ABI_PARAM_0
 PARAM_limit     equ REG_ABI_PARAM_1
 PARAM_bufLimit  equ REG_ABI_PARAM_2
 
+ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
 ; MY_ALIGN_64
+else
+  MY_ALIGN_16
+endif
 MY_PROC LzmaDec_DecodeReal_3, 3
 MY_PUSH_PRESERVED_ABI_REGS
 
@@ -1298,6 +1332,8 @@ fin:
 MY_POP_PRESERVED_ABI_REGS
 MY_ENDP
 
+ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
 _TEXT$LZMADECOPT ENDS
+endif
 
 end
-- 
cgit v1.2.3-55-g6feb