From 5b39dc76f1bc82f941d5c800ab9f34407a06b53a Mon Sep 17 00:00:00 2001 From: Igor Pavlov <87184205+ip7z@users.noreply.github.com> Date: Wed, 21 Jun 2023 00:00:00 +0000 Subject: 23.01 --- C/Bcj2Enc.c | 559 ++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 377 insertions(+), 182 deletions(-) (limited to 'C/Bcj2Enc.c') diff --git a/C/Bcj2Enc.c b/C/Bcj2Enc.c index 682362a..79460bb 100644 --- a/C/Bcj2Enc.c +++ b/C/Bcj2Enc.c @@ -1,60 +1,62 @@ -/* Bcj2Enc.c -- BCJ2 Encoder (Converter for x86 code) -2021-02-09 : Igor Pavlov : Public domain */ +/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2) +2023-04-02 : Igor Pavlov : Public domain */ #include "Precomp.h" /* #define SHOW_STAT */ - #ifdef SHOW_STAT #include -#define PRF(x) x +#define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src)); #else -#define PRF(x) +#define PRF2(s) #endif -#include - #include "Bcj2.h" #include "CpuArch.h" -#define CProb UInt16 - #define kTopValue ((UInt32)1 << 24) -#define kNumModelBits 11 -#define kBitModelTotal (1 << kNumModelBits) +#define kNumBitModelTotalBits 11 +#define kBitModelTotal (1 << kNumBitModelTotalBits) #define kNumMoveBits 5 void Bcj2Enc_Init(CBcj2Enc *p) { unsigned i; - - p->state = BCJ2_ENC_STATE_OK; + p->state = BCJ2_ENC_STATE_ORIG; p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE; - - p->prevByte = 0; - + p->context = 0; + p->flushRem = 5; + p->isFlushState = 0; p->cache = 0; - p->range = 0xFFFFFFFF; + p->range = 0xffffffff; p->low = 0; p->cacheSize = 1; - - p->ip = 0; - - p->fileIp = 0; - p->fileSize = 0; - p->relatLimit = BCJ2_RELAT_LIMIT; - + p->ip64 = 0; + p->fileIp64 = 0; + p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED; + p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT; + // p->relatExcludeBits = 0; p->tempPos = 0; - - p->flushPos = 0; - for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++) p->probs[i] = kBitModelTotal >> 1; } -static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p) +// Z7_NO_INLINE +Z7_FORCE_INLINE +static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p) { - if ((UInt32)p->low < (UInt32)0xFF000000 || (UInt32)(p->low >> 32) != 0) + const UInt32 low = (UInt32)p->low; + const unsigned high = (unsigned) + #if defined(Z7_MSC_VER_ORIGINAL) \ + && defined(MY_CPU_X86) \ + && defined(MY_CPU_LE) \ + && !defined(MY_CPU_64BIT) + // we try to rid of __aullshr() call in MSVS-x86 + (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only + #else + (p->low >> 32); + #endif + if (low < (UInt32)0xff000000 || high != 0) { Byte *buf = p->bufs[BCJ2_STREAM_RC]; do @@ -65,247 +67,440 @@ static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p) p->bufs[BCJ2_STREAM_RC] = buf; return True; } - *buf++ = (Byte)(p->cache + (Byte)(p->low >> 32)); - p->cache = 0xFF; + *buf++ = (Byte)(p->cache + high); + p->cache = 0xff; } while (--p->cacheSize); p->bufs[BCJ2_STREAM_RC] = buf; - p->cache = (Byte)((UInt32)p->low >> 24); + p->cache = (Byte)(low >> 24); } p->cacheSize++; - p->low = (UInt32)p->low << 8; + p->low = low << 8; return False; } -static void Bcj2Enc_Encode_2(CBcj2Enc *p) -{ - if (BCJ2_IS_32BIT_STREAM(p->state)) + +/* +We can use 2 alternative versions of code: +1) non-marker version: + Byte CBcj2Enc::context + Byte temp[8]; + Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer. + Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction + with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call. + +2) marker version: + UInt32 CBcj2Enc::context + Byte CBcj2Enc::temp[4]; + MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains finded marker. + it's allowed that + one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest, + and another call of Bcj2Enc_Encode_2() does offset conversion. + So different values of (fileIp) and (fileSize) are possible + in these different Bcj2Enc_Encode_2() calls. + +Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop. +So we use non-marker version. +*/ + +/* + Corner cases with overlap in multi-block. + before v23: there was one corner case, where converted instruction + could start in one sub-stream and finish in next sub-stream. + If multi-block (solid) encoding is used, + and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream. + and (0f) is last byte of previous sub-stream + and (8x) is first byte of current sub-stream + then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder. + BCJ2 encoder can converts 32-bit offset for that (0f 8x) cortage, + if that offset meets limit requirements. + If encoder allows 32-bit offset conversion for such overlap case, + then the data in 3 uncompressed BCJ2 streams for some sub-stream + can depend from data of previous sub-stream. + That corner case is not big problem, and it's rare case. + Since v23.00 we do additional check to prevent conversions in such overlap cases. +*/ + +/* + Bcj2Enc_Encode_2() output variables at exit: { - Byte *cur = p->bufs[p->state]; - if (cur == p->lims[p->state]) - return; - SetBe32(cur, p->tempTarget); - p->bufs[p->state] = cur + 4; + if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG)) + { + it means that encoder needs more input data. + if (p->srcLim == p->src) at exit, then + { + (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) + all input data were read and processed, and we are ready for + new input data. + } + else + { + (p->srcLim != p->src) + (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) + The encoder have found e8/e9/0f_8x marker, + and p->src points to last byte of that marker, + Bcj2Enc_Encode_2() needs more input data to get totally + 5 bytes (last byte of marker and 32-bit branch offset) + as continuous array starting from p->src. + (p->srcLim - p->src < 5) requirement is met after exit. + So non-processed resedue from p->src to p->srcLim is always less than 5 bytes. + } + } } +*/ - p->state = BCJ2_ENC_STATE_ORIG; - - for (;;) +Z7_NO_INLINE +static void Bcj2Enc_Encode_2(CBcj2Enc *p) +{ + if (!p->isFlushState) { - if (p->range < kTopValue) + const Byte *src; + UInt32 v; { - if (RangeEnc_ShiftLow(p)) - return; - p->range <<= 8; + const unsigned state = p->state; + if (BCJ2_IS_32BIT_STREAM(state)) + { + Byte *cur = p->bufs[state]; + if (cur == p->lims[state]) + return; + SetBe32a(cur, p->tempTarget) + p->bufs[state] = cur + 4; + } } + p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit + src = p->src; + v = p->context; + + // #define WRITE_CONTEXT p->context = v; // for marker version + #define WRITE_CONTEXT p->context = (Byte)v; + #define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT + for (;;) { + // const Byte *src; + // UInt32 v; + CBcj2Enc_ip_unsigned ip; + if (p->range < kTopValue) + { + // to reduce register pressure and code size: we save and restore local variables. + WRITE_CONTEXT_AND_SRC + if (Bcj2_RangeEnc_ShiftLow(p)) + return; + p->range <<= 8; + src = p->src; + v = p->context; + } + // src = p->src; + // #define MARKER_FLAG ((UInt32)1 << 17) + // if ((v & MARKER_FLAG) == 0) // for marker version { - const Byte *src = p->src; const Byte *srcLim; - Byte *dest; - SizeT num = (SizeT)(p->srcLim - src); - - if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) + Byte *dest = p->bufs[BCJ2_STREAM_MAIN]; { - if (num <= 4) - return; - num -= 4; + const SizeT remSrc = (SizeT)(p->srcLim - src); + SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest); + if (rem >= remSrc) + rem = remSrc; + srcLim = src + rem; } - else if (num == 0) - break; - - dest = p->bufs[BCJ2_STREAM_MAIN]; - if (num > (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest)) + /* p->context contains context of previous byte: + bits [0 : 7] : src[-1], if (src) was changed in this call + bits [8 : 31] : are undefined for non-marker version + */ + // v = p->context; + #define NUM_SHIFT_BITS 24 + #define CONV_FLAG ((UInt32)1 << 16) + #define ONE_ITER { \ + b = src[0]; \ + *dest++ = (Byte)b; \ + v = (v << NUM_SHIFT_BITS) | b; \ + if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \ + if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \ + ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \ + src++; if (src == srcLim) { break; } } + + if (src != srcLim) + for (;;) { - num = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest); - if (num == 0) - { - p->state = BCJ2_STREAM_MAIN; - return; - } + /* clang can generate ineffective code with setne instead of two jcc instructions. + we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. */ + unsigned b; + ONE_ITER + ONE_ITER } - - srcLim = src + num; + + ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]); + p->bufs[BCJ2_STREAM_MAIN] = dest; + p->ip64 = ip; - if (p->prevByte == 0x0F && (src[0] & 0xF0) == 0x80) - *dest = src[0]; - else for (;;) + if (src == srcLim) { - Byte b = *src; - *dest = b; - if (b != 0x0F) + WRITE_CONTEXT_AND_SRC + if (src != p->srcLim) { - if ((b & 0xFE) == 0xE8) - break; - dest++; - if (++src != srcLim) - continue; - break; + p->state = BCJ2_STREAM_MAIN; + return; } - dest++; - if (++src == srcLim) - break; - if ((*src & 0xF0) != 0x80) - continue; - *dest = *src; + /* (p->src == p->srcLim) + (p->state == BCJ2_ENC_STATE_ORIG) */ + if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) + return; + /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */ + // (p->flushRem == 5); + p->isFlushState = 1; break; } - - num = (SizeT)(src - p->src); - - if (src == srcLim) - { - p->prevByte = src[-1]; - p->bufs[BCJ2_STREAM_MAIN] = dest; - p->src = src; - p->ip += (UInt32)num; - continue; - } - + src++; + // p->src = src; + } + // ip = p->ip; // for marker version + /* marker was found */ + /* (v) contains marker that was found: + bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7] + : value of src[-2] : xx/xx/0f + bits [0 : 7] : value of src[-1] : e8/e9/8x + */ + { { - Byte context = (Byte)(num == 0 ? p->prevByte : src[-1]); - BoolInt needConvert; - - p->bufs[BCJ2_STREAM_MAIN] = dest + 1; - p->ip += (UInt32)num + 1; - src++; - - needConvert = False; - + #if NUM_SHIFT_BITS != 24 + v &= ~(UInt32)CONV_FLAG; + #endif + // UInt32 relat = 0; if ((SizeT)(p->srcLim - src) >= 4) { - UInt32 relatVal = GetUi32(src); - if ((p->fileSize == 0 || (UInt32)(p->ip + 4 + relatVal - p->fileIp) < p->fileSize) - && ((relatVal + p->relatLimit) >> 1) < p->relatLimit) - needConvert = True; + /* + if (relat != 0 || (Byte)v != 0xe8) + BoolInt isBigOffset = True; + */ + const UInt32 relat = GetUi32(src); + /* + #define EXCLUDE_FLAG ((UInt32)1 << 4) + #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0) + if (p->relatExcludeBits != 0) + { + const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1); + isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0); + } + // isBigOffset = False; // for debug + */ + ip -= p->fileIp64; + // Use the following if check, if (ip) is 64-bit: + if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9) + if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1) + if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit) + v |= CONV_FLAG; } - + else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) { - UInt32 bound; - unsigned ttt; - Byte b = src[-1]; - CProb *prob = p->probs + (unsigned)(b == 0xE8 ? 2 + (unsigned)context : (b == 0xE9 ? 1 : 0)); - - ttt = *prob; - bound = (p->range >> kNumModelBits) * ttt; - - if (!needConvert) + // (p->srcLim - src < 4) + // /* + // for non-marker version + p->ip64--; // p->ip = ip - 1; + p->bufs[BCJ2_STREAM_MAIN]--; + src--; + v >>= NUM_SHIFT_BITS; + // (0 < p->srcLim - p->src <= 4) + // */ + // v |= MARKER_FLAG; // for marker version + /* (p->state == BCJ2_ENC_STATE_ORIG) */ + WRITE_CONTEXT_AND_SRC + return; + } + { + const unsigned c = ((v + 0x17) >> 6) & 1; + CBcj2Prob *prob = p->probs + (unsigned) + (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1)); + /* + ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) : + ((Byte)v < 0xe8 ? 0 : 1)); // ((v >> 5) & 1)); + */ + const unsigned ttt = *prob; + const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt; + if ((v & CONV_FLAG) == 0) { + // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy); + // v = (Byte)v; // for marker version p->range = bound; - *prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); - p->src = src; - p->prevByte = b; + *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); + // WRITE_CONTEXT_AND_SRC continue; } - p->low += bound; p->range -= bound; - *prob = (CProb)(ttt - (ttt >> kNumMoveBits)); - + *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits)); + } + // p->context = src[3]; + { + // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP); + const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL; + ip = p->ip64; + v = GetUi32(src); // relat + ip += 4; + p->ip64 = ip; + src += 4; + // p->src = src; { - UInt32 relatVal = GetUi32(src); - UInt32 absVal; - p->ip += 4; - absVal = p->ip + relatVal; - p->prevByte = src[3]; - src += 4; - p->src = src; + const UInt32 absol = (UInt32)ip + v; + Byte *cur = p->bufs[cj]; + v >>= 24; + // WRITE_CONTEXT + if (cur == p->lims[cj]) { - unsigned cj = (b == 0xE8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP; - Byte *cur = p->bufs[cj]; - if (cur == p->lims[cj]) - { - p->state = cj; - p->tempTarget = absVal; - return; - } - SetBe32(cur, absVal); - p->bufs[cj] = cur + 4; + p->state = cj; + p->tempTarget = absol; + WRITE_CONTEXT_AND_SRC + return; } + SetBe32a(cur, absol) + p->bufs[cj] = cur + 4; } } } } - } + } // end of loop } - if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) - return; - - for (; p->flushPos < 5; p->flushPos++) - if (RangeEnc_ShiftLow(p)) + for (; p->flushRem != 0; p->flushRem--) + if (Bcj2_RangeEnc_ShiftLow(p)) return; - p->state = BCJ2_ENC_STATE_OK; + p->state = BCJ2_ENC_STATE_FINISHED; } +/* +BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer. +So base function Bcj2Enc_Encode_2() + in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with + (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim) +Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer. + so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(), + then (p->src == p->srcLim). + And the caller's code is simpler with Bcj2Enc_Encode(). +*/ + +Z7_NO_INLINE void Bcj2Enc_Encode(CBcj2Enc *p) { - PRF(printf("\n")); - PRF(printf("---- ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + PRF2("\n----") if (p->tempPos != 0) { + /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */ unsigned extra = 0; - + /* We will touch only minimal required number of bytes in input (src) stream. + So we will add input bytes from (src) stream to temp[] with step of 1 byte. + We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call + in first loop iteration because + - previous call of Bcj2Enc_Encode() could use another (finishMode), + - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG). + the case with full temp[] buffer (p->tempPos == 4) is possible here. + */ for (;;) { + // (0 < p->tempPos <= 5) // in non-marker version + /* p->src : the current src data position including extra bytes + that were copied to temp[] buffer in this call */ const Byte *src = p->src; const Byte *srcLim = p->srcLim; - EBcj2Enc_FinishMode finishMode = p->finishMode; - - p->src = p->temp; - p->srcLim = p->temp + p->tempPos; + const EBcj2Enc_FinishMode finishMode = p->finishMode; if (src != srcLim) + { + /* if there are some src data after the data copied to temp[], + then we use MODE_CONTINUE for temp data */ p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE; - - PRF(printf(" ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + } + p->src = p->temp; + p->srcLim = p->temp + p->tempPos; + PRF2(" ") Bcj2Enc_Encode_2(p); - { - unsigned num = (unsigned)(p->src - p->temp); - unsigned tempPos = p->tempPos - num; + const unsigned num = (unsigned)(p->src - p->temp); + const unsigned tempPos = p->tempPos - num; unsigned i; p->tempPos = tempPos; for (i = 0; i < tempPos; i++) - p->temp[i] = p->temp[(size_t)i + num]; - + p->temp[i] = p->temp[(SizeT)i + num]; + // tempPos : number of bytes in temp buffer p->src = src; p->srcLim = srcLim; p->finishMode = finishMode; - - if (p->state != BCJ2_ENC_STATE_ORIG || src == srcLim) + if (p->state != BCJ2_ENC_STATE_ORIG) + { + // (p->tempPos <= 4) // in non-marker version + /* if (the reason of exit from Bcj2Enc_Encode_2() + is not BCJ2_ENC_STATE_ORIG), + then we exit from Bcj2Enc_Encode() with same reason */ + // optional code begin : we rollback (src) and tempPos, if it's possible: + if (extra >= tempPos) + extra = tempPos; + p->src = src - extra; + p->tempPos = tempPos - extra; + // optional code end : rollback of (src) and tempPos return; - + } + /* (p->tempPos <= 4) + (p->state == BCJ2_ENC_STATE_ORIG) + so encoder needs more data than in temp[] */ + if (src == srcLim) + return; // src buffer has no more input data. + /* (src != srcLim) + so we can provide more input data from src for Bcj2Enc_Encode_2() */ if (extra >= tempPos) { - p->src = src - tempPos; + /* (extra >= tempPos) means that temp buffer contains + only data from src buffer of this call. + So now we can encode without temp buffer */ + p->src = src - tempPos; // rollback (src) p->tempPos = 0; break; } - - p->temp[tempPos] = src[0]; + // we append one additional extra byte from (src) to temp[] buffer: + p->temp[tempPos] = *src; p->tempPos = tempPos + 1; + // (0 < p->tempPos <= 5) // in non-marker version p->src = src + 1; extra++; } } } - PRF(printf("++++ ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src)); - + PRF2("++++") + // (p->tempPos == 0) Bcj2Enc_Encode_2(p); + PRF2("====") if (p->state == BCJ2_ENC_STATE_ORIG) { const Byte *src = p->src; - unsigned rem = (unsigned)(p->srcLim - src); - unsigned i; - for (i = 0; i < rem; i++) - p->temp[i] = src[i]; - p->tempPos = rem; - p->src = src + rem; + const Byte *srcLim = p->srcLim; + const unsigned rem = (unsigned)(srcLim - src); + /* (rem <= 4) here. + if (p->src != p->srcLim), then + - we copy non-processed bytes from (p->src) to temp[] buffer, + - we set p->src equal to p->srcLim. + */ + if (rem) + { + unsigned i = 0; + p->src = srcLim; + p->tempPos = rem; + // (0 < p->tempPos <= 4) + do + p->temp[i] = src[i]; + while (++i != rem); + } + // (p->tempPos <= 4) + // (p->src == p->srcLim) } } + +#undef PRF2 +#undef CONV_FLAG +#undef MARKER_FLAG +#undef WRITE_CONTEXT +#undef WRITE_CONTEXT_AND_SRC +#undef ONE_ITER +#undef NUM_SHIFT_BITS +#undef kTopValue +#undef kNumBitModelTotalBits +#undef kBitModelTotal +#undef kNumMoveBits -- cgit v1.2.3-55-g6feb