1 files changed, 377 insertions, 182 deletions
diff --git a/C/Bcj2Enc.c b/C/Bcj2Enc.c
index 682362a..79460bb 100644
--- a/C/Bcj2Enc.c
+++ b/C/Bcj2Enc.c
@@ -1,60 +1,62 @@
-/* Bcj2Enc.c -- BCJ2 Encoder (Converter for x86 code)
+/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
-2021-02-09 : Igor Pavlov : Public domain */
+2023-04-02 : Igor Pavlov : Public domain */
 #include "Precomp.h"
 /* #define SHOW_STAT */
 #ifdef SHOW_STAT
 #include <stdio.h>
-#define PRF(x) x
+#define PRF2(s) printf("%s ip=%8x  tempPos=%d  src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
 #else
-#define PRF(x)
+#define PRF2(s)
 #endif
-#include <string.h>
 #include "Bcj2.h"
 #include "CpuArch.h"
-#define CProb UInt16
 #define kTopValue ((UInt32)1 << 24)
-#define kNumModelBits 11
+#define kNumBitModelTotalBits 11
-#define kBitModelTotal (1 << kNumModelBits)
+#define kBitModelTotal (1 << kNumBitModelTotalBits)
 #define kNumMoveBits 5
 void Bcj2Enc_Init(CBcj2Enc *p)
 {
  unsigned i;
+  p->state = BCJ2_ENC_STATE_ORIG;
-  p->state = BCJ2_ENC_STATE_OK;
  p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
+  p->context = 0;
-  p->prevByte = 0;
+  p->flushRem = 5;
+  p->isFlushState = 0;
  p->cache = 0;
-  p->range = 0xFFFFFFFF;
+  p->range = 0xffffffff;
  p->low = 0;
  p->cacheSize = 1;
+  p->ip64 = 0;
-  p->ip = 0;
+  p->fileIp64 = 0;
+  p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
-  p->fileIp = 0;
+  p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
-  p->fileSize = 0;
+  // p->relatExcludeBits = 0;
-  p->relatLimit = BCJ2_RELAT_LIMIT;
  p->tempPos = 0;
-  p->flushPos = 0;
  for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)
    p->probs[i] = kBitModelTotal >> 1;
 }
-static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p)
+// Z7_NO_INLINE
+Z7_FORCE_INLINE
+static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
 {
-  if ((UInt32)p->low < (UInt32)0xFF000000 || (UInt32)(p->low >> 32) != 0)
+  const UInt32 low = (UInt32)p->low;
+  const unsigned high = (unsigned)
+    #if defined(Z7_MSC_VER_ORIGINAL) \
+        && defined(MY_CPU_X86) \
+        && defined(MY_CPU_LE) \
+        && !defined(MY_CPU_64BIT)
+      // we try to get rid of __aullshr() call in MSVS-x86
+      (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
+    #else
+      (p->low >> 32);
+    #endif
+  if (low < (UInt32)0xff000000 || high != 0)
  {
    Byte *buf = p->bufs[BCJ2_STREAM_RC];
    do
@@ -65,247 +67,440 @@ static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p)
        p->bufs[BCJ2_STREAM_RC] = buf;
        return True;
      }
-      *buf++ = (Byte)(p->cache + (Byte)(p->low >> 32));
+      *buf++ = (Byte)(p->cache + high);
-      p->cache = 0xFF;
+      p->cache = 0xff;
    }
    while (--p->cacheSize);
    p->bufs[BCJ2_STREAM_RC] = buf;
-    p->cache = (Byte)((UInt32)p->low >> 24);
+    p->cache = (Byte)(low >> 24);
  }
  p->cacheSize++;
-  p->low = (UInt32)p->low << 8;
+  p->low = low << 8;
  return False;
 }
-static void Bcj2Enc_Encode_2(CBcj2Enc *p)
-{
+/*
-  if (BCJ2_IS_32BIT_STREAM(p->state))
+We can use 2 alternative versions of code:
+1) non-marker version:
+  Byte CBcj2Enc::context
+  Byte temp[8];
+  Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
+  Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
+  with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.
+2) marker version:
+  UInt32 CBcj2Enc::context
+  Byte CBcj2Enc::temp[4];
+  MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains a found marker.
+  it's allowed that
+    one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
+    and another call of Bcj2Enc_Encode_2() does offset conversion.
+    So different values of (fileIp) and (fileSize) are possible
+    in these different Bcj2Enc_Encode_2() calls.
+Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
+So we use non-marker version.
+*/
+/*
+  Corner cases with overlap in multi-block.
+  before v23: there was one corner case, where converted instruction
+    could start in one sub-stream and finish in next sub-stream.
+  If multi-block (solid) encoding is used,
+    and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
+    and (0f) is last byte of previous sub-stream
+    and (8x) is first byte of current sub-stream
+  then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
+  BCJ2 encoder can convert 32-bit offset for that (0f 8x) pair,
+  if that offset meets limit requirements.
+  If encoder allows 32-bit offset conversion for such overlap case,
+  then the data in 3 uncompressed BCJ2 streams for some sub-stream
+  can depend on data of previous sub-stream.
+  That corner case is not big problem, and it's rare case.
+  Since v23.00 we do additional check to prevent conversions in such overlap cases.
+*/
+/*
+  Bcj2Enc_Encode_2() output variables at exit:
  {
-    Byte *cur = p->bufs[p->state];
+    if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
-    if (cur == p->lims[p->state])
+    {
-      return;
+      it means that encoder needs more input data.
-    SetBe32(cur, p->tempTarget);
+      if (p->srcLim == p->src) at exit, then
-    p->bufs[p->state] = cur + 4;
+      {
+        (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
+        all input data were read and processed, and we are ready for
+        new input data.
+      }
+      else
+      {
+        (p->srcLim != p->src)
+        (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
+          The encoder has found e8/e9/0f_8x marker,
+          and p->src points to last byte of that marker,
+          Bcj2Enc_Encode_2() needs more input data to get a total of
+          5 bytes (last byte of marker and 32-bit branch offset)
+          as continuous array starting from p->src.
+        (p->srcLim - p->src < 5) requirement is met after exit.
+          So non-processed residue from p->src to p->srcLim is always less than 5 bytes.
+      }
+    }
  }
+*/
-  p->state = BCJ2_ENC_STATE_ORIG;
+Z7_NO_INLINE
+static void Bcj2Enc_Encode_2(CBcj2Enc *p)
-  for (;;)
+{
+  if (!p->isFlushState)
  {
-    if (p->range < kTopValue)
+    const Byte *src;
+    UInt32 v;
    {
-      if (RangeEnc_ShiftLow(p))
+      const unsigned state = p->state;
-        return;
+      if (BCJ2_IS_32BIT_STREAM(state))
-      p->range <<= 8;
+      {
+        Byte *cur = p->bufs[state];
+        if (cur == p->lims[state])
+          return;
+        SetBe32a(cur, p->tempTarget)
+        p->bufs[state] = cur + 4;
+      }
    }
+    p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
+    src = p->src;
+    v = p->context;
+    
+    // #define WRITE_CONTEXT  p->context = v; // for marker version
+    #define WRITE_CONTEXT           p->context = (Byte)v;
+    #define WRITE_CONTEXT_AND_SRC   p->src = src;  WRITE_CONTEXT
+    for (;;)
    {
+      // const Byte *src;
+      // UInt32 v;
+      CBcj2Enc_ip_unsigned ip;
+      if (p->range < kTopValue)
+      {
+        // to reduce register pressure and code size: we save and restore local variables.
+        WRITE_CONTEXT_AND_SRC
+        if (Bcj2_RangeEnc_ShiftLow(p))
+          return;
+        p->range <<= 8;
+        src = p->src;
+        v = p->context;
+      }
+      // src = p->src;
+      // #define MARKER_FLAG  ((UInt32)1 << 17)
+      // if ((v & MARKER_FLAG) == 0) // for marker version
      {
-        const Byte *src = p->src;
        const Byte *srcLim;
-        Byte *dest;
+        Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
-        SizeT num = (SizeT)(p->srcLim - src);
-        if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
        {
-          if (num <= 4)
+          const SizeT remSrc = (SizeT)(p->srcLim - src);
-            return;
+          SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
-          num -= 4;
+          if (rem >= remSrc)
+            rem = remSrc;
+          srcLim = src + rem;
        }
-        else if (num == 0)
+        /* p->context contains context of previous byte:
-          break;
+           bits [0 : 7]  : src[-1], if (src) was changed in this call
+           bits [8 : 31] : are undefined for non-marker version
-        dest = p->bufs[BCJ2_STREAM_MAIN];
+        */
-        if (num > (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest))
+        // v = p->context;
+        #define NUM_SHIFT_BITS  24
+        #define CONV_FLAG  ((UInt32)1 << 16)
+        #define ONE_ITER { \
+          b = src[0]; \
+          *dest++ = (Byte)b; \
+          v = (v << NUM_SHIFT_BITS) | b; \
+          if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
+          if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
+              ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
+          src++; if (src == srcLim) { break; } }
+        if (src != srcLim)
+        for (;;)
        {
-          num = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
+          /* clang can generate inefficient code with setne instead of two jcc instructions.
-          if (num == 0)
+             we can use 2 iterations and external (unsigned b) to avoid that inefficient code generation. */
-          {
+          unsigned b;
-            p->state = BCJ2_STREAM_MAIN;
+          ONE_ITER
-            return;
+          ONE_ITER
-          }
        }
-       
+        
-        srcLim = src + num;
+        ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
+        p->bufs[BCJ2_STREAM_MAIN] = dest;
+        p->ip64 = ip;
-        if (p->prevByte == 0x0F && (src[0] & 0xF0) == 0x80)
+        if (src == srcLim)
-          *dest = src[0];
-        else for (;;)
        {
-          Byte b = *src;
+          WRITE_CONTEXT_AND_SRC
-          *dest = b;
+          if (src != p->srcLim)
-          if (b != 0x0F)
          {
-            if ((b & 0xFE) == 0xE8)
+            p->state = BCJ2_STREAM_MAIN;
-              break;
+            return;
-            dest++;
-            if (++src != srcLim)
-              continue;
-            break;
          }
-          dest++;
+          /* (p->src == p->srcLim)
-          if (++src == srcLim)
+          (p->state == BCJ2_ENC_STATE_ORIG) */
-            break;
+          if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
-          if ((*src & 0xF0) != 0x80)
+            return;
-            continue;
+          /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM) */
-          *dest = *src;
+          // (p->flushRem == 5);
+          p->isFlushState = 1;
          break;
        }
-        
+        src++;
-        num = (SizeT)(src - p->src);
+        // p->src = src;
-        
+      }
-        if (src == srcLim)
+      // ip = p->ip; // for marker version
-        {
+      /* marker was found */
-          p->prevByte = src[-1];
+      /* (v) contains marker that was found:
-          p->bufs[BCJ2_STREAM_MAIN] = dest;
+           bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
-          p->src = src;
+                         : value of src[-2] : xx/xx/0f
-          p->ip += (UInt32)num;
+           bits [0 : 7]  : value of src[-1] : e8/e9/8x
-          continue;
+      */
-        }
+      {
- 
        {
-          Byte context = (Byte)(num == 0 ? p->prevByte : src[-1]);
+        #if NUM_SHIFT_BITS != 24
-          BoolInt needConvert;
+          v &= ~(UInt32)CONV_FLAG;
+        #endif
-          p->bufs[BCJ2_STREAM_MAIN] = dest + 1;
+          // UInt32 relat = 0;
-          p->ip += (UInt32)num + 1;
-          src++;
-          
-          needConvert = False;
          if ((SizeT)(p->srcLim - src) >= 4)
          {
-            UInt32 relatVal = GetUi32(src);
+            /*
-            if ((p->fileSize == 0 || (UInt32)(p->ip + 4 + relatVal - p->fileIp) < p->fileSize)
+            if (relat != 0 || (Byte)v != 0xe8)
-                && ((relatVal + p->relatLimit) >> 1) < p->relatLimit)
+            BoolInt isBigOffset = True;
-              needConvert = True;
+            */
+            const UInt32 relat = GetUi32(src);
+            /*
+            #define EXCLUDE_FLAG  ((UInt32)1 << 4)
+            #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
+            if (p->relatExcludeBits != 0)
+            {
+              const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
+              isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
+            }
+            // isBigOffset = False; // for debug
+            */
+            ip -= p->fileIp64;
+            // Use the following if check, if (ip) is 64-bit:
+            if (ip > (((v + 0x20) >> 5) & 1))  // 23.00 : we eliminate multi-block overlap for (0f 8x) and (e8/e9)
+            if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
+            if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
+              v |= CONV_FLAG;
          }
+          else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
          {
-            UInt32 bound;
+            // (p->srcLim - src < 4)
-            unsigned ttt;
+            // /*
-            Byte b = src[-1];
+            // for non-marker version
-            CProb *prob = p->probs + (unsigned)(b == 0xE8 ? 2 + (unsigned)context : (b == 0xE9 ? 1 : 0));
+            p->ip64--; // p->ip = ip - 1;
+            p->bufs[BCJ2_STREAM_MAIN]--;
-            ttt = *prob;
+            src--;
-            bound = (p->range >> kNumModelBits) * ttt;
+            v >>= NUM_SHIFT_BITS;
-            
+            // (0 < p->srcLim - p->src <= 4)
-            if (!needConvert)
+            // */
+            // v |= MARKER_FLAG; // for marker version
+            /* (p->state == BCJ2_ENC_STATE_ORIG) */
+            WRITE_CONTEXT_AND_SRC
+            return;
+          }
+          {
+            const unsigned c = ((v + 0x17) >> 6) & 1;
+            CBcj2Prob *prob = p->probs + (unsigned)
+                (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
+            /*
+                ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
+                ((Byte)v < 0xe8 ? 0 : 1));  // ((v >> 5) & 1));
+            */
+            const unsigned ttt = *prob;
+            const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
+            if ((v & CONV_FLAG) == 0)
            {
+              // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
+              // v = (Byte)v; // for marker version
              p->range = bound;
-              *prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
+              *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
-              p->src = src;
+              // WRITE_CONTEXT_AND_SRC
-              p->prevByte = b;
              continue;
            }
-            
            p->low += bound;
            p->range -= bound;
-            *prob = (CProb)(ttt - (ttt >> kNumMoveBits));
+            *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
+          }
+          // p->context = src[3];
+          {
+            // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
+            const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
+            ip = p->ip64;
+            v = GetUi32(src); // relat
+            ip += 4;
+            p->ip64 = ip;
+            src += 4;
+            // p->src = src;
            {
-              UInt32 relatVal = GetUi32(src);
+              const UInt32 absol = (UInt32)ip + v;
-              UInt32 absVal;
+              Byte *cur = p->bufs[cj];
-              p->ip += 4;
+              v >>= 24;
-              absVal = p->ip + relatVal;
+              // WRITE_CONTEXT
-              p->prevByte = src[3];
+              if (cur == p->lims[cj])
-              src += 4;
-              p->src = src;
              {
-                unsigned cj = (b == 0xE8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP;
+                p->state = cj;
-                Byte *cur = p->bufs[cj];
+                p->tempTarget = absol;
-                if (cur == p->lims[cj])
+                WRITE_CONTEXT_AND_SRC
-                {
+                return;
-                  p->state = cj;
-                  p->tempTarget = absVal;
-                  return;
-                }
-                SetBe32(cur, absVal);
-                p->bufs[cj] = cur + 4;
              }
+              SetBe32a(cur, absol)
+              p->bufs[cj] = cur + 4;
            }
          }
        }
      }
-    }
+    } // end of loop
  }
-  if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
+  for (; p->flushRem != 0; p->flushRem--)
-    return;
+    if (Bcj2_RangeEnc_ShiftLow(p))
-  for (; p->flushPos < 5; p->flushPos++)
-    if (RangeEnc_ShiftLow(p))
      return;
-  p->state = BCJ2_ENC_STATE_OK;
+  p->state = BCJ2_ENC_STATE_FINISHED;
 }
+/*
+BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
+So base function Bcj2Enc_Encode_2()
+  in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
+  (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
+Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
+  so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
+    then (p->src == p->srcLim).
+  And the caller's code is simpler with Bcj2Enc_Encode().
+*/
+Z7_NO_INLINE
 void Bcj2Enc_Encode(CBcj2Enc *p)
 {
-  PRF(printf("\n"));
+  PRF2("\n----")
-  PRF(printf("---- ip = %8d   tempPos = %8d   src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src));
  if (p->tempPos != 0)
  {
+    /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
    unsigned extra = 0;
-   
+    /* We will touch only minimal required number of bytes in input (src) stream.
+       So we will add input bytes from (src) stream to temp[] with step of 1 byte.
+       We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call
+         in first loop iteration because
+         - previous call of Bcj2Enc_Encode() could use another (finishMode),
+         - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).
+       the case with full temp[] buffer (p->tempPos == 4) is possible here.
+    */
    for (;;)
    {
+      // (0 < p->tempPos <= 5) // in non-marker version
+      /* p->src : the current src data position including extra bytes
+                  that were copied to temp[] buffer in this call */
      const Byte *src = p->src;
      const Byte *srcLim = p->srcLim;
-      EBcj2Enc_FinishMode finishMode = p->finishMode;
+      const EBcj2Enc_FinishMode finishMode = p->finishMode;
-      
-      p->src = p->temp;
-      p->srcLim = p->temp + p->tempPos;
      if (src != srcLim)
+      {
+        /* if there are some src data after the data copied to temp[],
+           then we use MODE_CONTINUE for temp data */
        p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
-      
+      }
-      PRF(printf("     ip = %8d   tempPos = %8d   src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src));
+      p->src = p->temp;
+      p->srcLim = p->temp + p->tempPos;
+      PRF2("    ")
      Bcj2Enc_Encode_2(p);
-      
      {
-        unsigned num = (unsigned)(p->src - p->temp);
+        const unsigned num = (unsigned)(p->src - p->temp);
-        unsigned tempPos = p->tempPos - num;
+        const unsigned tempPos = p->tempPos - num;
        unsigned i;
        p->tempPos = tempPos;
        for (i = 0; i < tempPos; i++)
-          p->temp[i] = p->temp[(size_t)i + num];
+          p->temp[i] = p->temp[(SizeT)i + num];
-      
+        // tempPos : number of bytes in temp buffer
        p->src = src;
        p->srcLim = srcLim;
        p->finishMode = finishMode;
-        
+        if (p->state != BCJ2_ENC_STATE_ORIG)
-        if (p->state != BCJ2_ENC_STATE_ORIG || src == srcLim)
+        {
+          // (p->tempPos <= 4) // in non-marker version
+          /* if (the reason of exit from Bcj2Enc_Encode_2()
+                 is not BCJ2_ENC_STATE_ORIG),
+             then we exit from Bcj2Enc_Encode() with same reason */
+          // optional code begin : we rollback (src) and tempPos, if it's possible:
+          if (extra >= tempPos)
+            extra = tempPos;
+          p->src = src - extra;
+          p->tempPos = tempPos - extra;
+          // optional code end : rollback of (src) and tempPos
          return;
-        
+        }
+        /* (p->tempPos <= 4)
+           (p->state == BCJ2_ENC_STATE_ORIG)
+             so encoder needs more data than in temp[] */
+        if (src == srcLim)
+          return; // src buffer has no more input data.
+        /* (src != srcLim)
+           so we can provide more input data from src for Bcj2Enc_Encode_2() */
        if (extra >= tempPos)
        {
-          p->src = src - tempPos;
+          /* (extra >= tempPos) means that temp buffer contains
+             only data from src buffer of this call.
+             So now we can encode without temp buffer */
+          p->src = src - tempPos; // rollback (src)
          p->tempPos = 0;
          break;
        }
-        
+        // we append one additional extra byte from (src) to temp[] buffer:
-        p->temp[tempPos] = src[0];
+        p->temp[tempPos] = *src;
        p->tempPos = tempPos + 1;
+        // (0 < p->tempPos <= 5) // in non-marker version
        p->src = src + 1;
        extra++;
      }
    }
  }
-  PRF(printf("++++ ip = %8d   tempPos = %8d   src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src));
+  PRF2("++++")
+  // (p->tempPos == 0)
  Bcj2Enc_Encode_2(p);
+  PRF2("====")
  
  if (p->state == BCJ2_ENC_STATE_ORIG)
  {
    const Byte *src = p->src;
-    unsigned rem = (unsigned)(p->srcLim - src);
+    const Byte *srcLim = p->srcLim;
-    unsigned i;
+    const unsigned rem = (unsigned)(srcLim - src);
-    for (i = 0; i < rem; i++)
+    /* (rem <= 4) here.
-      p->temp[i] = src[i];
+       if (p->src != p->srcLim), then
-    p->tempPos = rem;
+         - we copy non-processed bytes from (p->src) to temp[] buffer,
-    p->src = src + rem;
+         - we set p->src equal to p->srcLim.
+    */
+    if (rem)
+    {
+      unsigned i = 0;
+      p->src = srcLim;
+      p->tempPos = rem;
+      // (0 < p->tempPos <= 4)
+      do
+        p->temp[i] = src[i];
+      while (++i != rem);
+    }
+    // (p->tempPos <= 4)
+    // (p->src == p->srcLim)
  }
 }
+#undef PRF2
+#undef CONV_FLAG
+#undef MARKER_FLAG
+#undef WRITE_CONTEXT
+#undef WRITE_CONTEXT_AND_SRC
+#undef ONE_ITER
+#undef NUM_SHIFT_BITS
+#undef kTopValue
+#undef kNumBitModelTotalBits
+#undef kBitModelTotal
+#undef kNumMoveBits