diff options
| author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-23 11:02:52 -0300 |
|---|---|---|
| committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-23 11:02:52 -0300 |
| commit | f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef (patch) | |
| tree | fcd765f59c5d74574bdb21cd7a11e1f723068d87 | |
| parent | 9f7183c280f310c0d0b49b7b9c3b8eac297fafa7 (diff) | |
| download | lpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.tar.gz lpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.tar.bz2 lpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.zip | |
Towards a smaller encoding for charsets in code
| -rw-r--r-- | lpcode.c | 23 | ||||
| -rw-r--r-- | lpprint.c | 18 | ||||
| -rw-r--r-- | lptypes.h | 8 | ||||
| -rw-r--r-- | lpvm.c | 28 | ||||
| -rw-r--r-- | lpvm.h | 18 | ||||
| -rwxr-xr-x | test.lua | 2 |
6 files changed, 69 insertions, 28 deletions
| @@ -22,6 +22,7 @@ static const Charset fullset_ = | |||
| 22 | 22 | ||
| 23 | static const Charset *fullset = &fullset_; | 23 | static const Charset *fullset = &fullset_; |
| 24 | 24 | ||
| 25 | |||
| 25 | /* | 26 | /* |
| 26 | ** {====================================================== | 27 | ** {====================================================== |
| 27 | ** Analysis and some optimizations | 28 | ** Analysis and some optimizations |
| @@ -441,8 +442,8 @@ static int needfollow (TTree *tree) { | |||
| 441 | */ | 442 | */ |
| 442 | int sizei (const Instruction *i) { | 443 | int sizei (const Instruction *i) { |
| 443 | switch((Opcode)i->i.code) { | 444 | switch((Opcode)i->i.code) { |
| 444 | case ISet: case ISpan: return CHARSETINSTSIZE; | 445 | case ISet: case ISpan: return 1 + i->i.aux2.set.size; |
| 445 | case ITestSet: return CHARSETINSTSIZE + 1; | 446 | case ITestSet: return 2 + i->i.aux2.set.size; |
| 446 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: | 447 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: |
| 447 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: | 448 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: |
| 448 | case IUTFR: | 449 | case IUTFR: |
| @@ -585,10 +586,14 @@ static void codechar (CompileState *compst, int c, int tt) { | |||
| 585 | /* | 586 | /* |
| 586 | ** Add a charset posfix to an instruction | 587 | ** Add a charset posfix to an instruction |
| 587 | */ | 588 | */ |
| 588 | static void addcharset (CompileState *compst, const byte *cs) { | 589 | static void addcharset (CompileState *compst, int inst, const byte *cs) { |
| 589 | int p = gethere(compst); | 590 | int p = gethere(compst); |
| 590 | int i; | 591 | int i; |
| 591 | for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) | 592 | /* for now, all sets use all bits */ |
| 593 | getinstr(compst, inst).i.aux2.set.offset = 0; | ||
| 594 | getinstr(compst, inst).i.aux2.set.size = instsize(CHARSETSIZE); | ||
| 595 | getinstr(compst, inst).i.aux1 = 0; | ||
| 596 | for (i = 0; i < (int)instsize(CHARSETSIZE); i++) | ||
| 592 | nextinstruction(compst); /* space for buffer */ | 597 | nextinstruction(compst); /* space for buffer */ |
| 593 | /* fill buffer with charset */ | 598 | /* fill buffer with charset */ |
| 594 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); | 599 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); |
| @@ -610,8 +615,8 @@ static void codecharset (CompileState *compst, const byte *cs, int tt) { | |||
| 610 | cs_equal(cs, getinstr(compst, tt + 2).buff)) | 615 | cs_equal(cs, getinstr(compst, tt + 2).buff)) |
| 611 | addinstruction(compst, IAny, 0); | 616 | addinstruction(compst, IAny, 0); |
| 612 | else { | 617 | else { |
| 613 | addinstruction(compst, ISet, 0); | 618 | int i = addinstruction(compst, ISet, 0); |
| 614 | addcharset(compst, cs); | 619 | addcharset(compst, i, cs); |
| 615 | } | 620 | } |
| 616 | break; | 621 | break; |
| 617 | } | 622 | } |
| @@ -641,7 +646,7 @@ static int codetestset (CompileState *compst, Charset *cs, int e) { | |||
| 641 | } | 646 | } |
| 642 | case ISet: { | 647 | case ISet: { |
| 643 | int i = addoffsetinst(compst, ITestSet); | 648 | int i = addoffsetinst(compst, ITestSet); |
| 644 | addcharset(compst, cs->cs); | 649 | addcharset(compst, i, cs->cs); |
| 645 | return i; | 650 | return i; |
| 646 | } | 651 | } |
| 647 | default: assert(0); return 0; | 652 | default: assert(0); return 0; |
| @@ -793,8 +798,8 @@ static void coderep (CompileState *compst, TTree *tree, int opt, | |||
| 793 | const Charset *fl) { | 798 | const Charset *fl) { |
| 794 | Charset st; | 799 | Charset st; |
| 795 | if (tocharset(tree, &st)) { | 800 | if (tocharset(tree, &st)) { |
| 796 | addinstruction(compst, ISpan, 0); | 801 | int i = addinstruction(compst, ISpan, 0); |
| 797 | addcharset(compst, st.cs); | 802 | addcharset(compst, i, st.cs); |
| 798 | } | 803 | } |
| 799 | else { | 804 | else { |
| 800 | int e1 = getfirst(tree, fullset, &st); | 805 | int e1 = getfirst(tree, fullset, &st); |
| @@ -33,6 +33,18 @@ void printcharset (const byte *st) { | |||
| 33 | } | 33 | } |
| 34 | 34 | ||
| 35 | 35 | ||
| 36 | static void printIcharset (const Instruction *inst, const byte *buff) { | ||
| 37 | byte cs[CHARSETSIZE]; | ||
| 38 | int i; | ||
| 39 | loopset(j, cs[j] = 0); | ||
| 40 | for (i = 0; i < CHARSETSIZE << 3; i++) { | ||
| 41 | if (charinset(inst, buff, i)) | ||
| 42 | setchar(cs, i); | ||
| 43 | } | ||
| 44 | printcharset(cs); | ||
| 45 | } | ||
| 46 | |||
| 47 | |||
| 36 | static const char *capkind (int kind) { | 48 | static const char *capkind (int kind) { |
| 37 | const char *const modes[] = { | 49 | const char *const modes[] = { |
| 38 | "close", "position", "constant", "backref", | 50 | "close", "position", "constant", "backref", |
| @@ -83,15 +95,15 @@ void printinst (const Instruction *op, const Instruction *p) { | |||
| 83 | break; | 95 | break; |
| 84 | } | 96 | } |
| 85 | case ISet: { | 97 | case ISet: { |
| 86 | printcharset((p+1)->buff); | 98 | printIcharset(p, (p+1)->buff); |
| 87 | break; | 99 | break; |
| 88 | } | 100 | } |
| 89 | case ITestSet: { | 101 | case ITestSet: { |
| 90 | printcharset((p+2)->buff); printjmp(op, p); | 102 | printIcharset(p, (p+2)->buff); printjmp(op, p); |
| 91 | break; | 103 | break; |
| 92 | } | 104 | } |
| 93 | case ISpan: { | 105 | case ISpan: { |
| 94 | printcharset((p+1)->buff); | 106 | printIcharset(p, (p+1)->buff); |
| 95 | break; | 107 | break; |
| 96 | } | 108 | } |
| 97 | case IOpenCall: { | 109 | case IOpenCall: { |
| @@ -127,19 +127,19 @@ typedef struct Charset { | |||
| 127 | #define MAXPATTSIZE (SHRT_MAX - 10) | 127 | #define MAXPATTSIZE (SHRT_MAX - 10) |
| 128 | 128 | ||
| 129 | 129 | ||
| 130 | /* size (in elements) for an instruction plus extra l bytes */ | 130 | /* size (in instructions) for l bytes (l > 0) */ |
| 131 | #define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1) | 131 | #define instsize(l) (((l) - 1)/sizeof(Instruction) + 1) |
| 132 | 132 | ||
| 133 | 133 | ||
| 134 | /* size (in elements) for a ISet instruction */ | 134 | /* size (in elements) for a ISet instruction */ |
| 135 | #define CHARSETINSTSIZE instsize(CHARSETSIZE) | 135 | #define CHARSETINSTSIZE (1 + instsize(CHARSETSIZE)) |
| 136 | 136 | ||
| 137 | /* size (in elements) for a IFunc instruction */ | 137 | /* size (in elements) for a IFunc instruction */ |
| 138 | #define funcinstsize(p) ((p)->i.aux + 2) | 138 | #define funcinstsize(p) ((p)->i.aux + 2) |
| 139 | 139 | ||
| 140 | 140 | ||
| 141 | 141 | ||
| 142 | #define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7)))) | 142 | #define testchar(st,c) ((((unsigned int)(st)[((c) >> 3)]) >> ((c) & 7)) & 1) |
| 143 | 143 | ||
| 144 | 144 | ||
| 145 | #endif | 145 | #endif |
| @@ -23,6 +23,16 @@ | |||
| 23 | static const Instruction giveup = {{IGiveup, 0, {0}}}; | 23 | static const Instruction giveup = {{IGiveup, 0, {0}}}; |
| 24 | 24 | ||
| 25 | 25 | ||
| 26 | int charinset (const Instruction *i, const byte *buff, unsigned int c) { | ||
| 27 | c -= (unsigned int)i->i.aux2.set.offset; | ||
| 28 | if (c >= ((unsigned int)i->i.aux2.set.size /* size in instructions... */ | ||
| 29 | * (unsigned int)sizeof(Instruction) /* in bytes... */ | ||
| 30 | * 8u)) /* in bits */ | ||
| 31 | return i->i.aux1; /* out of range */ | ||
| 32 | return (testchar(buff, c) != i->i.aux1); | ||
| 33 | } | ||
| 34 | |||
| 35 | |||
| 26 | /* | 36 | /* |
| 27 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. | 37 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. |
| 28 | */ | 38 | */ |
| @@ -259,16 +269,16 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
| 259 | continue; | 269 | continue; |
| 260 | } | 270 | } |
| 261 | case ISet: { | 271 | case ISet: { |
| 262 | int c = (byte)*s; | 272 | unsigned int c = (byte)*s; |
| 263 | if (testchar((p+1)->buff, c) && s < e) | 273 | if (charinset(p, (p+1)->buff, c) && s < e) |
| 264 | { p += CHARSETINSTSIZE; s++; } | 274 | { p += 1 + p->i.aux2.set.size; s++; } |
| 265 | else goto fail; | 275 | else goto fail; |
| 266 | continue; | 276 | continue; |
| 267 | } | 277 | } |
| 268 | case ITestSet: { | 278 | case ITestSet: { |
| 269 | int c = (byte)*s; | 279 | unsigned int c = (byte)*s; |
| 270 | if (testchar((p + 2)->buff, c) && s < e) | 280 | if (charinset(p, (p + 2)->buff, c) && s < e) |
| 271 | p += 1 + CHARSETINSTSIZE; | 281 | p += 2 + p->i.aux2.set.size; |
| 272 | else p += getoffset(p); | 282 | else p += getoffset(p); |
| 273 | continue; | 283 | continue; |
| 274 | } | 284 | } |
| @@ -280,10 +290,10 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
| 280 | } | 290 | } |
| 281 | case ISpan: { | 291 | case ISpan: { |
| 282 | for (; s < e; s++) { | 292 | for (; s < e; s++) { |
| 283 | int c = (byte)*s; | 293 | unsigned int c = (byte)*s; |
| 284 | if (!testchar((p+1)->buff, c)) break; | 294 | if (!charinset(p, (p+1)->buff, c)) break; |
| 285 | } | 295 | } |
| 286 | p += CHARSETINSTSIZE; | 296 | p += 1 + p->i.aux2.set.size; |
| 287 | continue; | 297 | continue; |
| 288 | } | 298 | } |
| 289 | case IJmp: { | 299 | case IJmp: { |
| @@ -5,15 +5,22 @@ | |||
| 5 | #include "lpcap.h" | 5 | #include "lpcap.h" |
| 6 | 6 | ||
| 7 | 7 | ||
| 8 | /* | ||
| 9 | ** About Character sets in instructions: a set is a bit map with an | ||
| 10 | ** initial offset, in bits, and a size, in number of instructions. If | ||
| 11 | ** aux1 is one, set is inverted (bit == 1 means char is not in set). | ||
| 12 | */ | ||
| 13 | |||
| 14 | |||
| 8 | /* Virtual Machine's instructions */ | 15 | /* Virtual Machine's instructions */ |
| 9 | typedef enum Opcode { | 16 | typedef enum Opcode { |
| 10 | IAny, /* if no char, fail */ | 17 | IAny, /* if no char, fail */ |
| 11 | IChar, /* if char != aux1, fail */ | 18 | IChar, /* if char != aux1, fail */ |
| 12 | ISet, /* if char not in buff, fail */ | 19 | ISet, /* if char not in set, fail */ |
| 13 | ITestAny, /* in no char, jump to 'offset' */ | 20 | ITestAny, /* in no char, jump to 'offset' */ |
| 14 | ITestChar, /* if char != aux1, jump to 'offset' */ | 21 | ITestChar, /* if char != aux1, jump to 'offset' */ |
| 15 | ITestSet, /* if char not in buff, jump to 'offset' */ | 22 | ITestSet, /* if char not in set, jump to 'offset' */ |
| 16 | ISpan, /* read a span of chars in buff */ | 23 | ISpan, /* read a span of chars in set */ |
| 17 | IUTFR, /* if codepoint not in range [offset, utf_to], fail */ | 24 | IUTFR, /* if codepoint not in range [offset, utf_to], fail */ |
| 18 | IBehind, /* walk back 'aux1' characters (fail if not possible) */ | 25 | IBehind, /* walk back 'aux1' characters (fail if not possible) */ |
| 19 | IRet, /* return from a rule */ | 26 | IRet, /* return from a rule */ |
| @@ -43,6 +50,10 @@ typedef union Instruction { | |||
| 43 | byte aux1; | 50 | byte aux1; |
| 44 | union { | 51 | union { |
| 45 | short key; | 52 | short key; |
| 53 | struct { | ||
| 54 | byte offset; | ||
| 55 | byte size; | ||
| 56 | } set; | ||
| 46 | } aux2; | 57 | } aux2; |
| 47 | } i; | 58 | } i; |
| 48 | int offset; | 59 | int offset; |
| @@ -54,6 +65,7 @@ typedef union Instruction { | |||
| 54 | #define utf_to(inst) (((inst)->i.aux2.key << 8) | (inst)->i.aux1) | 65 | #define utf_to(inst) (((inst)->i.aux2.key << 8) | (inst)->i.aux1) |
| 55 | 66 | ||
| 56 | 67 | ||
| 68 | int charinset (const Instruction *i, const byte *buff, unsigned int c); | ||
| 57 | void printpatt (Instruction *p, int n); | 69 | void printpatt (Instruction *p, int n); |
| 58 | const char *match (lua_State *L, const char *o, const char *s, const char *e, | 70 | const char *match (lua_State *L, const char *o, const char *s, const char *e, |
| 59 | Instruction *op, Capture *capture, int ptop); | 71 | Instruction *op, Capture *capture, int ptop); |
| @@ -117,6 +117,8 @@ eqcharset(m.S"\1\0\2", m.R"\0\2") | |||
| 117 | eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") | 117 | eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") |
| 118 | eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") | 118 | eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") |
| 119 | 119 | ||
| 120 | eqcharset(m.S("\0\255"), m.P"\0" + "\255") -- charset extremes | ||
| 121 | |||
| 120 | local word = alpha^1 * (1 - alpha)^0 | 122 | local word = alpha^1 * (1 - alpha)^0 |
| 121 | 123 | ||
| 122 | assert((word^0 * -1):match"alo alo") | 124 | assert((word^0 * -1):match"alo alo") |
