diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-23 11:02:52 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-23 11:02:52 -0300 |
commit | f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef (patch) | |
tree | fcd765f59c5d74574bdb21cd7a11e1f723068d87 | |
parent | 9f7183c280f310c0d0b49b7b9c3b8eac297fafa7 (diff) | |
download | lpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.tar.gz lpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.tar.bz2 lpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.zip |
Towards a smaller encoding for charsets in code
-rw-r--r-- | lpcode.c | 23 | ||||
-rw-r--r-- | lpprint.c | 18 | ||||
-rw-r--r-- | lptypes.h | 8 | ||||
-rw-r--r-- | lpvm.c | 28 | ||||
-rw-r--r-- | lpvm.h | 18 | ||||
-rwxr-xr-x | test.lua | 2 |
6 files changed, 69 insertions, 28 deletions
@@ -22,6 +22,7 @@ static const Charset fullset_ = | |||
22 | 22 | ||
23 | static const Charset *fullset = &fullset_; | 23 | static const Charset *fullset = &fullset_; |
24 | 24 | ||
25 | |||
25 | /* | 26 | /* |
26 | ** {====================================================== | 27 | ** {====================================================== |
27 | ** Analysis and some optimizations | 28 | ** Analysis and some optimizations |
@@ -441,8 +442,8 @@ static int needfollow (TTree *tree) { | |||
441 | */ | 442 | */ |
442 | int sizei (const Instruction *i) { | 443 | int sizei (const Instruction *i) { |
443 | switch((Opcode)i->i.code) { | 444 | switch((Opcode)i->i.code) { |
444 | case ISet: case ISpan: return CHARSETINSTSIZE; | 445 | case ISet: case ISpan: return 1 + i->i.aux2.set.size; |
445 | case ITestSet: return CHARSETINSTSIZE + 1; | 446 | case ITestSet: return 2 + i->i.aux2.set.size; |
446 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: | 447 | case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: |
447 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: | 448 | case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: |
448 | case IUTFR: | 449 | case IUTFR: |
@@ -585,10 +586,14 @@ static void codechar (CompileState *compst, int c, int tt) { | |||
585 | /* | 586 | /* |
586 | ** Add a charset posfix to an instruction | 587 | ** Add a charset posfix to an instruction |
587 | */ | 588 | */ |
588 | static void addcharset (CompileState *compst, const byte *cs) { | 589 | static void addcharset (CompileState *compst, int inst, const byte *cs) { |
589 | int p = gethere(compst); | 590 | int p = gethere(compst); |
590 | int i; | 591 | int i; |
591 | for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) | 592 | /* for now, all sets use all bits */ |
593 | getinstr(compst, inst).i.aux2.set.offset = 0; | ||
594 | getinstr(compst, inst).i.aux2.set.size = instsize(CHARSETSIZE); | ||
595 | getinstr(compst, inst).i.aux1 = 0; | ||
596 | for (i = 0; i < (int)instsize(CHARSETSIZE); i++) | ||
592 | nextinstruction(compst); /* space for buffer */ | 597 | nextinstruction(compst); /* space for buffer */ |
593 | /* fill buffer with charset */ | 598 | /* fill buffer with charset */ |
594 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); | 599 | loopset(j, getinstr(compst, p).buff[j] = cs[j]); |
@@ -610,8 +615,8 @@ static void codecharset (CompileState *compst, const byte *cs, int tt) { | |||
610 | cs_equal(cs, getinstr(compst, tt + 2).buff)) | 615 | cs_equal(cs, getinstr(compst, tt + 2).buff)) |
611 | addinstruction(compst, IAny, 0); | 616 | addinstruction(compst, IAny, 0); |
612 | else { | 617 | else { |
613 | addinstruction(compst, ISet, 0); | 618 | int i = addinstruction(compst, ISet, 0); |
614 | addcharset(compst, cs); | 619 | addcharset(compst, i, cs); |
615 | } | 620 | } |
616 | break; | 621 | break; |
617 | } | 622 | } |
@@ -641,7 +646,7 @@ static int codetestset (CompileState *compst, Charset *cs, int e) { | |||
641 | } | 646 | } |
642 | case ISet: { | 647 | case ISet: { |
643 | int i = addoffsetinst(compst, ITestSet); | 648 | int i = addoffsetinst(compst, ITestSet); |
644 | addcharset(compst, cs->cs); | 649 | addcharset(compst, i, cs->cs); |
645 | return i; | 650 | return i; |
646 | } | 651 | } |
647 | default: assert(0); return 0; | 652 | default: assert(0); return 0; |
@@ -793,8 +798,8 @@ static void coderep (CompileState *compst, TTree *tree, int opt, | |||
793 | const Charset *fl) { | 798 | const Charset *fl) { |
794 | Charset st; | 799 | Charset st; |
795 | if (tocharset(tree, &st)) { | 800 | if (tocharset(tree, &st)) { |
796 | addinstruction(compst, ISpan, 0); | 801 | int i = addinstruction(compst, ISpan, 0); |
797 | addcharset(compst, st.cs); | 802 | addcharset(compst, i, st.cs); |
798 | } | 803 | } |
799 | else { | 804 | else { |
800 | int e1 = getfirst(tree, fullset, &st); | 805 | int e1 = getfirst(tree, fullset, &st); |
@@ -33,6 +33,18 @@ void printcharset (const byte *st) { | |||
33 | } | 33 | } |
34 | 34 | ||
35 | 35 | ||
36 | static void printIcharset (const Instruction *inst, const byte *buff) { | ||
37 | byte cs[CHARSETSIZE]; | ||
38 | int i; | ||
39 | loopset(j, cs[j] = 0); | ||
40 | for (i = 0; i < CHARSETSIZE << 3; i++) { | ||
41 | if (charinset(inst, buff, i)) | ||
42 | setchar(cs, i); | ||
43 | } | ||
44 | printcharset(cs); | ||
45 | } | ||
46 | |||
47 | |||
36 | static const char *capkind (int kind) { | 48 | static const char *capkind (int kind) { |
37 | const char *const modes[] = { | 49 | const char *const modes[] = { |
38 | "close", "position", "constant", "backref", | 50 | "close", "position", "constant", "backref", |
@@ -83,15 +95,15 @@ void printinst (const Instruction *op, const Instruction *p) { | |||
83 | break; | 95 | break; |
84 | } | 96 | } |
85 | case ISet: { | 97 | case ISet: { |
86 | printcharset((p+1)->buff); | 98 | printIcharset(p, (p+1)->buff); |
87 | break; | 99 | break; |
88 | } | 100 | } |
89 | case ITestSet: { | 101 | case ITestSet: { |
90 | printcharset((p+2)->buff); printjmp(op, p); | 102 | printIcharset(p, (p+2)->buff); printjmp(op, p); |
91 | break; | 103 | break; |
92 | } | 104 | } |
93 | case ISpan: { | 105 | case ISpan: { |
94 | printcharset((p+1)->buff); | 106 | printIcharset(p, (p+1)->buff); |
95 | break; | 107 | break; |
96 | } | 108 | } |
97 | case IOpenCall: { | 109 | case IOpenCall: { |
@@ -127,19 +127,19 @@ typedef struct Charset { | |||
127 | #define MAXPATTSIZE (SHRT_MAX - 10) | 127 | #define MAXPATTSIZE (SHRT_MAX - 10) |
128 | 128 | ||
129 | 129 | ||
130 | /* size (in elements) for an instruction plus extra l bytes */ | 130 | /* size (in instructions) for l bytes (l > 0) */ |
131 | #define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1) | 131 | #define instsize(l) (((l) - 1)/sizeof(Instruction) + 1) |
132 | 132 | ||
133 | 133 | ||
134 | /* size (in elements) for a ISet instruction */ | 134 | /* size (in elements) for a ISet instruction */ |
135 | #define CHARSETINSTSIZE instsize(CHARSETSIZE) | 135 | #define CHARSETINSTSIZE (1 + instsize(CHARSETSIZE)) |
136 | 136 | ||
137 | /* size (in elements) for a IFunc instruction */ | 137 | /* size (in elements) for a IFunc instruction */ |
138 | #define funcinstsize(p) ((p)->i.aux + 2) | 138 | #define funcinstsize(p) ((p)->i.aux + 2) |
139 | 139 | ||
140 | 140 | ||
141 | 141 | ||
142 | #define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7)))) | 142 | #define testchar(st,c) ((((unsigned int)(st)[((c) >> 3)]) >> ((c) & 7)) & 1) |
143 | 143 | ||
144 | 144 | ||
145 | #endif | 145 | #endif |
@@ -23,6 +23,16 @@ | |||
23 | static const Instruction giveup = {{IGiveup, 0, {0}}}; | 23 | static const Instruction giveup = {{IGiveup, 0, {0}}}; |
24 | 24 | ||
25 | 25 | ||
26 | int charinset (const Instruction *i, const byte *buff, unsigned int c) { | ||
27 | c -= (unsigned int)i->i.aux2.set.offset; | ||
28 | if (c >= ((unsigned int)i->i.aux2.set.size /* size in instructions... */ | ||
29 | * (unsigned int)sizeof(Instruction) /* in bytes... */ | ||
30 | * 8u)) /* in bits */ | ||
31 | return i->i.aux1; /* out of range */ | ||
32 | return (testchar(buff, c) != i->i.aux1); | ||
33 | } | ||
34 | |||
35 | |||
26 | /* | 36 | /* |
27 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. | 37 | ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. |
28 | */ | 38 | */ |
@@ -259,16 +269,16 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
259 | continue; | 269 | continue; |
260 | } | 270 | } |
261 | case ISet: { | 271 | case ISet: { |
262 | int c = (byte)*s; | 272 | unsigned int c = (byte)*s; |
263 | if (testchar((p+1)->buff, c) && s < e) | 273 | if (charinset(p, (p+1)->buff, c) && s < e) |
264 | { p += CHARSETINSTSIZE; s++; } | 274 | { p += 1 + p->i.aux2.set.size; s++; } |
265 | else goto fail; | 275 | else goto fail; |
266 | continue; | 276 | continue; |
267 | } | 277 | } |
268 | case ITestSet: { | 278 | case ITestSet: { |
269 | int c = (byte)*s; | 279 | unsigned int c = (byte)*s; |
270 | if (testchar((p + 2)->buff, c) && s < e) | 280 | if (charinset(p, (p + 2)->buff, c) && s < e) |
271 | p += 1 + CHARSETINSTSIZE; | 281 | p += 2 + p->i.aux2.set.size; |
272 | else p += getoffset(p); | 282 | else p += getoffset(p); |
273 | continue; | 283 | continue; |
274 | } | 284 | } |
@@ -280,10 +290,10 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e, | |||
280 | } | 290 | } |
281 | case ISpan: { | 291 | case ISpan: { |
282 | for (; s < e; s++) { | 292 | for (; s < e; s++) { |
283 | int c = (byte)*s; | 293 | unsigned int c = (byte)*s; |
284 | if (!testchar((p+1)->buff, c)) break; | 294 | if (!charinset(p, (p+1)->buff, c)) break; |
285 | } | 295 | } |
286 | p += CHARSETINSTSIZE; | 296 | p += 1 + p->i.aux2.set.size; |
287 | continue; | 297 | continue; |
288 | } | 298 | } |
289 | case IJmp: { | 299 | case IJmp: { |
@@ -5,15 +5,22 @@ | |||
5 | #include "lpcap.h" | 5 | #include "lpcap.h" |
6 | 6 | ||
7 | 7 | ||
8 | /* | ||
9 | ** About Character sets in instructions: a set is a bit map with an | ||
10 | ** initial offset, in bits, and a size, in number of instructions. If | ||
11 | ** aux1 is one, set is inverted (bit == 1 means char is not in set). | ||
12 | */ | ||
13 | |||
14 | |||
8 | /* Virtual Machine's instructions */ | 15 | /* Virtual Machine's instructions */ |
9 | typedef enum Opcode { | 16 | typedef enum Opcode { |
10 | IAny, /* if no char, fail */ | 17 | IAny, /* if no char, fail */ |
11 | IChar, /* if char != aux1, fail */ | 18 | IChar, /* if char != aux1, fail */ |
12 | ISet, /* if char not in buff, fail */ | 19 | ISet, /* if char not in set, fail */ |
13 | ITestAny, /* in no char, jump to 'offset' */ | 20 | ITestAny, /* in no char, jump to 'offset' */ |
14 | ITestChar, /* if char != aux1, jump to 'offset' */ | 21 | ITestChar, /* if char != aux1, jump to 'offset' */ |
15 | ITestSet, /* if char not in buff, jump to 'offset' */ | 22 | ITestSet, /* if char not in set, jump to 'offset' */ |
16 | ISpan, /* read a span of chars in buff */ | 23 | ISpan, /* read a span of chars in set */ |
17 | IUTFR, /* if codepoint not in range [offset, utf_to], fail */ | 24 | IUTFR, /* if codepoint not in range [offset, utf_to], fail */ |
18 | IBehind, /* walk back 'aux1' characters (fail if not possible) */ | 25 | IBehind, /* walk back 'aux1' characters (fail if not possible) */ |
19 | IRet, /* return from a rule */ | 26 | IRet, /* return from a rule */ |
@@ -43,6 +50,10 @@ typedef union Instruction { | |||
43 | byte aux1; | 50 | byte aux1; |
44 | union { | 51 | union { |
45 | short key; | 52 | short key; |
53 | struct { | ||
54 | byte offset; | ||
55 | byte size; | ||
56 | } set; | ||
46 | } aux2; | 57 | } aux2; |
47 | } i; | 58 | } i; |
48 | int offset; | 59 | int offset; |
@@ -54,6 +65,7 @@ typedef union Instruction { | |||
54 | #define utf_to(inst) (((inst)->i.aux2.key << 8) | (inst)->i.aux1) | 65 | #define utf_to(inst) (((inst)->i.aux2.key << 8) | (inst)->i.aux1) |
55 | 66 | ||
56 | 67 | ||
68 | int charinset (const Instruction *i, const byte *buff, unsigned int c); | ||
57 | void printpatt (Instruction *p, int n); | 69 | void printpatt (Instruction *p, int n); |
58 | const char *match (lua_State *L, const char *o, const char *s, const char *e, | 70 | const char *match (lua_State *L, const char *o, const char *s, const char *e, |
59 | Instruction *op, Capture *capture, int ptop); | 71 | Instruction *op, Capture *capture, int ptop); |
@@ -117,6 +117,8 @@ eqcharset(m.S"\1\0\2", m.R"\0\2") | |||
117 | eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") | 117 | eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") |
118 | eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") | 118 | eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") |
119 | 119 | ||
120 | eqcharset(m.S("\0\255"), m.P"\0" + "\255") -- charset extremes | ||
121 | |||
120 | local word = alpha^1 * (1 - alpha)^0 | 122 | local word = alpha^1 * (1 - alpha)^0 |
121 | 123 | ||
122 | assert((word^0 * -1):match"alo alo") | 124 | assert((word^0 * -1):match"alo alo") |