aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-04-23 11:02:52 -0300
committerRoberto Ierusalimschy <roberto@inf.puc-rio.br>2023-04-23 11:02:52 -0300
commitf8e9bc1c721a0802b2260f48ced72c7e04d7b1ef (patch)
treefcd765f59c5d74574bdb21cd7a11e1f723068d87
parent9f7183c280f310c0d0b49b7b9c3b8eac297fafa7 (diff)
downloadlpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.tar.gz
lpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.tar.bz2
lpeg-f8e9bc1c721a0802b2260f48ced72c7e04d7b1ef.zip
Towards a smaller encoding for charsets in code
-rw-r--r--lpcode.c23
-rw-r--r--lpprint.c18
-rw-r--r--lptypes.h8
-rw-r--r--lpvm.c28
-rw-r--r--lpvm.h18
-rwxr-xr-xtest.lua2
6 files changed, 69 insertions, 28 deletions
diff --git a/lpcode.c b/lpcode.c
index f537904..8dab896 100644
--- a/lpcode.c
+++ b/lpcode.c
@@ -22,6 +22,7 @@ static const Charset fullset_ =
22 22
23static const Charset *fullset = &fullset_; 23static const Charset *fullset = &fullset_;
24 24
25
25/* 26/*
26** {====================================================== 27** {======================================================
27** Analysis and some optimizations 28** Analysis and some optimizations
@@ -441,8 +442,8 @@ static int needfollow (TTree *tree) {
441*/ 442*/
442int sizei (const Instruction *i) { 443int sizei (const Instruction *i) {
443 switch((Opcode)i->i.code) { 444 switch((Opcode)i->i.code) {
444 case ISet: case ISpan: return CHARSETINSTSIZE; 445 case ISet: case ISpan: return 1 + i->i.aux2.set.size;
445 case ITestSet: return CHARSETINSTSIZE + 1; 446 case ITestSet: return 2 + i->i.aux2.set.size;
446 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: 447 case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
447 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: 448 case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
448 case IUTFR: 449 case IUTFR:
@@ -585,10 +586,14 @@ static void codechar (CompileState *compst, int c, int tt) {
585/* 586/*
586** Add a charset posfix to an instruction 587** Add a charset posfix to an instruction
587*/ 588*/
588static void addcharset (CompileState *compst, const byte *cs) { 589static void addcharset (CompileState *compst, int inst, const byte *cs) {
589 int p = gethere(compst); 590 int p = gethere(compst);
590 int i; 591 int i;
591 for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) 592 /* for now, all sets use all bits */
593 getinstr(compst, inst).i.aux2.set.offset = 0;
594 getinstr(compst, inst).i.aux2.set.size = instsize(CHARSETSIZE);
595 getinstr(compst, inst).i.aux1 = 0;
596 for (i = 0; i < (int)instsize(CHARSETSIZE); i++)
592 nextinstruction(compst); /* space for buffer */ 597 nextinstruction(compst); /* space for buffer */
593 /* fill buffer with charset */ 598 /* fill buffer with charset */
594 loopset(j, getinstr(compst, p).buff[j] = cs[j]); 599 loopset(j, getinstr(compst, p).buff[j] = cs[j]);
@@ -610,8 +615,8 @@ static void codecharset (CompileState *compst, const byte *cs, int tt) {
610 cs_equal(cs, getinstr(compst, tt + 2).buff)) 615 cs_equal(cs, getinstr(compst, tt + 2).buff))
611 addinstruction(compst, IAny, 0); 616 addinstruction(compst, IAny, 0);
612 else { 617 else {
613 addinstruction(compst, ISet, 0); 618 int i = addinstruction(compst, ISet, 0);
614 addcharset(compst, cs); 619 addcharset(compst, i, cs);
615 } 620 }
616 break; 621 break;
617 } 622 }
@@ -641,7 +646,7 @@ static int codetestset (CompileState *compst, Charset *cs, int e) {
641 } 646 }
642 case ISet: { 647 case ISet: {
643 int i = addoffsetinst(compst, ITestSet); 648 int i = addoffsetinst(compst, ITestSet);
644 addcharset(compst, cs->cs); 649 addcharset(compst, i, cs->cs);
645 return i; 650 return i;
646 } 651 }
647 default: assert(0); return 0; 652 default: assert(0); return 0;
@@ -793,8 +798,8 @@ static void coderep (CompileState *compst, TTree *tree, int opt,
793 const Charset *fl) { 798 const Charset *fl) {
794 Charset st; 799 Charset st;
795 if (tocharset(tree, &st)) { 800 if (tocharset(tree, &st)) {
796 addinstruction(compst, ISpan, 0); 801 int i = addinstruction(compst, ISpan, 0);
797 addcharset(compst, st.cs); 802 addcharset(compst, i, st.cs);
798 } 803 }
799 else { 804 else {
800 int e1 = getfirst(tree, fullset, &st); 805 int e1 = getfirst(tree, fullset, &st);
diff --git a/lpprint.c b/lpprint.c
index 1b5c488..ed52328 100644
--- a/lpprint.c
+++ b/lpprint.c
@@ -33,6 +33,18 @@ void printcharset (const byte *st) {
33} 33}
34 34
35 35
36static void printIcharset (const Instruction *inst, const byte *buff) {
37 byte cs[CHARSETSIZE];
38 int i;
39 loopset(j, cs[j] = 0);
40 for (i = 0; i < CHARSETSIZE << 3; i++) {
41 if (charinset(inst, buff, i))
42 setchar(cs, i);
43 }
44 printcharset(cs);
45}
46
47
36static const char *capkind (int kind) { 48static const char *capkind (int kind) {
37 const char *const modes[] = { 49 const char *const modes[] = {
38 "close", "position", "constant", "backref", 50 "close", "position", "constant", "backref",
@@ -83,15 +95,15 @@ void printinst (const Instruction *op, const Instruction *p) {
83 break; 95 break;
84 } 96 }
85 case ISet: { 97 case ISet: {
86 printcharset((p+1)->buff); 98 printIcharset(p, (p+1)->buff);
87 break; 99 break;
88 } 100 }
89 case ITestSet: { 101 case ITestSet: {
90 printcharset((p+2)->buff); printjmp(op, p); 102 printIcharset(p, (p+2)->buff); printjmp(op, p);
91 break; 103 break;
92 } 104 }
93 case ISpan: { 105 case ISpan: {
94 printcharset((p+1)->buff); 106 printIcharset(p, (p+1)->buff);
95 break; 107 break;
96 } 108 }
97 case IOpenCall: { 109 case IOpenCall: {
diff --git a/lptypes.h b/lptypes.h
index 81aa61e..d1cd5be 100644
--- a/lptypes.h
+++ b/lptypes.h
@@ -127,19 +127,19 @@ typedef struct Charset {
127#define MAXPATTSIZE (SHRT_MAX - 10) 127#define MAXPATTSIZE (SHRT_MAX - 10)
128 128
129 129
130/* size (in elements) for an instruction plus extra l bytes */ 130/* size (in instructions) for l bytes (l > 0) */
131#define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1) 131#define instsize(l) (((l) - 1)/sizeof(Instruction) + 1)
132 132
133 133
134/* size (in elements) for a ISet instruction */ 134/* size (in elements) for a ISet instruction */
135#define CHARSETINSTSIZE instsize(CHARSETSIZE) 135#define CHARSETINSTSIZE (1 + instsize(CHARSETSIZE))
136 136
137/* size (in elements) for a IFunc instruction */ 137/* size (in elements) for a IFunc instruction */
138#define funcinstsize(p) ((p)->i.aux + 2) 138#define funcinstsize(p) ((p)->i.aux + 2)
139 139
140 140
141 141
142#define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7)))) 142#define testchar(st,c) ((((unsigned int)(st)[((c) >> 3)]) >> ((c) & 7)) & 1)
143 143
144 144
145#endif 145#endif
diff --git a/lpvm.c b/lpvm.c
index 9ee00a7..f0bb1e7 100644
--- a/lpvm.c
+++ b/lpvm.c
@@ -23,6 +23,16 @@
23static const Instruction giveup = {{IGiveup, 0, {0}}}; 23static const Instruction giveup = {{IGiveup, 0, {0}}};
24 24
25 25
26int charinset (const Instruction *i, const byte *buff, unsigned int c) {
27 c -= (unsigned int)i->i.aux2.set.offset;
28 if (c >= ((unsigned int)i->i.aux2.set.size /* size in instructions... */
29 * (unsigned int)sizeof(Instruction) /* in bytes... */
30 * 8u)) /* in bits */
31 return i->i.aux1; /* out of range */
32 return (testchar(buff, c) != i->i.aux1);
33}
34
35
26/* 36/*
27** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid. 37** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
28*/ 38*/
@@ -259,16 +269,16 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
259 continue; 269 continue;
260 } 270 }
261 case ISet: { 271 case ISet: {
262 int c = (byte)*s; 272 unsigned int c = (byte)*s;
263 if (testchar((p+1)->buff, c) && s < e) 273 if (charinset(p, (p+1)->buff, c) && s < e)
264 { p += CHARSETINSTSIZE; s++; } 274 { p += 1 + p->i.aux2.set.size; s++; }
265 else goto fail; 275 else goto fail;
266 continue; 276 continue;
267 } 277 }
268 case ITestSet: { 278 case ITestSet: {
269 int c = (byte)*s; 279 unsigned int c = (byte)*s;
270 if (testchar((p + 2)->buff, c) && s < e) 280 if (charinset(p, (p + 2)->buff, c) && s < e)
271 p += 1 + CHARSETINSTSIZE; 281 p += 2 + p->i.aux2.set.size;
272 else p += getoffset(p); 282 else p += getoffset(p);
273 continue; 283 continue;
274 } 284 }
@@ -280,10 +290,10 @@ const char *match (lua_State *L, const char *o, const char *s, const char *e,
280 } 290 }
281 case ISpan: { 291 case ISpan: {
282 for (; s < e; s++) { 292 for (; s < e; s++) {
283 int c = (byte)*s; 293 unsigned int c = (byte)*s;
284 if (!testchar((p+1)->buff, c)) break; 294 if (!charinset(p, (p+1)->buff, c)) break;
285 } 295 }
286 p += CHARSETINSTSIZE; 296 p += 1 + p->i.aux2.set.size;
287 continue; 297 continue;
288 } 298 }
289 case IJmp: { 299 case IJmp: {
diff --git a/lpvm.h b/lpvm.h
index 607bf48..c02e943 100644
--- a/lpvm.h
+++ b/lpvm.h
@@ -5,15 +5,22 @@
5#include "lpcap.h" 5#include "lpcap.h"
6 6
7 7
8/*
9** About Character sets in instructions: a set is a bit map with an
10** initial offset, in bits, and a size, in number of instructions. If
11** aux1 is one, set is inverted (bit == 1 means char is not in set).
12*/
13
14
8/* Virtual Machine's instructions */ 15/* Virtual Machine's instructions */
9typedef enum Opcode { 16typedef enum Opcode {
10 IAny, /* if no char, fail */ 17 IAny, /* if no char, fail */
11 IChar, /* if char != aux1, fail */ 18 IChar, /* if char != aux1, fail */
12 ISet, /* if char not in buff, fail */ 19 ISet, /* if char not in set, fail */
13 ITestAny, /* in no char, jump to 'offset' */ 20 ITestAny, /* in no char, jump to 'offset' */
14 ITestChar, /* if char != aux1, jump to 'offset' */ 21 ITestChar, /* if char != aux1, jump to 'offset' */
15 ITestSet, /* if char not in buff, jump to 'offset' */ 22 ITestSet, /* if char not in set, jump to 'offset' */
16 ISpan, /* read a span of chars in buff */ 23 ISpan, /* read a span of chars in set */
17 IUTFR, /* if codepoint not in range [offset, utf_to], fail */ 24 IUTFR, /* if codepoint not in range [offset, utf_to], fail */
18 IBehind, /* walk back 'aux1' characters (fail if not possible) */ 25 IBehind, /* walk back 'aux1' characters (fail if not possible) */
19 IRet, /* return from a rule */ 26 IRet, /* return from a rule */
@@ -43,6 +50,10 @@ typedef union Instruction {
43 byte aux1; 50 byte aux1;
44 union { 51 union {
45 short key; 52 short key;
53 struct {
54 byte offset;
55 byte size;
56 } set;
46 } aux2; 57 } aux2;
47 } i; 58 } i;
48 int offset; 59 int offset;
@@ -54,6 +65,7 @@ typedef union Instruction {
54#define utf_to(inst) (((inst)->i.aux2.key << 8) | (inst)->i.aux1) 65#define utf_to(inst) (((inst)->i.aux2.key << 8) | (inst)->i.aux1)
55 66
56 67
68int charinset (const Instruction *i, const byte *buff, unsigned int c);
57void printpatt (Instruction *p, int n); 69void printpatt (Instruction *p, int n);
58const char *match (lua_State *L, const char *o, const char *s, const char *e, 70const char *match (lua_State *L, const char *o, const char *s, const char *e,
59 Instruction *op, Capture *capture, int ptop); 71 Instruction *op, Capture *capture, int ptop);
diff --git a/test.lua b/test.lua
index 439b6cb..d76e3f1 100755
--- a/test.lua
+++ b/test.lua
@@ -117,6 +117,8 @@ eqcharset(m.S"\1\0\2", m.R"\0\2")
117eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") 117eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0")
118eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") 118eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2")
119 119
120eqcharset(m.S("\0\255"), m.P"\0" + "\255") -- charset extremes
121
120local word = alpha^1 * (1 - alpha)^0 122local word = alpha^1 * (1 - alpha)^0
121 123
122assert((word^0 * -1):match"alo alo") 124assert((word^0 * -1):match"alo alo")