diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-26 13:36:34 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-26 13:36:34 -0300 |
commit | 3403b0c7256435560b63f828da92026c5d4c898b (patch) | |
tree | ca6d5753f55fb2d7b6c85cedfe332e03033190a7 | |
parent | def10e7c009f71f99d6a11171d84fc27568f9b81 (diff) | |
download | lpeg-3403b0c7256435560b63f828da92026c5d4c898b.tar.gz lpeg-3403b0c7256435560b63f828da92026c5d4c898b.tar.bz2 lpeg-3403b0c7256435560b63f828da92026c5d4c898b.zip |
New module 'lpcset'
For code related to compact sets.
-rw-r--r-- | lpcode.c | 81 | ||||
-rw-r--r-- | lpcset.c | 69 | ||||
-rw-r--r-- | lpcset.h | 25 | ||||
-rw-r--r-- | makefile | 8 |
4 files changed, 104 insertions, 79 deletions
@@ -7,6 +7,7 @@ | |||
7 | 7 | ||
8 | #include "lptypes.h" | 8 | #include "lptypes.h" |
9 | #include "lpcode.h" | 9 | #include "lpcode.h" |
10 | #include "lpcset.h" | ||
10 | 11 | ||
11 | 12 | ||
12 | /* signals a "no-instruction */ | 13 | /* signals a "no-instruction */ |
@@ -31,70 +32,6 @@ static const Charset *fullset = &fullset_; | |||
31 | 32 | ||
32 | 33 | ||
33 | /* | 34 | /* |
34 | ** Add to 'c' the index of the (only) bit set in byte 'b' | ||
35 | */ | ||
36 | static int onlybit (int c, int b) { | ||
37 | if ((b & 0xF0) != 0) { c += 4; b >>= 4; } | ||
38 | if ((b & 0x0C) != 0) { c += 2; b >>= 2; } | ||
39 | if ((b & 0x02) != 0) { c += 1; } | ||
40 | return c; | ||
41 | } | ||
42 | |||
43 | |||
44 | /* | ||
45 | ** Extra information for the result of 'charsettype'. When result is | ||
46 | ** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the | ||
47 | ** offset (in bytes), 'size' is the size (in bytes), and | ||
48 | ** 'delt' is the default value for bytes outside the set. | ||
49 | */ | ||
50 | typedef struct { | ||
51 | int aux1; | ||
52 | int size; | ||
53 | int deflt; | ||
54 | } charsetinfo; | ||
55 | |||
56 | /* | ||
57 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | ||
58 | ** full (IAny), or none of those (ISet). When singleton, 'info.aux1' | ||
59 | ** returns which character it is. When generic set, 'info' returns | ||
60 | ** information about its range. | ||
61 | */ | ||
62 | static Opcode charsettype (const byte *cs, charsetinfo *info) { | ||
63 | int low0, low1, high0, high1; | ||
64 | for (low1 = 0; low1 < CHARSETSIZE && cs[low1] == 0; low1++) | ||
65 | /* find lowest byte with a 1-bit */; | ||
66 | if (low1 == CHARSETSIZE) | ||
67 | return IFail; /* no characters in set */ | ||
68 | for (high1 = CHARSETSIZE - 1; cs[high1] == 0; high1--) | ||
69 | /* find highest byte with a 1-bit; low1 is a sentinel */; | ||
70 | if (low1 == high1) { /* only one byte with 1-bits? */ | ||
71 | int b = cs[low1]; | ||
72 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ | ||
73 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | ||
74 | return IChar; /* single character */ | ||
75 | } | ||
76 | } | ||
77 | for (low0 = 0; low0 < CHARSETSIZE && cs[low0] == 0xFF; low0++) | ||
78 | /* find lowest byte with a 0-bit */; | ||
79 | if (low0 == CHARSETSIZE) | ||
80 | return IAny; /* set has all bits set */ | ||
81 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) | ||
82 | /* find highest byte with a 0-bit; low0 is a sentinel */; | ||
83 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ | ||
84 | info->aux1 = low1; | ||
85 | info->size = high1 - low1 + 1; | ||
86 | info->deflt = 0; /* all discharged bits were 0 */ | ||
87 | } | ||
88 | else { | ||
89 | info->aux1 = low0; | ||
90 | info->size = high0 - low0 + 1; | ||
91 | info->deflt = 0xFF; /* all discharged bits were 1 */ | ||
92 | } | ||
93 | return ISet; | ||
94 | } | ||
95 | |||
96 | |||
97 | /* | ||
98 | ** A few basic operations on Charsets | 35 | ** A few basic operations on Charsets |
99 | */ | 36 | */ |
100 | static void cs_complement (Charset *cs) { | 37 | static void cs_complement (Charset *cs) { |
@@ -617,11 +554,9 @@ static void addcharset (CompileState *compst, int inst, const byte *cs, | |||
617 | I->i.aux2.set.size = isize; | 554 | I->i.aux2.set.size = isize; |
618 | I->i.aux1 = info->deflt; | 555 | I->i.aux1 = info->deflt; |
619 | p = nextinstruction(compst, isize); /* space for charset */ | 556 | p = nextinstruction(compst, isize); /* space for charset */ |
620 | charset = getinstr(compst, p).buff; /* previous loop may reallocate things */ | 557 | charset = getinstr(compst, p).buff; /* charset buffer */ |
621 | for (i = 0; i < info->size; i++) | 558 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) |
622 | charset[i] = cs[i + info->aux1]; /* fill buffer with charset */ | 559 | charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ |
623 | for (; i < isize * (int)sizeof(Instruction); i++) | ||
624 | charset[i] = info->deflt; /* complete the buffer */ | ||
625 | } | 560 | } |
626 | 561 | ||
627 | 562 | ||
@@ -637,12 +572,8 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | |||
637 | return 0; | 572 | return 0; |
638 | else { | 573 | else { |
639 | int i; | 574 | int i; |
640 | for (i = 0; i < info->size; i++) { | 575 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { |
641 | if ((p + 2)->buff[i] != cs[i + info->aux1]) | 576 | if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) |
642 | return 0; | ||
643 | } | ||
644 | for (; i < instsize(info->size) * (int)sizeof(Instruction); i++) { | ||
645 | if ((p + 2)->buff[i] != info->deflt) | ||
646 | return 0; | 577 | return 0; |
647 | } | 578 | } |
648 | } | 579 | } |
diff --git a/lpcset.c b/lpcset.c new file mode 100644 index 0000000..9ecf475 --- /dev/null +++ b/lpcset.c | |||
@@ -0,0 +1,69 @@ | |||
1 | |||
2 | #include "lptypes.h" | ||
3 | #include "lpcset.h" | ||
4 | |||
5 | |||
6 | /* | ||
7 | ** Add to 'c' the index of the (only) bit set in byte 'b' | ||
8 | */ | ||
9 | static int onlybit (int c, int b) { | ||
10 | if ((b & 0xF0) != 0) { c += 4; b >>= 4; } | ||
11 | if ((b & 0x0C) != 0) { c += 2; b >>= 2; } | ||
12 | if ((b & 0x02) != 0) { c += 1; } | ||
13 | return c; | ||
14 | } | ||
15 | |||
16 | |||
17 | /* | ||
18 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | ||
19 | ** full (IAny), or none of those (ISet). When singleton, 'info.aux1' | ||
20 | ** returns which character it is. When generic set, 'info' returns | ||
21 | ** information about its range. | ||
22 | */ | ||
23 | Opcode charsettype (const byte *cs, charsetinfo *info) { | ||
24 | int low0, low1, high0, high1; | ||
25 | for (low1 = 0; low1 < CHARSETSIZE && cs[low1] == 0; low1++) | ||
26 | /* find lowest byte with a 1-bit */; | ||
27 | if (low1 == CHARSETSIZE) | ||
28 | return IFail; /* no characters in set */ | ||
29 | for (high1 = CHARSETSIZE - 1; cs[high1] == 0; high1--) | ||
30 | /* find highest byte with a 1-bit; low1 is a sentinel */; | ||
31 | if (low1 == high1) { /* only one byte with 1-bits? */ | ||
32 | int b = cs[low1]; | ||
33 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ | ||
34 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | ||
35 | return IChar; /* single character */ | ||
36 | } | ||
37 | } | ||
38 | for (low0 = 0; low0 < CHARSETSIZE && cs[low0] == 0xFF; low0++) | ||
39 | /* find lowest byte with a 0-bit */; | ||
40 | if (low0 == CHARSETSIZE) | ||
41 | return IAny; /* set has all bits set */ | ||
42 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) | ||
43 | /* find highest byte with a 0-bit; low0 is a sentinel */; | ||
44 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ | ||
45 | info->aux1 = low1; | ||
46 | info->size = high1 - low1 + 1; | ||
47 | info->deflt = 0; /* all discharged bits were 0 */ | ||
48 | } | ||
49 | else { | ||
50 | info->aux1 = low0; | ||
51 | info->size = high0 - low0 + 1; | ||
52 | info->deflt = 0xFF; /* all discharged bits were 1 */ | ||
53 | } | ||
54 | return ISet; | ||
55 | } | ||
56 | |||
57 | |||
58 | /* | ||
59 | ** Get a byte from a compact charset. If index is inside the charset | ||
60 | ** range, get the byte from the supporting charset (correcting it | ||
61 | ** by the offset). Otherwise, return the default for the set. | ||
62 | */ | ||
63 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, | ||
64 | int index) { | ||
65 | if (index < info->size) | ||
66 | return cs[info->aux1 + index]; | ||
67 | else return info->deflt; | ||
68 | } | ||
69 | |||
diff --git a/lpcset.h b/lpcset.h new file mode 100644 index 0000000..e5152c4 --- /dev/null +++ b/lpcset.h | |||
@@ -0,0 +1,25 @@ | |||
1 | |||
2 | #if !defined(lpset_h) | ||
3 | #define lpset_h | ||
4 | |||
5 | #include "lpcset.h" | ||
6 | #include "lpcode.h" | ||
7 | |||
8 | |||
9 | /* | ||
10 | ** Extra information for the result of 'charsettype'. When result is | ||
11 | ** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the | ||
12 | ** offset (in bytes), 'size' is the size (in bytes), and | ||
13 | ** 'delt' is the default value for bytes outside the set. | ||
14 | */ | ||
15 | typedef struct { | ||
16 | int aux1; | ||
17 | int size; | ||
18 | int deflt; | ||
19 | } charsetinfo; | ||
20 | |||
21 | |||
22 | Opcode charsettype (const byte *cs, charsetinfo *info); | ||
23 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, int index); | ||
24 | |||
25 | #endif | ||
@@ -1,7 +1,7 @@ | |||
1 | LIBNAME = lpeg | 1 | LIBNAME = lpeg |
2 | LUADIR = ../lua/ | 2 | LUADIR = ./lua/ |
3 | 3 | ||
4 | # COPT = -O2 -DNDEBUG | 4 | COPT = -O2 -DNDEBUG |
5 | # COPT = -g | 5 | # COPT = -g |
6 | # COPT = -DLPEG_DEBUG | 6 | # COPT = -DLPEG_DEBUG |
7 | 7 | ||
@@ -26,7 +26,7 @@ CWARNS = -Wall -Wextra -pedantic \ | |||
26 | CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC | 26 | CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC |
27 | CC = gcc | 27 | CC = gcc |
28 | 28 | ||
29 | FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o | 29 | FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o lpcset.o |
30 | 30 | ||
31 | # For Linux | 31 | # For Linux |
32 | linux: | 32 | linux: |
@@ -53,4 +53,4 @@ lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h | |||
53 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h | 53 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h |
54 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h | 54 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h |
55 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h | 55 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h |
56 | 56 | lpcset.o: lpcset.c lpcset.h lptypes.h | |