diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-27 10:32:39 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-27 10:32:39 -0300 |
commit | 012cf9c86cf91cb8354e229bde335592d41b84b2 (patch) | |
tree | 353f17797b1952eaec231c8e4fd5c21e02daf875 /lpcset.c | |
parent | 3403b0c7256435560b63f828da92026c5d4c898b (diff) | |
download | lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.gz lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.tar.bz2 lpeg-012cf9c86cf91cb8354e229bde335592d41b84b2.zip |
Compact charsets used in trees, too.
Diffstat (limited to 'lpcset.c')
-rw-r--r-- | lpcset.c | 55 |
1 files changed, 48 insertions, 7 deletions
@@ -16,7 +16,7 @@ static int onlybit (int c, int b) { | |||
16 | 16 | ||
17 | /* | 17 | /* |
18 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | 18 | ** Check whether a charset is empty (returns IFail), singleton (IChar), |
19 | ** full (IAny), or none of those (ISet). When singleton, 'info.aux1' | 19 | ** full (IAny), or none of those (ISet). When singleton, 'info.offset' |
20 | ** returns which character it is. When generic set, 'info' returns | 20 | ** returns which character it is. When generic set, 'info' returns |
21 | ** information about its range. | 21 | ** information about its range. |
22 | */ | 22 | */ |
@@ -31,7 +31,7 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
31 | if (low1 == high1) { /* only one byte with 1-bits? */ | 31 | if (low1 == high1) { /* only one byte with 1-bits? */ |
32 | int b = cs[low1]; | 32 | int b = cs[low1]; |
33 | if ((b & (b - 1)) == 0) { /* does byte have only one 1-bit? */ | 33 | if ((b & (b - 1)) == 0) { /* does byte have only one 1-bit? */ |
34 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | 34 | info->offset = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ |
35 | return IChar; /* single character */ | 35 | return IChar; /* single character */ |
36 | } | 36 | } |
37 | } | 37 | } |
@@ -42,15 +42,16 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
42 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) | 42 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) |
43 | /* find highest byte with a 0-bit; low0 is a sentinel */; | 43 | /* find highest byte with a 0-bit; low0 is a sentinel */; |
44 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ | 44 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ |
45 | info->aux1 = low1; | 45 | info->offset = low1; |
46 | info->size = high1 - low1 + 1; | 46 | info->size = high1 - low1 + 1; |
47 | info->deflt = 0; /* all discarded bits were 0 */ | 47 | info->deflt = 0; /* all discarded bits were 0 */ |
48 | } | 48 | } |
49 | else { | 49 | else { |
50 | info->aux1 = low0; | 50 | info->offset = low0; |
51 | info->size = high0 - low0 + 1; | 51 | info->size = high0 - low0 + 1; |
52 | info->deflt = 0xFF; /* all discarded bits were 1 */ | 52 | info->deflt = 0xFF; /* all discarded bits were 1 */ |
53 | } | 53 | } |
54 | info->cs = cs + info->offset; | ||
54 | return ISet; | 55 | return ISet; |
55 | } | 56 | } |
56 | 57 | ||
@@ -60,10 +61,50 @@ Opcode charsettype (const byte *cs, charsetinfo *info) { | |||
60 | ** range, get the byte from the supporting charset (correcting it | 61 | ** range, get the byte from the supporting charset (correcting it |
61 | ** by the offset). Otherwise, return the default for the set. | 62 | ** by the offset). Otherwise, return the default for the set. |
62 | */ | 63 | */ |
63 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, | 64 | byte getbytefromcharset (const charsetinfo *info, int index) { |
64 | int index) { | ||
65 | if (index < info->size) | 65 | if (index < info->size) |
66 | return cs[info->aux1 + index]; | 66 | return info->cs[index]; |
67 | else return info->deflt; | 67 | else return info->deflt; |
68 | } | 68 | } |
69 | 69 | ||
70 | |||
71 | /* | ||
72 | ** If 'tree' is a 'char' pattern (TSet, TChar, TAny, TFalse), convert it | ||
73 | ** into a charset and return 1; else return 0. | ||
74 | */ | ||
75 | int tocharset (TTree *tree, Charset *cs) { | ||
76 | switch (tree->tag) { | ||
77 | case TChar: { /* only one char */ | ||
78 | assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); | ||
79 | loopset(i, cs->cs[i] = 0); /* erase all chars */ | ||
80 | setchar(cs->cs, tree->u.n); /* add that one */ | ||
81 | return 1; | ||
82 | } | ||
83 | case TAny: { | ||
84 | loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ | ||
85 | return 1; | ||
86 | } | ||
87 | case TFalse: { | ||
88 | loopset(i, cs->cs[i] = 0); /* empty set */ | ||
89 | return 1; | ||
90 | } | ||
91 | case TSet: { /* fill set */ | ||
92 | int i; | ||
93 | loopset(j, cs->cs[j] = tree->u.set.deflt); | ||
94 | for (i = 0; i < tree->u.set.size; i++) | ||
95 | cs->cs[tree->u.set.offset + i] = treebuffer(tree)[i]; | ||
96 | return 1; | ||
97 | } | ||
98 | default: return 0; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | |||
103 | void tree2cset (TTree *tree, charsetinfo *info) { | ||
104 | assert(tree->tag == TSet); | ||
105 | info->offset = tree->u.set.offset; | ||
106 | info->size = tree->u.set.size; | ||
107 | info->deflt = tree->u.set.deflt; | ||
108 | info->cs = treebuffer(tree); | ||
109 | } | ||
110 | |||