diff options
author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-26 13:36:34 -0300 |
---|---|---|
committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-26 13:36:34 -0300 |
commit | 3403b0c7256435560b63f828da92026c5d4c898b (patch) | |
tree | ca6d5753f55fb2d7b6c85cedfe332e03033190a7 /lpcode.c | |
parent | def10e7c009f71f99d6a11171d84fc27568f9b81 (diff) | |
download | lpeg-3403b0c7256435560b63f828da92026c5d4c898b.tar.gz lpeg-3403b0c7256435560b63f828da92026c5d4c898b.tar.bz2 lpeg-3403b0c7256435560b63f828da92026c5d4c898b.zip |
New module 'lpcset'
For code related to compact sets.
Diffstat (limited to 'lpcode.c')
-rw-r--r-- | lpcode.c | 81 |
1 files changed, 6 insertions, 75 deletions
@@ -7,6 +7,7 @@ | |||
7 | 7 | ||
8 | #include "lptypes.h" | 8 | #include "lptypes.h" |
9 | #include "lpcode.h" | 9 | #include "lpcode.h" |
10 | #include "lpcset.h" | ||
10 | 11 | ||
11 | 12 | ||
12 | /* signals a "no-instruction" */ | 13 | /* signals a "no-instruction" */ |
@@ -31,70 +32,6 @@ static const Charset *fullset = &fullset_; | |||
31 | 32 | ||
32 | 33 | ||
33 | /* | 34 | /* |
34 | ** Add to 'c' the index of the (only) bit set in byte 'b' | ||
35 | */ | ||
36 | static int onlybit (int c, int b) { | ||
37 | if ((b & 0xF0) != 0) { c += 4; b >>= 4; } | ||
38 | if ((b & 0x0C) != 0) { c += 2; b >>= 2; } | ||
39 | if ((b & 0x02) != 0) { c += 1; } | ||
40 | return c; | ||
41 | } | ||
42 | |||
43 | |||
44 | /* | ||
45 | ** Extra information for the result of 'charsettype'. When result is | ||
46 | ** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the | ||
47 | ** offset (in bytes), 'size' is the size (in bytes), and | ||
48 | ** 'deflt' is the default value for bytes outside the set. | ||
49 | */ | ||
50 | typedef struct { | ||
51 | int aux1; | ||
52 | int size; | ||
53 | int deflt; | ||
54 | } charsetinfo; | ||
55 | |||
56 | /* | ||
57 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | ||
58 | ** full (IAny), or none of those (ISet). When singleton, 'info->aux1' | ||
59 | ** returns which character it is. When generic set, 'info' returns | ||
60 | ** information about its range. | ||
61 | */ | ||
62 | static Opcode charsettype (const byte *cs, charsetinfo *info) { | ||
63 | int low0, low1, high0, high1; | ||
64 | for (low1 = 0; low1 < CHARSETSIZE && cs[low1] == 0; low1++) | ||
65 | /* find lowest byte with a 1-bit */; | ||
66 | if (low1 == CHARSETSIZE) | ||
67 | return IFail; /* no characters in set */ | ||
68 | for (high1 = CHARSETSIZE - 1; cs[high1] == 0; high1--) | ||
69 | /* find highest byte with a 1-bit; low1 is a sentinel */; | ||
70 | if (low1 == high1) { /* only one byte with 1-bits? */ | ||
71 | int b = cs[low1]; | ||
72 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ | ||
73 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | ||
74 | return IChar; /* single character */ | ||
75 | } | ||
76 | } | ||
77 | for (low0 = 0; low0 < CHARSETSIZE && cs[low0] == 0xFF; low0++) | ||
78 | /* find lowest byte with a 0-bit */; | ||
79 | if (low0 == CHARSETSIZE) | ||
80 | return IAny; /* set has all bits set */ | ||
81 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) | ||
82 | /* find highest byte with a 0-bit; low0 is a sentinel */; | ||
83 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ | ||
84 | info->aux1 = low1; | ||
85 | info->size = high1 - low1 + 1; | ||
86 | info->deflt = 0; /* all discharged bits were 0 */ | ||
87 | } | ||
88 | else { | ||
89 | info->aux1 = low0; | ||
90 | info->size = high0 - low0 + 1; | ||
91 | info->deflt = 0xFF; /* all discharged bits were 1 */ | ||
92 | } | ||
93 | return ISet; | ||
94 | } | ||
95 | |||
96 | |||
97 | /* | ||
98 | ** A few basic operations on Charsets | 35 | ** A few basic operations on Charsets |
99 | */ | 36 | */ |
100 | static void cs_complement (Charset *cs) { | 37 | static void cs_complement (Charset *cs) { |
@@ -617,11 +554,9 @@ static void addcharset (CompileState *compst, int inst, const byte *cs, | |||
617 | I->i.aux2.set.size = isize; | 554 | I->i.aux2.set.size = isize; |
618 | I->i.aux1 = info->deflt; | 555 | I->i.aux1 = info->deflt; |
619 | p = nextinstruction(compst, isize); /* space for charset */ | 556 | p = nextinstruction(compst, isize); /* space for charset */ |
620 | charset = getinstr(compst, p).buff; /* previous loop may reallocate things */ | 557 | charset = getinstr(compst, p).buff; /* charset buffer */ |
621 | for (i = 0; i < info->size; i++) | 558 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) |
622 | charset[i] = cs[i + info->aux1]; /* fill buffer with charset */ | 559 | charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ |
623 | for (; i < isize * (int)sizeof(Instruction); i++) | ||
624 | charset[i] = info->deflt; /* complete the buffer */ | ||
625 | } | 560 | } |
626 | 561 | ||
627 | 562 | ||
@@ -637,12 +572,8 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | |||
637 | return 0; | 572 | return 0; |
638 | else { | 573 | else { |
639 | int i; | 574 | int i; |
640 | for (i = 0; i < info->size; i++) { | 575 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { |
641 | if ((p + 2)->buff[i] != cs[i + info->aux1]) | 576 | if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) |
642 | return 0; | ||
643 | } | ||
644 | for (; i < instsize(info->size) * (int)sizeof(Instruction); i++) { | ||
645 | if ((p + 2)->buff[i] != info->deflt) | ||
646 | return 0; | 577 | return 0; |
647 | } | 578 | } |
648 | } | 579 | } |