diff options
| author | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-26 13:36:34 -0300 |
|---|---|---|
| committer | Roberto Ierusalimschy <roberto@inf.puc-rio.br> | 2023-04-26 13:36:34 -0300 |
| commit | 3403b0c7256435560b63f828da92026c5d4c898b (patch) | |
| tree | ca6d5753f55fb2d7b6c85cedfe332e03033190a7 | |
| parent | def10e7c009f71f99d6a11171d84fc27568f9b81 (diff) | |
| download | lpeg-3403b0c7256435560b63f828da92026c5d4c898b.tar.gz lpeg-3403b0c7256435560b63f828da92026c5d4c898b.tar.bz2 lpeg-3403b0c7256435560b63f828da92026c5d4c898b.zip | |
New module 'lpcset'
For code related to compact sets.
| -rw-r--r-- | lpcode.c | 81 | ||||
| -rw-r--r-- | lpcset.c | 69 | ||||
| -rw-r--r-- | lpcset.h | 25 | ||||
| -rw-r--r-- | makefile | 8 |
4 files changed, 104 insertions, 79 deletions
| @@ -7,6 +7,7 @@ | |||
| 7 | 7 | ||
| 8 | #include "lptypes.h" | 8 | #include "lptypes.h" |
| 9 | #include "lpcode.h" | 9 | #include "lpcode.h" |
| 10 | #include "lpcset.h" | ||
| 10 | 11 | ||
| 11 | 12 | ||
| 12 | /* signals a "no-instruction */ | 13 | /* signals a "no-instruction */ |
| @@ -31,70 +32,6 @@ static const Charset *fullset = &fullset_; | |||
| 31 | 32 | ||
| 32 | 33 | ||
| 33 | /* | 34 | /* |
| 34 | ** Add to 'c' the index of the (only) bit set in byte 'b' | ||
| 35 | */ | ||
| 36 | static int onlybit (int c, int b) { | ||
| 37 | if ((b & 0xF0) != 0) { c += 4; b >>= 4; } | ||
| 38 | if ((b & 0x0C) != 0) { c += 2; b >>= 2; } | ||
| 39 | if ((b & 0x02) != 0) { c += 1; } | ||
| 40 | return c; | ||
| 41 | } | ||
| 42 | |||
| 43 | |||
| 44 | /* | ||
| 45 | ** Extra information for the result of 'charsettype'. When result is | ||
| 46 | ** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the | ||
| 47 | ** offset (in bytes), 'size' is the size (in bytes), and | ||
| 48 | ** 'delt' is the default value for bytes outside the set. | ||
| 49 | */ | ||
| 50 | typedef struct { | ||
| 51 | int aux1; | ||
| 52 | int size; | ||
| 53 | int deflt; | ||
| 54 | } charsetinfo; | ||
| 55 | |||
| 56 | /* | ||
| 57 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | ||
| 58 | ** full (IAny), or none of those (ISet). When singleton, 'info.aux1' | ||
| 59 | ** returns which character it is. When generic set, 'info' returns | ||
| 60 | ** information about its range. | ||
| 61 | */ | ||
| 62 | static Opcode charsettype (const byte *cs, charsetinfo *info) { | ||
| 63 | int low0, low1, high0, high1; | ||
| 64 | for (low1 = 0; low1 < CHARSETSIZE && cs[low1] == 0; low1++) | ||
| 65 | /* find lowest byte with a 1-bit */; | ||
| 66 | if (low1 == CHARSETSIZE) | ||
| 67 | return IFail; /* no characters in set */ | ||
| 68 | for (high1 = CHARSETSIZE - 1; cs[high1] == 0; high1--) | ||
| 69 | /* find highest byte with a 1-bit; low1 is a sentinel */; | ||
| 70 | if (low1 == high1) { /* only one byte with 1-bits? */ | ||
| 71 | int b = cs[low1]; | ||
| 72 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ | ||
| 73 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | ||
| 74 | return IChar; /* single character */ | ||
| 75 | } | ||
| 76 | } | ||
| 77 | for (low0 = 0; low0 < CHARSETSIZE && cs[low0] == 0xFF; low0++) | ||
| 78 | /* find lowest byte with a 0-bit */; | ||
| 79 | if (low0 == CHARSETSIZE) | ||
| 80 | return IAny; /* set has all bits set */ | ||
| 81 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) | ||
| 82 | /* find highest byte with a 0-bit; low0 is a sentinel */; | ||
| 83 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ | ||
| 84 | info->aux1 = low1; | ||
| 85 | info->size = high1 - low1 + 1; | ||
| 86 | info->deflt = 0; /* all discharged bits were 0 */ | ||
| 87 | } | ||
| 88 | else { | ||
| 89 | info->aux1 = low0; | ||
| 90 | info->size = high0 - low0 + 1; | ||
| 91 | info->deflt = 0xFF; /* all discharged bits were 1 */ | ||
| 92 | } | ||
| 93 | return ISet; | ||
| 94 | } | ||
| 95 | |||
| 96 | |||
| 97 | /* | ||
| 98 | ** A few basic operations on Charsets | 35 | ** A few basic operations on Charsets |
| 99 | */ | 36 | */ |
| 100 | static void cs_complement (Charset *cs) { | 37 | static void cs_complement (Charset *cs) { |
| @@ -617,11 +554,9 @@ static void addcharset (CompileState *compst, int inst, const byte *cs, | |||
| 617 | I->i.aux2.set.size = isize; | 554 | I->i.aux2.set.size = isize; |
| 618 | I->i.aux1 = info->deflt; | 555 | I->i.aux1 = info->deflt; |
| 619 | p = nextinstruction(compst, isize); /* space for charset */ | 556 | p = nextinstruction(compst, isize); /* space for charset */ |
| 620 | charset = getinstr(compst, p).buff; /* previous loop may reallocate things */ | 557 | charset = getinstr(compst, p).buff; /* charset buffer */ |
| 621 | for (i = 0; i < info->size; i++) | 558 | for (i = 0; i < isize * (int)sizeof(Instruction); i++) |
| 622 | charset[i] = cs[i + info->aux1]; /* fill buffer with charset */ | 559 | charset[i] = getbytefromcharset(cs, info, i); /* fill the buffer */ |
| 623 | for (; i < isize * (int)sizeof(Instruction); i++) | ||
| 624 | charset[i] = info->deflt; /* complete the buffer */ | ||
| 625 | } | 560 | } |
| 626 | 561 | ||
| 627 | 562 | ||
| @@ -637,12 +572,8 @@ static int cs_equal (Instruction *p, const byte *cs, charsetinfo *info) { | |||
| 637 | return 0; | 572 | return 0; |
| 638 | else { | 573 | else { |
| 639 | int i; | 574 | int i; |
| 640 | for (i = 0; i < info->size; i++) { | 575 | for (i = 0; i < instsize(info->size) * (int)sizeof(Instruction); i++) { |
| 641 | if ((p + 2)->buff[i] != cs[i + info->aux1]) | 576 | if ((p + 2)->buff[i] != getbytefromcharset(cs, info, i)) |
| 642 | return 0; | ||
| 643 | } | ||
| 644 | for (; i < instsize(info->size) * (int)sizeof(Instruction); i++) { | ||
| 645 | if ((p + 2)->buff[i] != info->deflt) | ||
| 646 | return 0; | 577 | return 0; |
| 647 | } | 578 | } |
| 648 | } | 579 | } |
diff --git a/lpcset.c b/lpcset.c new file mode 100644 index 0000000..9ecf475 --- /dev/null +++ b/lpcset.c | |||
| @@ -0,0 +1,69 @@ | |||
| 1 | |||
| 2 | #include "lptypes.h" | ||
| 3 | #include "lpcset.h" | ||
| 4 | |||
| 5 | |||
| 6 | /* | ||
| 7 | ** Add to 'c' the index of the (only) bit set in byte 'b' | ||
| 8 | */ | ||
| 9 | static int onlybit (int c, int b) { | ||
| 10 | if ((b & 0xF0) != 0) { c += 4; b >>= 4; } | ||
| 11 | if ((b & 0x0C) != 0) { c += 2; b >>= 2; } | ||
| 12 | if ((b & 0x02) != 0) { c += 1; } | ||
| 13 | return c; | ||
| 14 | } | ||
| 15 | |||
| 16 | |||
| 17 | /* | ||
| 18 | ** Check whether a charset is empty (returns IFail), singleton (IChar), | ||
| 19 | ** full (IAny), or none of those (ISet). When singleton, 'info.aux1' | ||
| 20 | ** returns which character it is. When generic set, 'info' returns | ||
| 21 | ** information about its range. | ||
| 22 | */ | ||
| 23 | Opcode charsettype (const byte *cs, charsetinfo *info) { | ||
| 24 | int low0, low1, high0, high1; | ||
| 25 | for (low1 = 0; low1 < CHARSETSIZE && cs[low1] == 0; low1++) | ||
| 26 | /* find lowest byte with a 1-bit */; | ||
| 27 | if (low1 == CHARSETSIZE) | ||
| 28 | return IFail; /* no characters in set */ | ||
| 29 | for (high1 = CHARSETSIZE - 1; cs[high1] == 0; high1--) | ||
| 30 | /* find highest byte with a 1-bit; low1 is a sentinel */; | ||
| 31 | if (low1 == high1) { /* only one byte with 1-bits? */ | ||
| 32 | int b = cs[low1]; | ||
| 33 | if ((b & (b - 1)) == 0) { /* does byte has only one 1-bit? */ | ||
| 34 | info->aux1 = onlybit(low1 * BITSPERCHAR, b); /* get that bit */ | ||
| 35 | return IChar; /* single character */ | ||
| 36 | } | ||
| 37 | } | ||
| 38 | for (low0 = 0; low0 < CHARSETSIZE && cs[low0] == 0xFF; low0++) | ||
| 39 | /* find lowest byte with a 0-bit */; | ||
| 40 | if (low0 == CHARSETSIZE) | ||
| 41 | return IAny; /* set has all bits set */ | ||
| 42 | for (high0 = CHARSETSIZE - 1; cs[high0] == 0xFF; high0--) | ||
| 43 | /* find highest byte with a 0-bit; low0 is a sentinel */; | ||
| 44 | if (high1 - low1 <= high0 - low0) { /* range of 1s smaller than of 0s? */ | ||
| 45 | info->aux1 = low1; | ||
| 46 | info->size = high1 - low1 + 1; | ||
| 47 | info->deflt = 0; /* all discharged bits were 0 */ | ||
| 48 | } | ||
| 49 | else { | ||
| 50 | info->aux1 = low0; | ||
| 51 | info->size = high0 - low0 + 1; | ||
| 52 | info->deflt = 0xFF; /* all discharged bits were 1 */ | ||
| 53 | } | ||
| 54 | return ISet; | ||
| 55 | } | ||
| 56 | |||
| 57 | |||
| 58 | /* | ||
| 59 | ** Get a byte from a compact charset. If index is inside the charset | ||
| 60 | ** range, get the byte from the supporting charset (correcting it | ||
| 61 | ** by the offset). Otherwise, return the default for the set. | ||
| 62 | */ | ||
| 63 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, | ||
| 64 | int index) { | ||
| 65 | if (index < info->size) | ||
| 66 | return cs[info->aux1 + index]; | ||
| 67 | else return info->deflt; | ||
| 68 | } | ||
| 69 | |||
diff --git a/lpcset.h b/lpcset.h new file mode 100644 index 0000000..e5152c4 --- /dev/null +++ b/lpcset.h | |||
| @@ -0,0 +1,25 @@ | |||
| 1 | |||
| 2 | #if !defined(lpset_h) | ||
| 3 | #define lpset_h | ||
| 4 | |||
| 5 | #include "lpcset.h" | ||
| 6 | #include "lpcode.h" | ||
| 7 | |||
| 8 | |||
| 9 | /* | ||
| 10 | ** Extra information for the result of 'charsettype'. When result is | ||
| 11 | ** IChar, 'aux1' is the character. When result is ISet, 'aux1' is the | ||
| 12 | ** offset (in bytes), 'size' is the size (in bytes), and | ||
| 13 | ** 'delt' is the default value for bytes outside the set. | ||
| 14 | */ | ||
| 15 | typedef struct { | ||
| 16 | int aux1; | ||
| 17 | int size; | ||
| 18 | int deflt; | ||
| 19 | } charsetinfo; | ||
| 20 | |||
| 21 | |||
| 22 | Opcode charsettype (const byte *cs, charsetinfo *info); | ||
| 23 | byte getbytefromcharset (const byte *cs, const charsetinfo *info, int index); | ||
| 24 | |||
| 25 | #endif | ||
| @@ -1,7 +1,7 @@ | |||
| 1 | LIBNAME = lpeg | 1 | LIBNAME = lpeg |
| 2 | LUADIR = ../lua/ | 2 | LUADIR = ./lua/ |
| 3 | 3 | ||
| 4 | # COPT = -O2 -DNDEBUG | 4 | COPT = -O2 -DNDEBUG |
| 5 | # COPT = -g | 5 | # COPT = -g |
| 6 | # COPT = -DLPEG_DEBUG | 6 | # COPT = -DLPEG_DEBUG |
| 7 | 7 | ||
| @@ -26,7 +26,7 @@ CWARNS = -Wall -Wextra -pedantic \ | |||
| 26 | CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC | 26 | CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC |
| 27 | CC = gcc | 27 | CC = gcc |
| 28 | 28 | ||
| 29 | FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o | 29 | FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o lpcset.o |
| 30 | 30 | ||
| 31 | # For Linux | 31 | # For Linux |
| 32 | linux: | 32 | linux: |
| @@ -53,4 +53,4 @@ lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h | |||
| 53 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h | 53 | lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h |
| 54 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h | 54 | lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h |
| 55 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h | 55 | lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h |
| 56 | 56 | lpcset.o: lpcset.c lpcset.h lptypes.h | |
