diff options
Diffstat (limited to 'src/lj_str.c')
-rw-r--r-- | src/lj_str.c | 358 |
1 files changed, 265 insertions, 93 deletions
diff --git a/src/lj_str.c b/src/lj_str.c index 0253c15e..5bf8426c 100644 --- a/src/lj_str.c +++ b/src/lj_str.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include "lj_err.h" | 11 | #include "lj_err.h" |
12 | #include "lj_str.h" | 12 | #include "lj_str.h" |
13 | #include "lj_char.h" | 13 | #include "lj_char.h" |
14 | #include "lj_prng.h" | ||
14 | 15 | ||
15 | /* -- String helpers ------------------------------------------------------ */ | 16 | /* -- String helpers ------------------------------------------------------ */ |
16 | 17 | ||
@@ -37,28 +38,6 @@ int32_t LJ_FASTCALL lj_str_cmp(GCstr *a, GCstr *b) | |||
37 | return (int32_t)(a->len - b->len); | 38 | return (int32_t)(a->len - b->len); |
38 | } | 39 | } |
39 | 40 | ||
40 | /* Fast string data comparison. Caveat: unaligned access to 1st string! */ | ||
41 | static LJ_AINLINE int str_fastcmp(const char *a, const char *b, MSize len) | ||
42 | { | ||
43 | MSize i = 0; | ||
44 | lj_assertX(len > 0, "fast string compare with zero length"); | ||
45 | lj_assertX((((uintptr_t)a+len-1) & (LJ_PAGESIZE-1)) <= LJ_PAGESIZE-4, | ||
46 | "fast string compare crossing page boundary"); | ||
47 | do { /* Note: innocuous access up to end of string + 3. */ | ||
48 | uint32_t v = lj_getu32(a+i) ^ *(const uint32_t *)(b+i); | ||
49 | if (v) { | ||
50 | i -= len; | ||
51 | #if LJ_LE | ||
52 | return (int32_t)i >= -3 ? (v << (32+(i<<3))) : 1; | ||
53 | #else | ||
54 | return (int32_t)i >= -3 ? (v >> (32+(i<<3))) : 1; | ||
55 | #endif | ||
56 | } | ||
57 | i += 4; | ||
58 | } while (i < len); | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | /* Find fixed string p inside string s. */ | 41 | /* Find fixed string p inside string s. */ |
63 | const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen) | 42 | const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen) |
64 | { | 43 | { |
@@ -91,108 +70,301 @@ int lj_str_haspattern(GCstr *s) | |||
91 | return 0; /* No pattern matching chars found. */ | 70 | return 0; /* No pattern matching chars found. */ |
92 | } | 71 | } |
93 | 72 | ||
94 | /* -- String interning ---------------------------------------------------- */ | 73 | /* -- String hashing ------------------------------------------------------ */ |
95 | |||
96 | /* Resize the string hash table (grow and shrink). */ | ||
97 | void lj_str_resize(lua_State *L, MSize newmask) | ||
98 | { | ||
99 | global_State *g = G(L); | ||
100 | GCRef *newhash; | ||
101 | MSize i; | ||
102 | if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1) | ||
103 | return; /* No resizing during GC traversal or if already too big. */ | ||
104 | newhash = lj_mem_newvec(L, newmask+1, GCRef); | ||
105 | memset(newhash, 0, (newmask+1)*sizeof(GCRef)); | ||
106 | for (i = g->strmask; i != ~(MSize)0; i--) { /* Rehash old table. */ | ||
107 | GCobj *p = gcref(g->strhash[i]); | ||
108 | while (p) { /* Follow each hash chain and reinsert all strings. */ | ||
109 | MSize h = gco2str(p)->hash & newmask; | ||
110 | GCobj *next = gcnext(p); | ||
111 | /* NOBARRIER: The string table is a GC root. */ | ||
112 | setgcrefr(p->gch.nextgc, newhash[h]); | ||
113 | setgcref(newhash[h], p); | ||
114 | p = next; | ||
115 | } | ||
116 | } | ||
117 | lj_mem_freevec(g, g->strhash, g->strmask+1, GCRef); | ||
118 | g->strmask = newmask; | ||
119 | g->strhash = newhash; | ||
120 | } | ||
121 | 74 | ||
122 | /* Intern a string and return string object. */ | 75 | /* Keyed sparse ARX string hash. Constant time. */ |
123 | GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx) | 76 | static StrHash hash_sparse(uint64_t seed, const char *str, MSize len) |
124 | { | 77 | { |
125 | global_State *g; | 78 | /* Constants taken from lookup3 hash by Bob Jenkins. */ |
126 | GCstr *s; | 79 | StrHash a, b, h = len ^ (StrHash)seed; |
127 | GCobj *o; | ||
128 | MSize len = (MSize)lenx; | ||
129 | MSize a, b, h = len; | ||
130 | if (lenx >= LJ_MAX_STR) | ||
131 | lj_err_msg(L, LJ_ERR_STROV); | ||
132 | g = G(L); | ||
133 | /* Compute string hash. Constants taken from lookup3 hash by Bob Jenkins. */ | ||
134 | if (len >= 4) { /* Caveat: unaligned access! */ | 80 | if (len >= 4) { /* Caveat: unaligned access! */ |
135 | a = lj_getu32(str); | 81 | a = lj_getu32(str); |
136 | h ^= lj_getu32(str+len-4); | 82 | h ^= lj_getu32(str+len-4); |
137 | b = lj_getu32(str+(len>>1)-2); | 83 | b = lj_getu32(str+(len>>1)-2); |
138 | h ^= b; h -= lj_rol(b, 14); | 84 | h ^= b; h -= lj_rol(b, 14); |
139 | b += lj_getu32(str+(len>>2)-1); | 85 | b += lj_getu32(str+(len>>2)-1); |
140 | } else if (len > 0) { | 86 | } else { |
141 | a = *(const uint8_t *)str; | 87 | a = *(const uint8_t *)str; |
142 | h ^= *(const uint8_t *)(str+len-1); | 88 | h ^= *(const uint8_t *)(str+len-1); |
143 | b = *(const uint8_t *)(str+(len>>1)); | 89 | b = *(const uint8_t *)(str+(len>>1)); |
144 | h ^= b; h -= lj_rol(b, 14); | 90 | h ^= b; h -= lj_rol(b, 14); |
145 | } else { | ||
146 | return &g->strempty; | ||
147 | } | 91 | } |
148 | a ^= h; a -= lj_rol(h, 11); | 92 | a ^= h; a -= lj_rol(h, 11); |
149 | b ^= a; b -= lj_rol(a, 25); | 93 | b ^= a; b -= lj_rol(a, 25); |
150 | h ^= b; h -= lj_rol(b, 16); | 94 | h ^= b; h -= lj_rol(b, 16); |
151 | /* Check if the string has already been interned. */ | 95 | return h; |
152 | o = gcref(g->strhash[h & g->strmask]); | 96 | } |
153 | if (LJ_LIKELY((((uintptr_t)str+len-1) & (LJ_PAGESIZE-1)) <= LJ_PAGESIZE-4)) { | 97 | |
154 | while (o != NULL) { | 98 | #if LUAJIT_SECURITY_STRHASH |
155 | GCstr *sx = gco2str(o); | 99 | /* Keyed dense ARX string hash. Linear time. */ |
156 | if (sx->len == len && str_fastcmp(str, strdata(sx), len) == 0) { | 100 | static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h, |
157 | /* Resurrect if dead. Can only happen with fixstring() (keywords). */ | 101 | const char *str, MSize len) |
158 | if (isdead(g, o)) flipwhite(o); | 102 | { |
159 | return sx; /* Return existing string. */ | 103 | StrHash b = lj_bswap(lj_rol(h ^ (StrHash)(seed >> 32), 4)); |
104 | if (len > 12) { | ||
105 | StrHash a = (StrHash)seed; | ||
106 | const char *pe = str+len-12, *p = pe, *q = str; | ||
107 | do { | ||
108 | a += lj_getu32(p); | ||
109 | b += lj_getu32(p+4); | ||
110 | h += lj_getu32(p+8); | ||
111 | p = q; q += 12; | ||
112 | h ^= b; h -= lj_rol(b, 14); | ||
113 | a ^= h; a -= lj_rol(h, 11); | ||
114 | b ^= a; b -= lj_rol(a, 25); | ||
115 | } while (p < pe); | ||
116 | h ^= b; h -= lj_rol(b, 16); | ||
117 | a ^= h; a -= lj_rol(h, 4); | ||
118 | b ^= a; b -= lj_rol(a, 14); | ||
119 | } | ||
120 | return b; | ||
121 | } | ||
122 | #endif | ||
123 | |||
124 | /* -- String interning ---------------------------------------------------- */ | ||
125 | |||
126 | #define LJ_STR_MAXCOLL 32 | ||
127 | |||
128 | /* Resize the string interning hash table (grow and shrink). */ | ||
129 | void lj_str_resize(lua_State *L, MSize newmask) | ||
130 | { | ||
131 | global_State *g = G(L); | ||
132 | GCRef *newtab, *oldtab = g->str.tab; | ||
133 | MSize i; | ||
134 | |||
135 | /* No resizing during GC traversal or if already too big. */ | ||
136 | if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1) | ||
137 | return; | ||
138 | |||
139 | newtab = lj_mem_newvec(L, newmask+1, GCRef); | ||
140 | memset(newtab, 0, (newmask+1)*sizeof(GCRef)); | ||
141 | |||
142 | #if LUAJIT_SECURITY_STRHASH | ||
143 | /* Check which chains need secondary hashes. */ | ||
144 | if (g->str.second) { | ||
145 | int newsecond = 0; | ||
146 | /* Compute primary chain lengths. */ | ||
147 | for (i = g->str.mask; i != ~(MSize)0; i--) { | ||
148 | GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1); | ||
149 | while (o) { | ||
150 | GCstr *s = gco2str(o); | ||
151 | MSize hash = s->hashalg ? hash_sparse(g->str.seed, strdata(s), s->len) : | ||
152 | s->hash; | ||
153 | hash &= newmask; | ||
154 | setgcrefp(newtab[hash], gcrefu(newtab[hash]) + 1); | ||
155 | o = gcnext(o); | ||
160 | } | 156 | } |
161 | o = gcnext(o); | ||
162 | } | 157 | } |
163 | } else { /* Slow path: end of string is too close to a page boundary. */ | 158 | /* Mark secondary chains. */ |
164 | while (o != NULL) { | 159 | for (i = newmask; i != ~(MSize)0; i--) { |
165 | GCstr *sx = gco2str(o); | 160 | int secondary = gcrefu(newtab[i]) > LJ_STR_MAXCOLL; |
166 | if (sx->len == len && memcmp(str, strdata(sx), len) == 0) { | 161 | newsecond |= secondary; |
167 | /* Resurrect if dead. Can only happen with fixstring() (keywords). */ | 162 | setgcrefp(newtab[i], secondary); |
168 | if (isdead(g, o)) flipwhite(o); | 163 | } |
169 | return sx; /* Return existing string. */ | 164 | g->str.second = newsecond; |
165 | } | ||
166 | #endif | ||
167 | |||
168 | /* Reinsert all strings from the old table into the new table. */ | ||
169 | for (i = g->str.mask; i != ~(MSize)0; i--) { | ||
170 | GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1); | ||
171 | while (o) { | ||
172 | GCobj *next = gcnext(o); | ||
173 | GCstr *s = gco2str(o); | ||
174 | MSize hash = s->hash; | ||
175 | #if LUAJIT_SECURITY_STRHASH | ||
176 | uintptr_t u; | ||
177 | if (LJ_LIKELY(!s->hashalg)) { /* String hashed with primary hash. */ | ||
178 | hash &= newmask; | ||
179 | u = gcrefu(newtab[hash]); | ||
180 | if (LJ_UNLIKELY(u & 1)) { /* Switch string to secondary hash. */ | ||
181 | s->hash = hash = hash_dense(g->str.seed, s->hash, strdata(s), s->len); | ||
182 | s->hashalg = 1; | ||
183 | hash &= newmask; | ||
184 | u = gcrefu(newtab[hash]); | ||
185 | } | ||
186 | } else { /* String hashed with secondary hash. */ | ||
187 | MSize shash = hash_sparse(g->str.seed, strdata(s), s->len); | ||
188 | u = gcrefu(newtab[shash & newmask]); | ||
189 | if (u & 1) { | ||
190 | hash &= newmask; | ||
191 | u = gcrefu(newtab[hash]); | ||
192 | } else { /* Revert string back to primary hash. */ | ||
193 | s->hash = shash; | ||
194 | s->hashalg = 0; | ||
195 | hash = (shash & newmask); | ||
196 | } | ||
197 | } | ||
198 | /* NOBARRIER: The string table is a GC root. */ | ||
199 | setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1)); | ||
200 | setgcrefp(newtab[hash], ((uintptr_t)o | (u & 1))); | ||
201 | #else | ||
202 | hash &= newmask; | ||
203 | /* NOBARRIER: The string table is a GC root. */ | ||
204 | setgcrefr(o->gch.nextgc, newtab[hash]); | ||
205 | setgcref(newtab[hash], o); | ||
206 | #endif | ||
207 | o = next; | ||
208 | } | ||
209 | } | ||
210 | |||
211 | /* Free old table and replace with new table. */ | ||
212 | lj_str_freetab(g); | ||
213 | g->str.tab = newtab; | ||
214 | g->str.mask = newmask; | ||
215 | } | ||
216 | |||
217 | #if LUAJIT_SECURITY_STRHASH | ||
218 | /* Rehash and rechain all strings in a chain. */ | ||
219 | static LJ_NOINLINE GCstr *lj_str_rehash_chain(lua_State *L, StrHash hashc, | ||
220 | const char *str, MSize len) | ||
221 | { | ||
222 | global_State *g = G(L); | ||
223 | int ow = g->gc.state == GCSsweepstring ? otherwhite(g) : 0; /* Sweeping? */ | ||
224 | GCRef *strtab = g->str.tab; | ||
225 | MSize strmask = g->str.mask; | ||
226 | GCobj *o = gcref(strtab[hashc & strmask]); | ||
227 | setgcrefp(strtab[hashc & strmask], (void *)((uintptr_t)1)); | ||
228 | g->str.second = 1; | ||
229 | while (o) { | ||
230 | uintptr_t u; | ||
231 | GCobj *next = gcnext(o); | ||
232 | GCstr *s = gco2str(o); | ||
233 | StrHash hash; | ||
234 | if (ow) { /* Must sweep while rechaining. */ | ||
235 | if (((o->gch.marked ^ LJ_GC_WHITES) & ow)) { /* String alive? */ | ||
236 | lj_assertG(!isdead(g, o) || (o->gch.marked & LJ_GC_FIXED), | ||
237 | "sweep of undead string"); | ||
238 | makewhite(g, o); | ||
239 | } else { /* Free dead string. */ | ||
240 | lj_assertG(isdead(g, o) || ow == LJ_GC_SFIXED, | ||
241 | "sweep of unlive string"); | ||
242 | lj_str_free(g, s); | ||
243 | o = next; | ||
244 | continue; | ||
170 | } | 245 | } |
171 | o = gcnext(o); | ||
172 | } | 246 | } |
247 | hash = s->hash; | ||
248 | if (!s->hashalg) { /* Rehash with secondary hash. */ | ||
249 | hash = hash_dense(g->str.seed, hash, strdata(s), s->len); | ||
250 | s->hash = hash; | ||
251 | s->hashalg = 1; | ||
252 | } | ||
253 | /* Rechain. */ | ||
254 | hash &= strmask; | ||
255 | u = gcrefu(strtab[hash]); | ||
256 | setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1)); | ||
257 | setgcrefp(strtab[hash], ((uintptr_t)o | (u & 1))); | ||
258 | o = next; | ||
173 | } | 259 | } |
174 | /* Nope, create a new string. */ | 260 | /* Try to insert the pending string again. */ |
175 | s = lj_mem_newt(L, sizeof(GCstr)+len+1, GCstr); | 261 | return lj_str_new(L, str, len); |
262 | } | ||
263 | #endif | ||
264 | |||
265 | /* Reseed String ID from PRNG after random interval < 2^bits. */ | ||
266 | #if LUAJIT_SECURITY_STRID == 1 | ||
267 | #define STRID_RESEED_INTERVAL 8 | ||
268 | #elif LUAJIT_SECURITY_STRID == 2 | ||
269 | #define STRID_RESEED_INTERVAL 4 | ||
270 | #elif LUAJIT_SECURITY_STRID >= 3 | ||
271 | #define STRID_RESEED_INTERVAL 0 | ||
272 | #endif | ||
273 | |||
274 | /* Allocate a new string and add to string interning table. */ | ||
275 | static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len, | ||
276 | StrHash hash, int hashalg) | ||
277 | { | ||
278 | GCstr *s = lj_mem_newt(L, lj_str_size(len), GCstr); | ||
279 | global_State *g = G(L); | ||
280 | uintptr_t u; | ||
176 | newwhite(g, s); | 281 | newwhite(g, s); |
177 | s->gct = ~LJ_TSTR; | 282 | s->gct = ~LJ_TSTR; |
178 | s->len = len; | 283 | s->len = len; |
179 | s->hash = h; | 284 | s->hash = hash; |
285 | #ifndef STRID_RESEED_INTERVAL | ||
286 | s->sid = g->str.id++; | ||
287 | #elif STRID_RESEED_INTERVAL | ||
288 | if (!g->str.idreseed--) { | ||
289 | uint64_t r = lj_prng_u64(&g->prng); | ||
290 | g->str.id = (StrID)r; | ||
291 | g->str.idreseed = (uint8_t)(r >> (64 - STRID_RESEED_INTERVAL)); | ||
292 | } | ||
293 | s->sid = g->str.id++; | ||
294 | #else | ||
295 | s->sid = (StrID)lj_prng_u64(&g->prng); | ||
296 | #endif | ||
180 | s->reserved = 0; | 297 | s->reserved = 0; |
298 | s->hashalg = (uint8_t)hashalg; | ||
299 | /* Clear last 4 bytes of allocated memory. Implies zero-termination, too. */ | ||
300 | *(uint32_t *)(strdatawr(s)+(len & ~(MSize)3)) = 0; | ||
181 | memcpy(strdatawr(s), str, len); | 301 | memcpy(strdatawr(s), str, len); |
182 | strdatawr(s)[len] = '\0'; /* Zero-terminate string. */ | 302 | /* Add to string hash table. */ |
183 | /* Add it to string hash table. */ | 303 | hash &= g->str.mask; |
184 | h &= g->strmask; | 304 | u = gcrefu(g->str.tab[hash]); |
185 | s->nextgc = g->strhash[h]; | 305 | setgcrefp(s->nextgc, (u & ~(uintptr_t)1)); |
186 | /* NOBARRIER: The string table is a GC root. */ | 306 | /* NOBARRIER: The string table is a GC root. */ |
187 | setgcref(g->strhash[h], obj2gco(s)); | 307 | setgcrefp(g->str.tab[hash], ((uintptr_t)s | (u & 1))); |
188 | if (g->strnum++ > g->strmask) /* Allow a 100% load factor. */ | 308 | if (g->str.num++ > g->str.mask) /* Allow a 100% load factor. */ |
189 | lj_str_resize(L, (g->strmask<<1)+1); /* Grow string table. */ | 309 | lj_str_resize(L, (g->str.mask<<1)+1); /* Grow string table. */ |
190 | return s; /* Return newly interned string. */ | 310 | return s; /* Return newly interned string. */ |
191 | } | 311 | } |
192 | 312 | ||
313 | /* Intern a string and return string object. */ | ||
314 | GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx) | ||
315 | { | ||
316 | global_State *g = G(L); | ||
317 | if (lenx-1 < LJ_MAX_STR-1) { | ||
318 | MSize len = (MSize)lenx; | ||
319 | StrHash hash = hash_sparse(g->str.seed, str, len); | ||
320 | MSize coll = 0; | ||
321 | int hashalg = 0; | ||
322 | /* Check if the string has already been interned. */ | ||
323 | GCobj *o = gcref(g->str.tab[hash & g->str.mask]); | ||
324 | #if LUAJIT_SECURITY_STRHASH | ||
325 | if (LJ_UNLIKELY((uintptr_t)o & 1)) { /* Secondary hash for this chain? */ | ||
326 | hashalg = 1; | ||
327 | hash = hash_dense(g->str.seed, hash, str, len); | ||
328 | o = (GCobj *)(gcrefu(g->str.tab[hash & g->str.mask]) & ~(uintptr_t)1); | ||
329 | } | ||
330 | #endif | ||
331 | while (o != NULL) { | ||
332 | GCstr *sx = gco2str(o); | ||
333 | if (sx->hash == hash && sx->len == len) { | ||
334 | if (memcmp(str, strdata(sx), len) == 0) { | ||
335 | if (isdead(g, o)) flipwhite(o); /* Resurrect if dead. */ | ||
336 | return sx; /* Return existing string. */ | ||
337 | } | ||
338 | coll++; | ||
339 | } | ||
340 | coll++; | ||
341 | o = gcnext(o); | ||
342 | } | ||
343 | #if LUAJIT_SECURITY_STRHASH | ||
344 | /* Rehash chain if there are too many collisions. */ | ||
345 | if (LJ_UNLIKELY(coll > LJ_STR_MAXCOLL) && !hashalg) { | ||
346 | return lj_str_rehash_chain(L, hash, str, len); | ||
347 | } | ||
348 | #endif | ||
349 | /* Otherwise allocate a new string. */ | ||
350 | return lj_str_alloc(L, str, len, hash, hashalg); | ||
351 | } else { | ||
352 | if (lenx) | ||
353 | lj_err_msg(L, LJ_ERR_STROV); | ||
354 | return &g->strempty; | ||
355 | } | ||
356 | } | ||
357 | |||
193 | void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s) | 358 | void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s) |
194 | { | 359 | { |
195 | g->strnum--; | 360 | g->str.num--; |
196 | lj_mem_free(g, s, sizestring(s)); | 361 | lj_mem_free(g, s, lj_str_size(s->len)); |
362 | } | ||
363 | |||
364 | void LJ_FASTCALL lj_str_init(lua_State *L) | ||
365 | { | ||
366 | global_State *g = G(L); | ||
367 | g->str.seed = lj_prng_u64(&g->prng); | ||
368 | lj_str_resize(L, LJ_MIN_STRTAB-1); | ||
197 | } | 369 | } |
198 | 370 | ||