aboutsummaryrefslogtreecommitdiff
path: root/src/lj_str.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lj_str.c')
-rw-r--r--src/lj_str.c358
1 files changed, 265 insertions, 93 deletions
diff --git a/src/lj_str.c b/src/lj_str.c
index 0253c15e..5bf8426c 100644
--- a/src/lj_str.c
+++ b/src/lj_str.c
@@ -11,6 +11,7 @@
11#include "lj_err.h" 11#include "lj_err.h"
12#include "lj_str.h" 12#include "lj_str.h"
13#include "lj_char.h" 13#include "lj_char.h"
14#include "lj_prng.h"
14 15
15/* -- String helpers ------------------------------------------------------ */ 16/* -- String helpers ------------------------------------------------------ */
16 17
@@ -37,28 +38,6 @@ int32_t LJ_FASTCALL lj_str_cmp(GCstr *a, GCstr *b)
37 return (int32_t)(a->len - b->len); 38 return (int32_t)(a->len - b->len);
38} 39}
39 40
40/* Fast string data comparison. Caveat: unaligned access to 1st string! */
41static LJ_AINLINE int str_fastcmp(const char *a, const char *b, MSize len)
42{
43 MSize i = 0;
44 lj_assertX(len > 0, "fast string compare with zero length");
45 lj_assertX((((uintptr_t)a+len-1) & (LJ_PAGESIZE-1)) <= LJ_PAGESIZE-4,
46 "fast string compare crossing page boundary");
47 do { /* Note: innocuous access up to end of string + 3. */
48 uint32_t v = lj_getu32(a+i) ^ *(const uint32_t *)(b+i);
49 if (v) {
50 i -= len;
51#if LJ_LE
52 return (int32_t)i >= -3 ? (v << (32+(i<<3))) : 1;
53#else
54 return (int32_t)i >= -3 ? (v >> (32+(i<<3))) : 1;
55#endif
56 }
57 i += 4;
58 } while (i < len);
59 return 0;
60}
61
62/* Find fixed string p inside string s. */ 41/* Find fixed string p inside string s. */
63const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen) 42const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen)
64{ 43{
@@ -91,108 +70,301 @@ int lj_str_haspattern(GCstr *s)
91 return 0; /* No pattern matching chars found. */ 70 return 0; /* No pattern matching chars found. */
92} 71}
93 72
94/* -- String interning ---------------------------------------------------- */ 73/* -- String hashing ------------------------------------------------------ */
95
96/* Resize the string hash table (grow and shrink). */
97void lj_str_resize(lua_State *L, MSize newmask)
98{
99 global_State *g = G(L);
100 GCRef *newhash;
101 MSize i;
102 if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1)
103 return; /* No resizing during GC traversal or if already too big. */
104 newhash = lj_mem_newvec(L, newmask+1, GCRef);
105 memset(newhash, 0, (newmask+1)*sizeof(GCRef));
106 for (i = g->strmask; i != ~(MSize)0; i--) { /* Rehash old table. */
107 GCobj *p = gcref(g->strhash[i]);
108 while (p) { /* Follow each hash chain and reinsert all strings. */
109 MSize h = gco2str(p)->hash & newmask;
110 GCobj *next = gcnext(p);
111 /* NOBARRIER: The string table is a GC root. */
112 setgcrefr(p->gch.nextgc, newhash[h]);
113 setgcref(newhash[h], p);
114 p = next;
115 }
116 }
117 lj_mem_freevec(g, g->strhash, g->strmask+1, GCRef);
118 g->strmask = newmask;
119 g->strhash = newhash;
120}
121 74
122/* Intern a string and return string object. */ 75/* Keyed sparse ARX string hash. Constant time. */
123GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx) 76static StrHash hash_sparse(uint64_t seed, const char *str, MSize len)
124{ 77{
125 global_State *g; 78 /* Constants taken from lookup3 hash by Bob Jenkins. */
126 GCstr *s; 79 StrHash a, b, h = len ^ (StrHash)seed;
127 GCobj *o;
128 MSize len = (MSize)lenx;
129 MSize a, b, h = len;
130 if (lenx >= LJ_MAX_STR)
131 lj_err_msg(L, LJ_ERR_STROV);
132 g = G(L);
133 /* Compute string hash. Constants taken from lookup3 hash by Bob Jenkins. */
134 if (len >= 4) { /* Caveat: unaligned access! */ 80 if (len >= 4) { /* Caveat: unaligned access! */
135 a = lj_getu32(str); 81 a = lj_getu32(str);
136 h ^= lj_getu32(str+len-4); 82 h ^= lj_getu32(str+len-4);
137 b = lj_getu32(str+(len>>1)-2); 83 b = lj_getu32(str+(len>>1)-2);
138 h ^= b; h -= lj_rol(b, 14); 84 h ^= b; h -= lj_rol(b, 14);
139 b += lj_getu32(str+(len>>2)-1); 85 b += lj_getu32(str+(len>>2)-1);
140 } else if (len > 0) { 86 } else {
141 a = *(const uint8_t *)str; 87 a = *(const uint8_t *)str;
142 h ^= *(const uint8_t *)(str+len-1); 88 h ^= *(const uint8_t *)(str+len-1);
143 b = *(const uint8_t *)(str+(len>>1)); 89 b = *(const uint8_t *)(str+(len>>1));
144 h ^= b; h -= lj_rol(b, 14); 90 h ^= b; h -= lj_rol(b, 14);
145 } else {
146 return &g->strempty;
147 } 91 }
148 a ^= h; a -= lj_rol(h, 11); 92 a ^= h; a -= lj_rol(h, 11);
149 b ^= a; b -= lj_rol(a, 25); 93 b ^= a; b -= lj_rol(a, 25);
150 h ^= b; h -= lj_rol(b, 16); 94 h ^= b; h -= lj_rol(b, 16);
151 /* Check if the string has already been interned. */ 95 return h;
152 o = gcref(g->strhash[h & g->strmask]); 96}
153 if (LJ_LIKELY((((uintptr_t)str+len-1) & (LJ_PAGESIZE-1)) <= LJ_PAGESIZE-4)) { 97
154 while (o != NULL) { 98#if LUAJIT_SECURITY_STRHASH
155 GCstr *sx = gco2str(o); 99/* Keyed dense ARX string hash. Linear time. */
156 if (sx->len == len && str_fastcmp(str, strdata(sx), len) == 0) { 100static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h,
157 /* Resurrect if dead. Can only happen with fixstring() (keywords). */ 101 const char *str, MSize len)
158 if (isdead(g, o)) flipwhite(o); 102{
159 return sx; /* Return existing string. */ 103 StrHash b = lj_bswap(lj_rol(h ^ (StrHash)(seed >> 32), 4));
104 if (len > 12) {
105 StrHash a = (StrHash)seed;
106 const char *pe = str+len-12, *p = pe, *q = str;
107 do {
108 a += lj_getu32(p);
109 b += lj_getu32(p+4);
110 h += lj_getu32(p+8);
111 p = q; q += 12;
112 h ^= b; h -= lj_rol(b, 14);
113 a ^= h; a -= lj_rol(h, 11);
114 b ^= a; b -= lj_rol(a, 25);
115 } while (p < pe);
116 h ^= b; h -= lj_rol(b, 16);
117 a ^= h; a -= lj_rol(h, 4);
118 b ^= a; b -= lj_rol(a, 14);
119 }
120 return b;
121}
122#endif
123
124/* -- String interning ---------------------------------------------------- */
125
126#define LJ_STR_MAXCOLL 32
127
128/* Resize the string interning hash table (grow and shrink). */
129void lj_str_resize(lua_State *L, MSize newmask)
130{
131 global_State *g = G(L);
132 GCRef *newtab, *oldtab = g->str.tab;
133 MSize i;
134
135 /* No resizing during GC traversal or if already too big. */
136 if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1)
137 return;
138
139 newtab = lj_mem_newvec(L, newmask+1, GCRef);
140 memset(newtab, 0, (newmask+1)*sizeof(GCRef));
141
142#if LUAJIT_SECURITY_STRHASH
143 /* Check which chains need secondary hashes. */
144 if (g->str.second) {
145 int newsecond = 0;
146 /* Compute primary chain lengths. */
147 for (i = g->str.mask; i != ~(MSize)0; i--) {
148 GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
149 while (o) {
150 GCstr *s = gco2str(o);
151 MSize hash = s->hashalg ? hash_sparse(g->str.seed, strdata(s), s->len) :
152 s->hash;
153 hash &= newmask;
154 setgcrefp(newtab[hash], gcrefu(newtab[hash]) + 1);
155 o = gcnext(o);
160 } 156 }
161 o = gcnext(o);
162 } 157 }
163 } else { /* Slow path: end of string is too close to a page boundary. */ 158 /* Mark secondary chains. */
164 while (o != NULL) { 159 for (i = newmask; i != ~(MSize)0; i--) {
165 GCstr *sx = gco2str(o); 160 int secondary = gcrefu(newtab[i]) > LJ_STR_MAXCOLL;
166 if (sx->len == len && memcmp(str, strdata(sx), len) == 0) { 161 newsecond |= secondary;
167 /* Resurrect if dead. Can only happen with fixstring() (keywords). */ 162 setgcrefp(newtab[i], secondary);
168 if (isdead(g, o)) flipwhite(o); 163 }
169 return sx; /* Return existing string. */ 164 g->str.second = newsecond;
165 }
166#endif
167
168 /* Reinsert all strings from the old table into the new table. */
169 for (i = g->str.mask; i != ~(MSize)0; i--) {
170 GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
171 while (o) {
172 GCobj *next = gcnext(o);
173 GCstr *s = gco2str(o);
174 MSize hash = s->hash;
175#if LUAJIT_SECURITY_STRHASH
176 uintptr_t u;
177 if (LJ_LIKELY(!s->hashalg)) { /* String hashed with primary hash. */
178 hash &= newmask;
179 u = gcrefu(newtab[hash]);
180 if (LJ_UNLIKELY(u & 1)) { /* Switch string to secondary hash. */
181 s->hash = hash = hash_dense(g->str.seed, s->hash, strdata(s), s->len);
182 s->hashalg = 1;
183 hash &= newmask;
184 u = gcrefu(newtab[hash]);
185 }
186 } else { /* String hashed with secondary hash. */
187 MSize shash = hash_sparse(g->str.seed, strdata(s), s->len);
188 u = gcrefu(newtab[shash & newmask]);
189 if (u & 1) {
190 hash &= newmask;
191 u = gcrefu(newtab[hash]);
192 } else { /* Revert string back to primary hash. */
193 s->hash = shash;
194 s->hashalg = 0;
195 hash = (shash & newmask);
196 }
197 }
198 /* NOBARRIER: The string table is a GC root. */
199 setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1));
200 setgcrefp(newtab[hash], ((uintptr_t)o | (u & 1)));
201#else
202 hash &= newmask;
203 /* NOBARRIER: The string table is a GC root. */
204 setgcrefr(o->gch.nextgc, newtab[hash]);
205 setgcref(newtab[hash], o);
206#endif
207 o = next;
208 }
209 }
210
211 /* Free old table and replace with new table. */
212 lj_str_freetab(g);
213 g->str.tab = newtab;
214 g->str.mask = newmask;
215}
216
217#if LUAJIT_SECURITY_STRHASH
218/* Rehash and rechain all strings in a chain. */
219static LJ_NOINLINE GCstr *lj_str_rehash_chain(lua_State *L, StrHash hashc,
220 const char *str, MSize len)
221{
222 global_State *g = G(L);
223 int ow = g->gc.state == GCSsweepstring ? otherwhite(g) : 0; /* Sweeping? */
224 GCRef *strtab = g->str.tab;
225 MSize strmask = g->str.mask;
226 GCobj *o = gcref(strtab[hashc & strmask]);
227 setgcrefp(strtab[hashc & strmask], (void *)((uintptr_t)1));
228 g->str.second = 1;
229 while (o) {
230 uintptr_t u;
231 GCobj *next = gcnext(o);
232 GCstr *s = gco2str(o);
233 StrHash hash;
234 if (ow) { /* Must sweep while rechaining. */
235 if (((o->gch.marked ^ LJ_GC_WHITES) & ow)) { /* String alive? */
236 lj_assertG(!isdead(g, o) || (o->gch.marked & LJ_GC_FIXED),
237 "sweep of undead string");
238 makewhite(g, o);
239 } else { /* Free dead string. */
240 lj_assertG(isdead(g, o) || ow == LJ_GC_SFIXED,
241 "sweep of unlive string");
242 lj_str_free(g, s);
243 o = next;
244 continue;
170 } 245 }
171 o = gcnext(o);
172 } 246 }
247 hash = s->hash;
248 if (!s->hashalg) { /* Rehash with secondary hash. */
249 hash = hash_dense(g->str.seed, hash, strdata(s), s->len);
250 s->hash = hash;
251 s->hashalg = 1;
252 }
253 /* Rechain. */
254 hash &= strmask;
255 u = gcrefu(strtab[hash]);
256 setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1));
257 setgcrefp(strtab[hash], ((uintptr_t)o | (u & 1)));
258 o = next;
173 } 259 }
174 /* Nope, create a new string. */ 260 /* Try to insert the pending string again. */
175 s = lj_mem_newt(L, sizeof(GCstr)+len+1, GCstr); 261 return lj_str_new(L, str, len);
262}
263#endif
264
265/* Reseed String ID from PRNG after random interval < 2^bits. */
266#if LUAJIT_SECURITY_STRID == 1
267#define STRID_RESEED_INTERVAL 8
268#elif LUAJIT_SECURITY_STRID == 2
269#define STRID_RESEED_INTERVAL 4
270#elif LUAJIT_SECURITY_STRID >= 3
271#define STRID_RESEED_INTERVAL 0
272#endif
273
274/* Allocate a new string and add to string interning table. */
275static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len,
276 StrHash hash, int hashalg)
277{
278 GCstr *s = lj_mem_newt(L, lj_str_size(len), GCstr);
279 global_State *g = G(L);
280 uintptr_t u;
176 newwhite(g, s); 281 newwhite(g, s);
177 s->gct = ~LJ_TSTR; 282 s->gct = ~LJ_TSTR;
178 s->len = len; 283 s->len = len;
179 s->hash = h; 284 s->hash = hash;
285#ifndef STRID_RESEED_INTERVAL
286 s->sid = g->str.id++;
287#elif STRID_RESEED_INTERVAL
288 if (!g->str.idreseed--) {
289 uint64_t r = lj_prng_u64(&g->prng);
290 g->str.id = (StrID)r;
291 g->str.idreseed = (uint8_t)(r >> (64 - STRID_RESEED_INTERVAL));
292 }
293 s->sid = g->str.id++;
294#else
295 s->sid = (StrID)lj_prng_u64(&g->prng);
296#endif
180 s->reserved = 0; 297 s->reserved = 0;
298 s->hashalg = (uint8_t)hashalg;
299 /* Clear last 4 bytes of allocated memory. Implies zero-termination, too. */
300 *(uint32_t *)(strdatawr(s)+(len & ~(MSize)3)) = 0;
181 memcpy(strdatawr(s), str, len); 301 memcpy(strdatawr(s), str, len);
182 strdatawr(s)[len] = '\0'; /* Zero-terminate string. */ 302 /* Add to string hash table. */
183 /* Add it to string hash table. */ 303 hash &= g->str.mask;
184 h &= g->strmask; 304 u = gcrefu(g->str.tab[hash]);
185 s->nextgc = g->strhash[h]; 305 setgcrefp(s->nextgc, (u & ~(uintptr_t)1));
186 /* NOBARRIER: The string table is a GC root. */ 306 /* NOBARRIER: The string table is a GC root. */
187 setgcref(g->strhash[h], obj2gco(s)); 307 setgcrefp(g->str.tab[hash], ((uintptr_t)s | (u & 1)));
188 if (g->strnum++ > g->strmask) /* Allow a 100% load factor. */ 308 if (g->str.num++ > g->str.mask) /* Allow a 100% load factor. */
189 lj_str_resize(L, (g->strmask<<1)+1); /* Grow string table. */ 309 lj_str_resize(L, (g->str.mask<<1)+1); /* Grow string table. */
190 return s; /* Return newly interned string. */ 310 return s; /* Return newly interned string. */
191} 311}
192 312
313/* Intern a string and return string object. */
314GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
315{
316 global_State *g = G(L);
317 if (lenx-1 < LJ_MAX_STR-1) {
318 MSize len = (MSize)lenx;
319 StrHash hash = hash_sparse(g->str.seed, str, len);
320 MSize coll = 0;
321 int hashalg = 0;
322 /* Check if the string has already been interned. */
323 GCobj *o = gcref(g->str.tab[hash & g->str.mask]);
324#if LUAJIT_SECURITY_STRHASH
325 if (LJ_UNLIKELY((uintptr_t)o & 1)) { /* Secondary hash for this chain? */
326 hashalg = 1;
327 hash = hash_dense(g->str.seed, hash, str, len);
328 o = (GCobj *)(gcrefu(g->str.tab[hash & g->str.mask]) & ~(uintptr_t)1);
329 }
330#endif
331 while (o != NULL) {
332 GCstr *sx = gco2str(o);
333 if (sx->hash == hash && sx->len == len) {
334 if (memcmp(str, strdata(sx), len) == 0) {
335 if (isdead(g, o)) flipwhite(o); /* Resurrect if dead. */
336 return sx; /* Return existing string. */
337 }
338 coll++;
339 }
340 coll++;
341 o = gcnext(o);
342 }
343#if LUAJIT_SECURITY_STRHASH
344 /* Rehash chain if there are too many collisions. */
345 if (LJ_UNLIKELY(coll > LJ_STR_MAXCOLL) && !hashalg) {
346 return lj_str_rehash_chain(L, hash, str, len);
347 }
348#endif
349 /* Otherwise allocate a new string. */
350 return lj_str_alloc(L, str, len, hash, hashalg);
351 } else {
352 if (lenx)
353 lj_err_msg(L, LJ_ERR_STROV);
354 return &g->strempty;
355 }
356}
357
193void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s) 358void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s)
194{ 359{
195 g->strnum--; 360 g->str.num--;
196 lj_mem_free(g, s, sizestring(s)); 361 lj_mem_free(g, s, lj_str_size(s->len));
362}
363
364void LJ_FASTCALL lj_str_init(lua_State *L)
365{
366 global_State *g = G(L);
367 g->str.seed = lj_prng_u64(&g->prng);
368 lj_str_resize(L, LJ_MIN_STRTAB-1);
197} 369}
198 370