summaryrefslogtreecommitdiff
path: root/src/lj_lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lj_lex.c')
-rw-r--r--src/lj_lex.c393
1 files changed, 393 insertions, 0 deletions
diff --git a/src/lj_lex.c b/src/lj_lex.c
new file mode 100644
index 00000000..38b0a7d4
--- /dev/null
+++ b/src/lj_lex.c
@@ -0,0 +1,393 @@
1/*
2** Lexical analyzer.
3** Copyright (C) 2005-2009 Mike Pall. See Copyright Notice in luajit.h
4**
5** Major portions taken verbatim or adapted from the Lua interpreter.
6** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
7*/
8
9#define lj_lex_c
10#define LUA_CORE
11
12#include "lj_obj.h"
13#include "lj_gc.h"
14#include "lj_err.h"
15#include "lj_str.h"
16#include "lj_lex.h"
17#include "lj_parse.h"
18#include "lj_ctype.h"
19
20/* Lua lexer token names. */
21static const char *const tokennames[] = {
22#define TKSTR1(name) #name,
23#define TKSTR2(name, sym) #sym,
24TKDEF(TKSTR1, TKSTR2)
25#undef TKSTR1
26#undef TKSTR2
27 NULL
28};
29
30/* -- Buffer handling ----------------------------------------------------- */
31
/* All macros below evaluate `ls' more than once: pass a side-effect free
** expression. Every use of `ls' is parenthesized so that any pointer
** expression (not just a plain identifier) expands correctly.
*/

/* Convert a source char to int by way of uint8_t. */
#define char2int(c)		cast(int, cast(uint8_t, (c)))

/* Advance to the next input char: consume one from the current chunk while
** ls->n is positive, otherwise call fillbuf() for a new chunk. The result
** is stored in ls->current and is also the value of the expression.
*/
#define next(ls) \
  ((ls)->current = ((ls)->n--) > 0 ? char2int(*(ls)->p++) : fillbuf(ls))

/* Append the current char to the token buffer, then advance. */
#define save_and_next(ls)	(save((ls), (ls)->current), next(ls))

/* True if the current char is `\n' or `\r'. */
#define currIsNewline(ls) \
  ((ls)->current == '\n' || (ls)->current == '\r')

/* Value of ls->current once the reader has no more input. */
#define END_OF_STREAM		(-1)
38
39static int fillbuf(LexState *ls)
40{
41 size_t sz;
42 const char *buf = ls->rfunc(ls->L, ls->rdata, &sz);
43 if (buf == NULL || sz == 0) return END_OF_STREAM;
44 ls->n = (MSize)sz - 1;
45 ls->p = buf;
46 return char2int(*(ls->p++));
47}
48
49static void save(LexState *ls, int c)
50{
51 if (ls->sb.n + 1 > ls->sb.sz) {
52 MSize newsize;
53 if (ls->sb.sz >= LJ_MAX_STR/2)
54 lj_lex_error(ls, 0, LJ_ERR_XELEM);
55 newsize = ls->sb.sz * 2;
56 lj_str_resizebuf(ls->L, &ls->sb, newsize);
57 }
58 ls->sb.buf[ls->sb.n++] = cast(char, c);
59}
60
61static int check_next(LexState *ls, const char *set)
62{
63 if (!strchr(set, ls->current))
64 return 0;
65 save_and_next(ls);
66 return 1;
67}
68
69static void inclinenumber(LexState *ls)
70{
71 int old = ls->current;
72 lua_assert(currIsNewline(ls));
73 next(ls); /* skip `\n' or `\r' */
74 if (currIsNewline(ls) && ls->current != old)
75 next(ls); /* skip `\n\r' or `\r\n' */
76 if (++ls->linenumber >= LJ_MAX_LINE)
77 lj_lex_error(ls, ls->token, LJ_ERR_XLINES);
78}
79
80/* -- Scanner for terminals ----------------------------------------------- */
81
82static void read_numeral(LexState *ls, TValue *tv)
83{
84 lua_assert(lj_ctype_isdigit(ls->current));
85 do {
86 save_and_next(ls);
87 } while (lj_ctype_isdigit(ls->current) || ls->current == '.');
88 if (check_next(ls, "Ee")) /* `E'? */
89 check_next(ls, "+-"); /* optional exponent sign */
90 while (lj_ctype_isident(ls->current))
91 save_and_next(ls);
92 save(ls, '\0');
93 if (!lj_str_numconv(ls->sb.buf, tv))
94 lj_lex_error(ls, TK_number, LJ_ERR_XNUMBER);
95}
96
97static int skip_sep(LexState *ls)
98{
99 int count = 0;
100 int s = ls->current;
101 lua_assert(s == '[' || s == ']');
102 save_and_next(ls);
103 while (ls->current == '=') {
104 save_and_next(ls);
105 count++;
106 }
107 return (ls->current == s) ? count : (-count) - 1;
108}
109
110static void read_long_string(LexState *ls, TValue *tv, int sep)
111{
112 save_and_next(ls); /* skip 2nd `[' */
113 if (currIsNewline(ls)) /* string starts with a newline? */
114 inclinenumber(ls); /* skip it */
115 for (;;) {
116 switch (ls->current) {
117 case END_OF_STREAM:
118 lj_lex_error(ls, TK_eof, tv ? LJ_ERR_XLSTR : LJ_ERR_XLCOM);
119 break;
120 case ']':
121 if (skip_sep(ls) == sep) {
122 save_and_next(ls); /* skip 2nd `]' */
123 goto endloop;
124 }
125 break;
126 case '\n':
127 case '\r':
128 save(ls, '\n');
129 inclinenumber(ls);
130 if (!tv) lj_str_resetbuf(&ls->sb); /* avoid wasting space */
131 break;
132 default:
133 if (tv) save_and_next(ls);
134 else next(ls);
135 break;
136 }
137 } endloop:
138 if (tv) {
139 GCstr *str = lj_parse_keepstr(ls, ls->sb.buf + (2 + (MSize)sep),
140 ls->sb.n - 2*(2 + (MSize)sep));
141 setstrV(ls->L, tv, str);
142 }
143}
144
145static void read_string(LexState *ls, int delim, TValue *tv)
146{
147 save_and_next(ls);
148 while (ls->current != delim) {
149 switch (ls->current) {
150 case END_OF_STREAM:
151 lj_lex_error(ls, TK_eof, LJ_ERR_XSTR);
152 continue;
153 case '\n':
154 case '\r':
155 lj_lex_error(ls, TK_string, LJ_ERR_XSTR);
156 continue;
157 case '\\': {
158 int c;
159 next(ls); /* do not save the `\' */
160 switch (ls->current) {
161 case 'a': c = '\a'; break;
162 case 'b': c = '\b'; break;
163 case 'f': c = '\f'; break;
164 case 'n': c = '\n'; break;
165 case 'r': c = '\r'; break;
166 case 't': c = '\t'; break;
167 case 'v': c = '\v'; break;
168 case '\n': case '\r': save(ls, '\n'); inclinenumber(ls); continue;
169 case END_OF_STREAM: continue; /* will raise an error next loop */
170 default:
171 if (!lj_ctype_isdigit(ls->current)) {
172 save_and_next(ls); /* handles \\, \", \', and \? */
173 } else { /* \xxx */
174 int i = 0;
175 c = 0;
176 do {
177 c = 10*c + (ls->current-'0');
178 next(ls);
179 } while (++i<3 && lj_ctype_isdigit(ls->current));
180 if (c > UCHAR_MAX)
181 lj_lex_error(ls, TK_string, LJ_ERR_XESC);
182 save(ls, c);
183 }
184 continue;
185 }
186 save(ls, c);
187 next(ls);
188 continue;
189 }
190 default:
191 save_and_next(ls);
192 break;
193 }
194 }
195 save_and_next(ls); /* skip delimiter */
196 setstrV(ls->L, tv, lj_parse_keepstr(ls, ls->sb.buf + 1, ls->sb.n - 2));
197}
198
199/* -- Main lexical scanner ------------------------------------------------ */
200
201static int llex(LexState *ls, TValue *tv)
202{
203 lj_str_resetbuf(&ls->sb);
204 for (;;) {
205 if (lj_ctype_isident(ls->current)) {
206 GCstr *s;
207 if (lj_ctype_isdigit(ls->current)) { /* Numeric literal. */
208 read_numeral(ls, tv);
209 return TK_number;
210 }
211 /* Identifier or reserved word. */
212 do {
213 save_and_next(ls);
214 } while (lj_ctype_isident(ls->current));
215 s = lj_parse_keepstr(ls, ls->sb.buf, ls->sb.n);
216 if (s->reserved > 0) /* Reserved word? */
217 return TK_OFS + s->reserved;
218 setstrV(ls->L, tv, s);
219 return TK_name;
220 }
221 switch (ls->current) {
222 case '\n':
223 case '\r':
224 inclinenumber(ls);
225 continue;
226 case ' ':
227 case '\t':
228 case '\v':
229 case '\f':
230 next(ls);
231 continue;
232 case '-':
233 next(ls);
234 if (ls->current != '-') return '-';
235 /* else is a comment */
236 next(ls);
237 if (ls->current == '[') {
238 int sep = skip_sep(ls);
239 lj_str_resetbuf(&ls->sb); /* `skip_sep' may dirty the buffer */
240 if (sep >= 0) {
241 read_long_string(ls, NULL, sep); /* long comment */
242 lj_str_resetbuf(&ls->sb);
243 continue;
244 }
245 }
246 /* else short comment */
247 while (!currIsNewline(ls) && ls->current != END_OF_STREAM)
248 next(ls);
249 continue;
250 case '[': {
251 int sep = skip_sep(ls);
252 if (sep >= 0) {
253 read_long_string(ls, tv, sep);
254 return TK_string;
255 } else if (sep == -1) {
256 return '[';
257 } else {
258 lj_lex_error(ls, TK_string, LJ_ERR_XLDELIM);
259 continue;
260 }
261 }
262 case '=':
263 next(ls);
264 if (ls->current != '=') return '='; else { next(ls); return TK_eq; }
265 case '<':
266 next(ls);
267 if (ls->current != '=') return '<'; else { next(ls); return TK_le; }
268 case '>':
269 next(ls);
270 if (ls->current != '=') return '>'; else { next(ls); return TK_ge; }
271 case '~':
272 next(ls);
273 if (ls->current != '=') return '~'; else { next(ls); return TK_ne; }
274 case '"':
275 case '\'':
276 read_string(ls, ls->current, tv);
277 return TK_string;
278 case '.':
279 save_and_next(ls);
280 if (check_next(ls, ".")) {
281 if (check_next(ls, "."))
282 return TK_dots; /* ... */
283 else
284 return TK_concat; /* .. */
285 } else if (!lj_ctype_isdigit(ls->current)) {
286 return '.';
287 } else {
288 read_numeral(ls, tv);
289 return TK_number;
290 }
291 case END_OF_STREAM:
292 return TK_eof;
293 default: {
294 int c = ls->current;
295 next(ls);
296 return c; /* Single-char tokens (+ - / ...). */
297 }
298 }
299 }
300}
301
302/* -- Lexer API ----------------------------------------------------------- */
303
304void lj_lex_start(lua_State *L, LexState *ls)
305{
306 ls->L = L;
307 ls->fs = NULL;
308 ls->n = 0;
309 ls->p = NULL;
310 ls->lookahead = TK_eof; /* No look-ahead token. */
311 ls->linenumber = 1;
312 ls->lastline = 1;
313 lj_str_resizebuf(ls->L, &ls->sb, LJ_MIN_SBUF);
314 next(ls); /* Read-ahead first char. */
315 if (ls->current == 0xef && ls->n >= 2 && char2int(ls->p[0]) == 0xbb &&
316 char2int(ls->p[1]) == 0xbf) { /* Skip UTF-8 BOM (if buffered). */
317 ls->n -= 2;
318 ls->p += 2;
319 next(ls);
320 }
321 if (ls->current == '#') { /* Skip POSIX #! header line. */
322 do {
323 next(ls);
324 if (ls->current == END_OF_STREAM) return;
325 } while (!currIsNewline(ls));
326 inclinenumber(ls);
327 }
328 if (ls->current == LUA_SIGNATURE[0]) {
329 setstrV(L, L->top++, lj_err_str(L, LJ_ERR_XBCLOAD));
330 lj_err_throw(L, LUA_ERRSYNTAX);
331 }
332 /* This is an unanchored GCstr before it's stored in the prototype.
333 ** Do this last since next() calls the reader which may call the GC.
334 */
335 ls->chunkname = lj_str_newz(L, ls->chunkarg);
336}
337
338void lj_lex_next(LexState *ls)
339{
340 ls->lastline = ls->linenumber;
341 if (LJ_LIKELY(ls->lookahead == TK_eof)) { /* No lookahead token? */
342 ls->token = llex(ls, &ls->tokenval); /* Get next token. */
343 } else { /* Otherwise return lookahead token. */
344 ls->token = ls->lookahead;
345 ls->lookahead = TK_eof;
346 ls->tokenval = ls->lookaheadval;
347 }
348}
349
350LexToken lj_lex_lookahead(LexState *ls)
351{
352 lua_assert(ls->lookahead == TK_eof);
353 ls->lookahead = llex(ls, &ls->lookaheadval);
354 return ls->lookahead;
355}
356
357const char *lj_lex_token2str(LexState *ls, LexToken token)
358{
359 if (token > TK_OFS)
360 return tokennames[token-TK_OFS-1];
361 else if (!lj_ctype_iscntrl(token))
362 return lj_str_pushf(ls->L, "%c", token);
363 else
364 return lj_str_pushf(ls->L, "char(%d)", token);
365}
366
367void lj_lex_error(LexState *ls, LexToken token, ErrMsg em, ...)
368{
369 const char *tok;
370 va_list argp;
371 if (token == 0) {
372 tok = NULL;
373 } else if (token == TK_name || token == TK_string || token == TK_number) {
374 save(ls, '\0');
375 tok = ls->sb.buf;
376 } else {
377 tok = lj_lex_token2str(ls, token);
378 }
379 va_start(argp, em);
380 lj_err_lex(ls->L, strdata(ls->chunkname), tok, ls->linenumber, em, argp);
381 va_end(argp);
382}
383
384void lj_lex_init(lua_State *L)
385{
386 uint32_t i;
387 for (i = 0; i < TK_RESERVED; i++) {
388 GCstr *s = lj_str_newz(L, tokennames[i]);
389 fixstring(s); /* Reserved words are never collected. */
390 s->reserved = cast_byte(i+1);
391 }
392}
393