about | summary | refs | log | tree | commit | diff
path: root/editors
diff options
context:
space:
mode:
Diffstat (limited to 'editors')
-rw-r--r-- editors/awk.c 409
1 files changed, 246 insertions, 163 deletions
diff --git a/editors/awk.c b/editors/awk.c
index 2c1272554..878fffa1a 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -337,7 +337,9 @@ static void debug_parse_print_tc(uint32_t n)
337#undef P 337#undef P
338#undef PRIMASK 338#undef PRIMASK
339#undef PRIMASK2 339#undef PRIMASK2
340#define P(x) (x << 24) 340/* Smaller 'x' means _higher_ operator precedence */
341#define PRECEDENCE(x) (x << 24)
342#define P(x) PRECEDENCE(x)
341#define PRIMASK 0x7F000000 343#define PRIMASK 0x7F000000
342#define PRIMASK2 0x7E000000 344#define PRIMASK2 0x7E000000
343 345
@@ -360,7 +362,7 @@ enum {
360 OC_MOVE = 0x1f00, OC_PGETLINE = 0x2000, OC_REGEXP = 0x2100, 362 OC_MOVE = 0x1f00, OC_PGETLINE = 0x2000, OC_REGEXP = 0x2100,
361 OC_REPLACE = 0x2200, OC_RETURN = 0x2300, OC_SPRINTF = 0x2400, 363 OC_REPLACE = 0x2200, OC_RETURN = 0x2300, OC_SPRINTF = 0x2400,
362 OC_TERNARY = 0x2500, OC_UNARY = 0x2600, OC_VAR = 0x2700, 364 OC_TERNARY = 0x2500, OC_UNARY = 0x2600, OC_VAR = 0x2700,
363 OC_DONE = 0x2800, 365 OC_CONST = 0x2800, OC_DONE = 0x2900,
364 366
365 ST_IF = 0x3000, ST_DO = 0x3100, ST_FOR = 0x3200, 367 ST_IF = 0x3000, ST_DO = 0x3100, ST_FOR = 0x3200,
366 ST_WHILE = 0x3300 368 ST_WHILE = 0x3300
@@ -440,9 +442,9 @@ static const uint32_t tokeninfo[] ALIGN4 = {
440#define TI_PREINC (OC_UNARY|xV|P(9)|'P') 442#define TI_PREINC (OC_UNARY|xV|P(9)|'P')
441#define TI_PREDEC (OC_UNARY|xV|P(9)|'M') 443#define TI_PREDEC (OC_UNARY|xV|P(9)|'M')
442 TI_PREINC, TI_PREDEC, OC_FIELD|xV|P(5), 444 TI_PREINC, TI_PREDEC, OC_FIELD|xV|P(5),
443 OC_COMPARE|VV|P(39)|5, OC_MOVE|VV|P(74), OC_REPLACE|NV|P(74)|'+', OC_REPLACE|NV|P(74)|'-', 445 OC_COMPARE|VV|P(39)|5, OC_MOVE|VV|P(38), OC_REPLACE|NV|P(38)|'+', OC_REPLACE|NV|P(38)|'-',
444 OC_REPLACE|NV|P(74)|'*', OC_REPLACE|NV|P(74)|'/', OC_REPLACE|NV|P(74)|'%', OC_REPLACE|NV|P(74)|'&', 446 OC_REPLACE|NV|P(38)|'*', OC_REPLACE|NV|P(38)|'/', OC_REPLACE|NV|P(38)|'%', OC_REPLACE|NV|P(38)|'&',
445 OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(74)|'&', OC_BINARY|NV|P(15)|'&', 447 OC_BINARY|NV|P(29)|'+', OC_BINARY|NV|P(29)|'-', OC_REPLACE|NV|P(38)|'&', OC_BINARY|NV|P(15)|'&',
446 OC_BINARY|NV|P(25)|'/', OC_BINARY|NV|P(25)|'%', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'*', 448 OC_BINARY|NV|P(25)|'/', OC_BINARY|NV|P(25)|'%', OC_BINARY|NV|P(15)|'&', OC_BINARY|NV|P(25)|'*',
447 OC_COMPARE|VV|P(39)|4, OC_COMPARE|VV|P(39)|3, OC_COMPARE|VV|P(39)|0, OC_COMPARE|VV|P(39)|1, 449 OC_COMPARE|VV|P(39)|4, OC_COMPARE|VV|P(39)|3, OC_COMPARE|VV|P(39)|0, OC_COMPARE|VV|P(39)|1,
448#define TI_LESS (OC_COMPARE|VV|P(39)|2) 450#define TI_LESS (OC_COMPARE|VV|P(39)|2)
@@ -546,7 +548,6 @@ struct globals {
546 chain beginseq, mainseq, endseq; 548 chain beginseq, mainseq, endseq;
547 chain *seq; 549 chain *seq;
548 node *break_ptr, *continue_ptr; 550 node *break_ptr, *continue_ptr;
549 rstream *iF;
550 xhash *ahash; /* argument names, used only while parsing function bodies */ 551 xhash *ahash; /* argument names, used only while parsing function bodies */
551 xhash *fnhash; /* function names, used only in parsing stage */ 552 xhash *fnhash; /* function names, used only in parsing stage */
552 xhash *vhash; /* variables and arrays */ 553 xhash *vhash; /* variables and arrays */
@@ -555,7 +556,7 @@ struct globals {
555 const char *g_progname; 556 const char *g_progname;
556 int g_lineno; 557 int g_lineno;
557 int nfields; 558 int nfields;
558 int maxfields; /* used in fsrealloc() only */ 559 unsigned maxfields;
559 var *Fields; 560 var *Fields;
560 char *g_pos; 561 char *g_pos;
561 char g_saved_ch; 562 char g_saved_ch;
@@ -579,11 +580,13 @@ struct globals2 {
579 580
580 var *intvar[NUM_INTERNAL_VARS]; /* often used */ 581 var *intvar[NUM_INTERNAL_VARS]; /* often used */
581 582
583 rstream iF;
584
582 /* former statics from various functions */ 585 /* former statics from various functions */
583 char *split_f0__fstrings; 586 char *split_f0__fstrings;
584 587
585 rstream next_input_file__rsm; 588 unsigned next_input_file__argind;
586 smallint next_input_file__files_happen; 589 smallint next_input_file__input_file_seen;
587 590
588 smalluint exitcode; 591 smalluint exitcode;
589 592
@@ -618,7 +621,6 @@ struct globals2 {
618#define seq (G1.seq ) 621#define seq (G1.seq )
619#define break_ptr (G1.break_ptr ) 622#define break_ptr (G1.break_ptr )
620#define continue_ptr (G1.continue_ptr) 623#define continue_ptr (G1.continue_ptr)
621#define iF (G1.iF )
622#define ahash (G1.ahash ) 624#define ahash (G1.ahash )
623#define fnhash (G1.fnhash ) 625#define fnhash (G1.fnhash )
624#define vhash (G1.vhash ) 626#define vhash (G1.vhash )
@@ -644,6 +646,7 @@ struct globals2 {
644#define t_string (G.t_string ) 646#define t_string (G.t_string )
645#define t_lineno (G.t_lineno ) 647#define t_lineno (G.t_lineno )
646#define intvar (G.intvar ) 648#define intvar (G.intvar )
649#define iF (G.iF )
647#define fsplitter (G.fsplitter ) 650#define fsplitter (G.fsplitter )
648#define rsplitter (G.rsplitter ) 651#define rsplitter (G.rsplitter )
649#define g_buf (G.g_buf ) 652#define g_buf (G.g_buf )
@@ -978,6 +981,11 @@ static var *setvar_s(var *v, const char *value)
978 return setvar_p(v, (value && *value) ? xstrdup(value) : NULL); 981 return setvar_p(v, (value && *value) ? xstrdup(value) : NULL);
979} 982}
980 983
984static var *setvar_sn(var *v, const char *value, int len)
985{
986 return setvar_p(v, (value && *value && len > 0) ? xstrndup(value, len) : NULL);
987}
988
981/* same as setvar_s but sets USER flag */ 989/* same as setvar_s but sets USER flag */
982static var *setvar_u(var *v, const char *value) 990static var *setvar_u(var *v, const char *value)
983{ 991{
@@ -1005,6 +1013,11 @@ static var *setvar_i(var *v, double value)
1005 return v; 1013 return v;
1006} 1014}
1007 1015
1016static void setvar_ERRNO(void)
1017{
1018 setvar_i(intvar[ERRNO], errno);
1019}
1020
1008static const char *getvar_s(var *v) 1021static const char *getvar_s(var *v)
1009{ 1022{
1010 /* if v is numeric and has no cached string, convert it to string */ 1023 /* if v is numeric and has no cached string, convert it to string */
@@ -1290,7 +1303,7 @@ static uint32_t next_token(uint32_t expected)
1290 save_tclass = tc; 1303 save_tclass = tc;
1291 save_info = t_info; 1304 save_info = t_info;
1292 tc = TC_BINOPX; 1305 tc = TC_BINOPX;
1293 t_info = OC_CONCAT | SS | P(35); 1306 t_info = OC_CONCAT | SS | PRECEDENCE(35);
1294 } 1307 }
1295 1308
1296 t_tclass = tc; 1309 t_tclass = tc;
@@ -1350,9 +1363,8 @@ static node *parse_expr(uint32_t term_tc)
1350{ 1363{
1351 node sn; 1364 node sn;
1352 node *cn = &sn; 1365 node *cn = &sn;
1353 node *vn, *glptr; 1366 node *glptr;
1354 uint32_t tc, expected_tc; 1367 uint32_t tc, expected_tc;
1355 var *v;
1356 1368
1357 debug_printf_parse("%s() term_tc(%x):", __func__, term_tc); 1369 debug_printf_parse("%s() term_tc(%x):", __func__, term_tc);
1358 debug_parse_print_tc(term_tc); 1370 debug_parse_print_tc(term_tc);
@@ -1363,11 +1375,12 @@ static node *parse_expr(uint32_t term_tc)
1363 expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP | term_tc; 1375 expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP | term_tc;
1364 1376
1365 while (!((tc = next_token(expected_tc)) & term_tc)) { 1377 while (!((tc = next_token(expected_tc)) & term_tc)) {
1378 node *vn;
1366 1379
1367 if (glptr && (t_info == TI_LESS)) { 1380 if (glptr && (t_info == TI_LESS)) {
1368 /* input redirection (<) attached to glptr node */ 1381 /* input redirection (<) attached to glptr node */
1369 debug_printf_parse("%s: input redir\n", __func__); 1382 debug_printf_parse("%s: input redir\n", __func__);
1370 cn = glptr->l.n = new_node(OC_CONCAT | SS | P(37)); 1383 cn = glptr->l.n = new_node(OC_CONCAT | SS | PRECEDENCE(37));
1371 cn->a.n = glptr; 1384 cn->a.n = glptr;
1372 expected_tc = TS_OPERAND | TS_UOPPRE; 1385 expected_tc = TS_OPERAND | TS_UOPPRE;
1373 glptr = NULL; 1386 glptr = NULL;
@@ -1379,24 +1392,42 @@ static node *parse_expr(uint32_t term_tc)
1379 * previous operators with higher priority */ 1392 * previous operators with higher priority */
1380 vn = cn; 1393 vn = cn;
1381 while (((t_info & PRIMASK) > (vn->a.n->info & PRIMASK2)) 1394 while (((t_info & PRIMASK) > (vn->a.n->info & PRIMASK2))
1382 || ((t_info == vn->info) && t_info == TI_COLON) 1395 || (t_info == vn->info && t_info == TI_COLON)
1383 ) { 1396 ) {
1384 vn = vn->a.n; 1397 vn = vn->a.n;
1385 if (!vn->a.n) syntax_error(EMSG_UNEXP_TOKEN); 1398 if (!vn->a.n) syntax_error(EMSG_UNEXP_TOKEN);
1386 } 1399 }
1387 if (t_info == TI_TERNARY) 1400 if (t_info == TI_TERNARY)
1388//TODO: why? 1401//TODO: why?
1389 t_info += P(6); 1402 t_info += PRECEDENCE(6);
1390 cn = vn->a.n->r.n = new_node(t_info); 1403 cn = vn->a.n->r.n = new_node(t_info);
1391 cn->a.n = vn->a.n; 1404 cn->a.n = vn->a.n;
1392 if (tc & TS_BINOP) { 1405 if (tc & TS_BINOP) {
1393 cn->l.n = vn; 1406 cn->l.n = vn;
1394//FIXME: this is the place to detect and reject assignments to non-lvalues. 1407
1395//Currently we allow "assignments" to consts and temporaries, nonsense like this: 1408 /* Prevent:
1396// awk 'BEGIN { "qwe" = 1 }' 1409 * awk 'BEGIN { "qwe" = 1 }'
1397// awk 'BEGIN { 7 *= 7 }' 1410 * awk 'BEGIN { 7 *= 7 }'
1398// awk 'BEGIN { length("qwe") = 1 }' 1411 * awk 'BEGIN { length("qwe") = 1 }'
1399// awk 'BEGIN { (1+1) += 3 }' 1412 * awk 'BEGIN { (1+1) += 3 }'
1413 */
1414 /* Assignment? (including *= and friends) */
1415 if (((t_info & OPCLSMASK) == OC_MOVE)
1416 || ((t_info & OPCLSMASK) == OC_REPLACE)
1417 ) {
1418 debug_printf_parse("%s: MOVE/REPLACE vn->info:%08x\n", __func__, vn->info);
1419 /* Left side is a (variable or array element)
1420 * or function argument
1421 * or $FIELD ?
1422 */
1423 if ((vn->info & OPCLSMASK) != OC_VAR
1424 && (vn->info & OPCLSMASK) != OC_FNARG
1425 && (vn->info & OPCLSMASK) != OC_FIELD
1426 ) {
1427 syntax_error(EMSG_UNEXP_TOKEN); /* no. bad */
1428 }
1429 }
1430
1400 expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP; 1431 expected_tc = TS_OPERAND | TS_UOPPRE | TC_REGEXP;
1401 if (t_info == TI_PGETLINE) { 1432 if (t_info == TI_PGETLINE) {
1402 /* it's a pipe */ 1433 /* it's a pipe */
@@ -1432,6 +1463,8 @@ static node *parse_expr(uint32_t term_tc)
1432 /* one should be very careful with switch on tclass - 1463 /* one should be very careful with switch on tclass -
1433 * only simple tclasses should be used (TC_xyz, not TS_xyz) */ 1464 * only simple tclasses should be used (TC_xyz, not TS_xyz) */
1434 switch (tc) { 1465 switch (tc) {
1466 var *v;
1467
1435 case TC_VARIABLE: 1468 case TC_VARIABLE:
1436 case TC_ARRAY: 1469 case TC_ARRAY:
1437 debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__); 1470 debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__);
@@ -1452,14 +1485,14 @@ static node *parse_expr(uint32_t term_tc)
1452 case TC_NUMBER: 1485 case TC_NUMBER:
1453 case TC_STRING: 1486 case TC_STRING:
1454 debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__); 1487 debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__);
1455 cn->info = OC_VAR; 1488 cn->info = OC_CONST;
1456 v = cn->l.v = xzalloc(sizeof(var)); 1489 v = cn->l.v = xzalloc(sizeof(var));
1457 if (tc & TC_NUMBER) 1490 if (tc & TC_NUMBER) {
1458 setvar_i(v, t_double); 1491 setvar_i(v, t_double);
1459 else { 1492 } else {
1460 setvar_s(v, t_string); 1493 setvar_s(v, t_string);
1461 expected_tc &= ~TC_UOPPOST; /* "str"++ is not allowed */
1462 } 1494 }
1495 expected_tc &= ~TC_UOPPOST; /* NUM++, "str"++ not allowed */
1463 break; 1496 break;
1464 1497
1465 case TC_REGEXP: 1498 case TC_REGEXP:
@@ -1931,9 +1964,9 @@ static void fsrealloc(int size)
1931{ 1964{
1932 int i, newsize; 1965 int i, newsize;
1933 1966
1934 if (size >= maxfields) { 1967 if ((unsigned)size >= maxfields) {
1935 /* Sanity cap, easier than catering for overflows */ 1968 /* Sanity cap, easier than catering for over/underflows */
1936 if (size > 0xffffff) 1969 if ((unsigned)size > 0xffffff)
1937 bb_die_memory_exhausted(); 1970 bb_die_memory_exhausted();
1938 1971
1939 i = maxfields; 1972 i = maxfields;
@@ -2049,13 +2082,17 @@ static int awk_split(const char *s, node *spl, char **slist)
2049 } 2082 }
2050 return n; 2083 return n;
2051 } 2084 }
2052 /* space split */ 2085 /* space split: "In the special case that FS is a single space,
2086 * fields are separated by runs of spaces and/or tabs and/or newlines"
2087 */
2053 while (*s) { 2088 while (*s) {
2054 s = skip_whitespace(s); 2089 /* s = skip_whitespace(s); -- WRONG (also skips \v \f \r) */
2090 while (*s == ' ' || *s == '\t' || *s == '\n')
2091 s++;
2055 if (!*s) 2092 if (!*s)
2056 break; 2093 break;
2057 n++; 2094 n++;
2058 while (*s && !isspace(*s)) 2095 while (*s && !(*s == ' ' || *s == '\t' || *s == '\n'))
2059 *s1++ = *s++; 2096 *s1++ = *s++;
2060 *s1++ = '\0'; 2097 *s1++ = '\0';
2061 } 2098 }
@@ -2232,9 +2269,9 @@ static int awk_getline(rstream *rsm, var *v)
2232{ 2269{
2233 char *b; 2270 char *b;
2234 regmatch_t pmatch[1]; 2271 regmatch_t pmatch[1];
2235 int size, a, p, pp = 0; 2272 int p, pp;
2236 int fd, so, eo, r, rp; 2273 int fd, so, eo, retval, rp;
2237 char c, *m, *s; 2274 char *m, *s;
2238 2275
2239 debug_printf_eval("entered %s()\n", __func__); 2276 debug_printf_eval("entered %s()\n", __func__);
2240 2277
@@ -2243,23 +2280,22 @@ static int awk_getline(rstream *rsm, var *v)
2243 */ 2280 */
2244 fd = fileno(rsm->F); 2281 fd = fileno(rsm->F);
2245 m = rsm->buffer; 2282 m = rsm->buffer;
2246 a = rsm->adv; 2283 if (!m)
2284 m = qrealloc(m, 256, &rsm->size);
2247 p = rsm->pos; 2285 p = rsm->pos;
2248 size = rsm->size;
2249 c = (char) rsplitter.n.info;
2250 rp = 0; 2286 rp = 0;
2251 2287 pp = 0;
2252 if (!m)
2253 m = qrealloc(m, 256, &size);
2254 2288
2255 do { 2289 do {
2256 b = m + a; 2290 b = m + rsm->adv;
2257 so = eo = p; 2291 so = eo = p;
2258 r = 1; 2292 retval = 1;
2259 if (p > 0) { 2293 if (p > 0) {
2294 char c = (char) rsplitter.n.info;
2260 if (rsplitter.n.info == TI_REGEXP) { 2295 if (rsplitter.n.info == TI_REGEXP) {
2261 if (regexec(icase ? rsplitter.n.r.ire : rsplitter.n.l.re, 2296 if (regexec(icase ? rsplitter.n.r.ire : rsplitter.n.l.re,
2262 b, 1, pmatch, 0) == 0) { 2297 b, 1, pmatch, 0) == 0
2298 ) {
2263 so = pmatch[0].rm_so; 2299 so = pmatch[0].rm_so;
2264 eo = pmatch[0].rm_eo; 2300 eo = pmatch[0].rm_eo;
2265 if (b[eo] != '\0') 2301 if (b[eo] != '\0')
@@ -2288,45 +2324,38 @@ static int awk_getline(rstream *rsm, var *v)
2288 } 2324 }
2289 } 2325 }
2290 2326
2291 if (a > 0) { 2327 if (rsm->adv > 0) {
2292 memmove(m, m+a, p+1); 2328 memmove(m, m+rsm->adv, p+1);
2293 b = m; 2329 b = m;
2294 a = 0; 2330 rsm->adv = 0;
2295 } 2331 }
2296 2332
2297 m = qrealloc(m, a+p+128, &size); 2333 b = m = qrealloc(m, p+128, &rsm->size);
2298 b = m + a;
2299 pp = p; 2334 pp = p;
2300 p += safe_read(fd, b+p, size-p-1); 2335 p += safe_read(fd, b+p, rsm->size - p - 1);
2301 if (p < pp) { 2336 if (p < pp) {
2302 p = 0; 2337 p = 0;
2303 r = 0; 2338 retval = 0;
2304 setvar_i(intvar[ERRNO], errno); 2339 setvar_ERRNO();
2305 } 2340 }
2306 b[p] = '\0'; 2341 b[p] = '\0';
2307
2308 } while (p > pp); 2342 } while (p > pp);
2309 2343
2310 if (p == 0) { 2344 if (p == 0) {
2311 r--; 2345 retval--;
2312 } else { 2346 } else {
2313 c = b[so]; b[so] = '\0'; 2347 setvar_sn(v, b+rp, so-rp);
2314 setvar_s(v, b+rp);
2315 v->type |= VF_USER; 2348 v->type |= VF_USER;
2316 b[so] = c; 2349 setvar_sn(intvar[RT], b+so, eo-so);
2317 c = b[eo]; b[eo] = '\0';
2318 setvar_s(intvar[RT], b+so);
2319 b[eo] = c;
2320 } 2350 }
2321 2351
2322 rsm->buffer = m; 2352 rsm->buffer = m;
2323 rsm->adv = a + eo; 2353 rsm->adv += eo;
2324 rsm->pos = p - eo; 2354 rsm->pos = p - eo;
2325 rsm->size = size;
2326 2355
2327 debug_printf_eval("returning from %s(): %d\n", __func__, r); 2356 debug_printf_eval("returning from %s(): %d\n", __func__, retval);
2328 2357
2329 return r; 2358 return retval;
2330} 2359}
2331 2360
2332/* formatted output into an allocated buffer, return ptr to buffer */ 2361/* formatted output into an allocated buffer, return ptr to buffer */
@@ -2382,7 +2411,7 @@ static char *awk_printf(node *n, size_t *len)
2382 while (1) { 2411 while (1) {
2383 if (isalpha(c)) 2412 if (isalpha(c))
2384 break; 2413 break;
2385 if (c == '*') 2414 if (c == '*') /* gawk supports %*d and %*.*f, we don't... */
2386 syntax_error("%*x formats are not supported"); 2415 syntax_error("%*x formats are not supported");
2387 c = *++f; 2416 c = *++f;
2388 if (!c) { /* "....%...." and no letter found after % */ 2417 if (!c) { /* "....%...." and no letter found after % */
@@ -2415,12 +2444,18 @@ static char *awk_printf(node *n, size_t *len)
2415 double d = getvar_i(arg); 2444 double d = getvar_i(arg);
2416 if (strchr("diouxX", c)) { 2445 if (strchr("diouxX", c)) {
2417//TODO: make it wider here (%x -> %llx etc)? 2446//TODO: make it wider here (%x -> %llx etc)?
2447//Can even print the value into a temp string with %.0f,
2448//then replace diouxX with s and print that string.
2449//This will correctly print even very large numbers,
2450//but some replacements are not equivalent:
2451//%09d -> %09s: breaks zero-padding;
2452//%+d -> %+s: won't prepend +; etc
2418 s = xasprintf(s, (int)d); 2453 s = xasprintf(s, (int)d);
2419 } else if (strchr("eEfFgGaA", c)) { 2454 } else if (strchr("eEfFgGaA", c)) {
2420 s = xasprintf(s, d); 2455 s = xasprintf(s, d);
2421 } else { 2456 } else {
2422//TODO: GNU Awk 5.0.1: printf "%W" prints "%W", does not error out 2457 /* gawk 5.1.1 printf("%W") prints "%W", does not error out */
2423 syntax_error(EMSG_INV_FMT); 2458 s = xstrndup(s, f - s);
2424 } 2459 }
2425 } 2460 }
2426 slen = strlen(s); 2461 slen = strlen(s);
@@ -2457,9 +2492,9 @@ static char *awk_printf(node *n, size_t *len)
2457 * store result into (dest), return number of substitutions. 2492 * store result into (dest), return number of substitutions.
2458 * If nm = 0, replace all matches. 2493 * If nm = 0, replace all matches.
2459 * If src or dst is NULL, use $0. 2494 * If src or dst is NULL, use $0.
2460 * If subexp != 0, enable subexpression matching (\1-\9). 2495 * If subexp != 0, enable subexpression matching (\0-\9).
2461 */ 2496 */
2462static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int subexp) 2497static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,int subexp*/)
2463{ 2498{
2464 char *resbuf; 2499 char *resbuf;
2465 const char *sp; 2500 const char *sp;
@@ -2467,17 +2502,48 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int
2467 int regexec_flags; 2502 int regexec_flags;
2468 regmatch_t pmatch[10]; 2503 regmatch_t pmatch[10];
2469 regex_t sreg, *regex; 2504 regex_t sreg, *regex;
2470 2505 /* True only if called to implement gensub(): */
2506 int subexp = (src != dest);
2507#if defined(REG_STARTEND)
2508 const char *src_string;
2509 size_t src_strlen;
2510 regexec_flags = REG_STARTEND;
2511#else
2512 regexec_flags = 0;
2513#endif
2471 resbuf = NULL; 2514 resbuf = NULL;
2472 residx = 0; 2515 residx = 0;
2473 match_no = 0; 2516 match_no = 0;
2474 regexec_flags = 0;
2475 regex = as_regex(rn, &sreg); 2517 regex = as_regex(rn, &sreg);
2476 sp = getvar_s(src ? src : intvar[F0]); 2518 sp = getvar_s(src ? src : intvar[F0]);
2519#if defined(REG_STARTEND)
2520 src_string = sp;
2521 src_strlen = strlen(src_string);
2522#endif
2477 replen = strlen(repl); 2523 replen = strlen(repl);
2478 while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) { 2524 for (;;) {
2479 int so = pmatch[0].rm_so; 2525 int so, eo;
2480 int eo = pmatch[0].rm_eo; 2526
2527#if defined(REG_STARTEND)
2528// REG_STARTEND: "This flag is a BSD extension, not present in POSIX"
2529 size_t start_ofs = sp - src_string;
2530 pmatch[0].rm_so = start_ofs;
2531 pmatch[0].rm_eo = src_strlen;
2532 if (regexec(regex, src_string, 10, pmatch, regexec_flags) != 0)
2533 break;
2534 eo = pmatch[0].rm_eo - start_ofs;
2535 so = pmatch[0].rm_so - start_ofs;
2536#else
2537// BUG:
2538// gsub(/\<b*/,"") on "abc" matches empty string at "a...",
2539// advances sp one char (see "Empty match" comment later) to "bc"
2540// ... and erroneously matches "b" even though it is NOT at the word start.
2541 enum { start_ofs = 0 };
2542 if (regexec(regex, sp, 10, pmatch, regexec_flags) != 0)
2543 break;
2544 so = pmatch[0].rm_so;
2545 eo = pmatch[0].rm_eo;
2546#endif
2481 2547
2482 //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp); 2548 //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp);
2483 resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize); 2549 resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize);
@@ -2485,51 +2551,41 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int
2485 residx += eo; 2551 residx += eo;
2486 if (++match_no >= nm) { 2552 if (++match_no >= nm) {
2487 const char *s; 2553 const char *s;
2488 int nbs; 2554 int bslash;
2489 2555
2490 /* replace */ 2556 /* replace */
2491 residx -= (eo - so); 2557 residx -= (eo - so);
2492 nbs = 0; 2558 bslash = 0;
2493 for (s = repl; *s; s++) { 2559 for (s = repl; *s; s++) {
2494 char c = resbuf[residx++] = *s; 2560 char c = *s;
2495 if (c == '\\') { 2561 if (c == '\\' && s[1]) {
2496 nbs++; 2562 bslash ^= 1;
2497 continue; 2563 if (bslash)
2564 continue;
2498 } 2565 }
2499 if (c == '&' || (subexp && c >= '0' && c <= '9')) { 2566 if ((!bslash && c == '&')
2500 int j; 2567 || (subexp && bslash && c >= '0' && c <= '9')
2501 residx -= ((nbs + 3) >> 1); 2568 ) {
2502 j = 0; 2569 int n, j = 0;
2503 if (c != '&') { 2570 if (c != '&') {
2504 j = c - '0'; 2571 j = c - '0';
2505 nbs++;
2506 } 2572 }
2507 if (nbs % 2) { 2573 n = pmatch[j].rm_eo - pmatch[j].rm_so;
2508 resbuf[residx++] = c; 2574 resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize);
2509 } else { 2575 memcpy(resbuf + residx, sp + pmatch[j].rm_so - start_ofs, n);
2510 int n = pmatch[j].rm_eo - pmatch[j].rm_so; 2576 residx += n;
2511 resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize); 2577 } else
2512 memcpy(resbuf + residx, sp + pmatch[j].rm_so, n); 2578 resbuf[residx++] = c;
2513 residx += n; 2579 bslash = 0;
2514 }
2515 }
2516 nbs = 0;
2517 } 2580 }
2518 } 2581 }
2519 2582
2520 regexec_flags = REG_NOTBOL;
2521 sp += eo; 2583 sp += eo;
2522 if (match_no == nm) 2584 if (match_no == nm)
2523 break; 2585 break;
2524 if (eo == so) { 2586 if (eo == so) {
2525 /* Empty match (e.g. "b*" will match anywhere). 2587 /* Empty match (e.g. "b*" will match anywhere).
2526 * Advance by one char. */ 2588 * Advance by one char. */
2527//BUG (bug 1333):
2528//gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc"
2529//... and will erroneously match "b" even though it is NOT at the word start.
2530//we need REG_NOTBOW but it does not exist...
2531//TODO: if EXTRA_COMPAT=y, use GNU matching and re_search,
2532//it should be able to do it correctly.
2533 /* Subtle: this is safe only because 2589 /* Subtle: this is safe only because
2534 * qrealloc allocated at least one extra byte */ 2590 * qrealloc allocated at least one extra byte */
2535 resbuf[residx] = *sp; 2591 resbuf[residx] = *sp;
@@ -2538,6 +2594,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest, int
2538 sp++; 2594 sp++;
2539 residx++; 2595 residx++;
2540 } 2596 }
2597 regexec_flags |= REG_NOTBOL;
2541 } 2598 }
2542 2599
2543 resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize); 2600 resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize);
@@ -2669,8 +2726,6 @@ static NOINLINE var *exec_builtin(node *op, var *res)
2669 } 2726 }
2670 2727
2671 case B_ss: { 2728 case B_ss: {
2672 char *s;
2673
2674 l = strlen(as[0]); 2729 l = strlen(as[0]);
2675 i = getvar_i(av[1]) - 1; 2730 i = getvar_i(av[1]) - 1;
2676 if (i > l) 2731 if (i > l)
@@ -2680,8 +2735,7 @@ static NOINLINE var *exec_builtin(node *op, var *res)
2680 n = (nargs > 2) ? getvar_i(av[2]) : l-i; 2735 n = (nargs > 2) ? getvar_i(av[2]) : l-i;
2681 if (n < 0) 2736 if (n < 0)
2682 n = 0; 2737 n = 0;
2683 s = xstrndup(as[0]+i, n); 2738 setvar_sn(res, as[0]+i, n);
2684 setvar_p(res, s);
2685 break; 2739 break;
2686 } 2740 }
2687 2741
@@ -2758,8 +2812,7 @@ static NOINLINE var *exec_builtin(node *op, var *res)
2758 i = strftime(g_buf, MAXVARFMT, 2812 i = strftime(g_buf, MAXVARFMT,
2759 ((nargs > 0) ? as[0] : "%a %b %d %H:%M:%S %Z %Y"), 2813 ((nargs > 0) ? as[0] : "%a %b %d %H:%M:%S %Z %Y"),
2760 localtime(&tt)); 2814 localtime(&tt));
2761 g_buf[i] = '\0'; 2815 setvar_sn(res, g_buf, i);
2762 setvar_s(res, g_buf);
2763 break; 2816 break;
2764 2817
2765 case B_mt: 2818 case B_mt:
@@ -2770,16 +2823,16 @@ static NOINLINE var *exec_builtin(node *op, var *res)
2770 res = do_match(an[1], as[0]); 2823 res = do_match(an[1], as[0]);
2771 break; 2824 break;
2772 2825
2773 case B_ge: 2826 case B_ge: /* gensub(regex, repl, matchnum, string) */
2774 awk_sub(an[0], as[1], getvar_i(av[2]), av[3], res, TRUE); 2827 awk_sub(an[0], as[1], /*matchnum:*/getvar_i(av[2]), /*src:*/av[3], /*dst:*/res/*, TRUE*/);
2775 break; 2828 break;
2776 2829
2777 case B_gs: 2830 case B_gs: /* gsub(regex, repl, string) */
2778 setvar_i(res, awk_sub(an[0], as[1], 0, av[2], av[2], FALSE)); 2831 setvar_i(res, awk_sub(an[0], as[1], /*matchnum:all*/0, /*src:*/av[2], /*dst:*/av[2]/*, FALSE*/));
2779 break; 2832 break;
2780 2833
2781 case B_su: 2834 case B_su: /* sub(regex, repl, string) */
2782 setvar_i(res, awk_sub(an[0], as[1], 1, av[2], av[2], FALSE)); 2835 setvar_i(res, awk_sub(an[0], as[1], /*matchnum:first*/1, /*src:*/av[2], /*dst:*/av[2]/*, FALSE*/));
2783 break; 2836 break;
2784 } 2837 }
2785 2838
@@ -2796,7 +2849,7 @@ static NOINLINE var *exec_builtin(node *op, var *res)
2796 2849
2797/* if expr looks like "var=value", perform assignment and return 1, 2850/* if expr looks like "var=value", perform assignment and return 1,
2798 * otherwise return 0 */ 2851 * otherwise return 0 */
2799static int is_assignment(const char *expr) 2852static int try_to_assign(const char *expr)
2800{ 2853{
2801 char *exprc, *val; 2854 char *exprc, *val;
2802 2855
@@ -2825,42 +2878,55 @@ static void set_text_mode(FILE *f)
2825#endif 2878#endif
2826 2879
2827/* switch to next input file */ 2880/* switch to next input file */
2828static rstream *next_input_file(void) 2881static int next_input_file(void)
2829{ 2882{
2830#define rsm (G.next_input_file__rsm) 2883#define input_file_seen (G.next_input_file__input_file_seen)
2831#define files_happen (G.next_input_file__files_happen) 2884#define argind (G.next_input_file__argind)
2885 const char *fname;
2832 2886
2833 const char *fname, *ind; 2887 if (iF.F) {
2834 2888 fclose(iF.F);
2835 if (rsm.F) 2889 iF.F = NULL;
2836 fclose(rsm.F); 2890 iF.pos = iF.adv = 0;
2837 rsm.F = NULL; 2891 }
2838 rsm.pos = rsm.adv = 0;
2839 2892
2840 for (;;) { 2893 for (;;) {
2841 if (getvar_i(intvar[ARGIND])+1 >= getvar_i(intvar[ARGC])) { 2894 /* GNU Awk 5.1.1 does not _read_ ARGIND (but does read ARGC).
2842 if (files_happen) 2895 * It only sets ARGIND to 1, 2, 3... for every command-line filename
2843 return NULL; 2896 * (VAR=VAL params cause a gap in numbering).
2897 * If there are none and stdin is used, then ARGIND is not modified:
2898 * if it is set by e.g. 'BEGIN { ARGIND="foo" }', that value will
2899 * still be there.
2900 */
2901 argind++;
2902 if (argind >= getvar_i(intvar[ARGC])) {
2903 if (input_file_seen)
2904 return FALSE;
2844 fname = "-"; 2905 fname = "-";
2845 rsm.F = stdin; 2906 iF.F = stdin;
2846 break; 2907 break;
2847 } 2908 }
2848 ind = getvar_s(incvar(intvar[ARGIND])); 2909 fname = getvar_s(findvar(iamarray(intvar[ARGV]), utoa(argind)));
2849 fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind)); 2910 if (fname && *fname) {
2850 if (fname && *fname && !is_assignment(fname)) { 2911 /* "If a filename on the command line has the form
2851 rsm.F = xfopen_stdin(fname); 2912 * var=val it is treated as a variable assignment"
2913 */
2914 if (try_to_assign(fname))
2915 continue;
2916 iF.F = xfopen_stdin(fname);
2917 setvar_i(intvar[ARGIND], argind);
2852 break; 2918 break;
2853 } 2919 }
2854 } 2920 }
2855#if ENABLE_PLATFORM_MINGW32 2921#if ENABLE_PLATFORM_MINGW32
2856 set_text_mode(rsm.F); 2922 set_text_mode(iF.F);
2857#endif 2923#endif
2858 2924
2859 files_happen = TRUE;
2860 setvar_s(intvar[FILENAME], fname); 2925 setvar_s(intvar[FILENAME], fname);
2861 return &rsm; 2926 input_file_seen = TRUE;
2862#undef rsm 2927 return TRUE;
2863#undef files_happen 2928#undef argind
2929#undef input_file_seen
2864} 2930}
2865 2931
2866#if ENABLE_PLATFORM_MINGW32 2932#if ENABLE_PLATFORM_MINGW32
@@ -2914,6 +2980,7 @@ static var *evaluate(node *op, var *res)
2914 uint32_t opinfo; 2980 uint32_t opinfo;
2915 int opn; 2981 int opn;
2916 node *op1; 2982 node *op1;
2983 var *old_Fields_ptr;
2917 2984
2918 opinfo = op->info; 2985 opinfo = op->info;
2919 opn = (opinfo & OPNMASK); 2986 opn = (opinfo & OPNMASK);
@@ -2922,10 +2989,16 @@ static var *evaluate(node *op, var *res)
2922 debug_printf_eval("opinfo:%08x opn:%08x\n", opinfo, opn); 2989 debug_printf_eval("opinfo:%08x opn:%08x\n", opinfo, opn);
2923 2990
2924 /* execute inevitable things */ 2991 /* execute inevitable things */
2992 old_Fields_ptr = NULL;
2925 if (opinfo & OF_RES1) { 2993 if (opinfo & OF_RES1) {
2926 if ((opinfo & OF_REQUIRED) && !op1) 2994 if ((opinfo & OF_REQUIRED) && !op1)
2927 syntax_error(EMSG_TOO_FEW_ARGS); 2995 syntax_error(EMSG_TOO_FEW_ARGS);
2928 L.v = evaluate(op1, TMPVAR0); 2996 L.v = evaluate(op1, TMPVAR0);
2997 /* Does L.v point to $n variable? */
2998 if ((size_t)(L.v - Fields) < maxfields) {
2999 /* yes, remember where Fields[] is */
3000 old_Fields_ptr = Fields;
3001 }
2929 if (opinfo & OF_STR1) { 3002 if (opinfo & OF_STR1) {
2930 L.s = getvar_s(L.v); 3003 L.s = getvar_s(L.v);
2931 debug_printf_eval("L.s:'%s'\n", L.s); 3004 debug_printf_eval("L.s:'%s'\n", L.s);
@@ -2944,8 +3017,15 @@ static var *evaluate(node *op, var *res)
2944 */ 3017 */
2945 if (opinfo & OF_RES2) { 3018 if (opinfo & OF_RES2) {
2946 R.v = evaluate(op->r.n, TMPVAR1); 3019 R.v = evaluate(op->r.n, TMPVAR1);
2947 //TODO: L.v may be invalid now, set L.v to NULL to catch bugs? 3020 /* Seen in $5=$$5=$0:
2948 //L.v = NULL; 3021 * Evaluation of R.v ($$5=$0 expression)
3022 * made L.v ($5) invalid. It's detected here.
3023 */
3024 if (old_Fields_ptr) {
3025 //if (old_Fields_ptr != Fields)
3026 // debug_printf_eval("L.v moved\n");
3027 L.v += Fields - old_Fields_ptr;
3028 }
2949 if (opinfo & OF_STR2) { 3029 if (opinfo & OF_STR2) {
2950 R.s = getvar_s(R.v); 3030 R.s = getvar_s(R.v);
2951 debug_printf_eval("R.s:'%s'\n", R.s); 3031 debug_printf_eval("R.s:'%s'\n", R.s);
@@ -3111,6 +3191,8 @@ static var *evaluate(node *op, var *res)
3111 3191
3112 /* -- recursive node type -- */ 3192 /* -- recursive node type -- */
3113 3193
3194 case XC( OC_CONST ):
3195 debug_printf_eval("CONST ");
3114 case XC( OC_VAR ): 3196 case XC( OC_VAR ):
3115 debug_printf_eval("VAR\n"); 3197 debug_printf_eval("VAR\n");
3116 L.v = op->l.v; 3198 L.v = op->l.v;
@@ -3154,7 +3236,7 @@ static var *evaluate(node *op, var *res)
3154 /* make sure that we never return a temp var */ 3236 /* make sure that we never return a temp var */
3155 if (L.v == TMPVAR0) 3237 if (L.v == TMPVAR0)
3156 L.v = res; 3238 L.v = res;
3157 /* if source is a temporary string, jusk relink it to dest */ 3239 /* if source is a temporary string, just relink it to dest */
3158 if (R.v == TMPVAR1 3240 if (R.v == TMPVAR1
3159 && !(R.v->type & VF_NUMBER) 3241 && !(R.v->type & VF_NUMBER)
3160 /* Why check !NUMBER? if R.v is a number but has cached R.v->string, 3242 /* Why check !NUMBER? if R.v is a number but has cached R.v->string,
@@ -3240,13 +3322,13 @@ static var *evaluate(node *op, var *res)
3240#endif 3322#endif
3241 } 3323 }
3242 } else { 3324 } else {
3243 if (!iF) 3325 if (!iF.F)
3244 iF = next_input_file(); 3326 next_input_file();
3245 rsm = iF; 3327 rsm = &iF;
3246 } 3328 }
3247 3329
3248 if (!rsm || !rsm->F) { 3330 if (!rsm->F) {
3249 setvar_i(intvar[ERRNO], errno); 3331 setvar_ERRNO();
3250 setvar_i(res, -1); 3332 setvar_i(res, -1);
3251 break; 3333 break;
3252 } 3334 }
@@ -3395,16 +3477,18 @@ static var *evaluate(node *op, var *res)
3395 */ 3477 */
3396 if (rsm->F) 3478 if (rsm->F)
3397 err = rsm->is_pipe ? pclose(rsm->F) : fclose(rsm->F); 3479 err = rsm->is_pipe ? pclose(rsm->F) : fclose(rsm->F);
3398//TODO: fix this case:
3399// $ awk 'BEGIN { print close(""); print ERRNO }'
3400// -1
3401// close of redirection that was never opened
3402// (we print 0, 0)
3403 free(rsm->buffer); 3480 free(rsm->buffer);
3404 hash_remove(fdhash, L.s); 3481 hash_remove(fdhash, L.s);
3482 } else {
3483 err = -1;
3484 /* gawk 'BEGIN { print close(""); print ERRNO }'
3485 * -1
3486 * close of redirection that was never opened
3487 */
3488 errno = ENOENT;
3405 } 3489 }
3406 if (err) 3490 if (err)
3407 setvar_i(intvar[ERRNO], errno); 3491 setvar_ERRNO();
3408 R_d = (double)err; 3492 R_d = (double)err;
3409 break; 3493 break;
3410 } 3494 }
@@ -3584,8 +3668,6 @@ static var *evaluate(node *op, var *res)
3584#undef sreg 3668#undef sreg
3585} 3669}
3586 3670
3587/* -------- main & co. -------- */
3588
3589static int awk_exit(void) 3671static int awk_exit(void)
3590{ 3672{
3591 unsigned i; 3673 unsigned i;
@@ -3678,7 +3760,7 @@ int awk_main(int argc UNUSED_PARAM, char **argv)
3678 setvar_s(intvar[FS], opt_F); 3760 setvar_s(intvar[FS], opt_F);
3679 } 3761 }
3680 while (list_v) { 3762 while (list_v) {
3681 if (!is_assignment(llist_pop(&list_v))) 3763 if (!try_to_assign(llist_pop(&list_v)))
3682 bb_show_usage(); 3764 bb_show_usage();
3683 } 3765 }
3684 3766
@@ -3695,6 +3777,8 @@ int awk_main(int argc UNUSED_PARAM, char **argv)
3695 _setmode(fd, _O_TEXT); 3777 _setmode(fd, _O_TEXT);
3696#endif 3778#endif
3697 s = xmalloc_read(fd, NULL); /* it's NUL-terminated */ 3779 s = xmalloc_read(fd, NULL); /* it's NUL-terminated */
3780 if (!s)
3781 bb_perror_msg_and_die("read error from '%s'", g_progname);
3698 close(fd); 3782 close(fd);
3699 parse_program(s); 3783 parse_program(s);
3700 free(s); 3784 free(s);
@@ -3740,15 +3824,14 @@ int awk_main(int argc UNUSED_PARAM, char **argv)
3740 awk_exit(); 3824 awk_exit();
3741 3825
3742 /* input file could already be opened in BEGIN block */ 3826 /* input file could already be opened in BEGIN block */
3743 if (!iF) 3827 if (!iF.F)
3744 iF = next_input_file(); 3828 goto next_file; /* no, it wasn't, go try opening */
3745 3829 /* Iterate over input files */
3746 /* passing through input files */ 3830 for (;;) {
3747 while (iF) {
3748 nextfile = FALSE; 3831 nextfile = FALSE;
3749 setvar_i(intvar[FNR], 0); 3832 setvar_i(intvar[FNR], 0);
3750 3833
3751 while ((i = awk_getline(iF, intvar[F0])) > 0) { 3834 while ((i = awk_getline(&iF, intvar[F0])) > 0) {
3752 nextrec = FALSE; 3835 nextrec = FALSE;
3753 incvar(intvar[NR]); 3836 incvar(intvar[NR]);
3754 incvar(intvar[FNR]); 3837 incvar(intvar[FNR]);
@@ -3757,11 +3840,11 @@ int awk_main(int argc UNUSED_PARAM, char **argv)
3757 if (nextfile) 3840 if (nextfile)
3758 break; 3841 break;
3759 } 3842 }
3760
3761 if (i < 0) 3843 if (i < 0)
3762 syntax_error(strerror(errno)); 3844 syntax_error(strerror(errno));
3763 3845 next_file:
3764 iF = next_input_file(); 3846 if (!next_input_file())
3847 break;
3765 } 3848 }
3766 3849
3767 awk_exit(); 3850 awk_exit();