From 8817e285b7ce071a27c366a2a602d3ca162b08ba Mon Sep 17 00:00:00 2001 From: Ron Yorston Date: Sat, 7 Aug 2021 09:41:49 +0100 Subject: shuf: speed-up when limited output is requested A user noted that the following command was slower than they expected: busybox shuf -i "1500000000-$(date +%s)" -n 5 At time of writing the range contains 128 million values. On my system this takes 7.7s whereas 'shuf' from coreutils takes a handful of milliseconds. Optimise BusyBox 'shuf' for cases where -n is specified by stopping shuffling once the required number of lines have been processed. On my system the time for the example is reduced to 0.4s. function old new delta shuf_main 520 540 +20 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/0 up/down: 20/0) Total: 20 bytes v2: Code shrink. Since outlines <= numlines: - the loop in shuffle_lines() only needs to test the value of outlines; - shuffle_lines() can be called unconditionally. Update timing to allow for the 13 million seconds elapsed since v1. Signed-off-by: Ron Yorston Signed-off-by: Denys Vlasenko --- coreutils/shuf.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'coreutils') diff --git a/coreutils/shuf.c b/coreutils/shuf.c index fdbd3e9b2..3b2ba93cf 100644 --- a/coreutils/shuf.c +++ b/coreutils/shuf.c @@ -24,7 +24,7 @@ //usage: "\n -i L-H Treat numbers L-H as lines" //usage: "\n -n NUM Output at most NUM lines" //usage: "\n -o FILE Write to FILE, not standard output" -//usage: "\n -z End lines with zero byte, not newline" +//usage: "\n -z NUL terminated output" #include "libbb.h" @@ -39,8 +39,10 @@ /* * Use the Fisher-Yates shuffle algorithm on an array of lines. + * If the required number of output lines is less than the total + * we can stop shuffling early. */ -static void shuffle_lines(char **lines, unsigned numlines) +static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines) { unsigned i; unsigned r; @@ -48,7 +50,7 @@ static void shuffle_lines(char **lines, unsigned numlines) srand(monotonic_us()); - for (i = numlines-1; i > 0; i--) { + for (i = numlines-1; outlines > 0; i--, outlines--) { r = rand(); /* RAND_MAX can be as small as 32767 */ if (i > RAND_MAX) @@ -67,7 +69,7 @@ int shuf_main(int argc, char **argv) char *opt_i_str, *opt_n_str, *opt_o_str; unsigned i; char **lines; - unsigned numlines; + unsigned numlines, outlines; char eol; opts = getopt32(argv, "^" @@ -128,24 +130,23 @@ int shuf_main(int argc, char **argv) fclose_if_not_stdin(fp); } - if (numlines != 0) - shuffle_lines(lines, numlines); + outlines = numlines; + if (opts & OPT_n) { + outlines = xatou(opt_n_str); + if (outlines > numlines) + outlines = numlines; + } + + shuffle_lines(lines, numlines, outlines); if (opts & OPT_o) xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO); - if (opts & OPT_n) { - unsigned maxlines; - maxlines = xatou(opt_n_str); - if (numlines > maxlines) - numlines = maxlines; - } - eol = '\n'; if (opts & OPT_z) eol = '\0'; - for (i = 0; i < numlines; i++) { + for (i = numlines - outlines; i < numlines; i++) { if (opts & OPT_i) printf("%u%c", (unsigned)(uintptr_t)lines[i], eol); else -- cgit v1.2.3-55-g6feb From d59f539d577ebf6100f1292e27560514e8a18195 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 23 Aug 2021 15:48:22 +0200 Subject: shuf: tweak --help text Signed-off-by: Denys Vlasenko --- coreutils/shuf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'coreutils') diff --git a/coreutils/shuf.c b/coreutils/shuf.c index 3b2ba93cf..4b41d5e71 100644 --- a/coreutils/shuf.c +++ b/coreutils/shuf.c @@ -17,14 +17,14 @@ //kbuild:lib-$(CONFIG_SHUF) += shuf.o //usage:#define shuf_trivial_usage -//usage: "[-e|-i L-H] [-n NUM] [-o FILE] [-z] [FILE|ARG...]" +//usage: "[-n NUM] [-o FILE] [-z] [FILE | -e [ARG...] | -i L-H]" //usage:#define shuf_full_usage "\n\n" //usage: "Randomly permute lines\n" -//usage: "\n -e Treat ARGs as lines" -//usage: "\n -i L-H Treat numbers L-H as lines" //usage: "\n -n NUM Output at most NUM lines" //usage: "\n -o FILE Write to FILE, not standard output" //usage: "\n -z NUL terminated output" +//usage: "\n -e Treat ARGs as lines" +//usage: "\n -i L-H Treat numbers L-H as lines" #include "libbb.h" @@ -50,7 +50,7 @@ static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines) srand(monotonic_us()); - for (i = numlines-1; outlines > 0; i--, outlines--) { + for (i = numlines - 1; outlines > 0; i--, outlines--) { r = rand(); /* RAND_MAX can be as small as 32767 */ if (i > RAND_MAX) -- cgit v1.2.3-55-g6feb From 60f4843468213324cc348af9d8ec09648b6f6784 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 23 Aug 2021 15:52:34 +0200 Subject: shuf: with -i LOW-HIGH, do not allow any argv's function old new delta shuf_main 436 441 +5 Signed-off-by: Denys Vlasenko --- coreutils/shuf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'coreutils') diff --git a/coreutils/shuf.c b/coreutils/shuf.c index 4b41d5e71..fc9635147 100644 --- a/coreutils/shuf.c +++ b/coreutils/shuf.c @@ -92,6 +92,9 @@ int shuf_main(int argc, char **argv) char *dash; unsigned lo, hi; + if (argv[0]) + bb_show_usage(); + dash = strchr(opt_i_str, '-'); if (!dash) { bb_error_msg_and_die("bad range '%s'", opt_i_str); -- cgit v1.2.3-55-g6feb From 49a2e484b5bd3f6343e55bfed823d3ca6bd5d45a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 29 Aug 2021 14:39:01 +0200 Subject: shuf: in -i RANGE, accept numbers up to width of pointers function old new delta .rodata 108468 108474 +6 shuf_main 555 542 -13 Signed-off-by: Denys Vlasenko --- coreutils/shuf.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'coreutils') diff --git a/coreutils/shuf.c b/coreutils/shuf.c index fc9635147..77f8a8ff9 100644 --- a/coreutils/shuf.c +++ b/coreutils/shuf.c @@ -90,7 +90,7 @@ int shuf_main(int argc, char **argv) if (opts & OPT_i) { /* create a range of numbers */ char *dash; - unsigned lo, hi; + uintptr_t lo, hi; if (argv[0]) bb_show_usage(); @@ -100,8 +100,17 @@ int shuf_main(int argc, char **argv) bb_error_msg_and_die("bad range '%s'", opt_i_str); } *dash = '\0'; - lo = xatou(opt_i_str); - hi = xatou(dash + 1); + if (sizeof(lo) == sizeof(int)) { + lo = xatou(opt_i_str); + hi = xatou(dash + 1); + } else + if (sizeof(lo) == sizeof(long)) { + lo = xatoul(opt_i_str); + hi = xatoul(dash + 1); + } else { + lo = xatoull(opt_i_str); + hi = xatoull(dash + 1); + } *dash = '-'; if (hi < lo) { bb_error_msg_and_die("bad range '%s'", opt_i_str); @@ -110,17 +119,21 @@ int shuf_main(int argc, char **argv) numlines = (hi+1) - lo; lines = xmalloc(numlines * sizeof(lines[0])); for (i = 0; i < numlines; i++) { - lines[i] = (char*)(uintptr_t)lo; + lines[i] = (char*)lo; lo++; } } else { /* default - read lines from stdin or the input file */ FILE *fp; + const char *fname = "-"; - if (argc > 1) - bb_show_usage(); + if (argv[0]) { + if (argv[1]) + bb_show_usage(); + fname = argv[0]; + } - fp = xfopen_stdin(argv[0] ? argv[0] : "-"); + fp = xfopen_stdin(fname); lines = NULL; numlines = 0; for (;;) { @@ -150,9 +163,14 @@ int shuf_main(int argc, char **argv) eol = '\0'; for (i = numlines - outlines; i < numlines; i++) { - if (opts & OPT_i) - printf("%u%c", (unsigned)(uintptr_t)lines[i], eol); - else + if (opts & OPT_i) { + if (sizeof(lines[0]) == sizeof(int)) + printf("%u%c", (unsigned)(uintptr_t)lines[i], eol); + else if (sizeof(lines[0]) == sizeof(long)) + printf("%lu%c", (unsigned long)(uintptr_t)lines[i], eol); + else + printf("%llu%c", (unsigned long long)(uintptr_t)lines[i], eol); + } else printf("%s%c", lines[i], eol); } -- cgit v1.2.3-55-g6feb