diff options
| author | Ron Yorston <rmy@pobox.com> | 2021-08-07 09:41:49 +0100 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2021-08-22 15:40:21 +0200 |
| commit | 8817e285b7ce071a27c366a2a602d3ca162b08ba (patch) | |
| tree | f3f6e68561420ba8e2f13ea25f81257ed38452f1 | |
| parent | 74c4f356aee9c64978a881e5760055d0e3510a6a (diff) | |
| download | busybox-w32-8817e285b7ce071a27c366a2a602d3ca162b08ba.tar.gz busybox-w32-8817e285b7ce071a27c366a2a602d3ca162b08ba.tar.bz2 busybox-w32-8817e285b7ce071a27c366a2a602d3ca162b08ba.zip | |
shuf: speed-up when limited output is requested
A user noted that the following command was slower than they
expected:
busybox shuf -i "1500000000-$(date +%s)" -n 5
At the time of writing the range contains 128 million values. On my
system this takes 7.7s, whereas 'shuf' from coreutils takes a
handful of milliseconds.
Optimise BusyBox 'shuf' for cases where -n is specified by stopping
shuffling once the required number of lines has been processed.
On my system the time for the example is reduced to 0.4s.
function old new delta
shuf_main 520 540 +20
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 1/0 up/down: 20/0) Total: 20 bytes
v2: Code shrink. Since outlines <= numlines:
- the loop in shuffle_lines() only needs to test the value of
outlines;
- shuffle_lines() can be called unconditionally.
Update timing to allow for the 13 million seconds elapsed since v1.
Signed-off-by: Ron Yorston <rmy@pobox.com>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | coreutils/shuf.c | 29 |
1 files changed, 15 insertions, 14 deletions
diff --git a/coreutils/shuf.c b/coreutils/shuf.c
index fdbd3e9b2..3b2ba93cf 100644
--- a/coreutils/shuf.c
+++ b/coreutils/shuf.c
| @@ -24,7 +24,7 @@ | |||
| 24 | //usage: "\n -i L-H Treat numbers L-H as lines" | 24 | //usage: "\n -i L-H Treat numbers L-H as lines" |
| 25 | //usage: "\n -n NUM Output at most NUM lines" | 25 | //usage: "\n -n NUM Output at most NUM lines" |
| 26 | //usage: "\n -o FILE Write to FILE, not standard output" | 26 | //usage: "\n -o FILE Write to FILE, not standard output" |
| 27 | //usage: "\n -z End lines with zero byte, not newline" | 27 | //usage: "\n -z NUL terminated output" |
| 28 | 28 | ||
| 29 | #include "libbb.h" | 29 | #include "libbb.h" |
| 30 | 30 | ||
| @@ -39,8 +39,10 @@ | |||
| 39 | 39 | ||
| 40 | /* | 40 | /* |
| 41 | * Use the Fisher-Yates shuffle algorithm on an array of lines. | 41 | * Use the Fisher-Yates shuffle algorithm on an array of lines. |
| 42 | * If the required number of output lines is less than the total | ||
| 43 | * we can stop shuffling early. | ||
| 42 | */ | 44 | */ |
| 43 | static void shuffle_lines(char **lines, unsigned numlines) | 45 | static void shuffle_lines(char **lines, unsigned numlines, unsigned outlines) |
| 44 | { | 46 | { |
| 45 | unsigned i; | 47 | unsigned i; |
| 46 | unsigned r; | 48 | unsigned r; |
| @@ -48,7 +50,7 @@ static void shuffle_lines(char **lines, unsigned numlines) | |||
| 48 | 50 | ||
| 49 | srand(monotonic_us()); | 51 | srand(monotonic_us()); |
| 50 | 52 | ||
| 51 | for (i = numlines-1; i > 0; i--) { | 53 | for (i = numlines-1; outlines > 0; i--, outlines--) { |
| 52 | r = rand(); | 54 | r = rand(); |
| 53 | /* RAND_MAX can be as small as 32767 */ | 55 | /* RAND_MAX can be as small as 32767 */ |
| 54 | if (i > RAND_MAX) | 56 | if (i > RAND_MAX) |
| @@ -67,7 +69,7 @@ int shuf_main(int argc, char **argv) | |||
| 67 | char *opt_i_str, *opt_n_str, *opt_o_str; | 69 | char *opt_i_str, *opt_n_str, *opt_o_str; |
| 68 | unsigned i; | 70 | unsigned i; |
| 69 | char **lines; | 71 | char **lines; |
| 70 | unsigned numlines; | 72 | unsigned numlines, outlines; |
| 71 | char eol; | 73 | char eol; |
| 72 | 74 | ||
| 73 | opts = getopt32(argv, "^" | 75 | opts = getopt32(argv, "^" |
| @@ -128,24 +130,23 @@ int shuf_main(int argc, char **argv) | |||
| 128 | fclose_if_not_stdin(fp); | 130 | fclose_if_not_stdin(fp); |
| 129 | } | 131 | } |
| 130 | 132 | ||
| 131 | if (numlines != 0) | 133 | outlines = numlines; |
| 132 | shuffle_lines(lines, numlines); | 134 | if (opts & OPT_n) { |
| 135 | outlines = xatou(opt_n_str); | ||
| 136 | if (outlines > numlines) | ||
| 137 | outlines = numlines; | ||
| 138 | } | ||
| 139 | |||
| 140 | shuffle_lines(lines, numlines, outlines); | ||
| 133 | 141 | ||
| 134 | if (opts & OPT_o) | 142 | if (opts & OPT_o) |
| 135 | xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO); | 143 | xmove_fd(xopen(opt_o_str, O_WRONLY|O_CREAT|O_TRUNC), STDOUT_FILENO); |
| 136 | 144 | ||
| 137 | if (opts & OPT_n) { | ||
| 138 | unsigned maxlines; | ||
| 139 | maxlines = xatou(opt_n_str); | ||
| 140 | if (numlines > maxlines) | ||
| 141 | numlines = maxlines; | ||
| 142 | } | ||
| 143 | |||
| 144 | eol = '\n'; | 145 | eol = '\n'; |
| 145 | if (opts & OPT_z) | 146 | if (opts & OPT_z) |
| 146 | eol = '\0'; | 147 | eol = '\0'; |
| 147 | 148 | ||
| 148 | for (i = 0; i < numlines; i++) { | 149 | for (i = numlines - outlines; i < numlines; i++) { |
| 149 | if (opts & OPT_i) | 150 | if (opts & OPT_i) |
| 150 | printf("%u%c", (unsigned)(uintptr_t)lines[i], eol); | 151 | printf("%u%c", (unsigned)(uintptr_t)lines[i], eol); |
| 151 | else | 152 | else |
