diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2018-04-04 17:02:32 +0200 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2018-04-04 17:03:41 +0200 |
commit | c29c2e60d8677cdec142a88609a3d040e4719439 (patch) | |
tree | 9d23487088699fb4f1b4aabd9361c4a7c7d21edf /coreutils | |
parent | ee1fd1246e7d25990531dbbb2d267605b8914d5d (diff) | |
download | busybox-w32-c29c2e60d8677cdec142a88609a3d040e4719439.tar.gz busybox-w32-c29c2e60d8677cdec142a88609a3d040e4719439.tar.bz2 busybox-w32-c29c2e60d8677cdec142a88609a3d040e4719439.zip |
sort: FEATURE_SORT_OPTIMIZE_MEMORY
On sorting all kernel/linux/arch/ *.[ch] files,
this reduces memory usage by 6%.
yes | head -99999999 | sort
goes down from 1900Mb to 380 Mb.
function old new delta
sort_main 862 1098 +236
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'coreutils')
-rw-r--r-- | coreutils/sort.c | 86 |
1 files changed, 80 insertions, 6 deletions
diff --git a/coreutils/sort.c b/coreutils/sort.c index b39297a26..b8bb6871d 100644 --- a/coreutils/sort.c +++ b/coreutils/sort.c | |||
@@ -18,16 +18,24 @@ | |||
18 | //config: sort is used to sort lines of text in specified files. | 18 | //config: sort is used to sort lines of text in specified files. |
19 | //config: | 19 | //config: |
20 | //config:config FEATURE_SORT_BIG | 20 | //config:config FEATURE_SORT_BIG |
21 | //config: bool "Full SuSv3 compliant sort (support -ktcsbdfiozgM)" | 21 | //config: bool "Full SuSv3 compliant sort (support -ktcbdfiozgM)" |
22 | //config: default y | 22 | //config: default y |
23 | //config: depends on SORT | 23 | //config: depends on SORT |
24 | //config: help | 24 | //config: help |
25 | //config: Without this, sort only supports -r, -u, and an integer version | 25 | //config: Without this, sort only supports -r, -u, -s, and an integer version |
26 | //config: of -n. Selecting this adds sort keys, floating point support, and | 26 | //config: of -n. Selecting this adds sort keys, floating point support, and |
27 | //config: more. This adds a little over 3k to a nonstatic build on x86. | 27 | //config: more. This adds a little over 3k to a nonstatic build on x86. |
28 | //config: | 28 | //config: |
29 | //config: The SuSv3 sort standard is available at: | 29 | //config: The SuSv3 sort standard is available at: |
30 | //config: http://www.opengroup.org/onlinepubs/007904975/utilities/sort.html | 30 | //config: http://www.opengroup.org/onlinepubs/007904975/utilities/sort.html |
31 | //config: | ||
32 | //config:config FEATURE_SORT_OPTIMIZE_MEMORY | ||
33 | //config: bool "Use less memory (but might be slower)" | ||
34 | //config: default n # defaults to N since we are size-paranoid tribe | ||
35 | //config: depends on SORT | ||
36 | //config: help | ||
37 | //config: Attempt to use less memory (by storing only one copy | ||
38 | //config: of duplicated lines, and such). Useful if you work on huge files. | ||
31 | 39 | ||
32 | //applet:IF_SORT(APPLET_NOEXEC(sort, sort, BB_DIR_USR_BIN, BB_SUID_DROP, sort)) | 40 | //applet:IF_SORT(APPLET_NOEXEC(sort, sort, BB_DIR_USR_BIN, BB_SUID_DROP, sort)) |
33 | 41 | ||
@@ -46,19 +54,17 @@ | |||
46 | //usage: "\n -f Ignore case" | 54 | //usage: "\n -f Ignore case" |
47 | //usage: "\n -i Ignore unprintable characters" | 55 | //usage: "\n -i Ignore unprintable characters" |
48 | //usage: "\n -d Dictionary order (blank or alphanumeric only)" | 56 | //usage: "\n -d Dictionary order (blank or alphanumeric only)" |
49 | //usage: "\n -g General numerical sort" | ||
50 | //usage: "\n -M Sort month" | ||
51 | //usage: ) | 57 | //usage: ) |
52 | //-h, --human-numeric-sort: compare human readable numbers (e.g., 2K 1G) | 58 | //-h, --human-numeric-sort: compare human readable numbers (e.g., 2K 1G) |
53 | //usage: "\n -n Sort numbers" | 59 | //usage: "\n -n Sort numbers" |
54 | //usage: IF_FEATURE_SORT_BIG( | 60 | //usage: IF_FEATURE_SORT_BIG( |
61 | //usage: "\n -g General numerical sort" | ||
62 | //usage: "\n -M Sort month" | ||
55 | //usage: "\n -t CHAR Field separator" | 63 | //usage: "\n -t CHAR Field separator" |
56 | //usage: "\n -k N[,M] Sort by Nth field" | 64 | //usage: "\n -k N[,M] Sort by Nth field" |
57 | //usage: ) | 65 | //usage: ) |
58 | //usage: "\n -r Reverse sort order" | 66 | //usage: "\n -r Reverse sort order" |
59 | //usage: IF_FEATURE_SORT_BIG( | ||
60 | //usage: "\n -s Stable (don't sort ties alphabetically)" | 67 | //usage: "\n -s Stable (don't sort ties alphabetically)" |
61 | //usage: ) | ||
62 | //usage: "\n -u Suppress duplicate lines" | 68 | //usage: "\n -u Suppress duplicate lines" |
63 | //usage: IF_FEATURE_SORT_BIG( | 69 | //usage: IF_FEATURE_SORT_BIG( |
64 | //usage: "\n -z Lines are terminated by NUL, not newline" | 70 | //usage: "\n -z Lines are terminated by NUL, not newline" |
@@ -404,6 +410,14 @@ int sort_main(int argc UNUSED_PARAM, char **argv) | |||
404 | int i; | 410 | int i; |
405 | int linecount; | 411 | int linecount; |
406 | unsigned opts; | 412 | unsigned opts; |
413 | #if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY | ||
414 | bool can_drop_dups; | ||
415 | size_t prev_len = 0; | ||
416 | /* Postpone optimizing if the input is small, < 16k lines: | ||
417 | * even just free()ing duplicate lines takes time. | ||
418 | */ | ||
419 | size_t count_to_optimize_dups = 0x3fff; | ||
420 | #endif | ||
407 | 421 | ||
408 | xfunc_error_retval = 2; | 422 | xfunc_error_retval = 2; |
409 | 423 | ||
@@ -412,6 +426,29 @@ int sort_main(int argc UNUSED_PARAM, char **argv) | |||
412 | sort_opt_str, | 426 | sort_opt_str, |
413 | &str_ignored, &str_ignored, &str_o, &lst_k, &str_t | 427 | &str_ignored, &str_ignored, &str_o, &lst_k, &str_t |
414 | ); | 428 | ); |
429 | #if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY | ||
430 | /* Can drop dups only if -u but no "complicating" options, | ||
431 | * IOW: if we do a full line compares. Safe options: | ||
432 | * -o FILE Output to FILE | ||
433 | * -z Lines are terminated by NUL, not newline | ||
434 | * -r Reverse sort order | ||
435 | * -s Stable (don't sort ties alphabetically) | ||
436 | * Not sure about these: | ||
437 | * -b Ignore leading blanks | ||
438 | * -f Ignore case | ||
439 | * -i Ignore unprintable characters | ||
440 | * -d Dictionary order (blank or alphanumeric only) | ||
441 | * -n Sort numbers | ||
442 | * -g General numerical sort | ||
443 | * -M Sort month | ||
444 | */ | ||
445 | can_drop_dups = ((opts & ~(FLAG_o|FLAG_z|FLAG_r|FLAG_s)) == FLAG_u); | ||
446 | /* Stable sort needs every line to be uniquely allocated, | ||
447 | * disable optimization to reuse strings: | ||
448 | */ | ||
449 | if (opts & FLAG_s) | ||
450 | count_to_optimize_dups = (size_t)-1L; | ||
451 | #endif | ||
415 | /* global b strips leading and trailing spaces */ | 452 | /* global b strips leading and trailing spaces */ |
416 | if (opts & FLAG_b) | 453 | if (opts & FLAG_b) |
417 | option_mask32 |= FLAG_bb; | 454 | option_mask32 |= FLAG_bb; |
@@ -489,6 +526,41 @@ int sort_main(int argc UNUSED_PARAM, char **argv) | |||
489 | char *line = GET_LINE(fp); | 526 | char *line = GET_LINE(fp); |
490 | if (!line) | 527 | if (!line) |
491 | break; | 528 | break; |
529 | |||
530 | #if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY | ||
531 | if (count_to_optimize_dups != 0) | ||
532 | count_to_optimize_dups--; | ||
533 | if (count_to_optimize_dups == 0) { | ||
534 | size_t len; | ||
535 | char *new_line; | ||
536 | char first = *line; | ||
537 | |||
538 | /* On kernel/linux/arch/ *.[ch] files, | ||
539 | * this reduces memory usage by 6%. | ||
540 | * yes | head -99999999 | sort | ||
541 | * goes down from 1900Mb to 380 Mb. | ||
542 | */ | ||
543 | if (first == '\0' || first == '\n') { | ||
544 | len = !(first == '\0'); | ||
545 | new_line = (char*)"\n" + 1 - len; | ||
546 | goto replace; | ||
547 | } | ||
548 | len = strlen(line); | ||
549 | if (len <= prev_len) { | ||
550 | new_line = lines[linecount-1] + (prev_len - len); | ||
551 | if (strcmp(line, new_line) == 0) { | ||
552 | if (can_drop_dups && prev_len == len) { | ||
553 | free(line); | ||
554 | continue; | ||
555 | } | ||
556 | replace: | ||
557 | free(line); | ||
558 | line = new_line; | ||
559 | } | ||
560 | } | ||
561 | prev_len = len; | ||
562 | } | ||
563 | #endif | ||
492 | lines = xrealloc_vector(lines, 6, linecount); | 564 | lines = xrealloc_vector(lines, 6, linecount); |
493 | lines[linecount++] = line; | 565 | lines[linecount++] = line; |
494 | } | 566 | } |
@@ -510,6 +582,8 @@ int sort_main(int argc UNUSED_PARAM, char **argv) | |||
510 | } | 582 | } |
511 | return EXIT_SUCCESS; | 583 | return EXIT_SUCCESS; |
512 | } | 584 | } |
585 | #else | ||
586 | //TODO: lighter version which only drops total dups if can_drop_dups == true | ||
513 | #endif | 587 | #endif |
514 | 588 | ||
515 | /* For stable sort, store original line position beyond terminating NUL */ | 589 | /* For stable sort, store original line position beyond terminating NUL */ |