#!/usr/bin/perl
use strict;
use warnings;
use feature 'say';
use autodie;

use FindBin '$Bin';
use lib "$Bin/lib";

use Vnlog::Util qw(parse_options read_and_preparse_input ensure_all_legends_equivalent reconstruct_substituted_command get_key_index);




# Option specifications accepted by this tool. This comes from the struct option
# long_options in sort.c in GNU coreutils. Each entry lists the long name,
# optionally followed by |-separated aliases and an argument specifier. The
# list is handed to parse_options() and to reconstruct_substituted_command().
my @specs = ( # options with no args
              "ignore-leading-blanks|b",
              "debug",
              "dictionary-order|d",
              "ignore-case|f",
              "general-numeric-sort|g",
              "ignore-nonprinting|i",
              "merge|m",
              "month-sort|M",
              "numeric-sort|n",
              "human-numeric-sort|h",
              "version-sort|V",
              "random-sort|R",
              "reverse|r",
              "stable|s",
              "unique|u",
              "zero-terminated|z",

              # options with args
              "compress-program=s",
              "files0-from=s",
              "key|k=s@",
              "random-source=s",
              "sort=s",
              "output|o=s",
              "batch-size=s",
              "buffer-size|S=s",
              "field-separator|t=s",
              "temporary-directory|T=s",
              "parallel=s",

              # '--check' and '--check=...' are valid. '--check ...' is NOT
              # valid. parse_options handles that
              "check:s",
              "c|C",

              # --help deliberately has NO -h alias: -h is --human-numeric-sort,
              # exactly as in GNU sort and as documented in the usage message.
              # NOTE(review): assuming parse_options() passes these to
              # Getopt::Long, a second spec claiming 'h' would silently win and
              # make -h print the help instead of sorting
              "help");

# Options that appear in @specs but that this tool refuses to act on. The keys
# are the long option names; each value is the newline-terminated explanation
# that is shown to the user when the option is given.
my %options_unsupported;

$options_unsupported{'files0-from'} =
  "Support for this could be added but has not been (yet)\n";

$options_unsupported{'output'} = join '',
  "Can't support this. sort calls ftruncate(file), so if I write out the legend\n",
  "before calling sort, this is clobbered. Or if I use a pipe, the ftruncate() will\n",
  "fail and sort barfs\n";

$options_unsupported{'field-separator'} =
  "vnlog is built on assuming a particular field separator\n";

$options_unsupported{'zero-terminated'} =
  "vnlog is built on assuming a particular record separator\n";

# Usage message given to parse_options(). $0 is interpolated, so the message
# names the script as it was invoked.
my $usage = <<EOF;
  $0 [options] logfile logfile logfile ... < logfile

The most common options are (from the GNU sort manpage)

  -k, --key=KEYDEF
         sort via a key. This is the vnlog field we're sorting on

  -s, --stable
         stabilize sort by disabling last-resort comparison

  -u, --unique
         with -c, check for strict ordering;
         without -c, output only the first of an equal run

  -f, --ignore-case
         fold lower case to upper case characters

  -r, --reverse
         reverse the result of comparisons

These all choose the sort order:

  -d, --dictionary-order
         consider only blanks and alphanumeric characters

  -g, --general-numeric-sort
         compare according to general numerical value

  -h, --human-numeric-sort
         compare human readable numbers (e.g., 2K 1G)

  -M, --month-sort
         compare (unknown) < 'JAN' < ... < 'DEC'

  -n, --numeric-sort
         compare according to string numerical value

  -R, --random-sort
         shuffle, but group identical keys.  See shuf(1)

  -V, --version-sort
         natural sort of (version) numbers within text

  --sort=WORD
         sort according to WORD: general-numeric -g, human-numeric -h,
         month -M, numeric -n, random -R, version -V
EOF

# Parse the commandline against @specs. The two returned values are used below
# as the list of input files and as a hashref of the options that were given.
# NOTE(review): the meaning of the third argument (0) is defined by
# Vnlog::Util::parse_options; confirm there
my ($filenames, $options) = parse_options(\@ARGV, \@specs, 0, $usage);

# Refuse to run if any given option is listed in %options_unsupported, and tell
# the user why. Single-letter option names are reported with one dash,
# everything else with two.
foreach my $optname (keys %$options)
{
    my $reason = $options_unsupported{$optname}
      or next;

    my $dashes = (length($optname) == 1) ? '-' : '--';
    die("I don't support $dashes$optname: $reason");
}

# Force --ignore-leading-blanks on, whether or not the user asked for it.
# NOTE(review): presumably needed because vnlog fields may be separated by more
# than one whitespace character; confirm
$options->{'ignore-leading-blanks'} = 1;

# Read the inputs, check their legends against each other, rewrite the field
# names given to -k into what sort expects (using the first input only), and
# build the argument list for the sort command
my $inputs = read_and_preparse_input($filenames);
ensure_all_legends_equivalent($inputs);
substitute_field_keys($options, $inputs->[0]);
my $ARGV_new = reconstruct_substituted_command($inputs, $options, [], \@specs);

# The legend is written by this process; everything after it is written by
# sort. Enable autoflush on STDOUT so the legend is guaranteed to be out before
# exec() replaces this process: perl only promises to ATTEMPT a flush at exec()
$| = 1;
say '# ' . join(' ', @{$inputs->[0]{keys}});

# exec() returns only if it failed. Report that and exit non-zero, instead of
# silently falling off the end of the script having printed just the legend
exec('sort', @$ARGV_new)
  or die "Couldn't exec sort: $!";


# Rewrites every key definition in $options->{key} from the vnl-sort form
# "field[.OPTS]" into the form handed to sort: "N,N[OPTS]", where N is whatever
# get_key_index() returns for that field name.
#
# Arguments:
#   $options  hashref of parsed options. If $options->{key} is defined it must
#             be an arrayref of key definitions; it is modified in place
#   $inputs   passed untouched to get_key_index() to look up each field name
#             (the caller passes a single input here, not the whole list)
#
# Returns nothing. Does nothing if no key was given. Dies, naming the offending
# definition, if a key definition can't be parsed.
sub substitute_field_keys
{
    my ($options, $inputs) = @_;

    return unless defined $options->{key};

    # manpage of sort says that key definitions are given as
    # "F[.C][OPTS][,F[.C][OPTS]]". By default, sort works not just off the field
    # alone, but off the whole line, starting at that field, which is crazy. To
    # do the sane thing of just looking at the field you can do the after-,
    # field spec. I.e. to sort JUST on field 2 you'd say 'sort -k 2,2'. That's
    # weird, so vnl-sort always passes this syntax itself, and thus vnl-sort
    # doesn't support the , in -k .... vnl-sort also doesn't support the C.
    # I DO want to support OPTS, so the vnl-sort syntax is 'vnl-sort
    # -k field[.OPTS]' where OPTS are just like in sort: one or more of
    # [bdfgiMhnRrV]

    # $keydef aliases the array element, so assigning to it below rewrites
    # $options->{key} in place
    for my $keydef (@{$options->{key}})
    {
        # In list context a failed match returns an empty list, so the list
        # assignment evaluates to 0 and we die. The message reports $keydef
        # itself: $_ is not set anywhere in this loop
        my ($field, $opts) =
          $keydef =~ /^([^\.]+)(?:\.([bdfgiMhnRrV]+))?$/
          or die "Couldn't parse '$keydef' as a sort KEYDEF";

        my $index = get_key_index($inputs, $field);
        $opts //= '';

        $keydef = "$index,$index$opts";
    }

    return;
}

__END__

=head1 NAME

vnl-sort - sorts a vnlog file, preserving the legend

=head1 SYNOPSIS

 $ cat a.vnl
 # a b
 AA 11
 bb 12
 CC 13
 dd 14
 dd 123

 Sort lexically by a:
 $ <a.vnl vnl-sort -k a
 # a b
 AA 11
 CC 13
 bb 12
 dd 123
 dd 14

 Sort lexically by a, ignoring case:
 $ <a.vnl vnl-sort -k a --ignore-case
 # a b
 AA 11
 bb 12
 CC 13
 dd 123
 dd 14

 Sort lexically by a, then numerically by b:
 $ <a.vnl vnl-sort -k a -k b.n
 # a b
 AA 11
 CC 13
 bb 12
 dd 14
 dd 123

 Sort lexically by a, then numerically by b in reverse:
 $ <a.vnl vnl-sort -k a -k b.nr
 # a b
 AA 11
 CC 13
 bb 12
 dd 123
 dd 14


 Sort by month and then day:
 $ cat dat.vnl
 # month day
 March 5
 Jan 2
 Feb 1
 March 30
 Jan 21

 $ <dat.vnl vnl-sort -k month.M -k day.n
 # month day
 Jan 2
 Jan 21
 Feb 1
 March 5
 March 30


=head1 DESCRIPTION

  Usage: vnl-sort [options] logfile logfile logfile ... < logfile

This tool sorts given vnlog files in various ways. C<vnl-sort> is a
wrapper around the GNU coreutils C<sort> tool. Since this is a wrapper, most
commandline options and behaviors of the C<sort> tool are present; consult the
L<sort(1)> manpage for detail. The differences from GNU coreutils C<sort> are

=over

=item *

The input and output to this tool are vnlog files, complete with a legend

=item *

The columns are referenced by name, not index. So instead of saying

  sort -k1

to sort by the first column, you say

  vnl-sort -k time

to sort by column "time".

=item *

The fancy C<KEYDEF> spec from C<sort> is only partially supported. I only allow
us to sort by full I<fields>, so the start/stop positions don't make sense. I
I<do> support the C<OPTS> to change the type of sorting in a given particular
column. For instance, to sort by month and then by day, do this (see example
above):

  vnl-sort -k month.M -k day.n

=item *

C<--files0-from> is not supported due to lack of time. If somebody really needs
it, talk to me.

=item *

C<--output> is not supported due to an uninteresting technical limitation. The
output always goes to standard out.

=item *

C<--field-separator> is not supported because vnlog assumes
whitespace-separated fields

=item *

C<--zero-terminated> is not supported because vnlog assumes
newline-separated records

=back

Past that, everything C<sort> does is supported, so see that man page for
detailed documentation. Note that all non-legend comments are stripped out,
since it's not obvious where they should end up.

=head1 BUGS

This and the other C<vnl-xxx> tools that wrap coreutils are written specifically
to work with the Linux kernel and the GNU coreutils. None of these have been
tested with BSD tools or with non-Linux kernels, and I'm sure things don't just
work. It's probably not too effortful to get that running, but somebody needs to
at least bug me for that. Or better yet, send me nice patches :)

=head1 SEE ALSO

L<sort(1)>

=head1 REPOSITORY

https://github.com/dkogan/vnlog/

=head1 AUTHOR

Dima Kogan C<< <dima@secretsauce.net> >>

=head1 LICENSE AND COPYRIGHT

Copyright 2018 Dima Kogan C<< <dima@secretsauce.net> >>

This library is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 2.1 of the License, or (at your option) any
later version.

=cut
