#!/usr/bin/env perl
# /=====================================================================\ #
# |  latexml                                                            | #
# | main conversion program                                             | #
# |=====================================================================| #
# | Part of LaTeXML:                                                    | #
# |  Public domain software, produced as part of work done by the       | #
# |  United States Government & not subject to copyright in the US.     | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov>                        #_#     | #
# | http://dlmf.nist.gov/LaTeXML/                              (o o)    | #
# \=========================================================ooo==U==ooo=/ #
use strict;
use warnings;
use FindBin;
use lib "$FindBin::RealBin/../lib";
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
use LaTeXML::Core;
use LaTeXML;    # Currently, just for version information.
use LaTeXML::Util::Pathname;
use LaTeXML::Common::Error;
use Encode;

#**********************************************************************
# Parse command line
# Option state, filled in by GetOptions below.  These are file-scoped lexicals
# that are also read (and in some cases adjusted) by validate_args() further down.
# Defaults: verbosity 0, not strict, comments kept, math parsed, raw styles not loaded.
my ($verbosity, $strict, $comments, $noparse, $includestyles) = (0, 0, 1, 0, 0);
# Output format defaults to 'xml' and the destination to '' (stdout);
# $help and $showversion stay undefined (false) unless requested.
my ($format, $destination, $help, $showversion)               = ('xml', '');
my ($preamble, $postamble, $logfile)                          = (undef, undef, undef);
my ($documentid);
my $inputencoding;
# Processing mode ('TeX' or 'BibTeX'); set by --bibtex, otherwise decided in validate_args().
my $mode  = undef;
# Extra search directories (--path) and modules to preload (--preload); both may be repeated.
my @paths = ();
my (@preload);
# Option names are case sensitive (Getopt::Long is configured with no_ignore_case above),
# which keeps --VERSION distinct from --verbose.
GetOptions("destination=s" => \$destination,
  # --output is an obsolete synonym for --destination.
  "output=s"        => \$destination,
  "preload=s"       => \@preload,
  "path=s"          => \@paths,
  "preamble=s"      => \$preamble,
  "postamble=s"     => \$postamble,
  # --quiet and --verbose are cumulative: each occurrence moves the verbosity one step.
  "quiet"           => sub { $verbosity--; },
  "verbose"         => sub { $verbosity++; },
  "strict"          => \$strict,
  "log=s"           => \$logfile,
  # Output format selectors; the last one given wins.
  "xml"             => sub { $format = 'xml'; },
  "tex"             => sub { $format = 'tex'; },
  "box"             => sub { $format = 'box'; },
  "bibtex"          => sub { $mode   = 'BibTeX'; },
  "noparse"         => \$noparse,
  "includestyles"   => \$includestyles,
  "inputencoding=s" => \$inputencoding,
  # Negatable: --comments / --nocomments.
  "comments!"       => \$comments,
  "VERSION"         => \$showversion,
  # The handler receives (option name, value); the lower-cased value becomes a key in %LaTeXML::DEBUG.
  "debug=s"         => sub { no strict 'refs'; $LaTeXML::DEBUG{ lc($_[1]) } = 1; },
  "documentid=s"    => \$documentid,
  "help"            => \$help,
  # Unparsable options: print the brief usage on STDERR and exit with status 1.
) or pod2usage(-message => $LaTeXML::IDENTITY,
  -exitval => 1, -verbose => 0, -output => \*STDERR);
# --help: print the full manual (verbose level 2) on STDOUT and exit successfully.
pod2usage(-message => $LaTeXML::IDENTITY, -exitval => 0, -verbose => 2, -output => \*STDOUT)
  if $help;
# --VERSION: print the identity string (on STDERR) and exit successfully.
if ($showversion) { print STDERR "$LaTeXML::IDENTITY\n"; exit(0); }
# The first remaining argument names the source; validate_args() rejects any leftovers.
my $source = shift(@ARGV);

#======================================================================
# Validate command line before any actual processing
# Set the requested verbosity and switch on STDERR reporting, then vet the
# arguments; any complaint aborts the run before conversion starts.
SetVerbosity($verbosity);
UseSTDERR();

{ my @problems = validate_args();
  if (@problems) {
    # Each complaint is highlighted as an error, framed by the identity line and a hint.
    my @report = map { colorizeString($_, 'error'); } @problems;
    print STDERR join("\n",
      $LaTeXML::IDENTITY, @report,
      "use option --help for details", "Conversion failed", '');
    exit(1); } }

#======================================================================
my $starttime = StartTime();
my ($latexml, $digested, $serialized);
eval {    # Trap fatal errors so that the status can still be reported afterwards
  UseLog($logfile);
  Note("$LaTeXML::IDENTITY processing $source");

  # Build the converter from the command-line settings.
  $latexml = LaTeXML::Core->new(
    preload         => [@preload],
    searchpaths     => [@paths],
    graphicspaths   => ['.'],
    verbosity       => $verbosity,
    strict          => $strict,
    includecomments => $comments,
    inputencoding   => $inputencoding,
    includestyles   => $includestyles,
    documentid      => $documentid,
    nomathparse     => $noparse);

  # Everything below runs relative to the converter's State, to capture all errors.
  my $pipeline = sub {
    # ========================================
    # Stage 1: read and digest whatever we were given.
    $digested = $latexml->digestFile($source,
      mode      => $mode,
      preamble  => $preamble,
      postamble => $postamble);

    # ========================================
    # Stage 2: render the digested material in the requested format:
    # TeX, box dump (more detailed when verbose), or the XML document (the default).
    if ($digested) {
      $serialized
        = ($format eq 'tex') ? LaTeXML::Core::Token::UnTeX($digested)
        : ($format eq 'box') ? ($verbosity > 0 ? $digested->stringify : $digested->toString)
        :                      $latexml->convertDocument($digested)->toString(1);
      # NOTE that serialization goes via LaTeXML's Document::serialize_aux, whose result
      # has NOT been encoded into bytes, so an explicit encode is needed before printing.
      if ($serialized) {
        $serialized = Encode::encode('UTF-8', $serialized); } }
    $latexml->showProfile();    # Show profile (if any)
  };
  $latexml->withState($pipeline); };
#======================================================================
# Capture the outcome of the eval right away: $@ is a global that any later
# eval (including one hidden inside a library call) may overwrite.
my $exit_message = $@;
# Only pass on a message that actually exists and differs from the standard DIE_MESSAGE.
# NOTE(review): presumably a die carrying DIE_MESSAGE has already been reported
# by LaTeXML's own error handling -- confirm against LaTeXML::Common::Error.
Debug($exit_message)
  if $exit_message && ($exit_message ne $LaTeXML::Common::Error::DIE_MESSAGE);

# When the eval died before the converter was constructed there is no object to
# ask; fall back to the code this script treats as failure (3) and a fixed
# message, rather than warning about undefined values and reporting "complete".
my ($code, $status) = (3, 'converter could not be initialized');
if ($latexml) {
  $code   = $latexml->getStatusCode;
  $status = $latexml->getStatusMessage; }
my $runtime = RunTime($starttime);
Note("Conversion " . ($code == 3 ? 'failed' : 'complete') . ": " . $status . " (reqd. $runtime)");
UseLog(undef);

# Should this be within the withState ?
# Hopefully, no errors by now... but if so...
# Does having the status line AFTER the result help or hurt????
# $serialized already holds UTF-8 encoded bytes, so it is written as-is.
if    (!$serialized) { }    # Nothing was produced; nothing to write.
elsif ($destination) {
  open(my $OUTPUT, ">", $destination) or die "Couldn't open output file $destination: $!";
  print $OUTPUT $serialized;
  # Buffered write errors (e.g. a full disk) only surface when the handle is closed.
  close($OUTPUT) or die "Couldn't write output file $destination: $!"; }
else {
  print $serialized; }

# ========================================
# Now, unbind stuff, so we can clear memory
$latexml    = undef;
$digested   = undef;
$serialized = undef;

CheckDebuggable();
UseSTDERR(undef);

if ($exit_message) {    # non-zero exit code for fatal conversions
  exit(1);
}

#======================================================================
# Helper code.

# Validate the command-line arguments as far as possible BEFORE invoking any
# LaTeXML components (and WITHOUT invoking any LaTeXML Error handling).
#
# Works on the file-scoped option variables, and updates some of them:
#   @paths       - each entry canonicalized, command-line order preserved;
#   $source      - replaced by "literal:<text>" when the source is read from stdin;
#   $mode        - defaulted to 'BibTeX' for a *.bib file, otherwise to 'TeX'
#                  (left untouched for literal data given directly on the command line);
#   $logfile     - defaulted to <jobname>.latexml.log in the current directory,
#                  or to latexml.log when there is no source file name;
#   $destination - canonicalized, and its directory created if necessary.
# Returns the list of problem descriptions; an empty list means all is well.
sub validate_args {
  my @fails = ();
  push(@fails, "Unrecognized trailing arguments " . join(',', @ARGV)) if @ARGV;

  # Check search paths.  They are kept in the order given on the command line,
  # which is the order in which they are handed to pathname_find below and to
  # the converter, and the order in which bad directories are reported.
  @paths = map { pathname_canonical($_) } @paths;
  if (my @baddirs = grep { !-d $_ } @paths) {
    push(@fails, "These search path do not exist: " . join(', ', @baddirs)); }

  # Find the requested source
  my $pathname;
  # Test for a missing or empty name explicitly, so that a source called "0" is accepted.
  if (!defined($source) || ($source eq '')) {
    push(@fails, "No input file given"); }
  elsif ($source eq '-') {
    # Slurp all of standard input.  STDIN is read explicitly: the magic <>
    # would try to open any stray trailing arguments as files instead.
    { local $/ = undef;
      $source = "literal:" . <STDIN>;
      $mode   = 'TeX' unless defined $mode; } }    # Or sniff for bibtex data?
  elsif (pathname_is_literaldata($source)) {
    # Literal data given directly on the command line: nothing to look up.
  }
  elsif (!($pathname = pathname_find($source, types => ['tex', 'bib', ''], paths => ['.', @paths]))
    || !-r $pathname) {
    push(@fails, "Input file '$source' not readable"); }
  else {
    # An explicit --bibtex wins; otherwise the file extension decides.
    $mode = 'BibTeX' if !defined $mode && ($pathname =~ /\.bib$/);
    $mode = 'TeX' unless defined $mode; }

  # Always create a .log file for tracking the conversion messages
  # Use $jobname.latexml.log or just latexml.log for logging by default
  if ($pathname && !$logfile) {
    my ($dir, $name, $ext) = pathname_split($pathname);
    if ($name) {
      # The log goes into the current directory, not next to the source file.
      $logfile = pathname_make(dir => pathname_cwd(), name => $name, type => 'latexml.log'); } }
  $logfile = 'latexml.log' unless $logfile;

  # Check that destination is valid before wasting any time...
  if ($destination) {
    $destination = pathname_canonical($destination);
    if (my $dir = pathname_directory($destination)) {
      # pathname_mkdir is expected to create the directory as needed; it must also be writable.
      if (!(pathname_mkdir($dir) && -w $dir)) {
        push(@fails, "Couldn't create writable destination directory $dir: $!"); } } }
  return @fails; }

#**********************************************************************

__END__

=head1 NAME

C<latexml> - transforms a TeX/LaTeX file into XML.

=head1 SYNOPSIS

latexml [options] I<texfile>

 Options:
 --destination=file sets destination file (default stdout).
 --output=file      [obsolete synonym for --destination]
 --preload=module   requests loading of an optional module;
                    can be repeated
 --preamble=file    sets a preamble file which will
                    effectively be prepended to the main file.
 --postamble=file   sets a postamble file which will
                    effectively be appended to the main file.
 --includestyles    allows latexml to load raw *.sty file;
                    by default it avoids this.
 --path=dir         adds to the paths searched for files,
                    modules, etc;
 --log=file         specifies log file (default is
                    jobname.latexml.log, or latexml.log)
 --documentid=id    assign an id to the document root.
 --quiet            suppress messages (can repeat)
 --verbose          more informative output (can repeat)
 --strict           makes latexml less forgiving of errors
 --bibtex           processes as a BibTeX bibliography.
 --xml              requests xml output (default).
 --tex              requests TeX output after expansion.
 --box              requests box output after expansion
                    and digestion.
 --noparse          suppresses parsing math
 --nocomments       omit comments from the output
 --inputencoding=enc specify the input encoding.
 --VERSION          show version number.
 --debug=package    enables debugging output for the named
                    package
 --help             shows this help message.

If I<texfile> is '-', latexml reads the TeX source from standard input.
If I<texfile> has an explicit extension of C<.bib>, it is processed
as a BibTeX bibliography.

=head1 OPTIONS AND ARGUMENTS

=over 4

=item C<--destination>=I<file>

Specifies the destination file; by default the XML is written to stdout.

=item C<--preload>=I<module>

Requests the loading of an optional module or package.  This may be useful if the TeX code
does not specifically require the module (eg. through input or usepackage).
For example, use C<--preload=LaTeX.pool> to force LaTeX mode.

=item C<--preamble>=I<file>, C<--postamble>=I<file>

Specifies a file whose contents will effectively be prepended or appended
to the main document file's content. This can be useful when processing
TeX fragments, in which case the preamble would contain documentclass and begindocument
control sequences.  This option is not used when processing BibTeX files.

=item C<--includestyles>

This option allows processing of style files (files with extensions C<sty>,
C<cls>, C<clo>, C<cnf>).  By default, these files are ignored  unless a latexml
implementation of them is found (with an extension of C<ltxml>).

These style files generally fall into two classes:  Those
that merely affect document style are ignorable in the XML.
Others define new markup and document structure, often using
deeper LaTeX macros to achieve their ends.  Although the omission
will lead to other errors (missing macro definitions), it is
unlikely that processing the TeX code in the style file will
lead to a correct document.

=item C<--path>=I<dir>

Add I<dir> to the search paths used when searching for files, modules, style files, etc;
somewhat like TEXINPUTS.  This option can be repeated.

=item C<--documentid>=I<id>

Assigns an ID to the root element of the XML document.  This ID is generally
inherited as the prefix of ID's on all other elements within the document.
This is useful when constructing a site of multiple documents so that
all nodes have unique IDs.

=item C<--quiet>

Reduces the verbosity of output during processing, used twice is pretty silent.

=item C<--verbose>

Increases the verbosity of output during processing, used twice is pretty chatty.
Can be useful for getting more details when errors occur.

=item C<--strict>

Specifies a strict processing mode. By default, undefined control sequences and
invalid document constructs (that violate the DTD) give warning messages, but attempt
to continue processing.  Using --strict makes them generate fatal errors.

=item C<--bibtex>

Forces latexml to treat the file as a BibTeX bibliography.
Note that the timing is slightly different than the usual
case with BibTeX and LaTeX.  In the latter case, BibTeX simply
selects and formats a subset of the bibliographic entries; the
actual TeX expansion is carried out when the result is included
in a LaTeX document.  In contrast, latexml processes and expands
the entire bibliography; the selection of entries is done
during postprocessing.  This also means that any packages
that define macros used in the bibliography must be
specified using the C<--preload> option.

=item C<--xml>

Requests XML output; this is the default.

=item C<--tex>

Requests TeX output for debugging purposes;  processing is only carried out through expansion and digestion.
This may not be quite valid TeX, since Unicode may be introduced.

=item C<--box>

Requests Box output for debugging purposes;  processing is carried out through expansion and digestion,
and the result is printed.

=item C<--nocomments>

Normally latexml preserves comments from the source file, and adds a comment every 25 lines as
an aid in tracking the source.  The option --nocomments discards such comments.

=item C<--inputencoding=>I<encoding>

Specify the input encoding, eg. C<--inputencoding=iso-8859-1>.
The encoding must be one known to Perl's Encode package.
Note that this only enables the translation of the input bytes to
UTF-8 used internally by LaTeXML, but does not affect catcodes.
It is usually better to use LaTeX's inputenc package.
Note that this does not affect the output encoding, which is
always UTF-8.

=item C<--VERSION>

Shows the version number of the LaTeXML package.

=item C<--debug>=I<package>

Enables debugging output for the named package. The package is given without the leading LaTeXML::.

=item C<--help>

Shows this help message.

=back

=head1 SEE ALSO

L<latexmlpost>, L<latexmlmath>, L<LaTeXML>

=cut
