File: Split.pm

package info (click to toggle)
latexml 0.8.7-2
links: PTS, VCS
area: main
in suites: bookworm
size: 29,128 kB
sloc: xml: 98,982; perl: 29,706; sh: 179; javascript: 28; makefile: 15
file content (195 lines) | stat: -rw-r--r-- 8,643 bytes
parent folder | download | duplicates (3)
# /=====================================================================\ #
# |  LaTeXML::Post::Split                                               | #
# | Split documents into pages                                          | #
# |=====================================================================| #
# | Part of LaTeXML:                                                    | #
# |  Public domain software, produced as part of work done by the       | #
# |  United States Government & not subject to copyright in the US.     | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov>                        #_#     | #
# | http://dlmf.nist.gov/LaTeXML/                              (o o)    | #
# \=========================================================ooo==U==ooo=/ #

package LaTeXML::Post::Split;
use strict;
use warnings;
use LaTeXML::Util::Pathname;
use LaTeXML::Common::XML;
use LaTeXML::Post;
use base qw(LaTeXML::Post::Processor);

sub new {
  my ($class, %options) = @_;
  my $self = $class->SUPER::new(%options);
  $$self{split_xpath}   = $options{split_xpath};
  $$self{splitnaming}   = $options{splitnaming};
  $$self{no_navigation} = $options{no_navigation};
  return $self; }

# Could this actually just return the nodes that are to become pages?
# sub toProcess { ??? }

sub process {
  my ($self, $doc, $root) = @_;
  # RISKY and annoying; to split, we really need an id on the root.
  # Writer will remove it.
  $root->setAttribute('xml:id' => 'TEMPORARY_DOCUMENT_ID') unless $root->hasAttribute('xml:id');

  my @docs  = ($doc);
  my @pages = $self->getPages($doc);
  # Weird test: exclude the "whole document" from the list (?)
  @pages = grep { $_->parentNode->parentNode } @pages;    # Strip out the root node.
  if (@pages) {
    my @nav = $doc->findnodes("descendant::ltx:navigation");
    $doc->removeNodes(@nav) if @nav;
    my $tree = { node => $root, document => $doc,
      id       => $root->getAttribute('xml:id'), name => $doc->getDestination,
      children => [] };
    # Group the pages into a tree, in case they are nested.
    my $haschildren = {};
    presortPages($tree, $haschildren, @pages);
    # Work out the destination paths for each page
    $self->prenamePages($doc, $tree, $haschildren);
    # Now, create remove and create documents for each page.
    push(@docs, $self->processPages($doc, @{ $$tree{children} }));

    $self->addNavigation($tree, @nav) if @nav;
  }
  my $n = scalar(@docs);
  NoteLog(($n > 1 ? " [Split into in $n TOCs]" : "[not split]"));
  return @docs; }

# Get the nodes in the document that WILL BECOME separate "pages".
# (they are not yet removed from the main document)
# Subclass can override, if needed.
sub getPages {
  my ($self, $doc) = @_;
  return $doc->findnodes($$self{split_xpath}); }

# Sort the pages into a tree, in case some pages are children of others
# If a page contains NOTHING BUT child pages (except frontmatter),
# we could just merge that page as a level in it's containing TOC instead of a document.???
sub presortPages {
  my ($tree, $haschildren, @pages) = @_;
  my $nextlevel;    # if $page is a descendant of some othe page
  foreach my $page (@pages) {
    # $page should be a child of the current node, or a child of some higher one.
    while ($$tree{parent} && !isChild($page, $$tree{node})) {
      $tree = $$tree{parent}; }
    my $entry = { node => $page, upid => $$tree{id}, id => $page->getAttribute('xml:id'),
      parent => $tree, children => [] };
    $$haschildren{ $$tree{node}->localname } = 1;    # Wrong key for this!?!
    push(@{ $$tree{children} }, $entry);
    $tree = $entry; }    # go "down", in case following are children of current node.
  return; }

# Get destination pathnames for each page.
sub prenamePages {
  my ($self, $doc, $tree, $haschildren) = @_;
  foreach my $entry (@{ $$tree{children} }) {
    $$entry{name} = $self->getPageName($doc, $$entry{node}, $$tree{node}, $$tree{name},
      $$haschildren{ $$entry{node}->localname });
    $self->prenamePages($doc, $entry, $haschildren); }
  return; }

# Process a sequence of page entries, removing them and generating documents for each.
sub processPages {
  my ($self, $doc, @entries) = @_;
  my $rootid = $doc->getDocumentElement->getAttribute('xml:id');
  # Before any document surgery, copy inheritable attributes.
  my $intoc = 0;    # Whether ANY children appear in toc
  foreach my $entry (@entries) {
    my $node = $$entry{node};
    $intoc ||= ($node->getAttribute('inlist') || '') =~ /\btoc\b/;
    foreach my $attr (qw(xml:lang backgroundcolor)) {
      if (my $anc = $doc->findnode('ancestor-or-self::*[@' . $attr . '][1]', $node)) {
        $node->setAttribute($attr => $anc->getAttribute($attr)); } } }
  my @docs = ();
  while (@entries) {
    my $parent = $entries[0]->{node}->parentNode;
    # Remove $page & ALL following siblings (backwards).
    my @removed = ();
    while (my $sib = $parent->lastChild) {
      $parent->removeChild($sib);
      unshift(@removed, $sib);
      last if $sib->isSameNode($entries[0]->{node}); }
    # Build toc from adjacent nodes that are being extracted.
    my @toc = ();
    # Process a sequence of adjacent pages; these will go into the same TOC.
    while (@entries && @removed && $entries[0]->{node}->isSameNode($removed[0])) {
      my $entry = shift(@entries);
      my $page  = $$entry{node};
      # If any pages go in toc, Assume siblings on their own page should go also
      $page->setAttribute(inlist => 'toc') if $intoc && !$page->hasAttribute('inlist');
      $doc->removeNodes(shift(@removed));
      my $id       = $page->getAttribute('xml:id');
      my $tocentry = ['ltx:tocentry', {},
        ['ltx:ref', { idref => $id, show => 'toctitle' }]];
      push(@toc, $tocentry);
      # Due to the way document building works, we remove & process children pages
      # BEFORE processing this page.
      my @childdocs = $self->processPages($doc, @{ $$entry{children} });
      my $subdoc    = $doc->newDocument($page, destination => $$entry{name},
        parentDocument => $doc, parent_id => $$entry{upid});
      $$entry{document} = $subdoc;
      push(@docs, $subdoc, @childdocs); }
    # Finally, add the toc to reflect the consecutive, removed nodes, and add back the remainder
    my $type = $parent->localname;
    $doc->addNodes($parent, ['ltx:TOC', {}, ['ltx:toclist', { class => 'ltx_toclist_' . $type }, @toc]])
      if @toc && !$doc->findnodes("descendant::ltx:TOC[\@lists='toc']", $parent);
    map { $parent->addChild($_) } @removed; }
  return @docs; }

sub addNavigation {
  my ($self, $entry, @nav) = @_;
  my $doc = $$entry{document};
  $doc->addNodes($doc->getDocumentElement, @nav);    # cloning, as needed...
  foreach my $child (@{ $$entry{children} }) {
    my $childdoc = $$child{document};
    $self->addNavigation($child, @nav); }            # now, recurse
  return; }

# error situation: generate some kind of unique name for page
sub generateUnnamedPageName {
  my ($self) = @_;
  my $ctr = ++$$self{unnamed_page_counter};
  return "FOO" . $ctr; }

sub getPageName {
  my ($self, $doc, $page, $parent, $parentpath, $recursive) = @_;
  my $asdir;
  my $naming = $$self{splitnaming};
  my $attr   = ($naming =~ /^id/ ? 'xml:id'
    : ($naming =~ /^label/ ? 'labels' : undef));
  my $name = $page->getAttribute($attr);
  $name =~ s/\s+.*//   if $name;    # Truncate in case multiple labels.
  $name =~ s/^LABEL:// if $name;
  if (!$name) {
    if (($attr eq 'labels') && ($name = $page->getAttribute('xml:id'))) {
      Info('expected', $attr, $doc->getQName($page),
        "Using '$name' to create page pathname, instead of missing '$attr'");
      $attr = 'xml:id'; }
    else {
      $name = $self->generateUnnamedPageName;
      Info('expected', $attr, $doc->getQName($page),
        "Using '$name' to create page pathname, instead of missing '$attr'"); } }
  if ($naming =~ /relative$/) {
    my $pname = $parent->getAttribute($attr);
    $pname =~ s/\s+.*//   if $pname;    # Truncate in case multiple labels.
    $pname =~ s/^LABEL:// if $pname;
    if ($pname && $name =~ /^\Q$pname\E(\.|_|:)+(.*)$/) {
      $name = $2; }
    $asdir = $recursive; }
  $name =~ s/:+/_/g;
  if ($asdir) {
    return pathname_make(dir => pathname_concat(pathname_directory($parentpath), $name),
      name => 'index',
      type => $doc->getDestinationExtension); }
  else {
    return pathname_make(dir => pathname_directory($parentpath),
      name => $name,
      type => $doc->getDestinationExtension); } }

# ================================================================================
1;