File: Split.pm

package info (click to toggle)
latexml 0.8.7-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 29,128 kB
  • sloc: xml: 98,982; perl: 29,706; sh: 179; javascript: 28; makefile: 15
file content (195 lines) | stat: -rw-r--r-- 8,643 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# /=====================================================================\ #
# |  LaTeXML::Post::Split                                               | #
# | Split documents into pages                                          | #
# |=====================================================================| #
# | Part of LaTeXML:                                                    | #
# |  Public domain software, produced as part of work done by the       | #
# |  United States Government & not subject to copyright in the US.     | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov>                        #_#     | #
# | http://dlmf.nist.gov/LaTeXML/                              (o o)    | #
# \=========================================================ooo==U==ooo=/ #

package LaTeXML::Post::Split;
use strict;
use warnings;
use LaTeXML::Util::Pathname;
use LaTeXML::Common::XML;
use LaTeXML::Post;
use base qw(LaTeXML::Post::Processor);

sub new {
  my ($class, %options) = @_;
  my $self = $class->SUPER::new(%options);
  $$self{split_xpath}   = $options{split_xpath};
  $$self{splitnaming}   = $options{splitnaming};
  $$self{no_navigation} = $options{no_navigation};
  return $self; }

# Could this actually just return the nodes that are to become pages?
# sub toProcess { ??? }

sub process {
  my ($self, $doc, $root) = @_;
  # RISKY and annoying; to split, we really need an id on the root.
  # Writer will remove it.
  $root->setAttribute('xml:id' => 'TEMPORARY_DOCUMENT_ID') unless $root->hasAttribute('xml:id');

  my @docs  = ($doc);
  my @pages = $self->getPages($doc);
  # Weird test: exclude the "whole document" from the list (?)
  @pages = grep { $_->parentNode->parentNode } @pages;    # Strip out the root node.
  if (@pages) {
    my @nav = $doc->findnodes("descendant::ltx:navigation");
    $doc->removeNodes(@nav) if @nav;
    my $tree = { node => $root, document => $doc,
      id       => $root->getAttribute('xml:id'), name => $doc->getDestination,
      children => [] };
    # Group the pages into a tree, in case they are nested.
    my $haschildren = {};
    presortPages($tree, $haschildren, @pages);
    # Work out the destination paths for each page
    $self->prenamePages($doc, $tree, $haschildren);
    # Now, create remove and create documents for each page.
    push(@docs, $self->processPages($doc, @{ $$tree{children} }));

    $self->addNavigation($tree, @nav) if @nav;
  }
  my $n = scalar(@docs);
  NoteLog(($n > 1 ? " [Split into in $n TOCs]" : "[not split]"));
  return @docs; }

# Get the nodes in the document that WILL BECOME separate "pages".
# (they are not yet removed from the main document)
# Subclass can override, if needed.
sub getPages {
  my ($self, $doc) = @_;
  return $doc->findnodes($$self{split_xpath}); }

# Sort the pages into a tree, in case some pages are children of others
# If a page contains NOTHING BUT child pages (except frontmatter),
# we could just merge that page as a level in it's containing TOC instead of a document.???
sub presortPages {
  my ($tree, $haschildren, @pages) = @_;
  my $nextlevel;    # if $page is a descendant of some othe page
  foreach my $page (@pages) {
    # $page should be a child of the current node, or a child of some higher one.
    while ($$tree{parent} && !isChild($page, $$tree{node})) {
      $tree = $$tree{parent}; }
    my $entry = { node => $page, upid => $$tree{id}, id => $page->getAttribute('xml:id'),
      parent => $tree, children => [] };
    $$haschildren{ $$tree{node}->localname } = 1;    # Wrong key for this!?!
    push(@{ $$tree{children} }, $entry);
    $tree = $entry; }    # go "down", in case following are children of current node.
  return; }

# Get destination pathnames for each page.
sub prenamePages {
  my ($self, $doc, $tree, $haschildren) = @_;
  foreach my $entry (@{ $$tree{children} }) {
    $$entry{name} = $self->getPageName($doc, $$entry{node}, $$tree{node}, $$tree{name},
      $$haschildren{ $$entry{node}->localname });
    $self->prenamePages($doc, $entry, $haschildren); }
  return; }

# Process a sequence of page entries, removing them and generating documents for each.
sub processPages {
  my ($self, $doc, @entries) = @_;
  my $rootid = $doc->getDocumentElement->getAttribute('xml:id');
  # Before any document surgery, copy inheritable attributes.
  my $intoc = 0;    # Whether ANY children appear in toc
  foreach my $entry (@entries) {
    my $node = $$entry{node};
    $intoc ||= ($node->getAttribute('inlist') || '') =~ /\btoc\b/;
    foreach my $attr (qw(xml:lang backgroundcolor)) {
      if (my $anc = $doc->findnode('ancestor-or-self::*[@' . $attr . '][1]', $node)) {
        $node->setAttribute($attr => $anc->getAttribute($attr)); } } }
  my @docs = ();
  while (@entries) {
    my $parent = $entries[0]->{node}->parentNode;
    # Remove $page & ALL following siblings (backwards).
    my @removed = ();
    while (my $sib = $parent->lastChild) {
      $parent->removeChild($sib);
      unshift(@removed, $sib);
      last if $sib->isSameNode($entries[0]->{node}); }
    # Build toc from adjacent nodes that are being extracted.
    my @toc = ();
    # Process a sequence of adjacent pages; these will go into the same TOC.
    while (@entries && @removed && $entries[0]->{node}->isSameNode($removed[0])) {
      my $entry = shift(@entries);
      my $page  = $$entry{node};
      # If any pages go in toc, Assume siblings on their own page should go also
      $page->setAttribute(inlist => 'toc') if $intoc && !$page->hasAttribute('inlist');
      $doc->removeNodes(shift(@removed));
      my $id       = $page->getAttribute('xml:id');
      my $tocentry = ['ltx:tocentry', {},
        ['ltx:ref', { idref => $id, show => 'toctitle' }]];
      push(@toc, $tocentry);
      # Due to the way document building works, we remove & process children pages
      # BEFORE processing this page.
      my @childdocs = $self->processPages($doc, @{ $$entry{children} });
      my $subdoc    = $doc->newDocument($page, destination => $$entry{name},
        parentDocument => $doc, parent_id => $$entry{upid});
      $$entry{document} = $subdoc;
      push(@docs, $subdoc, @childdocs); }
    # Finally, add the toc to reflect the consecutive, removed nodes, and add back the remainder
    my $type = $parent->localname;
    $doc->addNodes($parent, ['ltx:TOC', {}, ['ltx:toclist', { class => 'ltx_toclist_' . $type }, @toc]])
      if @toc && !$doc->findnodes("descendant::ltx:TOC[\@lists='toc']", $parent);
    map { $parent->addChild($_) } @removed; }
  return @docs; }

sub addNavigation {
  my ($self, $entry, @nav) = @_;
  my $doc = $$entry{document};
  $doc->addNodes($doc->getDocumentElement, @nav);    # cloning, as needed...
  foreach my $child (@{ $$entry{children} }) {
    my $childdoc = $$child{document};
    $self->addNavigation($child, @nav); }            # now, recurse
  return; }

# error situation: generate some kind of unique name for page
sub generateUnnamedPageName {
  my ($self) = @_;
  my $ctr = ++$$self{unnamed_page_counter};
  return "FOO" . $ctr; }

sub getPageName {
  my ($self, $doc, $page, $parent, $parentpath, $recursive) = @_;
  my $asdir;
  my $naming = $$self{splitnaming};
  my $attr   = ($naming =~ /^id/ ? 'xml:id'
    : ($naming =~ /^label/ ? 'labels' : undef));
  my $name = $page->getAttribute($attr);
  $name =~ s/\s+.*//   if $name;    # Truncate in case multiple labels.
  $name =~ s/^LABEL:// if $name;
  if (!$name) {
    if (($attr eq 'labels') && ($name = $page->getAttribute('xml:id'))) {
      Info('expected', $attr, $doc->getQName($page),
        "Using '$name' to create page pathname, instead of missing '$attr'");
      $attr = 'xml:id'; }
    else {
      $name = $self->generateUnnamedPageName;
      Info('expected', $attr, $doc->getQName($page),
        "Using '$name' to create page pathname, instead of missing '$attr'"); } }
  if ($naming =~ /relative$/) {
    my $pname = $parent->getAttribute($attr);
    $pname =~ s/\s+.*//   if $pname;    # Truncate in case multiple labels.
    $pname =~ s/^LABEL:// if $pname;
    if ($pname && $name =~ /^\Q$pname\E(\.|_|:)+(.*)$/) {
      $name = $2; }
    $asdir = $recursive; }
  $name =~ s/:+/_/g;
  if ($asdir) {
    return pathname_make(dir => pathname_concat(pathname_directory($parentpath), $name),
      name => 'index',
      type => $doc->getDestinationExtension); }
  else {
    return pathname_make(dir => pathname_directory($parentpath),
      name => $name,
      type => $doc->getDestinationExtension); } }

# ================================================================================
1;