File: OperatorDictionary.pm

package info (click to toggle)
latexml 0.8.8-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 31,920 kB
  • sloc: xml: 109,048; perl: 30,224; sh: 179; javascript: 28; makefile: 13
file content (248 lines) | stat: -rw-r--r-- 13,515 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# /=====================================================================\ #
# |  LaTeXML::Post::MathML::OperatorDictionary                          | #
# | MathML generator for LaTeXML                                        | #
# |=====================================================================| #
# | Part of LaTeXML:                                                    | #
# |  Public domain software, produced as part of work done by the       | #
# |  United States Government & not subject to copyright in the US.     | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov>                        #_#     | #
# | http://dlmf.nist.gov/LaTeXML/                              (o o)    | #
# \=========================================================ooo==U==ooo=/ #

package LaTeXML::Post::MathML::OperatorDictionary;
use strict;
use warnings;
use LaTeXML::Common::Error;
use LaTeXML::Util::Unicode;
use base qw(Exporter);
our @EXPORT = (qw( &opdict_lookup));

#======================================================================
# Algorithm loosely derived from MathML Core 2.6.1

# Lookup tables: declared here, populated in the BEGIN block at the end of this file.
our $Operators_2_ascii_chars;    # arrayref of the two-character ASCII operator strings
our $Operators_fence;            # hashref: character => 1 for operators with the fence property
our $Operators_separator;        # hashref: character => 1 for operators with the separator property
our $Content_form;               # hashref: form => { character => category letter }
our $Category_data;              # hashref: category => { form, encoding, lspace, rspace, ... }
# Spacing constants (in ems); also assigned in the BEGIN block at the end of this file.
our $thin;     #  = 0.167;    # 3 mu in ems
our $med;      #   = 0.222;    # 4 mu
our $thick;    # = 0.278;    # 5 mu

# Rough assumption of what MathML form each role corresponds to
# (without actually looking at its position within the expression!)
# Keys are role names; values are the form used to select a sub-table of $Content_form.
# opdict_lookup treats an undefined role as ATOM, and falls back to 'infix'
# for any role that is not listed here.
our $role_form = {
  ATOM    => 'infix',
  UNKNOWN => 'infix',
  ID      => 'infix',
  NUMBER  => 'infix',
  # NOTE(review): 'Inner' is not a key of $Content_form (only infix, prefix and postfix are),
  # so lookup_category can only ever answer 'Default' for ARRAY -- confirm this is intended.
  ARRAY   => 'Inner',
  RELOP   => 'infix',
  OPEN    => 'prefix',
  CLOSE   => 'postfix',
  MIDDLE  => 'infix',
  # PUNCT was previously mapped to 'postfix'; the old entry is kept for reference.
  #  PUNCT            => 'postfix',
  PUNCT            => 'infix',
  VERTBAR          => 'postfix',
  PERIOD           => 'postfix',
  METARELOP        => 'infix',
  MODIFIEROP       => 'infix',
  MODIFIER         => 'infix',
  ARROW            => 'infix',
  ADDOP            => 'infix',
  MULOP            => 'infix',
  BINOP            => 'infix',
  POSTFIX          => 'postfix',
  FUNCTION         => 'infix',
  OPFUNCTION       => 'prefix',
  TRIGFUNCTION     => 'prefix',
  APPLYOP          => 'infix',
  COMPOSEOP        => 'infix',
  SUPOP            => 'infix',      #? (original author was unsure of this one)
  BIGOP            => 'prefix',
  SUMOP            => 'prefix',
  INTOP            => 'prefix',
  LIMITOP          => 'prefix',
  DIFFOP           => 'infix',
  OPERATOR         => 'prefix',
  POSTSUBSCRIPT    => 'infix',
  POSTSUPERSCRIPT  => 'infix',
  FLOATSUPERSCRIPT => 'infix',
  FLOATSUBSCRIPT   => 'infix', };

# Look up the operator-dictionary properties of an operator.
#   $content : the text of the operator
#   $role    : its role (a key of $role_form); may be undefined
# Returns a flat key => value list (assign it to a hash) holding a copy of the
# $Category_data entry for the operator's category, plus fence => 1 and/or
# separator => 1 when $content is listed in the corresponding special table.
sub opdict_lookup {
  my ($content, $role) = @_;
  # An undefined role counts as ATOM; a role missing from the table counts as infix.
  my $form = $$role_form{ defined($role) ? $role : 'ATOM' };
  $form = 'infix' unless defined $form;
  my $category = lookup_category($content, $form) || 'Default';
  # Work on a copy, so the shared category table is never modified.
  my %data = %{ $$Category_data{$category} };
  $data{fence}     = 1 if $$Operators_fence{$content};
  $data{separator} = 1 if $$Operators_separator{$content};

  if ($LaTeXML::DEBUG{mathspacing}) {
    # Show the code point only for single-character operators.
    my $codepoint = '';
    if (length($content) == 1) {
      $codepoint = '(' . sprintf("0x%05x", ord($content)) . ')'; }
    my @settings = map { $_ . '=' . ($data{$_} // '<undefined>') } sort keys %data;
    Debug("OPDICT '$content' "
        . $codepoint
        . " as " . ($role || '<undefined>') . " ($form) is type $category: "
        . join(',', @settings)); }
  return %data; }

# Determine the operator-dictionary category of an operator.
#   $content : the text of the operator (may be undefined)
#   $form    : 'infix', 'prefix' or 'postfix'; any other value yields 'Default'
# Returns a key of $Category_data: a category letter from $Content_form,
# 'ForceDefault', or 'Default' when the operator is not in the dictionary.
# The numbered steps follow the algorithm of the MathML Core operator dictionary.
sub lookup_category {
  my ($content, $form) = @_;
  return 'Default' unless defined $content;
  my $len = length($content);
  # (1) If Content does not have length 1 or 2 then exit with category Default.
  #   (The spec counts UTF-16 code units; Perl's length counts characters.)
  return 'Default' if ($len < 1) || ($len > 2);
  if ($len == 1) {
    # (2) If Content is a single character in the range U+0320-U+03FF then exit with category Default.
    #   (The start of that range is reused below to encode the two-character ASCII operators.)
    my $single = ord($content);
    return 'Default' if ($single >= 0x0320) && ($single <= 0x03FF); }
  else {
    # Otherwise, it has two characters:
    my $code2 = ord(substr($content, 1, 1));
    # [NOT YET HANDLED] If Content is the surrogate pairs corresponding to
    #   U+1EEF0 ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL
    #   or U+1EEF1 ARABIC MATHEMATICAL OPERATOR HAH WITH DAL and Form is postfix,
    #   exit with category I.
    #
    # If the second character is
    #   U+0338 COMBINING LONG SOLIDUS OVERLAY or U+20D2 COMBINING LONG VERTICAL LINE OVERLAY
    #   then replace Content with the first character and move to step 3.
    if (($code2 == 0x0338) || ($code2 == 0x20D2)) {
      $content = substr($content, 0, 1); }
    else {
      # Otherwise, if Content is listed in Operators_2_ascii_chars
      #   then replace Content with the Unicode character "U+0320
      #   plus the index of Content in Operators_2_ascii_chars"
      #   and move to step 3.
      # E.g. '&&' has index 2 and so becomes U+0322, which the infix table lists.
      # The entry must be compared against $content; merely testing that the
      # entry is true would accept every two-character string.
      my ($index) = grep { $$Operators_2_ascii_chars[$_] eq $content } 0 .. $#$Operators_2_ascii_chars;
      # Otherwise exit with category Default.
      return 'Default' unless defined $index;
      $content = chr(0x0320 + $index); } }
  # (3) If Form is infix and Content corresponds to one of U+007C VERTICAL LINE or U+223C TILDE OPERATOR
  #   then exit with category ForceDefault.
  # [Ignore the whole Table 27 complication]
  my $code = ord($content);
  if (($form eq 'infix') && (($code == 0x007C) || ($code == 0x223C))) {
    return 'ForceDefault'; }
  # Fetch the per-form table first, so that an unknown form (e.g. 'Inner')
  # does not autovivify an empty entry in $Content_form.
  my $table = $$Content_form{$form};
  return ($table && $$table{$content}) || 'Default'; }

#======================================================================
# Utility for expanding Unicode values & ranges, as given in the MathML Core spec.
#   $string : comma-separated entries (optional whitespace after each comma),
#             each one of   U+XXXX   {U+XXXX}   [U+XXXX-U+YYYY]
#             where a bracketed entry is an inclusive range of code points
#   $value  : the value to associate with every character; defaults to 1
# Returns a flat (character => $value, ...) list, ready to assign to a hash.
# An entry in any other format is reported through Debug and otherwise skipped.
sub decode_ranges {
  my ($string, $value) = @_;
  $value //= 1;
  my %chars;
  for my $entry (split(/,\s*/, $string)) {
    if (my ($bare) = $entry =~ /^U\+(\w+)$/) {
      $chars{ chr(hex($bare)) } = $value; }
    elsif (my ($braced) = $entry =~ /^\{U\+(\w+)\}$/) {
      $chars{ chr(hex($braced)) } = $value; }
    elsif (my ($first, $last) = $entry =~ /^\[U\+(\w+).U\+(\w+)\]$/) {
      # The two ends are separated by any single character (matched by '.').
      $chars{ chr($_) } = $value for hex($first) .. hex($last); }
    else {
      Debug("MISUNDERSTOOD unicode range: '$entry'"); } }
  return %chars; }

#======================================================================
# Special Table	Entries
# Populate the tables and spacing constants declared near the top of this file.
# Being a BEGIN block, this runs as soon as it has been compiled, so the tables
# are already filled in when the rest of the module is loaded.
# decode_ranges (defined above) expands each range string into character => value pairs.
BEGIN {
  # Figure 24 Special tables for the operator dictionary.
  #   Total size: 82 entries, 90 bytes
  #   (assuming characters are UTF-16 and 1-byte range lengths).
  # Operators_2_ascii_chars 18 entries (2-characters ASCII strings):
  # The position of a string in this list matters: the lookup algorithm encodes
  # the string as the character U+0320 plus its (0-based) index.
  $Operators_2_ascii_chars = [
'!!', '!=', '&&', '**', '*=', '++', '+=', '--', '-=', '->', '//', '/=', ':=', '<=', '<>', '==', '>=', '||'];
  # Note that fence & separator properties have no visible effect, but are for semantics
  # Operators_fence 61 entries (16 Unicode ranges):
  $Operators_fence = { decode_ranges(
"[U+0028-U+0029], {U+005B}, {U+005D}, [U+007B-U+007D], {U+0331}, {U+2016}, [U+2018-U+2019], [U+201C-U+201D], [U+2308-U+230B], [U+2329-U+232A], [U+2772-U+2773], [U+27E6-U+27EF], {U+2980}, [U+2983-U+2999], [U+29D8-U+29DB], [U+29FC-U+29FD]") };
  # Operators_separator 3 entries: U+002C, U+003B, U+2063,
  $Operators_separator = { decode_ranges("U+002C, U+003B, U+2063") };

  #======================================================================
  # Figure 25 Mapping from operator (Content, Form) to a category.
  # Total size: 725 entries, 639 bytes
  # (assuming characters are UTF-16 and 1-byte range lengths).
  # NOTE: when transcribing ranges, replace the en-dash (\x{2013}) with an ASCII hyphen (-)
  # (Content, Form) keys => Category
  # The outer keys are the forms; each inner hash maps a character to its category letter.
  # Characters U+0320 and up stand for the two-character ASCII operators listed above.
  $Content_form = {
    infix => {
      #  313 entries (35 Unicode ranges) in infix form:
      decode_ranges(
"[U+2190-U+2195], [U+219A-U+21AE], [U+21B0-U+21B5], {U+21B9}, [U+21BC-U+21D5], [U+21DA-U+21F0], [U+21F3-U+21FF], {U+2794}, {U+2799}, [U+279B-U+27A1], [U+27A5-U+27A6], [U+27A8-U+27AF], {U+27B1}, {U+27B3}, {U+27B5}, {U+27B8}, [U+27BA-U+27BE], [U+27F0-U+27F1], [U+27F4-U+27FF], [U+2900-U+2920], [U+2934-U+2937], [U+2942-U+2975], [U+297C-U+297F], [U+2B04-U+2B07], [U+2B0C-U+2B11], [U+2B30-U+2B3E], [U+2B40-U+2B4C], [U+2B60-U+2B65], [U+2B6A-U+2B6D], [U+2B70-U+2B73], [U+2B7A-U+2B7D], [U+2B80-U+2B87], {U+2B95}, [U+2BA0-U+2BAF], {U+2BB8}", 'A'),
      # 109 entries (32 Unicode ranges) in infix form:
      decode_ranges(
"{U+002B}, {U+002D}, {U+002F}, {U+00B1}, {U+00F7}, {U+0322}, {U+2044}, [U+2212-U+2216], [U+2227-U+222A], {U+2236}, {U+2238}, [U+228C-U+228E], [U+2293-U+2296], {U+2298}, [U+229D-U+229F], [U+22BB-U+22BD], [U+22CE-U+22CF], [U+22D2-U+22D3], [U+2795-U+2797], {U+29B8}, {U+29BC}, [U+29C4-U+29C5], [U+29F5-U+29FB], [U+2A1F-U+2A2E], [U+2A38-U+2A3A], {U+2A3E}, [U+2A40-U+2A4F], [U+2A51-U+2A63], {U+2ADB}, {U+2AF6}, {U+2AFB}, {U+2AFD}", 'B'),
      # 64 entries (33 Unicode ranges) in infix form:
      decode_ranges(
"{U+0025}, {U+002A}, {U+002E}, [U+003F-U+0040], {U+005E}, {U+00B7}, {U+00D7}, {U+0323}, {U+032E}, {U+2022}, {U+2043}, [U+2217-U+2219], {U+2240}, {U+2297}, [U+2299-U+229B], [U+22A0-U+22A1], {U+22BA}, [U+22C4-U+22C7], [U+22C9-U+22CC], [U+2305-U+2306], {U+27CB}, {U+27CD}, [U+29C6-U+29C8], [U+29D4-U+29D7], {U+29E2}, [U+2A1D-U+2A1E], [U+2A2F-U+2A37], [U+2A3B-U+2A3D], {U+2A3F}, {U+2A50}, [U+2A64-U+2A65], [U+2ADC-U+2ADD], {U+2AFE}", 'C'),
      # 7 entries (4 Unicode ranges) in infix form:
      decode_ranges("{U+005C}, {U+005F}, [U+2061-U+2064], {U+2206}", 'K'),
      # 3 entries in infix form:
      decode_ranges("U+002C, U+003A, U+003B", 'M'),
    },
    prefix => {
      # 52 entries (22 Unicode ranges) in prefix form:
      decode_ranges(
"{U+0021}, {U+002B}, {U+002D}, {U+00AC}, {U+00B1}, {U+0331}, {U+2018}, {U+201C}, [U+2200-U+2201], [U+2203-U+2204], {U+2207}, [U+2212-U+2213], [U+221F-U+2222], [U+2234-U+2235], {U+223C}, [U+22BE-U+22BF], {U+2310}, {U+2319}, [U+2795-U+2796], {U+27C0}, [U+299B-U+29AF], [U+2AEC-U+2AED]", 'D'),
      # 30 entries in prefix form:
      decode_ranges(
"U+0028, U+005B, U+007B, U+007C, U+2016, U+2308, U+230A, U+2329, U+2772, U+27E6, U+27E8, U+27EA, U+27EC, U+27EE, U+2980, U+2983, U+2985, U+2987, U+2989, U+298B, U+298D, U+298F, U+2991, U+2993, U+2995, U+2997, U+2999, U+29D8, U+29DA, U+29FC", 'F'),
      # 27 entries (2 Unicode ranges) in prefix form:
      decode_ranges("[U+222B-U+2233], [U+2A0B-U+2A1C]", 'H'),
      # 22 entries (6 Unicode ranges) in prefix form:
      decode_ranges(
        "[U+220F-U+2211], [U+22C0-U+22C3], [U+2A00-U+2A0A], [U+2A1D-U+2A1E], {U+2AFC}, {U+2AFF}", 'J'),
      # 6 entries (3 Unicode ranges) in prefix form:
      decode_ranges("[U+2145-U+2146], {U+2202}, [U+221A-U+221C]", 'L'),
    },
    postfix => {
      # 40 entries (21 Unicode ranges) in postfix form:
      decode_ranges(
"[U+0021-U+0022], [U+0025-U+0027], {U+0060}, {U+00A8}, {U+00B0}, [U+00B2-U+00B4], [U+00B8-U+00B9], [U+02CA-U+02CB], [U+02D8-U+02DA], {U+02DD}, {U+0311}, {U+0320}, {U+0325}, {U+0327}, {U+0331}, [U+2019-U+201B], [U+201D-U+201F], [U+2032-U+2037], {U+2057}, [U+20DB-U+20DC], {U+23CD}", 'E'),
      # 30 entries in postfix form:
      decode_ranges(
"U+0029, U+005D, U+007C, U+007D, U+2016, U+2309, U+230B, U+232A, U+2773, U+27E7, U+27E9, U+27EB, U+27ED, U+27EF, U+2980, U+2984, U+2986, U+2988, U+298A, U+298C, U+298E, U+2990, U+2992, U+2994, U+2996, U+2998, U+2999, U+29D9, U+29DB, U+29FD", 'G'),
      # 22 entries (13 Unicode ranges) in postfix form:
      decode_ranges(
"[U+005E-U+005F], {U+007E}, {U+00AF}, [U+02C6-U+02C7], {U+02C9}, {U+02CD}, {U+02DC}, {U+02F7}, {U+0302}, {U+203E}, [U+2322-U+2323], [U+23B4-U+23B5], [U+23DC-U+23E1]", 'I'),
    },
  };

  #======================================================================
  # Figure 26 Operators values for each category.
  # The third column provides a 4-bit encoding of the categories
  # where the 2 least significant bits encode the form infix (0), prefix (1) and postfix (2).
  # Category Form Encoding rspace lspace properties
  # Spacing values are in ems.
  $thin  = 0.167;    # 3 mu in ems
  $med   = 0.222;    # 4 mu
  $thick = 0.278;    # 5 mu
     # NOTE: The table headings lspace,rspace are swapped in the current CORE draft!!!! Report this!!!
  # Each entry holds form, encoding, lspace and rspace; the boolean properties
  # (stretchy, symmetric, largeop, movablelimits) are present only when set.
  # Default and ForceDefault carry the same values here; form and encoding are undef for them.
  $Category_data = {
    Default      => { form => undef, encoding => undef, lspace => $thick, rspace => $thick },
    ForceDefault => { form => undef, encoding => undef, lspace => $thick, rspace => $thick },
    A => { form => 'infix',   encoding => 0x0, lspace => $thick, rspace => $thick, stretchy => 1 },
    B => { form => 'infix',   encoding => 0x4, lspace => $med,   rspace => $med },
    C => { form => 'infix',   encoding => 0x8, lspace => $thin,  rspace => $thin },
    D => { form => 'prefix',  encoding => 0x1, lspace => 0,      rspace => 0 },
    E => { form => 'postfix', encoding => 0x2, lspace => 0,      rspace => 0 },
    F => { form => 'prefix', encoding => 0x5, lspace => 0, rspace => 0, stretchy => 1, symmetric => 1 },
    G => { form => 'postfix', encoding => 0x6, lspace => 0, rspace => 0, stretchy => 1, symmetric => 1 },
    H => { form => 'prefix', encoding => 0x9, lspace => $thin, rspace => $thin, symmetric => 1, largeop => 1 },
    I => { form => 'postfix', encoding => 0xA, lspace => 0, rspace => 0, stretchy => 1 },
    J => { form => 'prefix', encoding => 0xD, lspace => $thin, rspace => $thin, symmetric => 1, largeop => 1, movablelimits => 1 },
    K => { form => 'infix',  encoding => 0xC,   lspace => 0,     rspace => 0 },
    L => { form => 'prefix', encoding => undef, lspace => $thin, rspace => 0 },
    M => { form => 'infix',  encoding => undef, lspace => 0,     rspace => $thin },
  };
  #======================================================================
}
1;