File: Hyphenate_basic.pm

package info (click to toggle)
libpdf-builder-perl 3.028-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 24,188 kB
  • sloc: perl: 109,309; makefile: 10
file content (299 lines) | stat: -rw-r--r-- 11,808 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
package PDF::Builder::Content::Hyphenate_basic;

use base 'PDF::Builder::Content::Text';

use strict;
use warnings;

our $VERSION = '3.028'; # VERSION
our $LAST_UPDATE = '3.027'; # manually update whenever code is changed

=head1 NAME

PDF::Builder::Content::Hyphenate_basic - Simple hyphenation capability

Inherits from L<PDF::Builder::Content::Text>

=head1 SYNOPSIS

These are internal routines that are somewhat experimental, and may (or may
not) be extended in the future. They are called from various Content routines
that take long strings of text and split them into fixed-length lines.

Words are split to fill the line most completely, without regard to widows and
orphans, long runs of hyphens at the right edge, "rivers" of space flowing
through a paragraph, and other problems. Also, only simple splitting of
character strings (not necessarily actual I<words>) is done, on a simple,
language-independent basis. No dictionary or rules-based splitting is
currently done.

This functionality may well be replaced by "hooks" to call language-specific
word-splitting rules, as well as worrying about the appearance of the results
(such as Knuth-Plass).

=cut

# Main entry. Returns array of left portion of word (and -) to stick on end of
# sentence (may be empty) and remaining (right) portion of word to go on next
# line (usually not empty).
sub splitWord {
    my ($self, $word, $width, %opts) = @_;
    # copy dashed option names to preferred undashed names
    if (defined $opts{'-spHH'} && !defined $opts{'spHH'}) { $opts{'spHH'} = delete($opts{'-spHH'}); }
    if (defined $opts{'-spOP'} && !defined $opts{'spOP'}) { $opts{'spOP'} = delete($opts{'-spOP'}); }
    if (defined $opts{'-spDR'} && !defined $opts{'spDR'}) { $opts{'spDR'} = delete($opts{'-spDR'}); }
    if (defined $opts{'-spLR'} && !defined $opts{'spLR'}) { $opts{'spLR'} = delete($opts{'-spLR'}); }
    if (defined $opts{'-spCC'} && !defined $opts{'spCC'}) { $opts{'spCC'} = delete($opts{'-spCC'}); }

    my ($leftWord, $rightWord, @splitLoc, @chars, $i, $j, $len);

    # various settings, some of which may be language-specific
    my $minBegin = 2;  # minimum 2 characters before split (English rules)
    if (defined $opts{'min_prefix'}) { $minBegin = $opts{'min_prefix'}; }
    my $minEnd   = 3;  # minimum 3 characters to next line (English rules)
    if (defined $opts{'min_suffix'}) { $minEnd = $opts{'min_suffix'}; }
    my $hyphen = '-';
   #my $hyphen = "\xAD";  # add a hyphen at split, unless splitting at -
                       # or other dash character
    # NOTE: PDF-1.7 14.8.2.2.3 suggests using a soft hyphen (\AD) when splitting
    #       a word at the end of the line, so that when text is extracted for
    #       a screen reader, etc., the closed-up word can have the "visible"
    #       hyphen removed. PDF readers should render as -.
    my @suppressHyphen = ( # ASCII/Latin-1/UTF-8 ordinals to NOT add - after
       #  -   en-dash em-dash /
	  45, 8211,   8212,   47,
                         );
    my $splitHardH = defined($opts{'spHH'})? $opts{'spHH'}: 1;  # 1=OK to split on hard (explicit) hyphen U+002D
    my $otherPunc = defined($opts{'spOP'})? $opts{'spOP'}: 1;  # 1=OK to split after most punctuation
    my $digitRun = defined($opts{'spDR'})? $opts{'spDR'}: 1;  # 1=OK to split after run of digit(s)
    my $letterRun = defined($opts{'spLR'})? $opts{'spLR'}: 1;  # 1=OK to split after run of ASCII letter(s)
    my $camelCase = defined($opts{'spCC'})? $opts{'spCC'}: 1;  # 1=OK to split camelCase on ASCII lc-to-UC transition
    my $splitReqBlnk = defined($opts{'spRB'})? $opts{'spRB'}: 0; # 1=OK to split on required blank (NBSP) -- desperation move
    my $splitAnywhere = defined($opts{'spFS'})? $opts{'spFS'}: 0; # 1=OK to split to fit available space -- super desperation move
    if ($splitAnywhere) {
	# if requesting to split within a certain length, suppress all other flags
	$splitHardH = $otherPunc = $digitRun = $letterRun = $camelCase =
	    $splitReqBlnk = 0;
    }

    # note that we are ignoring U+2010 "hyphen" and U+2011 "non-splitting 
    # hyphen". The first is probably rare enough to not be worth the bother,
    # and the second won't be split at anyway.

    $leftWord = '';  # default return values
    $rightWord = $word;

    @splitLoc = ();  # no known OK splits yet

    # highest priority for splits: hard and soft hyphens
    # remove SHYs, remember any break points
    ($word, @splitLoc) = _removeSHY($word);
    # remember any break points due to hard coded hyphens
    @chars = split //, $word;
    for ($i=0; $i<scalar(@chars); $i++) {
	if ($chars[$i] eq '-' && $splitHardH) { push @splitLoc, $i; }
	# note that unlike SHY, - is not removed
    }

    # If nothing in @splitLoc, proceed to find other splits. If @splitLoc
    # has at least one entry, could make it the top priority and split there,
    # and not look at other possible splits. Or, keep adding to @splitLoc 
    # (equal priority for all possible splits). Mix and match is OK
    # (grouping criteria, as hard and soft hyphens were done together).

   #if (!@splitLoc) {
        if ($otherPunc) {
            # look for other punctuation to split after.
	    # don't split on ' or " or other quotes (<, <<, etc.)
	    # !%&)]*+/,.:;<>?^_~ and curly right brace ASCII OK for now
	    # en-dash, em-dash should ideally be split after, whether they are
	    # free floating or embedded between words.
	    my @ASCII_punct = ( '!', '.', '?', ',', '%', '&', ':', ';',
		                '<', '>', ')', ']', chr(125), '_', '~', 
		                '^', '+', '*', '/',   );
	    #                   en-dash em-dash
	    my @UTF8_punct =  ( 8211,   8212,   );
	    # remember not to split if next char is - 
	    # (defer split to after hard hyphen - [if allowed]).
            for ($i=0; $i<scalar(@chars)-1; $i++) {
	        foreach (@ASCII_punct) {
	            if ($chars[$i] eq $_ && $chars[$i+1] ne '-') { 
			push @splitLoc, $i; 
			last;
	            }
	        }
	        foreach (@UTF8_punct) {
	            if (ord($chars[$i]) == $_ && $chars[$i+1] ne '-') { 
			push @splitLoc, $i;
		       	last;
	       	    }
	        }
            }
        }
   #}

    # group digit runs and camelCase together at same priority
   #if (!@splitLoc) {
        if ($digitRun) {
            # look for a run of digits to split after.
	    # that is, any digit NOT followed by another digit.
	    # remember not to split if next char is - 
	    # (defer split to after hard hyphen - [if allowed]).
            for ($i=0; $i<scalar(@chars)-1; $i++) {
	        if ($chars[$i] ge '0' && $chars[$i] le '9' &&
		    !($chars[$i+1] ge '0' && $chars[$i+1] le '9' ||
		      $chars[$i+1] eq '-')) {
		    push @splitLoc, $i;
	        }
	    }
        }

        if ($letterRun) {
            # look for a run of letters (ASCII) to split after.
	    # that is, any letter NOT followed by another letter.
	    # remember not to split if next char is - 
	    # (defer split to after hard hyphen - [if allowed]).
            for ($i=0; $i<scalar(@chars)-1; $i++) {
	        if (($chars[$i] ge 'a' && $chars[$i] le 'z' ||
	             $chars[$i] ge 'A' && $chars[$i] le 'Z' )  &&
		    !($chars[$i+1] ge 'a' && $chars[$i+1] le 'z' ||
		      $chars[$i+1] ge 'A' && $chars[$i+1] le 'Z' ||
	              $chars[$i+1] eq '-')  ) {
		    push @splitLoc, $i;
	        }
	    }
        }

        if ($camelCase) {
            # look for camelCase to split on lowercase to
    	    # uppercase transitions. just ASCII letters for now.
	    # Note that this will split names like McIlroy -> Mc-Ilroy
	    # and MacDonald -> Mac-Donald.
            for ($i=0; $i<scalar(@chars)-1; $i++) {
	        if ($chars[$i] ge 'a' && $chars[$i] le 'z' &&
		    $chars[$i+1] ge 'A' && $chars[$i+1] le 'Z') {
		    push @splitLoc, $i;
	        }
	    }
        }
   #}

   #if (!@splitLoc) {
        # look for real English word split locations
	# TBD
   #}

   if (!@splitLoc && $splitReqBlnk) {
      # remember any break points due to desperation split at NBSP
      @chars = split //, $word;
      for ($i=0; $i<scalar(@chars); $i++) {
	  if ($chars[$i] eq "\xA0") { push @splitLoc, $i; }
	  # note that NBSP converted to regular space (x20). we will need
	  # to overwrite the split one with the hyphen
      }
   }
	
   if (!@splitLoc && $splitAnywhere) {
      # remember any break point due to desperation split at available length
      @chars = split //, $word;
      my $trial = '';
      for ($i=0; $i<scalar(@chars); $i++) {
	  $trial .= $chars[$i];
	  if ($self->advancewidth("$trial$hyphen") > $width) { last; }
      }
      # nothing fit? force one letter, even though it overflows
      if ($i == 0) { $i = 1; }
      push @splitLoc, $i-1;
      # disable minimum prefix and suffix for this
      $minBegin = $minEnd = 1;
   }

    # sort final @splitLoc, remove any split points violating "min" settings
    # set $leftWord and $rightWord if find successful split
    # TBD consider hierarchy of desirable splits, rather than equal weight
    if (@splitLoc) {
        @splitLoc = sort { $a <=> $b } @splitLoc;
	# unnecessary to have unique values
        $len = length($word);
	$j = -1;
        for ($i=0; $i<scalar(@splitLoc); $i++) {
            if ($splitLoc[$i] >= $minBegin-1) { last; }
	    $j = $i;	
        }
	if ($j >= 0) { splice(@splitLoc, 0, $j+1); } # remove j+1 els
	$j = -1;
        for ($i=$#splitLoc; $i>=0; $i--) {
            if ($splitLoc[$i] < $len-$minEnd) { last; }
	    $j = $i;	
        }
	if ($j >= 0) { splice(@splitLoc, $j); } # remove els >= j-th

        # scan R to L through @splitLoc to try splitting there
	# TBD estimate starting position in @splitLoc by dividing $width by
	# 1em to get approximate split location; pick highest @splitLoc
	# element that does not exceed it, and move right (probably) or left
	# to get proper split point.
	while (@splitLoc) {
	    $j = pop @splitLoc;  # proposed split rightmost on list
	    my $trial = substr($word, 0, $j+1);
	    # this is the left fragment at the end of the line. make sure
	    # there is room for the space before it, the hyphen (if added), 
	    # and any letter doubling (e.g., in German or Dutch)

	    # does the left fragment already end in -, etc.?
	    # if it does, don't add a $hyphen. 
	    my $h = $hyphen;
	    $i = ord(substr($trial, -1, 1)); # last character in left fragment
	    foreach (@suppressHyphen) {
		if ($i == $_) { $h = ''; last; }
	    }
	    # left fragment ends in a space (used to be an NBSP)?
	    # remove space, and no hyphen
	    if ($i eq ' ') {
		chop($trial);
		$h = '';
	    }

	    # $width should already count the trailing space in the existing
	    # line, or full width if empty
	    $len = $self->advancewidth("$trial$h", %opts);
	    if ($len > $width) { next; }

	    # TBD any letter doubling needed?
	    $leftWord = $trial.$h;
	    $rightWord = substr($word, $j+1); 
	    last;
	} # while splitLoc has content
	# if fell through because no fragment was short enough, $leftWord and
	# $rightWord were never reassigned, and effect is to leave the entire
	# word for the next line.
    }
    # if 0 elements in @splitLoc, $leftWord and $rightWord already defaulted

    return ($leftWord, $rightWord);
}

# remove soft hyphens (SHYs) from a word. assume is always #173 (good for
# Latin-1, CP-1252, UTF-8; might not work for some encodings)  TBD might want
# to pass in current encoding, or what SHY value is.
# return list of break points where SHYs were removed
sub _removeSHY {
    my ($word) = @_;

    my @SHYs = ();
    my $i = 0;

    my @chars = split //, $word;
    my $out = '';
    foreach (@chars) {
        if (ord($_) == 173) {
	    # it's a SHY, so remove from word, add to list
	    push @SHYs, ($i - 1);
	    next;
	}
	$out .= $_;
	$i++;
    }
    return ($out, @SHYs);
}

1;