File: nobr_forbidden.php

package info (click to toggle)
tuxpaint 1%3A0.9.34-2
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 61,572 kB
  • sloc: ansic: 60,855; makefile: 1,397; sh: 790; objc: 303; cpp: 186; python: 182; php: 43
file content (113 lines) | stat: -rwxr-xr-x 3,073 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/php
<?php
/* nobr_forbidden.php

   A script to encase characters that are forbidden from
   appearing at the beginning of a line (e.g., the
   "。" full-stop), along with the previous character, inside
   a "<nobr>...</nobr>", to prevent `w3m`'s word-wrapping
   routine from doing that.

   Bill Kendrick
   2023-07-17 - 2023-07-17
*/

/* See https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages */

/* Characters that must not appear at the beginning of a line.
   The groups are joined in the order listed.  The combined string is
   interpolated into a regular-expression character class by
   replace_forbidden(), which is why "\)" and "\." carry a backslash. */
$forbidden_start = implode("", [
  /* Closing brackets (ignoring ' " ]) */
  ")\)}〕〉》」』】〙〗〟⦆»",

  /* Japanese characters: chiisai kana and special marks */
  "ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻",

  /* Hyphens */
  "‐゠–〜",

  /* Delimiters */
  "?!‼⁇⁈⁉",

  /* Mid-sentence punctuation */
  "・、:;,",

  /* Sentence-ending punctuation */
  "。\.",
]);

/* Characters that must not appear at the end of a line:
   opening brackets (ignoring ' " [) */
$forbidden_end = implode("", [
  "(\({〔〈《「『【〘〖〝⦅«",
]);


/* FIXME: Would be better to use DOMDocument() and modify the
   text in the nodeValues, but the tuxpaint-docs HTML is
   not currently XHTML compliant ;-( -bjk 2023.07.17

   Something like this:

     $dom = new DOMDocument();
     libxml_use_internal_errors(false);
     $dom->loadHTMLFile("php://stdin");
     
     $p = $dom->getElementsByTagName('p');
     foreach ($p as $pnode) {
       $nodeValue = $pnode->nodeValue;
     
       $nodeValue = preg_replace("/(.。)/", "<nobr>\\1<\/nobr>", $nodeValue);
       $newNode = $dom->createElement("p", $nodeValue);
       $pnode->parentNode->replaceChild($newNode, $pnode);
     }
     
     echo $dom->saveHTML();

   Instead, we just read the HTML file as a big text stream and
   do our best to only modify things that are not within the
   HTML tags (esp. the <img> tags' "alt" attributes (aka "alt tags")).
*/

//setlocale(LC_ALL, "ja_JP.UTF-8");

/* Main filter: copy standard input to standard output, passing every
   run of text found outside of HTML tags through replace_forbidden().
   Everything between a "<" and the matching ">" is copied verbatim.

   The line is scanned byte by byte.  That is safe for UTF-8 input
   because "<" and ">" are single ASCII bytes that never occur inside a
   multi-byte sequence; the text runs are reassembled from whole bytes
   before being handed to replace_forbidden(). */

$fi = fopen("php://stdin", "r");

if ($fi === false) {
  file_put_contents("php://stderr", "nobr_forbidden.php: cannot open standard input\n");
  exit(1);
}

/* True while we are between a "<" and its ">".  Deliberately kept
   outside the loop so that a tag spanning several lines stays open. */
$in_tag = false;

/* Loop on the result of fgets() rather than on feof(): fgets() only
   returns false once there is nothing left to read, so a final line
   that lacks a trailing newline is still processed and echoed. */
while (($line = fgets($fi)) !== false) {
  $newLine = "";
  $text = "";
  $len = strlen($line);

  for ($i = 0; $i < $len; $i++) {
    $c = $line[$i];

    if ($c === "<") {
      /* A tag starts: flush the text collected so far, then the "<". */
      $in_tag = true;
      $newLine .= replace_forbidden($text) . $c;
      $text = "";
    } else if ($c === ">") {
      if ($in_tag) {
        /* End of the tag.  $text is always empty while inside a tag. */
        $in_tag = false;
        $newLine .= $c;
      } else {
        /* A literal ">" in running text: keep it (and the text before
           it) instead of throwing the pending text away. */
        $text .= $c;
      }
    } else if ($in_tag) {
      /* Tag contents (names, attributes) are never modified. */
      $newLine .= $c;
    } else {
      $text .= $c;
    }
  }

  /* Text runs are processed per line; flush whatever is left. */
  $newLine .= replace_forbidden($text);

  echo $newLine;
}

fclose($fi);

/* Wrap sequences that must not be split across lines in
   "<nobr>...</nobr>".

   Three passes are applied, in this order, to the given text:
     1. opening bracket(s) + one Japanese character + forbidden-start
        character(s)
     2. one Japanese character + forbidden-start character(s)
     3. opening bracket(s) + one Japanese character

   "Japanese character" means Katakana, Hiragana or Han script.  The
   bracket and forbidden-start sets come from the global strings
   $forbidden_end and $forbidden_start, which are interpolated into
   regex character classes as-is (so they must already be escaped for
   that context).

   Note that pass 2 also matches inside the output of pass 1, so a
   sequence matched by pass 1 ends up with a nested inner <nobr> pair.

   $str     Text found outside of any HTML tag.
   Returns  The text with <nobr> wrappers inserted.  If a pass fails
            (preg_replace() returns null, e.g. because the text is not
            valid UTF-8), the text as it was before that pass is
            returned instead of being lost. */
function replace_forbidden($str) {
  global $forbidden_start, $forbidden_end;

  $japanese = "\p{Katakana}\p{Hiragana}\p{Han}";

  $patterns = [
    "/([{$forbidden_end}]+[{$japanese}][{$forbidden_start}]+)/u",
    "/([{$japanese}][{$forbidden_start}]+)/u",
    "/([{$forbidden_end}]+[{$japanese}])/u",
  ];

  foreach ($patterns as $pattern) {
    $replaced = preg_replace($pattern, "<nobr>\\1</nobr>", $str);

    if ($replaced === null) {
      /* PCRE error (typically malformed UTF-8 with the "u" modifier):
         keep the last good text rather than emitting nothing. */
      return $str;
    }

    $str = $replaced;
  }

  return $str;
}