File: build-docs.sh

#!/usr/bin/env bash
#
# Convert all the Markdown files in doc/ to HTML.
#
# This requires `kramdown` to be available on the path.  `tidy`, `linkchecker`
# (the Python package), and `checklink` (from w3c-linkchecker on Debian) are
# used to test the output and must also be on the path.  `sqlite3` must also
# be available.
#
# Run this from the root directory of the repository.
#
# The output directory can be specified as the first argument.
#
# If the environment variable DISABLE_HTML_CHECKS is set, then all HTML and
# link checks are skipped.
#
# If the environment variable LINK_CACHE_FILE is specified, then that file is
# used as a cache of already-valid links that will not be checked.
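#
# Example invocations (illustrative; run from the root of the repository):
#
#   ./build-docs.sh                   # output goes to the default doc/html/
#   ./build-docs.sh /tmp/mlpack-docs/ # output goes to /tmp/mlpack-docs/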

if [ "$#" -gt 1 ]; then
  echo "Usage: $0 [output_dir/]";
  exit 1;
elif [ "$#" -eq 1 ]; then
  output_dir=$1;
else
  output_dir=doc/html;
fi

# If the header and footer already exist, they will not be overwritten.
template_html_header="${output_dir}/template.html.header";
template_html_footer="${output_dir}/template.html.footer";
template_html_sidebar="${output_dir}/template.html.sidebar";

if ! command -v kramdown &>/dev/null
then
  echo "kramdown not installed!  Cannot build documentation.";
  exit 1;
fi

# If DISABLE_HTML_CHECKS is set, then we won't use tidy, checklink,
# linkchecker, or sqlite3.
if [ -z ${DISABLE_HTML_CHECKS+x} ];
then
  if ! command -v tidy &>/dev/null
  then
    echo "tidy not installed!  Cannot check documentation.";
    exit 1;
  fi

  if ! command -v checklink &>/dev/null
  then
    echo "checklink not installed!  Cannot check documentation.";
    exit 1;
  fi

  if ! command -v linkchecker &>/dev/null
  then
    echo "linkchecker not installed!  Cannot check documentation.";
    exit 1;
  fi

  if ! command -v sqlite3 &> /dev/null
  then
    echo "sqlite3 not installed!  Cannot check documentation.";
    exit 1;
  fi
fi

if [ ! -d doc/ ];
then
  echo "Run this script from the root of the mlpack repository.";
  exit 1;
fi

# Define a utility function to run kramdown and turn an .md file into an
# .html file.
run_kramdown()
{
  input_file=$1;
  # This converts, e.g., ./doc/user/index.md -> doc/html/user/index.html.
  tmp=${input_file#./doc/}; # Strip leading ./doc/.
  output_file="$output_dir/${tmp%.md}.html";

  # Determine what the link root is.  If we're in the root directory, it's
  # empty; otherwise it's one or more '../'s.
  dir_name=$(dirname $tmp);
  link_root="";
  if [[ "$dir_name" != "." ]];
  then
    levels_below_root=`echo $dir_name | awk -F'/' '{ print NF }'`;
    link_root=$(printf '../%.0s' `seq 1 $levels_below_root`);
  fi
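
  # For example, for the hypothetical input ./doc/user/methods/foo.md,
  # dir_name is "user/methods", so levels_below_root is 2 and link_root
  # becomes "../../".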

  # Make the enclosing directory if needed.
  out_dir=`dirname "$output_file"`;
  mkdir -p "$out_dir";

  # Kramdown doesn't detect languages correctly with the "```" fence; instead it
  # needs the "~~~" fence.
  sed 's/^```/~~~/' $input_file > $input_file.tmp;

  # Our documentation is full of relative links, like
  # [name](other_file.md#anchor).  We need these to turn into links to the
  # rendered HTML file, like [name](other_file.html#anchor).  We'll do this with
  # regular expressions...
  #
  # - Note that this assumes there are no spaces in any filenames.
  # - We also only catch the second part of the link '](' because the name of
  #   the link could be spread on multiple lines.
  #
  # We start by trying to catch the special cases README.md and HISTORY.md,
  # which our documentation puts in a slightly different place.  In addition,
  # because those files are being moved to the root of the documentation, we
  # must adjust links differently.
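  #
  # For example, the hypothetical link '](matrices.md#representing-data)' in a
  # regular documentation file becomes '](matrices.html#representing-data)'.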
  if [[ $input_file != "README.md" ]] && [[ $input_file != "HISTORY.md" ]];
  then
    sed -i "s|\]([./]*README.md)|](${link_root}README.html)|g" $input_file.tmp;
    sed -i "s|\]([./]*README.md#[0-9]-\([^ ]*\))|](${link_root}README.html#\1)|g" $input_file.tmp;
    sed -i 's/\](\([^ ]*\).md)/](\1.html)/g' $input_file.tmp;
    sed -i 's/\](\([^ ]*\).md#\([^ ]*\))/](\1.html#\2)/g' $input_file.tmp;
  else
    sed -i 's/\](doc\/\([^ ]*\).md)/](\1.html)/g' $input_file.tmp;
    sed -i 's/\](doc\/\([^ ]*\).md#\([^ ]*\))/](\1.html#\2)/g' $input_file.tmp;

    # The README specifically has a link to GOVERNANCE.md, and we want to
    # preserve that link.  Since we don't build that file into HTML, point the
    # link at the copy on Github instead.
    sed -i 's|(./GOVERNANCE.md)|(https://github.com/mlpack/mlpack/blob/master/GOVERNANCE.md)|' $input_file.tmp;

    # Ugh!  Github's naming of anchors is different from kramdown's, so we
    # have to adjust all the table-of-contents anchor links in the README (and
    # in that file only).
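    # For example, the hypothetical link '](#1-getting-started)' becomes
    # '](#getting-started)'.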
    sed -i 's/\](#[0-9][0-9]-\([^ ]*\))/](#\1)/g' $input_file.tmp;
    sed -i 's/\](#[0-9]-\([^ ]*\))/](#\1)/g' $input_file.tmp;
    sed -i 's/\](#[0-9][0-9]\([^ ]*\))/](#\1)/g' $input_file.tmp;
    sed -i 's/\](#[0-9]\([^ ]*\))/](#\1)/g' $input_file.tmp;

    # For HISTORY.md, we want to turn all references to Github issues into
    # actual links, and all references to Github usernames into links to their
    # profile.
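    # For example, '#1234' becomes
    # '[#1234](https://github.com/mlpack/mlpack/issues/1234)', and the
    # hypothetical ' @someuser' becomes ' [@someuser](https://github.com/someuser)'.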
    if [[ $input_file == "HISTORY.md" ]];
    then
      sed -i 's/#\([0-9][0-9]*\)/[#\1](https:\/\/github.com\/mlpack\/mlpack\/issues\/\1)/g' $input_file.tmp;
      sed -i 's/\([^`]\)@\([a-zA-Z0-9_-][a-zA-Z0-9_-]*\)/\1[@\2](https:\/\/github.com\/\2)/g' $input_file.tmp;
    fi
  fi

  # Replace any links to source files with a link to the current version of the
  # source file on Github.
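  # For example, the hypothetical link '](/src/mlpack/core.hpp)' becomes
  # '](https://github.com/mlpack/mlpack/blob/master/src/mlpack/core.hpp)'.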
  sed -i 's/\](\/src\/\([^ ]*\)\.hpp)/](https:\/\/github.com\/mlpack\/mlpack\/blob\/master\/src\/\1.hpp)/' $input_file.tmp;

  # If this is binding documentation or quickstart documentation, don't set the
  # default language to C++.
  set_lang=1;
  if [[ `dirname $input_file` == "./doc/user/bindings" ]];
  then
    set_lang=0;
  elif [[ `dirname $input_file` == "./doc/quickstart" ]];
  then
    if [[ `basename $input_file .md` != "cpp" ]];
    then
      set_lang=0;
    fi
  elif [[ $input_file == "HISTORY.md" ]];
  then
    set_lang=0;
  fi

  if [[ "$set_lang" == "0" ]];
  then
    kramdown \
        -x parser-gfm \
        --syntax-highlighter rouge \
        --auto_ids \
        $input_file.tmp > "$output_file.tmp" || exit 1;
  else
    kramdown \
        -x parser-gfm \
        --syntax-highlighter rouge \
        --syntax-highlighter-opts '{ default_lang: c++ }' \
        --auto_ids \
        $input_file.tmp > "$output_file.tmp" || exit 1;
  fi
  cat "$template_html_header" | sed "s|LINKROOT|$link_root|" > "$output_file";

  # Create the sidebar.  Extract anchors from the page, unless we are looking at
  # index.md, since the permanent part of the sidebar links all over index.md
  # anyway.  If we are looking at binding documentation, use a slightly
  # different sidebar.
  if { [[ $dir_name != "user/bindings" ]] && \
       [[ $dir_name != "quickstart" ]]; } ||
     [[ $input_file == "./doc/quickstart/cpp.md" ]];
  then
    cat "$template_html_sidebar" | sed "s|LINKROOT|$link_root|" \
        >> "$output_file";
    create_page_sidebar_section "$output_file.tmp" "$output_file" "$dir_name";
  else
    echo "Using custom sidebar...";
    cat "$template_html_sidebar" | sed "s|LINKROOT|$link_root|" |\
        sed 's|<details> <!-- default closed for non-binding pages -->|<details open="true">|' |\
        sed 's|<details open="true"> <!-- default open for non-binding pages -->|<details>|' \
        >> "$output_file";
    # Some pages may have a custom sidebar HTML file.  (Specifically,
    # generated language bindings.)
    if [[ $dir_name == "user/bindings" ]];
    then
      cat "${input_file/%.md/.sidebar.html}" | sed "s|LINKROOT|$link_root|" \
          >> "$output_file";
    else
      sidebar_file=`basename $input_file .md`.sidebar.html;
      cat "./doc/user/bindings/$sidebar_file" | sed "s|LINKROOT|$link_root|" \
          >> "$output_file";
    fi
  fi

  # Add clickable anchors to h2 and h3 headers.
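  # For example, the hypothetical '<h2 id="intro">Intro</h2>' becomes
  # '<h2 id="intro"><a href="#intro" class="pl">🔗</a> Intro</h2>'.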
  echo "<div id=\"content\">" >> "$output_file";
  sed -E 's/<h([23]) id="([^"]*)">/<h\1 id="\2"><a href="#\2" class="pl">🔗<\/a> /' "$output_file.tmp" >> "$output_file";

  # Simple postprocessing to make tidy a little happier.
  # (Muting the warning won't change the error code!)
  sed -i 's/<table>/<table summary="">/' "$output_file";

  cat "$template_html_footer" >> "$output_file";
  rm -f $input_file.tmp "$output_file.tmp";
}

# Create the template header file.
create_template_header()
{
  output_file="$1";

  # Note that LINKROOT will be substituted into place by run_kramdown.
  cat > "$output_file" << EOF
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <link type="text/css" rel="stylesheet" href="LINKROOTgfm-mod.css">
  <link type="text/css" rel="stylesheet" href="LINKROOTrouge-github-mod.css">
  <title>mlpack documentation</title>
</head>
<body>
EOF
}

# Create the template footer.
create_template_footer()
{
  output_file="$1";

  cat > "$output_file" << EOF
</div>
</body>
</html>
EOF
}

# Extract anchors to build a sidebar.
# This takes the input HTML (before anchor elements are added) and appends a
# sidebar list to the output HTML.
create_page_sidebar_section()
{
  sb_input_file="$1";
  sb_output_file="$2";
  sb_dir_name="$3"; # The directory containing the documentation.
  sb_input_file_base=`basename "$sb_input_file" .html.tmp`;

  # Extract h2/h3 anchors into a list.  For individual method documentation,
  # we only extract h3 anchors, because those pages use an h2 as the page
  # title.  And, for core.md and the core class documentation, we want to
  # extract both h2 and h3 anchors.
  if [[ "$sb_dir_name" == "user/methods" ]];
  then
    # The page title on individual methods is encoded as an h2.
    page_title=`grep '<h2 id=' "$sb_input_file" |\
        head -1 |\
        sed 's/^<h2 id="[^"]*">\(.*\)<\/h2>/\1/'`;

    # Each h3 on the page becomes one sidebar entry; e.g. the hypothetical
    # '<h3 id="usage">Usage</h3>' becomes '<li><a href="#usage">Usage</a></li>'.
    grep '<h3 id=' "$sb_input_file" | sed 's/<h3 id="\([^"]*\)">\(.*\)<\/h3>/<li><a href="#\1">\2<\/a><\/li>/' > "$sb_output_file.side.tmp";
  elif [[ "$sb_input_file_base" == "core" || "$sb_dir_name" == "user/core" || "$sb_dir_name" == "user/core/trees" ]];
  then
    # The page title on the core class documentation page is encoded as an h1.
    page_title=`grep '<h1 id=' "$sb_input_file" |\
        head -1 |\
        sed 's/^<h1 id="[^"]*">\(.*\)<\/h1>/\1/'`;

    # We want to collect both h2 and h3 anchors; each h2 should get a
    # summary/details block containing its h3 children.  This is a little
    # tedious to create... we'll do it by creating a temporary tab-separated
    # file with lines like
    #
    # h2  anchor_name   Anchor Title
    # h3  anchor_name   Anchor Title
    # ...
    #
    # and then we'll construct the actual sidebar using that list.
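    #
    # For example, the hypothetical heading '<h2 id="matrices">Matrices</h2>'
    # becomes the list line 'h2<TAB>matrices<TAB>Matrices'.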
    grep '<h[23] id=' "$sb_input_file" |\
        sed 's/^<\(h[23]\) id="\([^"]*\)">\(.*\)<\/h[23]>/\1\t\2\t\3/' \
        > "$sb_output_file.side.list.tmp";
    in_block=0;
    # (IFS= and -r preserve tabs and backslashes while reading each line.)
    while IFS= read -r line; do
      # First, extract the pieces of each line.
      line_type=`echo "$line"    | awk -F'\t' '{ print $1 }'`;
      anchor_name=`echo "$line"  | awk -F'\t' '{ print $2 }'`;
      anchor_title=`echo "$line" | awk -F'\t' '{ print $3 }'`;

      # For an h2, we have to print a summary block, if the h2 has any children.
      # (Below is a hacky way to detect that.)
      h3_lines=`grep -A 1 "$line" "$sb_output_file.side.list.tmp" |\
                tail -1 |\
                grep 'h3' |\
                wc -l`;

      # Note that this assumes that *all* h2s have h3 children.  If that's not
      # true, some extra processing will be needed.
      if [ "$line_type" = "h2" ]
      then
        # Close a block if necessary.
        if [ "$in_block" = "1" ];
        then
          # We have to close the previous block.
          echo "</ul></details></li>" >> "$sb_output_file.side.tmp";
          in_block=0;
        fi

        # Create the new details block, if the h2 has children.
        if [ "$h3_lines" -gt 0 ];
        then
          echo "<li><details><summary>" >> "$sb_output_file.side.tmp";
          echo "<a href=\"#$anchor_name\">" >> "$sb_output_file.side.tmp";
          echo "$anchor_title" >> "$sb_output_file.side.tmp";
          echo "</a>" >> "$sb_output_file.side.tmp";
          echo "</summary>" >> "$sb_output_file.side.tmp";
          echo "<ul>" >> "$sb_output_file.side.tmp";
          in_block=1;
        else
          echo "  <li><a href=\"#$anchor_name\">" >> "$sb_output_file.side.tmp";
          echo "  $anchor_title" >> "$sb_output_file.side.tmp";
          echo "  </a></li>" >> "$sb_output_file.side.tmp";
        fi
      else
        echo "  <li><a href=\"#$anchor_name\">" >> "$sb_output_file.side.tmp";
        echo "  $anchor_title" >> "$sb_output_file.side.tmp";
        echo "  </a></li>" >> "$sb_output_file.side.tmp";
      fi
    done < "$sb_output_file.side.list.tmp";

    # Close the last h2 block, if we need to.
    if [ "$in_block" = "1" ];
    then
      echo "</ul></details></li>" >> "$sb_output_file.side.tmp";
    fi
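
    # The loop above emits sidebar entries shaped like this (hypothetical
    # anchors):
    #
    #   <li><details><summary>
    #   <a href="#matrices">
    #   Matrices
    #   </a>
    #   </summary>
    #   <ul>
    #     <li><a href="#loading">
    #     Loading
    #     </a></li>
    #   </ul></details></li>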

    rm -f "$sb_output_file.side.list.tmp";
  else
    # On other pages, the page title is encoded as an h1.
    page_title=`grep '<h1 id=' "$sb_input_file" |\
        head -1 |\
        sed 's/^<h1 id="[^"]*">\(.*\)<\/h1>/\1/'`;

    grep '<h2 id=' "$sb_input_file" |\
        sed 's/<h2 id="\([^"]*\)">\(.*\)<\/h2>/<li><a href="#\1">\2<\/a><\/li>/' \
        > "$sb_output_file.side.tmp";
  fi
  lines=`cat "$sb_output_file.side.tmp" | wc -l`;

  echo "<ul>" >> "$sb_output_file";

  # Make the top of the sidebar.
  if [ -n "$page_title" ];
  then
    echo "<li class=\"page_title\"><b>$page_title</b> <a href=\"#\">[top]</a>" >> "$sb_output_file";
  else
    echo "<li><a href=\"#\">[top of page]</a>" >> "$sb_output_file";
  fi

  if [[ "$lines" -gt 0 ]];
  then
    echo "<ul>" >> "$sb_output_file";
    cat "$sb_output_file.side.tmp" >> "$sb_output_file";
    echo "</ul>" >> "$sb_output_file";
  fi
  echo "</li>" >> "$sb_output_file";
  echo "</ul>" >> "$sb_output_file";
  echo "</div>" >> "$sb_output_file";

  rm -f "$sb_output_file.side.tmp";
}

# Save any existing template.
if [ -f "$template_html_header" ];
then
  mv "$template_html_header" template.html.header.tmp;
fi

if [ -f "$template_html_footer" ];
then
  mv "$template_html_footer" template.html.footer.tmp;
fi

rm -rf "$output_dir";
mkdir -p "$output_dir";
cp -v doc/css/* "$output_dir";
mkdir -p "$output_dir/img/";
cp -v doc/img/* "$output_dir/img/";
mkdir -p "$output_dir/tutorials/res/";
cp -v doc/tutorials/res/* "$output_dir/tutorials/res/";

# Create the template files we will use, if they don't already exist.
if [ -f template.html.header.tmp ];
then
  mv template.html.header.tmp "$template_html_header";
else
  create_template_header "$template_html_header";
  del_header=1;
fi

if [ -f template.html.footer.tmp ];
then
  mv template.html.footer.tmp "$template_html_footer";
else
  create_template_footer "$template_html_footer";
  del_footer=1;
fi

cp doc/sidebar.html "$template_html_sidebar";

# Process all the .md files.
for f in README.md HISTORY.md `find ./doc/ -iname '*.md'`;
do
  # Skip the JOSS paper...
  if [[ $f == *"joss_paper"* ]]; then
    continue;
  fi

  echo "Processing $f...";
  run_kramdown $f;

  # This converts, e.g., ./doc/user/index.md -> doc/html/user/index.html.
  tmp=${f#./doc/}; # Strip leading ./doc/.
  of="$output_dir/${tmp%.md}.html";

  # Check HTML, if we did not disable that.
  if [ -z ${DISABLE_HTML_CHECKS+x} ];
  then
    tidy -qe "$of" || exit 1;
  fi
done

# Now take a second pass to check all local links.
if [ -z ${DISABLE_HTML_CHECKS+x} ];
then
  find "$output_dir" -iname '*.html' -print0 | while read -d $'\0' f
  do
    echo "Checking local links and anchors in $f...";

    # To run checklink we have to strip out some perl stderr warnings...
    checklink -qs \
        --follow-file-links \
        --suppress-broken 405 \
        --suppress-broken 503 \
        --suppress-broken 301 \
        --suppress-broken 400 \
        -X "^http.*$" "$f" 2>&1 |
        grep -v 'Use of uninitialized value' > checklink_out;
    if [ -s checklink_out ];
    then
      # Store up all failures to print them at once.
      cat checklink_out >> overall_checklink_out;
    fi
    rm -f checklink_out;
  done

  # Check to see if there were any failures, all at once.
  if [ -f overall_checklink_out ];
  then
    cat overall_checklink_out;
    rm -f overall_checklink_out;
    exit 1;
  fi

fi

# Utility script to create linkchecker result SQL table, with a bit of extra
# information.
cat > create.sql << EOF
create table linksdb (
    urlname        varchar(256) not null,
    parentname     varchar(256),
    baseref        varchar(256),
    valid          int,
    result         varchar(256),
    warning        varchar(512),
    info           varchar(512),
    url            varchar(256),
    line           int,
    col            int,
    name           varchar(256),
    checktime      int,
    dltime         int,
    size           int,
    cached         int,
    level          int not null,
    modified       int,
    resulttime     timestamp,
    validdays      int
);
EOF

# Finally, take a third pass to check external links.
if [ -z ${DISABLE_HTML_CHECKS+x} ];
then
  # Create a basic config file for linkchecker.  We will append URL patterns to
  # ignore to it as we go.
  echo "[checking]" > "$output_dir/linkcheckerrc.in";
  echo "maxrequestspersecond=2" >> "$output_dir/linkcheckerrc.in";
  echo "" >> "$output_dir/linkcheckerrc.in";
  echo "[filtering]" >> "$output_dir/linkcheckerrc.in";
  echo "ignore=" >> "$output_dir/linkcheckerrc.in";
  echo "  ^(?!http).*$" >> "$output_dir/linkcheckerrc.in";
  # Github issues/pull requests redirect to each other, and we link to so many
  # of them that it's not worth checking them.
  echo "  ^https://github.com/mlpack/mlpack/issues/[0-9]*$" >> "$output_dir/linkcheckerrc.in";
  echo "  ^https://github.com/mlpack/mlpack/issues[?]q.*$" >> "$output_dir/linkcheckerrc.in";
  echo "  ^https://github.com/mlpack/mlpack/pulls[?]q.*$" >> "$output_dir/linkcheckerrc.in";

  # Initialize our cache or take the current version of it.
  if [ ! -z ${LINK_CACHE_FILE+x} ];
  then
    if [ -f ${LINK_CACHE_FILE} ];
    then
      cp "$LINK_CACHE_FILE" "$output_dir/all_links.db";
    else
      rm -f "$output_dir/all_links.db";
      cat create.sql | sqlite3 "$output_dir/all_links.db";
    fi
  else
    rm -f "$output_dir/all_links.db";
    cat create.sql | sqlite3 "$output_dir/all_links.db";
  fi

  find "$output_dir" -iname '*.html' -print0 | while read -d $'\0' f
  do
    echo "Checking external links in $f...";

    # Generate our config file for this file by appending all valid links that
    # we have already seen.  Note that we have to append $ to all the ignore
    # patterns so that we don't accidentally match anchors that haven't been
    # checked yet.
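    # For example, the hypothetical cached URL 'https://example.com/a?b=1'
    # becomes the config line '  https://example.com/a\?b=1$'.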
    cp "$output_dir/linkcheckerrc.in" "$output_dir/linkcheckerrc";
    echo "SELECT DISTINCT urlname FROM linksdb
          WHERE valid = 1 AND
                urlname LIKE 'http%' AND
                julianday(datetime()) - julianday(resulttime) < validdays AND
                (result LIKE '200%' OR
                 result = 'filtered' OR
                 result = 'syntax OK');" | sqlite3 "$output_dir/all_links.db" |\
        sed 's/^/  /' |\
        sed 's/?/\\?/g' |\
        sed 's/$/$/' >> "$output_dir/linkcheckerrc";

    # Run linkchecker, and make things a little bit prettier if there are
    # failures.
    rm -f links.sql;
    linkchecker --check-extern \
        --recursion-level=1 \
        --threads=4 \
        --verbose \
        --no-status \
        --output=failures \
        --file-output=sql/ascii/links.sql \
        --config="$output_dir/linkcheckerrc" \
        $f |\
        awk -F"', '" '{ print $2; }' |\
        sed 's/'"'"')"$//' |\
        sed 's/^/Failed: /' |\
        sed 's/$/; will try again at the end of the run./';

    # Print the number of links we checked and the number that were filtered
    # because they were already cached.
    total_links=`cat links.sql | grep -v '^--' | grep 'http' | wc -l`;
    filtered_links=`grep 'filtered' links.sql | grep -v '^--' | grep 'http' |\
        wc -l`;
    echo "  $filtered_links of $total_links external links were cached.";

    # Insert results into the database.  We have to insert the timestamp and the
    # number of days the result is valid for.  For that, we use a random number
    # of days, because we don't want *all* of our results to expire on the same
    # CI run and have it take forever.
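    # The seds below rewrite each INSERT statement emitted by linkchecker;
    # e.g. '..., modified) values (..., 0);' becomes
    # '..., modified,resulttime,validdays) values (..., 0, current_timestamp,
    # random() % 10 + 25);'.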
    cat links.sql |\
        sed 's/modified) values (/modified,resulttime,validdays) values (/' |\
        sed "s/);$/, current_timestamp, random() % 10 + 25);/" |\
        sqlite3 "$output_dir/all_links.db";

    # Print any warnings too, because we will try them again later.
    cat create.sql | sqlite3 tmp.db;
    cat links.sql |\
        sed 's/modified) values (/modified,resulttime,validdays) values (/' |\
        sed "s/);$/, current_timestamp, random() % 10 + 25);/" |\
        sqlite3 tmp.db;
    echo "SELECT DISTINCT urlname, warning FROM linksdb
          WHERE valid = 1 AND
                warning IS NOT NULL AND
                (result NOT LIKE '200%' AND
                 warning NOT LIKE '%307 Temporary Redirect%' AND
                 result <> 'filtered' AND
                 result <> 'syntax OK');" |\
        sqlite3 tmp.db |\
        awk -F'|' '{ print "Warning: "$1": "$2"; will try again at the end of the run."; }';
    rm -f tmp.db;
  done

  # Second chance on errors and warnings: filter out any spurious failures.
  echo "SELECT DISTINCT urlname FROM linksdb
        WHERE valid = 0 OR
              (warning IS NOT NULL AND
               warning NOT LIKE '%307 Temporary Redirect%') OR
              (result NOT LIKE '200%' AND
               result <> 'filtered' AND
               result <> 'syntax OK');" | sqlite3 "$output_dir/all_links.db" >\
      links_to_check.txt;
  num_links=`cat links_to_check.txt | wc -l`;
  if [ $num_links -gt 0 ];
  then
    echo "Second check for the following URLs that failed the first time:";
    cat links_to_check.txt | sed 's/^/  /';

    # Slow down the process to try to recover any links that got rate-limited.
    cat "$output_dir/linkcheckerrc.in" |\
        sed 's/maxrequestspersecond=.*$/maxrequestspersecond=1/' >\
        "$output_dir/linkcheckerrc";

    linkchecker --check-extern \
        --recursion-level=0 \
        --threads=1 \
        --file-output=sql/ascii/links_failed.sql \
        --no-status \
        --verbose \
        --config="$output_dir/linkcheckerrc" \
        `cat links_to_check.txt | tr '\n' ' '`;

    cat create.sql | sqlite3 tmp.db;
    cat links_failed.sql |\
        sed 's/modified) values (/modified,resulttime,validdays) values (/' |\
        sed "s/);$/, current_timestamp, random() % 10 + 25);/" |\
        sqlite3 tmp.db;
    echo "SELECT DISTINCT urlname, result FROM linksdb
          WHERE valid = 0" | sqlite3 tmp.db |\
        awk -F'|' '{ print "  "$1": "$2; }' > links_failed.txt;
    echo "SELECT DISTINCT urlname, warning FROM linksdb
          WHERE valid = 1 AND warning IS NOT NULL" | sqlite3 tmp.db |\
        awk -F'|' '{ print "  "$1": "$2; }' > links_warned.txt;

    # Also add the second pass results to the global cache.
    cat links_failed.sql |\
        sed 's/modified) values (/modified,resulttime,validdays) values (/' |\
        sed "s/);$/, current_timestamp, random() % 10 + 25);/" |\
        sqlite3 "$output_dir/all_links.db";

    total_links_failed=`cat links_failed.txt links_warned.txt | wc -l`;
    if [ $total_links_failed -gt 0 ];
    then
      echo "The following links have failed:";

      cat links_failed.txt links_warned.txt;
      rm -f links_failed.sql tmp.db links_failed.txt links_warned.txt;
      exitcode=1;
    else
      exitcode=0;
    fi

    rm -f tmp.db links_failed.sql;
  else
    exitcode=0;
  fi

  rm -f links_to_check.txt;

  # Add to the global cache.
  if [ ! -z ${LINK_CACHE_FILE+x} ];
  then
    mv "$output_dir/all_links.db" "${LINK_CACHE_FILE}";
    echo "DELETE FROM linksdb
          WHERE valid = 0 OR
                warning IS NOT NULL OR
                julianday(datetime()) - julianday(resulttime) >= validdays;" |\
        sqlite3 "${LINK_CACHE_FILE}";

    # Keep only the most recent entry for a given urlname, to keep the size of
    # the cache as small as possible.
    echo "CREATE TABLE tmp_linksdb AS SELECT * FROM linksdb
          GROUP BY urlname HAVING MAX(resulttime) ORDER BY urlname;" |\
        sqlite3 "${LINK_CACHE_FILE}";
    echo "DROP TABLE linksdb;" | sqlite3 "${LINK_CACHE_FILE}";
    echo "ALTER TABLE tmp_linksdb RENAME TO linksdb;" | sqlite3 "${LINK_CACHE_FILE}";
  fi

  # Pick all the links whose cache entries are within a week of expiring and
  # check them again, to see if we can "refresh" them.  This is intended to
  # handle flaky URLs that don't always work: each one gets retried a handful
  # of times during the week before its cache entry expires, in the hope that
  # at least one of those retries succeeds, preventing a documentation job
  # from failing due to a single bad link.
  echo "SELECT DISTINCT urlname FROM linksdb
        WHERE valid = 1 AND
            urlname LIKE 'http%' AND
            validdays -
                (julianday(datetime()) - julianday(resulttime)) <= 7 AND
            (result LIKE '200%' OR
             result = 'filtered' OR
             result = 'syntax OK');" |\
      sqlite3 "$output_dir/all_links.db" > links_to_check.txt;
  num_links=`cat links_to_check.txt | wc -l`;
  if [ $num_links -gt 0 ];
  then
    echo "Checking $num_links links before their cache entry expires...";
    linkchecker --check-extern \
        --recursion-level=0 \
        --threads=1 \
        --file-output=sql/ascii/links_output.sql \
        --output=failures \
        --no-status \
        --verbose \
        --config="$output_dir/linkcheckerrc" \
        `cat links_to_check.txt | tr '\n' ' '` |\
        awk -F"', '" '{ print $2; }' |\
        sed 's/'"'"')"$//' |\
        sed 's/^/Warning: /' |\
        sed 's/$/ failed, but cache entry not yet expired./';

    cat links_output.sql |\
        sed 's/modified) values (/modified,resulttime,validdays) values (/' |\
        sed "s/);$/, current_timestamp, random() % 10 + 25);/" |\
        sqlite3 "$output_dir/all_links.db";
    # Filter out any bad links.
    echo "DELETE FROM all_links WHERE valid = 0;" |\
        sqlite3 "$output_dir/all_links.db";
  fi

  # Clean up unnecessary files.
  rm -f "$output_dir/link_errors.csv" "$output_dir/all_links.csv" \
      "$output_dir/linkcheckerrc.in" "$output_dir/linkcheckerrc";
  rm -f links.sql links_output.sql links_to_check.txt links.csv links_failed.csv;
else
  exitcode=0;
fi

# Remove temporary files.
rm -f create.sql;
if [ "a$del_header" == "a1" ];
then
  rm -f "$template_html_header";
fi

if [ "a$del_footer" == "a1" ];
then
  rm -f "$template_html_footer";
fi

exit $exitcode;