1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301
|
--- spelling-stage-3.lua
--- Copyright 2012, 2013 Stephan Hennig
--
-- This work may be distributed and/or modified under the conditions of
-- the LaTeX Project Public License, either version 1.3 of this license
-- or (at your option) any later version. The latest version of this
-- license is in http://www.latex-project.org/lppl.txt
-- and version 1.3 or later is part of all distributions of LaTeX
-- version 2005/12/01 or later.
--
-- See file README for more information.
--
--- Store the text of a LuaTeX document in a text document data
--- structure.
-- This module provides means to extract text from a LuaTeX document and
-- to store it in a text document data structure.
--
-- In the text document, words are stored as UTF-8 encoded strings. A
-- mapping mechanism is provided by which, during word string
-- recognition, individual code-points, e.g., of glyph nodes, can be
-- translated to arbitrary UTF-8 strings, e.g., ligatures to single
-- letters.
--
-- @author Stephan Hennig
-- @copyright 2012, 2013 Stephan Hennig
-- @release version 0.41
--
-- @trick Prevent LuaDoc from looking past here for module description.
--[[ Trick LuaDoc into entering 'module' mode without using that command.
module(...)
--]]
-- Module table.  Public functions are attached to this table, which is
-- returned at the end of the file.
local M = {}
-- Import external modules.
local recurse = require('spelling-recurse')
-- Function short-cuts.  Keeping these in locals avoids repeated table
-- look-ups while traversing node lists.
local recurse_node_list = recurse.recurse_node_list
local tabinsert = table.insert
local tabremove = table.remove
-- Short-cuts for constants.
--
-- Node type id of whatsit nodes.
local WHATSIT = node.id('whatsit')
-- Whatsit subtypes that are tested for while visiting nodes.
-- NOTE(review): this assumes that `local_par` is known to the engine as
-- a whatsit subtype -- confirm for the targeted LuaTeX version.
local LOCAL_PAR = node.subtype('local_par')
local USER_DEFINED = node.subtype('user_defined')
-- Declare local variables to store references to resources that are
-- provided by external code.  All of them are assigned in function
-- `__init` from the global package table `PKG_spelling`.
--
-- Text document data structure.  Finished paragraphs are appended to
-- this table.
local __text_document
--
-- ID of user-defined whatsit nodes marking the start of a word.  The
-- value is stored, but not read anywhere else in this file.
local __uid_start_tag
--
-- ID of user-defined whatsit nodes marking the end of a word.  Compared
-- against field `user_id` of user-defined whatsit nodes.
local __uid_end_tag
--- Module options.
-- This table contains all module options.  User functions to set
-- options are provided.
--
-- @class table
-- @name __opts
-- @field table_par  When processing a table, when should paragraphs be
-- inserted into the text document?  Defaults to 0.<br />
--
-- <ul>
-- <li> 0 - Don't touch tables in any way.</li>
-- <li> 1 - Insert paragraphs before and after hlists of type
--          <i>alignment column or row</i>, i.e., before and after
--          every table row.</li>
-- <li> 2 - Insert paragraphs before and after hlists of type
--          <i>alignment cell</i>, i.e., before and after every table
--          cell.</li>
-- </ul>
local __opts = {
  -- The option must be written as a keyed field.  A bare `table_par,`
  -- entry would be a positional value that evaluates a global variable
  -- of that name instead of declaring the option.
  table_par = 0,
}
--- Configure paragraph insertion within tables.
-- Stores the given value in module option `table_par`, which is
-- consulted whenever an hlist is entered or left during node list
-- recursion.
--
-- @param value  New value of option `table_par`.
local function set_table_paragraphs(value)
  __opts['table_par'] = value
end
M.set_table_paragraphs = set_table_paragraphs
--- Words collected for the paragraph that is currently being built.
-- The variable is `nil` while no paragraph is open.  A table is created
-- on demand as soon as a word needs to be stored.
--
-- @class table
-- @name __curr_paragraph
local __curr_paragraph
--- Act upon detection of the end of a word string.
-- Appends the string stored in field `value` of the given tag node to
-- the current paragraph.  If no paragraph is open, a new empty one is
-- created first.
--
-- @param n  String tag node.
local function __finish_current_word(n)
  -- Re-use the open paragraph or start a new one.
  local paragraph = __curr_paragraph or {}
  __curr_paragraph = paragraph
  -- Store the word string carried by the tag node.
  tabinsert(paragraph, n.value)
end
--- Act upon detection of the end of the current paragraph.
-- If a paragraph is open, it is appended to the text document and the
-- current paragraph is reset.  Without an open paragraph, this function
-- does nothing.
local function __finish_current_paragraph()
  local paragraph = __curr_paragraph
  -- Nothing to finish?
  if not paragraph then
    return
  end
  -- Move the paragraph into the document structure.
  tabinsert(__text_document, paragraph)
  __curr_paragraph = nil
end
--- Paragraph management stack.
-- Stack of boolean flags that log the occurrence of a new paragraph
-- within nested vlists.  The stack is created in function `__init`.
local __is_vlist_paragraph
--- Paragraph management: enter a vlist.
-- Pushes a new flag with value `false` onto the paragraph management
-- stack before recursing into a vlist.  The flag logs the occurrence of
-- a new paragraph while the vlist is processed.  After recursing into
-- the vlist, the flag must be removed from the stack again.  Depending
-- on the flag, the then current paragraph can be finished.
local function __vlist_pre_recurse()
  local stack = __is_vlist_paragraph
  tabinsert(stack, false)
end
--- Paragraph management: leave a vlist.
-- Pops the top-most flag from the paragraph management stack after
-- recursing into a vlist.  If the flag is set, the current paragraph is
-- finished.
local function __vlist_post_recurse()
  local saw_paragraph = tabremove(__is_vlist_paragraph)
  if not saw_paragraph then
    return
  end
  __finish_current_paragraph()
end
--- Handle table lines and cells.
-- Finishes the current paragraph when called for an hlist of subtype
-- `alignment column or row` (4) while option `table_par` is 1, or for
-- an hlist of subtype `alignment cell` (5) while option `table_par` is
-- 2.  In all other cases, nothing is done.
--
-- @param n  hlist node.
local function __handle_table(n)
  local kind = n.subtype
  local mode = __opts.table_par
  -- Paragraph break at table rows or at table cells?
  local at_row = (kind == 4) and (mode == 1)
  local at_cell = (kind == 5) and (mode == 2)
  if at_row or at_cell then
    __finish_current_paragraph()
  end
end
--- Find paragraphs and strings.
-- While scanning a node list, this call-back function finds nodes
-- representing the start of a paragraph (local_par whatsit nodes) and
-- string tags (user_defined whatsit nodes whose `user_id` equals the
-- end tag ID).  All other nodes are ignored.
--
-- @param head  Head node of current branch.  Not used by this function,
-- but part of the call-back signature.
-- @param n  The current node.
local function __visit_node(head, n)
  -- Only whatsit nodes are of interest.
  if n.id ~= WHATSIT then
    return
  end
  local subtype = n.subtype
  -- Test for word string tag.  Field `user_id` is only read after the
  -- subtype has been checked.
  if (subtype == USER_DEFINED) and (n.user_id == __uid_end_tag) then
    __finish_current_word(n)
  -- Test for paragraph start.
  elseif subtype == LOCAL_PAR then
    __finish_current_paragraph()
    -- Log the new paragraph for the innermost vlist.  Outside of any
    -- vlist the stack is empty; in that case there is no flag to set
    -- and writing to index 0 must be avoided.
    local depth = #__is_vlist_paragraph
    if depth > 0 then
      __is_vlist_paragraph[depth] = true
    end
  end
end
--- Table of call-back functions for node list recursion: store the
--- word strings found in a node list.
-- The call-back functions in this table identify string tags in a node
-- list and store the strings in the text document.  A new paragraph is
-- started at local_par whatsit nodes and after finishing a vlist
-- containing a local_par whatsit node.  Nodes of type `hlist` are
-- recursed into as if they were non-existent.  As an example, the LaTeX
-- input `a\mbox{a b}b` is recognized as two strings `aa` and `bb`.
-- The same function handles entering and leaving an hlist.
--
-- @class table
-- @name __cb_store_words
-- @field visit_node  Find nodes representing paragraphs and words.
-- @field hlist_pre_recurse  Table management.
-- @field hlist_post_recurse  Table management.
-- @field vlist_pre_recurse  Paragraph management.
-- @field vlist_post_recurse  Paragraph management.
local __cb_store_words = {
  visit_node = __visit_node,
  hlist_pre_recurse = __handle_table,
  hlist_post_recurse = __handle_table,
  vlist_pre_recurse = __vlist_pre_recurse,
  vlist_post_recurse = __vlist_post_recurse,
}
--- Process node list according to this stage.
-- Recurses into the given node list with the call-back table
-- `__cb_store_words`, so that tagged strings end up in the text
-- document.  Afterwards, a paragraph that is still open is finished.
--
-- @param head  Node list.
local function __process_node_list(head)
  local callbacks = __cb_store_words
  recurse_node_list(head, callbacks)
  -- Flush a paragraph left over from the recursion.
  __finish_current_paragraph()
end
-- Call-back status.  Node lists are only processed while this flag is
-- true.
local __is_active_storage
--- Call-back function that processes the node list of a box.
-- <i>This function is not made available in the module table, but in
-- the global package table!</i>
--
-- If text storage is disabled, the call returns without doing anything.
--
-- @param box  Index into `tex.box` selecting the box whose node list is
-- processed.
local function cb_AtBeginShipout(box)
  if not __is_active_storage then
    return
  end
  __process_node_list(tex.box[box])
end
--- Switch on text storage.
-- Sets the call-back status flag, so that subsequent calls of the
-- shipout call-back store text in the text document.
local function enable_text_storage()
  __is_active_storage = true
end
M.enable_text_storage = enable_text_storage
--- Switch off text storage.
-- Clears the call-back status flag, so that subsequent calls of the
-- shipout call-back no longer store text in the text document.
local function disable_text_storage()
  __is_active_storage = false
end
M.disable_text_storage = disable_text_storage
--- Module initialisation.
-- Fetches references to shared resources from the global package table
-- `PKG_spelling`, registers the shipout call-back function in that
-- table and resets the module state to its defaults.
local function __init()
  local pkg = PKG_spelling
  local res = pkg.res
  local whatsit_ids = res.whatsit_ids
  -- Get local references to package resources.
  __text_document = res.text_document
  __uid_start_tag = whatsit_ids.start_tag
  __uid_end_tag = whatsit_ids.end_tag
  -- Make \AtBeginShipout function available in package table.
  pkg.cb_AtBeginShipout = cb_AtBeginShipout
  -- Start with an empty paragraph management stack.
  __is_vlist_paragraph = {}
  -- Text storage is initially switched off.
  __is_active_storage = false
  -- By default, tables are not touched.
  set_table_paragraphs(0)
end
-- Initialize module.  This runs once when the file is loaded and
-- accesses the global package table `PKG_spelling`.
__init()
-- Return module table containing the public functions.
return M
|