# -*- coding: utf-8 -*-
#
# The MIT License (MIT)
#
# Copyright (c) 2018 Philippe Faist
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
r"""
The `latexencode` module provides a set of routines that allows you to
convert a unicode string to LaTeX escape sequences.
For basic usage you can use the :py:func:`unicode_to_latex()` function
directly::
>>> from pylatexenc.latexencode import unicode_to_latex
>>> print(unicode_to_latex('À votre santé'))
\`A votre sant\'e
>>> print(unicode_to_latex('The length of samples #3 & #4 is 3μm'))
The length of samples \#3 \& \#4 is 3\ensuremath{\mu}m
The conversion is handled by the class :py:class:`UnicodeToLatexEncoder`. If
you are converting multiple strings, you may create an instance with the flags
you like and invoke its method
:py:meth:`~UnicodeToLatexEncoder.unicode_to_latex()` as many times as necessary::
>>> from pylatexenc.latexencode import UnicodeToLatexEncoder
>>> u = UnicodeToLatexEncoder(unknown_char_policy='replace')
>>> print(u.unicode_to_latex('À votre santé'))
\`A votre sant\'e
>>> print(u.unicode_to_latex('The length of samples #3 & #4 is 3μm'))
The length of samples \#3 \& \#4 is 3\ensuremath{\mu}m
>>> print(u.unicode_to_latex('À votre santé: 乾杯'))
No known latex representation for character: U+4E7E - ‘乾’
No known latex representation for character: U+676F - ‘杯’
\`A votre sant\'e: {\bfseries ?}{\bfseries ?}
Example using custom conversion rules::
>>> import re
>>> from pylatexenc.latexencode import UnicodeToLatexEncoder, \
... UnicodeToLatexConversionRule, RULE_REGEX
>>> u = UnicodeToLatexEncoder(
... conversion_rules=[
... UnicodeToLatexConversionRule(rule_type=RULE_REGEX, rule=[
... (re.compile(r'-->'), r'\\textrightarrow'),
... (re.compile(r'<--'), r'\\textleftarrow'),
... ]),
... 'defaults'
... ]
... )
>>> print(u.unicode_to_latex("Cheers --> À votre santé"))
Cheers {\textrightarrow} \`A votre sant\'e
See :py:class:`UnicodeToLatexEncoder` and
:py:class:`UnicodeToLatexConversionRule`. Note for regex rules, the replacement
text is expanded like the second argument of `re.sub()` and backslashes need to
be escaped even inside raw strings.
.. versionadded:: 2.0
The class :py:class:`UnicodeToLatexEncoder` along with its helper functions
and classes were introduced in `pylatexenc 2.0`.
The earlier function :py:func:`utf8tolatex()` that was available in
`pylatexenc 1.x` is still provided unchanged, so code written for `pylatexenc
1.x` should work without changes. New code is however strongly encouraged to
employ the new API.
"""
from __future__ import print_function, absolute_import, unicode_literals
import unicodedata
import logging
import sys
import functools
import itertools
if sys.version_info.major < 3:
    # Python 2: no mapping-proxy type is imported on this branch, so a plain
    # `dict` stands in under the same private name.
    _MappingProxyType = dict
    # Expose `getargspec` under the Python 3 name so that the rest of the
    # module can inspect function argument names with a single spelling.
    from inspect import getargspec as getfullargspec
else:
    # Python 3: provide the text-type names under their Python 2 spellings.
    unicode = str  # must remain callable with no arguments, i.e. unicode()
    basestring = str
    # Mapping proxy type, kept under a private alias.
    from types import MappingProxyType as _MappingProxyType
    # Used to inspect function argument names.
    from inspect import getfullargspec

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
from .. import _util
# ------------------------------------------------
from ._unicode_to_latex_encoder import (
get_builtin_uni2latex_dict,
RULE_DICT,
RULE_REGEX,
RULE_CALLABLE,
UnicodeToLatexConversionRule,
get_builtin_conversion_rules,
UnicodeToLatexEncoder,
)
# ------------------------------------------------
from ._partial_latex_encoder import (
PartialLatexToLatexEncoder,
)
# ------------------------------------------------
# Encoder instances already built by `unicode_to_latex()`, keyed by the tuple
# of option values they were constructed with.
_u2l_obj_cache = {}


def unicode_to_latex(s, non_ascii_only=False, replacement_latex_protection='braces',
                     unknown_char_policy='keep', unknown_char_warning=True):
    r"""
    Convert the unicode string `s` to LaTeX using a cached
    :py:class:`UnicodeToLatexEncoder`.

    This is a convenience wrapper: it looks up (or creates) an encoder
    instance for the requested option values and returns the result of that
    instance's :py:meth:`~UnicodeToLatexEncoder.unicode_to_latex()` method.
    One encoder is kept per distinct combination of options, so calling this
    function repeatedly does not construct a new encoder each time.

    The arguments `non_ascii_only`, `replacement_latex_protection`,
    `unknown_char_policy`, and `unknown_char_warning` are forwarded unchanged
    to the :py:class:`UnicodeToLatexEncoder` constructor; refer to that class
    for their meaning.

    Because the option values form the cache key, they must all be python
    hashable (e.g., `True`, `False`, or simple strings).  In particular, a
    callable cannot be given as `unknown_char_policy` here, and custom
    conversion rules cannot be specified through this helper.  For those use
    cases, instantiate :py:class:`UnicodeToLatexEncoder` yourself.
    """
    cache_key = (non_ascii_only, replacement_latex_protection, unknown_char_policy,
                 unknown_char_warning)

    encoder = _u2l_obj_cache.get(cache_key)
    if encoder is None:
        # First request with these options: build the encoder and remember it.
        encoder = UnicodeToLatexEncoder(
            non_ascii_only=non_ascii_only,
            replacement_latex_protection=replacement_latex_protection,
            unknown_char_policy=unknown_char_policy,
            unknown_char_warning=unknown_char_warning,
        )
        _u2l_obj_cache[cache_key] = encoder

    return encoder.unicode_to_latex(s)
# ------------------------------------------------------------------------------
# Don't change pylatexenc 1.x function:
def _get_deprecated_utf82latex():
    """
    Produce the contents of the legacy module-level `utf82latex` dictionary.

    No deprecation warning is issued here on purpose: `utf8tolatex()` reads
    the `utf82latex` dict even when the user never modified it, so a warning
    would fire for code that does nothing deprecated.

    The returned object is a *copy* of the built-in map.  This lets users
    modify the module-level `utf82latex` dict without influencing the new
    `unicode_to_latex()` routines.  (E.g., if two python modules use
    `pylatexenc.latexencode`, one module's use of `utf8tolatex()` should not
    affect the other module's use of `unicode_to_latex()`.  If both modules
    use `utf8tolatex()`, that mutual influence cannot be avoided.)
    """
    from ._uni2latexmap import uni2latex as builtin_map

    legacy_copy = builtin_map.copy()
    return legacy_copy
# Legacy (pylatexenc 1.x) module-level dictionary that `utf8tolatex()` looks
# characters up in.  Its contents come from `_get_deprecated_utf82latex()`,
# which returns a copy of the built-in map.
#
# NOTE(review): `_util.LazyDict` presumably defers calling the generator
# function until the dictionary is first accessed -- confirm in `_util`.
#
# NOTE(review): the documentation string below states that modifying this
# dictionary "will directly modify the global built-in dictionary", whereas
# the generator hands out a copy; the wording may be stale -- confirm which
# behavior is intended before relying on either.
utf82latex = _util.LazyDict(generate_dict_fn=_get_deprecated_utf82latex)
"""
.. deprecated:: 2.0
Pylatexenc 1.x exposed the module-level dictionary `utf82latex` that could be
modified to alter the behavior of `utf8tolatex()`.
If you would like to obtain a copy of the built-in unicode to text
dictionary, see :py:func:`get_builtin_uni2latex_dict()`. If you would like
to alter the behavior of :py:func:`utf8tolatex()`, you should use
:py:class:`UnicodeToLatexEncoder` which provides a rich interface for
specifying rules how to convert chars to LaTeX escapes.
For backwards compatibility, you can still modify the module-level dictionary
`utf82latex` (but you can't assign a new object to it) and this will directly
modify the global built-in dictionary of known latex escapes. This is not
recommended however, and the `utf82latex` module-level dictionary might be
removed in the future.
.. warning::
Modifying the `utf82latex` module-level dictionary is not recommended.
Doing so will alter the behavior of the `utf8tolatex()` function also for
all other modules that also use `pylatexenc`!
"""
def utf8tolatex(s, non_ascii_only=False, brackets=True, substitute_bad_chars=False,
                fail_bad_chars=False):
    """
    .. note::

       Since `pylatexenc 2.0`, it is recommended to use the
       :py:func:`unicode_to_latex()` function or the
       :py:class:`UnicodeToLatexEncoder` class instead of the earlier function
       `utf8tolatex()`.

       The new routines provide much more flexibility and versatility.  For
       instance, you can specify custom escape sequences for certain characters.
       Some cheap benchmarks seem to indicate that the new routines are not
       significantly slower than the `utf8tolatex()` function.  Also, the name
       `utf8tolatex()` was poorly chosen, since the argument is in fact not
       'utf-8'-encoded but rather a Python unicode string object.

       The function `utf8tolatex()` is still provided with the same behavior
       as in `pylatexenc 1.x`.  We do not plan to remove this function in the
       near future so it is not (yet) considered as deprecated and we will
       continue to provide it in near future versions of `pylatexenc`.  Bug
       reports, improvements, and new features will however be directed to
       :py:class:`UnicodeToLatexEncoder`.

    Encode a (unicode) string to a LaTeX snippet.

    The argument `s` is converted to a unicode string and normalized to
    Unicode NFC form before being processed character by character.

    If `non_ascii_only` is set to `True`, then usual (ascii) characters such as ``#``,
    ``{``, ``}`` etc. will not be escaped.  If set to `False` (the default), they are
    escaped to their respective LaTeX escape sequences.

    If `brackets` is set to `True` (the default), then LaTeX macros are enclosed in
    brackets.  For example, ``sant\N{LATIN SMALL LETTER E WITH ACUTE}`` is replaced by
    ``sant{\\'e}`` if `brackets=True` and by ``sant\\'e`` if `brackets=False`.

    .. warning::

       Using `brackets=False` might give you an invalid LaTeX string, so avoid
       it! (for instance, ``ma\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}tre`` will be
       replaced incorrectly by ``ma\\^\\itre`` resulting in an unknown macro ``\\itre``).

    If `substitute_bad_chars=True`, then any non-ascii character for which no LaTeX escape
    sequence is known is replaced by a question mark in boldface.  Otherwise (by default),
    the character is left as it is.

    If `fail_bad_chars=True`, then a `ValueError` is raised if we cannot find a
    character substitution for any non-ascii character.

    Returns the encoded string; an empty input yields an empty string.

    .. versionchanged:: 1.3

       Added `fail_bad_chars` switch
    """

    s = unicode(s)  # make sure s is unicode
    s = unicodedata.normalize('NFC', s)

    if not s:
        return ""

    # Collect the output fragments and join them once at the end, instead of
    # growing a string with `+=` on every character.
    pieces = []

    for ch in s:
        code = ord(ch)

        # Note: the threshold is 127 here, so DEL (U+007F) is not
        # short-circuited; it falls through to the lookup below.  Kept as in
        # pylatexenc 1.x.
        if non_ascii_only and code < 127:
            pieces.append(ch)
            continue

        # use the `utf82latex` dict -- not `_uni2latex` which should NOT be
        # modified externally even for backwards-compatible code
        lch = utf82latex.get(code, None)

        if lch is not None:
            # add brackets if needed, i.e. if we have a substituting macro.
            # note: lch might be of zero length, hence the slice rather than
            # an index.
            if brackets and lch[0:1] == '\\':
                pieces.append('{' + lch + '}')
            else:
                pieces.append(lch)
        elif (32 <= code <= 127) or (ch in "\n\r\t"):
            # ordinary printable ascii char, just add it
            pieces.append(ch)
        else:
            # no known replacement and not a plain ascii char
            msg = u"Character cannot be encoded into LaTeX: U+%04X - `%s'" % (code, ch)
            if fail_bad_chars:
                raise ValueError(msg)

            logger.warning(msg)
            if substitute_bad_chars:
                pieces.append(r'{\bfseries ?}')
            else:
                # keep unescaped char
                pieces.append(ch)

    return u"".join(pieces)
|