#
# complex_chemical_formulas.py
#
# Example that expands on the basic chemical_formulas.py parser to
# include grouped multiplication notation, such as "3(C₆H₅OH)₂".
#
# Copyright (c) 2024, Paul McGuire
#
from collections import Counter
import pyparsing as pp
ppc = pp.common
# fmt: off
# Maps each chemical symbol to the atomic weight that molecular_weight()
# multiplies by the element's count. The keys are also used to build the
# `element` parser expression.
table_of_elements: dict[str, float] = {
    "H": 1.007, "He": 4.002, "Li": 6.941, "Be": 9.012, "B": 10.811, "C": 12.011,
    "N": 14.007, "O": 15.999, "F": 18.998, "Ne": 20.18, "Na": 22.99, "Mg": 24.305,
    "Al": 26.982, "Si": 28.086, "P": 30.974, "S": 32.065, "Cl": 35.453, "Ar": 39.948,
    "K": 39.098, "Ca": 40.078, "Sc": 44.956, "Ti": 47.867, "V": 50.942, "Cr": 51.996,
    "Mn": 54.938, "Fe": 55.845, "Co": 58.933, "Ni": 58.693, "Cu": 63.546, "Zn": 65.38,
    "Ga": 69.723, "Ge": 72.64, "As": 74.922, "Se": 78.96, "Br": 79.904, "Kr": 83.798,
    "Rb": 85.468, "Sr": 87.62, "Y": 88.906, "Zr": 91.224, "Nb": 92.906, "Mo": 95.96,
    "Tc": 98.0, "Ru": 101.07, "Rh": 102.906, "Pd": 106.42, "Ag": 107.868,
    "Cd": 112.411, "In": 114.818, "Sn": 118.71, "Sb": 121.76, "Te": 127.6,
    "I": 126.904, "Xe": 131.293, "Cs": 132.905, "Ba": 137.327, "La": 138.905,
    "Ce": 140.116, "Pr": 140.908, "Nd": 144.242, "Pm": 145.0, "Sm": 150.36,
    "Eu": 151.964, "Gd": 157.25, "Tb": 158.925, "Dy": 162.5, "Ho": 164.93,
    "Er": 167.259, "Tm": 168.934, "Yb": 173.054, "Lu": 174.967, "Hf": 178.49,
    "Ta": 180.948, "W": 183.84, "Re": 186.207, "Os": 190.23, "Ir": 192.217,
    "Pt": 195.084, "Au": 196.967, "Hg": 200.59, "Tl": 204.383, "Pb": 207.2,
    "Bi": 208.98, "Po": 210.0, "At": 210.0, "Rn": 222.0, "Fr": 223.0, "Ra": 226.0,
    "Ac": 227.0, "Th": 232.038, "Pa": 231.036, "U": 238.029, "Np": 237.0,
    "Pu": 244.0, "Am": 243.0, "Cm": 247.0, "Bk": 247.0, "Cf": 251.0, "Es": 252.0,
    "Fm": 257.0, "Md": 258.0, "No": 259.0, "Lr": 262.0, "Rf": 261.0, "Db": 262.0,
    "Sg": 266.0, "Bh": 264.0, "Hs": 267.0, "Mt": 268.0, "Ds": 271.0, "Rg": 272.0,
    "Cn": 285.0, "Nh": 284.0, "Fl": 289.0, "Mc": 288.0, "Lv": 292.0, "Ts": 295.0,
    "Og": 294.0,
}
# fmt: on
# basic parser elements
# - element - a chemical symbol, corresponding to one of the entries
# in table_of_elements
# - subscript_int - an integer made up of subscript digits
#   (ordinary, non-subscript integers use the definition in pyparsing.common)
#
# Alternative definition, kept for reference:
# element = pp.one_of(table_of_elements).set_name("element")
# Match any chemical symbol that is a key of table_of_elements, using a single
# regular expression built from those keys by pp.util.make_compressed_re.
element = pp.Regex(pp.util.make_compressed_re(table_of_elements)).set_name("element")
# Replace each matched symbol with a Counter holding one of that element, so
# the infix_notation parse actions can add and repeat element counts.
element.add_parse_action(lambda t: Counter([t[0]]))
# The ten subscript digit characters, ordered so that each character's index
# in the string is its numeric value.
subscript_digits = "₀₁₂₃₄₅₆₇₈₉"
subscript_int = pp.Word(subscript_digits).set_name("subscript")
# define mapping of the int value of each subscript digit
subscript_int_map = {digit: value for value, digit in enumerate(subscript_digits)}
@subscript_int.add_parse_action
def convert_subscript_int(s: pp.ParseResults) -> int:
    """
    Convert a matched run of subscript digits into its int value.

    s[0] is the matched text (for example "₂₁"). Each character is
    looked up in subscript_int_map and the resulting digits are combined
    as a base-10 number.
    """
    digit_values = [subscript_int_map[ch] for ch in s[0]]
    # weight each digit by its decimal place, least significant digit first
    return sum(
        digit * 10**place
        for place, digit in enumerate(reversed(digit_values))
    )
#
# parse actions used internally by the infix_notation expression
#
def lmult(s, l, t):
    """
    Multiply <element><subscript_integer>

    t[0] holds the operand Counter(s) followed by the subscript count.
    Returns a new Counter equal to the operands added together that
    many times.
    """
    *operands, count = t[0]
    total = Counter()
    # repeat the operand list `count` times and add every entry in turn
    for operand in count * operands:
        total = total + operand
    return total
def rmult(s, l, t):
    """
    Multiply <integer><element>

    t[0] holds the leading integer multiplier followed by the operand
    Counter(s). Returns a new Counter equal to the operands added
    together that many times.
    """
    count, *operands = t[0]
    total = Counter()
    # repeat the operand list `count` times and add every entry in turn
    for operand in count * operands:
        total = total + operand
    return total
def element_ref_sum(s, l, t):
    """
    Add multiple consecutive element references

    t[0] is the sequence of Counters to combine. Returns a new Counter
    holding their combined element counts.
    """
    combined = Counter()
    for element_counts in t[0]:
        combined = combined + element_counts
    return combined
# optional separator in some chemical formulas ("=" or "·"); when one is
# present it is suppressed, so it never appears in the parsed tokens
optional_separator = pp.Optional(pp.one_of("= ·").suppress())
# define infix expression, where multipliers and subscripts
# are treated like operators, so that grouping in ()'s gets
# properly handled, even when they are nested
element_ref = pp.infix_notation(
    element,
    [
        # postfix subscript count, e.g. the "₂" in "H₂" (handled by lmult)
        (subscript_int, 1, pp.OpAssoc.LEFT, lmult),
        # prefix integer multiplier, e.g. the "3" in "3(C₆H₅OH)₂" (handled by rmult)
        (ppc.integer, 1, pp.OpAssoc.RIGHT, rmult),
        # neighbouring terms, with or without a separator between them,
        # are added together (handled by element_ref_sum)
        (optional_separator, 2, pp.OpAssoc.LEFT, element_ref_sum),
    ],
)
# define the overall parser for a chemical formula, made up
# of one or more element_ref's
formula = element_ref[1, ...].set_name("chemical_formula")
# set names on unnamed expressions for better diagram output
pp.autoname_elements()
def molecular_weight(c: Counter) -> float:
    """
    Return the overall molecular weight of a parsed chemical formula.

    c is a Counter mapping chemical symbols to the count of each
    element. Every count is multiplied by that symbol's atomic weight
    from table_of_elements and the products are added together.
    """
    total = 0
    for symbol, count in c.items():
        total += table_of_elements[symbol] * count
    return total
if __name__ == "__main__":
    import contextlib

    # Formulas exercised by run_tests, one per line.
    sample_formulas = """\
NaCl
HOH
H₂O
H₂O₂
C₆H₅OH
C₁₀H₂₁OH
(C₆H₅OH)₂
3(C₆H₅OH)₂
C(OH)₆
CH₃(CH₂)₂OH
(CH₃)₃CH
CH₃(CH₂)₅CH₃
Ba(BrO₃)₂·H₂O
Ba(BrO₃)₂·2(H₂O)
"""

    # create railroad diagram for this parser; this is best-effort, so any
    # exception raised while building it is ignored and the tests still run
    diagram_options = {"vertical": 2, "show_groups": True}
    with contextlib.suppress(Exception):
        formula.create_diagram(
            "complex_chemical_formulas_diagram.html", **diagram_options
        )

    # For each parsed formula, report the element counts and total weight
    # in place of the default results dump.
    formula.run_tests(
        sample_formulas,
        post_parse=(
            lambda _, tokens:
                f"Molecular counts/weight: {dict(tokens[0])}"
                f", {molecular_weight(tokens[0]):.3f}"
        ),
        full_dump=False,
    )
    print()