# File: complex_chemical_formulas.py
# Package: pyparsing 3.3.2-1 (area: main; in suites: experimental)
# Package size: 12,200 kB; sloc: python: 30,867; ansic: 422; sh: 112; makefile: 24
# File content: 159 lines, -rw-r--r--, 5,387 bytes
#
# complex_chemical_formulas.py
#
# Example that expands on the basic chemical_formulas.py parser to
# include grouped multiplication notation, such as "3(C₆H₅OH)₂".
#
# Copyright (c) 2024, Paul McGuire
#

from collections import Counter

import pyparsing as pp

ppc = pp.common

# fmt: off
# Lookup table mapping each chemical symbol to its atomic weight.
# It is used in two places in this module:
#  - it is passed to make_compressed_re() to build the regex behind the
#    `element` parser, so only symbols listed here are recognized
#  - molecular_weight() reads the weights when totalling a parsed formula
# NOTE(review): several values look truncated rather than rounded to three
# decimals (e.g. "H": 1.007, "He": 4.002) - confirm if precision matters.
table_of_elements: dict[str, float] = {
    "H": 1.007, "He": 4.002, "Li": 6.941, "Be": 9.012, "B": 10.811, "C": 12.011,
    "N": 14.007, "O": 15.999, "F": 18.998, "Ne": 20.18, "Na": 22.99, "Mg": 24.305,
    "Al": 26.982, "Si": 28.086, "P": 30.974, "S": 32.065, "Cl": 35.453, "Ar": 39.948,
    "K": 39.098, "Ca": 40.078, "Sc": 44.956, "Ti": 47.867, "V": 50.942, "Cr": 51.996,
    "Mn": 54.938, "Fe": 55.845, "Co": 58.933, "Ni": 58.693, "Cu": 63.546, "Zn": 65.38,
    "Ga": 69.723, "Ge": 72.64, "As": 74.922, "Se": 78.96, "Br": 79.904, "Kr": 83.798,
    "Rb": 85.468, "Sr": 87.62, "Y": 88.906, "Zr": 91.224, "Nb": 92.906, "Mo": 95.96,
    "Tc": 98.0, "Ru": 101.07, "Rh": 102.906, "Pd": 106.42, "Ag": 107.868,
    "Cd": 112.411, "In": 114.818, "Sn": 118.71, "Sb": 121.76, "Te": 127.6,
    "I": 126.904, "Xe": 131.293, "Cs": 132.905, "Ba": 137.327, "La": 138.905,
    "Ce": 140.116, "Pr": 140.908, "Nd": 144.242, "Pm": 145.0, "Sm": 150.36,
    "Eu": 151.964, "Gd": 157.25, "Tb": 158.925, "Dy": 162.5, "Ho": 164.93,
    "Er": 167.259, "Tm": 168.934, "Yb": 173.054, "Lu": 174.967, "Hf": 178.49,
    "Ta": 180.948, "W": 183.84, "Re": 186.207, "Os": 190.23, "Ir": 192.217,
    "Pt": 195.084, "Au": 196.967, "Hg": 200.59, "Tl": 204.383, "Pb": 207.2,
    "Bi": 208.98, "Po": 210.0, "At": 210.0, "Rn": 222.0, "Fr": 223.0, "Ra": 226.0,
    "Ac": 227.0, "Th": 232.038, "Pa": 231.036, "U": 238.029, "Np": 237.0,
    "Pu": 244.0, "Am": 243.0, "Cm": 247.0, "Bk": 247.0, "Cf": 251.0, "Es": 252.0,
    "Fm": 257.0, "Md": 258.0, "No": 259.0, "Lr": 262.0, "Rf": 261.0, "Db": 262.0,
    "Sg": 266.0, "Bh": 264.0, "Hs": 267.0, "Mt": 268.0, "Ds": 271.0, "Rg": 272.0,
    "Cn": 285.0, "Nh": 284.0, "Fl": 289.0, "Mc": 288.0, "Lv": 292.0, "Ts": 295.0,
    "Og": 294.0,
}
# fmt: on

# basic parser elements
#  - element - a chemical symbol, corresponding to one of the entries
#    in table_of_elements
#  - subscript_int - an integer made up of subscript digits
#  (a normal, non-subscript integer uses the definition in pyparsing.common,
#  referenced in this module as ppc.integer)
#
# Alternative definition of element, kept for reference:
# element = pp.one_of(table_of_elements).set_name("element")
#
# The active definition matches symbols with a single regex built from
# table_of_elements by make_compressed_re.
element = pp.Regex(pp.util.make_compressed_re(table_of_elements)).set_name("element")
# each matched symbol is converted to a Counter holding that symbol with a
# count of 1, so the multiply/sum parse actions can combine Counters
element.add_parse_action(lambda t: Counter([t[0]]))

# the ten subscript digit characters, ordered so that each character's
# index in the string is its numeric value
subscript_digits = "₀₁₂₃₄₅₆₇₈₉"
subscript_int = pp.Word(subscript_digits).set_name("subscript")

# define mapping of each subscript digit character to its int value
# (e.g. "₀" -> 0, "₉" -> 9)
subscript_int_map = {digit: value for value, digit in enumerate(subscript_digits)}

@subscript_int.add_parse_action
def convert_subscript_int(s: pp.ParseResults) -> int:
    """
    Parse action for subscript_int: turn the matched run of subscript
    digit characters (the first parsed token) into the int it denotes.

    Each character is looked up in subscript_int_map; a character that
    is not in the map raises KeyError.
    """
    glyphs = s[0]
    # weight each digit by its decimal place, counting from the right
    return sum(
        subscript_int_map[glyph] * 10**place
        for place, glyph in enumerate(reversed(glyphs))
    )

#
# parse actions used internally by the infix_notation expression
#

def lmult(s, l, t):
    """
    Multiply <element><subscript_integer>

    Parse action for the trailing-subscript operator. ``t[0]`` holds the
    operand Counter(s) followed by the integer subscript as its last item.
    Returns a new Counter in which every element count of the operand(s)
    has been multiplied by the subscript. ``s`` and ``l`` (the parsed
    string and match location passed to every parse action) are unused.

    The counts are scaled arithmetically instead of repeating the operand
    ``qty`` times and summing, so the cost does not grow with the value
    of the subscript.
    """
    *terms, qty = t[0]
    if qty <= 0:
        # repeating the operands zero times contributes nothing
        return Counter()
    # Counter addition keeps only positive counts and preserves
    # first-seen key order
    combined = sum(terms, Counter())
    return Counter({symbol: count * qty for symbol, count in combined.items()})


def rmult(s, l, t):
    """
    Multiply <integer><element>

    Parse action for the leading-multiplier operator. ``t[0]`` holds the
    integer multiplier as its first item, followed by the operand
    Counter(s). Returns a new Counter in which every element count of the
    operand(s) has been multiplied by the multiplier. ``s`` and ``l`` (the
    parsed string and match location passed to every parse action) are
    unused.

    The counts are scaled arithmetically instead of repeating the operand
    ``qty`` times and summing, so the cost does not grow with the value
    of the multiplier.
    """
    qty, *terms = t[0]
    if qty <= 0:
        # repeating the operands zero times contributes nothing
        return Counter()
    # Counter addition keeps only positive counts and preserves
    # first-seen key order
    combined = sum(terms, Counter())
    return Counter({symbol: count * qty for symbol, count in combined.items()})


def element_ref_sum(s, l, t):
    """
    Combine a run of consecutive element references into one Counter

    ``t[0]`` is the sequence of Counters to merge; the counts of symbols
    that appear more than once are added together.
    """
    total = Counter()
    for part in t[0]:
        # plain + (not +=) so each step builds a fresh Counter
        total = total + part
    return total


# optional separator in some chemical formulas: either "=" or "·" may
# appear between parts of a formula; it is suppressed so it never shows
# up in the parsed tokens
optional_separator = pp.Optional(pp.one_of("= ·").suppress())

# define infix expression, where multipliers and subscripts
# are treated like operators, so that grouping in ()'s gets
# properly handled, even when they are nested
element_ref = pp.infix_notation(
    element,
    [
        # trailing subscript, e.g. the "₂" in "H₂": scales the operand's counts
        (subscript_int, 1, pp.OpAssoc.LEFT, lmult),
        # leading integer multiplier, e.g. the "3" in "3(C₆H₅OH)₂"
        (ppc.integer, 1, pp.OpAssoc.RIGHT, rmult),
        # binary "operator" that may be empty: adjacent element references,
        # with or without a separator between them, are summed together
        (optional_separator, 2, pp.OpAssoc.LEFT, element_ref_sum),
    ],
)

# define the overall parser for a chemical formula, made up
# of one or more element_ref's
formula = element_ref[1, ...].set_name("chemical_formula")

# set names on unnamed expressions for better diagram output
# (called after all of the expressions above have been defined)
pp.autoname_elements()


def molecular_weight(c: Counter) -> float:
    """
    Return the total molecular weight for a parsed chemical formula.

    `c` maps each chemical symbol to the number of times that element
    occurs in the formula. Each count is multiplied by the element's
    atomic weight, looked up in table_of_elements, and the products
    are added up. A symbol missing from table_of_elements raises KeyError.
    """
    weighted_counts = (
        table_of_elements[symbol] * count for symbol, count in c.items()
    )
    return sum(weighted_counts)

if __name__ == '__main__':
    import contextlib

    def _summarize(_, tokens):
        """Build the extra report line shown after each parsed test formula."""
        return (
            f"Molecular counts/weight: {dict(tokens[0])}"
            f", {molecular_weight(tokens[0]):.3f}"
        )

    # sample formulas, one per line, covering plain symbols, subscripts,
    # parenthesized groups, leading multipliers and "·" separators
    sample_formulas = """\
        NaCl
        HOH
        H₂O
        H₂O₂
        C₆H₅OH
        C₁₀H₂₁OH
        (C₆H₅OH)₂
        3(C₆H₅OH)₂
        C(OH)₆
        CH₃(CH₂)₂OH
        (CH₃)₃CH
        CH₃(CH₂)₅CH₃
        Ba(BrO₃)₂·H₂O
        Ba(BrO₃)₂·2(H₂O)
        """

    # create railroad diagram for this parser; this is best-effort, so any
    # exception raised while generating it is deliberately ignored
    diagram_file = "complex_chemical_formulas_diagram.html"
    with contextlib.suppress(Exception):
        formula.create_diagram(diagram_file, vertical=2, show_groups=True)

    # parse every sample and print its element counts and molecular weight
    formula.run_tests(sample_formulas, full_dump=False, post_parse=_summarize)
    print()