File: funcs.py

package info (click to toggle)
python-sigima 1.0.3-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 24,956 kB
  • sloc: python: 33,326; makefile: 3
file content (723 lines) | stat: -rw-r--r-- 26,790 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
# Copyright (c) DataLab Platform Developers, BSD 3-Clause license, see LICENSE file.

"""
I/O signal functions
"""

# pylint: disable=invalid-name  # Allows short reference names like x, y, ...

from __future__ import annotations

import datetime
import re
import warnings
from dataclasses import dataclass
from typing import TextIO

import numpy as np
import pandas as pd
import scipy.interpolate

from sigima.io.common.textreader import count_lines, read_first_n_lines
from sigima.objects.signal.constants import (
    DATETIME_X_FORMAT_KEY,
    DATETIME_X_KEY,
    DEFAULT_DATETIME_FORMAT,
)
from sigima.worker import CallbackWorkerProtocol


def get_labels_units_from_dataframe(
    df: pd.DataFrame,
) -> tuple[str, list[str], str, list[str]]:
    """Get labels and units from a DataFrame.

    Args:
        df: DataFrame

    Returns:
        Tuple (xlabel, ylabels, xunit, yunits)
    """
    # Reading X,Y labels
    xlabel = str(df.columns[0])
    ylabels = [str(col) for col in df.columns[1:]]

    # Retrieving units from labels
    xunit = ""
    yunits = [""] * len(ylabels)
    pattern = r"([\S ]*) \(([\S]*)\)"
    match = re.match(pattern, xlabel)
    if match is not None:
        xlabel, xunit = match.groups()
    for i, ylabel in enumerate(ylabels):
        match = re.match(pattern, ylabel)
        if match is not None:
            ylabels[i], yunits[i] = match.groups()

    return xlabel, ylabels, xunit, yunits


def read_csv_by_chunks(
    fname_or_fileobj: str | TextIO,
    nlines: int | None = None,
    worker: CallbackWorkerProtocol | None = None,
    decimal: str = ".",
    delimiter: str | None = None,
    header: int | None = "infer",
    skiprows: int | None = None,
    nrows: int | None = None,
    comment: str | None = None,
    chunksize: int = 1000,
) -> pd.DataFrame:
    """Read CSV data with primitive options, using pandas read_csv function defaults,
    and reading data in chunks, using the iterator interface.

    Args:
        fname_or_fileobj: CSV file name or text stream object
        nlines: Number of lines contained in file (this argument is mandatory if
         `fname_or_fileobj` is a text stream object: counting line numbers from a
         text stream is not efficient, especially if one already has access to the
         initial text content from which the text stream was made)
        worker: Callback worker object
        decimal: Decimal character
        delimiter: Delimiter
        header: Header line
        skiprows: Skip rows
        nrows: Number of rows to read
        comment: Comment character
        chunksize: Chunk size

    Returns:
        DataFrame
    """
    if isinstance(fname_or_fileobj, str):
        nlines = count_lines(fname_or_fileobj)
    elif nlines is None:
        raise ValueError("Argument `nlines` must be passed for text streams")
    # Read data in chunks, and concatenate them at the end, thus allowing to call the
    # progress callback function at each chunk read and to return an intermediate result
    # if the operation is canceled.
    chunks = []
    for chunk in pd.read_csv(
        fname_or_fileobj,
        decimal=decimal,
        delimiter=delimiter,
        header=header,
        skiprows=skiprows,
        nrows=nrows,
        comment=comment,
        chunksize=chunksize,
        encoding_errors="ignore",
    ):
        chunks.append(chunk)
        # Compute the progression based on the number of lines read so far
        if worker is not None:
            worker.set_progress(sum(len(chunk) for chunk in chunks) / nlines)
            if worker.was_canceled():
                break
    return pd.concat(chunks)


DATA_HEADERS = [
    "#DATA",  # Generic
    "START_OF_DATA",  # Various logging devices
    ">>>>>Begin Spectral Data<<<<<",  # Ocean Optics
    ">>>Begin Data<<<",  # Ocean Optics (alternative)
    ">>>Begin Spectrum Data<<<",  # Avantes
    "# Data Start",  # Andor, Horiba, Mass Spectrometry (Agilent, Thermo Fisher, ...)
    ">DATA START<",  # Mass Spectrometry, Chromatography
    "BEGIN DATA",  # Mass Spectrometry, Chromatography
    "<Data>",  # Mass Spectrometry (XML-based)
    "##Start Data",  # Bruker (X-ray, Raman, FTIR)
    "[DataStart]",  # PerkinElmer (FTIR, UV-Vis)
    "BEGIN SPECTRUM",  # PerkinElmer
    "%% Data Start %%",  # LabVIEW, MATLAB
    "---Begin Data---",  # General scientific instruments
    "===DATA START===",  # Industrial/scientific devices
]


def _read_df_without_header(
    filename: str, skiprows: int | None = None
) -> tuple[pd.DataFrame | None, str, str]:
    """Try to read a CSV file without header, testing various delimiters and decimal.

    Args:
        filename: CSV file name
        skiprows: Number of rows to skip at the beginning of the file

    Returns:
        A tuple (DataFrame if successful, None otherwise, decimal used, delimiter used)
    """
    for decimal in (".", ","):
        for delimiter in (",", ";", r"\s+"):
            try:
                df = pd.read_csv(
                    filename,
                    decimal=decimal,
                    delimiter=delimiter,
                    header=None,
                    comment="#",
                    nrows=1000,  # Read only the first 1000 lines
                    encoding_errors="ignore",
                    skiprows=skiprows,
                    dtype=float,  # Keep dtype to validate delimiter detection
                )
                break
            except (pd.errors.ParserError, ValueError):
                df = None
        if df is not None:
            break
    return df, decimal, delimiter


def _read_df_with_header(filename: str) -> tuple[pd.DataFrame | None, str, str]:
    """Try to read a CSV file with header, testing various delimiters and decimal.

    Args:
        filename: CSV file name

    Returns:
        A tuple (DataFrame if successful, None otherwise, decimal used, delimiter used)
    """
    for decimal in (".", ","):
        for delimiter in (",", ";", r"\s+"):
            # Headers are generally in the first 10 lines, so we try to skip the
            # minimum number of lines before reading the data:
            for skiprows in range(20):
                try:
                    df = pd.read_csv(
                        filename,
                        decimal=decimal,
                        delimiter=delimiter,
                        skiprows=skiprows,
                        comment="#",
                        nrows=1000,  # Read only the first 1000 lines
                        encoding_errors="ignore",
                    )
                    # Validate: CSV should have at least 2 columns (x and y)
                    # If only 1 column, likely wrong delimiter
                    if df.shape[1] >= 2:
                        break  # Good delimiter found
                    df = None  # Try next delimiter
                except (pd.errors.ParserError, ValueError):
                    df = None
            if df is not None:
                break
        if df is not None:
            break
    return df, decimal, delimiter


def _detect_metadata_cols(df: pd.DataFrame) -> tuple[pd.DataFrame, dict]:
    """Detect columns containing constant/single-value metadata.

    Columns with a single unique value (excluding NaN) across all rows are treated
    as metadata rather than data columns. These are typically instrument serial numbers,
    experiment IDs, or other constant identifiers.

    Args:
        df: Input DataFrame

    Returns:
        A tuple (DataFrame with metadata columns removed,
        dict of metadata key-value pairs)
    """
    metadata = {}
    cols_to_drop = []

    # Start from column 1 (skip X column) and check for constant-value columns
    for col_idx in range(1, df.shape[1]):
        col_data = df.iloc[:, col_idx]
        col_name = df.columns[col_idx]

        # Get unique non-NaN values
        unique_values = col_data.dropna().unique()

        # If column has exactly one unique value (excluding NaN), it's metadata
        if len(unique_values) == 1:
            # Store the metadata
            value = unique_values[0]
            # Try to convert to appropriate type (keep as string if necessary)
            try:
                # Try int first
                if float(value).is_integer():
                    value = int(float(value))
                else:
                    value = float(value)
            except (ValueError, TypeError):
                # Keep as string
                value = str(value)

            metadata[str(col_name)] = value
            cols_to_drop.append(col_name)  # Store column name, not index

    # Drop metadata columns from DataFrame
    if cols_to_drop:
        df = df.drop(columns=cols_to_drop)

    return df, metadata


def _detect_datetime_col(df: pd.DataFrame) -> tuple[pd.DataFrame, dict | None]:
    """Try to detect the presence of a datetime column in a DataFrame.

    Detect if the first or second column contains datetime values, and convert it to
    float timestamps if so.

    Args:
        df: Input DataFrame

    Returns:
        A tuple (DataFrame with datetime column converted, datetime metadata dict)
    """
    datetime_col_idx = None

    for col_idx in [0, 1]:  # Check first two columns
        col_data = df.iloc[:, col_idx]
        # Try to convert to datetime
        try:
            # Attempt to parse as datetime
            # Note: format="mixed" was causing failures in some pandas versions,
            # so we use warnings filter to suppress the UserWarning instead
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="Could not infer format",
                    category=UserWarning,
                )
                datetime_series = pd.to_datetime(col_data, errors="coerce")
            # Check if most values were successfully converted (>90%)
            valid_ratio = datetime_series.notna().sum() / len(datetime_series)

            # Skip if conversion ratio is too low
            if valid_ratio <= 0.9:
                continue

            # Check if values have reasonable variation and are actual dates
            unique_dates = datetime_series.dropna().nunique()
            if unique_dates <= 1:
                continue

            # Check date range - should be reasonable dates, not epoch times
            min_date = datetime_series.min()
            max_date = datetime_series.max()
            # Dates should be after 1900 and the range should be > 1 sec
            valid_datetime = (
                min_date.year >= 1900 and (max_date - min_date).total_seconds() > 1.0
            )

            if valid_datetime:
                # This is a datetime column!
                datetime_col_idx = col_idx
                break
        except (ValueError, TypeError, pd.errors.OutOfBoundsDatetime):
            # Not a datetime column, continue checking
            pass

    datetime_metadata = None

    if datetime_col_idx is not None:
        # Convert datetime column to float timestamps
        col_data = df.iloc[:, datetime_col_idx]
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", message="Could not infer format", category=UserWarning
            )
            datetime_series = pd.to_datetime(col_data, errors="coerce")
        x_float = datetime_series.astype(np.int64) / 1e9
        # Store datetime metadata (unit will be stored in xunit attribute)
        datetime_metadata = {
            DATETIME_X_KEY: True,
            DATETIME_X_FORMAT_KEY: DEFAULT_DATETIME_FORMAT,
        }

        # If datetime is in column 1 and column 0 looks like an index, drop column 0
        if datetime_col_idx == 1:
            try:
                # Try to convert first column to int - if sequential,
                # it's likely an index column
                first_col = pd.to_numeric(df.iloc[:, 0], errors="coerce")
                if first_col.notna().all():
                    # Check if it's a sequential index (1, 2, 3, ...)
                    diffs = first_col.diff().dropna()
                    if (diffs == 1).sum() / len(diffs) > 0.9:
                        # Drop the index column
                        df = df.iloc[:, 1:].copy()
                        datetime_col_idx = 0  # Now datetime is in position 0
            except (ValueError, TypeError):
                pass

        # Replace datetime column with float timestamps
        df.iloc[:, datetime_col_idx] = x_float

    return df, datetime_metadata


@dataclass
class CSVData:
    """Data structure for CSV file contents.

    This dataclass encapsulates all the data extracted from a CSV file,
    including the actual XY data, labels, units, and metadata.

    Attributes:
        xydata: Numpy array containing X and Y data columns
        xlabel: Label for the X axis
        xunit: Unit for the X axis
        ylabels: List of labels for Y columns
        yunits: List of units for Y columns
        header: Optional header text from the CSV file
        datetime_metadata: Optional dict with datetime conversion info
        column_metadata: Optional dict with constant-value column metadata
    """

    xydata: np.ndarray
    xlabel: str | None = None
    xunit: str | None = None
    ylabels: list[str] | None = None
    yunits: list[str] | None = None
    header: str | None = None
    datetime_metadata: dict | None = None
    column_metadata: dict | None = None


def read_csv(
    filename: str,
    worker: CallbackWorkerProtocol | None = None,
) -> CSVData:
    """Read CSV data and return parsed components including datetime metadata.

    Args:
        filename: CSV file name
        worker: Callback worker object

    Returns:
        CSVData object containing all parsed CSV components
    """
    xydata, xlabel, xunit, ylabels, yunits = None, None, None, None, None
    header, datetime_metadata, column_metadata = None, None, {}

    # The first attempt is to read the CSV file assuming it has no header because it
    # won't raise an error if the first line is data. If it fails, we try to read it
    # with a header, and if it fails again, we try to skip some lines before reading
    # the data.

    skiprows = None

    # Begin by reading the first 100 lines to search for a line that could mark the
    # beginning of the data after it (e.g., a line '#DATA' or other).
    first_100_lines = read_first_n_lines(filename, n=100).splitlines()
    for data_header in DATA_HEADERS:
        if data_header in first_100_lines:
            # Skip the lines before the data header
            skiprows = first_100_lines.index(data_header) + 1
            break

    # First attempt: no header (try to read with different delimiters)
    read_without_header = True
    df, decimal, delimiter = _read_df_without_header(filename, skiprows=skiprows)

    # Second attempt: with header
    if df is None:
        df, decimal, delimiter = _read_df_with_header(filename)

        if df is None:
            raise ValueError("Unable to read CSV file (format not supported)")

        # At this stage, we have a DataFrame with column names, but we don't know
        # if the first line is a header or data. We try to read the first line as
        # a header, and if it fails, we read it as data.
        try:
            # Try to convert columns to float - if first column is datetime, this will
            # fail and we know we have a header
            first_col_numeric = pd.to_numeric(df.columns[0], errors="coerce")
            if pd.notna(first_col_numeric):
                # First column name is numeric, might be data
                df.columns.astype(float)
                # This means the first line is data, so we re-read it, but
            # without the header:
            read_without_header = True
        except (ValueError, TypeError):  # TypeError can occur with pandas >= 2.2
            read_without_header = False
            # This means that the first line is a header, so we already have the data
            # without missing values.
            # However, it also means that there could be text information preceding
            # the header. Let's try to read it and put it in `header` variable.

            # 1. We read only the first 1000 lines to avoid reading the whole file
            # 2. We keep only the lines beginning with a comment character
            # 3. We join the lines to create a single string
            header = ""
            with open(filename, "r", encoding="utf-8") as file:
                for _ in range(1000):
                    line = file.readline()
                    if line.startswith("#"):
                        header += line
                    else:
                        break
            # Remove the last line if it contains the column names:
            last_line = header.splitlines()[-1] if header.splitlines() else ""
            if str(df.columns[0]) in last_line:
                header = "\n".join(header.splitlines()[:-1])

    # Now we read the whole file with the correct options
    try:
        df = read_csv_by_chunks(
            filename,
            worker=worker,
            decimal=decimal,
            delimiter=delimiter,
            header=None if read_without_header else "infer",
            skiprows=skiprows,
            comment="#",
        )
    except pd.errors.ParserError:
        # If chunked reading fails (e.g., ragged CSV), try different approaches
        df = None
        # Try with python engine (more flexible)
        for skip in [skiprows, 0, 9, 10, 15, 20]:  # Try different skiprows values
            if df is not None:
                break
            try:
                df = pd.read_csv(
                    filename,
                    decimal=decimal,
                    delimiter=delimiter,
                    header=None if read_without_header else "infer",
                    skiprows=skip,
                    comment="#",
                    engine="python",
                    encoding_errors="ignore",
                )
                break  # Success!
            except (pd.errors.ParserError, ValueError):
                continue

        # If still failing, try auto-detect
        if df is None:
            try:
                df = pd.read_csv(
                    filename,
                    engine="python",
                    encoding_errors="ignore",
                    comment="#",
                )
            except (pd.errors.ParserError, ValueError) as e:
                raise ValueError(f"Unable to parse CSV file: {e}") from e

    # Remove rows and columns where all values are NaN in the DataFrame:
    df = df.dropna(axis=0, how="all").dropna(axis=1, how="all")

    # Check if first row contains header strings (non-numeric values in all columns)
    # This happens when header="infer" fails to detect the header
    if not df.empty and isinstance(df.columns[0], (int, np.integer)):
        # Columns are integers, not strings - header wasn't properly parsed
        first_row = df.iloc[0]
        # Count how many values in first row are non-numeric strings
        non_numeric_count = 0
        for val in first_row:
            try:
                float(val)
            except (ValueError, TypeError):
                if isinstance(val, str):
                    non_numeric_count += 1
        # If most of first row is non-numeric strings, it's likely a header row
        if non_numeric_count / len(first_row) > 0.5:
            # Use first row as column names
            df.columns = first_row.values
            # Drop the first row (header)
            df = df.iloc[1:].reset_index(drop=True)

    # Try to detect datetime columns - check first two columns
    # Often CSV files have an index column, then a datetime column
    if not df.empty and df.shape[1] >= 2:
        df, datetime_metadata = _detect_datetime_col(df)

    # Try to detect metadata columns (constant-value columns like serial numbers)
    # This must be done after datetime detection but before converting to numpy
    if not df.empty and df.shape[1] >= 2:
        df, column_metadata = _detect_metadata_cols(df)

    # Converting to NumPy array
    try:
        xydata = df.to_numpy(float)
    except (ValueError, TypeError):
        # If conversion fails, try converting each column individually
        # and dropping columns that can't be converted
        for col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        df = df.dropna(axis=1, how="all")
        xydata = df.to_numpy(float)

    if xydata.size == 0:
        raise ValueError(
            f"Unable to read CSV file (no supported data after cleaning): {filename}"
        )

    xlabel, ylabels, xunit, yunits = get_labels_units_from_dataframe(df)

    return CSVData(
        xydata=xydata,
        xlabel=xlabel,
        xunit=xunit,
        ylabels=ylabels,
        yunits=yunits,
        header=header,
        datetime_metadata=datetime_metadata,
        column_metadata=column_metadata,
    )


def write_csv(
    filename: str,
    xydata: np.ndarray,
    xlabel: str | None,
    xunit: str | None,
    ylabels: list[str] | None,
    yunits: list[str] | None,
    header: str | None,
) -> None:
    """Write CSV data.

    Args:
        filename: CSV file name
        xydata: XY data
        xlabel: X label
        xunit: X unit
        ylabels: Y labels
        yunits: Y units
        header: Header
    """
    labels = ""
    delimiter = ","
    if len(ylabels) == 1:
        ylabels = ["Y"] if not ylabels[0] else ylabels
    elif ylabels:
        ylabels = [
            f"Y{i + 1}" if not label else label for i, label in enumerate(ylabels)
        ]
        if yunits:
            ylabels = [
                f"{label} ({unit})" if unit else label
                for label, unit in zip(ylabels, yunits)
            ]
    if ylabels:
        xlabel = xlabel or "X"
        if xunit:
            xlabel += f" ({xunit})"
        labels = delimiter.join([xlabel] + ylabels)
    df = pd.DataFrame(xydata.T, columns=[xlabel] + ylabels)
    df.to_csv(filename, index=False, header=labels, sep=delimiter)
    # Add header if present
    if header:
        with open(filename, "r+", encoding="utf-8") as file:
            content = file.read()
            file.seek(0, 0)
            file.write(header + "\n" + content)


class MCAFile:
    """Class to handle MCA files."""

    def __init__(self, filename: str) -> None:
        self.filename = filename
        self.raw_data: str = ""
        self.xlabel: str | None = None
        self.x: np.ndarray | None = None
        self.y: np.ndarray | None = None
        self.metadata: dict[str, str] = {}

    def __try_decode(self, raw_bytes: bytes) -> str:
        """Try to decode raw bytes with the specified encoding."""
        encodings_to_try = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
        for enc in encodings_to_try:
            try:
                return raw_bytes.decode(enc)
            except UnicodeDecodeError:
                continue
        # If all attempts fail, use 'utf-8' with replacement
        warnings.warn("All decoding attempts failed. Used 'utf-8' with replacement.")
        return raw_bytes.decode("utf-8", errors="replace")

    def _read_raw_data(self) -> str:
        """Read the raw data from the MCA file, trying multiple encodings."""
        with open(self.filename, "rb") as file:
            raw_bytes = file.read()
        raw_data = self.__try_decode(raw_bytes)
        self.raw_data = raw_data.replace("\r\n", "\n").replace("\r", "\n")

    def _read_section(self, section: str) -> str | None:
        """Read a section from the raw data."""
        pattern = f"(?:.*)(^<<{section}>>$)(.*?)(?:<<.*>>)"
        match = re.search(pattern, self.raw_data, re.DOTALL + re.MULTILINE)
        if match:
            return match.group(2).strip()
        return None

    @staticmethod
    def _infer_string_value(value_str: str) -> str | float | int | datetime.datetime:
        """Infer the type of a string value and convert it accordingly."""
        # Try to convert the value to a number or datetime
        try:
            if value_str.isdigit():
                value = int(value_str)
            else:
                try:
                    value = float(value_str)
                except ValueError:
                    # Try to parse as datetime
                    try:
                        value = datetime.datetime.strptime(
                            value_str, "%m/%d/%Y %H:%M:%S"
                        )
                    except ValueError:
                        value = value_str  # Keep as string
        except ValueError:
            value = value_str
        return value

    def _extract_metadata_from_section(
        self, section: str
    ) -> dict[str, str | float | int | datetime.datetime]:
        """Extract metadata from a specific section."""
        section_contents = self._read_section(section)
        if section_contents is None:
            return {}
        metadata = {}
        patterns = (r"(.*?) - (.*?)$", r"(.*?)\s*: \s*(.*)$", r"(.*?)\s*=\s*(.*);")
        for line in section_contents.splitlines():
            for pattern in patterns:
                match = re.match(pattern, line)
                if match:
                    key, value_str = match.groups()
                    metadata[key.strip()] = self._infer_string_value(value_str.strip())
                    break
        return metadata

    def read(self) -> None:
        """Read the MCA file and extract data and metadata."""
        self._read_raw_data()
        self.metadata = self._extract_metadata_from_section("PMCA SPECTRUM")
        additional_metadata = self._extract_metadata_from_section("DPP STATUS")
        self.metadata.update(additional_metadata)
        data_section = self._read_section("DATA")
        self.y = np.fromstring(data_section, sep=" ") if data_section else None
        if self.y is not None:
            self.x = np.arange(len(self.y))
            cal_section = self._read_section("CALIBRATION")
            if cal_section:
                cal_metadata = self._extract_metadata_from_section(cal_section)
                self.xlabel = cal_metadata.get("LABEL")
                cal_data = np.array(
                    [
                        [float(v) for v in val.split(" ")]
                        for val in cal_section.splitlines()[1:]
                    ]
                )
                self.x = scipy.interpolate.interp1d(
                    cal_data[:, 0],
                    cal_data[:, 1],
                    bounds_error=False,
                    fill_value="extrapolate",
                )(self.x)