Source code for interferences.table.molecules

"""
Functions for creating, formatting and serialising representations of molecules.
"""
import re
import pandas as pd
import numpy as np
import periodictable as pt
from pyrolite.mineral.transform import merge_formulae
from ..util.sorting import get_relative_electronegativity
from ..util.meta import interferences_datafolder
from ..util.log import Handle

# module-level logger, used for debug messages by the functions below
logger = Handle(__name__)

# compression level passed as `complevel` when reading/writing the HDF label store
_COMPLEVEL = 4
# compression library passed as `complib` when reading/writing the HDF label store
_COMPLIB = "lzo"
# passed as `min_itemsize` when writing labels to the HDF label store
_ITEMSIZES = {"label": 50, "index": 40}


def components_from_index_value(idx):
    """
    Split a molecule index value into its bracketed components.

    Parameters
    ----------
    idx : :class:`str`
        Index value to split (e.g. ``"H[1]O[16]+"``).

    Returns
    -------
    :class:`list`
        Every substring made of one or more word characters followed by a
        bracketed integer (e.g. ``["H[1]", "O[16]"]``), in order of appearance.
    """
    return re.findall(r"\w+\[\d+\]", idx)


def _find_duplicate_multiples(df, charges=None):
    """
    Remove multiples of moleclues which have the same m/z (e.g. OH+, H2O2++).

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        Dataframe to check the index of.
    charges : :class:`list`
        List of valid charges for the frame.

    Returns
    -------
    :class:`list:
    """
    counts = df.index.map(lambda s: s.count("["))
    target_charges = [c for c in np.arange(np.max(charges)) + 1 if c // 2 == c / 2]
    source_n_atoms = [c for c in np.arange(counts.max()) + 1 if c <= (counts.max() / 2)]

    drop_mols = []
    for n_atoms in source_n_atoms:
        src = df.index[counts == n_atoms]  # get e.g. 1-atom molecules
        for m in src.str.strip("+"):
            potential_multiples = [
                repr_formula(merge_formulae([m] * c)) + "+" * c for c in target_charges
            ]

            drop_mols += df.index.intersection(potential_multiples).to_list()
    return drop_mols


def deduplicate(df, charges=None, multiples=True):
    """
    De-duplicate a dataframe index based on index values and molecule-multiples.

    Parameters
    ----------
    df : :class:`pandas.DataFrame`
        Dataframe to check the index of.
    charges : :class:`list`
        List of valid charges for the frame. Molecule-multiples are only
        searched for where this is provided.
    multiples : :class:`bool`
        Whether to remove molecule-multiples.

    Returns
    -------
    :class:`pandas.DataFrame`
        The de-duplicated frame. Where duplicate index values were present this
        is a new frame (the input is left untouched by that step); otherwise
        it is `df` itself, with any multiples dropped in place. Callers should
        always use the returned frame.
    """
    # remove duplicate index values ####################################################
    dup_mask = df.index.duplicated(keep="first")
    if dup_mask.any():
        duplicates = df.index[dup_mask]
        logger.debug(
            "Dropping duplicate indexes: {}".format(", ".join(map(str, duplicates)))
        )
        # `DataFrame.drop_duplicates` only compares column values, so rows are
        # selected on the index mask instead; keep the first of each index value
        df = df.loc[~dup_mask].copy()

    # remove duplicate m/z #############################################################
    if multiples and charges is not None and np.size(charges) > 0:
        dup_multiples = _find_duplicate_multiples(df, charges=charges)
        if dup_multiples:
            logger.debug(
                "Dropping multiples (duplicate m_z): {}".format(
                    ", ".join(dup_multiples)
                )
            )
            df.drop(dup_multiples, axis=0, inplace=True)  # drop any duplicate m_z
    return df


def _get_isotope(element):
    """
    Parameters
    ----------
    element : :class:`periodictable.core.Element`
        Element or isotope.
    Returns
    -------
    :class:`int`
    """
    try:
        return element.isotope
    except AttributeError:
        return 0


def repr_formula(molecule):
    """
    Get a string representation of a formula which preserves element and isotope
    information.

    Parameters
    ----------
    molecule : :class:`~periodictable.formulas.Formula`
        Formula to represent; only its ``atoms`` mapping is used.

    Returns
    -------
    :class:`str`
        The ``repr`` of each component, repeated once per count and
        concatenated in the iteration order of ``molecule.atoms``.
    """
    parts = []
    for el, cnt in molecule.atoms.items():
        label = repr(el)
        # a count of one needs no repetition
        parts.append(label if cnt == 1 else label * cnt)
    return "".join(parts)


def get_formatted_formula(molecule, sorted=False):
    """
    Construct a formatted name for a molecule.

    Parameters
    -----------
    molecule : :class:`~periodictable.formulas.Formula`
        Molecule to name.
    sorted : :class:`bool`
        Whether a molecular formula is already sorted, so sorting can
        be skipped.

    Returns
    -------
    :class:`str`
        TeX-formatted name, with isotope numbers as superscripts and counts
        greater than one as subscripts.
    """
    molecule = pt.formula(molecule)
    components = list(molecule.atoms.keys())
    if not sorted:
        # the `sorted` flag shadows the builtin of the same name in this scope,
        # so the list is sorted in place rather than by calling `sorted(...)`
        components.sort(
            key=lambda x: (get_relative_electronegativity(x), _get_isotope(x)),
        )
    name = r"$\mathrm{"  # remove italicized text effect
    for c in components:
        part = ""
        if hasattr(c, "isotope"):
            part += "^{" + "{}".format(c.isotope) + "}"  # superscript isotope
        # fall back to the component itself where it carries no `element`
        # attribute (presumably plain elements - TODO confirm)
        part += str(getattr(c, "element", c))
        count = molecule.atoms[c]
        if count > 1:
            part += "_{" + "{:d}".format(count) + "}"  # subscript count
        name += part
    name += "}$"  # finish TeX formatting
    return name


def get_molecule_labels(df, **kwargs):
    """
    Get labels for molecules based on their composition and charge.

    Labels are first looked up in an on-disk store (``labels.h5`` within the
    ``table`` data folder). Labels for index values which are not found there
    are built from the index value and the ``charge`` column, and are then
    appended to the store.

    Parameters
    -----------
    df : :class:`pandas.DataFrame`
        Frame to build labels for. Its index values are used as the molecule
        formulae and it needs a ``charge`` column.
    **kwargs
        Passed to :class:`pandas.HDFStore` when reading the label store.

    Returns
    -------
    :class:`pandas.DataFrame`
        Frame with the index of `df` and a single ``label`` column.
    """
    # look up index values which are pre-computed
    label_src = interferences_datafolder(subfolder="table") / "labels.h5"
    labels = pd.DataFrame(index=df.index, columns=["label"])
    try:
        with pd.HDFStore(
            label_src, complevel=_COMPLEVEL, complib=_COMPLIB, **kwargs
        ) as store:
            label_store = store.select("/table")

        known = label_store.index.intersection(df.index)
        unknown = df.index.difference(known)
        if known.size:
            labels.loc[known, "label"] = label_store["label"]

    except (KeyError, FileNotFoundError):
        unknown = df.index  # assume they're all unknown

    if unknown.size:
        logger.debug("Buiding {} labels.".format(unknown.size))
        # fill in the gaps

        mols = unknown.map(lambda x: get_formatted_formula(x.strip('+'), sorted=True))
        charges = df.loc[unknown, "charge"].apply(
            lambda c: r"$\mathrm{^{" + "+" * c + "}}$"
        )
        labels.loc[unknown, "label"] = mols + charges
        # append new index values to the datafile
        logger.debug("Dumping {} labels to file.".format(unknown.size))

        # mode="a" creates the file where it doesn't exist yet; the "table"
        # format is needed both for appending and for `store.select` above
        # (the "fixed" format cannot be appended to)
        labels.loc[unknown].to_hdf(
            label_src,
            key="table",
            mode="a",
            append=True,
            format="table",
            min_itemsize=_ITEMSIZES,
            complevel=_COMPLEVEL,
            complib=_COMPLIB,
        )
    return labels


def molecule_from_components(components):
    """
    Builds a :class:`~periodictable.formulas.Formula` from a list of atom or
    isotope components.

    This is a thin wrapper which passes `components` directly to
    :func:`pyrolite.mineral.transform.merge_formulae`.

    Parameters
    ----------
    components : :class:`list`
        Atomic, isotope or molecular components to construct an ionic molecule from.

    Returns
    -------
    :class:`~periodictable.formulas.Formula`

    Todo
    -----
    * Modify to accept consumption of molecular components (e.g. Fe2O3+)

    See Also
    ---------
    :func:`pyrolite.mineral.transform.merge_formulae`
    """
    return merge_formulae(components)