# Source code for interferences.table.store

import os
import pandas as pd
import pathlib
from ..util.meta import interferences_datafolder
from ..util.mz import process_window
from .molecules import deduplicate, _find_duplicate_multiples
from ..util.log import Handle

# module-level logger, named after this module
logger = Handle(__name__)

# default compression level and library for the HDF store; these are used as the
# defaults for the `complevel` and `complib` arguments of the functions below
_COMPLEVEL = 4
_COMPLIB = "lzo"
# passed as `min_itemsize` when tables are written to the store
_ITEMSIZES = {"elements": 30, "parts": 40}


def load_store(path=None, complevel=_COMPLEVEL, complib=_COMPLIB, **kwargs):
    """
    Load the interferences HDF store, initialising it first if it is missing.

    Parameters
    ----------
    path : :class:`str` | :class:`pathlib.Path`
        Path to the store. Where not given, ``interferences.h5`` within the
        ``table`` subfolder of the interferences data folder is used.
    complevel : :class:`int`
        Compression level option for the HDF store. Uncompressed tables can easily
        reach a few hundred MB - this isn't an issue on a local disk, but can be
        limiting for web transfer.
    complib : :class:`str`
        Which compression library to use.
    **kwargs
        Passed on to :class:`pandas.HDFStore`.

    Returns
    -------
    :class:`pandas.HDFStore`
        The opened store. It is returned open, so the caller should close it
        (e.g. by using it as a context manager).
    """
    # coerce to a Path so that string paths (permitted above) support .exists()
    path = pathlib.Path(
        path or interferences_datafolder(subfolder="table") / "interferences.h5"
    )
    if not path.exists():
        # init table
        reset_table(path=path, complevel=complevel, complib=complib, remove=False)
    return pd.HDFStore(path, complevel=complevel, complib=complib, **kwargs)


def lookup_components(identifier, path=None, key="table", window=None, **kwargs):
    """
    Look up one or more components from the store based on their identifiers.

    Parameters
    ----------
    identifier : :class:`str` | :class:`list` | :class:`pandas.Index`
        Identifier(s) for the components to look up. A string, or a list/index
        containing a single item, is queried directly against the ``elements``
        column of the table. Anything else is treated as a collection of
        identifiers, which is used to subset the table after it has been read.
    path : :class:`str` | :class:`pathlib.Path`
        Path to store to search.
    key : :class:`str`
        Key for the table within the store.
    window : :class:`tuple`
        Window for indexing along m/z to return a subset of results.
    **kwargs
        Passed on to :func:`load_store`.

    Returns
    -------
    :class:`pandas.DataFrame`

    Raises
    ------
    IndexError
        If a single identifier is looked up and has no rows in the table.
    """
    logger.debug("Attempting identifier lookup.")
    window = process_window(window)
    name = "/" + key

    is_single_item = isinstance(identifier, (list, pd.Index)) and len(identifier) == 1
    multi_lookup = not (isinstance(identifier, str) or is_single_item)
    if is_single_item:
        identifier = identifier[0]  # unwrap one-item collections

    with load_store(path, **kwargs) as store:
        where = []
        df = None
        empty = False
        if not multi_lookup:
            where.append("elements == '{}'".format(identifier))
            # presence is checked without the window, so an identifier which is in
            # the table but outside of the window gives an empty frame, not an error
            matches = store.select(name, where=where[0])
            empty = matches.empty
            if not window:
                df = matches  # nothing further to filter on - reuse this result
        if window:  # add the m_z window information
            where.append("m_z >= {:5f} & m_z <= {:5f}".format(*window))

        query = " & ".join(where)
        msg = "Performing {}lookup".format("multi-" if multi_lookup else "")
        if query:
            msg += ": " + query
        logger.debug(msg)

        if empty:
            raise IndexError("Identifer(s) not in table.")
        if df is None:
            # an empty query means no filtering; pass None rather than ""
            df = store.select(name, where=query or None)

        if multi_lookup:
            # keep only requested identifiers which are present, in requested order
            tbl_idents = set(pd.unique(df.index.droplevel("parts")))
            df = df.loc[[i for i in identifier if i in tbl_idents], :]

    return df


def _get_default_multiindex():
    """
    Build an empty multi-index for the table.

    Returns
    -------
    :class:`pandas.MultiIndex`
    """
    return pd.MultiIndex.from_product([[], []], names=["elements", "parts"])


def get_store_index(path, drop_first_level=True, **kwargs):
    """
    Get the index of the ``/table`` table within a HDF store.

    Parameters
    ----------
    path : :class:`str` | :class:`pathlib.Path`
        Path to the store.
    drop_first_level : :class:`bool`
        Whether to drop the ``elements`` level of the index, leaving only the
        ``parts`` level.
    **kwargs
        Passed on to :class:`pandas.HDFStore`.

    Returns
    -------
    :class:`pandas.Index` | :class:`pandas.MultiIndex`
        Index of the table, or an empty index where the store has no ``/table``.
    """
    with pd.HDFStore(path, **kwargs) as store:
        if "/table" in store.keys():
            index = store.select("/table", columns=["elements", "parts"]).index
        else:
            index = _get_default_multiindex()  # empty index
    if drop_first_level:
        index = index.droplevel("elements")
    return index


def process_subtables(
    dfs,
    charges=None,
    dump=True,
    path=None,
    mode="a",
    data_columns=["elements", "m_z", "iso_abund_product"],
    complevel=_COMPLEVEL,
    complib=_COMPLIB,
    **kwargs
):
    """
    Process and optionally dump a set of subtables to file,
    appending to the hierarchically-indexed table.

    Parameters
    ----------
    dfs : :class:`list`(:class:`pandas.DataFrame`)
        Dataframes to dump. Each is expected to have a ``name`` attribute, which
        is used to label its rows in the ``elements`` index level.
    charges : :class:`list`
        Charges used to create for the table.
    dump : :class:`bool`
        Whether to write the de-duplicated table to the store.
    path : :class:`str` | :class:`pathlib.Path`
        Path to the file to add the table to.
    mode : :class:`str`
        Mode for accessing the HDF file. Currently not used - tables are always
        appended to the store (mode ``"a"``).
    data_columns : :class:`list`
        List of columns to create an indexes for to allow query-by-data.
    complevel : :class:`int`
        Compression level option for the HDF store. Uncompressed tables can easily
        reach a few hundred MB - this isn't an issue on a local disk, but can be
        limiting for web transfer.
    complib : :class:`str`
        Which compression library to use.
    **kwargs
        Currently not used.

    Returns
    -------
    :class:`pandas.DataFrame`
        De-duplicated concatenated version of new tables.
    """
    path = path or interferences_datafolder(subfolder="table") / "interferences.h5"
    logger.debug("Checking Store")
    current_index = get_store_index(path).to_list()
    logger.debug("Combining DataFrames")
    df = pd.concat(dfs, axis=0, ignore_index=False)
    df.index.rename("parts", inplace=True)
    # label every row with the name of the subtable it came from
    df["elements"] = [d.name for d in dfs for _ in range(d.index.size)]
    ####################################################################################
    logger.debug("Deduplicating")
    output = df.loc[~df.index.duplicated(keep="first"), :]  # remove duplicated indexes
    # take the index from df, and the index from the store and combine them to dedupe
    duplicates = _find_duplicate_multiples(
        pd.DataFrame(index=output.index.to_list() + current_index), charges=charges
    )
    new_duplicates = [i for i in duplicates if i in output.index]
    if new_duplicates:
        logger.debug(
            "Dropping duplicates from new table: {}".format(", ".join(new_duplicates))
        )
        # rebind rather than dropping in place, as output is a selection from df
        output = output.drop(index=new_duplicates)

    stored = set(current_index)  # hashed membership rather than scanning the list
    store_duplicates = [i for i in duplicates if i in stored]

    if dump:
        logger.debug("Reindexing")
        # create hierarchical indexes for a copy of the table to dump into the store
        to_store = output.set_index("elements", append=True)
        to_store = to_store.reorder_levels(["elements", "parts"], axis=0)
        # convert non-string. non-numerical objects to string
        # append to the existing dataframe
        # somehow S[34]S[34]++ sneaks past
        if store_duplicates:
            logger.debug(
                "Removing duplicates from store: {}".format(", ".join(store_duplicates))
            )
            to_remove = set(store_duplicates)
            with pd.HDFStore(path) as store:
                # boolean mask over the stored rows, in the order they were read
                store.remove("table", where=[i in to_remove for i in current_index])
        logger.debug(
            "Dumping {} tables to HDF store.".format(
                ",".join(pd.unique(to_store.index.get_level_values("elements")))
            )
        )
        to_store.to_hdf(
            path,
            key="table",
            mode="a",
            append=True,
            format="table",
            data_columns=data_columns,
            min_itemsize=_ITEMSIZES,
            # honour the compression options requested by the caller
            complevel=complevel,
            complib=complib,
        )
    # the elements column is only needed to build the stored index
    return output.drop(columns="elements")


def reset_table(
    path=None,
    remove=True,
    key="table",
    format="table",
    complevel=_COMPLEVEL,
    complib=_COMPLIB,
    **kwargs
):
    """
    Reset or remove a HDF store.

    Parameters
    ----------
    path : :class:`str` | :class:`pathlib.Path`
        Path to store. Where not given, ``interferences.h5`` within the ``table``
        subfolder of the interferences data folder is used.
    remove : :class:`bool`
        Whether to remove the table from disk, if possible. Otherwise the store
        is overwritten with an empty table.
    key : :class:`str`
        Key for the table within the store; only used where the store is reset
        rather than removed.
    format : :class:`str`
        Format to set for the new tables.
    complevel : :class:`int`
        Compression level option for the HDF store. Uncompressed tables can easily
        reach a few hundred MB - this isn't an issue on a local disk, but can be
        limiting for web transfer.
    complib : :class:`str`
        Which compression library to use.
    **kwargs
        Passed on to :meth:`pandas.DataFrame.to_hdf` where the store is reset.
    """
    # coerce to a Path so that string paths (permitted above) support .parent/.name
    path = pathlib.Path(
        path or interferences_datafolder(subfolder="table") / "interferences.h5"
    )
    if not path.parent.exists():
        logger.debug("Creating folder for store.")
        # exist_ok guards against the folder appearing between check and creation
        path.parent.mkdir(parents=True, exist_ok=True)  # ensure directory exists
    if remove:
        logger.debug("Removing store.")
        try:
            os.remove(path)  # remove the file
        except FileNotFoundError:
            logger.debug("Store already removed or not present.")
    else:  # keep table keys, set them to empty frames
        logger.debug("Resetting store table: {}/{}".format(path.name, key))
        df = pd.DataFrame(
            index=_get_default_multiindex(),
            columns=["m_z", "mass", "charge", "iso_product"],
        )
        # note - this will not work until a pytables bug is fixed,
        # where the table doesn't generate from an empty frame.
        df.to_hdf(
            path,
            key=key,
            format=format,
            mode="w",
            append=True,
            complevel=complevel,
            complib=complib,
            min_itemsize=_ITEMSIZES,
            **kwargs
        )