Source code for interferences.table.store

import os
import pandas as pd
import pathlib
from ..util.meta import interferences_datafolder
from ..util.mz import process_window
from .molecules import deduplicate, _find_duplicate_multiples
from ..util.log import Handle

logger = Handle(__name__)

# Default compression settings, used as the default ``complevel``/``complib``
# arguments of the store functions in this module.
_COMPLEVEL = 4
_COMPLIB = "lzo"
# Passed as ``min_itemsize`` when writing the table; keyed by the names of the
# two index levels ("elements" and "parts").
_ITEMSIZES = {"elements": 30, "parts": 40}


def load_store(path=None, complevel=_COMPLEVEL, complib=_COMPLIB, **kwargs):
    """
    Load the interferences HDF store.

    Parameters
    ----------
    path : :class:`str` | :class:`pathlib.Path`
        Path to the store. Where not given, ``interferences.h5`` within the
        "table" subfolder of the interferences data folder is used.
    complevel : :class:`int`
        Compression level option for the HDF store. Uncompressed tables can
        easily reach a few hundred MB - this isn't an issue on a local disk,
        but can be limiting for web transfer.
    complib : :class:`str`
        Which compression library to use.
    **kwargs
        Passed through to :class:`pandas.HDFStore`.

    Returns
    -------
    :class:`pandas.HDFStore`
    """
    # coerce to a Path so that string paths (as documented) support `.exists()`
    path = pathlib.Path(
        path or interferences_datafolder(subfolder="table") / "interferences.h5"
    )
    if not path.exists():
        # initialise the table so that there is a store to open
        reset_table(path=path, complevel=complevel, complib=complib, remove=False)
    return pd.HDFStore(path, complevel=complevel, complib=complib, **kwargs)
def lookup_components(identifier, path=None, key="table", window=None, **kwargs):
    """
    Look up a list of components from the store based on their identifiers.

    Parameters
    ----------
    identifier : :class:`str` | :class:`list` | :class:`pandas.Index`
        Identifier(s) for the components to look up. A string, or a list or
        index containing a single item, is queried directly against the
        ``elements`` column. Anything else is treated as a collection of
        identifiers, and the selected rows are filtered to those identifiers
        after the query.
    path : :class:`str` | :class:`pathlib.Path`
        Path to store to search.
    key : :class:`str`
        Key for the table within the store.
    window : :class:`tuple`
        Window for indexing along m/z to return a subset of results.
    **kwargs
        Passed through to :func:`load_store`.

    Returns
    -------
    :class:`pandas.DataFrame`

    Raises
    ------
    IndexError
        If a single identifier is requested and no rows in the table match it.
    """
    logger.debug("Attempting identifier lookup.")
    window = process_window(window)
    name = "/" + key

    if isinstance(identifier, str):
        multi_lookup = False
    elif isinstance(identifier, (list, pd.Index)) and len(identifier) == 1:
        multi_lookup = False
        identifier = identifier[0]  # unwrap the single item
    else:
        multi_lookup = True

    with load_store(path, **kwargs) as store:
        where = []
        df = None
        if not multi_lookup:
            where.append("elements == '{}'".format(identifier))
            # Query on the identifier alone first, so that a missing identifier
            # can be told apart from a window which excludes all of its rows.
            df = store.select(name, where=where[0])
        has_window = bool(window)
        if has_window:
            # add the m_z window information
            # NOTE(review): '{:5f}' is a minimum field width of 5 with the
            # default precision; '{:.5f}' may have been intended - confirm.
            where.append("m_z >= {:5f} & m_z <= {:5f}".format(*window))

        query = " & ".join(where)
        msg = "Performing {}lookup".format("multi-" if multi_lookup else "")
        if query:
            msg += ": " + query
        logger.debug(msg)

        if df is not None and df.empty:
            raise IndexError("Identifer(s) not in table.")
        if df is None or has_window:
            # Only query again where there is a window to apply, or where
            # nothing has been selected yet (multi-identifier lookups).
            df = store.select(name, where=query or None)

    if multi_lookup:
        # keep the requested identifiers which are present, in requested order
        tbl_idents = set(df.index.droplevel("parts"))
        df = df.loc[[i for i in identifier if i in tbl_idents], :]
    return df
# Disabled handler left over from the end of lookup_components:
# except KeyError:
#     raise KeyError("Key not in HDFStore.")


def _get_default_multiindex():
    """
    Construct the empty two-level index used as a template for the table.

    Returns
    -------
    :class:`pandas.MultiIndex`
        Index with no entries, with levels named "elements" and "parts".
    """
    level_names = ["elements", "parts"]
    empty_levels = [[], []]  # one empty iterable per level
    return pd.MultiIndex.from_product(empty_levels, names=level_names)
def get_store_index(path, drop_first_level=True, **kwargs):
    """
    Read the index of the table held in a store.

    Parameters
    ----------
    path : :class:`str` | :class:`pathlib.Path`
        Path to the store to read.
    drop_first_level : :class:`bool`
        Whether to drop the "elements" level of the index.
    **kwargs
        Passed through to :class:`pandas.HDFStore`.

    Returns
    -------
    :class:`pandas.Index`
        Index of the "/table" node, or an empty index where the store does
        not contain that node.
    """
    with pd.HDFStore(path, **kwargs) as hdf:
        if "/table" not in hdf.keys():
            idx = _get_default_multiindex()  # nothing stored yet
        else:
            idx = hdf.select("/table", columns=["elements", "parts"]).index
    return idx.droplevel("elements") if drop_first_level else idx
def process_subtables(
    dfs,
    charges=None,
    dump=True,
    path=None,
    mode="a",
    data_columns=["elements", "m_z", "iso_abund_product"],
    complevel=_COMPLEVEL,
    complib=_COMPLIB,
    **kwargs
):
    """
    Process and optionally dump a set of subtables to file, appending to the
    hierarchically-indexed table.

    Parameters
    ----------
    dfs : :class:`list`(:class:`pandas.DataFrame`)
        Dataframes to dump. Each is expected to carry a ``name`` attribute,
        which is used as its "elements" identifier.
    charges : :class:`list`
        Charges used to create for the table.
    dump : :class:`bool`
        Whether to write the de-duplicated table to the store.
    path : :class:`str` | :class:`pathlib.Path`
        Path to the file to add the table to.
    mode : :class:`str`
        Mode for accessing the HDF file. Currently not used: the table is
        always written in append mode ("a"), so existing rows are kept.
    data_columns : :class:`list`
        List of columns to create an indexes for to allow query-by-data.
    complevel : :class:`int`
        Compression level option for the HDF store. Uncompressed tables can
        easily reach a few hundred MB - this isn't an issue on a local disk,
        but can be limiting for web transfer.
    complib : :class:`str`
        Which compression library to use.
    **kwargs
        Currently unused.

    Returns
    -------
    :class:`pandas.DataFrame`
        De-duplicated concatenated version of new tables.
    """
    path = path or interferences_datafolder(subfolder="table") / "interferences.h5"
    logger.debug("Checking Store")
    current_index = get_store_index(path).to_list()
    stored_parts = set(current_index)  # constant-time membership checks

    logger.debug("Combining DataFrames")
    df = pd.concat(dfs, axis=0, ignore_index=False)
    df.index.rename("parts", inplace=True)
    # repeat each subtable's name once per row of that subtable
    df["elements"] = [name for d in dfs for name in [d.name] * d.index.size]

    logger.debug("Deduplicating")
    # Remove duplicated indexes, keeping the first occurrence. Copy so that
    # later changes do not operate on a slice of `df`.
    output = df.loc[~df.index.duplicated(keep="first"), :].copy()
    # take the index from df, and the index from the store and combine them to
    # dedupe
    duplicates = _find_duplicate_multiples(
        pd.DataFrame(index=output.index.to_list() + current_index), charges=charges
    )
    new_duplicates = [i for i in duplicates if i in output.index]
    if new_duplicates:
        logger.debug(
            "Dropping duplicates from new table: {}".format(", ".join(new_duplicates))
        )
        output = output.drop(index=new_duplicates)
    store_duplicates = [i for i in duplicates if i in stored_parts]

    if dump:
        logger.debug("Reindexing")
        # create hierarchical indexes for a copy of the table to dump into the
        # store
        to_store = output.set_index("elements", append=True)
        to_store = to_store.reorder_levels(["elements", "parts"], axis=0)
        # somehow S[34]S[34]++ sneaks past
        if store_duplicates:
            logger.debug(
                "Removing duplicates from store: {}".format(", ".join(store_duplicates))
            )
            to_remove = set(store_duplicates)
            with pd.HDFStore(path) as store:
                # boolean mask over the stored rows, in stored order
                store.remove("table", where=[i in to_remove for i in current_index])
        logger.debug(
            "Dumping {} tables to HDF store.".format(
                ",".join(pd.unique(to_store.index.get_level_values("elements")))
            )
        )
        to_store.to_hdf(
            path,
            key="table",
            mode="a",
            append=True,
            format="table",
            data_columns=list(data_columns),
            min_itemsize=_ITEMSIZES,
            # honour the requested compression rather than the module defaults
            complevel=complevel,
            complib=complib,
        )
    # "elements" is only needed to build the hierarchical index for the store
    return output.drop(columns="elements")
def reset_table(
    path=None,
    remove=True,
    key="table",
    format="table",
    complevel=_COMPLEVEL,
    complib=_COMPLIB,
    **kwargs
):
    """
    Reset or remove a HDF store.

    Parameters
    ----------
    path : :class:`str` | :class:`pathlib.Path`
        Path to store. Where not given, ``interferences.h5`` within the
        "table" subfolder of the interferences data folder is used.
    remove : :class:`bool`
        Whether to remove the table from disk, if possible. Otherwise the
        store is overwritten with an empty table.
    key : :class:`str`
        Key for the table within the store.
    format : :class:`str`
        Format to set for the new tables.
    complevel : :class:`int`
        Compression level option for the HDF store. Uncompressed tables can
        easily reach a few hundred MB - this isn't an issue on a local disk,
        but can be limiting for web transfer.
    complib : :class:`str`
        Which compression library to use.
    **kwargs
        Passed through to :meth:`pandas.DataFrame.to_hdf` when resetting.
    """
    # coerce to a Path so that string paths (as documented) support `.parent`
    path = pathlib.Path(
        path or interferences_datafolder(subfolder="table") / "interferences.h5"
    )
    if not path.parent.exists():
        logger.debug("Creating folder for store.")
        # exist_ok guards against the folder appearing after the check above
        path.parent.mkdir(parents=True, exist_ok=True)

    if remove:
        logger.debug("Removing store.")
        try:
            os.remove(path)  # remove the file
        except FileNotFoundError:
            logger.debug("Store already removed or not present.")
    else:
        # keep table keys, set them to empty frames
        logger.debug("Resetting store table: {}/{}".format(path.name, key))
        # NOTE(review): process_subtables indexes an "iso_abund_product" data
        # column by default, whereas this template uses "iso_product" - confirm
        # which name is intended.
        df = pd.DataFrame(
            index=_get_default_multiindex(),
            columns=["m_z", "mass", "charge", "iso_product"],
        )
        # note - this will not work until a pytables bug is fixed,
        # where the table doesn't generate from an empty frame.
        df.to_hdf(
            path,
            key=key,
            format=format,
            mode="w",
            append=True,
            complevel=complevel,
            complib=complib,
            min_itemsize=_ITEMSIZES,
            **kwargs
        )