Source code for interferences.table.build

import pandas as pd
import numpy as np
import periodictable as pt
from tqdm import tqdm
from pyrolite.util.log import ToLogger
from .molecules import (
    molecule_from_components,
    get_molecule_labels,
    repr_formula,
    _get_isotope,
    deduplicate,
)
from .store import (
    load_store,
    process_subtables,
    lookup_components,
)
from .intensity import isotope_abundance_threshold, get_isotopic_abundance_product
from .combinations import get_elemental_combinations, component_subtable
from ..util.sorting import get_first_atom
from ..util.mz import process_window
from ..util.log import Handle

logger = Handle(__name__)


[docs]def build_table( elements=None, max_atoms=3, sortby=["m_z", "charge", "mass"], charges=[1, 2], add_labels=False, threshold=None, window=None, cache_results=True, ): """ Build the interferences table. Parameters ---------- elements : :class:`list` List of elements to include in the table. max_atoms : :class:`int` Largest size of molecule to build, in atoms. sortby : :class:`str` | :class:`list` Column or list of columns to sort the final table by. charges : :class:`list` ( :class:`int` ) Ionic charges to include in the model. add_labels : :class:`bool` Whether to produce molecule names which are nicely formatted. This takes additional computation time. threshold : :class:`float` Threshold for isotopic abundance for inclusion of low-abudance/non-stable isotopes. mass_window : :class:`tuple` Window of interest to filter out irrelevant examples (here a mass window, which directly translates to m/z window with z=1). cache_results : :class:`bool` Whether to store the results on disk for Todo ----- Consider options for parellizing this to reduce build time. This would allow larger molecules to be included. Invalid molecules (e.g. `H{2+}`) will currently be present, but will ideally be filtered out In some cases, mass peaks will be duplicated, and we want to keep the simplest version (e.g. `Ar[40]+` and `Ar[40]2{2+}`). We here remove duplicate mass peaks before sorting (i.e. take the first one, as higher charges would be penalised), but we could potentially add a check that both contain the same isotopic components for verificaiton (this would be slow..). While "m/z" would be an appropriate column name, it can't be used in HDF indexes. """ table = pd.DataFrame( columns=[ "m_z", "mass", "charge", "iso_product", ] ) # set numeric datatypes table = table.astype( { "mass": "float", "charge": "int8", "iso_product": "float", "m_z": "float", } ) window = process_window(window) # build up combinations of elements, forming the components column # this can't be split easily combinations = get_elemental_combinations(elements, max_atoms=max_atoms) logger.info("Building {:d} component combinations.".format(len(combinations))) # convert elements to pt.core.Element combinations = [ [get_first_atom(el) if isinstance(el, str) else el for el in components] for components in combinations ] identifiers = [ "-".join([repr(c) for c in components]) for components in combinations ] # do a loookup here for all the identifiers, then go build the unkonwn ones cached_combinations = [] try: lookup = lookup_components(identifiers) # ignore window here cached_combinations = list(pd.unique(lookup.index.get_level_values("elements"))) lookup = lookup.droplevel("elements") if window is not None: # process_window for lookup lookup = lookup.loc[lookup.m_z.between(*window), :] if charges is not None: # filter for charges too! lookup = lookup.loc[lookup.charge.isin(charges), :] # add the lookup table to the end table = pd.concat([table, lookup], axis=0, ignore_index=False) except KeyError as e: pytables_expect = "No object named /table in the file" if pytables_expect in str(e): logger.debug("Table not built - Defaulting to build-all.") build = identifiers else: raise e except IndexError as e: logger.debug("No identifiers in table - Defaulting to build-all.") build = identifiers if threshold is not None: # build them from scratch, as it's more difficult to filter after the fact cached_combinations = [] beyond_bounds = [] if window is not None: # check potential m_z relevance # check whether mz is within margin of target margin = 0.10 # 10% in_mass_bounds = lambda ID: any( [ window[0] * (1 - margin) < pt.formula(ID.replace("-", "")).mass / c < window[1] * (1 + margin) for c in charges ] ) beyond_bounds = [] beyond_bounds = [ID for ID in identifiers if not in_mass_bounds(ID)] if beyond_bounds: logger.debug( "Skipping tables outside m/z bounds {}.".format(",".join(beyond_bounds)) ) need_to_build = [ (ID, components) for (ID, components) in zip(identifiers, combinations) if (ID not in cached_combinations + beyond_bounds) ] new_tables = [] if need_to_build: progressbar = tqdm(need_to_build) # file=ToLogger(logger) barwidth = 16 + 3 * max_atoms progressbar.set_description(" " * barwidth) for ID, components in progressbar: # create the whole table, ignoring window, to dump into refernce. df = component_subtable(components, charges=charges, threshold=threshold) df.name = ID msg = "{} @ {:d} rows".format(ID, df.index.size) msg += " " * (barwidth - len(msg)) progressbar.set_description(msg) logger.debug("Building table for {} @ {:d} rows".format(ID, df.index.size)) new_tables.append(df) if new_tables: # append new dfs to the HDF store for later use # if we use threshold, we'll put an incomplete table into the reference store additions = process_subtables( new_tables, charges=charges, dump=(cache_results and (threshold is None)) ) # should de-duplicate the new_tables in this table # could rearrange and return deduped tables from dump_subtables if window is not None: additions = additions.loc[additions["m_z"].between(*window), :] table = pd.concat( [table, additions], axis=0, ignore_index=False, ) # filter out invalid entries, eg. H{2+} ############################################ # TODO # sort table ####################################################################### table.sort_values(sortby, axis="index", inplace=True) # additional columns ############################################################### if add_labels: # this step is string-operation intensive, and hence very slow logger.info("Adding labels to the table.") table["label"] = get_molecule_labels(table) # for consistency with prevsiouly serialized data: return table