diff options
Diffstat (limited to 'silx/io/dictdump.py')
-rw-r--r-- | silx/io/dictdump.py | 303 |
1 files changed, 228 insertions, 75 deletions
diff --git a/silx/io/dictdump.py b/silx/io/dictdump.py index bbb244a..e907668 100644 --- a/silx/io/dictdump.py +++ b/silx/io/dictdump.py @@ -26,6 +26,7 @@ by text strings to following file formats: `HDF5, INI, JSON` """ from collections import OrderedDict +from collections.abc import Mapping import json import logging import numpy @@ -34,11 +35,16 @@ import sys import h5py from .configdict import ConfigDict -from .utils import is_group, is_link, is_softlink, is_externallink +from .utils import is_group +from .utils import is_dataset +from .utils import is_link +from .utils import is_softlink +from .utils import is_externallink from .utils import is_file as is_h5_file_like from .utils import open as h5open from .utils import h5py_read_dataset from .utils import H5pyAttributesReadWrapper +from silx.utils.deprecation import deprecated_warning __authors__ = ["P. Knobel"] __license__ = "MIT" @@ -66,7 +72,7 @@ def _prepare_hdf5_write_value(array_like): return array -class _SafeH5FileWrite(object): +class _SafeH5FileWrite: """Context manager returning a :class:`h5py.File` object. If this object is initialized with a file path, we open the file @@ -82,7 +88,6 @@ class _SafeH5FileWrite(object): """ def __init__(self, h5file, mode="w"): """ - :param h5file: HDF5 file path or :class:`h5py.File` instance :param str mode: Can be ``"r+"`` (read/write, file must exist), ``"w"`` (write, existing file is lost), ``"w-"`` (write, fail if @@ -106,7 +111,7 @@ class _SafeH5FileWrite(object): self.h5file.close() -class _SafeH5FileRead(object): +class _SafeH5FileRead: """Context manager returning a :class:`h5py.File` or a :class:`silx.io.spech5.SpecH5` or a :class:`silx.io.fabioh5.File` object. @@ -136,18 +141,48 @@ class _SafeH5FileRead(object): self.h5file.close() +def _normalize_h5_path(h5root, h5path): + """ + :param h5root: File name or h5py-like File, Group or Dataset + :param str h5path: relative to ``h5root`` + :returns 2-tuple: (File or file object, h5path) + """ + if is_group(h5root): + group_name = h5root.name + if group_name == "/": + pass + elif h5path: + h5path = group_name + "/" + h5path + else: + h5path = group_name + h5file = h5root.file + elif is_dataset(h5root): + h5path = h5root.name + h5file = h5root.file + else: + h5file = h5root + if not h5path: + h5path = "/" + elif not h5path.endswith("/"): + h5path += "/" + return h5file, h5path + + def dicttoh5(treedict, h5file, h5path='/', - mode="w", overwrite_data=False, - create_dataset_args=None): + mode="w", overwrite_data=None, + create_dataset_args=None, update_mode=None): """Write a nested dictionary to a HDF5 file, using keys as member names. If a dictionary value is a sub-dictionary, a group is created. If it is any other data type, it is cast into a numpy array and written as a :mod:`h5py` dataset. Dictionary keys must be strings and cannot contain the ``/`` character. - + If dictionary keys are tuples they are interpreted to set h5 attributes. - The tuples should have the format (dataset_name,attr_name) + The tuples should have the format (dataset_name, attr_name). + + Existing HDF5 items can be deleted by providing the dictionary value + ``None``, provided that ``update_mode in ["modify", "replace"]``. .. note:: @@ -158,21 +193,29 @@ def dicttoh5(treedict, h5file, h5path='/', to define sub trees. If tuples are used as keys they should have the format (dataset_name,attr_name) and will add a 5h attribute with the corresponding value. - :param h5file: HDF5 file name or handle. If a file name is provided, the - function opens the file in the specified mode and closes it again - before completing. - :param h5path: Target path in HDF5 file in which scan groups are created. + :param h5file: File name or h5py-like File, Group or Dataset + :param h5path: Target path in the HDF5 file relative to ``h5file``. Default is root (``"/"``) :param mode: Can be ``"r+"`` (read/write, file must exist), ``"w"`` (write, existing file is lost), ``"w-"`` (write, fail if exists) or ``"a"`` (read/write if exists, create otherwise). This parameter is ignored if ``h5file`` is a file handle. - :param overwrite_data: If ``True``, existing groups and datasets can be - overwritten, if ``False`` they are skipped. This parameter is only - relevant if ``h5file_mode`` is ``"r+"`` or ``"a"``. + :param overwrite_data: Deprecated. ``True`` is approximately equivalent + to ``update_mode="modify"`` and ``False`` is equivalent to + ``update_mode="add"``. :param create_dataset_args: Dictionary of args you want to pass to ``h5f.create_dataset``. This allows you to specify filters and compression parameters. Don't specify ``name`` and ``data``. + :param update_mode: Can be ``add`` (default), ``modify`` or ``replace``. + + * ``add``: Extend the existing HDF5 tree when possible. Existing HDF5 + items (groups, datasets and attributes) remain untouched. + * ``modify``: Extend the existing HDF5 tree when possible, modify + existing attributes, modify same-sized dataset values and delete + HDF5 items with a ``None`` value in the dict tree. + * ``replace``: Replace the existing HDF5 tree. Items from the root of + the HDF5 tree that are not present in the root of the dict tree + will remain untouched. Example:: @@ -201,44 +244,110 @@ def dicttoh5(treedict, h5file, h5path='/', create_dataset_args=create_ds_args) """ - if not h5path.endswith("/"): - h5path += "/" + if overwrite_data is not None: + reason = ( + "`overwrite_data=True` becomes `update_mode='modify'` and " + "`overwrite_data=False` becomes `update_mode='add'`" + ) + deprecated_warning( + type_="argument", + name="overwrite_data", + reason=reason, + replacement="update_mode", + since_version="0.15", + ) + + if update_mode is None: + if overwrite_data: + update_mode = "modify" + else: + update_mode = "add" + else: + valid_existing_values = ("add", "replace", "modify") + if update_mode not in valid_existing_values: + raise ValueError(( + "Argument 'update_mode' can only have values: {}" + "".format(valid_existing_values) + )) + if overwrite_data is not None: + logger.warning("The argument `overwrite_data` is ignored") - with _SafeH5FileWrite(h5file, mode=mode) as h5f: - if isinstance(treedict, dict) and h5path != "/": - if h5path not in h5f: - h5f.create_group(h5path) + if not isinstance(treedict, Mapping): + raise TypeError("'treedict' must be a dictionary") - for key in filter(lambda k: not isinstance(k, tuple), treedict): - key_is_group = isinstance(treedict[key], dict) - h5name = h5path + key + h5file, h5path = _normalize_h5_path(h5file, h5path) - if key_is_group and treedict[key]: - # non-empty group: recurse - dicttoh5(treedict[key], h5f, h5name, - overwrite_data=overwrite_data, - create_dataset_args=create_dataset_args) - continue + def _iter_treedict(attributes=False): + nonlocal treedict + for key, value in treedict.items(): + if isinstance(key, tuple) == attributes: + yield key, value - if h5name in h5f: - # key already exists: delete or skip - if overwrite_data is True: - del h5f[h5name] + change_allowed = update_mode in ("replace", "modify") + + with _SafeH5FileWrite(h5file, mode=mode) as h5f: + # Create the root of the tree + if h5path in h5f: + if not is_group(h5f[h5path]): + if update_mode == "replace": + del h5f[h5path] + h5f.create_group(h5path) else: - logger.warning('key (%s) already exists. ' - 'Not overwriting.' % (h5name)) - continue + return + else: + h5f.create_group(h5path) - value = treedict[key] + # Loop over all groups, links and datasets + for key, value in _iter_treedict(attributes=False): + h5name = h5path + key + exists = h5name in h5f - if value is None or key_is_group: - # Create empty group - h5f.create_group(h5name) + if value is None: + # Delete HDF5 item + if exists and change_allowed: + del h5f[h5name] + exists = False + elif isinstance(value, Mapping): + # HDF5 group + if exists and update_mode == "replace": + del h5f[h5name] + exists = False + if value: + dicttoh5(value, h5f, h5name, + update_mode=update_mode, + create_dataset_args=create_dataset_args) + elif not exists: + h5f.create_group(h5name) elif is_link(value): - h5f[h5name] = value + # HDF5 link + if exists and update_mode == "replace": + del h5f[h5name] + exists = False + if not exists: + # Create link from h5py link object + h5f[h5name] = value else: + # HDF5 dataset + if exists and not change_allowed: + continue data = _prepare_hdf5_write_value(value) - # can't apply filters on scalars (datasets with shape == () ) + + # Edit the existing dataset + attrs_backup = None + if exists: + try: + h5f[h5name][()] = data + continue + except Exception: + # Delete the existing dataset + if update_mode != "replace": + if not is_dataset(h5f[h5name]): + continue + attrs_backup = dict(h5f[h5name].attrs) + del h5f[h5name] + + # Create dataset + # can't apply filters on scalars (datasets with shape == ()) if data.shape == () or create_dataset_args is None: h5f.create_dataset(h5name, data=data) @@ -246,36 +355,58 @@ def dicttoh5(treedict, h5file, h5path='/', h5f.create_dataset(h5name, data=data, **create_dataset_args) + if attrs_backup: + h5f[h5name].attrs.update(attrs_backup) - # deal with h5 attributes which have tuples as keys in treedict - for key in filter(lambda k: isinstance(k, tuple), treedict): - assert len(key) == 2, "attribute must be defined by 2 values" + # Loop over all attributes + for key, value in _iter_treedict(attributes=True): + if len(key) != 2: + raise ValueError("HDF5 attribute must be described by 2 values") h5name = h5path + key[0] attr_name = key[1] if h5name not in h5f: - # Create empty group if key for attr does not exist + # Create an empty group to store the attribute h5f.create_group(h5name) - logger.warning( - "key (%s) does not exist. attr %s " - "will be written to ." % (h5name, attr_name) - ) - - if attr_name in h5f[h5name].attrs: - if not overwrite_data: - logger.warning( - "attribute %s@%s already exists. Not overwriting." - "" % (h5name, attr_name) - ) + + h5a = h5f[h5name].attrs + exists = attr_name in h5a + + if value is None: + # Delete HDF5 attribute + if exists and change_allowed: + del h5a[attr_name] + exists = False + else: + # Add/modify HDF5 attribute + if exists and not change_allowed: continue + data = _prepare_hdf5_write_value(value) + h5a[attr_name] = data - # Write attribute - value = treedict[key] - data = _prepare_hdf5_write_value(value) - h5f[h5name].attrs[attr_name] = data + +def _has_nx_class(treedict, key=""): + return key + "@NX_class" in treedict or \ + (key, "NX_class") in treedict + + +def _ensure_nx_class(treedict, parents=tuple()): + """Each group needs an "NX_class" attribute. + """ + if _has_nx_class(treedict): + return + nparents = len(parents) + if nparents == 0: + treedict[("", "NX_class")] = "NXroot" + elif nparents == 1: + treedict[("", "NX_class")] = "NXentry" + else: + treedict[("", "NX_class")] = "NXcollection" -def nexus_to_h5_dict(treedict, parents=tuple()): +def nexus_to_h5_dict( + treedict, parents=tuple(), add_nx_class=True, has_nx_class=False +): """The following conversions are applied: * key with "{name}@{attr_name}" notation: key converted to 2-tuple * key with ">{url}" notation: strip ">" and convert value to @@ -286,14 +417,20 @@ def nexus_to_h5_dict(treedict, parents=tuple()): to define sub tree. The ``"@"`` character is used to write attributes. The ``">"`` prefix is used to define links. :param parents: Needed to resolve up-links (tuple of HDF5 group names) + :param add_nx_class: Add "NX_class" attribute when missing + :param has_nx_class: The "NX_class" attribute is defined in the parent :rtype dict: """ + if not isinstance(treedict, Mapping): + raise TypeError("'treedict' must be a dictionary") copy = dict() for key, value in treedict.items(): if "@" in key: + # HDF5 attribute key = tuple(key.rsplit("@", 1)) elif key.startswith(">"): + # HDF5 link if isinstance(value, str): key = key[1:] first, sep, second = value.partition("::") @@ -314,10 +451,19 @@ def nexus_to_h5_dict(treedict, parents=tuple()): value = h5py.SoftLink(first) elif is_link(value): key = key[1:] - if isinstance(value, dict): - copy[key] = nexus_to_h5_dict(value, parents=parents+(key,)) + if isinstance(value, Mapping): + # HDF5 group + key_has_nx_class = add_nx_class and _has_nx_class(treedict, key) + copy[key] = nexus_to_h5_dict( + value, + parents=parents+(key,), + add_nx_class=add_nx_class, + has_nx_class=key_has_nx_class) else: + # HDF5 dataset or link copy[key] = value + if add_nx_class and not has_nx_class: + _ensure_nx_class(copy, parents) return copy @@ -336,7 +482,8 @@ def h5_to_nexus_dict(treedict): copy = dict() for key, value in treedict.items(): if isinstance(key, tuple): - assert len(key)==2, "attribute must be defined by 2 values" + if len(key) != 2: + raise ValueError("HDF5 attribute must be described by 2 values") key = "%s@%s" % (key[0], key[1]) elif is_softlink(value): key = ">" + key @@ -344,7 +491,7 @@ def h5_to_nexus_dict(treedict): elif is_externallink(value): key = ">" + key value = value.filename + "::" + value.path - if isinstance(value, dict): + if isinstance(value, Mapping): copy[key] = h5_to_nexus_dict(value) else: copy[key] = value @@ -414,10 +561,8 @@ def h5todict(h5file, scalars). In some cases, you may find that a list of heterogeneous data types is converted to a numpy array of strings. - :param h5file: File name or :class:`h5py.File` object or spech5 file or - fabioh5 file. - :param str path: Name of HDF5 group to use as dictionary root level, - to read only a sub-group in the file + :param h5file: File name or h5py-like File, Group or Dataset + :param str path: Target path in the HDF5 file relative to ``h5file`` :param List[str] exclude_names: Groups and datasets whose name contains a string in this list will be ignored. Default is None (ignore nothing) :param bool asarray: True (default) to read scalar as arrays, False to @@ -431,6 +576,7 @@ def h5todict(h5file, - 'ignore': Ignore errors :return: Nested dictionary """ + h5file, path = _normalize_h5_path(h5file, path) with _SafeH5FileRead(h5file) as h5f: ddict = {} if path not in h5f: @@ -508,7 +654,7 @@ def h5todict(h5file, return ddict -def dicttonx(treedict, h5file, h5path="/", **kw): +def dicttonx(treedict, h5file, h5path="/", add_nx_class=None, **kw): """ Write a nested dictionary to a HDF5 file, using string keys as member names. The NeXus convention is used to identify attributes with ``"@"`` character, @@ -521,6 +667,8 @@ def dicttonx(treedict, h5file, h5path="/", **kw): and array-like objects as leafs. The ``"/"`` character can be used to define sub tree. The ``"@"`` character is used to write attributes. The ``">"`` prefix is used to define links. + :param add_nx_class: Add "NX_class" attribute when missing. By default it + is ``True`` when ``update_mode`` is ``"add"`` or ``None``. The named parameters are passed to dicttoh5. @@ -557,12 +705,17 @@ def dicttonx(treedict, h5file, h5path="/", **kw): dicttonx(gauss,"test.h5") """ + h5file, h5path = _normalize_h5_path(h5file, h5path) parents = tuple(p for p in h5path.split("/") if p) - nxtreedict = nexus_to_h5_dict(treedict, parents=parents) + if add_nx_class is None: + add_nx_class = kw.get("update_mode", None) in (None, "add") + nxtreedict = nexus_to_h5_dict( + treedict, parents=parents, add_nx_class=add_nx_class + ) dicttoh5(nxtreedict, h5file, h5path=h5path, **kw) -def nxtodict(h5file, **kw): +def nxtodict(h5file, include_attributes=True, **kw): """Read a HDF5 file and return a nested dictionary with the complete file structure and all data. @@ -571,7 +724,7 @@ def nxtodict(h5file, **kw): The named parameters are passed to h5todict. """ - nxtreedict = h5todict(h5file, **kw) + nxtreedict = h5todict(h5file, include_attributes=include_attributes, **kw) return h5_to_nexus_dict(nxtreedict) |