diff options
Diffstat (limited to 'silx/io/dictdump.py')
-rw-r--r-- | silx/io/dictdump.py | 421 |
1 files changed, 281 insertions, 140 deletions
diff --git a/silx/io/dictdump.py b/silx/io/dictdump.py index f2318e0..bbb244a 100644 --- a/silx/io/dictdump.py +++ b/silx/io/dictdump.py @@ -34,9 +34,11 @@ import sys import h5py from .configdict import ConfigDict -from .utils import is_group +from .utils import is_group, is_link, is_softlink, is_externallink from .utils import is_file as is_h5_file_like from .utils import open as h5open +from .utils import h5py_read_dataset +from .utils import H5pyAttributesReadWrapper __authors__ = ["P. Knobel"] __license__ = "MIT" @@ -44,35 +46,24 @@ __date__ = "17/07/2018" logger = logging.getLogger(__name__) -string_types = (basestring,) if sys.version_info[0] == 2 else (str,) # noqa +vlen_utf8 = h5py.special_dtype(vlen=str) +vlen_bytes = h5py.special_dtype(vlen=bytes) -def _prepare_hdf5_dataset(array_like): +def _prepare_hdf5_write_value(array_like): """Cast a python object into a numpy array in a HDF5 friendly format. :param array_like: Input dataset in a type that can be digested by ``numpy.array()`` (`str`, `list`, `numpy.ndarray`…) :return: ``numpy.ndarray`` ready to be written as an HDF5 dataset """ - # simple strings - if isinstance(array_like, string_types): - array_like = numpy.string_(array_like) - - # Ensure our data is a numpy.ndarray - if not isinstance(array_like, (numpy.ndarray, numpy.string_)): - array = numpy.array(array_like) + array = numpy.asarray(array_like) + if numpy.issubdtype(array.dtype, numpy.bytes_): + return numpy.array(array_like, dtype=vlen_bytes) + elif numpy.issubdtype(array.dtype, numpy.str_): + return numpy.array(array_like, dtype=vlen_utf8) else: - array = array_like - - # handle list of strings or numpy array of strings - if not isinstance(array, numpy.string_): - data_kind = array.dtype.kind - # unicode: convert to byte strings - # (http://docs.h5py.org/en/latest/strings.html) - if data_kind.lower() in ["s", "u"]: - array = numpy.asarray(array, dtype=numpy.string_) - - return array + return array class _SafeH5FileWrite(object): @@ -219,150 +210,145 @@ def dicttoh5(treedict, h5file, h5path='/', h5f.create_group(h5path) for key in filter(lambda k: not isinstance(k, tuple), treedict): - if isinstance(treedict[key], dict) and len(treedict[key]): + key_is_group = isinstance(treedict[key], dict) + h5name = h5path + key + + if key_is_group and treedict[key]: # non-empty group: recurse - dicttoh5(treedict[key], h5f, h5path + key, + dicttoh5(treedict[key], h5f, h5name, overwrite_data=overwrite_data, create_dataset_args=create_dataset_args) + continue - elif treedict[key] is None or (isinstance(treedict[key], dict) and - not len(treedict[key])): - if (h5path + key) in h5f: - if overwrite_data is True: - del h5f[h5path + key] - else: - logger.warning('key (%s) already exists. ' - 'Not overwriting.' % (h5path + key)) - continue - # Create empty group - h5f.create_group(h5path + key) + if h5name in h5f: + # key already exists: delete or skip + if overwrite_data is True: + del h5f[h5name] + else: + logger.warning('key (%s) already exists. ' + 'Not overwriting.' % (h5name)) + continue + + value = treedict[key] + if value is None or key_is_group: + # Create empty group + h5f.create_group(h5name) + elif is_link(value): + h5f[h5name] = value else: - ds = _prepare_hdf5_dataset(treedict[key]) + data = _prepare_hdf5_write_value(value) # can't apply filters on scalars (datasets with shape == () ) - if ds.shape == () or create_dataset_args is None: - if h5path + key in h5f: - if overwrite_data is True: - del h5f[h5path + key] - else: - logger.warning('key (%s) already exists. ' - 'Not overwriting.' % (h5path + key)) - continue - - h5f.create_dataset(h5path + key, - data=ds) + if data.shape == () or create_dataset_args is None: + h5f.create_dataset(h5name, + data=data) else: - if h5path + key in h5f: - if overwrite_data is True: - del h5f[h5path + key] - else: - logger.warning('key (%s) already exists. ' - 'Not overwriting.' % (h5path + key)) - continue - - h5f.create_dataset(h5path + key, - data=ds, + h5f.create_dataset(h5name, + data=data, **create_dataset_args) # deal with h5 attributes which have tuples as keys in treedict for key in filter(lambda k: isinstance(k, tuple), treedict): - if (h5path + key[0]) not in h5f: + assert len(key) == 2, "attribute must be defined by 2 values" + h5name = h5path + key[0] + attr_name = key[1] + + if h5name not in h5f: # Create empty group if key for attr does not exist - h5f.create_group(h5path + key[0]) + h5f.create_group(h5name) logger.warning( "key (%s) does not exist. attr %s " - "will be written to ." % (h5path + key[0], key[1]) + "will be written to ." % (h5name, attr_name) ) - if key[1] in h5f[h5path + key[0]].attrs: + if attr_name in h5f[h5name].attrs: if not overwrite_data: logger.warning( "attribute %s@%s already exists. Not overwriting." - "" % (h5path + key[0], key[1]) + "" % (h5name, attr_name) ) continue # Write attribute value = treedict[key] + data = _prepare_hdf5_write_value(value) + h5f[h5name].attrs[attr_name] = data - # Makes list/tuple of str being encoded as vlen unicode array - # Workaround for h5py<2.9.0 (e.g. debian 10). - if (isinstance(value, (list, tuple)) and - numpy.asarray(value).dtype.type == numpy.unicode_): - value = numpy.array(value, dtype=h5py.special_dtype(vlen=str)) - - h5f[h5path + key[0]].attrs[key[1]] = value - -def dicttonx( - treedict, - h5file, - h5path="/", - mode="w", - overwrite_data=False, - create_dataset_args=None, -): - """ - Write a nested dictionary to a HDF5 file, using string keys as member names. - The NeXus convention is used to identify attributes with ``"@"`` character, - therefor the dataset_names should not contain ``"@"``. +def nexus_to_h5_dict(treedict, parents=tuple()): + """The following conversions are applied: + * key with "{name}@{attr_name}" notation: key converted to 2-tuple + * key with ">{url}" notation: strip ">" and convert value to + h5py.SoftLink or h5py.ExternalLink :param treedict: Nested dictionary/tree structure with strings as keys and array-like objects as leafs. The ``"/"`` character can be used to define sub tree. The ``"@"`` character is used to write attributes. + The ``">"`` prefix is used to define links. + :param parents: Needed to resolve up-links (tuple of HDF5 group names) - Detais on all other params can be found in doc of dicttoh5. + :rtype dict: + """ + copy = dict() + for key, value in treedict.items(): + if "@" in key: + key = tuple(key.rsplit("@", 1)) + elif key.startswith(">"): + if isinstance(value, str): + key = key[1:] + first, sep, second = value.partition("::") + if sep: + value = h5py.ExternalLink(first, second) + else: + if ".." in first: + # Up-links not supported: make absolute + parts = [] + for p in list(parents) + first.split("/"): + if not p or p == ".": + continue + elif p == "..": + parts.pop(-1) + else: + parts.append(p) + first = "/" + "/".join(parts) + value = h5py.SoftLink(first) + elif is_link(value): + key = key[1:] + if isinstance(value, dict): + copy[key] = nexus_to_h5_dict(value, parents=parents+(key,)) + else: + copy[key] = value + return copy - Example:: - import numpy - from silx.io.dictdump import dicttonx +def h5_to_nexus_dict(treedict): + """The following conversions are applied: + * 2-tuple key: converted to string ("@" notation) + * h5py.Softlink value: converted to string (">" key prefix) + * h5py.ExternalLink value: converted to string (">" key prefix) - gauss = { - "entry":{ - "title":u"A plot of a gaussian", - "plot": { - "y": numpy.array([0.08, 0.19, 0.39, 0.66, 0.9, 1., - 0.9, 0.66, 0.39, 0.19, 0.08]), - "x": numpy.arange(0,1.1,.1), - "@signal": "y", - "@axes": "x", - "@NX_class":u"NXdata", - "title:u"Gauss Plot", - }, - "@NX_class":u"NXentry", - "default":"plot", - } - "@NX_class": u"NXroot", - "@default": "entry", - } + :param treedict: Nested dictionary/tree structure with strings as keys + and array-like objects as leafs. The ``"/"`` character can be used + to define sub tree. - dicttonx(gauss,"test.h5") + :rtype dict: """ - - def copy_keys_keep_values(original): - # create a new treedict with with modified keys but keep values - copy = dict() - for key, value in original.items(): - if "@" in key: - newkey = tuple(key.rsplit("@", 1)) - else: - newkey = key - if isinstance(value, dict): - copy[newkey] = copy_keys_keep_values(value) - else: - copy[newkey] = value - return copy - - nxtreedict = copy_keys_keep_values(treedict) - dicttoh5( - nxtreedict, - h5file, - h5path=h5path, - mode=mode, - overwrite_data=overwrite_data, - create_dataset_args=create_dataset_args, - ) + copy = dict() + for key, value in treedict.items(): + if isinstance(key, tuple): + assert len(key)==2, "attribute must be defined by 2 values" + key = "%s@%s" % (key[0], key[1]) + elif is_softlink(value): + key = ">" + key + value = value.path + elif is_externallink(value): + key = ">" + key + value = value.filename + "::" + value.path + if isinstance(value, dict): + copy[key] = h5_to_nexus_dict(value) + else: + copy[key] = value + return copy def _name_contains_string_in_list(name, strlist): @@ -374,7 +360,31 @@ def _name_contains_string_in_list(name, strlist): return False -def h5todict(h5file, path="/", exclude_names=None, asarray=True): +def _handle_error(mode: str, exception, msg: str, *args) -> None: + """Handle errors. + + :param str mode: 'raise', 'log', 'ignore' + :param type exception: Exception class to use in 'raise' mode + :param str msg: Error message template + :param List[str] args: Arguments for error message template + """ + if mode == 'ignore': + return # no-op + elif mode == 'log': + logger.error(msg, *args) + elif mode == 'raise': + raise exception(msg % args) + else: + raise ValueError("Unsupported error handling: %s" % mode) + + +def h5todict(h5file, + path="/", + exclude_names=None, + asarray=True, + dereference_links=True, + include_attributes=False, + errors='raise'): """Read a HDF5 file and return a nested dictionary with the complete file structure and all data. @@ -397,7 +407,7 @@ def h5todict(h5file, path="/", exclude_names=None, asarray=True): .. note:: This function requires `h5py <http://www.h5py.org/>`_ to be installed. - .. note:: If you write a dictionary to a HDF5 file with + .. note:: If you write a dictionary to a HDF5 file with :func:`dicttoh5` and then read it back with :func:`h5todict`, data types are not preserved. All values are cast to numpy arrays before being written to file, and they are read back as numpy arrays (or @@ -412,28 +422,159 @@ def h5todict(h5file, path="/", exclude_names=None, asarray=True): a string in this list will be ignored. Default is None (ignore nothing) :param bool asarray: True (default) to read scalar as arrays, False to read them as scalar + :param bool dereference_links: True (default) to dereference links, False + to preserve the link itself + :param bool include_attributes: False (default) + :param str errors: Handling of errors (HDF5 access issue, broken link,...): + - 'raise' (default): Raise an exception + - 'log': Log as errors + - 'ignore': Ignore errors :return: Nested dictionary """ with _SafeH5FileRead(h5file) as h5f: ddict = {} - for key in h5f[path]: + if path not in h5f: + _handle_error( + errors, KeyError, 'Path "%s" does not exist in file.', path) + return ddict + + try: + root = h5f[path] + except KeyError as e: + if not isinstance(h5f.get(path, getlink=True), h5py.HardLink): + _handle_error(errors, + KeyError, + 'Cannot retrieve path "%s" (broken link)', + path) + else: + _handle_error(errors, KeyError, ', '.join(e.args)) + return ddict + + # Read the attributes of the group + if include_attributes: + attrs = H5pyAttributesReadWrapper(root.attrs) + for aname, avalue in attrs.items(): + ddict[("", aname)] = avalue + # Read the children of the group + for key in root: if _name_contains_string_in_list(key, exclude_names): continue - if is_group(h5f[path + "/" + key]): + h5name = path + "/" + key + # Preserve HDF5 link when requested + if not dereference_links: + lnk = h5f.get(h5name, getlink=True) + if is_link(lnk): + ddict[key] = lnk + continue + + try: + h5obj = h5f[h5name] + except KeyError as e: + if not isinstance(h5f.get(h5name, getlink=True), h5py.HardLink): + _handle_error(errors, + KeyError, + 'Cannot retrieve path "%s" (broken link)', + h5name) + else: + _handle_error(errors, KeyError, ', '.join(e.args)) + continue + + if is_group(h5obj): + # Child is an HDF5 group ddict[key] = h5todict(h5f, - path + "/" + key, + h5name, exclude_names=exclude_names, - asarray=asarray) + asarray=asarray, + dereference_links=dereference_links, + include_attributes=include_attributes) else: - # Read HDF5 datset - data = h5f[path + "/" + key][()] - if asarray: # Convert HDF5 dataset to numpy array - data = numpy.array(data, copy=False) - ddict[key] = data - + # Child is an HDF5 dataset + try: + data = h5py_read_dataset(h5obj) + except OSError: + _handle_error(errors, + OSError, + 'Cannot retrieve dataset "%s"', + h5name) + else: + if asarray: # Convert HDF5 dataset to numpy array + data = numpy.array(data, copy=False) + ddict[key] = data + # Read the attributes of the child + if include_attributes: + attrs = H5pyAttributesReadWrapper(h5obj.attrs) + for aname, avalue in attrs.items(): + ddict[(key, aname)] = avalue return ddict +def dicttonx(treedict, h5file, h5path="/", **kw): + """ + Write a nested dictionary to a HDF5 file, using string keys as member names. + The NeXus convention is used to identify attributes with ``"@"`` character, + therefore the dataset_names should not contain ``"@"``. + + Similarly, links are identified by keys starting with the ``">"`` character. + The corresponding value can be a soft or external link. + + :param treedict: Nested dictionary/tree structure with strings as keys + and array-like objects as leafs. The ``"/"`` character can be used + to define sub tree. The ``"@"`` character is used to write attributes. + The ``">"`` prefix is used to define links. + + The named parameters are passed to dicttoh5. + + Example:: + + import numpy + from silx.io.dictdump import dicttonx + + gauss = { + "entry":{ + "title":u"A plot of a gaussian", + "instrument": { + "@NX_class": u"NXinstrument", + "positioners": { + "@NX_class": u"NXCollection", + "x": numpy.arange(0,1.1,.1) + } + } + "plot": { + "y": numpy.array([0.08, 0.19, 0.39, 0.66, 0.9, 1., + 0.9, 0.66, 0.39, 0.19, 0.08]), + ">x": "../instrument/positioners/x", + "@signal": "y", + "@axes": "x", + "@NX_class":u"NXdata", + "title:u"Gauss Plot", + }, + "@NX_class": u"NXentry", + "default":"plot", + } + "@NX_class": u"NXroot", + "@default": "entry", + } + + dicttonx(gauss,"test.h5") + """ + parents = tuple(p for p in h5path.split("/") if p) + nxtreedict = nexus_to_h5_dict(treedict, parents=parents) + dicttoh5(nxtreedict, h5file, h5path=h5path, **kw) + + +def nxtodict(h5file, **kw): + """Read a HDF5 file and return a nested dictionary with the complete file + structure and all data. + + As opposed to h5todict, all keys will be strings and no h5py objects are + present in the tree. + + The named parameters are passed to h5todict. + """ + nxtreedict = h5todict(h5file, **kw) + return h5_to_nexus_dict(nxtreedict) + + def dicttojson(ddict, jsonfile, indent=None, mode="w"): """Serialize ``ddict`` as a JSON formatted stream to ``jsonfile``. |