1 files changed, 281 insertions, 140 deletions
diff --git a/silx/io/dictdump.py b/silx/io/dictdump.py
index f2318e0..bbb244a 100644
--- a/silx/io/dictdump.py
+++ b/silx/io/dictdump.py
@@ -34,9 +34,11 @@ import sys
 import h5py
 
 from .configdict import ConfigDict
-from .utils import is_group
+from .utils import is_group, is_link, is_softlink, is_externallink
 from .utils import is_file as is_h5_file_like
 from .utils import open as h5open
+from .utils import h5py_read_dataset
+from .utils import H5pyAttributesReadWrapper
 
 __authors__ = ["P. Knobel"]
 __license__ = "MIT"
@@ -44,35 +46,24 @@ __date__ = "17/07/2018"
 
 logger = logging.getLogger(__name__)
 
-string_types = (basestring,) if sys.version_info[0] == 2 else (str,)    # noqa
+vlen_utf8 = h5py.special_dtype(vlen=str)
+vlen_bytes = h5py.special_dtype(vlen=bytes)
 
 
-def _prepare_hdf5_dataset(array_like):
+def _prepare_hdf5_write_value(array_like):
     """Cast a python object into a numpy array in a HDF5 friendly format.
 
     :param array_like: Input dataset in a type that can be digested by
         ``numpy.array()`` (`str`, `list`, `numpy.ndarray`…)
     :return: ``numpy.ndarray`` ready to be written as an HDF5 dataset
     """
-    # simple strings
-    if isinstance(array_like, string_types):
-        array_like = numpy.string_(array_like)
-
-    # Ensure our data is a numpy.ndarray
-    if not isinstance(array_like, (numpy.ndarray, numpy.string_)):
-        array = numpy.array(array_like)
+    array = numpy.asarray(array_like)
+    if numpy.issubdtype(array.dtype, numpy.bytes_):
+        return numpy.array(array_like, dtype=vlen_bytes)
+    elif numpy.issubdtype(array.dtype, numpy.str_):
+        return numpy.array(array_like, dtype=vlen_utf8)
     else:
-        array = array_like
-
-    # handle list of strings or numpy array of strings
-    if not isinstance(array, numpy.string_):
-        data_kind = array.dtype.kind
-        # unicode: convert to byte strings
-        # (http://docs.h5py.org/en/latest/strings.html)
-        if data_kind.lower() in ["s", "u"]:
-            array = numpy.asarray(array, dtype=numpy.string_)
-
-    return array
+        return array
 
 
 class _SafeH5FileWrite(object):
@@ -219,150 +210,145 @@ def dicttoh5(treedict, h5file, h5path='/',
                 h5f.create_group(h5path)
 
         for key in filter(lambda k: not isinstance(k, tuple), treedict):
-            if isinstance(treedict[key], dict) and len(treedict[key]):
+            key_is_group = isinstance(treedict[key], dict)
+            h5name = h5path + key
+
+            if key_is_group and treedict[key]:
                 # non-empty group: recurse
-                dicttoh5(treedict[key], h5f, h5path + key,
+                dicttoh5(treedict[key], h5f, h5name,
                          overwrite_data=overwrite_data,
                          create_dataset_args=create_dataset_args)
+                continue
 
-            elif treedict[key] is None or (isinstance(treedict[key], dict) and
-                                           not len(treedict[key])):
-                if (h5path + key) in h5f:
-                    if overwrite_data is True:
-                        del h5f[h5path + key]
-                    else:
-                        logger.warning('key (%s) already exists. '
-                                       'Not overwriting.' % (h5path + key))
-                        continue
-                # Create empty group
-                h5f.create_group(h5path + key)
+            if h5name in h5f:
+                # key already exists: delete or skip
+                if overwrite_data is True:
+                    del h5f[h5name]
+                else:
+                    logger.warning('key (%s) already exists. '
+                                    'Not overwriting.' % (h5name))
+                    continue
+
+            value = treedict[key]
 
+            if value is None or key_is_group:
+                # Create empty group
+                h5f.create_group(h5name)
+            elif is_link(value):
+                h5f[h5name] = value
             else:
-                ds = _prepare_hdf5_dataset(treedict[key])
+                data = _prepare_hdf5_write_value(value)
                 # can't apply filters on scalars (datasets with shape == () )
-                if ds.shape == () or create_dataset_args is None:
-                    if h5path + key in h5f:
-                        if overwrite_data is True:
-                            del h5f[h5path + key]
-                        else:
-                            logger.warning('key (%s) already exists. '
-                                           'Not overwriting.' % (h5path + key))
-                            continue
-
-                    h5f.create_dataset(h5path + key,
-                                       data=ds)
+                if data.shape == () or create_dataset_args is None:
+                    h5f.create_dataset(h5name,
+                                       data=data)
                 else:
-                    if h5path + key in h5f:
-                        if overwrite_data is True:
-                            del h5f[h5path + key]
-                        else:
-                            logger.warning('key (%s) already exists. '
-                                           'Not overwriting.' % (h5path + key))
-                            continue
-
-                    h5f.create_dataset(h5path + key,
-                                       data=ds,
+                    h5f.create_dataset(h5name,
+                                       data=data,
                                        **create_dataset_args)
 
         # deal with h5 attributes which have tuples as keys in treedict
         for key in filter(lambda k: isinstance(k, tuple), treedict):
-            if (h5path + key[0]) not in h5f:
+            assert len(key) == 2, "attribute must be defined by 2 values"
+            h5name = h5path + key[0]
+            attr_name = key[1]
+
+            if h5name not in h5f:
                 # Create empty group if key for attr does not exist
-                h5f.create_group(h5path + key[0])
+                h5f.create_group(h5name)
                 logger.warning(
                     "key (%s) does not exist. attr %s "
-                    "will be written to ." % (h5path + key[0], key[1])
+                    "will be written to ." % (h5name, attr_name)
                 )
 
-            if key[1] in h5f[h5path + key[0]].attrs:
+            if attr_name in h5f[h5name].attrs:
                 if not overwrite_data:
                     logger.warning(
                         "attribute %s@%s already exists. Not overwriting."
-                        "" % (h5path + key[0], key[1])
+                        "" % (h5name, attr_name)
                     )
                     continue
 
             # Write attribute
             value = treedict[key]
+            data = _prepare_hdf5_write_value(value)
+            h5f[h5name].attrs[attr_name] = data
 
-            # Makes list/tuple of str being encoded as vlen unicode array
-            # Workaround for h5py<2.9.0 (e.g. debian 10).
-            if (isinstance(value, (list, tuple)) and
-                    numpy.asarray(value).dtype.type == numpy.unicode_):
-                value = numpy.array(value, dtype=h5py.special_dtype(vlen=str))
-
-            h5f[h5path + key[0]].attrs[key[1]] = value
 
-
-def dicttonx(
-    treedict,
-    h5file,
-    h5path="/",
-    mode="w",
-    overwrite_data=False,
-    create_dataset_args=None,
-):
-    """
-    Write a nested dictionary to a HDF5 file, using string keys as member names.
-    The NeXus convention is used to identify attributes with ``"@"`` character,
-    therefor the dataset_names should not contain ``"@"``.
+def nexus_to_h5_dict(treedict, parents=tuple()):
+    """The following conversions are applied:
+        * key with "{name}@{attr_name}" notation: key converted to 2-tuple
+        * key with ">{url}" notation: strip ">" and convert value to
+                                      h5py.SoftLink or h5py.ExternalLink 
 
     :param treedict: Nested dictionary/tree structure with strings as keys
          and array-like objects as leafs. The ``"/"`` character can be used
          to define sub tree. The ``"@"`` character is used to write attributes.
+         The ``">"`` prefix is used to define links.
+    :param parents: Needed to resolve up-links (tuple of HDF5 group names)
 
-    Detais on all other params can be found in doc of dicttoh5.
+    :rtype: dict
+    """
+    copy = dict()
+    for key, value in treedict.items():
+        if "@" in key:
+            key = tuple(key.rsplit("@", 1))
+        elif key.startswith(">"):
+            if isinstance(value, str):
+                key = key[1:]
+                first, sep, second = value.partition("::")
+                if sep:
+                    value = h5py.ExternalLink(first, second)
+                else:
+                    if ".." in first:
+                        # Up-links not supported: make absolute. An absolute
+                        # target is resolved from the root, not from parents.
+                        base = [] if first.startswith("/") else list(parents)
+                        parts = []
+                        for p in "/".join(base + [first]).split("/"):
+                            if p == "..":
+                                parts.pop(-1)
+                            elif p and p != ".":
+                                parts.append(p)
+                        first = "/" + "/".join(parts)
+                    value = h5py.SoftLink(first)
+            elif is_link(value):
+                key = key[1:]
+        if isinstance(value, dict):
+            copy[key] = nexus_to_h5_dict(value, parents=parents+(key,))
+        else:
+            copy[key] = value
+    return copy
 
-    Example::
 
-        import numpy
-        from silx.io.dictdump import dicttonx
+def h5_to_nexus_dict(treedict):
+    """The following conversions are applied:
+        * 2-tuple key: converted to string ("@" notation)
+        * h5py.SoftLink value: converted to string (">" key prefix)
+        * h5py.ExternalLink value: converted to string (">" key prefix)
 
-        gauss = {
-            "entry":{
-                "title":u"A plot of a gaussian",
-                "plot": {
-                    "y": numpy.array([0.08, 0.19, 0.39, 0.66, 0.9, 1.,
-                                  0.9, 0.66, 0.39, 0.19, 0.08]),
-                    "x": numpy.arange(0,1.1,.1),
-                    "@signal": "y",
-                    "@axes": "x",
-                    "@NX_class":u"NXdata",
-                    "title:u"Gauss Plot",
-                 },
-                 "@NX_class":u"NXentry",
-                 "default":"plot", 
-            }
-            "@NX_class": u"NXroot",
-            "@default": "entry",
-        }
+    :param treedict: Nested dictionary/tree structure with strings as keys
+         and array-like objects as leafs. The ``"/"`` character can be used
+         to define sub tree.
 
-        dicttonx(gauss,"test.h5")
+    :rtype: dict
     """
-
-    def copy_keys_keep_values(original):
-        # create a new treedict with with modified keys but keep values
-        copy = dict()
-        for key, value in original.items():
-            if "@" in key:
-                newkey = tuple(key.rsplit("@", 1))
-            else:
-                newkey = key
-            if isinstance(value, dict):
-                copy[newkey] = copy_keys_keep_values(value)
-            else:
-                copy[newkey] = value
-        return copy
-
-    nxtreedict = copy_keys_keep_values(treedict)
-    dicttoh5(
-        nxtreedict,
-        h5file,
-        h5path=h5path,
-        mode=mode,
-        overwrite_data=overwrite_data,
-        create_dataset_args=create_dataset_args,
-    )
+    copy = dict()
+    for key, value in treedict.items():
+        if isinstance(key, tuple):
+            assert len(key)==2, "attribute must be defined by 2 values"
+            key = "%s@%s" % (key[0], key[1])
+        elif is_softlink(value):
+            key = ">" + key
+            value = value.path
+        elif is_externallink(value):
+            key = ">" + key
+            value = value.filename + "::" + value.path
+        if isinstance(value, dict):
+            copy[key] = h5_to_nexus_dict(value)
+        else:
+            copy[key] = value
+    return copy
 
 
 def _name_contains_string_in_list(name, strlist):
@@ -374,7 +360,31 @@ def _name_contains_string_in_list(name, strlist):
     return False
 
 
-def h5todict(h5file, path="/", exclude_names=None, asarray=True):
+def _handle_error(mode: str, exception, msg: str, *args) -> None:
+    """Handle errors.
+
+    :param str mode: 'raise', 'log', 'ignore'
+    :param type exception: Exception class to use in 'raise' mode
+    :param str msg: Error message template
+    :param List[str] args: Arguments for error message template
+    """
+    if mode == 'ignore':
+        return  # no-op
+    elif mode == 'log':
+        logger.error(msg, *args)
+    elif mode == 'raise':
+        raise exception(msg % args)
+    else:
+        raise ValueError("Unsupported error handling: %s" % mode)
+
+
+def h5todict(h5file,
+             path="/",
+             exclude_names=None,
+             asarray=True,
+             dereference_links=True,
+             include_attributes=False,
+             errors='raise'):
     """Read a HDF5 file and return a nested dictionary with the complete file
     structure and all data.
 
@@ -397,7 +407,7 @@ def h5todict(h5file, path="/", exclude_names=None, asarray=True):
     .. note:: This function requires `h5py <http://www.h5py.org/>`_ to be
         installed.
 
-    .. note:: If you write a dictionary to a HDF5 file with
+    .. note:: If you write a dictionary to a HDF5 file with
         :func:`dicttoh5` and then read it back with :func:`h5todict`, data
         types are not preserved. All values are cast to numpy arrays before
         being written to file, and they are read back as numpy arrays (or
@@ -412,28 +422,159 @@ def h5todict(h5file, path="/", exclude_names=None, asarray=True):
         a string in this list will be ignored. Default is None (ignore nothing)
     :param bool asarray: True (default) to read scalar as arrays, False to
         read them as scalar
+    :param bool dereference_links: True (default) to dereference links, False
+        to preserve the link itself
+    :param bool include_attributes: True to include attributes (default False)
+    :param str errors: Handling of errors (HDF5 access issue, broken link,...):
+        - 'raise' (default): Raise an exception
+        - 'log': Log as errors
+        - 'ignore': Ignore errors
     :return: Nested dictionary
     """
     with _SafeH5FileRead(h5file) as h5f:
         ddict = {}
-        for key in h5f[path]:
+        if path not in h5f:
+            _handle_error(
+                errors, KeyError, 'Path "%s" does not exist in file.', path)
+            return ddict
+
+        try:
+            root = h5f[path]
+        except KeyError as e:
+            if not isinstance(h5f.get(path, getlink=True), h5py.HardLink):
+                _handle_error(errors,
+                              KeyError,
+                              'Cannot retrieve path "%s" (broken link)',
+                              path)
+            else:
+                _handle_error(errors, KeyError, ', '.join(e.args))
+            return ddict
+
+        # Read the attributes of the group
+        if include_attributes:
+            attrs = H5pyAttributesReadWrapper(root.attrs)
+            for aname, avalue in attrs.items():
+                ddict[("", aname)] = avalue
+        # Read the children of the group
+        for key in root:
             if _name_contains_string_in_list(key, exclude_names):
                 continue
-            if is_group(h5f[path + "/" + key]):
+            h5name = path + "/" + key
+            # Preserve HDF5 link when requested
+            if not dereference_links:
+                lnk = h5f.get(h5name, getlink=True)
+                if is_link(lnk):
+                    ddict[key] = lnk
+                    continue
+
+            try:
+                h5obj = h5f[h5name]
+            except KeyError as e:
+                if not isinstance(h5f.get(h5name, getlink=True), h5py.HardLink):
+                    _handle_error(errors,
+                                  KeyError,
+                                  'Cannot retrieve path "%s" (broken link)',
+                                  h5name)
+                else:
+                    _handle_error(errors, KeyError, ', '.join(e.args))
+                continue
+
+            if is_group(h5obj):
+                # Child is an HDF5 group
                 ddict[key] = h5todict(h5f,
-                                      path + "/" + key,
+                                      h5name,
                                       exclude_names=exclude_names,
-                                      asarray=asarray)
+                                      asarray=asarray, errors=errors,
+                                      dereference_links=dereference_links,
+                                      include_attributes=include_attributes)
             else:
-                # Read HDF5 datset
-                data = h5f[path + "/" + key][()]
-                if asarray:  # Convert HDF5 dataset to numpy array
-                    data = numpy.array(data, copy=False)
-                ddict[key] = data
-
+                # Child is an HDF5 dataset
+                try:
+                    data = h5py_read_dataset(h5obj)
+                except OSError:
+                    _handle_error(errors,
+                                  OSError,
+                                  'Cannot retrieve dataset "%s"',
+                                  h5name)
+                else:
+                    if asarray:  # Convert HDF5 dataset to numpy array
+                        data = numpy.asarray(data)
+                    ddict[key] = data
+                    # Read the attributes of the child
+                    if include_attributes:
+                        attrs = H5pyAttributesReadWrapper(h5obj.attrs)
+                        for aname, avalue in attrs.items():
+                            ddict[(key, aname)] = avalue
     return ddict
 
 
+def dicttonx(treedict, h5file, h5path="/", **kw):
+    """
+    Write a nested dictionary to a HDF5 file, using string keys as member names.
+    The NeXus convention is used to identify attributes with ``"@"`` character,
+    therefore the dataset_names should not contain ``"@"``.
+
+    Similarly, links are identified by keys starting with the ``">"`` character.
+    The corresponding value can be a soft or external link.
+
+    :param treedict: Nested dictionary/tree structure with strings as keys
+         and array-like objects as leafs. The ``"/"`` character can be used
+         to define sub tree. The ``"@"`` character is used to write attributes.
+         The ``">"`` prefix is used to define links.
+
+    The named parameters are passed to dicttoh5.
+
+    Example::
+
+        import numpy
+        from silx.io.dictdump import dicttonx
+
+        gauss = {
+            "entry":{
+                "title":u"A plot of a gaussian",
+                "instrument": {
+                    "@NX_class": u"NXinstrument",
+                    "positioners": {
+                        "@NX_class": u"NXcollection",
+                        "x": numpy.arange(0,1.1,.1)
+                    }
+                },
+                "plot": {
+                    "y": numpy.array([0.08, 0.19, 0.39, 0.66, 0.9, 1.,
+                                  0.9, 0.66, 0.39, 0.19, 0.08]),
+                    ">x": "../instrument/positioners/x",
+                    "@signal": "y",
+                    "@axes": "x",
+                    "@NX_class":u"NXdata",
+                    "title": u"Gauss Plot",
+                 },
+                 "@NX_class": u"NXentry",
+                 "default":"plot",
+            },
+            "@NX_class": u"NXroot",
+            "@default": "entry",
+        }
+
+        dicttonx(gauss,"test.h5")
+    """
+    parents = tuple(p for p in h5path.split("/") if p)
+    nxtreedict = nexus_to_h5_dict(treedict, parents=parents)
+    dicttoh5(nxtreedict, h5file, h5path=h5path, **kw)
+
+
+def nxtodict(h5file, **kw):
+    """Read a HDF5 file and return a nested dictionary with the complete file
+    structure and all data.
+
+    As opposed to h5todict, all keys will be strings and no h5py objects are
+    present in the tree.
+
+    The named parameters are passed to h5todict.
+    """
+    nxtreedict = h5todict(h5file, **kw)
+    return h5_to_nexus_dict(nxtreedict)
+
+
 def dicttojson(ddict, jsonfile, indent=None, mode="w"):
     """Serialize ``ddict`` as a JSON formatted stream to ``jsonfile``.