Diffstat (limited to 'silx/io/dictdump.py')
-rw-r--r--  silx/io/dictdump.py  |  421
1 file changed, 281 insertions(+), 140 deletions(-)
diff --git a/silx/io/dictdump.py b/silx/io/dictdump.py
index f2318e0..bbb244a 100644
--- a/silx/io/dictdump.py
+++ b/silx/io/dictdump.py
@@ -34,9 +34,11 @@ import sys
import h5py
from .configdict import ConfigDict
-from .utils import is_group
+from .utils import is_group, is_link, is_softlink, is_externallink
from .utils import is_file as is_h5_file_like
from .utils import open as h5open
+from .utils import h5py_read_dataset
+from .utils import H5pyAttributesReadWrapper
__authors__ = ["P. Knobel"]
__license__ = "MIT"
@@ -44,35 +46,24 @@ __date__ = "17/07/2018"
logger = logging.getLogger(__name__)
-string_types = (basestring,) if sys.version_info[0] == 2 else (str,) # noqa
+vlen_utf8 = h5py.special_dtype(vlen=str)
+vlen_bytes = h5py.special_dtype(vlen=bytes)
-def _prepare_hdf5_dataset(array_like):
+def _prepare_hdf5_write_value(array_like):
"""Cast a python object into a numpy array in a HDF5 friendly format.
:param array_like: Input dataset in a type that can be digested by
``numpy.array()`` (`str`, `list`, `numpy.ndarray`…)
:return: ``numpy.ndarray`` ready to be written as an HDF5 dataset
"""
- # simple strings
- if isinstance(array_like, string_types):
- array_like = numpy.string_(array_like)
-
- # Ensure our data is a numpy.ndarray
- if not isinstance(array_like, (numpy.ndarray, numpy.string_)):
- array = numpy.array(array_like)
+ array = numpy.asarray(array_like)
+ if numpy.issubdtype(array.dtype, numpy.bytes_):
+ return numpy.array(array_like, dtype=vlen_bytes)
+ elif numpy.issubdtype(array.dtype, numpy.str_):
+ return numpy.array(array_like, dtype=vlen_utf8)
else:
- array = array_like
-
- # handle list of strings or numpy array of strings
- if not isinstance(array, numpy.string_):
- data_kind = array.dtype.kind
- # unicode: convert to byte strings
- # (http://docs.h5py.org/en/latest/strings.html)
- if data_kind.lower() in ["s", "u"]:
- array = numpy.asarray(array, dtype=numpy.string_)
-
- return array
+ return array
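A minimal sketch of how the rewritten helper behaves (it is a private module function, shown here only for illustration; exact dtypes depend on the installed numpy/h5py):

    from silx.io.dictdump import _prepare_hdf5_write_value

    _prepare_hdf5_write_value(b"abc")        # bytes -> variable-length bytes dtype
    _prepare_hdf5_write_value(["a", "bc"])   # unicode strings -> variable-length UTF-8 dtype
    _prepare_hdf5_write_value([1, 2, 3])     # other types pass through numpy.asarray unchanged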
class _SafeH5FileWrite(object):
@@ -219,150 +210,145 @@ def dicttoh5(treedict, h5file, h5path='/',
h5f.create_group(h5path)
for key in filter(lambda k: not isinstance(k, tuple), treedict):
- if isinstance(treedict[key], dict) and len(treedict[key]):
+ key_is_group = isinstance(treedict[key], dict)
+ h5name = h5path + key
+
+ if key_is_group and treedict[key]:
# non-empty group: recurse
- dicttoh5(treedict[key], h5f, h5path + key,
+ dicttoh5(treedict[key], h5f, h5name,
overwrite_data=overwrite_data,
create_dataset_args=create_dataset_args)
+ continue
- elif treedict[key] is None or (isinstance(treedict[key], dict) and
- not len(treedict[key])):
- if (h5path + key) in h5f:
- if overwrite_data is True:
- del h5f[h5path + key]
- else:
- logger.warning('key (%s) already exists. '
- 'Not overwriting.' % (h5path + key))
- continue
- # Create empty group
- h5f.create_group(h5path + key)
+ if h5name in h5f:
+ # key already exists: delete or skip
+ if overwrite_data is True:
+ del h5f[h5name]
+ else:
+ logger.warning('key (%s) already exists. '
+ 'Not overwriting.' % (h5name))
+ continue
+
+ value = treedict[key]
+ if value is None or key_is_group:
+ # Create empty group
+ h5f.create_group(h5name)
+ elif is_link(value):
+ h5f[h5name] = value
else:
- ds = _prepare_hdf5_dataset(treedict[key])
+ data = _prepare_hdf5_write_value(value)
# can't apply filters on scalars (datasets with shape == () )
- if ds.shape == () or create_dataset_args is None:
- if h5path + key in h5f:
- if overwrite_data is True:
- del h5f[h5path + key]
- else:
- logger.warning('key (%s) already exists. '
- 'Not overwriting.' % (h5path + key))
- continue
-
- h5f.create_dataset(h5path + key,
- data=ds)
+ if data.shape == () or create_dataset_args is None:
+ h5f.create_dataset(h5name,
+ data=data)
else:
- if h5path + key in h5f:
- if overwrite_data is True:
- del h5f[h5path + key]
- else:
- logger.warning('key (%s) already exists. '
- 'Not overwriting.' % (h5path + key))
- continue
-
- h5f.create_dataset(h5path + key,
- data=ds,
+ h5f.create_dataset(h5name,
+ data=data,
**create_dataset_args)
# deal with h5 attributes which have tuples as keys in treedict
for key in filter(lambda k: isinstance(k, tuple), treedict):
- if (h5path + key[0]) not in h5f:
+ assert len(key) == 2, "attribute must be defined by 2 values"
+ h5name = h5path + key[0]
+ attr_name = key[1]
+
+ if h5name not in h5f:
# Create empty group if key for attr does not exist
- h5f.create_group(h5path + key[0])
+ h5f.create_group(h5name)
logger.warning(
"key (%s) does not exist. attr %s "
- "will be written to ." % (h5path + key[0], key[1])
+ "will be written to ." % (h5name, attr_name)
)
- if key[1] in h5f[h5path + key[0]].attrs:
+ if attr_name in h5f[h5name].attrs:
if not overwrite_data:
logger.warning(
"attribute %s@%s already exists. Not overwriting."
- "" % (h5path + key[0], key[1])
+ "" % (h5name, attr_name)
)
continue
# Write attribute
value = treedict[key]
+ data = _prepare_hdf5_write_value(value)
+ h5f[h5name].attrs[attr_name] = data
- # Makes list/tuple of str being encoded as vlen unicode array
- # Workaround for h5py<2.9.0 (e.g. debian 10).
- if (isinstance(value, (list, tuple)) and
- numpy.asarray(value).dtype.type == numpy.unicode_):
- value = numpy.array(value, dtype=h5py.special_dtype(vlen=str))
-
- h5f[h5path + key[0]].attrs[key[1]] = value
-
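For context, a short usage sketch of the reworked dicttoh5: values may now be h5py link objects, and 2-tuple keys still address attributes (the file name and keys below are illustrative):

    import h5py
    from silx.io.dictdump import dicttoh5

    treedict = {
        "group": {"dataset": [1, 2, 3]},             # written as /group/dataset
        "alias": h5py.SoftLink("/group/dataset"),    # written as a soft link
        ("group", "NX_class"): "NXcollection",       # attribute NX_class on /group
    }
    dicttoh5(treedict, "example.h5", mode="w")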
-def dicttonx(
- treedict,
- h5file,
- h5path="/",
- mode="w",
- overwrite_data=False,
- create_dataset_args=None,
-):
- """
- Write a nested dictionary to a HDF5 file, using string keys as member names.
- The NeXus convention is used to identify attributes with ``"@"`` character,
- therefor the dataset_names should not contain ``"@"``.
+def nexus_to_h5_dict(treedict, parents=tuple()):
+ """The following conversions are applied:
+ * key with "{name}@{attr_name}" notation: key converted to 2-tuple
+ * key with ">{url}" notation: strip ">" and convert value to
+ h5py.SoftLink or h5py.ExternalLink
:param treedict: Nested dictionary/tree structure with strings as keys
and array-like objects as leafs. The ``"/"`` character can be used
to define sub tree. The ``"@"`` character is used to write attributes.
+ The ``">"`` prefix is used to define links.
+ :param parents: Needed to resolve up-links (tuple of HDF5 group names)
- Detais on all other params can be found in doc of dicttoh5.
+ :rtype dict:
+ """
+ copy = dict()
+ for key, value in treedict.items():
+ if "@" in key:
+ key = tuple(key.rsplit("@", 1))
+ elif key.startswith(">"):
+ if isinstance(value, str):
+ key = key[1:]
+ first, sep, second = value.partition("::")
+ if sep:
+ value = h5py.ExternalLink(first, second)
+ else:
+ if ".." in first:
+ # Up-links not supported: make absolute
+ parts = []
+ for p in list(parents) + first.split("/"):
+ if not p or p == ".":
+ continue
+ elif p == "..":
+ parts.pop(-1)
+ else:
+ parts.append(p)
+ first = "/" + "/".join(parts)
+ value = h5py.SoftLink(first)
+ elif is_link(value):
+ key = key[1:]
+ if isinstance(value, dict):
+ copy[key] = nexus_to_h5_dict(value, parents=parents+(key,))
+ else:
+ copy[key] = value
+ return copy
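A brief sketch of the conversions performed by nexus_to_h5_dict (keys and paths are illustrative):

    from silx.io.dictdump import nexus_to_h5_dict

    nxdict = {
        "plot@signal": "y",                    # "@" notation -> 2-tuple key
        ">x": "../instrument/positioners/x",   # relative path -> absolute h5py.SoftLink
        ">ext": "other.h5::/entry/data",       # "::" separator -> h5py.ExternalLink
    }
    h5dict = nexus_to_h5_dict(nxdict, parents=("entry", "plot"))
    # h5dict[("plot", "signal")] == "y"
    # h5dict["x"]   -> h5py.SoftLink("/entry/instrument/positioners/x")
    # h5dict["ext"] -> h5py.ExternalLink("other.h5", "/entry/data")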
- Example::
- import numpy
- from silx.io.dictdump import dicttonx
+def h5_to_nexus_dict(treedict):
+ """The following conversions are applied:
+ * 2-tuple key: converted to string ("@" notation)
+ * h5py.Softlink value: converted to string (">" key prefix)
+ * h5py.ExternalLink value: converted to string (">" key prefix)
- gauss = {
- "entry":{
- "title":u"A plot of a gaussian",
- "plot": {
- "y": numpy.array([0.08, 0.19, 0.39, 0.66, 0.9, 1.,
- 0.9, 0.66, 0.39, 0.19, 0.08]),
- "x": numpy.arange(0,1.1,.1),
- "@signal": "y",
- "@axes": "x",
- "@NX_class":u"NXdata",
- "title:u"Gauss Plot",
- },
- "@NX_class":u"NXentry",
- "default":"plot",
- }
- "@NX_class": u"NXroot",
- "@default": "entry",
- }
+ :param treedict: Nested dictionary/tree structure with strings as keys
+ and array-like objects as leafs. The ``"/"`` character can be used
+ to define sub tree.
- dicttonx(gauss,"test.h5")
+ :rtype dict:
"""
-
- def copy_keys_keep_values(original):
- # create a new treedict with with modified keys but keep values
- copy = dict()
- for key, value in original.items():
- if "@" in key:
- newkey = tuple(key.rsplit("@", 1))
- else:
- newkey = key
- if isinstance(value, dict):
- copy[newkey] = copy_keys_keep_values(value)
- else:
- copy[newkey] = value
- return copy
-
- nxtreedict = copy_keys_keep_values(treedict)
- dicttoh5(
- nxtreedict,
- h5file,
- h5path=h5path,
- mode=mode,
- overwrite_data=overwrite_data,
- create_dataset_args=create_dataset_args,
- )
+ copy = dict()
+ for key, value in treedict.items():
+ if isinstance(key, tuple):
+ assert len(key)==2, "attribute must be defined by 2 values"
+ key = "%s@%s" % (key[0], key[1])
+ elif is_softlink(value):
+ key = ">" + key
+ value = value.path
+ elif is_externallink(value):
+ key = ">" + key
+ value = value.filename + "::" + value.path
+ if isinstance(value, dict):
+ copy[key] = h5_to_nexus_dict(value)
+ else:
+ copy[key] = value
+ return copy
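And the reverse direction: h5_to_nexus_dict folds tuple keys and link objects back into the string notation (values below are illustrative):

    import h5py
    from silx.io.dictdump import h5_to_nexus_dict

    h5dict = {
        ("plot", "signal"): "y",
        "x": h5py.SoftLink("/entry/instrument/positioners/x"),
    }
    nxdict = h5_to_nexus_dict(h5dict)
    # nxdict == {"plot@signal": "y",
    #            ">x": "/entry/instrument/positioners/x"}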
def _name_contains_string_in_list(name, strlist):
@@ -374,7 +360,31 @@ def _name_contains_string_in_list(name, strlist):
return False
-def h5todict(h5file, path="/", exclude_names=None, asarray=True):
+def _handle_error(mode: str, exception, msg: str, *args) -> None:
+ """Handle errors.
+
+ :param str mode: 'raise', 'log', 'ignore'
+ :param type exception: Exception class to use in 'raise' mode
+ :param str msg: Error message template
+ :param List[str] args: Arguments for error message template
+ """
+ if mode == 'ignore':
+ return # no-op
+ elif mode == 'log':
+ logger.error(msg, *args)
+ elif mode == 'raise':
+ raise exception(msg % args)
+ else:
+ raise ValueError("Unsupported error handling: %s" % mode)
+
+
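A tiny sketch of the three error-handling modes of this private helper (the exception type and message below are illustrative):

    from silx.io.dictdump import _handle_error

    _handle_error('ignore', KeyError, 'Path "%s" missing', '/a')  # no-op
    _handle_error('log', KeyError, 'Path "%s" missing', '/a')     # logger.error(...)
    _handle_error('raise', KeyError, 'Path "%s" missing', '/a')   # raises KeyError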
+def h5todict(h5file,
+ path="/",
+ exclude_names=None,
+ asarray=True,
+ dereference_links=True,
+ include_attributes=False,
+ errors='raise'):
"""Read a HDF5 file and return a nested dictionary with the complete file
structure and all data.
@@ -397,7 +407,7 @@ def h5todict(h5file, path="/", exclude_names=None, asarray=True):
.. note:: This function requires `h5py <http://www.h5py.org/>`_ to be
installed.
- .. note:: If you write a dictionary to a HDF5 file with
+ .. note:: If you write a dictionary to a HDF5 file with
:func:`dicttoh5` and then read it back with :func:`h5todict`, data
types are not preserved. All values are cast to numpy arrays before
being written to file, and they are read back as numpy arrays (or
@@ -412,28 +422,159 @@ def h5todict(h5file, path="/", exclude_names=None, asarray=True):
a string in this list will be ignored. Default is None (ignore nothing)
:param bool asarray: True (default) to read scalar as arrays, False to
read them as scalar
+ :param bool dereference_links: True (default) to dereference links, False
+ to preserve the link itself
+ :param bool include_attributes: True to include HDF5 attributes in the
+ result (as 2-tuple keys), False (default) to skip them
+ :param str errors: Handling of errors (HDF5 access issue, broken link,...):
+ - 'raise' (default): Raise an exception
+ - 'log': Log as errors
+ - 'ignore': Ignore errors
:return: Nested dictionary
"""
with _SafeH5FileRead(h5file) as h5f:
ddict = {}
- for key in h5f[path]:
+ if path not in h5f:
+ _handle_error(
+ errors, KeyError, 'Path "%s" does not exist in file.', path)
+ return ddict
+
+ try:
+ root = h5f[path]
+ except KeyError as e:
+ if not isinstance(h5f.get(path, getlink=True), h5py.HardLink):
+ _handle_error(errors,
+ KeyError,
+ 'Cannot retrieve path "%s" (broken link)',
+ path)
+ else:
+ _handle_error(errors, KeyError, ', '.join(e.args))
+ return ddict
+
+ # Read the attributes of the group
+ if include_attributes:
+ attrs = H5pyAttributesReadWrapper(root.attrs)
+ for aname, avalue in attrs.items():
+ ddict[("", aname)] = avalue
+ # Read the children of the group
+ for key in root:
if _name_contains_string_in_list(key, exclude_names):
continue
- if is_group(h5f[path + "/" + key]):
+ h5name = path + "/" + key
+ # Preserve HDF5 link when requested
+ if not dereference_links:
+ lnk = h5f.get(h5name, getlink=True)
+ if is_link(lnk):
+ ddict[key] = lnk
+ continue
+
+ try:
+ h5obj = h5f[h5name]
+ except KeyError as e:
+ if not isinstance(h5f.get(h5name, getlink=True), h5py.HardLink):
+ _handle_error(errors,
+ KeyError,
+ 'Cannot retrieve path "%s" (broken link)',
+ h5name)
+ else:
+ _handle_error(errors, KeyError, ', '.join(e.args))
+ continue
+
+ if is_group(h5obj):
+ # Child is an HDF5 group
ddict[key] = h5todict(h5f,
- path + "/" + key,
+ h5name,
exclude_names=exclude_names,
- asarray=asarray)
+ asarray=asarray,
+ dereference_links=dereference_links,
+ include_attributes=include_attributes)
else:
- # Read HDF5 datset
- data = h5f[path + "/" + key][()]
- if asarray: # Convert HDF5 dataset to numpy array
- data = numpy.array(data, copy=False)
- ddict[key] = data
-
+ # Child is an HDF5 dataset
+ try:
+ data = h5py_read_dataset(h5obj)
+ except OSError:
+ _handle_error(errors,
+ OSError,
+ 'Cannot retrieve dataset "%s"',
+ h5name)
+ else:
+ if asarray: # Convert HDF5 dataset to numpy array
+ data = numpy.array(data, copy=False)
+ ddict[key] = data
+ # Read the attributes of the child
+ if include_attributes:
+ attrs = H5pyAttributesReadWrapper(h5obj.attrs)
+ for aname, avalue in attrs.items():
+ ddict[(key, aname)] = avalue
return ddict
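A short usage sketch of the extended reader (the file and path names are illustrative):

    from silx.io.dictdump import h5todict

    ddict = h5todict("example.h5",
                     path="/entry",
                     dereference_links=False,   # keep h5py.SoftLink/ExternalLink objects
                     include_attributes=True,   # attributes returned as 2-tuple keys
                     errors='log')              # log broken links instead of raising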
+def dicttonx(treedict, h5file, h5path="/", **kw):
+ """
+ Write a nested dictionary to a HDF5 file, using string keys as member names.
+ The NeXus convention is used to identify attributes with ``"@"`` character,
+ therefore the dataset_names should not contain ``"@"``.
+
+ Similarly, links are identified by keys starting with the ``">"`` character.
+ The corresponding value can be a soft or external link.
+
+ :param treedict: Nested dictionary/tree structure with strings as keys
+ and array-like objects as leafs. The ``"/"`` character can be used
+ to define sub tree. The ``"@"`` character is used to write attributes.
+ The ``">"`` prefix is used to define links.
+
+ The named parameters are passed to dicttoh5.
+
+ Example::
+
+ import numpy
+ from silx.io.dictdump import dicttonx
+
+ gauss = {
+ "entry":{
+ "title":u"A plot of a gaussian",
+ "instrument": {
+ "@NX_class": u"NXinstrument",
+ "positioners": {
+ "@NX_class": u"NXCollection",
+ "x": numpy.arange(0,1.1,.1)
+ }
+ },
+ "plot": {
+ "y": numpy.array([0.08, 0.19, 0.39, 0.66, 0.9, 1.,
+ 0.9, 0.66, 0.39, 0.19, 0.08]),
+ ">x": "../instrument/positioners/x",
+ "@signal": "y",
+ "@axes": "x",
+ "@NX_class":u"NXdata",
+ "title:u"Gauss Plot",
+ },
+ "@NX_class": u"NXentry",
+ "default":"plot",
+ },
+ "@NX_class": u"NXroot",
+ "@default": "entry",
+ }
+
+ dicttonx(gauss,"test.h5")
+ """
+ parents = tuple(p for p in h5path.split("/") if p)
+ nxtreedict = nexus_to_h5_dict(treedict, parents=parents)
+ dicttoh5(nxtreedict, h5file, h5path=h5path, **kw)
+
+
+def nxtodict(h5file, **kw):
+ """Read a HDF5 file and return a nested dictionary with the complete file
+ structure and all data.
+
+ As opposed to h5todict, all keys will be strings and no h5py objects are
+ present in the tree.
+
+ The named parameters are passed to h5todict.
+ """
+ nxtreedict = h5todict(h5file, **kw)
+ return h5_to_nexus_dict(nxtreedict)
+
+
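A minimal round-trip sketch combining the two NeXus-flavoured helpers (the file name is illustrative; include_attributes is needed to read the attributes back):

    from silx.io.dictdump import dicttonx, nxtodict

    dicttonx({"entry": {"@NX_class": "NXentry", "data": [1, 2, 3]}}, "test.h5", mode="w")
    ddict = nxtodict("test.h5", include_attributes=True)
    # ddict["entry"]["@NX_class"] is expected to read back as "NXentry"
    # ddict["entry"]["data"] is read back as a numpy array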
def dicttojson(ddict, jsonfile, indent=None, mode="w"):
"""Serialize ``ddict`` as a JSON formatted stream to ``jsonfile``.