From b3bea947efa55d2c0f198b6c6795b3177be27f45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Picca=20Fr=C3=A9d=C3=A9ric-Emmanuel?= Date: Wed, 6 Jan 2021 14:10:12 +0100 Subject: New upstream version 0.14.0+dfsg --- silx/io/commonh5.py | 22 +- silx/io/dictdump.py | 421 +++++++++++++++++++++---------- silx/io/fabioh5.py | 10 +- silx/io/nxdata/parse.py | 4 +- silx/io/setup.py | 2 +- silx/io/specfile/src/locale_management.c | 5 +- silx/io/test/test_dictdump.py | 257 ++++++++++++++++--- silx/io/test/test_spectoh5.py | 3 +- silx/io/test/test_url.py | 10 + silx/io/test/test_utils.py | 244 ++++++++++++++++-- silx/io/url.py | 21 +- silx/io/utils.py | 331 ++++++++++++++++++++---- 12 files changed, 1071 insertions(+), 259 deletions(-) (limited to 'silx/io') diff --git a/silx/io/commonh5.py b/silx/io/commonh5.py index b624816..57232d8 100644 --- a/silx/io/commonh5.py +++ b/silx/io/commonh5.py @@ -1,6 +1,6 @@ # coding: utf-8 # /*########################################################################## -# Copyright (C) 2016-2019 European Synchrotron Radiation Facility +# Copyright (C) 2016-2020 European Synchrotron Radiation Facility # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -376,6 +376,24 @@ class Dataset(Node): There is no chunks.""" return None + @property + def is_virtual(self): + """Checks virtual data as provided by `h5py.Dataset`""" + return False + + def virtual_sources(self): + """Returns virtual dataset sources as provided by `h5py.Dataset`. + + :rtype: list""" + raise RuntimeError("Not a virtual dataset") + + @property + def external(self): + """Returns external sources as provided by `h5py.Dataset`. + + :rtype: list or None""" + return None + def __array__(self, dtype=None): # Special case for (0,)*-shape datasets if numpy.product(self.shape) == 0: @@ -958,7 +976,7 @@ class Group(Node): raise TypeError("Path are not supported") if data is None: if dtype is None: - dtype = numpy.float + dtype = numpy.float64 data = numpy.empty(shape=shape, dtype=dtype) elif dtype is not None: data = data.astype(dtype) diff --git a/silx/io/dictdump.py b/silx/io/dictdump.py index f2318e0..bbb244a 100644 --- a/silx/io/dictdump.py +++ b/silx/io/dictdump.py @@ -34,9 +34,11 @@ import sys import h5py from .configdict import ConfigDict -from .utils import is_group +from .utils import is_group, is_link, is_softlink, is_externallink from .utils import is_file as is_h5_file_like from .utils import open as h5open +from .utils import h5py_read_dataset +from .utils import H5pyAttributesReadWrapper __authors__ = ["P. Knobel"] __license__ = "MIT" @@ -44,35 +46,24 @@ __date__ = "17/07/2018" logger = logging.getLogger(__name__) -string_types = (basestring,) if sys.version_info[0] == 2 else (str,) # noqa +vlen_utf8 = h5py.special_dtype(vlen=str) +vlen_bytes = h5py.special_dtype(vlen=bytes) -def _prepare_hdf5_dataset(array_like): +def _prepare_hdf5_write_value(array_like): """Cast a python object into a numpy array in a HDF5 friendly format. 
:param array_like: Input dataset in a type that can be digested by ``numpy.array()`` (`str`, `list`, `numpy.ndarray`…) :return: ``numpy.ndarray`` ready to be written as an HDF5 dataset """ - # simple strings - if isinstance(array_like, string_types): - array_like = numpy.string_(array_like) - - # Ensure our data is a numpy.ndarray - if not isinstance(array_like, (numpy.ndarray, numpy.string_)): - array = numpy.array(array_like) + array = numpy.asarray(array_like) + if numpy.issubdtype(array.dtype, numpy.bytes_): + return numpy.array(array_like, dtype=vlen_bytes) + elif numpy.issubdtype(array.dtype, numpy.str_): + return numpy.array(array_like, dtype=vlen_utf8) else: - array = array_like - - # handle list of strings or numpy array of strings - if not isinstance(array, numpy.string_): - data_kind = array.dtype.kind - # unicode: convert to byte strings - # (http://docs.h5py.org/en/latest/strings.html) - if data_kind.lower() in ["s", "u"]: - array = numpy.asarray(array, dtype=numpy.string_) - - return array + return array class _SafeH5FileWrite(object): @@ -219,150 +210,145 @@ def dicttoh5(treedict, h5file, h5path='/', h5f.create_group(h5path) for key in filter(lambda k: not isinstance(k, tuple), treedict): - if isinstance(treedict[key], dict) and len(treedict[key]): + key_is_group = isinstance(treedict[key], dict) + h5name = h5path + key + + if key_is_group and treedict[key]: # non-empty group: recurse - dicttoh5(treedict[key], h5f, h5path + key, + dicttoh5(treedict[key], h5f, h5name, overwrite_data=overwrite_data, create_dataset_args=create_dataset_args) + continue - elif treedict[key] is None or (isinstance(treedict[key], dict) and - not len(treedict[key])): - if (h5path + key) in h5f: - if overwrite_data is True: - del h5f[h5path + key] - else: - logger.warning('key (%s) already exists. ' - 'Not overwriting.' % (h5path + key)) - continue - # Create empty group - h5f.create_group(h5path + key) + if h5name in h5f: + # key already exists: delete or skip + if overwrite_data is True: + del h5f[h5name] + else: + logger.warning('key (%s) already exists. ' + 'Not overwriting.' % (h5name)) + continue + + value = treedict[key] + if value is None or key_is_group: + # Create empty group + h5f.create_group(h5name) + elif is_link(value): + h5f[h5name] = value else: - ds = _prepare_hdf5_dataset(treedict[key]) + data = _prepare_hdf5_write_value(value) # can't apply filters on scalars (datasets with shape == () ) - if ds.shape == () or create_dataset_args is None: - if h5path + key in h5f: - if overwrite_data is True: - del h5f[h5path + key] - else: - logger.warning('key (%s) already exists. ' - 'Not overwriting.' % (h5path + key)) - continue - - h5f.create_dataset(h5path + key, - data=ds) + if data.shape == () or create_dataset_args is None: + h5f.create_dataset(h5name, + data=data) else: - if h5path + key in h5f: - if overwrite_data is True: - del h5f[h5path + key] - else: - logger.warning('key (%s) already exists. ' - 'Not overwriting.' 
% (h5path + key)) - continue - - h5f.create_dataset(h5path + key, - data=ds, + h5f.create_dataset(h5name, + data=data, **create_dataset_args) # deal with h5 attributes which have tuples as keys in treedict for key in filter(lambda k: isinstance(k, tuple), treedict): - if (h5path + key[0]) not in h5f: + assert len(key) == 2, "attribute must be defined by 2 values" + h5name = h5path + key[0] + attr_name = key[1] + + if h5name not in h5f: # Create empty group if key for attr does not exist - h5f.create_group(h5path + key[0]) + h5f.create_group(h5name) logger.warning( "key (%s) does not exist. attr %s " - "will be written to ." % (h5path + key[0], key[1]) + "will be written to ." % (h5name, attr_name) ) - if key[1] in h5f[h5path + key[0]].attrs: + if attr_name in h5f[h5name].attrs: if not overwrite_data: logger.warning( "attribute %s@%s already exists. Not overwriting." - "" % (h5path + key[0], key[1]) + "" % (h5name, attr_name) ) continue # Write attribute value = treedict[key] + data = _prepare_hdf5_write_value(value) + h5f[h5name].attrs[attr_name] = data - # Makes list/tuple of str being encoded as vlen unicode array - # Workaround for h5py<2.9.0 (e.g. debian 10). - if (isinstance(value, (list, tuple)) and - numpy.asarray(value).dtype.type == numpy.unicode_): - value = numpy.array(value, dtype=h5py.special_dtype(vlen=str)) - - h5f[h5path + key[0]].attrs[key[1]] = value - -def dicttonx( - treedict, - h5file, - h5path="/", - mode="w", - overwrite_data=False, - create_dataset_args=None, -): - """ - Write a nested dictionary to a HDF5 file, using string keys as member names. - The NeXus convention is used to identify attributes with ``"@"`` character, - therefor the dataset_names should not contain ``"@"``. +def nexus_to_h5_dict(treedict, parents=tuple()): + """The following conversions are applied: + * key with "{name}@{attr_name}" notation: key converted to 2-tuple + * key with ">{url}" notation: strip ">" and convert value to + h5py.SoftLink or h5py.ExternalLink :param treedict: Nested dictionary/tree structure with strings as keys and array-like objects as leafs. The ``"/"`` character can be used to define sub tree. The ``"@"`` character is used to write attributes. + The ``">"`` prefix is used to define links. + :param parents: Needed to resolve up-links (tuple of HDF5 group names) - Detais on all other params can be found in doc of dicttoh5. + :rtype dict: + """ + copy = dict() + for key, value in treedict.items(): + if "@" in key: + key = tuple(key.rsplit("@", 1)) + elif key.startswith(">"): + if isinstance(value, str): + key = key[1:] + first, sep, second = value.partition("::") + if sep: + value = h5py.ExternalLink(first, second) + else: + if ".." 
in first: + # Up-links not supported: make absolute + parts = [] + for p in list(parents) + first.split("/"): + if not p or p == ".": + continue + elif p == "..": + parts.pop(-1) + else: + parts.append(p) + first = "/" + "/".join(parts) + value = h5py.SoftLink(first) + elif is_link(value): + key = key[1:] + if isinstance(value, dict): + copy[key] = nexus_to_h5_dict(value, parents=parents+(key,)) + else: + copy[key] = value + return copy - Example:: - import numpy - from silx.io.dictdump import dicttonx +def h5_to_nexus_dict(treedict): + """The following conversions are applied: + * 2-tuple key: converted to string ("@" notation) + * h5py.Softlink value: converted to string (">" key prefix) + * h5py.ExternalLink value: converted to string (">" key prefix) - gauss = { - "entry":{ - "title":u"A plot of a gaussian", - "plot": { - "y": numpy.array([0.08, 0.19, 0.39, 0.66, 0.9, 1., - 0.9, 0.66, 0.39, 0.19, 0.08]), - "x": numpy.arange(0,1.1,.1), - "@signal": "y", - "@axes": "x", - "@NX_class":u"NXdata", - "title:u"Gauss Plot", - }, - "@NX_class":u"NXentry", - "default":"plot", - } - "@NX_class": u"NXroot", - "@default": "entry", - } + :param treedict: Nested dictionary/tree structure with strings as keys + and array-like objects as leafs. The ``"/"`` character can be used + to define sub tree. - dicttonx(gauss,"test.h5") + :rtype dict: """ - - def copy_keys_keep_values(original): - # create a new treedict with with modified keys but keep values - copy = dict() - for key, value in original.items(): - if "@" in key: - newkey = tuple(key.rsplit("@", 1)) - else: - newkey = key - if isinstance(value, dict): - copy[newkey] = copy_keys_keep_values(value) - else: - copy[newkey] = value - return copy - - nxtreedict = copy_keys_keep_values(treedict) - dicttoh5( - nxtreedict, - h5file, - h5path=h5path, - mode=mode, - overwrite_data=overwrite_data, - create_dataset_args=create_dataset_args, - ) + copy = dict() + for key, value in treedict.items(): + if isinstance(key, tuple): + assert len(key)==2, "attribute must be defined by 2 values" + key = "%s@%s" % (key[0], key[1]) + elif is_softlink(value): + key = ">" + key + value = value.path + elif is_externallink(value): + key = ">" + key + value = value.filename + "::" + value.path + if isinstance(value, dict): + copy[key] = h5_to_nexus_dict(value) + else: + copy[key] = value + return copy def _name_contains_string_in_list(name, strlist): @@ -374,7 +360,31 @@ def _name_contains_string_in_list(name, strlist): return False -def h5todict(h5file, path="/", exclude_names=None, asarray=True): +def _handle_error(mode: str, exception, msg: str, *args) -> None: + """Handle errors. + + :param str mode: 'raise', 'log', 'ignore' + :param type exception: Exception class to use in 'raise' mode + :param str msg: Error message template + :param List[str] args: Arguments for error message template + """ + if mode == 'ignore': + return # no-op + elif mode == 'log': + logger.error(msg, *args) + elif mode == 'raise': + raise exception(msg % args) + else: + raise ValueError("Unsupported error handling: %s" % mode) + + +def h5todict(h5file, + path="/", + exclude_names=None, + asarray=True, + dereference_links=True, + include_attributes=False, + errors='raise'): """Read a HDF5 file and return a nested dictionary with the complete file structure and all data. @@ -397,7 +407,7 @@ def h5todict(h5file, path="/", exclude_names=None, asarray=True): .. note:: This function requires `h5py `_ to be installed. - .. note:: If you write a dictionary to a HDF5 file with + .. 
note:: If you write a dictionary to a HDF5 file with :func:`dicttoh5` and then read it back with :func:`h5todict`, data types are not preserved. All values are cast to numpy arrays before being written to file, and they are read back as numpy arrays (or @@ -412,28 +422,159 @@ def h5todict(h5file, path="/", exclude_names=None, asarray=True): a string in this list will be ignored. Default is None (ignore nothing) :param bool asarray: True (default) to read scalar as arrays, False to read them as scalar + :param bool dereference_links: True (default) to dereference links, False + to preserve the link itself + :param bool include_attributes: False (default) + :param str errors: Handling of errors (HDF5 access issue, broken link,...): + - 'raise' (default): Raise an exception + - 'log': Log as errors + - 'ignore': Ignore errors :return: Nested dictionary """ with _SafeH5FileRead(h5file) as h5f: ddict = {} - for key in h5f[path]: + if path not in h5f: + _handle_error( + errors, KeyError, 'Path "%s" does not exist in file.', path) + return ddict + + try: + root = h5f[path] + except KeyError as e: + if not isinstance(h5f.get(path, getlink=True), h5py.HardLink): + _handle_error(errors, + KeyError, + 'Cannot retrieve path "%s" (broken link)', + path) + else: + _handle_error(errors, KeyError, ', '.join(e.args)) + return ddict + + # Read the attributes of the group + if include_attributes: + attrs = H5pyAttributesReadWrapper(root.attrs) + for aname, avalue in attrs.items(): + ddict[("", aname)] = avalue + # Read the children of the group + for key in root: if _name_contains_string_in_list(key, exclude_names): continue - if is_group(h5f[path + "/" + key]): + h5name = path + "/" + key + # Preserve HDF5 link when requested + if not dereference_links: + lnk = h5f.get(h5name, getlink=True) + if is_link(lnk): + ddict[key] = lnk + continue + + try: + h5obj = h5f[h5name] + except KeyError as e: + if not isinstance(h5f.get(h5name, getlink=True), h5py.HardLink): + _handle_error(errors, + KeyError, + 'Cannot retrieve path "%s" (broken link)', + h5name) + else: + _handle_error(errors, KeyError, ', '.join(e.args)) + continue + + if is_group(h5obj): + # Child is an HDF5 group ddict[key] = h5todict(h5f, - path + "/" + key, + h5name, exclude_names=exclude_names, - asarray=asarray) + asarray=asarray, + dereference_links=dereference_links, + include_attributes=include_attributes) else: - # Read HDF5 datset - data = h5f[path + "/" + key][()] - if asarray: # Convert HDF5 dataset to numpy array - data = numpy.array(data, copy=False) - ddict[key] = data - + # Child is an HDF5 dataset + try: + data = h5py_read_dataset(h5obj) + except OSError: + _handle_error(errors, + OSError, + 'Cannot retrieve dataset "%s"', + h5name) + else: + if asarray: # Convert HDF5 dataset to numpy array + data = numpy.array(data, copy=False) + ddict[key] = data + # Read the attributes of the child + if include_attributes: + attrs = H5pyAttributesReadWrapper(h5obj.attrs) + for aname, avalue in attrs.items(): + ddict[(key, aname)] = avalue return ddict +def dicttonx(treedict, h5file, h5path="/", **kw): + """ + Write a nested dictionary to a HDF5 file, using string keys as member names. + The NeXus convention is used to identify attributes with ``"@"`` character, + therefore the dataset_names should not contain ``"@"``. + + Similarly, links are identified by keys starting with the ``">"`` character. + The corresponding value can be a soft or external link. 
+
+    :param treedict: Nested dictionary/tree structure with strings as keys
+        and array-like objects as leafs. The ``"/"`` character can be used
+        to define sub tree. The ``"@"`` character is used to write attributes.
+        The ``">"`` prefix is used to define links.
+
+    The named parameters are passed to dicttoh5.
+
+    Example::
+
+        import numpy
+        from silx.io.dictdump import dicttonx
+
+        gauss = {
+            "entry":{
+                "title":u"A plot of a gaussian",
+                "instrument": {
+                    "@NX_class": u"NXinstrument",
+                    "positioners": {
+                        "@NX_class": u"NXCollection",
+                        "x": numpy.arange(0,1.1,.1)
+                    }
+                },
+                "plot": {
+                    "y": numpy.array([0.08, 0.19, 0.39, 0.66, 0.9, 1.,
+                                      0.9, 0.66, 0.39, 0.19, 0.08]),
+                    ">x": "../instrument/positioners/x",
+                    "@signal": "y",
+                    "@axes": "x",
+                    "@NX_class":u"NXdata",
+                    "title": u"Gauss Plot",
+                },
+                "@NX_class": u"NXentry",
+                "default":"plot",
+            },
+            "@NX_class": u"NXroot",
+            "@default": "entry",
+        }
+
+        dicttonx(gauss,"test.h5")
+    """
+    parents = tuple(p for p in h5path.split("/") if p)
+    nxtreedict = nexus_to_h5_dict(treedict, parents=parents)
+    dicttoh5(nxtreedict, h5file, h5path=h5path, **kw)
+
+
+def nxtodict(h5file, **kw):
+    """Read a HDF5 file and return a nested dictionary with the complete file
+    structure and all data.
+
+    As opposed to h5todict, all keys will be strings and no h5py objects are
+    present in the tree.
+
+    The named parameters are passed to h5todict.
+    """
+    nxtreedict = h5todict(h5file, **kw)
+    return h5_to_nexus_dict(nxtreedict)
+
+
 def dicttojson(ddict, jsonfile, indent=None, mode="w"):
     """Serialize ``ddict`` as a JSON formatted stream to ``jsonfile``.
diff --git a/silx/io/fabioh5.py b/silx/io/fabioh5.py
index cfaa0a0..2fd719d 100755
--- a/silx/io/fabioh5.py
+++ b/silx/io/fabioh5.py
@@ -1,6 +1,6 @@
 # coding: utf-8
 # /*##########################################################################
-# Copyright (C) 2016-2019 European Synchrotron Radiation Facility
+# Copyright (C) 2016-2020 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -656,13 +656,13 @@ class FabioReader(object):
             elif result_type.kind == "U":
                 none_value = u""
             elif result_type.kind == "f":
-                none_value = numpy.float("NaN")
+                none_value = numpy.float64("NaN")
             elif result_type.kind == "i":
-                none_value = numpy.int(0)
+                none_value = numpy.int64(0)
             elif result_type.kind == "u":
-                none_value = numpy.int(0)
+                none_value = numpy.int64(0)
             elif result_type.kind == "b":
-                none_value = numpy.bool(False)
+                none_value = numpy.bool_(False)
             else:
                 none_value = None
diff --git a/silx/io/nxdata/parse.py b/silx/io/nxdata/parse.py
index 6bd18d6..b1c1bba 100644
--- a/silx/io/nxdata/parse.py
+++ b/silx/io/nxdata/parse.py
@@ -45,7 +45,7 @@ import json
 import numpy
 import six
 
-from silx.io.utils import is_group, is_file, is_dataset
+from silx.io.utils import is_group, is_file, is_dataset, h5py_read_dataset
 
 from ._utils import get_attr_as_unicode, INTERPDIM, nxdata_logger, \
     get_uncertainties_names, get_signal_name, \
@@ -628,7 +628,7 @@ class NXdata(object):
             data_dataset_names = [self.signal_name] + self.axes_dataset_names
             if (title is not None and is_dataset(title) and
                     "title" not in data_dataset_names):
-                return str(title[()])
+                return str(h5py_read_dataset(title))
 
             title = self.group.attrs.get("title")
             if title is None:
diff --git a/silx/io/setup.py b/silx/io/setup.py
index 4aaf324..9cafa17 100644
--- a/silx/io/setup.py
+++ b/silx/io/setup.py
@@ -51,7 +51,7 @@ else:
SPECFILE_USE_GNU_SOURCE = int(SPECFILE_USE_GNU_SOURCE) if sys.platform == "win32": - define_macros = [('WIN32', None)] + define_macros = [('WIN32', None), ('SPECFILE_POSIX', None)] elif os.name.lower().startswith('posix'): define_macros = [('SPECFILE_POSIX', None)] # the best choice is to have _GNU_SOURCE defined diff --git a/silx/io/specfile/src/locale_management.c b/silx/io/specfile/src/locale_management.c index 54695f5..0c5f7ca 100644 --- a/silx/io/specfile/src/locale_management.c +++ b/silx/io/specfile/src/locale_management.c @@ -39,6 +39,9 @@ # else # ifdef SPECFILE_POSIX # include +# ifndef LOCALE_NAME_MAX_LENGTH +# define LOCALE_NAME_MAX_LENGTH 85 +# endif # endif # endif #endif @@ -60,7 +63,7 @@ double PyMcaAtof(const char * inputString) #else #ifdef SPECFILE_POSIX char *currentLocaleBuffer; - char localeBuffer[21]; + char localeBuffer[LOCALE_NAME_MAX_LENGTH + 1] = {'\0'}; double result; currentLocaleBuffer = setlocale(LC_NUMERIC, NULL); strcpy(localeBuffer, currentLocaleBuffer); diff --git a/silx/io/test/test_dictdump.py b/silx/io/test/test_dictdump.py index c0b6914..b99116b 100644 --- a/silx/io/test/test_dictdump.py +++ b/silx/io/test/test_dictdump.py @@ -43,6 +43,8 @@ from .. import dictdump from ..dictdump import dicttoh5, dicttojson, dump from ..dictdump import h5todict, load from ..dictdump import logger as dictdump_logger +from ..utils import is_link +from ..utils import h5py_read_dataset def tree(): @@ -58,15 +60,29 @@ city_attrs["Europe"]["France"]["Grenoble"]["inhabitants"] = inhabitants city_attrs["Europe"]["France"]["Grenoble"]["coordinates"] = [45.1830, 5.7196] city_attrs["Europe"]["France"]["Tourcoing"]["area"] +ext_attrs = tree() +ext_attrs["ext_group"]["dataset"] = 10 +ext_filename = "ext.h5" + +link_attrs = tree() +link_attrs["links"]["group"]["dataset"] = 10 +link_attrs["links"]["group"]["relative_softlink"] = h5py.SoftLink("dataset") +link_attrs["links"]["relative_softlink"] = h5py.SoftLink("group/dataset") +link_attrs["links"]["absolute_softlink"] = h5py.SoftLink("/links/group/dataset") +link_attrs["links"]["external_link"] = h5py.ExternalLink(ext_filename, "/ext_group/dataset") + class TestDictToH5(unittest.TestCase): def setUp(self): self.tempdir = tempfile.mkdtemp() self.h5_fname = os.path.join(self.tempdir, "cityattrs.h5") + self.h5_ext_fname = os.path.join(self.tempdir, ext_filename) def tearDown(self): if os.path.exists(self.h5_fname): os.unlink(self.h5_fname) + if os.path.exists(self.h5_ext_fname): + os.unlink(self.h5_ext_fname) os.rmdir(self.tempdir) def testH5CityAttrs(self): @@ -201,31 +217,129 @@ class TestDictToH5(unittest.TestCase): self.assertEqual(h5file["group/group/dataset"].attrs['attr'], 11) self.assertEqual(h5file["group/group"].attrs['attr'], 12) + def testLinks(self): + with h5py.File(self.h5_ext_fname, "w") as h5file: + dictdump.dicttoh5(ext_attrs, h5file) + with h5py.File(self.h5_fname, "w") as h5file: + dictdump.dicttoh5(link_attrs, h5file) + with h5py.File(self.h5_fname, "r") as h5file: + self.assertEqual(h5file["links/group/dataset"][()], 10) + self.assertEqual(h5file["links/group/relative_softlink"][()], 10) + self.assertEqual(h5file["links/relative_softlink"][()], 10) + self.assertEqual(h5file["links/absolute_softlink"][()], 10) + self.assertEqual(h5file["links/external_link"][()], 10) + + def testDumpNumpyArray(self): + ddict = { + 'darks': { + '0': numpy.array([[0, 0, 0], [0, 0, 0]], dtype=numpy.uint16) + } + } + with h5py.File(self.h5_fname, "w") as h5file: + dictdump.dicttoh5(ddict, h5file) + with h5py.File(self.h5_fname, "r") as 
h5file: + numpy.testing.assert_array_equal(h5py_read_dataset(h5file["darks"]["0"]), + ddict['darks']['0']) + + +class TestH5ToDict(unittest.TestCase): + def setUp(self): + self.tempdir = tempfile.mkdtemp() + self.h5_fname = os.path.join(self.tempdir, "cityattrs.h5") + self.h5_ext_fname = os.path.join(self.tempdir, ext_filename) + dicttoh5(city_attrs, self.h5_fname) + dicttoh5(link_attrs, self.h5_fname, mode="a") + dicttoh5(ext_attrs, self.h5_ext_fname) + + def tearDown(self): + if os.path.exists(self.h5_fname): + os.unlink(self.h5_fname) + if os.path.exists(self.h5_ext_fname): + os.unlink(self.h5_ext_fname) + os.rmdir(self.tempdir) + + def testExcludeNames(self): + ddict = h5todict(self.h5_fname, path="/Europe/France", + exclude_names=["ourcoing", "inhab", "toto"]) + self.assertNotIn("Tourcoing", ddict) + self.assertIn("Grenoble", ddict) + + self.assertNotIn("inhabitants", ddict["Grenoble"]) + self.assertIn("coordinates", ddict["Grenoble"]) + self.assertIn("area", ddict["Grenoble"]) + + def testAsArrayTrue(self): + """Test with asarray=True, the default""" + ddict = h5todict(self.h5_fname, path="/Europe/France/Grenoble") + self.assertTrue(numpy.array_equal(ddict["inhabitants"], numpy.array(inhabitants))) + + def testAsArrayFalse(self): + """Test with asarray=False""" + ddict = h5todict(self.h5_fname, path="/Europe/France/Grenoble", asarray=False) + self.assertEqual(ddict["inhabitants"], inhabitants) + + def testDereferenceLinks(self): + ddict = h5todict(self.h5_fname, path="links", dereference_links=True) + self.assertTrue(ddict["absolute_softlink"], 10) + self.assertTrue(ddict["relative_softlink"], 10) + self.assertTrue(ddict["external_link"], 10) + self.assertTrue(ddict["group"]["relative_softlink"], 10) + + def testPreserveLinks(self): + ddict = h5todict(self.h5_fname, path="links", dereference_links=False) + self.assertTrue(is_link(ddict["absolute_softlink"])) + self.assertTrue(is_link(ddict["relative_softlink"])) + self.assertTrue(is_link(ddict["external_link"])) + self.assertTrue(is_link(ddict["group"]["relative_softlink"])) + + def testStrings(self): + ddict = {"dset_bytes": b"bytes", + "dset_utf8": "utf8", + "dset_2bytes": [b"bytes", b"bytes"], + "dset_2utf8": ["utf8", "utf8"], + ("", "attr_bytes"): b"bytes", + ("", "attr_utf8"): "utf8", + ("", "attr_2bytes"): [b"bytes", b"bytes"], + ("", "attr_2utf8"): ["utf8", "utf8"]} + dicttoh5(ddict, self.h5_fname, mode="w") + adict = h5todict(self.h5_fname, include_attributes=True, asarray=False) + self.assertEqual(ddict["dset_bytes"], adict["dset_bytes"]) + self.assertEqual(ddict["dset_utf8"], adict["dset_utf8"]) + self.assertEqual(ddict[("", "attr_bytes")], adict[("", "attr_bytes")]) + self.assertEqual(ddict[("", "attr_utf8")], adict[("", "attr_utf8")]) + numpy.testing.assert_array_equal(ddict["dset_2bytes"], adict["dset_2bytes"]) + numpy.testing.assert_array_equal(ddict["dset_2utf8"], adict["dset_2utf8"]) + numpy.testing.assert_array_equal(ddict[("", "attr_2bytes")], adict[("", "attr_2bytes")]) + numpy.testing.assert_array_equal(ddict[("", "attr_2utf8")], adict[("", "attr_2utf8")]) + class TestDictToNx(unittest.TestCase): def setUp(self): self.tempdir = tempfile.mkdtemp() self.h5_fname = os.path.join(self.tempdir, "nx.h5") + self.h5_ext_fname = os.path.join(self.tempdir, "nx_ext.h5") def tearDown(self): if os.path.exists(self.h5_fname): os.unlink(self.h5_fname) + if os.path.exists(self.h5_ext_fname): + os.unlink(self.h5_ext_fname) os.rmdir(self.tempdir) def testAttributes(self): """Any kind of attribute can be described""" ddict = { - 
"group": {"datatset": "hmmm", "@group_attr": 10}, - "dataset": "aaaaaaaaaaaaaaa", + "group": {"dataset": 100, "@group_attr1": 10}, + "dataset": 200, "@root_attr": 11, - "dataset@dataset_attr": 12, + "dataset@dataset_attr": "12", "group@group_attr2": 13, } with h5py.File(self.h5_fname, "w") as h5file: dictdump.dicttonx(ddict, h5file) - self.assertEqual(h5file["group"].attrs['group_attr'], 10) + self.assertEqual(h5file["group"].attrs['group_attr1'], 10) self.assertEqual(h5file.attrs['root_attr'], 11) - self.assertEqual(h5file["dataset"].attrs['dataset_attr'], 12) + self.assertEqual(h5file["dataset"].attrs['dataset_attr'], "12") self.assertEqual(h5file["group"].attrs['group_attr2'], 13) def testKeyOrder(self): @@ -280,36 +394,120 @@ class TestDictToNx(unittest.TestCase): self.assertEqual(h5file["group/group/dataset"].attrs['attr'], 11) self.assertEqual(h5file["group/group"].attrs['attr'], 12) - -class TestH5ToDict(unittest.TestCase): + def testLinks(self): + ddict = {"ext_group": {"dataset": 10}} + dictdump.dicttonx(ddict, self.h5_ext_fname) + ddict = {"links": {"group": {"dataset": 10, ">relative_softlink": "dataset"}, + ">relative_softlink": "group/dataset", + ">absolute_softlink": "/links/group/dataset", + ">external_link": "nx_ext.h5::/ext_group/dataset"}} + dictdump.dicttonx(ddict, self.h5_fname) + with h5py.File(self.h5_fname, "r") as h5file: + self.assertEqual(h5file["links/group/dataset"][()], 10) + self.assertEqual(h5file["links/group/relative_softlink"][()], 10) + self.assertEqual(h5file["links/relative_softlink"][()], 10) + self.assertEqual(h5file["links/absolute_softlink"][()], 10) + self.assertEqual(h5file["links/external_link"][()], 10) + + def testUpLinks(self): + ddict = {"data": {"group": {"dataset": 10, ">relative_softlink": "dataset"}}, + "links": {"group": {"subgroup": {">relative_softlink": "../../../data/group/dataset"}}}} + dictdump.dicttonx(ddict, self.h5_fname) + with h5py.File(self.h5_fname, "r") as h5file: + self.assertEqual(h5file["/links/group/subgroup/relative_softlink"][()], 10) + + +class TestNxToDict(unittest.TestCase): def setUp(self): self.tempdir = tempfile.mkdtemp() - self.h5_fname = os.path.join(self.tempdir, "cityattrs.h5") - dicttoh5(city_attrs, self.h5_fname) + self.h5_fname = os.path.join(self.tempdir, "nx.h5") + self.h5_ext_fname = os.path.join(self.tempdir, "nx_ext.h5") def tearDown(self): - os.unlink(self.h5_fname) + if os.path.exists(self.h5_fname): + os.unlink(self.h5_fname) + if os.path.exists(self.h5_ext_fname): + os.unlink(self.h5_ext_fname) os.rmdir(self.tempdir) - def testExcludeNames(self): - ddict = h5todict(self.h5_fname, path="/Europe/France", - exclude_names=["ourcoing", "inhab", "toto"]) - self.assertNotIn("Tourcoing", ddict) - self.assertIn("Grenoble", ddict) - - self.assertNotIn("inhabitants", ddict["Grenoble"]) - self.assertIn("coordinates", ddict["Grenoble"]) - self.assertIn("area", ddict["Grenoble"]) - - def testAsArrayTrue(self): - """Test with asarray=True, the default""" - ddict = h5todict(self.h5_fname, path="/Europe/France/Grenoble") - self.assertTrue(numpy.array_equal(ddict["inhabitants"], numpy.array(inhabitants))) - - def testAsArrayFalse(self): - """Test with asarray=False""" - ddict = h5todict(self.h5_fname, path="/Europe/France/Grenoble", asarray=False) - self.assertEqual(ddict["inhabitants"], inhabitants) + def testAttributes(self): + """Any kind of attribute can be described""" + ddict = { + "group": {"dataset": 100, "@group_attr1": 10}, + "dataset": 200, + "@root_attr": 11, + "dataset@dataset_attr": "12", + 
"group@group_attr2": 13, + } + dictdump.dicttonx(ddict, self.h5_fname) + ddict = dictdump.nxtodict(self.h5_fname, include_attributes=True) + self.assertEqual(ddict["group"]["@group_attr1"], 10) + self.assertEqual(ddict["@root_attr"], 11) + self.assertEqual(ddict["dataset@dataset_attr"], "12") + self.assertEqual(ddict["group"]["@group_attr2"], 13) + + def testDereferenceLinks(self): + """Write links and dereference on read""" + ddict = {"ext_group": {"dataset": 10}} + dictdump.dicttonx(ddict, self.h5_ext_fname) + ddict = {"links": {"group": {"dataset": 10, ">relative_softlink": "dataset"}, + ">relative_softlink": "group/dataset", + ">absolute_softlink": "/links/group/dataset", + ">external_link": "nx_ext.h5::/ext_group/dataset"}} + dictdump.dicttonx(ddict, self.h5_fname) + + ddict = dictdump.h5todict(self.h5_fname, dereference_links=True) + self.assertTrue(ddict["links"]["absolute_softlink"], 10) + self.assertTrue(ddict["links"]["relative_softlink"], 10) + self.assertTrue(ddict["links"]["external_link"], 10) + self.assertTrue(ddict["links"]["group"]["relative_softlink"], 10) + + def testPreserveLinks(self): + """Write/read links""" + ddict = {"ext_group": {"dataset": 10}} + dictdump.dicttonx(ddict, self.h5_ext_fname) + ddict = {"links": {"group": {"dataset": 10, ">relative_softlink": "dataset"}, + ">relative_softlink": "group/dataset", + ">absolute_softlink": "/links/group/dataset", + ">external_link": "nx_ext.h5::/ext_group/dataset"}} + dictdump.dicttonx(ddict, self.h5_fname) + + ddict = dictdump.nxtodict(self.h5_fname, dereference_links=False) + self.assertTrue(ddict["links"][">absolute_softlink"], "dataset") + self.assertTrue(ddict["links"][">relative_softlink"], "group/dataset") + self.assertTrue(ddict["links"][">external_link"], "/links/group/dataset") + self.assertTrue(ddict["links"]["group"][">relative_softlink"], "nx_ext.h5::/ext_group/datase") + + def testNotExistingPath(self): + """Test converting not existing path""" + with h5py.File(self.h5_fname, 'a') as f: + f['data'] = 1 + + ddict = h5todict(self.h5_fname, path="/I/am/not/a/path", errors='ignore') + self.assertFalse(ddict) + + with TestLogging(dictdump_logger, error=1): + ddict = h5todict(self.h5_fname, path="/I/am/not/a/path", errors='log') + self.assertFalse(ddict) + + with self.assertRaises(KeyError): + h5todict(self.h5_fname, path="/I/am/not/a/path", errors='raise') + + def testBrokenLinks(self): + """Test with broken links""" + with h5py.File(self.h5_fname, 'a') as f: + f["/Mars/BrokenSoftLink"] = h5py.SoftLink("/Idontexists") + f["/Mars/BrokenExternalLink"] = h5py.ExternalLink("notexistingfile.h5", "/Idontexists") + + ddict = h5todict(self.h5_fname, path="/Mars", errors='ignore') + self.assertFalse(ddict) + + with TestLogging(dictdump_logger, error=2): + ddict = h5todict(self.h5_fname, path="/Mars", errors='log') + self.assertFalse(ddict) + + with self.assertRaises(KeyError): + h5todict(self.h5_fname, path="/Mars", errors='raise') class TestDictToJson(unittest.TestCase): @@ -436,6 +634,7 @@ def suite(): test_suite.addTest(loadTests(TestDictToNx)) test_suite.addTest(loadTests(TestDictToJson)) test_suite.addTest(loadTests(TestH5ToDict)) + test_suite.addTest(loadTests(TestNxToDict)) return test_suite diff --git a/silx/io/test/test_spectoh5.py b/silx/io/test/test_spectoh5.py index c3f03e9..903a62c 100644 --- a/silx/io/test/test_spectoh5.py +++ b/silx/io/test/test_spectoh5.py @@ -33,6 +33,7 @@ import h5py from ..spech5 import SpecH5, SpecH5Group from ..convert import convert, write_to_h5 +from ..utils import 
h5py_read_dataset __authors__ = ["P. Knobel"] __license__ = "MIT" @@ -129,7 +130,7 @@ class TestConvertSpecHDF5(unittest.TestCase): def testTitle(self): """Test the value of a dataset""" - title12 = self.h5f["/1.2/title"][()] + title12 = h5py_read_dataset(self.h5f["/1.2/title"]) self.assertEqual(title12, u"aaaaaa") diff --git a/silx/io/test/test_url.py b/silx/io/test/test_url.py index e68c67a..114f6a7 100644 --- a/silx/io/test/test_url.py +++ b/silx/io/test/test_url.py @@ -152,6 +152,16 @@ class TestDataUrl(unittest.TestCase): expected = [True, True, None, "/a.h5", "/b", (5, 1)] self.assertUrl(url, expected) + def test_slice2(self): + url = DataUrl("/a.h5?path=/b&slice=2:5") + expected = [True, True, None, "/a.h5", "/b", (slice(2, 5),)] + self.assertUrl(url, expected) + + def test_slice3(self): + url = DataUrl("/a.h5?path=/b&slice=::2") + expected = [True, True, None, "/a.h5", "/b", (slice(None, None, 2),)] + self.assertUrl(url, expected) + def test_slice_ellipsis(self): url = DataUrl("/a.h5?path=/b&slice=...") expected = [True, True, None, "/a.h5", "/b", (Ellipsis, )] diff --git a/silx/io/test/test_utils.py b/silx/io/test/test_utils.py index 6c70636..13ab532 100644 --- a/silx/io/test/test_utils.py +++ b/silx/io/test/test_utils.py @@ -33,6 +33,7 @@ import unittest import sys from .. import utils +from ..._version import calc_hexversion import silx.io.url import h5py @@ -40,11 +41,9 @@ from ..utils import h5ls import fabio - __authors__ = ["P. Knobel"] __license__ = "MIT" -__date__ = "12/02/2018" - +__date__ = "03/12/2020" expected_spec1 = r"""#F .* #D .* @@ -67,6 +66,28 @@ expected_spec2 = expected_spec1 + r""" 2 8\.00 3 9\.00 """ + +expected_spec2reg = r"""#F .* +#D .* + +#S 1 Ordinate1 +#D .* +#N 3 +#L Abscissa Ordinate1 Ordinate2 +1 4\.00 7\.00 +2 5\.00 8\.00 +3 6\.00 9\.00 +""" + +expected_spec2irr = expected_spec1 + r""" +#S 2 Ordinate2 +#D .* +#N 2 +#L Abscissa Ordinate2 +1 7\.00 +2 8\.00 +""" + expected_csv = r"""Abscissa;Ordinate1;Ordinate2 1;4\.00;7\.00e\+00 2;5\.00;8\.00e\+00 @@ -83,6 +104,7 @@ expected_csv2 = r"""x;y0;y1 class TestSave(unittest.TestCase): """Test saving curves as SpecFile: """ + def setUp(self): self.tempdir = tempfile.mkdtemp() self.spec_fname = os.path.join(self.tempdir, "savespec.dat") @@ -92,6 +114,7 @@ class TestSave(unittest.TestCase): self.x = [1, 2, 3] self.xlab = "Abscissa" self.y = [[4, 5, 6], [7, 8, 9]] + self.y_irr = [[4, 5, 6], [7, 8]] self.ylabs = ["Ordinate1", "Ordinate2"] def tearDown(self): @@ -103,13 +126,6 @@ class TestSave(unittest.TestCase): os.unlink(self.npy_fname) shutil.rmtree(self.tempdir) - def assertRegex(self, *args, **kwargs): - # Python 2 compatibility - if sys.version_info.major >= 3: - return super(TestSave, self).assertRegex(*args, **kwargs) - else: - return self.assertRegexpMatches(*args, **kwargs) - def test_save_csv(self): utils.save1D(self.csv_fname, self.x, self.y, xlabel=self.xlab, ylabels=self.ylabs, @@ -145,7 +161,6 @@ class TestSave(unittest.TestCase): specf = open(self.spec_fname) actual_spec = specf.read() specf.close() - self.assertRegex(actual_spec, expected_spec1) def test_savespec_file_handle(self): @@ -165,18 +180,30 @@ class TestSave(unittest.TestCase): specf = open(self.spec_fname) actual_spec = specf.read() specf.close() - self.assertRegex(actual_spec, expected_spec2) - def test_save_spec(self): - """Save SpecFile using save()""" + def test_save_spec_reg(self): + """Save SpecFile using save() on a regular pattern""" utils.save1D(self.spec_fname, self.x, self.y, xlabel=self.xlab, ylabels=self.ylabs, 
filetype="spec", fmt=["%d", "%.2f"]) specf = open(self.spec_fname) actual_spec = specf.read() specf.close() - self.assertRegex(actual_spec, expected_spec2) + + self.assertRegex(actual_spec, expected_spec2reg) + + def test_save_spec_irr(self): + """Save SpecFile using save() on an irregular pattern""" + # invalid test case ?! + return + utils.save1D(self.spec_fname, self.x, self.y_irr, xlabel=self.xlab, + ylabels=self.ylabs, filetype="spec", fmt=["%d", "%.2f"]) + + specf = open(self.spec_fname) + actual_spec = specf.read() + specf.close() + self.assertRegex(actual_spec, expected_spec2irr) def test_save_csv_no_labels(self): """Save csv using save(), with autoheader=True but @@ -217,6 +244,7 @@ class TestH5Ls(unittest.TestCase): """ + def assertMatchAnyStringInList(self, pattern, list_of_strings): for string_ in list_of_strings: if re.match(pattern, string_): @@ -395,6 +423,7 @@ class TestOpen(unittest.TestCase): class TestNodes(unittest.TestCase): """Test `silx.io.utils.is_` functions.""" + def test_real_h5py_objects(self): name = tempfile.mktemp(suffix=".h5") try: @@ -417,45 +446,60 @@ class TestNodes(unittest.TestCase): os.unlink(name) def test_h5py_like_file(self): + class Foo(object): + def __init__(self): self.h5_class = utils.H5Type.FILE + obj = Foo() self.assertTrue(utils.is_file(obj)) self.assertTrue(utils.is_group(obj)) self.assertFalse(utils.is_dataset(obj)) def test_h5py_like_group(self): + class Foo(object): + def __init__(self): self.h5_class = utils.H5Type.GROUP + obj = Foo() self.assertFalse(utils.is_file(obj)) self.assertTrue(utils.is_group(obj)) self.assertFalse(utils.is_dataset(obj)) def test_h5py_like_dataset(self): + class Foo(object): + def __init__(self): self.h5_class = utils.H5Type.DATASET + obj = Foo() self.assertFalse(utils.is_file(obj)) self.assertFalse(utils.is_group(obj)) self.assertTrue(utils.is_dataset(obj)) def test_bad(self): + class Foo(object): + def __init__(self): pass + obj = Foo() self.assertFalse(utils.is_file(obj)) self.assertFalse(utils.is_group(obj)) self.assertFalse(utils.is_dataset(obj)) def test_bad_api(self): + class Foo(object): + def __init__(self): self.h5_class = int + obj = Foo() self.assertFalse(utils.is_file(obj)) self.assertFalse(utils.is_group(obj)) @@ -513,18 +557,20 @@ class TestGetData(unittest.TestCase): def test_hdf5_array(self): url = "silx:%s?/group/group/array" % self.h5_filename data = utils.get_data(url=url) - self.assertEqual(data.shape, (5, )) + self.assertEqual(data.shape, (5,)) self.assertEqual(data[0], 1) def test_hdf5_array_slice(self): url = "silx:%s?path=/group/group/array2d&slice=1" % self.h5_filename data = utils.get_data(url=url) - self.assertEqual(data.shape, (5, )) + self.assertEqual(data.shape, (5,)) self.assertEqual(data[0], 6) def test_hdf5_array_slice_out_of_range(self): url = "silx:%s?path=/group/group/array2d&slice=5" % self.h5_filename - self.assertRaises(ValueError, utils.get_data, url) + # ValueError: h5py 2.x + # IndexError: h5py 3.x + self.assertRaises((ValueError, IndexError), utils.get_data, url) def test_edf_using_silx(self): url = "silx:%s?/scan_0/instrument/detector_0/data" % self.edf_filename @@ -568,14 +614,15 @@ class TestGetData(unittest.TestCase): def _h5_py_version_older_than(version): - v_majeur, v_mineur, v_micro = h5py.version.version.split('.')[:3] - r_majeur, r_mineur, r_micro = version.split('.') - return v_majeur >= r_majeur and v_mineur >= r_mineur + v_majeur, v_mineur, v_micro = [int(i) for i in h5py.version.version.split('.')[:3]] + r_majeur, r_mineur, r_micro = [int(i) for i in 
version.split('.')] + return calc_hexversion(v_majeur, v_mineur, v_micro) >= calc_hexversion(r_majeur, r_mineur, r_micro) @unittest.skipUnless(_h5_py_version_older_than('2.9.0'), 'h5py version < 2.9.0') class TestRawFileToH5(unittest.TestCase): """Test conversion of .vol file to .h5 external dataset""" + def setUp(self): self.tempdir = tempfile.mkdtemp() self._vol_file = os.path.join(self.tempdir, 'test_vol.vol') @@ -589,7 +636,7 @@ class TestRawFileToH5(unittest.TestCase): assert os.path.exists(self._vol_file + '.npy') os.rename(self._vol_file + '.npy', self._vol_file) self.h5_file = os.path.join(self.tempdir, 'test_h5.h5') - self.external_dataset_path= '/root/my_external_dataset' + self.external_dataset_path = '/root/my_external_dataset' self._data_url = silx.io.url.DataUrl(file_path=self.h5_file, data_path=self.external_dataset_path) with open(self._file_info, 'w') as _fi: @@ -672,6 +719,158 @@ class TestRawFileToH5(unittest.TestCase): shape=self._dataset_shape)) +class TestH5Strings(unittest.TestCase): + """Test HDF5 str and bytes writing and reading""" + + @classmethod + def setUpClass(cls): + cls.tempdir = tempfile.mkdtemp() + cls.vlenstr = h5py.special_dtype(vlen=str) + cls.vlenbytes = h5py.special_dtype(vlen=bytes) + try: + cls.unicode = unicode + except NameError: + cls.unicode = str + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tempdir) + + def setUp(self): + self.file = h5py.File(os.path.join(self.tempdir, 'file.h5'), mode="w") + + def tearDown(self): + self.file.close() + + @classmethod + def _make_array(cls, value, n): + if isinstance(value, bytes): + dtype = cls.vlenbytes + elif isinstance(value, cls.unicode): + dtype = cls.vlenstr + else: + return numpy.array([value] * n) + return numpy.array([value] * n, dtype=dtype) + + @classmethod + def _get_charset(cls, value): + if isinstance(value, bytes): + return h5py.h5t.CSET_ASCII + elif isinstance(value, cls.unicode): + return h5py.h5t.CSET_UTF8 + else: + return None + + def _check_dataset(self, value, result=None): + # Write+read scalar + if result: + decode_ascii = True + else: + decode_ascii = False + result = value + charset = self._get_charset(value) + self.file["data"] = value + data = utils.h5py_read_dataset(self.file["data"], decode_ascii=decode_ascii) + assert type(data) == type(result), data + assert data == result, data + if charset: + assert self.file["data"].id.get_type().get_cset() == charset + + # Write+read variable length + self.file["vlen_data"] = self._make_array(value, 2) + data = utils.h5py_read_dataset(self.file["vlen_data"], decode_ascii=decode_ascii, index=0) + assert type(data) == type(result), data + assert data == result, data + data = utils.h5py_read_dataset(self.file["vlen_data"], decode_ascii=decode_ascii) + numpy.testing.assert_array_equal(data, [result] * 2) + if charset: + assert self.file["vlen_data"].id.get_type().get_cset() == charset + + def _check_attribute(self, value, result=None): + if result: + decode_ascii = True + else: + decode_ascii = False + result = value + self.file.attrs["data"] = value + data = utils.h5py_read_attribute(self.file.attrs, "data", decode_ascii=decode_ascii) + assert type(data) == type(result), data + assert data == result, data + + self.file.attrs["vlen_data"] = self._make_array(value, 2) + data = utils.h5py_read_attribute(self.file.attrs, "vlen_data", decode_ascii=decode_ascii) + assert type(data[0]) == type(result), data[0] + assert data[0] == result, data[0] + numpy.testing.assert_array_equal(data, [result] * 2) + + data = 
utils.h5py_read_attributes(self.file.attrs, decode_ascii=decode_ascii)["vlen_data"]
+        assert type(data[0]) == type(result), data[0]
+        assert data[0] == result, data[0]
+        numpy.testing.assert_array_equal(data, [result] * 2)
+
+    def test_dataset_ascii_bytes(self):
+        self._check_dataset(b"abc")
+
+    def test_attribute_ascii_bytes(self):
+        self._check_attribute(b"abc")
+
+    def test_dataset_ascii_bytes_decode(self):
+        self._check_dataset(b"abc", result="abc")
+
+    def test_attribute_ascii_bytes_decode(self):
+        self._check_attribute(b"abc", result="abc")
+
+    def test_dataset_ascii_str(self):
+        self._check_dataset("abc")
+
+    def test_attribute_ascii_str(self):
+        self._check_attribute("abc")
+
+    def test_dataset_utf8_str(self):
+        self._check_dataset("\u0101bc")
+
+    def test_attribute_utf8_str(self):
+        self._check_attribute("\u0101bc")
+
+    def test_dataset_utf8_bytes(self):
+        # 0xC481 is the byte representation of U+0101
+        self._check_dataset(b"\xc4\x81bc")
+
+    def test_attribute_utf8_bytes(self):
+        # 0xC481 is the byte representation of U+0101
+        self._check_attribute(b"\xc4\x81bc")
+
+    def test_dataset_utf8_bytes_decode(self):
+        # 0xC481 is the byte representation of U+0101
+        self._check_dataset(b"\xc4\x81bc", result="\u0101bc")
+
+    def test_attribute_utf8_bytes_decode(self):
+        # 0xC481 is the byte representation of U+0101
+        self._check_attribute(b"\xc4\x81bc", result="\u0101bc")
+
+    def test_dataset_latin1_bytes(self):
+        # extended ascii character 0xE4
+        self._check_dataset(b"\xe423")
+
+    def test_attribute_latin1_bytes(self):
+        # extended ascii character 0xE4
+        self._check_attribute(b"\xe423")
+
+    def test_dataset_latin1_bytes_decode(self):
+        # U+DCE4: surrogate for extended ascii character 0xE4
+        self._check_dataset(b"\xe423", result="\udce423")
+
+    def test_attribute_latin1_bytes_decode(self):
+        # U+DCE4: surrogate for extended ascii character 0xE4
+        self._check_attribute(b"\xe423", result="\udce423")
+
+    def test_dataset_no_string(self):
+        self._check_dataset(numpy.int64(10))
+
+    def test_attribute_no_string(self):
+        self._check_attribute(numpy.int64(10))
+
+
 def suite():
     loadTests = unittest.defaultTestLoader.loadTestsFromTestCase
     test_suite = unittest.TestSuite()
@@ -681,6 +880,7 @@ def suite():
     test_suite.addTest(loadTests(TestNodes))
     test_suite.addTest(loadTests(TestGetData))
     test_suite.addTest(loadTests(TestRawFileToH5))
+    test_suite.addTest(loadTests(TestH5Strings))
     return test_suite
 
 
diff --git a/silx/io/url.py b/silx/io/url.py
index 7607ae5..044977c 100644
--- a/silx/io/url.py
+++ b/silx/io/url.py
@@ -178,8 +178,20 @@ class DataUrl(object):
         def str_to_slice(string):
             if string == "...":
                 return Ellipsis
-            elif string == ":":
-                return slice(None)
+            elif ':' in string:
+                if string == ":":
+                    return slice(None)
+                else:
+                    def get_value(my_str):
+                        if my_str in ('', None):
+                            return None
+                        else:
+                            return int(my_str)
+                    sss = string.split(':')
+                    start = get_value(sss[0])
+                    stop = get_value(sss[1] if len(sss) > 1 else None)
+                    step = get_value(sss[2] if len(sss) > 2 else None)
+                    return slice(start, stop, step)
             else:
                 return int(string)
 
@@ -201,7 +213,10 @@ class DataUrl(object):
         :param str path: Path representing the URL.
         """
         self.__path = path
-        path = path.replace("::", "?", 1)
+        # Only replace "::" with "?" if "?" is not already present;
+        # otherwise this can mess with a data_slice such as "::2"
+        if '?' not in path:
+            path = path.replace("::", "?", 1)
         url = parse.urlparse(path)
 
         is_valid = True
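
For illustration, the extended slice syntax now accepted by ``DataUrl`` after
the ``str_to_slice`` change above (the file name is arbitrary; the expected
values mirror ``test_slice2`` and ``test_slice3``)::

    from silx.io.url import DataUrl

    url = DataUrl("/a.h5?path=/b&slice=2:5")
    assert url.data_slice() == (slice(2, 5),)

    url = DataUrl("/a.h5?path=/b&slice=::2")
    assert url.data_slice() == (slice(None, None, 2),)
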
diff --git a/silx/io/utils.py b/silx/io/utils.py
index 5da344d..12e9a7e 100644
--- a/silx/io/utils.py
+++ b/silx/io/utils.py
@@ -25,8 +25,7 @@
 
 __authors__ = ["P. Knobel", "V. Valls"]
 __license__ = "MIT"
-__date__ = "18/04/2018"
-
+__date__ = "03/12/2020"
 
 import enum
 import os.path
@@ -40,18 +39,19 @@ import six
 
 from silx.utils.proxy import Proxy
 import silx.io.url
+from .._version import calc_hexversion
 
 import h5py
+import h5py.h5t
+import h5py.h5a
 
 try:
     import h5pyd
 except ImportError as e:
     h5pyd = None
 
-
 logger = logging.getLogger(__name__)
 
-
 NEXUS_HDF5_EXT = [".h5", ".nx5", ".nxs", ".hdf", ".hdf5", ".cxi"]
 """List of possible extensions for HDF5 file formats."""
@@ -190,34 +190,46 @@ def save1D(fname, x, y, xlabel=None, ylabels=None, filetype=None,
     if xlabel is None:
         xlabel = "x"
     if ylabels is None:
-        if len(numpy.array(y).shape) > 1:
+        if numpy.array(y).ndim > 1:
             ylabels = ["y%d" % i for i in range(len(y))]
         else:
             ylabels = ["y"]
     elif isinstance(ylabels, (list, tuple)):
         # if ylabels is provided as a list, every element must
         # be a string
-        ylabels = [ylabels[i] if ylabels[i] is not None else "y%d" % i
-                   for i in range(len(ylabels))]
+        ylabels = [ylabel if isinstance(ylabel, string_types) else "y%d" % i
+                   for i, ylabel in enumerate(ylabels)]
 
     if filetype.lower() == "spec":
-        y_array = numpy.asarray(y)
-
-        # make sure y_array is a 2D array even for a single curve
-        if len(y_array.shape) == 1:
-            y_array = y_array.reshape(1, y_array.shape[0])
-        elif len(y_array.shape) > 2 or len(y_array.shape) < 1:
-            raise IndexError("y must be a 1D or 2D array")
-
-        # First curve
-        specf = savespec(fname, x, y_array[0], xlabel, ylabels[0], fmt=fmt,
-                         scan_number=1, mode="w", write_file_header=True,
-                         close_file=False)
-        # Other curves
-        for i in range(1, y_array.shape[0]):
-            specf = savespec(specf, x, y_array[i], xlabel, ylabels[i],
-                             fmt=fmt, scan_number=i + 1, mode="w",
-                             write_file_header=False, close_file=False)
+        # Check if we have regular data:
+        ref = len(x)
+        regular = True
+        for one_y in y:
+            regular &= len(one_y) == ref
+        if regular:
+            # pad the format list with its last entry, one format per column
+            if isinstance(fmt, (list, tuple)) and len(fmt) < (len(ylabels) + 1):
+                fmt = fmt + [fmt[-1]] * (1 + len(ylabels) - len(fmt))
+            specf = savespec(fname, x, y, xlabel, ylabels, fmt=fmt,
+                             scan_number=1, mode="w", write_file_header=True,
+                             close_file=False)
+        else:
+            y_array = numpy.asarray(y)
+            # make sure y_array is a 2D array even for a single curve
+            if y_array.ndim == 1:
+                y_array.shape = 1, -1
+            elif y_array.ndim not in [1, 2]:
+                raise IndexError("y must be a 1D or 2D array")
+
+            # First curve
+            specf = savespec(fname, x, y_array[0], xlabel, ylabels[0], fmt=fmt,
+                             scan_number=1, mode="w", write_file_header=True,
+                             close_file=False)
+            # Other curves
+            for i in range(1, y_array.shape[0]):
+                specf = savespec(specf, x, y_array[i], xlabel, ylabels[i],
+                                 fmt=fmt, scan_number=i + 1, mode="w",
+                                 write_file_header=False, close_file=False)
+
     # close file if we created it
     if not hasattr(fname, "write"):
         specf.close()
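
A minimal sketch of the new "regular data" path in ``save1D``: y columns of
equal length are now written as a single multi-column scan (compare
``expected_spec2reg`` in the tests above); the output file name is arbitrary::

    from silx.io.utils import save1D

    x = [1, 2, 3]
    y = [[4, 5, 6], [7, 8, 9]]
    save1D("out.dat", x, y, xlabel="Abscissa",
           ylabels=["Ordinate1", "Ordinate2"],
           filetype="spec", fmt=["%d", "%.2f"])
    # out.dat now contains a single scan with "#N 3" and
    # "#L Abscissa Ordinate1 Ordinate2"
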
@@ -307,9 +319,11 @@ def savespec(specfile, x, y, xlabel="X", ylabel="Y", fmt="%.7g",
         or append mode. If a file name is provided, a new file
         is open in write mode (existing file with the same name will be lost)
     :param x: 1D-Array (or list) of abscissa values
-    :param y: 1D-array (or list) of ordinates values
+    :param y: 1D-array (or list) of ordinate values, or a list of such
+        arrays. All of them must have the same length as x
     :param xlabel: Abscissa label (default ``"X"``)
-    :param ylabel: Ordinate label
+    :param ylabel: Ordinate label; may be a list of labels when multiple
+        curves are to be saved together.
     :param fmt: Format string for data. You can specify a short format
         string that defines a single format for both ``x`` and ``y`` values,
         or a list of two different format strings (e.g. ``["%d", "%.7g"]``).
@@ -333,40 +347,50 @@ def savespec(specfile, x, y, xlabel="X", ylabel="Y", fmt="%.7g",
 
     x_array = numpy.asarray(x)
     y_array = numpy.asarray(y)
+    if y_array.ndim > 2:
+        raise IndexError("y must be packed as a 1D or 2D array")
 
-    if y_array.shape[0] != x_array.shape[0]:
+    if y_array.shape[-1] != x_array.shape[0]:
         raise IndexError("X and Y columns must have the same length")
 
+    if y_array.ndim == 2:
+        assert isinstance(ylabel, (list, tuple))
+        assert y_array.shape[0] == len(ylabel)
+        labels = (xlabel, *ylabel)
+    else:
+        labels = (xlabel, ylabel)
+    data = numpy.vstack((x_array, y_array))
+    ncol = data.shape[0]
+    assert len(labels) == ncol
+
     if isinstance(fmt, string_types) and fmt.count("%") == 1:
-        full_fmt_string = fmt + " " + fmt + "\n"
-    elif isinstance(fmt, (list, tuple)) and len(fmt) == 2:
-        full_fmt_string = " ".join(fmt) + "\n"
+        full_fmt_string = " ".join([fmt] * ncol)
+    elif isinstance(fmt, (list, tuple)) and len(fmt) == ncol:
+        full_fmt_string = " ".join(fmt)
     else:
-        raise ValueError("fmt must be a single format string or a list of " +
-                         "two format strings")
+        raise ValueError("`fmt` must be a single format string or a list of "
+                         "format strings with as many formats as columns")
 
     if not hasattr(specfile, "write"):
         f = builtin_open(specfile, mode)
     else:
         f = specfile
 
-    output = ""
-
-    current_date = "#D %s\n" % (time.ctime(time.time()))
-
+    current_date = "#D %s" % (time.ctime(time.time()))
     if write_file_header:
-        output += "#F %s\n" % f.name
-        output += current_date
-        output += "\n"
-
-    output += "#S %d %s\n" % (scan_number, ylabel)
-    output += current_date
-    output += "#N 2\n"
-    output += "#L %s %s\n" % (xlabel, ylabel)
-    for i in range(y_array.shape[0]):
-        output += full_fmt_string % (x_array[i], y_array[i])
-    output += "\n"
+        lines = ["#F %s" % f.name, current_date, ""]
+    else:
+        lines = [""]
+    lines += ["#S %d %s" % (scan_number, labels[1]),
+              current_date,
+              "#N %d" % ncol,
+              "#L " + " ".join(labels)]
+
+    for row in data.T:
+        lines.append(full_fmt_string % tuple(row))
+    lines.append("")
+    output = "\n".join(lines)
 
     f.write(output.encode())
 
     if close_file:
@@ -406,7 +431,7 @@ def h5ls(h5group, lvl=0):
     if is_group(h5group):
         h5f = h5group
     elif isinstance(h5group, string_types):
-        h5f = open(h5group) # silx.io.open
+        h5f = open(h5group)  # silx.io.open
     else:
         raise TypeError("h5group must be a hdf5-like group object or a file name.")
@@ -735,6 +760,26 @@ def is_softlink(obj):
     return t == H5Type.SOFT_LINK
 
 
+def is_externallink(obj):
+    """
+    True if the object is a h5py.ExternalLink-like object.
+
+    :param obj: An object
+    """
+    t = get_h5_class(obj)
+    return t == H5Type.EXTERNAL_LINK
+
+
+def is_link(obj):
+    """
+    True if the object is a h5py link-like object.
+
+    :param obj: An object
+    """
+    t = get_h5_class(obj)
+    return t in {H5Type.SOFT_LINK, H5Type.EXTERNAL_LINK}
+
+
 def get_data(url):
     """Returns a numpy data from an URL.
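
For reference, a small sketch of ``get_data``, which the following hunk
routes through ``h5py_read_dataset`` so that string datasets are decoded
consistently across h5py 2.x and 3.x (the file name is illustrative; the URL
forms are taken from the tests above)::

    from silx.io.utils import get_data

    # read the second row of a 2D dataset through a data URL
    row = get_data("silx:/tmp/data.h5?path=/group/array2d&slice=1")
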
@@ -791,16 +836,16 @@ def get_data(url):
             raise ValueError("Data path from URL '%s' is not a dataset" % url.path())
 
         if data_slice is not None:
-            data = data[data_slice]
+            data = h5py_read_dataset(data, index=data_slice)
         else:
             # works for scalar and array
-            data = data[()]
+            data = h5py_read_dataset(data)
 
     elif url.scheme() == "fabio":
         import fabio
         data_slice = url.data_slice()
         if data_slice is None:
-            data_slice = (0, )
+            data_slice = (0,)
         if data_slice is None or len(data_slice) != 1:
             raise ValueError("Fabio slice expect a single frame, but %s found" % data_slice)
         index = data_slice[0]
@@ -844,8 +889,8 @@ def rawfile_to_h5_external_dataset(bin_file, output_url, shape, dtype,
     """
     assert isinstance(output_url, silx.io.url.DataUrl)
     assert isinstance(shape, (tuple, list))
-    v_majeur, v_mineur, v_micro = h5py.version.version.split('.')
-    if v_majeur <= '2' and v_mineur < '9':
+    v_majeur, v_mineur, v_micro = [int(i) for i in h5py.version.version.split('.')[:3]]
+    if calc_hexversion(v_majeur, v_mineur, v_micro) < calc_hexversion(2, 9, 0):
         raise Exception('h5py >= 2.9 should be installed to access the '
                         'external feature.')
 
@@ -915,3 +960,183 @@ def vol_to_h5_external_dataset(vol_file, output_url, info_file=None,
                                           shape=shape,
                                           dtype=vol_dtype,
                                           overwrite=overwrite)
+
+
+def h5py_decode_value(value, encoding="utf-8", errors="surrogateescape"):
+    """Keep bytes when value cannot be decoded
+
+    :param value: bytes or array of bytes
+    :param str encoding:
+    :param str errors:
+    """
+    try:
+        if numpy.isscalar(value):
+            return value.decode(encoding, errors=errors)
+        str_item = [b.decode(encoding, errors=errors) for b in value.flat]
+        return numpy.array(str_item, dtype=object).reshape(value.shape)
+    except UnicodeDecodeError:
+        return value
+
+
+def h5py_encode_value(value, encoding="utf-8", errors="surrogateescape"):
+    """Keep string when value cannot be encoded
+
+    :param value: string or array of strings
+    :param str encoding:
+    :param str errors:
+    """
+    try:
+        if numpy.isscalar(value):
+            return value.encode(encoding, errors=errors)
+        bytes_item = [s.encode(encoding, errors=errors) for s in value.flat]
+        return numpy.array(bytes_item, dtype=object).reshape(value.shape)
+    except UnicodeEncodeError:
+        return value
+
+
+class H5pyDatasetReadWrapper:
+    """Wrapper to handle H5T_STRING decoding on-the-fly when reading
+    a dataset. Uniform behaviour for h5py 2.x and h5py 3.x
+
+    h5py abuses H5T_STRING with ASCII character set
+    to store `bytes`: dset[()] = b"..."
+    Therefore an H5T_STRING with ASCII encoding is not decoded by default.
+    """
+
+    H5PY_AUTODECODE_NONASCII = int(h5py.version.version.split(".")[0]) < 3
+
+    def __init__(self, dset, decode_ascii=False):
+        """
+        :param h5py.Dataset dset:
+        :param bool decode_ascii:
+        """
+        try:
+            string_info = h5py.h5t.check_string_dtype(dset.dtype)
+        except AttributeError:
+            # h5py < 2.10
+            try:
+                idx = dset.id.get_type().get_cset()
+            except AttributeError:
+                # Not an H5T_STRING
+                encoding = None
+            else:
+                encoding = ["ascii", "utf-8"][idx]
+        else:
+            # h5py >= 2.10
+            try:
+                encoding = string_info.encoding
+            except AttributeError:
+                # Not an H5T_STRING
+                encoding = None
+        if encoding == "ascii" and not decode_ascii:
+            encoding = None
+        if encoding != "ascii" and self.H5PY_AUTODECODE_NONASCII:
+            # Decoding is already done by the h5py library
+            encoding = None
+        if encoding == "ascii":
+            # ASCII can be decoded as UTF-8
+            encoding = "utf-8"
+        self._encoding = encoding
+        self._dset = dset
+
+    def __getitem__(self, args):
+        value = self._dset[args]
+        if self._encoding:
+            return h5py_decode_value(value, encoding=self._encoding)
+        else:
+            return value
+
+
+class H5pyAttributesReadWrapper:
+    """Wrapper to handle H5T_STRING decoding on-the-fly when reading
+    an attribute. Uniform behaviour for h5py 2.x and h5py 3.x
+
+    h5py abuses H5T_STRING with ASCII character set
+    to store `bytes`: dset[()] = b"..."
+    Therefore an H5T_STRING with ASCII encoding is not decoded by default.
+    """
+
+    H5PY_AUTODECODE = int(h5py.version.version.split(".")[0]) >= 3
+
+    def __init__(self, attrs, decode_ascii=False):
+        """
+        :param h5py.AttributeManager attrs:
+        :param bool decode_ascii:
+        """
+        self._attrs = attrs
+        self._decode_ascii = decode_ascii
+
+    def __getitem__(self, args):
+        value = self._attrs[args]
+
+        # Get the string encoding (if a string)
+        try:
+            dtype = self._attrs.get_id(args).dtype
+        except AttributeError:
+            # h5py < 2.10
+            attr_id = h5py.h5a.open(self._attrs._id, self._attrs._e(args))
+            try:
+                idx = attr_id.get_type().get_cset()
+            except AttributeError:
+                # Not an H5T_STRING
+                return value
+            else:
+                encoding = ["ascii", "utf-8"][idx]
+        else:
+            # h5py >= 2.10
+            try:
+                encoding = h5py.h5t.check_string_dtype(dtype).encoding
+            except AttributeError:
+                # Not an H5T_STRING
+                return value
+
+        if self.H5PY_AUTODECODE:
+            if encoding == "ascii" and not self._decode_ascii:
+                # Undo decoding by the h5py library
+                return h5py_encode_value(value, encoding="utf-8")
+        else:
+            if encoding == "ascii" and self._decode_ascii:
+                # Decode ASCII as UTF-8 for consistency
+                return h5py_decode_value(value, encoding="utf-8")
+
+        # Decoding is already done by the h5py library
+        return value
+
+    def items(self):
+        for k in self._attrs.keys():
+            yield k, self[k]
+
+
+def h5py_read_dataset(dset, index=tuple(), decode_ascii=False):
+    """Read data from dataset object. UTF-8 strings will be
+    decoded while ASCII strings will only be decoded when
+    `decode_ascii=True`.
+
+    :param h5py.Dataset dset:
+    :param index: slicing (all by default)
+    :param bool decode_ascii:
+    """
+    return H5pyDatasetReadWrapper(dset, decode_ascii=decode_ascii)[index]
+
+
+def h5py_read_attribute(attrs, name, decode_ascii=False):
+    """Read data from attributes. UTF-8 strings will be
+    decoded while ASCII strings will only be decoded when
+    `decode_ascii=True`.
+
+    :param h5py.AttributeManager attrs:
+    :param str name: attribute name
+    :param bool decode_ascii:
+    """
+    return H5pyAttributesReadWrapper(attrs, decode_ascii=decode_ascii)[name]
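
A short sketch of the reader helpers defined above (file name and values
illustrative): ASCII-stored bytes stay ``bytes`` unless ``decode_ascii=True``,
while UTF-8 strings are decoded on both h5py 2.x and 3.x, as exercised by
TestH5Strings earlier in this patch::

    import h5py
    from silx.io.utils import h5py_read_dataset, h5py_read_attribute

    with h5py.File("strings.h5", "w") as f:
        f["data"] = b"abc"
        f.attrs["name"] = "\u0101bc"
        assert h5py_read_dataset(f["data"]) == b"abc"
        assert h5py_read_dataset(f["data"], decode_ascii=True) == "abc"
        assert h5py_read_attribute(f.attrs, "name") == "\u0101bc"
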
+
+
+def h5py_read_attributes(attrs, decode_ascii=False):
+    """Read data from attributes. UTF-8 strings will be
+    decoded while ASCII strings will only be decoded when
+    `decode_ascii=True`.
+
+    :param h5py.AttributeManager attrs:
+    :param bool decode_ascii:
+    """
+    return dict(H5pyAttributesReadWrapper(attrs, decode_ascii=decode_ascii).items())
--
cgit v1.2.3
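
As a closing illustration, the "@" attribute and ">" link notations handled by
dicttonx/nxtodict in this patch, mirroring the testLinks cases above (file
names are illustrative)::

    from silx.io.dictdump import dicttonx, nxtodict

    treedict = {
        "links": {
            "group": {"dataset": 10, ">relative_softlink": "dataset"},
            ">absolute_softlink": "/links/group/dataset",
            ">external_link": "nx_ext.h5::/ext_group/dataset",
            "group@attr": "value",
        }
    }
    dicttonx(treedict, "test.h5")

    # Read back without dereferencing: links come back as ">" keys,
    # attributes as "@" keys.
    readback = nxtodict("test.h5", dereference_links=False,
                        include_attributes=True)
    # readback["links"][">absolute_softlink"] == "/links/group/dataset"
    # readback["links"]["group"]["@attr"] == "value"
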