diff options
Diffstat (limited to 'silx/io/dictdump.py')
-rw-r--r-- | silx/io/dictdump.py | 366 |
1 files changed, 366 insertions, 0 deletions
diff --git a/silx/io/dictdump.py b/silx/io/dictdump.py new file mode 100644 index 0000000..ae7a457 --- /dev/null +++ b/silx/io/dictdump.py @@ -0,0 +1,366 @@ +# coding: utf-8 +# /*########################################################################## +# Copyright (C) 2016-2017 European Synchrotron Radiation Facility +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ############################################################################*/ +"""This module offers a set of functions to dump a python dictionary indexed +by text strings to following file formats: `HDF5, INI, JSON` +""" + +from collections import OrderedDict +import json +import logging +import numpy +import os.path +import sys + +try: + import h5py +except ImportError as e: + h5py_missing = True + h5py_import_error = e +else: + h5py_missing = False + +from .configdict import ConfigDict +from .utils import is_group, is_file + +from silx.io import open as h5open + +__authors__ = ["P. Knobel"] +__license__ = "MIT" +__date__ = "10/02/2017" + +logger = logging.getLogger(__name__) + +string_types = (basestring,) if sys.version_info[0] == 2 else (str,) # noqa + + +def _prepare_hdf5_dataset(array_like): + """Cast a python object into a numpy array in a HDF5 friendly format. + + :param array_like: Input dataset in a type that can be digested by + ``numpy.array()`` (`str`, `list`, `numpy.ndarray`…) + :return: ``numpy.ndarray`` ready to be written as an HDF5 dataset + """ + # simple strings + if isinstance(array_like, string_types): + array_like = numpy.string_(array_like) + + # Ensure our data is a numpy.ndarray + if not isinstance(array_like, (numpy.ndarray, numpy.string_)): + array = numpy.array(array_like) + else: + array = array_like + + # handle list of strings or numpy array of strings + if not isinstance(array, numpy.string_): + data_kind = array.dtype.kind + # unicode: convert to byte strings + # (http://docs.h5py.org/en/latest/strings.html) + if data_kind.lower() in ["s", "u"]: + array = numpy.asarray(array, dtype=numpy.string_) + + return array + + +def dicttoh5(treedict, h5file, h5path='/', + mode="w", overwrite_data=False, + create_dataset_args=None): + """Write a nested dictionary to a HDF5 file, using keys as member names. + + If a dictionary value is a sub-dictionary, a group is created. If it is + any other data type, it is cast into a numpy array and written as a + :mod:`h5py` dataset. Dictionary keys must be strings and cannot contain + the ``/`` character. + + .. note:: + + This function requires `h5py <http://www.h5py.org/>`_ to be installed. + + :param treedict: Nested dictionary/tree structure with strings as keys + and array-like objects as leafs. The ``"/"`` character is not allowed + in keys. + :param h5file: HDF5 file name or handle. If a file name is provided, the + function opens the file in the specified mode and closes it again + before completing. + :param h5path: Target path in HDF5 file in which scan groups are created. + Default is root (``"/"``) + :param mode: Can be ``"r+"`` (read/write, file must exist), + ``"w"`` (write, existing file is lost), ``"w-"`` (write, fail if + exists) or ``"a"`` (read/write if exists, create otherwise). + This parameter is ignored if ``h5file`` is a file handle. + :param overwrite_data: If ``True``, existing groups and datasets can be + overwritten, if ``False`` they are skipped. This parameter is only + relevant if ``h5file_mode`` is ``"r+"`` or ``"a"``. + :param create_dataset_args: Dictionary of args you want to pass to + ``h5f.create_dataset``. This allows you to specify filters and + compression parameters. Don't specify ``name`` and ``data``. + + Example:: + + from silx.io.dictdump import dicttoh5 + + city_area = { + "Europe": { + "France": { + "Isère": { + "Grenoble": "18.44 km2" + }, + "Nord": { + "Tourcoing": "15.19 km2" + }, + }, + }, + } + + create_ds_args = {'compression': "gzip", + 'shuffle': True, + 'fletcher32': True} + + dicttoh5(city_area, "cities.h5", h5path="/area", + create_dataset_args=create_ds_args) + """ + if h5py_missing: + raise h5py_import_error + + if not isinstance(h5file, h5py.File): + h5f = h5py.File(h5file, mode) + else: + h5f = h5file + + if not h5path.endswith("/"): + h5path += "/" + + for key in treedict: + + if isinstance(treedict[key], dict) and len(treedict[key]): + # non-empty group: recurse + dicttoh5(treedict[key], h5f, h5path + key, + overwrite_data=overwrite_data, + create_dataset_args=create_dataset_args) + + elif treedict[key] is None or (isinstance(treedict[key], dict) and + not len(treedict[key])): + # Create empty group + h5f.create_group(h5path + key) + + else: + ds = _prepare_hdf5_dataset(treedict[key]) + # can't apply filters on scalars (datasets with shape == () ) + if ds.shape == () or create_dataset_args is None: + h5f.create_dataset(h5path + key, + data=ds) + else: + h5f.create_dataset(h5path + key, + data=ds, + **create_dataset_args) + + if isinstance(h5file, string_types): + h5f.close() + + +def _name_contains_string_in_list(name, strlist): + if strlist is None: + return False + for filter_str in strlist: + if filter_str in name: + return True + return False + + +def h5todict(h5file, path="/", exclude_names=None): + """Read a HDF5 file and return a nested dictionary with the complete file + structure and all data. + + Example of usage:: + + from silx.io.dictdump import h5todict + + # initialize dict with file header and scan header + header94 = h5todict("oleg.dat", + "/94.1/instrument/specfile") + # add positioners subdict + header94["positioners"] = h5todict("oleg.dat", + "/94.1/instrument/positioners") + # add scan data without mca data + header94["detector data"] = h5todict("oleg.dat", + "/94.1/measurement", + exclude_names="mca_") + + + .. note:: This function requires `h5py <http://www.h5py.org/>`_ to be + installed. + + .. note:: If you write a dictionary to a HDF5 file with + :func:`dicttoh5` and then read it back with :func:`h5todict`, data + types are not preserved. All values are cast to numpy arrays before + being written to file, and they are read back as numpy arrays (or + scalars). In some cases, you may find that a list of heterogeneous + data types is converted to a numpy array of strings. + + :param h5file: File name or :class:`h5py.File` object or spech5 file or + fabioh5 file. + :param str path: Name of HDF5 group to use as dictionary root level, + to read only a sub-group in the file + :param list[str] exclude_names: Groups and datasets whose name contains + a string in this list will be ignored. Default is None (ignore nothing) + :return: Nested dictionary + """ + if h5py_missing: + raise h5py_import_error + + if not is_file(h5file): + h5f = h5open(h5file) + else: + h5f = h5file + + ddict = {} + for key in h5f[path]: + if _name_contains_string_in_list(key, exclude_names): + continue + if is_group(h5f[path + "/" + key]): + ddict[key] = h5todict(h5f, + path + "/" + key, + exclude_names=exclude_names) + else: + # Convert HDF5 dataset to numpy array + ddict[key] = h5f[path + "/" + key][...] + + if not is_file(h5file): + # close file, if we opened it + h5f.close() + + return ddict + + +def dicttojson(ddict, jsonfile, indent=None, mode="w"): + """Serialize ``ddict`` as a JSON formatted stream to ``jsonfile``. + + :param ddict: Dictionary (or any object compatible with ``json.dump``). + :param jsonfile: JSON file name or file-like object. + If a file name is provided, the function opens the file in the + specified mode and closes it again. + :param indent: If indent is a non-negative integer, then JSON array + elements and object members will be pretty-printed with that indent + level. An indent level of ``0`` will only insert newlines. + ``None`` (the default) selects the most compact representation. + :param mode: File opening mode (``w``, ``a``, ``w+``…) + """ + if not hasattr(jsonfile, "write"): + jsonf = open(jsonfile, mode) + else: + jsonf = jsonfile + + json.dump(ddict, jsonf, indent=indent) + + if not hasattr(jsonfile, "write"): + jsonf.close() + + +def dicttoini(ddict, inifile, mode="w"): + """Output dict as configuration file (similar to Microsoft Windows INI). + + :param dict: Dictionary of configuration parameters + :param inifile: INI file name or file-like object. + If a file name is provided, the function opens the file in the + specified mode and closes it again. + :param mode: File opening mode (``w``, ``a``, ``w+``…) + """ + if not hasattr(inifile, "write"): + inif = open(inifile, mode) + else: + inif = inifile + + ConfigDict(initdict=ddict).write(inif) + + if not hasattr(inifile, "write"): + inif.close() + + +def dump(ddict, ffile, mode="w", fmat=None): + """Dump dictionary to a file + + :param ddict: Dictionary with string keys + :param ffile: File name or file-like object with a ``write`` method + :param str fmat: Output format: ``"json"``, ``"hdf5"`` or ``"ini"``. + When None (the default), it uses the filename extension as the format. + Dumping to a HDF5 file requires `h5py <http://www.h5py.org/>`_ to be + installed. + :param str mode: File opening mode (``w``, ``a``, ``w+``…) + Default is *"w"*, write mode, overwrite if exists. + :raises IOError: if file format is not supported + """ + if fmat is None: + # If file-like object get its name, else use ffile as filename + filename = getattr(ffile, 'name', ffile) + fmat = os.path.splitext(filename)[1][1:] # Strip extension leading '.' + fmat = fmat.lower() + + if fmat == "json": + dicttojson(ddict, ffile, indent=2, mode=mode) + elif fmat in ["hdf5", "h5"]: + if h5py_missing: + logger.error("Cannot dump to HDF5 format, missing h5py library") + raise h5py_import_error + dicttoh5(ddict, ffile, mode=mode) + elif fmat in ["ini", "cfg"]: + dicttoini(ddict, ffile, mode=mode) + else: + raise IOError("Unknown format " + fmat) + + +def load(ffile, fmat=None): + """Load dictionary from a file + + When loading from a JSON or INI file, an OrderedDict is returned to + preserve the values' insertion order. + + :param ffile: File name or file-like object with a ``read`` method + :param fmat: Input format: ``json``, ``hdf5`` or ``ini``. + When None (the default), it uses the filename extension as the format. + Loading from a HDF5 file requires `h5py <http://www.h5py.org/>`_ to be + installed. + :return: Dictionary (ordered dictionary for JSON and INI) + :raises IOError: if file format is not supported + """ + if not hasattr(ffile, "read"): + f = open(ffile, "r") + fname = ffile + else: + f = ffile + fname = ffile.name + + if fmat is None: # Use file extension as format + fmat = os.path.splitext(fname)[1][1:] # Strip extension leading '.' + fmat = fmat.lower() + + if fmat == "json": + return json.load(f, object_pairs_hook=OrderedDict) + if fmat in ["hdf5", "h5"]: + if h5py_missing: + logger.error("Cannot load from HDF5 format, missing h5py library") + raise h5py_import_error + return h5todict(fname) + elif fmat in ["ini", "cfg"]: + return ConfigDict(filelist=[fname]) + else: + raise IOError("Unknown format " + fmat) |