# coding: utf-8 # /*########################################################################## # Copyright (C) 2017-2018 European Synchrotron Radiation Facility # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # # ############################################################################*/ """Convert silx supported data files into HDF5 files""" __authors__ = ["P. Knobel"] __license__ = "MIT" __date__ = "05/02/2019" import ast import os import argparse from glob import glob import logging import re import time import numpy import six import silx.io from silx.io.specfile import is_specfile from silx.io import fabioh5 _logger = logging.getLogger(__name__) """Module logger""" def c_format_string_to_re(pattern_string): """ :param pattern_string: C style format string with integer patterns (e.g. "%d", "%04d"). Not supported: fixed length padded with whitespaces (e.g "%4d", "%-4d") :return: Equivalent regular expression (e.g. "\\d+", "\\d{4}") """ # escape dots and backslashes pattern_string = pattern_string.replace("\\", "\\\\") pattern_string = pattern_string.replace(".", r"\.") # %d pattern_string = pattern_string.replace("%d", r"([-+]?\d+)") # %0nd for sub_pattern in re.findall(r"%0\d+d", pattern_string): n = int(re.search(r"%0(\d+)d", sub_pattern).group(1)) if n == 1: re_sub_pattern = r"([+-]?\d)" else: re_sub_pattern = r"([\d+-]\d{%d})" % (n - 1) pattern_string = pattern_string.replace(sub_pattern, re_sub_pattern, 1) return pattern_string def drop_indices_before_begin(filenames, regex, begin): """ :param List[str] filenames: list of filenames :param str regex: Regexp used to find indices in a filename :param str begin: Comma separated list of begin indices :return: List of filenames with only indices >= begin """ begin_indices = list(map(int, begin.split(","))) output_filenames = [] for fname in filenames: m = re.match(regex, fname) file_indices = list(map(int, m.groups())) if len(file_indices) != len(begin_indices): raise IOError("Number of indices found in filename " "does not match number of parsed end indices.") good_indices = True for i, fidx in enumerate(file_indices): if fidx < begin_indices[i]: good_indices = False if good_indices: output_filenames.append(fname) return output_filenames def drop_indices_after_end(filenames, regex, end): """ :param List[str] filenames: list of filenames :param str regex: Regexp used to find indices in a filename :param str end: Comma separated list of end indices :return: List of filenames with only indices <= end """ end_indices = list(map(int, end.split(","))) output_filenames = [] for fname in filenames: m = re.match(regex, fname) file_indices = list(map(int, m.groups())) if len(file_indices) != len(end_indices): raise IOError("Number of indices found in filename " "does not match number of parsed end indices.") good_indices = True for i, fidx in enumerate(file_indices): if fidx > end_indices[i]: good_indices = False if good_indices: output_filenames.append(fname) return output_filenames def are_files_missing_in_series(filenames, regex): """Return True if any file is missing in a list of filenames that are supposed to follow a pattern. :param List[str] filenames: list of filenames :param str regex: Regexp used to find indices in a filename :return: boolean :raises AssertionError: if a filename does not match the regexp """ previous_indices = None for fname in filenames: m = re.match(regex, fname) assert m is not None, \ "regex %s does not match filename %s" % (fname, regex) new_indices = list(map(int, m.groups())) if previous_indices is not None: for old_idx, new_idx in zip(previous_indices, new_indices): if (new_idx - old_idx) > 1: _logger.error("Index increment > 1 in file series: " "previous idx %d, next idx %d", old_idx, new_idx) return True previous_indices = new_indices return False def are_all_specfile(filenames): """Return True if all files in a list are SPEC files. :param List[str] filenames: list of filenames """ for fname in filenames: if not is_specfile(fname): return False return True def contains_specfile(filenames): """Return True if any file in a list are SPEC files. :param List[str] filenames: list of filenames """ for fname in filenames: if is_specfile(fname): return True return False def main(argv): """ Main function to launch the converter as an application :param argv: Command line arguments :returns: exit status """ parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( 'input_files', nargs="*", help='Input files (EDF, TIFF, SPEC...). When specifying multiple ' 'files, you cannot specify both fabio images and SPEC files. ' 'Multiple SPEC files will simply be concatenated, with one ' 'entry per scan. Multiple image files will be merged into ' 'a single entry with a stack of images.') # input_files and --filepattern are mutually exclusive parser.add_argument( '--file-pattern', help='File name pattern for loading a series of indexed image files ' '(toto_%%04d.edf). This argument is incompatible with argument ' 'input_files. If an output URI with a HDF5 path is provided, ' 'only the content of the NXdetector group will be copied there. ' 'If no HDF5 path, or just "/", is given, a complete NXdata ' 'structure will be created.') parser.add_argument( '-o', '--output-uri', default=time.strftime("%Y%m%d-%H%M%S") + '.h5', help='Output file name (HDF5). An URI can be provided to write' ' the data into a specific group in the output file: ' '/path/to/file::/path/to/group. ' 'If not provided, the filename defaults to a timestamp:' ' YYYYmmdd-HHMMSS.h5') parser.add_argument( '-m', '--mode', default="w-", help='Write mode: "r+" (read/write, file must exist), ' '"w" (write, existing file is lost), ' '"w-" (write, fail if file exists) or ' '"a" (read/write if exists, create otherwise)') parser.add_argument( '--begin', help='First file index, or first file indices to be considered. ' 'This argument only makes sense when used together with ' '--file-pattern. Provide as many start indices as there ' 'are indices in the file pattern, separated by commas. ' 'Examples: "--filepattern toto_%%d.edf --begin 100", ' ' "--filepattern toto_%%d_%%04d_%%02d.edf --begin 100,2000,5".') parser.add_argument( '--end', help='Last file index, or last file indices to be considered. ' 'The same rules as with argument --begin apply. ' 'Example: "--filepattern toto_%%d_%%d.edf --end 199,1999"') parser.add_argument( '--add-root-group', action="store_true", help='This option causes each input file to be written to a ' 'specific root group with the same name as the file. When ' 'merging multiple input files, this can help preventing conflicts' ' when datasets have the same name (see --overwrite-data). ' 'This option is ignored when using --file-pattern.') parser.add_argument( '--overwrite-data', action="store_true", help='If the output path exists and an input dataset has the same' ' name as an existing output dataset, overwrite the output ' 'dataset (in modes "r+" or "a").') parser.add_argument( '--min-size', type=int, default=500, help='Minimum number of elements required to be in a dataset to ' 'apply compression or chunking (default 500).') parser.add_argument( '--chunks', nargs="?", const="auto", help='Chunk shape. Provide an argument that evaluates as a python ' 'tuple (e.g. "(1024, 768)"). If this option is provided without ' 'specifying an argument, the h5py library will guess a chunk for ' 'you. Note that if you specify an explicit chunking shape, it ' 'will be applied identically to all datasets with a large enough ' 'size (see --min-size). ') parser.add_argument( '--compression', nargs="?", const="gzip", help='Compression filter. By default, the datasets in the output ' 'file are not compressed. If this option is specified without ' 'argument, the GZIP compression is used. Additional compression ' 'filters may be available, depending on your HDF5 installation.') def check_gzip_compression_opts(value): ivalue = int(value) if ivalue < 0 or ivalue > 9: raise argparse.ArgumentTypeError( "--compression-opts must be an int from 0 to 9") return ivalue parser.add_argument( '--compression-opts', type=check_gzip_compression_opts, help='Compression options. For "gzip", this may be an integer from ' '0 to 9, with a default of 4. This is only supported for GZIP.') parser.add_argument( '--shuffle', action="store_true", help='Enables the byte shuffle filter. This may improve the compression ' 'ratio for block oriented compressors like GZIP or LZF.') parser.add_argument( '--fletcher32', action="store_true", help='Adds a checksum to each chunk to detect data corruption.') parser.add_argument( '--debug', action="store_true", default=False, help='Set logging system in debug mode') options = parser.parse_args(argv[1:]) if options.debug: logging.root.setLevel(logging.DEBUG) # Import after parsing --debug try: # it should be loaded before h5py import hdf5plugin # noqa except ImportError: _logger.debug("Backtrace", exc_info=True) hdf5plugin = None import h5py try: from silx.io.convert import write_to_h5 except ImportError: _logger.debug("Backtrace", exc_info=True) write_to_h5 = None if hdf5plugin is None: message = "Module 'hdf5plugin' is not installed. It supports additional hdf5"\ + " compressions. You can install it using \"pip install hdf5plugin\"." _logger.debug(message) # Process input arguments (mutually exclusive arguments) if bool(options.input_files) == bool(options.file_pattern is not None): if not options.input_files: message = "You must specify either input files (at least one), " message += "or a file pattern." else: message = "You cannot specify input files and a file pattern" message += " at the same time." _logger.error(message) return -1 elif options.input_files: # some shells (windows) don't interpret wildcard characters (*, ?, []) old_input_list = list(options.input_files) options.input_files = [] for fname in old_input_list: globbed_files = glob(fname) if not globbed_files: # no files found, keep the name as it is, to raise an error later options.input_files += [fname] else: # glob does not sort files, but the bash shell does options.input_files += sorted(globbed_files) else: # File series dirname = os.path.dirname(options.file_pattern) file_pattern_re = c_format_string_to_re(options.file_pattern) + "$" files_in_dir = glob(os.path.join(dirname, "*")) _logger.debug(""" Processing file_pattern dirname: %s file_pattern_re: %s files_in_dir: %s """, dirname, file_pattern_re, files_in_dir) options.input_files = sorted(list(filter(lambda name: re.match(file_pattern_re, name), files_in_dir))) _logger.debug("options.input_files: %s", options.input_files) if options.begin is not None: options.input_files = drop_indices_before_begin(options.input_files, file_pattern_re, options.begin) _logger.debug("options.input_files after applying --begin: %s", options.input_files) if options.end is not None: options.input_files = drop_indices_after_end(options.input_files, file_pattern_re, options.end) _logger.debug("options.input_files after applying --end: %s", options.input_files) if are_files_missing_in_series(options.input_files, file_pattern_re): _logger.error("File missing in the file series. Aborting.") return -1 if not options.input_files: _logger.error("No file matching --file-pattern found.") return -1 # Test that the output path is writeable if "::" in options.output_uri: output_name, hdf5_path = options.output_uri.split("::") else: output_name, hdf5_path = options.output_uri, "/" if os.path.isfile(output_name): if options.mode == "w-": _logger.error("Output file %s exists and mode is 'w-' (default)." " Aborting. To append data to an existing file, " "use 'a' or 'r+'.", output_name) return -1 elif not os.access(output_name, os.W_OK): _logger.error("Output file %s exists and is not writeable.", output_name) return -1 elif options.mode == "w": _logger.info("Output file %s exists and mode is 'w'. " "Overwriting existing file.", output_name) elif options.mode in ["a", "r+"]: _logger.info("Appending data to existing file %s.", output_name) else: if options.mode == "r+": _logger.error("Output file %s does not exist and mode is 'r+'" " (append, file must exist). Aborting.", output_name) return -1 else: _logger.info("Creating new output file %s.", output_name) # Test that all input files exist and are readable bad_input = False for fname in options.input_files: if not os.access(fname, os.R_OK): _logger.error("Cannot read input file %s.", fname) bad_input = True if bad_input: _logger.error("Aborting.") return -1 # create_dataset special args create_dataset_args = {} if options.chunks is not None: if options.chunks.lower() in ["auto", "true"]: create_dataset_args["chunks"] = True else: try: chunks = ast.literal_eval(options.chunks) except (ValueError, SyntaxError): _logger.error("Invalid --chunks argument %s", options.chunks) return -1 if not isinstance(chunks, (tuple, list)): _logger.error("--chunks argument str does not evaluate to a tuple") return -1 else: nitems = numpy.prod(chunks) nbytes = nitems * 8 if nbytes > 10**6: _logger.warning("Requested chunk size might be larger than" " the default 1MB chunk cache, for float64" " data. This can dramatically affect I/O " "performances.") create_dataset_args["chunks"] = chunks if options.compression is not None: try: compression = int(options.compression) except ValueError: compression = options.compression create_dataset_args["compression"] = compression if options.compression_opts is not None: create_dataset_args["compression_opts"] = options.compression_opts if options.shuffle: create_dataset_args["shuffle"] = True if options.fletcher32: create_dataset_args["fletcher32"] = True if (len(options.input_files) > 1 and not contains_specfile(options.input_files) and not options.add_root_group) or options.file_pattern is not None: # File series -> stack of images input_group = fabioh5.File(file_series=options.input_files) if hdf5_path != "/": # we want to append only data and headers to an existing file input_group = input_group["/scan_0/instrument/detector_0"] with h5py.File(output_name, mode=options.mode) as h5f: write_to_h5(input_group, h5f, h5path=hdf5_path, overwrite_data=options.overwrite_data, create_dataset_args=create_dataset_args, min_size=options.min_size) elif len(options.input_files) == 1 or \ are_all_specfile(options.input_files) or\ options.add_root_group: # single file, or spec files h5paths_and_groups = [] for input_name in options.input_files: hdf5_path_for_file = hdf5_path if options.add_root_group: hdf5_path_for_file = hdf5_path.rstrip("/") + "/" + os.path.basename(input_name) try: h5paths_and_groups.append((hdf5_path_for_file, silx.io.open(input_name))) except IOError: _logger.error("Cannot read file %s. If this is a file format " "supported by the fabio library, you can try to" " install fabio (`pip install fabio`)." " Aborting conversion.", input_name) return -1 with h5py.File(output_name, mode=options.mode) as h5f: for hdf5_path_for_file, input_group in h5paths_and_groups: write_to_h5(input_group, h5f, h5path=hdf5_path_for_file, overwrite_data=options.overwrite_data, create_dataset_args=create_dataset_args, min_size=options.min_size) else: # multiple file, SPEC and fabio images mixed _logger.error("Multiple files with incompatible formats specified. " "You can provide multiple SPEC files or multiple image " "files, but not both.") return -1 with h5py.File(output_name, mode="r+") as h5f: # append "silx convert" to the creator attribute, for NeXus files previous_creator = h5f.attrs.get("creator", u"") creator = "silx convert (v%s)" % silx.version # only if it not already there if creator not in previous_creator: if not previous_creator: new_creator = creator else: new_creator = previous_creator + "; " + creator h5f.attrs["creator"] = numpy.array( new_creator, dtype=h5py.special_dtype(vlen=six.text_type)) return 0