Diffstat (limited to 'src/silx/app/convert.py')
-rw-r--r-- | src/silx/app/convert.py | 601
1 files changed, 601 insertions, 0 deletions
diff --git a/src/silx/app/convert.py b/src/silx/app/convert.py
new file mode 100644
index 0000000..e20a448
--- /dev/null
+++ b/src/silx/app/convert.py
@@ -0,0 +1,601 @@
+# /*##########################################################################
+# Copyright (C) 2017-2021 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ############################################################################*/
+"""Convert silx supported data files into HDF5 files"""
+
+__authors__ = ["P. Knobel"]
+__license__ = "MIT"
+__date__ = "05/02/2019"
+
+import ast
+import os
+import argparse
+from glob import glob
+import logging
+import re
+import time
+import numpy
+
+import silx.io
+from silx.io.specfile import is_specfile
+from silx.io.fioh5 import is_fiofile
+from silx.io import fabioh5
+
+_logger = logging.getLogger(__name__)
+"""Module logger"""
+
+
+def c_format_string_to_re(pattern_string):
+    """Convert a C-style format string with integer patterns into an
+    equivalent regular expression.
+
+    :param pattern_string: C style format string with integer patterns
+        (e.g. "%d", "%04d").
+        Not supported: fixed length padded with whitespaces (e.g "%4d", "%-4d")
+    :return: Equivalent regular expression (e.g. "\\d+", "\\d{4}")
+    """
+    # escape dots and backslashes
+    pattern_string = pattern_string.replace("\\", "\\\\")
+    pattern_string = pattern_string.replace(".", r"\.")
+
+    # %d
+    pattern_string = pattern_string.replace("%d", r"([-+]?\d+)")
+
+    # %0nd
+    for sub_pattern in re.findall(r"%0\d+d", pattern_string):
+        n = int(re.search(r"%0(\d+)d", sub_pattern).group(1))
+        if n == 1:
+            re_sub_pattern = r"([+-]?\d)"
+        else:
+            re_sub_pattern = r"([\d+-]\d{%d})" % (n - 1)
+        pattern_string = pattern_string.replace(sub_pattern, re_sub_pattern, 1)
+
+    return pattern_string
+
+
+def drop_indices_before_begin(filenames, regex, begin):
+    """Filter a file series, keeping only filenames whose indices are all
+    greater than or equal to the begin indices.
+
+    :param List[str] filenames: list of filenames
+    :param str regex: Regexp used to find indices in a filename
+    :param str begin: Comma separated list of begin indices
+    :return: List of filenames with only indices >= begin
+    """
+    begin_indices = list(map(int, begin.split(",")))
+    output_filenames = []
+    for fname in filenames:
+        m = re.match(regex, fname)
+        file_indices = list(map(int, m.groups()))
+        if len(file_indices) != len(begin_indices):
+            raise IOError(
+                "Number of indices found in filename "
+                "does not match number of parsed begin indices."
+            )
+        good_indices = True
+        for i, fidx in enumerate(file_indices):
+            if fidx < begin_indices[i]:
+                good_indices = False
+        if good_indices:
+            output_filenames.append(fname)
+    return output_filenames
+
+
+def drop_indices_after_end(filenames, regex, end):
+    """Filter a file series, keeping only filenames whose indices are all
+    less than or equal to the end indices.
+
+    :param List[str] filenames: list of filenames
+    :param str regex: Regexp used to find indices in a filename
+    :param str end: Comma separated list of end indices
+    :return: List of filenames with only indices <= end
+    """
+    end_indices = list(map(int, end.split(",")))
+    output_filenames = []
+    for fname in filenames:
+        m = re.match(regex, fname)
+        file_indices = list(map(int, m.groups()))
+        if len(file_indices) != len(end_indices):
+            raise IOError(
+                "Number of indices found in filename "
+                "does not match number of parsed end indices."
+            )
+        good_indices = True
+        for i, fidx in enumerate(file_indices):
+            if fidx > end_indices[i]:
+                good_indices = False
+        if good_indices:
+            output_filenames.append(fname)
+    return output_filenames
+
+
+def are_files_missing_in_series(filenames, regex):
+    """Return True if any file is missing in a list of filenames
+    that are supposed to follow a pattern.
+
+    :param List[str] filenames: list of filenames
+    :param str regex: Regexp used to find indices in a filename
+    :return: boolean
+    :raises AssertionError: if a filename does not match the regexp
+    """
+    previous_indices = None
+    for fname in filenames:
+        m = re.match(regex, fname)
+        assert m is not None, "regex %s does not match filename %s" % (regex, fname)
+        new_indices = list(map(int, m.groups()))
+        if previous_indices is not None:
+            for old_idx, new_idx in zip(previous_indices, new_indices):
+                if (new_idx - old_idx) > 1:
+                    _logger.error(
+                        "Index increment > 1 in file series: "
+                        "previous idx %d, next idx %d",
+                        old_idx,
+                        new_idx,
+                    )
+                    return True
+        previous_indices = new_indices
+    return False
+
+
+def are_all_specfile(filenames):
+    """Return True if all files in a list are SPEC files.
+    :param List[str] filenames: list of filenames
+    """
+    for fname in filenames:
+        if not is_specfile(fname):
+            return False
+    return True
+
+
+def contains_specfile(filenames):
+    """Return True if any file in a list is a SPEC file.
+    :param List[str] filenames: list of filenames
+    """
+    for fname in filenames:
+        if is_specfile(fname):
+            return True
+    return False
+
+
+def contains_fiofile(filenames):
+    """Return True if any file in a list is a FIO file.
+    :param List[str] filenames: list of filenames
+    """
+    for fname in filenames:
+        if is_fiofile(fname):
+            return True
+    return False
+
+
+def are_all_fiofile(filenames):
+    """Return True if all files in a list are FIO files.
+    :param List[str] filenames: list of filenames
+    """
+    for fname in filenames:
+        if not is_fiofile(fname):
+            return False
+    return True
+
+
+def main(argv):
+    """
+    Main function to launch the converter as an application
+
+    :param argv: Command line arguments
+    :returns: exit status
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "input_files",
+        nargs="*",
+        help="Input files (EDF, TIFF, FIO, SPEC...). When specifying "
+        "multiple files, you cannot specify both fabio images "
+        "and SPEC (or FIO) files. Multiple SPEC or FIO files will "
+        "simply be concatenated, with one entry per scan. "
+        "Multiple image files will be merged into a single "
+        "entry with a stack of images.",
+    )
+    # input_files and --file-pattern are mutually exclusive
+    parser.add_argument(
+        "--file-pattern",
+        help="File name pattern for loading a series of indexed image files "
+        "(toto_%%04d.edf). This argument is incompatible with argument "
+        "input_files. If an output URI with an HDF5 path is provided, "
+        "only the content of the NXdetector group will be copied there. "
+        'If no HDF5 path, or just "/", is given, a complete NXdata '
+        "structure will be created.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-uri",
+        default=time.strftime("%Y%m%d-%H%M%S") + ".h5",
+        help="Output file name (HDF5). A URI can be provided to write"
+        " the data into a specific group in the output file: "
+        "/path/to/file::/path/to/group. "
+        "If not provided, the filename defaults to a timestamp:"
+        " YYYYmmdd-HHMMSS.h5",
+    )
+    parser.add_argument(
+        "-m",
+        "--mode",
+        default="w-",
+        help='Write mode: "r+" (read/write, file must exist), '
+        '"w" (write, existing file is lost), '
+        '"w-" (write, fail if file exists) or '
+        '"a" (read/write if exists, create otherwise)',
+    )
+    parser.add_argument(
+        "--begin",
+        help="First file index, or first file indices to be considered. "
+        "This argument only makes sense when used together with "
+        "--file-pattern. Provide as many start indices as there "
+        "are indices in the file pattern, separated by commas. "
+        'Examples: "--file-pattern toto_%%d.edf --begin 100", '
+        ' "--file-pattern toto_%%d_%%04d_%%02d.edf --begin 100,2000,5".',
+    )
+    parser.add_argument(
+        "--end",
+        help="Last file index, or last file indices to be considered. "
+        "The same rules as with argument --begin apply. "
+        'Example: "--file-pattern toto_%%d_%%d.edf --end 199,1999"',
+    )
+    parser.add_argument(
+        "--add-root-group",
+        action="store_true",
+        help="This option causes each input file to be written to a "
+        "specific root group with the same name as the file. When "
+        "merging multiple input files, this can help prevent conflicts"
+        " when datasets have the same name (see --overwrite-data). "
+        "This option is ignored when using --file-pattern.",
+    )
+    parser.add_argument(
+        "--overwrite-data",
+        action="store_true",
+        help="If the output path exists and an input dataset has the same"
+        " name as an existing output dataset, overwrite the output "
+        'dataset (in modes "r+" or "a").',
+    )
+    parser.add_argument(
+        "--min-size",
+        type=int,
+        default=500,
+        help="Minimum number of elements required to be in a dataset to "
+        "apply compression or chunking (default 500).",
+    )
+    parser.add_argument(
+        "--chunks",
+        nargs="?",
+        const="auto",
+        help="Chunk shape. Provide an argument that evaluates as a python "
+        'tuple (e.g. "(1024, 768)"). If this option is provided without '
+        "specifying an argument, the h5py library will guess a chunk for "
+        "you. Note that if you specify an explicit chunking shape, it "
+        "will be applied identically to all datasets with a large enough "
+        "size (see --min-size).",
+    )
+    parser.add_argument(
+        "--compression",
+        nargs="?",
+        const="gzip",
+        help="Compression filter. By default, the datasets in the output "
+        "file are not compressed. If this option is specified without "
+        "argument, the GZIP compression is used. Additional compression "
+        "filters may be available, depending on your HDF5 installation.",
+    )
+
+    def check_gzip_compression_opts(value):
+        ivalue = int(value)
+        if ivalue < 0 or ivalue > 9:
+            raise argparse.ArgumentTypeError(
+                "--compression-opts must be an int from 0 to 9"
+            )
+        return ivalue
+
+    parser.add_argument(
+        "--compression-opts",
+        type=check_gzip_compression_opts,
+        help='Compression options. For "gzip", this may be an integer from '
+        "0 to 9, with a default of 4. This is only supported for GZIP.",
+    )
+    parser.add_argument(
+        "--shuffle",
+        action="store_true",
+        help="Enables the byte shuffle filter. This may improve the compression "
+        "ratio for block oriented compressors like GZIP or LZF.",
+    )
+    parser.add_argument(
+        "--fletcher32",
+        action="store_true",
+        help="Adds a checksum to each chunk to detect data corruption.",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        default=False,
+        help="Set logging system in debug mode",
+    )
+
+    options = parser.parse_args(argv[1:])
+
+    if options.debug:
+        logging.root.setLevel(logging.DEBUG)
+
+    # Import after parsing --debug
+    try:
+        # it should be loaded before h5py
+        import hdf5plugin  # noqa
+    except ImportError:
+        _logger.debug("Backtrace", exc_info=True)
+        hdf5plugin = None
+
+    import h5py
+
+    try:
+        from silx.io.convert import write_to_h5
+    except ImportError:
+        _logger.debug("Backtrace", exc_info=True)
+        write_to_h5 = None
+
+    if hdf5plugin is None:
+        message = (
+            "Module 'hdf5plugin' is not installed. It provides additional HDF5"
+            + ' compression filters. You can install it using "pip install hdf5plugin".'
+        )
+        _logger.debug(message)
+
+    # Process input arguments (mutually exclusive arguments)
+    if bool(options.input_files) == bool(options.file_pattern is not None):
+        if not options.input_files:
+            message = "You must specify either input files (at least one), "
+            message += "or a file pattern."
+        else:
+            message = "You cannot specify input files and a file pattern"
+            message += " at the same time."
+        _logger.error(message)
+        return -1
+    elif options.input_files:
+        # some shells (windows) don't interpret wildcard characters (*, ?, [])
+        old_input_list = list(options.input_files)
+        options.input_files = []
+        for fname in old_input_list:
+            globbed_files = glob(fname)
+            if not globbed_files:
+                # no files found, keep the name as it is, to raise an error later
+                options.input_files += [fname]
+            else:
+                # glob does not sort files, but the bash shell does
+                options.input_files += sorted(globbed_files)
+    else:
+        # File series
+        dirname = os.path.dirname(options.file_pattern)
+        file_pattern_re = c_format_string_to_re(options.file_pattern) + "$"
+        files_in_dir = glob(os.path.join(dirname, "*"))
+        _logger.debug(
+            """
+            Processing file_pattern
+            dirname: %s
+            file_pattern_re: %s
+            files_in_dir: %s
+            """,
+            dirname,
+            file_pattern_re,
+            files_in_dir,
+        )
+
+        options.input_files = sorted(
+            list(filter(lambda name: re.match(file_pattern_re, name), files_in_dir))
+        )
+        _logger.debug("options.input_files: %s", options.input_files)
+
+        if options.begin is not None:
+            options.input_files = drop_indices_before_begin(
+                options.input_files, file_pattern_re, options.begin
+            )
+            _logger.debug(
+                "options.input_files after applying --begin: %s", options.input_files
+            )
+
+        if options.end is not None:
+            options.input_files = drop_indices_after_end(
+                options.input_files, file_pattern_re, options.end
+            )
+            _logger.debug(
+                "options.input_files after applying --end: %s", options.input_files
+            )
+
+        if are_files_missing_in_series(options.input_files, file_pattern_re):
+            _logger.error("File missing in the file series. Aborting.")
+            return -1
+
+        if not options.input_files:
+            _logger.error("No file matching --file-pattern found.")
+            return -1
+
+    # Test that the output path is writeable
+    if "::" in options.output_uri:
+        output_name, hdf5_path = options.output_uri.split("::")
+    else:
+        output_name, hdf5_path = options.output_uri, "/"
+
+    if os.path.isfile(output_name):
+        if options.mode == "w-":
+            _logger.error(
+                "Output file %s exists and mode is 'w-' (default)."
+                " Aborting. To append data to an existing file, "
+                "use 'a' or 'r+'.",
+                output_name,
+            )
+            return -1
+        elif not os.access(output_name, os.W_OK):
+            _logger.error("Output file %s exists and is not writeable.", output_name)
+            return -1
+        elif options.mode == "w":
+            _logger.info(
+                "Output file %s exists and mode is 'w'. Overwriting existing file.",
+                output_name,
+            )
+        elif options.mode in ["a", "r+"]:
+            _logger.info("Appending data to existing file %s.", output_name)
+    else:
+        if options.mode == "r+":
+            _logger.error(
+                "Output file %s does not exist and mode is 'r+'"
+                " (append, file must exist). Aborting.",
+                output_name,
+            )
+            return -1
+        else:
+            _logger.info("Creating new output file %s.", output_name)
+
+    # Test that all input files exist and are readable
+    bad_input = False
+    for fname in options.input_files:
+        if not os.access(fname, os.R_OK):
+            _logger.error("Cannot read input file %s.", fname)
+            bad_input = True
+    if bad_input:
+        _logger.error("Aborting.")
+        return -1
+
+    # create_dataset special args
+    create_dataset_args = {}
+    if options.chunks is not None:
+        if options.chunks.lower() in ["auto", "true"]:
+            create_dataset_args["chunks"] = True
+        else:
+            try:
+                chunks = ast.literal_eval(options.chunks)
+            except (ValueError, SyntaxError):
+                _logger.error("Invalid --chunks argument %s", options.chunks)
+                return -1
+            if not isinstance(chunks, (tuple, list)):
+                _logger.error("--chunks argument does not evaluate to a tuple")
+                return -1
+            else:
+                nitems = numpy.prod(chunks)
+                nbytes = nitems * 8
+                if nbytes > 10**6:
+                    _logger.warning(
+                        "Requested chunk size might be larger than"
+                        " the default 1MB chunk cache, for float64"
+                        " data. This can dramatically affect I/O "
+                        "performance."
+                    )
+                create_dataset_args["chunks"] = chunks
+
+    if options.compression is not None:
+        try:
+            compression = int(options.compression)
+        except ValueError:
+            compression = options.compression
+        create_dataset_args["compression"] = compression
+
+    if options.compression_opts is not None:
+        create_dataset_args["compression_opts"] = options.compression_opts
+
+    if options.shuffle:
+        create_dataset_args["shuffle"] = True
+
+    if options.fletcher32:
+        create_dataset_args["fletcher32"] = True
+
+    if (
+        len(options.input_files) > 1
+        and not contains_specfile(options.input_files)
+        and not contains_fiofile(options.input_files)
+        and not options.add_root_group
+    ) or options.file_pattern is not None:
+        # File series -> stack of images
+        input_group = fabioh5.File(file_series=options.input_files)
+        if hdf5_path != "/":
+            # we want to append only data and headers to an existing file
+            input_group = input_group["/scan_0/instrument/detector_0"]
+        with h5py.File(output_name, mode=options.mode) as h5f:
+            write_to_h5(
+                input_group,
+                h5f,
+                h5path=hdf5_path,
+                overwrite_data=options.overwrite_data,
+                create_dataset_args=create_dataset_args,
+                min_size=options.min_size,
+            )
+
+    elif (
+        len(options.input_files) == 1
+        or are_all_specfile(options.input_files)
+        or are_all_fiofile(options.input_files)
+        or options.add_root_group
+    ):
+        # single file, or SPEC/FIO files
+        h5paths_and_groups = []
+        for input_name in options.input_files:
+            hdf5_path_for_file = hdf5_path
+            if options.add_root_group:
+                hdf5_path_for_file = (
+                    hdf5_path.rstrip("/") + "/" + os.path.basename(input_name)
+                )
+            try:
+                h5paths_and_groups.append(
+                    (hdf5_path_for_file, silx.io.open(input_name))
+                )
+            except IOError:
+                _logger.error(
+                    "Cannot read file %s. If this is a file format "
+                    "supported by the fabio library, you can try to"
+                    " install fabio (`pip install fabio`)."
+                    " Aborting conversion.",
+                    input_name,
+                )
+                return -1
+
+        with h5py.File(output_name, mode=options.mode) as h5f:
+            for hdf5_path_for_file, input_group in h5paths_and_groups:
+                write_to_h5(
+                    input_group,
+                    h5f,
+                    h5path=hdf5_path_for_file,
+                    overwrite_data=options.overwrite_data,
+                    create_dataset_args=create_dataset_args,
+                    min_size=options.min_size,
+                )
+
+    else:
+        # multiple files: SPEC and fabio images mixed
+        _logger.error(
+            "Multiple files with incompatible formats specified. "
+            "You can provide multiple SPEC files or multiple image "
+            "files, but not both."
+        )
+        return -1
+
+    with h5py.File(output_name, mode="r+") as h5f:
+        # append "silx convert" to the creator attribute, for NeXus files
+        previous_creator = h5f.attrs.get("creator", "")
+        creator = "silx convert (v%s)" % silx.version
+        # only if it is not already there
+        if creator not in previous_creator:
+            if not previous_creator:
+                new_creator = creator
+            else:
+                new_creator = previous_creator + "; " + creator
+            h5f.attrs["creator"] = numpy.array(
+                new_creator, dtype=h5py.special_dtype(vlen=str)
+            )
+
+    return 0
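
The pattern handling above is worth a worked example: c_format_string_to_re()
turns a C-style format such as "%04d" into a capturing regular expression, and
drop_indices_before_begin() / drop_indices_after_end() then filter the series
on the captured indices. A minimal sketch (the file names are invented for
illustration):

    import re
    from silx.app.convert import (
        c_format_string_to_re,
        drop_indices_before_begin,
        drop_indices_after_end,
    )

    # "%04d" becomes a group matching a sign or digit followed by 3 digits
    regex = c_format_string_to_re("img_%04d.edf") + "$"
    assert re.match(regex, "img_0042.edf").group(1) == "0042"

    # Hypothetical series: keep only indices 2 to 3
    names = ["img_0001.edf", "img_0002.edf", "img_0003.edf", "img_0004.edf"]
    names = drop_indices_before_begin(names, regex, begin="2")
    names = drop_indices_after_end(names, regex, end="3")
    print(names)  # ['img_0002.edf', 'img_0003.edf']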
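
The entry point can also be driven programmatically by passing an argv-style
list to main(), mirroring the command line. A sketch with placeholder file
names and indices (assumes silx is installed and the EDF series exists):

    from silx.app.convert import main

    # Equivalent to:
    #   silx convert --file-pattern toto_%04d.edf --begin 100 --end 199 \
    #                --compression gzip --shuffle -o series.h5::/entry/data
    status = main(
        [
            "silx-convert",  # argv[0]: program name, skipped by the parser
            "--file-pattern", "toto_%04d.edf",
            "--begin", "100",
            "--end", "199",
            "--compression", "gzip",
            "--shuffle",
            "-o", "series.h5::/entry/data",
        ]
    )
    print(status)  # 0 on success, -1 on error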
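
The --chunks, --compression, --shuffle and --fletcher32 options are collected
into create_dataset_args and forwarded to h5py for every dataset that reaches
--min-size elements. A standalone sketch of what those keywords mean to h5py's
create_dataset (file and dataset names are illustrative):

    import h5py
    import numpy

    create_dataset_args = {
        "chunks": (1, 1024, 1024),
        "compression": "gzip",
        "compression_opts": 4,  # gzip level, matching --compression-opts
        "shuffle": True,
        "fletcher32": True,
    }
    with h5py.File("demo.h5", "w") as h5f:
        h5f.create_dataset(
            "data",
            data=numpy.zeros((10, 1024, 1024), dtype=numpy.float32),
            **create_dataset_args,
        )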
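
Finally, the "creator" attribute written at the end of main() can be checked
after a conversion; a quick sketch (the output file name is a placeholder, and
the version in the attribute depends on the installed silx):

    import h5py

    with h5py.File("series.h5", "r") as h5f:
        print(h5f.attrs.get("creator"))  # e.g. 'silx convert (v1.1.0)'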