author     Picca Frédéric-Emmanuel <picca@debian.org>  2018-03-04 10:20:27 +0100
committer  Picca Frédéric-Emmanuel <picca@debian.org>  2018-03-04 10:20:27 +0100
commit     270d5ddc31c26b62379e3caa9044dd75ccc71847 (patch)
tree       55c5bfc851dfce7172d335cd2405b214323e3caf /silx/app/convert.py
parent     e19c96eff0c310c06c4f268c8b80cb33bd08996f (diff)

New upstream version 0.7.0+dfsg

Diffstat (limited to 'silx/app/convert.py')

-rw-r--r--  silx/app/convert.py  368
1 file changed, 315 insertions(+), 53 deletions(-)
diff --git a/silx/app/convert.py b/silx/app/convert.py
index a092ec1..cd48deb 100644
--- a/silx/app/convert.py
+++ b/silx/app/convert.py
@@ -1,6 +1,6 @@
 # coding: utf-8
 # /*##########################################################################
-# Copyright (C) 2017 European Synchrotron Radiation Facility
+# Copyright (C) 2017-2018 European Synchrotron Radiation Facility
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -24,13 +24,22 @@
 """Convert silx supported data files into HDF5 files"""
 
 import ast
-import sys
 import os
 import argparse
 from glob import glob
 import logging
 import numpy
-import silx
+import re
+import time
+
+import silx.io
+from silx.io.specfile import is_specfile
+from silx.third_party import six
+
+try:
+    from silx.io import fabioh5
+except ImportError:
+    fabioh5 = None
 
 
 __authors__ = ["P. Knobel"]
@@ -42,6 +51,129 @@
 _logger = logging.getLogger(__name__)
 """Module logger"""
 
+def c_format_string_to_re(pattern_string):
+    """
+
+    :param pattern_string: C style format string with integer patterns
+        (e.g. "%d", "%04d").
+        Not supported: fixed length padded with whitespaces (e.g. "%4d", "%-4d")
+    :return: Equivalent regular expression (e.g. "\d+", "\d{4}")
+    """
+    # escape dots and backslashes
+    pattern_string = pattern_string.replace("\\", "\\\\")
+    pattern_string = pattern_string.replace(".", "\.")
+
+    # %d
+    pattern_string = pattern_string.replace("%d", "([-+]?\d+)")
+
+    # %0nd
+    for sub_pattern in re.findall("%0\d+d", pattern_string):
+        n = int(re.search("%0(\d+)d", sub_pattern).group(1))
+        if n == 1:
+            re_sub_pattern = "([+-]?\d)"
+        else:
+            re_sub_pattern = "([\d+-]\d{%d})" % (n - 1)
+        pattern_string = pattern_string.replace(sub_pattern, re_sub_pattern, 1)
+
+    return pattern_string
+
+
+def drop_indices_before_begin(filenames, regex, begin):
+    """
+
+    :param List[str] filenames: list of filenames
+    :param str regex: Regexp used to find indices in a filename
+    :param str begin: Comma separated list of begin indices
+    :return: List of filenames with only indices >= begin
+    """
+    begin_indices = list(map(int, begin.split(",")))
+    output_filenames = []
+    for fname in filenames:
+        m = re.match(regex, fname)
+        file_indices = list(map(int, m.groups()))
+        if len(file_indices) != len(begin_indices):
+            raise IOError("Number of indices found in filename "
+                          "does not match number of parsed begin indices.")
+        good_indices = True
+        for i, fidx in enumerate(file_indices):
+            if fidx < begin_indices[i]:
+                good_indices = False
+        if good_indices:
+            output_filenames.append(fname)
+    return output_filenames
+
+
+def drop_indices_after_end(filenames, regex, end):
+    """
+
+    :param List[str] filenames: list of filenames
+    :param str regex: Regexp used to find indices in a filename
+    :param str end: Comma separated list of end indices
+    :return: List of filenames with only indices <= end
+    """
+    end_indices = list(map(int, end.split(",")))
+    output_filenames = []
+    for fname in filenames:
+        m = re.match(regex, fname)
+        file_indices = list(map(int, m.groups()))
+        if len(file_indices) != len(end_indices):
+            raise IOError("Number of indices found in filename "
+                          "does not match number of parsed end indices.")
+        good_indices = True
+        for i, fidx in enumerate(file_indices):
+            if fidx > end_indices[i]:
+                good_indices = False
+        if good_indices:
+            output_filenames.append(fname)
+    return output_filenames
+
+
+def are_files_missing_in_series(filenames, regex):
+    """Return True if any file is missing in a list of filenames
+    that are supposed to follow a pattern.
+
+    :param List[str] filenames: list of filenames
+    :param str regex: Regexp used to find indices in a filename
+    :return: boolean
+    :raises AssertionError: if a filename does not match the regexp
+    """
+    previous_indices = None
+    for fname in filenames:
+        m = re.match(regex, fname)
+        assert m is not None, \
+            "regex %s does not match filename %s" % (regex, fname)
+        new_indices = list(map(int, m.groups()))
+        if previous_indices is not None:
+            for old_idx, new_idx in zip(previous_indices, new_indices):
+                if (new_idx - old_idx) > 1:
+                    _logger.error("Index increment > 1 in file series: "
+                                  "previous idx %d, next idx %d",
+                                  old_idx, new_idx)
+                    return True
+        previous_indices = new_indices
+    return False
+
+
+def are_all_specfile(filenames):
+    """Return True if all files in a list are SPEC files.
+    :param List[str] filenames: list of filenames
+    """
+    for fname in filenames:
+        if not is_specfile(fname):
+            return False
+    return True
+
+
+def contains_specfile(filenames):
+    """Return True if any file in a list is a SPEC file.
+    :param List[str] filenames: list of filenames
+    """
+    for fname in filenames:
+        if is_specfile(fname):
+            return True
+    return False
+
+
 def main(argv):
     """
     Main function to launch the converter as an application
@@ -52,15 +184,29 @@ def main(argv):
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument(
         'input_files',
-        nargs="+",
-        help='Input files (EDF, SPEC)')
+        nargs="*",
+        help='Input files (EDF, TIFF, SPEC...). When specifying multiple '
+             'files, you cannot specify both fabio images and SPEC files. '
+             'Multiple SPEC files will simply be concatenated, with one '
+             'entry per scan. Multiple image files will be merged into '
+             'a single entry with a stack of images.')
+    # input_files and --file-pattern are mutually exclusive
+    parser.add_argument(
+        '--file-pattern',
+        help='File name pattern for loading a series of indexed image files '
+             '(toto_%%04d.edf). This argument is incompatible with argument '
+             'input_files. If an output URI with a HDF5 path is provided, '
+             'only the content of the NXdetector group will be copied there. '
+             'If no HDF5 path, or just "/", is given, a complete NXdata '
+             'structure will be created.')
     parser.add_argument(
         '-o', '--output-uri',
-        nargs="?",
-        help='Output file (HDF5). If omitted, it will be the '
-             'concatenated input file names, with a ".h5" suffix added.'
-             ' An URI can be provided to write the data into a specific '
-             'group in the output file: /path/to/file::/path/to/group')
+        default=time.strftime("%Y%m%d-%H%M%S") + '.h5',
+        help='Output file name (HDF5). An URI can be provided to write'
+             ' the data into a specific group in the output file: '
+             '/path/to/file::/path/to/group. '
+             'If not provided, the filename defaults to a timestamp:'
+             ' YYYYmmdd-HHMMSS.h5')
     parser.add_argument(
         '-m', '--mode',
         default="w-",
@@ -69,12 +215,26 @@ def main(argv):
              '"w-" (write, fail if file exists) or '
             '"a" (read/write if exists, create otherwise)')
     parser.add_argument(
-        '--no-root-group',
+        '--begin',
+        help='First file index, or first file indices to be considered. '
+             'This argument only makes sense when used together with '
+             '--file-pattern. Provide as many start indices as there '
+             'are indices in the file pattern, separated by commas. '
+             'Examples: "--file-pattern toto_%%d.edf --begin 100", '
+             ' "--file-pattern toto_%%d_%%04d_%%02d.edf --begin 100,2000,5".')
+    parser.add_argument(
+        '--end',
+        help='Last file index, or last file indices to be considered. '
+             'The same rules as with argument --begin apply. '
+             'Example: "--file-pattern toto_%%d_%%d.edf --end 199,1999"')
+    parser.add_argument(
+        '--add-root-group',
         action="store_true",
-        help='This option disables the default behavior of creating a '
-             'root group (entry) for each file to be converted. When '
-             'merging multiple input files, this can cause conflicts '
-             'when datasets have the same name (see --overwrite-data).')
+        help='This option causes each input file to be written to a '
+             'specific root group with the same name as the file. When '
+             'merging multiple input files, this can help prevent conflicts'
+             ' when datasets have the same name (see --overwrite-data). '
+             'This option is ignored when using --file-pattern.')
     parser.add_argument(
         '--overwrite-data',
         action="store_true",
@@ -121,7 +281,7 @@ def main(argv):
     parser.add_argument(
         '--shuffle',
         action="store_true",
-        help='Enables the byte shuffle filter, may improve the compression '
+        help='Enables the byte shuffle filter. This may improve the compression '
             'ratio for block oriented compressors like GZIP or LZF.')
     parser.add_argument(
         '--fletcher32',
@@ -135,22 +295,10 @@ def main(argv):
 
     options = parser.parse_args(argv[1:])
 
-    # some shells (windows) don't interpret wildcard characters (*, ?, [])
-    old_input_list = list(options.input_files)
-    options.input_files = []
-    for fname in old_input_list:
-        globbed_files = glob(fname)
-        if not globbed_files:
-            # no files found, keep the name as it is, to raise an error later
-            options.input_files += [fname]
-        else:
-            options.input_files += globbed_files
-    old_input_list = None
-
     if options.debug:
         logging.root.setLevel(logging.DEBUG)
 
-    # Import most of the things here to be sure to use the right logging level
+    # Import after parsing --debug
     try:
         # it should be loaded before h5py
         import hdf5plugin  # noqa
@@ -177,22 +325,78 @@ def main(argv):
             + " compressions. You can install it using \"pip install hdf5plugin\"."
         _logger.debug(message)
 
+    # Process input arguments (mutually exclusive arguments)
+    if bool(options.input_files) == bool(options.file_pattern is not None):
+        if not options.input_files:
+            message = "You must specify either input files (at least one), "
+            message += "or a file pattern."
+        else:
+            message = "You cannot specify input files and a file pattern"
+            message += " at the same time."
+        _logger.error(message)
+        return -1
+    elif options.input_files:
+        # some shells (windows) don't interpret wildcard characters (*, ?, [])
+        old_input_list = list(options.input_files)
+        options.input_files = []
+        for fname in old_input_list:
+            globbed_files = glob(fname)
+            if not globbed_files:
+                # no files found, keep the name as it is, to raise an error later
+                options.input_files += [fname]
+            else:
+                # glob does not sort files, but the bash shell does
+                options.input_files += sorted(globbed_files)
+    else:
+        # File series
+        dirname = os.path.dirname(options.file_pattern)
+        file_pattern_re = c_format_string_to_re(options.file_pattern) + "$"
+        files_in_dir = glob(os.path.join(dirname, "*"))
+        _logger.debug("""
+            Processing file_pattern
+            dirname: %s
+            file_pattern_re: %s
+            files_in_dir: %s
+            """, dirname, file_pattern_re, files_in_dir)
+
+        options.input_files = sorted(list(filter(lambda name: re.match(file_pattern_re, name),
+                                                 files_in_dir)))
+        _logger.debug("options.input_files: %s", options.input_files)
+
+        if options.begin is not None:
+            options.input_files = drop_indices_before_begin(options.input_files,
+                                                            file_pattern_re,
+                                                            options.begin)
+            _logger.debug("options.input_files after applying --begin: %s",
+                          options.input_files)
+
+        if options.end is not None:
+            options.input_files = drop_indices_after_end(options.input_files,
+                                                         file_pattern_re,
+                                                         options.end)
+            _logger.debug("options.input_files after applying --end: %s",
+                          options.input_files)
+
+        if are_files_missing_in_series(options.input_files,
+                                       file_pattern_re):
+            _logger.error("File missing in the file series. Aborting.")
+            return -1
+
+        if not options.input_files:
+            _logger.error("No file matching --file-pattern found.")
+            return -1
+
     # Test that the output path is writeable
-    if options.output_uri is None:
-        input_basenames = [os.path.basename(name) for name in options.input_files]
-        output_name = ''.join(input_basenames) + ".h5"
-        _logger.info("No output file specified, using %s", output_name)
-        hdf5_path = "/"
+    if "::" in options.output_uri:
+        output_name, hdf5_path = options.output_uri.split("::")
     else:
-        if "::" in options.output_uri:
-            output_name, hdf5_path = options.output_uri.split("::")
-        else:
-            output_name, hdf5_path = options.output_uri, "/"
+        output_name, hdf5_path = options.output_uri, "/"
 
     if os.path.isfile(output_name):
         if options.mode == "w-":
-            _logger.error("Output file %s exists and mode is 'w-'"
-                          " (write, file must not exist). Aborting.",
+            _logger.error("Output file %s exists and mode is 'w-' (default)."
+                          " Aborting. To append data to an existing file, "
+                          "use 'a' or 'r+'.",
                           output_name)
             return -1
         elif not os.access(output_name, os.W_OK):
@@ -262,22 +466,80 @@ def main(argv):
     if options.fletcher32:
         create_dataset_args["fletcher32"] = True
 
-    with h5py.File(output_name, mode=options.mode) as h5f:
-        for input_name in options.input_files:
-            hdf5_path_for_file = hdf5_path
-            if not options.no_root_group:
-                hdf5_path_for_file = hdf5_path.rstrip("/") + "/" + os.path.basename(input_name)
-            write_to_h5(input_name, h5f,
-                        h5path=hdf5_path_for_file,
+    if (len(options.input_files) > 1 and
+            not contains_specfile(options.input_files) and
+            not options.add_root_group) or options.file_pattern is not None:
+        # File series -> stack of images
+        if fabioh5 is None:
+            # return a helpful error message if fabio is missing
+            try:
+                import fabio
+            except ImportError:
+                _logger.error("The fabio library is required to convert"
+                              " edf files. Please install it with "
+                              "`pip install fabio` and try again.")
+            else:
+                # unexpected problem in silx.io.fabioh5
+                raise
+            return -1
+        input_group = fabioh5.File(file_series=options.input_files)
+        if hdf5_path != "/":
+            # we want to append only data and headers to an existing file
+            input_group = input_group["/scan_0/instrument/detector_0"]
+        with h5py.File(output_name, mode=options.mode) as h5f:
+            write_to_h5(input_group, h5f,
+                        h5path=hdf5_path,
                         overwrite_data=options.overwrite_data,
                         create_dataset_args=create_dataset_args,
                         min_size=options.min_size)
 
-            # append the convert command to the creator attribute, for NeXus files
-            creator = h5f[hdf5_path_for_file].attrs.get("creator", b"").decode()
-            convert_command = " ".join(argv)
-            if convert_command not in creator:
-                h5f[hdf5_path_for_file].attrs["creator"] = \
-                    numpy.string_(creator + "; convert command: %s" % " ".join(argv))
+    elif len(options.input_files) == 1 or \
+            are_all_specfile(options.input_files) or \
+            options.add_root_group:
+        # single file, or SPEC files
+        h5paths_and_groups = []
+        for input_name in options.input_files:
+            hdf5_path_for_file = hdf5_path
+            if options.add_root_group:
+                hdf5_path_for_file = hdf5_path.rstrip("/") + "/" + os.path.basename(input_name)
+            try:
+                h5paths_and_groups.append((hdf5_path_for_file,
+                                           silx.io.open(input_name)))
+            except IOError:
+                _logger.error("Cannot read file %s. If this is a file format "
+                              "supported by the fabio library, you can try to"
+                              " install fabio (`pip install fabio`)."
+                              " Aborting conversion.",
+                              input_name)
+                return -1
+
+        with h5py.File(output_name, mode=options.mode) as h5f:
+            for hdf5_path_for_file, input_group in h5paths_and_groups:
+                write_to_h5(input_group, h5f,
+                            h5path=hdf5_path_for_file,
+                            overwrite_data=options.overwrite_data,
+                            create_dataset_args=create_dataset_args,
+                            min_size=options.min_size)
+
+    else:
+        # multiple files, SPEC and fabio images mixed
+        _logger.error("Multiple files with incompatible formats specified. "
+                      "You can provide multiple SPEC files or multiple image "
+                      "files, but not both.")
+        return -1
+
+    with h5py.File(output_name, mode="r+") as h5f:
+        # append "silx convert" to the creator attribute, for NeXus files
+        previous_creator = h5f.attrs.get("creator", u"")
+        creator = "silx convert (v%s)" % silx.version
+        # only if it is not already there
+        if creator not in previous_creator:
+            if not previous_creator:
+                new_creator = creator
+            else:
+                new_creator = previous_creator + "; " + creator
+            h5f.attrs["creator"] = numpy.array(
+                new_creator,
+                dtype=h5py.special_dtype(vlen=six.text_type))
 
     return 0
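The c_format_string_to_re() helper added above is easiest to see on a concrete input. A minimal sketch, assuming silx 0.7.0 is installed so the helper can be imported from silx.app.convert; the file names are made up:

    import re

    from silx.app.convert import c_format_string_to_re

    # "%04d" becomes "([\d+-]\d{3})" (a 4-character field whose first
    # character may be a digit or a sign) and the dot in ".edf" is escaped.
    pattern_re = c_format_string_to_re("toto_%04d.edf") + "$"

    # The capturing group recovers the frame index from a matching name.
    m = re.match(pattern_re, "toto_0042.edf")
    print(int(m.group(1)))  # -> 42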
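Likewise, the --begin/--end filtering and the gap detection operate on plain lists of names, so they can be exercised without touching the file system. Another sketch under the same assumption; the scan_*.edf names are invented:

    from silx.app.convert import (c_format_string_to_re,
                                  drop_indices_before_begin,
                                  drop_indices_after_end,
                                  are_files_missing_in_series)

    names = ["scan_0098.edf", "scan_0099.edf", "scan_0100.edf",
             "scan_0101.edf", "scan_0103.edf"]
    regex = c_format_string_to_re("scan_%04d.edf") + "$"

    # Keep only indices in the range 99..103, as "--begin 99 --end 103" would.
    names = drop_indices_before_begin(names, regex, "99")
    names = drop_indices_after_end(names, regex, "103")

    # scan_0102.edf is absent, so the series has a gap and main() would
    # abort with "File missing in the file series."
    print(are_files_missing_in_series(names, regex))  # -> True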
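Because main() receives argv explicitly, the converter can also be driven from Python rather than from the silx convert command line. A sketch; the file pattern and the output URI are placeholders:

    import sys

    from silx.app import convert

    # argv[0] is a program name; parse_args() only sees argv[1:].
    status = convert.main(["silx-convert",
                           "--file-pattern", "toto_%04d.edf",
                           "--begin", "100",
                           "--end", "199",
                           "-o", "series.h5::/entry"])
    sys.exit(status)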
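Finally, note the change in how the creator attribute is written: a variable-length unicode string (h5py's special vlen dtype) replaces the fixed-length byte string previously produced by numpy.string_. In isolation the new idiom looks like this; "demo.h5" is a placeholder file name:

    import h5py
    import numpy
    from silx.third_party import six  # bundled copy of six

    with h5py.File("demo.h5", mode="w") as h5f:
        # Variable-length unicode scalar attribute, as written by convert.py.
        h5f.attrs["creator"] = numpy.array(
            u"silx convert (v0.7.0)",
            dtype=h5py.special_dtype(vlen=six.text_type))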