path: root/src/silx/app/convert.py
Diffstat (limited to 'src/silx/app/convert.py')
-rw-r--r--  src/silx/app/convert.py  601
1 files changed, 601 insertions, 0 deletions
diff --git a/src/silx/app/convert.py b/src/silx/app/convert.py
new file mode 100644
index 0000000..e20a448
--- /dev/null
+++ b/src/silx/app/convert.py
@@ -0,0 +1,601 @@
+# /*##########################################################################
+# Copyright (C) 2017-2021 European Synchrotron Radiation Facility
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ############################################################################*/
+"""Convert silx supported data files into HDF5 files"""
+
+__authors__ = ["P. Knobel"]
+__license__ = "MIT"
+__date__ = "05/02/2019"
+
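+# Typical command-line invocations (illustrative sketches; the file names,
+# pattern and output URI below are hypothetical):
+#
+#     silx convert scan_0001.edf scan_0002.edf -o merged.h5
+#     silx convert --file-pattern "scan_%04d.edf" --begin 100 --end 199 \
+#         -o stack.h5::/entry --compression
+#
+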
+import ast
+import os
+import argparse
+from glob import glob
+import logging
+import re
+import time
+import numpy
+
+import silx.io
+from silx.io.specfile import is_specfile
+from silx.io.fioh5 import is_fiofile
+from silx.io import fabioh5
+
+_logger = logging.getLogger(__name__)
+"""Module logger"""
+
+
+def c_format_string_to_re(pattern_string):
+ """
+
+ :param pattern_string: C style format string with integer patterns
+ (e.g. "%d", "%04d").
+ Not supported: fixed length padded with whitespaces (e.g "%4d", "%-4d")
+ :return: Equivalent regular expression (e.g. "\\d+", "\\d{4}")
+ """
+ # escape dots and backslashes
+ pattern_string = pattern_string.replace("\\", "\\\\")
+ pattern_string = pattern_string.replace(".", r"\.")
+
+ # %d
+ pattern_string = pattern_string.replace("%d", r"([-+]?\d+)")
+
+ # %0nd
+ for sub_pattern in re.findall(r"%0\d+d", pattern_string):
+ n = int(re.search(r"%0(\d+)d", sub_pattern).group(1))
+ if n == 1:
+ re_sub_pattern = r"([+-]?\d)"
+ else:
+ re_sub_pattern = r"([\d+-]\d{%d})" % (n - 1)
+ pattern_string = pattern_string.replace(sub_pattern, re_sub_pattern, 1)
+
+ return pattern_string
+
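+# Illustrative mapping performed by c_format_string_to_re (a sketch derived
+# from the replacements above, not a doctest):
+#     "toto_%d.edf"   ->  r"toto_([-+]?\d+)\.edf"
+#     "toto_%04d.edf" ->  r"toto_([\d+-]\d{3})\.edf"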
+
+def drop_indices_before_begin(filenames, regex, begin):
+ """
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :param str begin: Comma separated list of begin indices
+ :return: List of filenames with only indices >= begin
+ """
+ begin_indices = list(map(int, begin.split(",")))
+ output_filenames = []
+ for fname in filenames:
+ m = re.match(regex, fname)
+ file_indices = list(map(int, m.groups()))
+ if len(file_indices) != len(begin_indices):
+ raise IOError(
+ "Number of indices found in filename "
+ "does not match number of parsed end indices."
+ )
+ good_indices = True
+ for i, fidx in enumerate(file_indices):
+ if fidx < begin_indices[i]:
+ good_indices = False
+ if good_indices:
+ output_filenames.append(fname)
+ return output_filenames
+
+
+def drop_indices_after_end(filenames, regex, end):
+ """
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :param str end: Comma separated list of end indices
+ :return: List of filenames with only indices <= end
+ """
+ end_indices = list(map(int, end.split(",")))
+ output_filenames = []
+ for fname in filenames:
+ m = re.match(regex, fname)
+ file_indices = list(map(int, m.groups()))
+ if len(file_indices) != len(end_indices):
+ raise IOError(
+ "Number of indices found in filename "
+ "does not match number of parsed end indices."
+ )
+ good_indices = True
+ for i, fidx in enumerate(file_indices):
+ if fidx > end_indices[i]:
+ good_indices = False
+ if good_indices:
+ output_filenames.append(fname)
+ return output_filenames
+
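+# Sketch of the begin/end filtering (hypothetical names and regex):
+#     drop_indices_before_begin(["a_0099.edf", "a_0100.edf"], r"a_(\d{4})\.edf$", "100")
+#     keeps only "a_0100.edf"; drop_indices_after_end applies the symmetric
+#     upper-bound filter.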
+
+def are_files_missing_in_series(filenames, regex):
+ """Return True if any file is missing in a list of filenames
+ that are supposed to follow a pattern.
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :return: boolean
+ :raises AssertionError: if a filename does not match the regexp
+ """
+ previous_indices = None
+ for fname in filenames:
+ m = re.match(regex, fname)
+        assert m is not None, "regex %s does not match filename %s" % (regex, fname)
+ new_indices = list(map(int, m.groups()))
+ if previous_indices is not None:
+ for old_idx, new_idx in zip(previous_indices, new_indices):
+ if (new_idx - old_idx) > 1:
+ _logger.error(
+ "Index increment > 1 in file series: "
+ "previous idx %d, next idx %d",
+ old_idx,
+ new_idx,
+ )
+ return True
+ previous_indices = new_indices
+ return False
+
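+# Example (hypothetical series): with regex r"img_(\d+)\.edf$", the list
+# ["img_1.edf", "img_2.edf", "img_4.edf"] is reported as incomplete because
+# the index jumps from 2 to 4.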
+
+def are_all_specfile(filenames):
+ """Return True if all files in a list are SPEC files.
+ :param List[str] filenames: list of filenames
+ """
+ for fname in filenames:
+ if not is_specfile(fname):
+ return False
+ return True
+
+
+def contains_specfile(filenames):
+ """Return True if any file in a list are SPEC files.
+ :param List[str] filenames: list of filenames
+ """
+ for fname in filenames:
+ if is_specfile(fname):
+ return True
+ return False
+
+
+def contains_fiofile(filenames):
+ """Return True if any file in a list are FIO files.
+ :param List[str] filenames: list of filenames
+ """
+ for fname in filenames:
+ if is_fiofile(fname):
+ return True
+ return False
+
+
+def are_all_fiofile(filenames):
+ """Return True if all files in a list are FIO files.
+ :param List[str] filenames: list of filenames
+ """
+ for fname in filenames:
+ if not is_fiofile(fname):
+ return False
+ return True
+
+
+def main(argv):
+ """
+ Main function to launch the converter as an application
+
+ :param argv: Command line arguments
+ :returns: exit status
+ """
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "input_files",
+ nargs="*",
+ help="Input files (EDF, TIFF, FIO, SPEC...). When specifying "
+ "multiple files, you cannot specify both fabio images "
+ "and SPEC (or FIO) files. Multiple SPEC or FIO files will "
+ "simply be concatenated, with one entry per scan. "
+ "Multiple image files will be merged into a single "
+ "entry with a stack of images.",
+ )
+    # input_files and --file-pattern are mutually exclusive
+ parser.add_argument(
+ "--file-pattern",
+ help="File name pattern for loading a series of indexed image files "
+ "(toto_%%04d.edf). This argument is incompatible with argument "
+ "input_files. If an output URI with a HDF5 path is provided, "
+ "only the content of the NXdetector group will be copied there. "
+ 'If no HDF5 path, or just "/", is given, a complete NXdata '
+ "structure will be created.",
+ )
+ parser.add_argument(
+ "-o",
+ "--output-uri",
+ default=time.strftime("%Y%m%d-%H%M%S") + ".h5",
+ help="Output file name (HDF5). An URI can be provided to write"
+ " the data into a specific group in the output file: "
+ "/path/to/file::/path/to/group. "
+ "If not provided, the filename defaults to a timestamp:"
+ " YYYYmmdd-HHMMSS.h5",
+ )
+ parser.add_argument(
+ "-m",
+ "--mode",
+ default="w-",
+ help='Write mode: "r+" (read/write, file must exist), '
+ '"w" (write, existing file is lost), '
+ '"w-" (write, fail if file exists) or '
+ '"a" (read/write if exists, create otherwise)',
+ )
+ parser.add_argument(
+ "--begin",
+ help="First file index, or first file indices to be considered. "
+ "This argument only makes sense when used together with "
+ "--file-pattern. Provide as many start indices as there "
+ "are indices in the file pattern, separated by commas. "
+ 'Examples: "--filepattern toto_%%d.edf --begin 100", '
+ ' "--filepattern toto_%%d_%%04d_%%02d.edf --begin 100,2000,5".',
+ )
+ parser.add_argument(
+ "--end",
+ help="Last file index, or last file indices to be considered. "
+ "The same rules as with argument --begin apply. "
+ 'Example: "--filepattern toto_%%d_%%d.edf --end 199,1999"',
+ )
+ parser.add_argument(
+ "--add-root-group",
+ action="store_true",
+ help="This option causes each input file to be written to a "
+ "specific root group with the same name as the file. When "
+ "merging multiple input files, this can help preventing conflicts"
+ " when datasets have the same name (see --overwrite-data). "
+ "This option is ignored when using --file-pattern.",
+ )
+ parser.add_argument(
+ "--overwrite-data",
+ action="store_true",
+ help="If the output path exists and an input dataset has the same"
+ " name as an existing output dataset, overwrite the output "
+ 'dataset (in modes "r+" or "a").',
+ )
+ parser.add_argument(
+ "--min-size",
+ type=int,
+ default=500,
+ help="Minimum number of elements required to be in a dataset to "
+ "apply compression or chunking (default 500).",
+ )
+ parser.add_argument(
+ "--chunks",
+ nargs="?",
+ const="auto",
+ help="Chunk shape. Provide an argument that evaluates as a python "
+ 'tuple (e.g. "(1024, 768)"). If this option is provided without '
+ "specifying an argument, the h5py library will guess a chunk for "
+ "you. Note that if you specify an explicit chunking shape, it "
+ "will be applied identically to all datasets with a large enough "
+ "size (see --min-size). ",
+ )
+ parser.add_argument(
+ "--compression",
+ nargs="?",
+ const="gzip",
+ help="Compression filter. By default, the datasets in the output "
+ "file are not compressed. If this option is specified without "
+ "argument, the GZIP compression is used. Additional compression "
+ "filters may be available, depending on your HDF5 installation.",
+ )
+
+ def check_gzip_compression_opts(value):
+ ivalue = int(value)
+ if ivalue < 0 or ivalue > 9:
+ raise argparse.ArgumentTypeError(
+ "--compression-opts must be an int from 0 to 9"
+ )
+ return ivalue
+
+ parser.add_argument(
+ "--compression-opts",
+ type=check_gzip_compression_opts,
+ help='Compression options. For "gzip", this may be an integer from '
+ "0 to 9, with a default of 4. This is only supported for GZIP.",
+ )
+ parser.add_argument(
+ "--shuffle",
+ action="store_true",
+ help="Enables the byte shuffle filter. This may improve the compression "
+ "ratio for block oriented compressors like GZIP or LZF.",
+ )
+ parser.add_argument(
+ "--fletcher32",
+ action="store_true",
+ help="Adds a checksum to each chunk to detect data corruption.",
+ )
+ parser.add_argument(
+ "--debug",
+ action="store_true",
+ default=False,
+ help="Set logging system in debug mode",
+ )
+
+ options = parser.parse_args(argv[1:])
+
+ if options.debug:
+ logging.root.setLevel(logging.DEBUG)
+
+ # Import after parsing --debug
+ try:
+ # it should be loaded before h5py
+ import hdf5plugin # noqa
+ except ImportError:
+ _logger.debug("Backtrace", exc_info=True)
+ hdf5plugin = None
+
+ import h5py
+
+ try:
+ from silx.io.convert import write_to_h5
+ except ImportError:
+ _logger.debug("Backtrace", exc_info=True)
+ write_to_h5 = None
+
+ if hdf5plugin is None:
+ message = (
+ "Module 'hdf5plugin' is not installed. It supports additional hdf5"
+ + ' compressions. You can install it using "pip install hdf5plugin".'
+ )
+ _logger.debug(message)
+
+ # Process input arguments (mutually exclusive arguments)
+    if bool(options.input_files) == (options.file_pattern is not None):
+ if not options.input_files:
+ message = "You must specify either input files (at least one), "
+ message += "or a file pattern."
+ else:
+ message = "You cannot specify input files and a file pattern"
+ message += " at the same time."
+ _logger.error(message)
+ return -1
+ elif options.input_files:
+        # some shells (Windows) don't expand wildcard characters (*, ?, [])
+ old_input_list = list(options.input_files)
+ options.input_files = []
+ for fname in old_input_list:
+ globbed_files = glob(fname)
+ if not globbed_files:
+ # no files found, keep the name as it is, to raise an error later
+ options.input_files += [fname]
+ else:
+ # glob does not sort files, but the bash shell does
+ options.input_files += sorted(globbed_files)
+ else:
+ # File series
+ dirname = os.path.dirname(options.file_pattern)
+ file_pattern_re = c_format_string_to_re(options.file_pattern) + "$"
+ files_in_dir = glob(os.path.join(dirname, "*"))
+ _logger.debug(
+ """
+ Processing file_pattern
+ dirname: %s
+ file_pattern_re: %s
+ files_in_dir: %s
+ """,
+ dirname,
+ file_pattern_re,
+ files_in_dir,
+ )
+
+ options.input_files = sorted(
+ list(filter(lambda name: re.match(file_pattern_re, name), files_in_dir))
+ )
+ _logger.debug("options.input_files: %s", options.input_files)
+
+ if options.begin is not None:
+ options.input_files = drop_indices_before_begin(
+ options.input_files, file_pattern_re, options.begin
+ )
+ _logger.debug(
+ "options.input_files after applying --begin: %s", options.input_files
+ )
+
+ if options.end is not None:
+ options.input_files = drop_indices_after_end(
+ options.input_files, file_pattern_re, options.end
+ )
+ _logger.debug(
+ "options.input_files after applying --end: %s", options.input_files
+ )
+
+ if are_files_missing_in_series(options.input_files, file_pattern_re):
+ _logger.error("File missing in the file series. Aborting.")
+ return -1
+
+ if not options.input_files:
+ _logger.error("No file matching --file-pattern found.")
+ return -1
+
+ # Test that the output path is writeable
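+    # e.g. "archive.h5::/entry/data" splits into ("archive.h5", "/entry/data"),
+    # while a plain "archive.h5" maps to the root group "/" (names illustrative)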
+ if "::" in options.output_uri:
+ output_name, hdf5_path = options.output_uri.split("::")
+ else:
+ output_name, hdf5_path = options.output_uri, "/"
+
+ if os.path.isfile(output_name):
+ if options.mode == "w-":
+ _logger.error(
+ "Output file %s exists and mode is 'w-' (default)."
+ " Aborting. To append data to an existing file, "
+ "use 'a' or 'r+'.",
+ output_name,
+ )
+ return -1
+ elif not os.access(output_name, os.W_OK):
+ _logger.error("Output file %s exists and is not writeable.", output_name)
+ return -1
+ elif options.mode == "w":
+ _logger.info(
+ "Output file %s exists and mode is 'w'. " "Overwriting existing file.",
+ output_name,
+ )
+ elif options.mode in ["a", "r+"]:
+ _logger.info("Appending data to existing file %s.", output_name)
+ else:
+ if options.mode == "r+":
+ _logger.error(
+ "Output file %s does not exist and mode is 'r+'"
+ " (append, file must exist). Aborting.",
+ output_name,
+ )
+ return -1
+ else:
+ _logger.info("Creating new output file %s.", output_name)
+
+ # Test that all input files exist and are readable
+ bad_input = False
+ for fname in options.input_files:
+ if not os.access(fname, os.R_OK):
+ _logger.error("Cannot read input file %s.", fname)
+ bad_input = True
+ if bad_input:
+ _logger.error("Aborting.")
+ return -1
+
+ # create_dataset special args
+ create_dataset_args = {}
+ if options.chunks is not None:
+ if options.chunks.lower() in ["auto", "true"]:
+ create_dataset_args["chunks"] = True
+ else:
+ try:
+ chunks = ast.literal_eval(options.chunks)
+ except (ValueError, SyntaxError):
+ _logger.error("Invalid --chunks argument %s", options.chunks)
+ return -1
+ if not isinstance(chunks, (tuple, list)):
+ _logger.error("--chunks argument str does not evaluate to a tuple")
+ return -1
+ else:
+ nitems = numpy.prod(chunks)
+ nbytes = nitems * 8
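+                # e.g. chunks=(1, 2048, 2048) gives ~4.2e6 elements, i.e. about
+                # 33 MB assuming float64 data, well above the default 1 MB
+                # HDF5 chunk cache checked just below (figures are illustrative)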
+ if nbytes > 10**6:
+ _logger.warning(
+ "Requested chunk size might be larger than"
+ " the default 1MB chunk cache, for float64"
+ " data. This can dramatically affect I/O "
+ "performances."
+ )
+ create_dataset_args["chunks"] = chunks
+
+ if options.compression is not None:
+ try:
+ compression = int(options.compression)
+ except ValueError:
+ compression = options.compression
+ create_dataset_args["compression"] = compression
+
+ if options.compression_opts is not None:
+ create_dataset_args["compression_opts"] = options.compression_opts
+
+ if options.shuffle:
+ create_dataset_args["shuffle"] = True
+
+ if options.fletcher32:
+ create_dataset_args["fletcher32"] = True
+
+ if (
+ len(options.input_files) > 1
+ and not contains_specfile(options.input_files)
+ and not contains_fiofile(options.input_files)
+ and not options.add_root_group
+ ) or options.file_pattern is not None:
+ # File series -> stack of images
+ input_group = fabioh5.File(file_series=options.input_files)
+ if hdf5_path != "/":
+ # we want to append only data and headers to an existing file
+ input_group = input_group["/scan_0/instrument/detector_0"]
+ with h5py.File(output_name, mode=options.mode) as h5f:
+ write_to_h5(
+ input_group,
+ h5f,
+ h5path=hdf5_path,
+ overwrite_data=options.overwrite_data,
+ create_dataset_args=create_dataset_args,
+ min_size=options.min_size,
+ )
+
+ elif (
+ len(options.input_files) == 1
+ or are_all_specfile(options.input_files)
+ or are_all_fiofile(options.input_files)
+ or options.add_root_group
+ ):
+        # single file, or SPEC/FIO files, or one root group per input file
+ h5paths_and_groups = []
+ for input_name in options.input_files:
+ hdf5_path_for_file = hdf5_path
+ if options.add_root_group:
+ hdf5_path_for_file = (
+ hdf5_path.rstrip("/") + "/" + os.path.basename(input_name)
+ )
+ try:
+ h5paths_and_groups.append(
+ (hdf5_path_for_file, silx.io.open(input_name))
+ )
+ except IOError:
+ _logger.error(
+ "Cannot read file %s. If this is a file format "
+ "supported by the fabio library, you can try to"
+ " install fabio (`pip install fabio`)."
+ " Aborting conversion.",
+ input_name,
+ )
+ return -1
+
+ with h5py.File(output_name, mode=options.mode) as h5f:
+ for hdf5_path_for_file, input_group in h5paths_and_groups:
+ write_to_h5(
+ input_group,
+ h5f,
+ h5path=hdf5_path_for_file,
+ overwrite_data=options.overwrite_data,
+ create_dataset_args=create_dataset_args,
+ min_size=options.min_size,
+ )
+
+ else:
+        # multiple files with incompatible formats (e.g. SPEC/FIO and fabio images mixed)
+ _logger.error(
+ "Multiple files with incompatible formats specified. "
+ "You can provide multiple SPEC files or multiple image "
+ "files, but not both."
+ )
+ return -1
+
+ with h5py.File(output_name, mode="r+") as h5f:
+ # append "silx convert" to the creator attribute, for NeXus files
+ previous_creator = h5f.attrs.get("creator", "")
+ creator = "silx convert (v%s)" % silx.version
+        # only if it is not already there
+ if creator not in previous_creator:
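+            # e.g. an existing creator "pyFAI" would become
+            # "pyFAI; silx convert (v1.0.0)" (version number illustrative)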
+ if not previous_creator:
+ new_creator = creator
+ else:
+ new_creator = previous_creator + "; " + creator
+ h5f.attrs["creator"] = numpy.array(
+ new_creator, dtype=h5py.special_dtype(vlen=str)
+ )
+
+ return 0