Diffstat (limited to 'silx/app/convert.py')
-rw-r--r--  silx/app/convert.py  368
1 file changed, 315 insertions, 53 deletions
diff --git a/silx/app/convert.py b/silx/app/convert.py
index a092ec1..cd48deb 100644
--- a/silx/app/convert.py
+++ b/silx/app/convert.py
@@ -1,6 +1,6 @@
# coding: utf-8
# /*##########################################################################
-# Copyright (C) 2017 European Synchrotron Radiation Facility
+# Copyright (C) 2017-2018 European Synchrotron Radiation Facility
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -24,13 +24,22 @@
"""Convert silx supported data files into HDF5 files"""
import ast
-import sys
import os
import argparse
from glob import glob
import logging
import numpy
-import silx
+import re
+import time
+
+import silx.io
+from silx.io.specfile import is_specfile
+from silx.third_party import six
+
+try:
+    from silx.io import fabioh5
+except ImportError:
+    fabioh5 = None
__authors__ = ["P. Knobel"]
@@ -42,6 +51,129 @@ _logger = logging.getLogger(__name__)
"""Module logger"""
+def c_format_string_to_re(pattern_string):
+ """
+
+ :param pattern_string: C style format string with integer patterns
+ (e.g. "%d", "%04d").
+ Not supported: fixed length padded with whitespaces (e.g "%4d", "%-4d")
+ :return: Equivalent regular expression (e.g. "\d+", "\d{4}")
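+
+    Doctest-style sketch (the file pattern is illustrative, not from the
+    original code):
+
+        >>> c_format_string_to_re("toto_%04d.edf")
+        'toto_([\\d+-]\\d{3})\\.edf'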
+ """
+ # escape dots and backslashes
+ pattern_string = pattern_string.replace("\\", "\\\\")
+ pattern_string = pattern_string.replace(".", "\.")
+
+ # %d
+ pattern_string = pattern_string.replace("%d", "([-+]?\d+)")
+
+ # %0nd
+ for sub_pattern in re.findall("%0\d+d", pattern_string):
+ n = int(re.search("%0(\d+)d", sub_pattern).group(1))
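+        # the first character of a zero-padded field may be a sign rather
+        # than a digit (e.g. "%04d" % -123 == "-123"), hence "[\d+-]" below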
+        if n == 1:
+            re_sub_pattern = "([+-]?\d)"
+        else:
+            re_sub_pattern = "([\d+-]\d{%d})" % (n - 1)
+        pattern_string = pattern_string.replace(sub_pattern, re_sub_pattern, 1)
+
+    return pattern_string
+
+
+def drop_indices_before_begin(filenames, regex, begin):
+ """
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :param str begin: Comma separated list of begin indices
+ :return: List of filenames with only indices >= begin
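+
+    Doctest-style sketch (hypothetical file series):
+
+        >>> drop_indices_before_begin(
+        ...     ["img_0001.edf", "img_0002.edf", "img_0003.edf"],
+        ...     "img_([\d+-]\d{3})\.edf$", "2")
+        ['img_0002.edf', 'img_0003.edf']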
+ """
+ begin_indices = list(map(int, begin.split(",")))
+ output_filenames = []
+ for fname in filenames:
+ m = re.match(regex, fname)
+ file_indices = list(map(int, m.groups()))
+ if len(file_indices) != len(begin_indices):
+ raise IOError("Number of indices found in filename "
+ "does not match number of parsed end indices.")
+ good_indices = True
+ for i, fidx in enumerate(file_indices):
+ if fidx < begin_indices[i]:
+ good_indices = False
+ if good_indices:
+ output_filenames.append(fname)
+ return output_filenames
+
+
+def drop_indices_after_end(filenames, regex, end):
+ """
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :param str end: Comma separated list of end indices
+ :return: List of filenames with only indices <= end
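+
+    Doctest-style sketch (hypothetical file series):
+
+        >>> drop_indices_after_end(
+        ...     ["img_0001.edf", "img_0002.edf", "img_0003.edf"],
+        ...     "img_([\d+-]\d{3})\.edf$", "2")
+        ['img_0001.edf', 'img_0002.edf']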
+ """
+ end_indices = list(map(int, end.split(",")))
+ output_filenames = []
+ for fname in filenames:
+ m = re.match(regex, fname)
+ file_indices = list(map(int, m.groups()))
+ if len(file_indices) != len(end_indices):
+ raise IOError("Number of indices found in filename "
+ "does not match number of parsed end indices.")
+ good_indices = True
+ for i, fidx in enumerate(file_indices):
+ if fidx > end_indices[i]:
+ good_indices = False
+ if good_indices:
+ output_filenames.append(fname)
+ return output_filenames
+
+
+def are_files_missing_in_series(filenames, regex):
+ """Return True if any file is missing in a list of filenames
+ that are supposed to follow a pattern.
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :return: boolean
+ :raises AssertionError: if a filename does not match the regexp
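+
+    Doctest-style sketch (hypothetical series with index 2 missing;
+    the detailed error goes to the logger, not to stdout):
+
+        >>> are_files_missing_in_series(
+        ...     ["img_0001.edf", "img_0003.edf"],
+        ...     "img_([\d+-]\d{3})\.edf$")
+        True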
+ """
+ previous_indices = None
+ for fname in filenames:
+ m = re.match(regex, fname)
+ assert m is not None, \
+ "regex %s does not match filename %s" % (fname, regex)
+ new_indices = list(map(int, m.groups()))
+ if previous_indices is not None:
+ for old_idx, new_idx in zip(previous_indices, new_indices):
+ if (new_idx - old_idx) > 1:
+ _logger.error("Index increment > 1 in file series: "
+ "previous idx %d, next idx %d",
+ old_idx, new_idx)
+ return True
+ previous_indices = new_indices
+ return False
+
+
+def are_all_specfile(filenames):
+ """Return True if all files in a list are SPEC files.
+ :param List[str] filenames: list of filenames
+ """
+ for fname in filenames:
+ if not is_specfile(fname):
+ return False
+ return True
+
+
+def contains_specfile(filenames):
+ """Return True if any file in a list are SPEC files.
+ :param List[str] filenames: list of filenames
+ """
+ for fname in filenames:
+ if is_specfile(fname):
+ return True
+ return False
+
+
def main(argv):
"""
Main function to launch the converter as an application
@@ -52,15 +184,29 @@ def main(argv):
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'input_files',
-        nargs="+",
-        help='Input files (EDF, SPEC)')
+        nargs="*",
+        help='Input files (EDF, TIFF, SPEC...). When specifying multiple '
+             'files, you cannot specify both fabio images and SPEC files. '
+             'Multiple SPEC files will simply be concatenated, with one '
+             'entry per scan. Multiple image files will be merged into '
+             'a single entry with a stack of images.')
+    # input_files and --file-pattern are mutually exclusive
+    parser.add_argument(
+        '--file-pattern',
+        help='File name pattern for loading a series of indexed image files '
+             '(toto_%%04d.edf). This argument is incompatible with argument '
+             'input_files. If an output URI with an HDF5 path is provided, '
+             'only the content of the NXdetector group will be copied there. '
+             'If no HDF5 path, or just "/", is given, a complete NXdata '
+             'structure will be created.')
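+    # Illustrative invocations (file names are hypothetical):
+    #     silx convert file1.spec file2.spec -o merged.h5
+    #     silx convert --file-pattern "toto_%04d.edf" -o toto.h5::/entry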
    parser.add_argument(
        '-o', '--output-uri',
-        nargs="?",
-        help='Output file (HDF5). If omitted, it will be the '
-             'concatenated input file names, with a ".h5" suffix added.'
-             ' An URI can be provided to write the data into a specific '
-             'group in the output file: /path/to/file::/path/to/group')
+        default=time.strftime("%Y%m%d-%H%M%S") + '.h5',
+        help='Output file name (HDF5). A URI can be provided to write'
+             ' the data into a specific group in the output file: '
+             '/path/to/file::/path/to/group. '
+             'If not provided, the filename defaults to a timestamp:'
+             ' YYYYmmdd-HHMMSS.h5')
    parser.add_argument(
        '-m', '--mode',
        default="w-",
@@ -69,12 +215,26 @@ def main(argv):
'"w-" (write, fail if file exists) or '
'"a" (read/write if exists, create otherwise)')
parser.add_argument(
- '--no-root-group',
+ '--begin',
+ help='First file index, or first file indices to be considered. '
+ 'This argument only makes sense when used together with '
+ '--file-pattern. Provide as many start indices as there '
+ 'are indices in the file pattern, separated by commas. '
+ 'Examples: "--filepattern toto_%%d.edf --begin 100", '
+ ' "--filepattern toto_%%d_%%04d_%%02d.edf --begin 100,2000,5".')
+ parser.add_argument(
+ '--end',
+ help='Last file index, or last file indices to be considered. '
+ 'The same rules as with argument --begin apply. '
+ 'Example: "--filepattern toto_%%d_%%d.edf --end 199,1999"')
+ parser.add_argument(
+ '--add-root-group',
action="store_true",
- help='This option disables the default behavior of creating a '
- 'root group (entry) for each file to be converted. When '
- 'merging multiple input files, this can cause conflicts '
- 'when datasets have the same name (see --overwrite-data).')
+ help='This option causes each input file to be written to a '
+ 'specific root group with the same name as the file. When '
+ 'merging multiple input files, this can help preventing conflicts'
+ ' when datasets have the same name (see --overwrite-data). '
+ 'This option is ignored when using --file-pattern.')
    parser.add_argument(
        '--overwrite-data',
        action="store_true",
@@ -121,7 +281,7 @@ def main(argv):
    parser.add_argument(
        '--shuffle',
        action="store_true",
-        help='Enables the byte shuffle filter, may improve the compression '
+        help='Enables the byte shuffle filter. This may improve the compression '
             'ratio for block oriented compressors like GZIP or LZF.')
    parser.add_argument(
        '--fletcher32',
@@ -135,22 +295,10 @@ def main(argv):
    options = parser.parse_args(argv[1:])
-    # some shells (windows) don't interpret wildcard characters (*, ?, [])
-    old_input_list = list(options.input_files)
-    options.input_files = []
-    for fname in old_input_list:
-        globbed_files = glob(fname)
-        if not globbed_files:
-            # no files found, keep the name as it is, to raise an error later
-            options.input_files += [fname]
-        else:
-            options.input_files += globbed_files
-    old_input_list = None
-
    if options.debug:
        logging.root.setLevel(logging.DEBUG)
-    # Import most of the things here to be sure to use the right logging level
+    # import these modules here (after handling --debug) so that the
+    # logging level is already set when they are first used
    try:
        # it should be loaded before h5py
        import hdf5plugin  # noqa
@@ -177,22 +325,78 @@ def main(argv):
+ " compressions. You can install it using \"pip install hdf5plugin\"."
_logger.debug(message)
+    # Process input arguments (mutually exclusive arguments)
+    if bool(options.input_files) == (options.file_pattern is not None):
+        if not options.input_files:
+            message = "You must specify either input files (at least one), "
+            message += "or a file pattern."
+        else:
+            message = "You cannot specify input files and a file pattern"
+            message += " at the same time."
+        _logger.error(message)
+        return -1
+    elif options.input_files:
+        # some shells (Windows) don't interpret wildcard characters (*, ?, [])
+        old_input_list = list(options.input_files)
+        options.input_files = []
+        for fname in old_input_list:
+            globbed_files = glob(fname)
+            if not globbed_files:
+                # no files found, keep the name as it is, to raise an error later
+                options.input_files += [fname]
+            else:
+                # glob does not sort files, but the bash shell does
+                options.input_files += sorted(globbed_files)
+    else:
+        # File series
+        dirname = os.path.dirname(options.file_pattern)
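+        # appending "$" anchors the regex at the end of the file name, so
+        # e.g. "toto_0001.edf.bak" is not matched by pattern "toto_%04d.edf"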
+        file_pattern_re = c_format_string_to_re(options.file_pattern) + "$"
+        files_in_dir = glob(os.path.join(dirname, "*"))
+        _logger.debug("""
+            Processing file_pattern
+            dirname: %s
+            file_pattern_re: %s
+            files_in_dir: %s
+            """, dirname, file_pattern_re, files_in_dir)
+
+        options.input_files = sorted(filter(lambda name: re.match(file_pattern_re, name),
+                                            files_in_dir))
+        _logger.debug("options.input_files: %s", options.input_files)
+
+        if options.begin is not None:
+            options.input_files = drop_indices_before_begin(options.input_files,
+                                                            file_pattern_re,
+                                                            options.begin)
+            _logger.debug("options.input_files after applying --begin: %s",
+                          options.input_files)
+
+        if options.end is not None:
+            options.input_files = drop_indices_after_end(options.input_files,
+                                                         file_pattern_re,
+                                                         options.end)
+            _logger.debug("options.input_files after applying --end: %s",
+                          options.input_files)
+
+        if are_files_missing_in_series(options.input_files,
+                                       file_pattern_re):
+            _logger.error("File missing in the file series. Aborting.")
+            return -1
+
+        if not options.input_files:
+            _logger.error("No file matching --file-pattern found.")
+            return -1
+
    # Test that the output path is writeable
-    if options.output_uri is None:
-        input_basenames = [os.path.basename(name) for name in options.input_files]
-        output_name = ''.join(input_basenames) + ".h5"
-        _logger.info("No output file specified, using %s", output_name)
-        hdf5_path = "/"
+    if "::" in options.output_uri:
+        output_name, hdf5_path = options.output_uri.split("::")
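+        # e.g. "out.h5::/entry/data" -> ("out.h5", "/entry/data")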
    else:
-        if "::" in options.output_uri:
-            output_name, hdf5_path = options.output_uri.split("::")
-        else:
-            output_name, hdf5_path = options.output_uri, "/"
+        output_name, hdf5_path = options.output_uri, "/"
    if os.path.isfile(output_name):
        if options.mode == "w-":
-            _logger.error("Output file %s exists and mode is 'w-'"
-                          " (write, file must not exist). Aborting.",
+            _logger.error("Output file %s exists and mode is 'w-' (default)."
+                          " Aborting. To append data to an existing file, "
+                          "use 'a' or 'r+'.",
                          output_name)
            return -1
    elif not os.access(output_name, os.W_OK):
@@ -262,22 +466,80 @@ def main(argv):
    if options.fletcher32:
        create_dataset_args["fletcher32"] = True
-    with h5py.File(output_name, mode=options.mode) as h5f:
-        for input_name in options.input_files:
-            hdf5_path_for_file = hdf5_path
-            if not options.no_root_group:
-                hdf5_path_for_file = hdf5_path.rstrip("/") + "/" + os.path.basename(input_name)
-            write_to_h5(input_name, h5f,
-                        h5path=hdf5_path_for_file,
+    if (len(options.input_files) > 1 and
+            not contains_specfile(options.input_files) and
+            not options.add_root_group) or options.file_pattern is not None:
+        # File series -> stack of images
+        if fabioh5 is None:
+            # return a helpful error message if fabio is missing
+            try:
+                import fabio
+            except ImportError:
+                _logger.error("The fabio library is required to convert"
+                              " EDF files. Please install it with "
+                              "'pip install fabio' and try again.")
+            else:
+                # fabio is available, so the module-level import of
+                # silx.io.fabioh5 must have failed for another reason
+                raise ImportError("Unexpected failure to import silx.io.fabioh5")
+            return -1
+        input_group = fabioh5.File(file_series=options.input_files)
+        if hdf5_path != "/":
+            # we want to append only data and headers to an existing file
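+            # "/scan_0/instrument/detector_0" is the NXdetector group
+            # that fabioh5 exposes for the image series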
+            input_group = input_group["/scan_0/instrument/detector_0"]
+        with h5py.File(output_name, mode=options.mode) as h5f:
+            write_to_h5(input_group, h5f,
+                        h5path=hdf5_path,
                        overwrite_data=options.overwrite_data,
                        create_dataset_args=create_dataset_args,
                        min_size=options.min_size)
-        # append the convert command to the creator attribute, for NeXus files
-        creator = h5f[hdf5_path_for_file].attrs.get("creator", b"").decode()
-        convert_command = " ".join(argv)
-        if convert_command not in creator:
-            h5f[hdf5_path_for_file].attrs["creator"] = \
-                numpy.string_(creator + "; convert command: %s" % " ".join(argv))
+    elif len(options.input_files) == 1 or \
+            are_all_specfile(options.input_files) or \
+            options.add_root_group:
+        # single file, or SPEC files
+        h5paths_and_groups = []
+        for input_name in options.input_files:
+            hdf5_path_for_file = hdf5_path
+            if options.add_root_group:
+                hdf5_path_for_file = hdf5_path.rstrip("/") + "/" + os.path.basename(input_name)
+            try:
+                h5paths_and_groups.append((hdf5_path_for_file,
+                                           silx.io.open(input_name)))
+            except IOError:
+                _logger.error("Cannot read file %s. If this is a file format "
+                              "supported by the fabio library, you can try to"
+                              " install fabio (`pip install fabio`)."
+                              " Aborting conversion.",
+                              input_name)
+                return -1
+
+        with h5py.File(output_name, mode=options.mode) as h5f:
+            for hdf5_path_for_file, input_group in h5paths_and_groups:
+                write_to_h5(input_group, h5f,
+                            h5path=hdf5_path_for_file,
+                            overwrite_data=options.overwrite_data,
+                            create_dataset_args=create_dataset_args,
+                            min_size=options.min_size)
+
+    else:
+        # multiple files, SPEC and fabio images mixed
+        _logger.error("Multiple files with incompatible formats specified. "
+                      "You can provide multiple SPEC files or multiple image "
+                      "files, but not both.")
+        return -1
+
+ with h5py.File(output_name, mode="r+") as h5f:
+ # append "silx convert" to the creator attribute, for NeXus files
+ previous_creator = h5f.attrs.get("creator", u"")
+ creator = "silx convert (v%s)" % silx.version
+ # only if it not already there
+ if creator not in previous_creator:
+ if not previous_creator:
+ new_creator = creator
+ else:
+ new_creator = previous_creator + "; " + creator
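+            # e.g. "fabioh5" becomes "fabioh5; silx convert (v0.8.0)"
+            # (previous value and version number illustrative)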
+            h5f.attrs["creator"] = numpy.array(
+                new_creator,
+                dtype=h5py.special_dtype(vlen=six.text_type))
    return 0