Diffstat (limited to 'silx/app/convert.py')
-rw-r--r--  silx/app/convert.py  368
1 file changed, 315 insertions, 53 deletions
diff --git a/silx/app/convert.py b/silx/app/convert.py
index a092ec1..cd48deb 100644
--- a/silx/app/convert.py
+++ b/silx/app/convert.py
@@ -1,6 +1,6 @@
# coding: utf-8
# /*##########################################################################
-# Copyright (C) 2017 European Synchrotron Radiation Facility
+# Copyright (C) 2017-2018 European Synchrotron Radiation Facility
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -24,13 +24,22 @@
"""Convert silx supported data files into HDF5 files"""
import ast
-import sys
import os
import argparse
from glob import glob
import logging
import numpy
-import silx
+import re
+import time
+
+import silx.io
+from silx.io.specfile import is_specfile
+from silx.third_party import six
+
+try:
+    from silx.io import fabioh5
+except ImportError:
+    fabioh5 = None
__authors__ = ["P. Knobel"]
@@ -42,6 +51,129 @@ _logger = logging.getLogger(__name__)
"""Module logger"""
+def c_format_string_to_re(pattern_string):
+ """
+
+ :param pattern_string: C style format string with integer patterns
+ (e.g. "%d", "%04d").
+ Not supported: fixed length padded with whitespaces (e.g "%4d", "%-4d")
+ :return: Equivalent regular expression (e.g. "\d+", "\d{4}")
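+
+    Doctest-style sketch (the file pattern is illustrative, not from the
+    original code):
+
+        >>> c_format_string_to_re("toto_%04d.edf")
+        'toto_([\\d+-]\\d{3})\\.edf'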
+ """
+ # escape dots and backslashes
+ pattern_string = pattern_string.replace("\\", "\\\\")
+ pattern_string = pattern_string.replace(".", "\.")
+
+ # %d
+ pattern_string = pattern_string.replace("%d", "([-+]?\d+)")
+
+ # %0nd
+ for sub_pattern in re.findall("%0\d+d", pattern_string):
+ n = int(re.search("%0(\d+)d", sub_pattern).group(1))
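+        # the first character of a zero-padded field may be a sign rather
+        # than a digit (e.g. "%04d" % -123 == "-123"), hence "[\d+-]" below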
+        if n == 1:
+            re_sub_pattern = "([+-]?\d)"
+        else:
+            re_sub_pattern = "([\d+-]\d{%d})" % (n - 1)
+        pattern_string = pattern_string.replace(sub_pattern, re_sub_pattern, 1)
+
+    return pattern_string
+
+
+def drop_indices_before_begin(filenames, regex, begin):
+ """
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :param str begin: Comma separated list of begin indices
+ :return: List of filenames with only indices >= begin
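+
+    Doctest-style sketch (hypothetical file series):
+
+        >>> drop_indices_before_begin(
+        ...     ["img_0001.edf", "img_0002.edf", "img_0003.edf"],
+        ...     "img_([\d+-]\d{3})\.edf$", "2")
+        ['img_0002.edf', 'img_0003.edf']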
+ """
+ begin_indices = list(map(int, begin.split(",")))
+ output_filenames = []
+ for fname in filenames:
+ m = re.match(regex, fname)
+ file_indices = list(map(int, m.groups()))
+ if len(file_indices) != len(begin_indices):
+ raise IOError("Number of indices found in filename "
+ "does not match number of parsed end indices.")
+ good_indices = True
+ for i, fidx in enumerate(file_indices):
+ if fidx < begin_indices[i]:
+ good_indices = False
+ if good_indices:
+ output_filenames.append(fname)
+ return output_filenames
+
+
+def drop_indices_after_end(filenames, regex, end):
+ """
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :param str end: Comma separated list of end indices
+ :return: List of filenames with only indices <= end
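+
+    Doctest-style sketch (hypothetical file series):
+
+        >>> drop_indices_after_end(
+        ...     ["img_0001.edf", "img_0002.edf", "img_0003.edf"],
+        ...     "img_([\d+-]\d{3})\.edf$", "2")
+        ['img_0001.edf', 'img_0002.edf']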
+ """
+ end_indices = list(map(int, end.split(",")))
+ output_filenames = []
+ for fname in filenames:
+ m = re.match(regex, fname)
+ file_indices = list(map(int, m.groups()))
+ if len(file_indices) != len(end_indices):
+ raise IOError("Number of indices found in filename "
+ "does not match number of parsed end indices.")
+ good_indices = True
+ for i, fidx in enumerate(file_indices):
+ if fidx > end_indices[i]:
+ good_indices = False
+ if good_indices:
+ output_filenames.append(fname)
+ return output_filenames
+
+
+def are_files_missing_in_series(filenames, regex):
+ """Return True if any file is missing in a list of filenames
+ that are supposed to follow a pattern.
+
+ :param List[str] filenames: list of filenames
+ :param str regex: Regexp used to find indices in a filename
+ :return: boolean
+ :raises AssertionError: if a filename does not match the regexp
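+
+    Doctest-style sketch (hypothetical series with index 2 missing;
+    the detailed error goes to the logger, not to stdout):
+
+        >>> are_files_missing_in_series(
+        ...     ["img_0001.edf", "img_0003.edf"],
+        ...     "img_([\d+-]\d{3})\.edf$")
+        True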
+ """
+ previous_indices = None
+ for fname in filenames:
+ m = re.match(regex, fname)
+ assert m is not None, \
+ "regex %s does not match filename %s" % (fname, regex)
+ new_indices = list(map(int, m.groups()))
+ if previous_indices is not None:
+ for old_idx, new_idx in zip(previous_indices, new_indices):
+ if (new_idx - old_idx) > 1:
+ _logger.error("Index increment > 1 in file series: "
+ "previous idx %d, next idx %d",
+ old_idx, new_idx)
+ return True
+ previous_indices = new_indices
+ return False
+
+
+def are_all_specfile(filenames):
+ """Return True if all files in a list are SPEC files.
+ :param List[str] filenames: list of filenames
+ """
+ for fname in filenames:
+ if not is_specfile(fname):
+ return False
+ return True
+
+
+def contains_specfile(filenames):
+ """Return True if any file in a list are SPEC files.
+ :param List[str] filenames: list of filenames
+ """
+ for fname in filenames:
+ if is_specfile(fname):
+ return True
+ return False
+
+
def main(argv):
"""
Main function to launch the converter as an application
@@ -52,15 +184,29 @@ def main(argv):
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'input_files',
-        nargs="+",
-        help='Input files (EDF, SPEC)')
+        nargs="*",
+        help='Input files (EDF, TIFF, SPEC...). When specifying multiple '
+             'files, you cannot specify both fabio images and SPEC files. '
+             'Multiple SPEC files will simply be concatenated, with one '
+             'entry per scan. Multiple image files will be merged into '
+             'a single entry with a stack of images.')
+    # input_files and --file-pattern are mutually exclusive
+    parser.add_argument(
+        '--file-pattern',
+        help='File name pattern for loading a series of indexed image files '
+             '(toto_%%04d.edf). This argument is incompatible with argument '
+             'input_files. If an output URI with an HDF5 path is provided, '
+             'only the content of the NXdetector group will be copied there. '
+             'If no HDF5 path, or just "/", is given, a complete NXdata '
+             'structure will be created.')
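+    # Illustrative invocations (file names are hypothetical):
+    #     silx convert file1.spec file2.spec -o merged.h5
+    #     silx convert --file-pattern "toto_%04d.edf" -o toto.h5::/entry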
    parser.add_argument(
        '-o', '--output-uri',
-        nargs="?",
-        help='Output file (HDF5). If omitted, it will be the '
-             'concatenated input file names, with a ".h5" suffix added.'
-             ' An URI can be provided to write the data into a specific '
-             'group in the output file: /path/to/file::/path/to/group')
+        default=time.strftime("%Y%m%d-%H%M%S") + '.h5',
+        help='Output file name (HDF5). A URI can be provided to write'
+             ' the data into a specific group in the output file: '
+             '/path/to/file::/path/to/group. '
+             'If not provided, the filename defaults to a timestamp:'
+             ' YYYYmmdd-HHMMSS.h5')
    parser.add_argument(
        '-m', '--mode',
        default="w-",
@@ -69,12 +215,26 @@ def main(argv):
'"w-" (write, fail if file exists) or '
'"a" (read/write if exists, create otherwise)')
parser.add_argument(
- '--no-root-group',
+ '--begin',
+ help='First file index, or first file indices to be considered. '
+ 'This argument only makes sense when used together with '
+ '--file-pattern. Provide as many start indices as there '
+ 'are indices in the file pattern, separated by commas. '
+ 'Examples: "--filepattern toto_%%d.edf --begin 100", '
+ ' "--filepattern toto_%%d_%%04d_%%02d.edf --begin 100,2000,5".')
+ parser.add_argument(
+ '--end',
+ help='Last file index, or last file indices to be considered. '
+ 'The same rules as with argument --begin apply. '
+ 'Example: "--filepattern toto_%%d_%%d.edf --end 199,1999"')
+ parser.add_argument(
+ '--add-root-group',
action="store_true",
- help='This option disables the default behavior of creating a '
- 'root group (entry) for each file to be converted. When '
- 'merging multiple input files, this can cause conflicts '
- 'when datasets have the same name (see --overwrite-data).')
+ help='This option causes each input file to be written to a '
+ 'specific root group with the same name as the file. When '
+ 'merging multiple input files, this can help preventing conflicts'
+ ' when datasets have the same name (see --overwrite-data). '
+ 'This option is ignored when using --file-pattern.')
    parser.add_argument(
        '--overwrite-data',
        action="store_true",
@@ -121,7 +281,7 @@ def main(argv):
    parser.add_argument(
        '--shuffle',
        action="store_true",
-        help='Enables the byte shuffle filter, may improve the compression '
+        help='Enables the byte shuffle filter. This may improve the compression '
             'ratio for block oriented compressors like GZIP or LZF.')
    parser.add_argument(
        '--fletcher32',
@@ -135,22 +295,10 @@ def main(argv):
    options = parser.parse_args(argv[1:])
-    # some shells (windows) don't interpret wildcard characters (*, ?, [])
-    old_input_list = list(options.input_files)
-    options.input_files = []
-    for fname in old_input_list:
-        globbed_files = glob(fname)
-        if not globbed_files:
-            # no files found, keep the name as it is, to raise an error later
-            options.input_files += [fname]
-        else:
-            options.input_files += globbed_files
-    old_input_list = None
-
    if options.debug:
        logging.root.setLevel(logging.DEBUG)
-    # Import most of the things here to be sure to use the right logging level
+    # import these modules here (after handling --debug) so that the
+    # logging level is already set when they are first used
    try:
        # it should be loaded before h5py
        import hdf5plugin  # noqa
@@ -177,22 +325,78 @@ def main(argv):
+ " compressions. You can install it using \"pip install hdf5plugin\"."
_logger.debug(message)
+    # Process input arguments (mutually exclusive arguments)
+    if bool(options.input_files) == (options.file_pattern is not None):
+        if not options.input_files:
+            message = "You must specify either input files (at least one), "
+            message += "or a file pattern."
+        else:
+            message = "You cannot specify input files and a file pattern"
+            message += " at the same time."
+        _logger.error(message)
+        return -1
+    elif options.input_files:
+        # some shells (Windows) don't interpret wildcard characters (*, ?, [])
+        old_input_list = list(options.input_files)
+        options.input_files = []
+        for fname in old_input_list:
+            globbed_files = glob(fname)
+            if not globbed_files:
+                # no files found, keep the name as it is, to raise an error later
+                options.input_files += [fname]
+            else:
+                # glob does not sort files, but the bash shell does
+                options.input_files += sorted(globbed_files)
+    else:
+        # File series
+        dirname = os.path.dirname(options.file_pattern)
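+        # appending "$" anchors the regex at the end of the file name, so
+        # e.g. "toto_0001.edf.bak" is not matched by pattern "toto_%04d.edf"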
+        file_pattern_re = c_format_string_to_re(options.file_pattern) + "$"
+        files_in_dir = glob(os.path.join(dirname, "*"))
+        _logger.debug("""
+            Processing file_pattern
+            dirname: %s
+            file_pattern_re: %s
+            files_in_dir: %s
+            """, dirname, file_pattern_re, files_in_dir)
+
+        options.input_files = sorted(filter(lambda name: re.match(file_pattern_re, name),
+                                            files_in_dir))
+        _logger.debug("options.input_files: %s", options.input_files)
+
+        if options.begin is not None:
+            options.input_files = drop_indices_before_begin(options.input_files,
+                                                            file_pattern_re,
+                                                            options.begin)
+            _logger.debug("options.input_files after applying --begin: %s",
+                          options.input_files)
+
+        if options.end is not None:
+            options.input_files = drop_indices_after_end(options.input_files,
+                                                         file_pattern_re,
+                                                         options.end)
+            _logger.debug("options.input_files after applying --end: %s",
+                          options.input_files)
+
+        if are_files_missing_in_series(options.input_files,
+                                       file_pattern_re):
+            _logger.error("File missing in the file series. Aborting.")
+            return -1
+
+        if not options.input_files:
+            _logger.error("No file matching --file-pattern found.")
+            return -1
+
    # Test that the output path is writeable
-    if options.output_uri is None:
-        input_basenames = [os.path.basename(name) for name in options.input_files]
-        output_name = ''.join(input_basenames) + ".h5"
-        _logger.info("No output file specified, using %s", output_name)
-        hdf5_path = "/"
+    if "::" in options.output_uri:
+        output_name, hdf5_path = options.output_uri.split("::")
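+        # e.g. "out.h5::/entry/data" -> ("out.h5", "/entry/data")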
    else:
-        if "::" in options.output_uri:
-            output_name, hdf5_path = options.output_uri.split("::")
-        else:
-            output_name, hdf5_path = options.output_uri, "/"
+        output_name, hdf5_path = options.output_uri, "/"
    if os.path.isfile(output_name):
        if options.mode == "w-":
-            _logger.error("Output file %s exists and mode is 'w-'"
-                          " (write, file must not exist). Aborting.",
+            _logger.error("Output file %s exists and mode is 'w-' (default)."
+                          " Aborting. To append data to an existing file, "
+                          "use 'a' or 'r+'.",
                          output_name)
            return -1
    elif not os.access(output_name, os.W_OK):
@@ -262,22 +466,80 @@ def main(argv):
    if options.fletcher32:
        create_dataset_args["fletcher32"] = True
-    with h5py.File(output_name, mode=options.mode) as h5f:
-        for input_name in options.input_files:
-            hdf5_path_for_file = hdf5_path
-            if not options.no_root_group:
-                hdf5_path_for_file = hdf5_path.rstrip("/") + "/" + os.path.basename(input_name)
-            write_to_h5(input_name, h5f,
-                        h5path=hdf5_path_for_file,
+    if (len(options.input_files) > 1 and
+            not contains_specfile(options.input_files) and
+            not options.add_root_group) or options.file_pattern is not None:
+        # File series -> stack of images
+        if fabioh5 is None:
+            # return a helpful error message if fabio is missing
+            try:
+                import fabio
+            except ImportError:
+                _logger.error("The fabio library is required to convert"
+                              " EDF files. Please install it with "
+                              "'pip install fabio' and try again.")
+            else:
+                # fabio is available, so the module-level import of
+                # silx.io.fabioh5 must have failed for another reason
+                raise ImportError("Unexpected failure to import silx.io.fabioh5")
+            return -1
+        input_group = fabioh5.File(file_series=options.input_files)
+        if hdf5_path != "/":
+            # we want to append only data and headers to an existing file
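+            # "/scan_0/instrument/detector_0" is the NXdetector group
+            # that fabioh5 exposes for the image series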
+            input_group = input_group["/scan_0/instrument/detector_0"]
+        with h5py.File(output_name, mode=options.mode) as h5f:
+            write_to_h5(input_group, h5f,
+                        h5path=hdf5_path,
                        overwrite_data=options.overwrite_data,
                        create_dataset_args=create_dataset_args,
                        min_size=options.min_size)
-        # append the convert command to the creator attribute, for NeXus files
-        creator = h5f[hdf5_path_for_file].attrs.get("creator", b"").decode()
-        convert_command = " ".join(argv)
-        if convert_command not in creator:
-            h5f[hdf5_path_for_file].attrs["creator"] = \
-                numpy.string_(creator + "; convert command: %s" % " ".join(argv))
+    elif len(options.input_files) == 1 or \
+            are_all_specfile(options.input_files) or \
+            options.add_root_group:
+        # single file, or SPEC files
+        h5paths_and_groups = []
+        for input_name in options.input_files:
+            hdf5_path_for_file = hdf5_path
+            if options.add_root_group:
+                hdf5_path_for_file = hdf5_path.rstrip("/") + "/" + os.path.basename(input_name)
+            try:
+                h5paths_and_groups.append((hdf5_path_for_file,
+                                           silx.io.open(input_name)))
+            except IOError:
+                _logger.error("Cannot read file %s. If this is a file format "
+                              "supported by the fabio library, you can try to"
+                              " install fabio (`pip install fabio`)."
+                              " Aborting conversion.",
+                              input_name)
+                return -1
+
+        with h5py.File(output_name, mode=options.mode) as h5f:
+            for hdf5_path_for_file, input_group in h5paths_and_groups:
+                write_to_h5(input_group, h5f,
+                            h5path=hdf5_path_for_file,
+                            overwrite_data=options.overwrite_data,
+                            create_dataset_args=create_dataset_args,
+                            min_size=options.min_size)
+
+    else:
+        # multiple files, SPEC and fabio images mixed
+        _logger.error("Multiple files with incompatible formats specified. "
+                      "You can provide multiple SPEC files or multiple image "
+                      "files, but not both.")
+        return -1
+
+ with h5py.File(output_name, mode="r+") as h5f:
+ # append "silx convert" to the creator attribute, for NeXus files
+ previous_creator = h5f.attrs.get("creator", u"")
+ creator = "silx convert (v%s)" % silx.version
+ # only if it not already there
+ if creator not in previous_creator:
+ if not previous_creator:
+ new_creator = creator
+ else:
+ new_creator = previous_creator + "; " + creator
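+            # e.g. "fabioh5" becomes "fabioh5; silx convert (v0.8.0)"
+            # (previous value and version number illustrative)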
+            h5f.attrs["creator"] = numpy.array(
+                new_creator,
+                dtype=h5py.special_dtype(vlen=six.text_type))
    return 0