New upstream version 2.4.0

author: Picca Frédéric-Emmanuel <picca@debian.org> 2022-03-20 08:46:25 +0100
committer: Picca Frédéric-Emmanuel <picca@debian.org> 2022-03-20 08:46:25 +0100
commit: 102a43171cebe81d073159c08e883a37e699a165 (patch)
tree: 83851f33a57848d81cff02d5b885538a2c5dc902
parent: a0ff70176cc3d6642a806914e6dcb751c5f29828 (diff)
29 files changed, 322 insertions, 208 deletions
diff --git a/.azure-pipelines/azure-pipelines.yml b/.azure-pipelines/azure-pipelines.yml
index 93363f9..dc03351 100644
--- a/.azure-pipelines/azure-pipelines.yml
+++ b/.azure-pipelines/azure-pipelines.yml
@@ -26,10 +26,17 @@ stages:
 
       - bash: |
           set -eux
-          pip install --disable-pip-version-check flake8
+          pip install --disable-pip-version-check flake8 flake8-comprehensions
           python .azure-pipelines/flake8-validation.py
         displayName: Flake8 validation
 
+      - bash: |
+          set -eux
+          # install versions matching the ones in the corresponding pre-commit hook
+          pip install --disable-pip-version-check mypy==0.931 types-PyYAML==6.0.4 types-requests==2.27.11
+          mypy --no-strict-optional dials_data/
+        displayName: Type checking
+
       # Set up constants for further build steps
       - bash: |
           echo "##vso[task.setvariable variable=BUILD_REPOSITORY_NAME;isOutput=true]${BUILD_REPOSITORY_NAME}"
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index fea6d8d..b97c10c 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.3.0
+current_version = 2.4.0
 commit = True
 tag = True
 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2248526..c6476bf 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
 
 # Automatically sort imports
 - repo: https://github.com/PyCQA/isort
-  rev: 5.9.3
+  rev: 5.10.1
   hooks:
   - id: isort
     args: [
@@ -30,22 +30,24 @@ repos:
 
 # Automatic source code formatting
 - repo: https://github.com/psf/black
-  rev: 21.6b0
+  rev: 22.1.0
   hooks:
   - id: black
     args: [--safe, --quiet]
 
 # Linting
 - repo: https://github.com/PyCQA/flake8
-  rev: 3.9.2
+  rev: 4.0.1
   hooks:
   - id: flake8
-    additional_dependencies: ['flake8-comprehensions==3.5.0']
+    additional_dependencies: ['flake8-comprehensions==3.8.0']
 
 # Type checking
+# Remember to change versions in .azure-pipelines/azure-pipelines.yml to match
+# the versions here.
 - repo: https://github.com/pre-commit/mirrors-mypy
-  rev: v0.910
+  rev: v0.931
   hooks:
   - id: mypy
     files: 'dials_data/.*\.py$'
-    additional_dependencies: ['types-PyYAML==5.4.9']
+    additional_dependencies: ['types-PyYAML==6.0.4', 'types-requests==2.27.11']
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index eb59d8c..95338a6 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -53,10 +53,10 @@ Get Started!
 
 Ready to contribute? Here's how to set up `dials_data` for local development.
 
-1. Fork the `dials_data` repo on GitHub.
+1. Fork the `dials/data` `repository on GitHub <https://github.com/dials/data>`__.
 2. Clone your fork locally::
 
-    $ git clone git@github.com:your_name_here/dials_data.git
+    $ git clone git@github.com:your_name_here/data.git
 
 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
 
@@ -70,14 +70,11 @@ Ready to contribute? Here's how to set up `dials_data` for local development.
 
    Now you can make your changes locally.
 
-5. When you're done making changes, check that your changes pass flake8 and the
-   tests, including testing other Python versions with tox::
+5. Before committing changes to the repository you should install pre-commit::
 
-    $ flake8 dials_data tests
-    $ python setup.py test or py.test
-    $ tox
+    $ pre-commit install
 
-   To get flake8 and tox, just pip install them into your virtualenv.
+   If you do not have pre-commit set up, you can install it with pip.
 
 6. Commit your changes and push your branch to GitHub::
 
@@ -92,15 +89,12 @@ Pull Request Guidelines
 
 Before you submit a pull request, check that it meets these guidelines:
 
-1. The pull request should include tests, unless you are adding or updating
-   a dataset.
+1. Unless you are only touching datasets, the pull request should include tests.
 2. If you add or update a dataset then make individual pull requests for each
    dataset, so that they can be discussed and approved separately.
 3. If the pull request adds functionality, the docs should be updated. Put
    your new functionality into a function with a docstring, and add the
    feature to the list in HISTORY.rst.
-4. The pull request should work for all supported Python versions. Check
-   https://travis-ci.com/dials/data/pull_requests
 
 
 Deploying
diff --git a/HISTORY.rst b/HISTORY.rst
index e10709e..1f7595d 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,6 +2,14 @@
 History
 =======
 
+2.4 (2022-03-07)
+^^^^^^^^^^^^^^^^
+
+* dials_data no longer uses ``py.path`` internally.
+* dials_data now includes type checking with mypy.
+* We started using the ``requests`` library for faster downloads.
+* Downloads now happen in parallel.
+
 2.3 (2022-01-11)
 ^^^^^^^^^^^^^^^^
 
diff --git a/dials_data/__init__.py b/dials_data/__init__.py
index d94fe09..25a6614 100644
--- a/dials_data/__init__.py
+++ b/dials_data/__init__.py
@@ -8,6 +8,6 @@ from __future__ import annotations
 __all__: list = []
 __author__ = """Markus Gerstel"""
 __email__ = "dials-support@lists.sourceforge.net"
-__version__ = "2.3.0"
+__version__ = "2.4.0"
 __commit__ = ""
 __version_tuple__ = tuple(int(x) for x in __version__.split("."))
diff --git a/dials_data/cli.py b/dials_data/cli.py
index 970befd..d8c5c22 100644
--- a/dials_data/cli.py
+++ b/dials_data/cli.py
@@ -5,7 +5,6 @@ import sys
 
 import yaml
 
-import dials_data
 import dials_data.datasets
 import dials_data.download
 
@@ -72,7 +71,7 @@ def cli_get(cmd_args):
 
     repository = dials_data.datasets.repository_location()
     if not args.quiet:
-        print(f"Repository location: {repository.strpath}\n")
+        print(f"Repository location: {repository}\n")
 
     for ds in args.dataset:
         if not args.quiet:
@@ -86,9 +85,9 @@ def cli_get(cmd_args):
             with open(f"{ds}.yml", "w") as fh:
                 yaml.dump(hashinfo, fh, default_flow_style=False)
         if args.quiet:
-            print(repository.join(ds).strpath)
+            print(repository / ds)
         else:
-            print(f"Dataset {ds} stored in {repository.join(ds).strpath}")
+            print(f"Dataset {ds} stored in {repository.joinpath(ds)}")
 
 
 def cli_list(cmd_args):
diff --git a/dials_data/datasets.py b/dials_data/datasets.py
index ccd54fb..2fe193a 100644
--- a/dials_data/datasets.py
+++ b/dials_data/datasets.py
@@ -5,15 +5,19 @@ from __future__ import annotations
 import hashlib
 import os
 import textwrap
+from pathlib import Path
+from typing import Any
 
 import importlib_resources
-import py
 import yaml
 
 _hashinfo_formatversion = 1
 
+definition: dict[str, Any]
+fileinfo_dirty: set[str]
 
-def _load_yml_definitions():
+
+def _load_yml_definitions() -> None:
     """
     Read dataset .yml files from definitions/ and hashinfo/ directories.
     This is done once during the module import stage.
@@ -48,7 +52,7 @@ def _load_yml_definitions():
 _load_yml_definitions()
 
 
-def create_integrity_record(dataset_name):
+def create_integrity_record(dataset_name) -> dict[str, Any]:
     """
     Generate a dictionary for the integrity information of a specific dataset.
     """
@@ -58,7 +62,7 @@ def create_integrity_record(dataset_name):
     }
 
 
-def repository_location():
+def repository_location() -> Path:
     """
     Returns an appropriate location where the downloaded regression data should
     be stored.
@@ -75,53 +79,51 @@ def repository_location():
     """
     if os.getenv("DIALS_DATA"):
         try:
-            repository = py.path.local(os.getenv("DIALS_DATA"))
-            repository.ensure(dir=1)
+            repository = Path(os.environ["DIALS_DATA"])
+            repository.mkdir(parents=True, exist_ok=True)
             return repository
-        except Exception:
+        except (KeyError, TypeError, OSError):
             pass
     try:
-        repository = py.path.local("/dls/science/groups/scisoft/DIALS/dials_data")
-        if repository.check(dir=1):
+        repository = Path("/dls/science/groups/scisoft/DIALS/dials_data")
+        if repository.is_dir():
             return repository
-    except Exception:
+    except OSError:
         pass
     if os.getenv("LIBTBX_BUILD"):
         try:
-            repository = py.path.local(os.getenv("LIBTBX_BUILD")).join("dials_data")
-            repository.ensure(dir=1)
+            repository = Path(os.environ["LIBTBX_BUILD"]) / "dials_data"
+            repository.mkdir(parents=True, exist_ok=True)
             return repository
-        except Exception:
+        except (KeyError, TypeError, OSError):
             pass
-    repository = (
-        py.path.local(os.path.expanduser("~")).join(".cache").join("dials_data")
-    )
     try:
-        repository.ensure(dir=1)
+        repository = Path.home() / ".cache" / "dials_data"
+        repository.mkdir(parents=True, exist_ok=True)
         return repository
-    except Exception:
+    except (TypeError, OSError):
         raise RuntimeError(
             "Could not determine regression data location. Use environment variable DIALS_DATA"
         )
 
 
-def get_resident_size(ds):
+def get_resident_size(ds) -> int:
     if ds in fileinfo_dirty:
         return 0
     return sum(item["size"] for item in definition[ds]["hashinfo"]["verify"])
 
 
-def _human_readable(num, suffix="B"):
+def _human_readable(num: float, suffix: str = "B") -> str:
     for unit in ("", "k", "M", "G"):
         if num < 10:
             return f"{num:.1f}{unit}{suffix}"
         if num < 1024:
             return f"{num:.0f}{unit}{suffix}"
         num /= 1024.0
-    return "{:.0f}{}{}".format(num, "T", suffix)
+    return f"{num:.0f}T{suffix}"
 
 
-def list_known_definitions(ds_list, quiet=False):
+def list_known_definitions(ds_list, quiet=False) -> None:
     indent = " " * 4
     for shortname in sorted(ds_list):
         if quiet:
diff --git a/dials_data/definitions/aluminium_standard.yml b/dials_data/definitions/aluminium_standard.yml
new file mode 100644
index 0000000..46619d1
--- /dev/null
+++ b/dials_data/definitions/aluminium_standard.yml
@@ -0,0 +1,27 @@
+name: Aluminium standard
+author: Elena Pascal, Yun Song (2021)
+license: CC-BY 4.0
+description: >
+  Powder diffraction data collected by Yun Song at DLS on eBIC's Talos Arctica (m12).
+  The sample is a polycrystalline aluminium standard.
+
+  imported.expt was generated with
+    $ dials.import $(dials.data get -q aluminium_standard)/0p67_5s_0000.mrc
+
+  eyeballed.expt was generated with dials.powder_calibrate_widget
+
+  calibrated.expt was generated with
+    starting_file = "eyeballed.expt"
+    test_args = [starting_file, "standard=Al", "eyeball=False"]
+    expt_parameters, user_arguments = parse_args(args=test_args)
+    calibrator = PowderCalibrator(expt_params=expt_parameters, user_args=user_arguments)
+    calibrator.calibrate_with_calibrant(verbose=True)
+  
+  Used for powder geometry calibration testing.
+
+data:
+
+  - url: https://raw.githubusercontent.com/dials/data-files/35addc5660df90b3b9579bc932e95a9aa01389fe/aluminium_standard/0p67_5s_0000.mrc
+  - url: https://raw.githubusercontent.com/dials/data-files/35addc5660df90b3b9579bc932e95a9aa01389fe/aluminium_standard/imported.expt
+  - url: https://raw.githubusercontent.com/dials/data-files/35addc5660df90b3b9579bc932e95a9aa01389fe/aluminium_standard/eyeballed.expt
+  - url: https://raw.githubusercontent.com/dials/data-files/35addc5660df90b3b9579bc932e95a9aa01389fe/aluminium_standard/calibrated.expt
diff --git a/dials_data/definitions/cunir_serial_processed.yml b/dials_data/definitions/cunir_serial_processed.yml
index 9290066..d4a76c0 100644
--- a/dials_data/definitions/cunir_serial_processed.yml
+++ b/dials_data/definitions/cunir_serial_processed.yml
@@ -6,11 +6,13 @@ description: >
   Light Source, courtesy of Robin Owen and Mike Hough.
   5 images were processed with DIALS v3.7.3, with and without a reference
   geometry, for testing indexing.
-  Images are located at Diamond Light Source at
-  /dls/mx-scratch/dials/i24-ssx/CuNIR/merlin/
+  Images can be found in the 'cunir_serial' data set; file references are
+  laid out so that the images can be accessed if that data set is present.
 
 data:
-  - url: https://github.com/dials/data-files/raw/da38a5de509cb7cb22cc28f439058c66923333d1/ssx_CuNiR_test_data/imported_no_ref_5.expt
-  - url: https://github.com/dials/data-files/raw/da38a5de509cb7cb22cc28f439058c66923333d1/ssx_CuNiR_test_data/imported_with_ref_5.expt
+  - url: https://github.com/dials/data-files/raw/13e97f957175c3e31d34e567b7369b748cfa1d55/ssx_CuNiR_test_data/imported_no_ref_5.expt
+  - url: https://github.com/dials/data-files/raw/13e97f957175c3e31d34e567b7369b748cfa1d55/ssx_CuNiR_test_data/imported_with_ref_5.expt
   - url: https://github.com/dials/data-files/raw/da38a5de509cb7cb22cc28f439058c66923333d1/ssx_CuNiR_test_data/strong_1.refl
   - url: https://github.com/dials/data-files/raw/da38a5de509cb7cb22cc28f439058c66923333d1/ssx_CuNiR_test_data/strong_5.refl
+  - url: https://github.com/dials/data-files/raw/70d2cbb1bb5d0678b133039a7e009a623762f0ff/ssx_CuNiR_test_data/indexed.expt
+  - url: https://github.com/dials/data-files/raw/70d2cbb1bb5d0678b133039a7e009a623762f0ff/ssx_CuNiR_test_data/indexed.refl
diff --git a/dials_data/definitions/four_circle_eiger.yml b/dials_data/definitions/four_circle_eiger.yml
new file mode 100644
index 0000000..47ac681
--- /dev/null
+++ b/dials_data/definitions/four_circle_eiger.yml
@@ -0,0 +1,48 @@
+name: Single-crystal small-molecule X-ray diffractometry data in NXmx format, using a four-circle goniometer
+author: Rebecca Scatena, Mark Warren, Ben Williams (2022)
+license: CC-BY 4.0
+url: https://doi.org/10.5281/zenodo.6093230
+description: >
+  Example NXmx data of X-ray diffraction from a sample of
+  a small-molecule material, [Cu(HF₂)(pyrazine)₂]PF₆, on
+  a four-circle diffractometer.  Data recorded on I19-2
+  (beamline I19, experiments hutch 2) at Diamond Light
+  Source using a Dectris Eiger2 X 4M CdTe detector.
+  
+  Inventory of data:
+
+  01_CuHF2pyz2PF6b_Phi.tar.xz
+  A single 1750-image 350° φ rotation scan with detector
+  position 2θ = 0°.
+
+  02_CuHF2pyz2PF6b_2T.tar.xz
+  A single 1750-image 350° φ rotation scan with detector
+  position 2θ = 20°.
+
+  03_CuHF2pyz2PF6b_P_O.tar.xz
+  Two sequential rotation scans:
+    CuHF2pyz2PF6b_P_O_01.nxs — a 1750-image 350° φ scan.
+    CuHF2pyz2PF6b_P_O_02.nxs — a 600-image 120° ω scan.
+  
+data:
+  - url: https://zenodo.org/record/6093231/files/01_CuHF2pyz2PF6b_Phi.tar.xz
+    files:
+      - 01_CuHF2pyz2PF6b_Phi/CuHF2pyz2PF6b_Phi_01_000001.h5
+      - 01_CuHF2pyz2PF6b_Phi/CuHF2pyz2PF6b_Phi_01_000002.h5
+      - 01_CuHF2pyz2PF6b_Phi/CuHF2pyz2PF6b_Phi_01_meta.h5
+      - 01_CuHF2pyz2PF6b_Phi/CuHF2pyz2PF6b_Phi_01.nxs
+  - url: https://zenodo.org/record/6093231/files/02_CuHF2pyz2PF6b_2T.tar.xz
+    files:
+      - 02_CuHF2pyz2PF6b_2T/CuHF2pyz2PF6b_2T_01_000001.h5
+      - 02_CuHF2pyz2PF6b_2T/CuHF2pyz2PF6b_2T_01_000002.h5
+      - 02_CuHF2pyz2PF6b_2T/CuHF2pyz2PF6b_2T_01_meta.h5
+      - 02_CuHF2pyz2PF6b_2T/CuHF2pyz2PF6b_2T_01.nxs
+  - url: https://zenodo.org/record/6093231/files/03_CuHF2pyz2PF6b_P_O.tar.xz
+    files:
+      - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_01_000001.h5
+      - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_01_000002.h5
+      - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_01_meta.h5
+      - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_01.nxs
+      - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_02_000001.h5
+      - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_02_meta.h5
+      - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_02.nxs
diff --git a/dials_data/download.py b/dials_data/download.py
index 26d2932..70d9ec1 100644
--- a/dials_data/download.py
+++ b/dials_data/download.py
@@ -1,14 +1,20 @@
 from __future__ import annotations
 
+import concurrent.futures
 import contextlib
 import errno
+import functools
+import hashlib
 import os
 import tarfile
 import warnings
 import zipfile
 from pathlib import Path
+from typing import Any, Optional, Union
 from urllib.parse import urlparse
-from urllib.request import urlopen
+
+import py.path
+import requests
 
 import dials_data.datasets
 
@@ -21,7 +27,6 @@ if os.name == "posix":
     def _platform_unlock(file_handle):
         fcntl.lockf(file_handle, fcntl.LOCK_UN)
 
-
 elif os.name == "nt":
     import msvcrt
 
@@ -41,7 +46,6 @@ elif os.name == "nt":
         file_handle.seek(0)
         msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1)
 
-
 else:
 
     def _platform_lock(file_handle):
@@ -71,59 +75,51 @@ def _file_lock(file_handle):
 
 
 @contextlib.contextmanager
-def download_lock(target_dir):
+def download_lock(target_dir: Optional[Path]):
     """
     Obtains a (cooperative) lock on a lockfile in a target directory, so only a
     single (cooperative) process can enter this context manager at any one time.
     If the lock is held this will block until the existing lock is released.
     """
-    with target_dir.join(".lock").open(mode="w", ensure=True) as fh:
+    if not target_dir:
+        yield
+        return
+    target_dir.mkdir(parents=True, exist_ok=True)
+    with target_dir.joinpath(".lock").open(mode="w") as fh:
         with _file_lock(fh):
             yield
 
 
-def _download_to_file(url, pyfile):
+def _download_to_file(session: requests.Session, url: str, pyfile: Path):
     """
     Downloads a single URL to a file.
     """
-    with contextlib.closing(urlopen(url)) as socket:
-        file_size = socket.info().get("Content-Length")
-        if file_size:
-            file_size = int(file_size)
-        # There is no guarantee that the content-length header is set
-        received = 0
-        block_size = 8192
-        # Allow for writing the file immediately so we can empty the buffer
-        with pyfile.open(mode="wb", ensure=True) as f:
-            while True:
-                block = socket.read(block_size)
-                received += len(block)
-                f.write(block)
-                if not block:
-                    break
-
-    if file_size and file_size != received:
-        raise OSError(
-            "Error downloading {url}: received {received} bytes instead of expected {file_size} bytes".format(
-                file_size=file_size, received=received, url=url
-            )
-        )
+    with session.get(url, stream=True) as r:
+        r.raise_for_status()
+        pyfile.parent.mkdir(parents=True, exist_ok=True)
+        with pyfile.open(mode="wb") as f:
+            for chunk in r.iter_content(chunk_size=40960):
+                f.write(chunk)
 
 
-def file_hash(path):
+def file_hash(file_to_hash: Path) -> str:
     """Returns the SHA256 digest of a file."""
-    return path.computehash(hashtype="sha256")
+    sha256_hash = hashlib.sha256()
+    with file_to_hash.open("rb") as f:
+        for block in iter(lambda: f.read(4096), b""):
+            sha256_hash.update(block)
+    return sha256_hash.hexdigest()
 
 
 def fetch_dataset(
     dataset,
-    ignore_hashinfo=False,
-    verify=False,
-    read_only=False,
-    verbose=False,
-    pre_scan=True,
-    download_lockdir=None,
-):
+    ignore_hashinfo: bool = False,
+    verify: bool = False,
+    read_only: bool = False,
+    verbose: bool = False,
+    pre_scan: bool = True,
+    download_lockdir: Optional[Path] = None,
+) -> Union[bool, Any]:
     """Check for the presence or integrity of the local copy of the specified
     test dataset. If the dataset is not available or out of date then attempt
     to download/update it transparently.
@@ -144,8 +140,8 @@ def fetch_dataset(
         return False
     definition = dials_data.datasets.definition[dataset]
 
-    target_dir = dials_data.datasets.repository_location().join(dataset)
-    if read_only and not target_dir.check(dir=1):
+    target_dir: Path = dials_data.datasets.repository_location() / dataset
+    if read_only and not target_dir.is_dir():
         return False
 
     integrity_info = definition.get("hashinfo")
@@ -154,10 +150,10 @@ def fetch_dataset(
 
     if "verify" not in integrity_info:
         integrity_info["verify"] = [{} for _ in definition["data"]]
-    filelist = [
+    filelist: list[dict[str, Any]] = [
         {
             "url": source["url"],
-            "file": target_dir.join(os.path.basename(urlparse(source["url"]).path)),
+            "file": target_dir / os.path.basename(urlparse(source["url"]).path),
             "files": source.get("files"),
             "verify": hashinfo,
         }
@@ -166,90 +162,86 @@ def fetch_dataset(
 
     if pre_scan or read_only:
         if all(
-            item["file"].check()
+            item["file"].is_file()
             and item["verify"].get("size")
-            and item["verify"]["size"] == item["file"].size()
+            and item["verify"]["size"] == item["file"].stat().st_size
             for item in filelist
         ):
             return True
         if read_only:
             return False
 
-    if download_lockdir:
-        # Acquire lock if required as files may be downloaded/written.
-        with download_lock(download_lockdir):
-            _fetch_filelist(filelist, file_hash)
-    else:
-        _fetch_filelist(filelist, file_hash)
+    # Acquire lock if required as files may be downloaded/written.
+    with download_lock(download_lockdir):
+        _fetch_filelist(filelist)
 
     return integrity_info
 
 
-def _fetch_filelist(filelist, file_hash):
-    for source in filelist:  # parallelize this
-        if source.get("type", "file") == "file":
-            valid = False
-            if source["file"].check(file=1):
-                # verify
-                valid = True
-                if source["verify"]:
-                    if source["verify"]["size"] != source["file"].size():
-                        valid = False
-                        print("size")
-                    elif source["verify"]["hash"] != file_hash(source["file"]):
-                        valid = False
+def _fetch_filelist(filelist: list[dict[str, Any]]) -> None:
+    """Download all entries in parallel; wait for completion and surface errors."""
+    with requests.Session() as rs, concurrent.futures.ThreadPoolExecutor(5) as pool:
+        list(pool.map(functools.partial(_fetch_file, rs), filelist))
+
+
+def _fetch_file(session: requests.Session, source: dict[str, Any]) -> None:
+    valid = False
+    if source["file"].is_file():
+        # verify
+        valid = True
+        if source["verify"]:
+            if source["verify"]["size"] != source["file"].stat().st_size:
+                valid = False
+            elif source["verify"]["hash"] != file_hash(source["file"]):
+                valid = False
+
+    downloaded = False
+    if not valid:
+        print(f"Downloading {source['url']}")
+        _download_to_file(session, source["url"], source["file"])
+        downloaded = True
+
+    # verify
+    valid = True
+    if source["verify"]:
+        if source["verify"]["size"] != source["file"].stat().st_size:
+            print(
+                f"File size mismatch on {source['file']}: "
+                f"{source['file'].stat().st_size}, expected {source['verify']['size']}"
+            )
+        elif source["verify"]["hash"] != file_hash(source["file"]):
+            print(f"File hash mismatch on {source['file']}")
+    else:
+        source["verify"]["size"] = source["file"].stat().st_size
+        source["verify"]["hash"] = file_hash(source["file"])
+
+    # If the file is a tar archive, then decompress
+    if source["files"]:
+        target_dir = source["file"].parent
+        if downloaded or not all((target_dir / f).is_file() for f in source["files"]):
+            # If the file has been (re)downloaded, or we don't have all the requested
+            # files from the archive, then we need to decompress the archive
+            print(f"Decompressing {source['file']}")
+            if source["file"].suffix == ".zip":
+                with zipfile.ZipFile(source["file"]) as zf:
+                    try:
+                        for f in source["files"]:
+                            zf.extract(f, path=source["file"].parent)
+                    except KeyError:
                         print(
-                            "hash", source["verify"]["hash"], file_hash(source["file"])
+                            f"Expected file {f} not present "
+                            f"in zip archive {source['file']}"
                         )
-
-            downloaded = False
-            if not valid:
-                print("Downloading {}".format(source["url"]))
-                _download_to_file(source["url"], source["file"])
-                downloaded = True
-
-            # verify
-            valid = True
-            fileinfo = {
-                "size": source["file"].size(),
-                "hash": file_hash(source["file"]),
-            }
-            if source["verify"]:
-                if source["verify"]["size"] != fileinfo["size"]:
-                    valid = False
-                elif source["verify"]["hash"] != fileinfo["hash"]:
-                    valid = False
             else:
-                source["verify"]["size"] = fileinfo["size"]
-                source["verify"]["hash"] = fileinfo["hash"]
-
-        # If the file is a tar archive, then decompress
-        if source["files"]:
-            target_dir = source["file"].dirpath()
-            if downloaded or not all(
-                (target_dir / f).check(file=1) for f in source["files"]
-            ):
-                # If the file has been (re)downloaded, or we don't have all the requested
-                # files from the archive, then we need to decompress the archive
-                print("Decompressing {file}".format(file=source["file"]))
-                if source["file"].ext == ".zip":
-                    with zipfile.ZipFile(source["file"].strpath) as zf:
+                with tarfile.open(source["file"]) as tar:
+                    for f in source["files"]:
                         try:
-                            for f in source["files"]:
-                                zf.extract(f, path=source["file"].dirname)
+                            tar.extract(f, path=source["file"].parent)
                         except KeyError:
                             print(
-                                f"Expected file {f} not present in zip archive {source['file']}"
+                                f"Expected file {f} not present "
+                                f"in tar archive {source['file']}"
                             )
-                else:
-                    with tarfile.open(source["file"].strpath) as tar:
-                        for f in source["files"]:
-                            try:
-                                tar.extract(f, path=source["file"].dirname)
-                            except KeyError:
-                                print(
-                                    f"Expected file {f} not present in tar archive {source['file']}"
-                                )
 
 
 class DataFetcher:
@@ -259,7 +251,7 @@ class DataFetcher:
         df = DataFetcher()
     Then
         df('insulin')
-    returns a py.path object to the insulin data. If that data is not already
+    returns a Path object to the insulin data. If that data is not already
     on disk it is downloaded automatically.
 
     To disable all downloads:
@@ -269,14 +261,14 @@ class DataFetcher:
     """
 
     def __init__(self, read_only=False):
-        self._cache = {}
-        self._target_dir = dials_data.datasets.repository_location()
-        self._read_only = read_only and os.access(self._target_dir.strpath, os.W_OK)
+        self._cache: dict[str, Optional[Path]] = {}
+        self._target_dir: Path = dials_data.datasets.repository_location()
+        self._read_only: bool = read_only and os.access(self._target_dir, os.W_OK)
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return "<{}DataFetcher: {}>".format(
             "R/O " if self._read_only else "",
-            self._target_dir.strpath,
+            self._target_dir,
         )
 
     def result_filter(self, result, **kwargs):
@@ -288,7 +280,7 @@ class DataFetcher:
         """
         return result
 
-    def __call__(self, test_data, pathlib=None, **kwargs):
+    def __call__(self, test_data: str, pathlib=None, **kwargs):
         """
         Return the location of a dataset, transparently downloading it if
         necessary and possible.
@@ -304,7 +296,7 @@ class DataFetcher:
                  if the dataset is not available.
         """
         if test_data not in self._cache:
-            self._cache[test_data] = self._attempt_fetch(test_data, **kwargs)
+            self._cache[test_data] = self._attempt_fetch(test_data)
         if pathlib is None:
             warnings.warn(
                 "The DataFetcher currently returns py.path.local() objects. "
@@ -314,15 +306,13 @@ class DataFetcher:
                 DeprecationWarning,
                 stacklevel=2,
             )
-        if pathlib and self._cache[test_data]["result"]:
-            result = {
-                **self._cache[test_data],
-                "result": Path(self._cache[test_data]["result"]),
-            }
-            return self.result_filter(**result)
-        return self.result_filter(**self._cache[test_data])
-
-    def _attempt_fetch(self, test_data):
+        if not self._cache[test_data]:
+            return self.result_filter(result=False)
+        elif not pathlib:
+            return self.result_filter(result=py.path.local(self._cache[test_data]))
+        return self.result_filter(result=self._cache[test_data])
+
+    def _attempt_fetch(self, test_data: str) -> Optional[Path]:
         if self._read_only:
             data_available = fetch_dataset(test_data, pre_scan=True, read_only=True)
         else:
@@ -333,6 +323,6 @@ class DataFetcher:
                 download_lockdir=self._target_dir,
             )
         if data_available:
-            return {"result": self._target_dir.join(test_data)}
+            return self._target_dir / test_data
         else:
-            return {"result": False}
+            return None
diff --git a/dials_data/hashinfo/aluminium_standard.yml b/dials_data/hashinfo/aluminium_standard.yml
new file mode 100644
index 0000000..bf6acaf
--- /dev/null
+++ b/dials_data/hashinfo/aluminium_standard.yml
@@ -0,0 +1,11 @@
+definition: 4eaca4076872da390c5db4ca7e545159674de804ca8639cb38a99db05651e80b
+formatversion: 1
+verify:
+- hash: 1438d6a48e3d6c9086d0d5d24ab677b88564a7208276a146e543ff0309fd29b3
+  size: 16779008
+- hash: 062d4e03c306a47898068d2e94c98201bdd2ac5be0e4b1862558571f81a49405
+  size: 3534
+- hash: f7c20d958c5493929b4aac017e3d153f79ecfa18c750a469eeabf875c0a00ae8
+  size: 3568
+- hash: 8517435c4a14d53d0b0f25f05a1d05b70cf52baf05b14c2b5a3b12519915ba79
+  size: 3568
diff --git a/dials_data/hashinfo/cunir_serial_processed.yml b/dials_data/hashinfo/cunir_serial_processed.yml
index 3840e27..65bcd64 100644
--- a/dials_data/hashinfo/cunir_serial_processed.yml
+++ b/dials_data/hashinfo/cunir_serial_processed.yml
@@ -1,11 +1,15 @@
-definition: 9140f0bd0ad5c90e7a3f36941ae88399dab2bec0a10746504836d7ceb683515c
+definition: 16b3f8545250a8fb50634d426f23c4d914d1896fd7fab31420ae10854c22e30f
 formatversion: 1
 verify:
-- hash: 16a8e3c363bed6d22d6877738203039883d83cfecd9d142672a434815ae41461
-  size: 6951
-- hash: e50bb7a02c11ebcb69d6788a6780b67ac25a4fbdc8d12f201926d2830baa642f
-  size: 7060
+- hash: dca8ecd27f43c0140fbbfc946e7fff5931a829a3ab9358f351d2908fce536806
+  size: 6967
+- hash: b43b2058c93d0bf9a958fbbf1905252f8f8e2b54926875aa184a28a0d880c3ec
+  size: 7076
 - hash: d8a33e241a2c8f96edcbfa7b7634ca7fee99b6a92b19b2a7d54affffe7e98211
   size: 32769
 - hash: 17243f85082e370f147a40236be3c05d35bf29071bde9ff94534115b8c22602c
   size: 152073
+- hash: cbd49cc528de4880d01ddfc99efdb4260a649162cdd7108b75934f6a9a60c06d
+  size: 22264
+- hash: 8336e3ebf9d472225c1aa4fa80afb380f3521265f15746a1742b65f5c1ead0c7
+  size: 148385
diff --git a/dials_data/hashinfo/four_circle_eiger.yml b/dials_data/hashinfo/four_circle_eiger.yml
new file mode 100644
index 0000000..5e8c2c8
--- /dev/null
+++ b/dials_data/hashinfo/four_circle_eiger.yml
@@ -0,0 +1,9 @@
+definition: 26dad0598c500ceb4a535234ce0515dc9562b861dd3573fd671785713ecdf44d
+formatversion: 1
+verify:
+- hash: 13a4088f838f20b4959e5607d799bb6f418d35f893f2f8b4effc01ecf0cf1c9c
+  size: 382763416
+- hash: 496f96c265a091eb151d8fc25e352e297be4257fbacda035403036094a8f0b4c
+  size: 376888156
+- hash: d4eda31c0a9285e4f064298ea7001a6d5531ddd00ab2b8cee644bb9520738cb5
+  size: 509819364
diff --git a/dials_data/hashinfo/isis_sxd_example_data.yml b/dials_data/hashinfo/isis_sxd_example_data.yml
index a8c213e..7c7ede6 100644
--- a/dials_data/hashinfo/isis_sxd_example_data.yml
+++ b/dials_data/hashinfo/isis_sxd_example_data.yml
@@ -1,4 +1,4 @@
-definition: e96c799b2b252663cb8c54c005eccb3911be7d3ec9469ba9993bf172c5872468
+definition: 63d2bcc84235e3d5773bbed0b16d8c597223f0aae74b84fc6482e66bd2b4fb41
 formatversion: 1
 verify:
 - hash: b11851993e5b33a047fc5a03e5f320836fd8d1f4bb2a64ca442ea73aebf1a074
diff --git a/dials_data/hashinfo/lcls_rayonix_kapton.yml b/dials_data/hashinfo/lcls_rayonix_kapton.yml
index 83f0736..1c810ec 100644
--- a/dials_data/hashinfo/lcls_rayonix_kapton.yml
+++ b/dials_data/hashinfo/lcls_rayonix_kapton.yml
@@ -1,4 +1,4 @@
-definition: cceba9f63167c6ff5f568479dc42b26463d2a770a2cf1430263cebc8710e02b4
+definition: 29db1408c99173ed6fa64ae5ca5e5a091a00a88836e0d506a8dbea4c5d5f092b
 formatversion: 1
 verify:
 - hash: 6fca790fe0845b7475d2db2c2f88c6c307551179ca51ddda00fa849ae235ceed
diff --git a/dials_data/hashinfo/mpro_x0692.yml b/dials_data/hashinfo/mpro_x0692.yml
index 550842f..8c35d39 100644
--- a/dials_data/hashinfo/mpro_x0692.yml
+++ b/dials_data/hashinfo/mpro_x0692.yml
@@ -1,4 +1,4 @@
-definition: 8139f395a31405c7a5fa4003c1dabf18ca4258a9e8fff12a97127d81af41f30a
+definition: 51b7efa2411239a6499b499af66a1f7a7e3ba5a8eef98fee7607677a4cdc3eee
 formatversion: 1
 verify:
 - hash: cd47b6f7ec61d7acf74fb4522ff9636ed5786c03d737d96b98e4a0df7e276af0
diff --git a/dials_data/py.typed b/dials_data/py.typed
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/dials_data/py.typed
diff --git a/dials_data/pytest11.py b/dials_data/pytest11.py
index d344289..8287a73 100644
--- a/dials_data/pytest11.py
+++ b/dials_data/pytest11.py
@@ -20,7 +20,7 @@ def pytest_addoption(parser):
 
 
 @pytest.fixture(scope="session")
-def dials_data(request):
+def dials_data(request) -> DataFetcher:
     """
     Return the location of a regression dataset as py.path object.
     Download the files if they are not on disk already.
diff --git a/docs/conf.py b/docs/conf.py
index f95206c..749d655 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -56,7 +56,7 @@ author = "Markus Gerstel"
 # the built documents.
 #
 # The short X.Y version.
-version = "2.3.0"
+version = "2.4.0"
 # The full version, including alpha/beta/rc tags.
 release = version
 
diff --git a/docs/installation.rst b/docs/installation.rst
index 50d1223..5f2d412 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -26,11 +26,11 @@ Both are valid opinions.
 
 However you do not need to install ``dials_data`` from source. You can simply run::
 
-    pip install -U dials_data
+    pip install -U dials-data
 
 or, in a conda environment::
 
-    conda install -c conda-forge dials_data
+    conda install -c conda-forge dials-data
 
 This will install or update an existing installation of ``dials_data``.
 
diff --git a/docs/why.rst b/docs/why.rst
index 4b0622f..6accf9d 100644
--- a/docs/why.rst
+++ b/docs/why.rst
@@ -33,7 +33,7 @@ more files are added. This quickly becomes impractical in distributed
 testing environments. The disk space required for checkouts can be
 reduced by compressing the data, but then they need to be unpacked for
 using the data in tests. By its nature the internal SVN repository was
-not publically accessible. The data files were too large to convert the
+not publicly accessible. The data files were too large to convert the
 repository to a git repository to be hosted on Github, and in any case
 a git repository was not the best place either to store large amounts
 of data, as old versions of the data or retired datasets are kept
@@ -48,7 +48,7 @@ first.
 With dxtbx_, dials_ and xia2_ moving to pytest_ we extended the
 xia2_regression_ concept into the regression_data_ fixture to provide
 a simple way to access the datasets in tests, but the data still
-needed downloading separately and coult not easily be used outside
+needed downloading separately and could not easily be used outside
 of the dials_ repository and not at all outside of a dials_
 distribution. Adding data files was still a very involved process.
 
diff --git a/pyproject.toml b/pyproject.toml
index 121a39f..d1c90b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,7 @@
 [build-system]
 requires = ["setuptools >= 40.6.0", "wheel"]
 build-backend = "setuptools.build_meta"
+
+[[tool.mypy.overrides]]
+module = [ "py", "py.path", "pytest", "importlib_resources" ]
+ignore_missing_imports = true
diff --git a/requirements.txt b/requirements.txt
index 5c2deea..2b32889 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 importlib_resources==5.4.0
 py==1.11.0
-pytest==6.2.5
+pytest==7.0.1
 pyyaml==6.0
+requests==2.27.1
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 34a0b2c..c4b6d93 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -1,7 +1,8 @@
 collective.checkdocs==0.2
-coverage==6.2
+coverage==6.3.2
 importlib_resources==5.4.0
 py==1.11.0
-pytest==6.2.5
+pytest==7.0.1
 pyyaml==6.0
-wheel==0.37.0
+requests==2.27.1
+wheel==0.37.1
diff --git a/requirements_doc.txt b/requirements_doc.txt
index f097e9e..9a901e1 100644
--- a/requirements_doc.txt
+++ b/requirements_doc.txt
@@ -1,5 +1,5 @@
 py==1.11.0
-pytest==6.2.5
+pytest==7.0.1
 pyyaml==6.0
-Sphinx==4.3.1
+Sphinx==4.4.0
 sphinx_rtd_theme==1.0.0
diff --git a/setup.cfg b/setup.cfg
index be8d34b..9b0d33d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = dials_data
-version = 2.3.0
+version = 2.4.0
 url = https://github.com/dials/data
 project_urls =
     Bug Tracker = https://github.com/dials/data/issues
@@ -31,6 +31,7 @@ install_requires =
     importlib_resources>=1.1
     pytest
     pyyaml
+    requests
 # importlib; python_version == "2.6"
 packages = find:
 python_requires = >=3.7
@@ -46,6 +47,9 @@ libtbx.precommit =
 pytest11 =
     dials_data = dials_data.pytest11
 
+[options.package_data]
+dials_data = py.typed
+
 [flake8]
 # Black disagrees with flake8 on a few points. Ignore those.
 ignore = E203, E266, E501, W503
diff --git a/tests/test_dials_data.py b/tests/test_dials_data.py
index 8905285..c0d5ffb 100644
--- a/tests/test_dials_data.py
+++ b/tests/test_dials_data.py
@@ -17,39 +17,39 @@ def test_all_datasets_can_be_parsed():
 
 def test_repository_location():
     rl = dials_data.datasets.repository_location()
-    assert rl.check(dir=1)
+    assert rl.is_dir()
 
 
 def test_fetching_undefined_datasets_does_not_crash():
     df = dials_data.download.DataFetcher(read_only=True)
-    assert df("aardvark") is False
+    assert df("aardvark", pathlib=True) is False
 
 
 def test_requests_for_future_datasets_can_be_intercepted():
     df = dials_data.download.DataFetcher(read_only=True)
     df.result_filter = mock.Mock()
     df.result_filter.return_value = False
-    assert df("aardvark") is False
+    assert df("aardvark", pathlib=True) is False
     df.result_filter.assert_called_once_with(result=False)
 
 
 @mock.patch("dials_data.datasets.repository_location")
 @mock.patch("dials_data.download.fetch_dataset")
 def test_datafetcher_constructs_py_path(fetcher, root):
-    root.return_value = py.path.local("/tmp/root")
+    root.return_value = pathlib.Path("/tmp/root")
     fetcher.return_value = True
 
     df = dials_data.download.DataFetcher(read_only=True)
     with pytest.warns(DeprecationWarning):
         ds = df("dataset")
-    assert ds == py.path.local("/tmp/root/dataset")
+    assert pathlib.Path(ds).resolve() == pathlib.Path("/tmp/root/dataset").resolve()
     assert isinstance(ds, py.path.local)
     fetcher.assert_called_once_with(
         "dataset", pre_scan=True, read_only=False, download_lockdir=mock.ANY
     )
 
     ds = df("dataset", pathlib=False)
-    assert ds == py.path.local("/tmp/root/dataset")
+    assert pathlib.Path(ds).resolve() == pathlib.Path("/tmp/root/dataset").resolve()
     assert isinstance(ds, py.path.local)
     fetcher.assert_called_once()
 
@@ -57,21 +57,22 @@ def test_datafetcher_constructs_py_path(fetcher, root):
 @mock.patch("dials_data.datasets.repository_location")
 @mock.patch("dials_data.download.fetch_dataset")
 def test_datafetcher_constructs_path(fetcher, root):
-    test_path = py.path.local("/tmp/root")
+    test_path = pathlib.Path("/tmp/root")
     root.return_value = test_path
     fetcher.return_value = True
 
     df = dials_data.download.DataFetcher(read_only=True)
     ds = df("dataset", pathlib=True)
-    assert ds == pathlib.Path(test_path) / "dataset"
+    assert ds == test_path / "dataset"
 
     assert isinstance(ds, pathlib.Path)
     fetcher.assert_called_once_with(
         "dataset", pre_scan=True, read_only=False, download_lockdir=mock.ANY
     )
 
-    ds = df("dataset")
-    assert ds == pathlib.Path(test_path) / "dataset"
+    with pytest.warns(DeprecationWarning):
+        ds = df("dataset")
+    assert pathlib.Path(ds).resolve() == test_path.joinpath("dataset").resolve()
     assert not isinstance(
         ds, pathlib.Path
     )  # default is currently to return py.path.local()
author	Picca Frédéric-Emmanuel <picca@debian.org>	2022-03-20 08:46:25 +0100
committer	Picca Frédéric-Emmanuel <picca@debian.org>	2022-03-20 08:46:25 +0100
commit	102a43171cebe81d073159c08e883a37e699a165 (patch)
tree	83851f33a57848d81cff02d5b885538a2c5dc902
parent	a0ff70176cc3d6642a806914e6dcb751c5f29828 (diff)