diff options
author | Picca Frédéric-Emmanuel <picca@debian.org> | 2022-03-20 08:46:25 +0100 |
---|---|---|
committer | Picca Frédéric-Emmanuel <picca@debian.org> | 2022-03-20 08:46:25 +0100 |
commit | 102a43171cebe81d073159c08e883a37e699a165 (patch) | |
tree | 83851f33a57848d81cff02d5b885538a2c5dc902 | |
parent | a0ff70176cc3d6642a806914e6dcb751c5f29828 (diff) |
New upstream version 2.4.0
29 files changed, 322 insertions, 208 deletions
diff --git a/.azure-pipelines/azure-pipelines.yml b/.azure-pipelines/azure-pipelines.yml index 93363f9..dc03351 100644 --- a/.azure-pipelines/azure-pipelines.yml +++ b/.azure-pipelines/azure-pipelines.yml @@ -26,10 +26,17 @@ stages: - bash: | set -eux - pip install --disable-pip-version-check flake8 + pip install --disable-pip-version-check flake8 flake8-comprehensions python .azure-pipelines/flake8-validation.py displayName: Flake8 validation + - bash: | + set -eux + # install versions matching the ones in the corresponding pre-commit hook + pip install --disable-pip-version-check mypy==0.931 types-PyYAML==6.0.4 types-requests==2.27.11 + mypy --no-strict-optional dials_data/ + displayName: Type checking + # Set up constants for further build steps - bash: | echo "##vso[task.setvariable variable=BUILD_REPOSITORY_NAME;isOutput=true]${BUILD_REPOSITORY_NAME}" diff --git a/.bumpversion.cfg b/.bumpversion.cfg index fea6d8d..b97c10c 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.3.0 +current_version = 2.4.0 commit = True tag = True diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2248526..c6476bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: # Automatically sort imports - repo: https://github.com/PyCQA/isort - rev: 5.9.3 + rev: 5.10.1 hooks: - id: isort args: [ @@ -30,22 +30,24 @@ repos: # Automatic source code formatting - repo: https://github.com/psf/black - rev: 21.6b0 + rev: 22.1.0 hooks: - id: black args: [--safe, --quiet] # Linting - repo: https://github.com/PyCQA/flake8 - rev: 3.9.2 + rev: 4.0.1 hooks: - id: flake8 - additional_dependencies: ['flake8-comprehensions==3.5.0'] + additional_dependencies: ['flake8-comprehensions==3.8.0'] # Type checking +# Remember to change versions in .azure-pipelines/azure-pipelines.yml to match +# the versions here. - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.910 + rev: v0.931 hooks: - id: mypy files: 'dials_data/.*\.py$' - additional_dependencies: ['types-PyYAML==5.4.9'] + additional_dependencies: ['types-PyYAML==6.0.4', 'types-requests==2.27.11'] diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index eb59d8c..95338a6 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -53,10 +53,10 @@ Get Started! Ready to contribute? Here's how to set up `dials_data` for local development. -1. Fork the `dials_data` repo on GitHub. +1. Fork the `dials/data` `repository on GitHub <https://github.com/dials/data>`__. 2. Clone your fork locally:: - $ git clone git@github.com:your_name_here/dials_data.git + $ git clone git@github.com:your_name_here/data.git 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: @@ -70,14 +70,11 @@ Ready to contribute? Here's how to set up `dials_data` for local development. Now you can make your changes locally. -5. When you're done making changes, check that your changes pass flake8 and the - tests, including testing other Python versions with tox:: +5. Before committing changes to the repository you should install pre-commit:: - $ flake8 dials_data tests - $ python setup.py test or py.test - $ tox + $ pre-commit install - To get flake8 and tox, just pip install them into your virtualenv. + If you do not have pre-commit set up, you can install it with pip. 6. Commit your changes and push your branch to GitHub:: @@ -92,15 +89,12 @@ Pull Request Guidelines Before you submit a pull request, check that it meets these guidelines: -1. The pull request should include tests, unless you are adding or updating - a dataset. +1. Unless you are only touching datasets the pull request should include tests. 2. If you add or update a dataset then make individual pull requests for each dataset, so that they can be discussed and approved separately. 3. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the feature to the list in HISTORY.rst. -4. The pull request should work for all supported Python versions. Check - https://travis-ci.com/dials/data/pull_requests Deploying diff --git a/HISTORY.rst b/HISTORY.rst index e10709e..1f7595d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,14 @@ History ======= +2.4 (2022-03-07) +^^^^^^^^^^^^^^^^ + +* dials_data no longer uses ``py.path`` internally. +* dials_data now includes type checking with mypy. +* We started using the ``requests`` library for faster downloads. +* Downloads now happen in parallel. + 2.3 (2022-01-11) ^^^^^^^^^^^^^^^^ diff --git a/dials_data/__init__.py b/dials_data/__init__.py index d94fe09..25a6614 100644 --- a/dials_data/__init__.py +++ b/dials_data/__init__.py @@ -8,6 +8,6 @@ from __future__ import annotations __all__: list = [] __author__ = """Markus Gerstel""" __email__ = "dials-support@lists.sourceforge.net" -__version__ = "2.3.0" +__version__ = "2.4.0" __commit__ = "" __version_tuple__ = tuple(int(x) for x in __version__.split(".")) diff --git a/dials_data/cli.py b/dials_data/cli.py index 970befd..d8c5c22 100644 --- a/dials_data/cli.py +++ b/dials_data/cli.py @@ -5,7 +5,6 @@ import sys import yaml -import dials_data import dials_data.datasets import dials_data.download @@ -72,7 +71,7 @@ def cli_get(cmd_args): repository = dials_data.datasets.repository_location() if not args.quiet: - print(f"Repository location: {repository.strpath}\n") + print(f"Repository location: {repository}\n") for ds in args.dataset: if not args.quiet: @@ -86,9 +85,9 @@ def cli_get(cmd_args): with open(f"{ds}.yml", "w") as fh: yaml.dump(hashinfo, fh, default_flow_style=False) if args.quiet: - print(repository.join(ds).strpath) + print(repository / ds) else: - print(f"Dataset {ds} stored in {repository.join(ds).strpath}") + print(f"Dataset {ds} stored in {repository.joinpath(ds)}") def cli_list(cmd_args): diff --git a/dials_data/datasets.py b/dials_data/datasets.py index ccd54fb..2fe193a 100644 --- a/dials_data/datasets.py +++ b/dials_data/datasets.py @@ -5,15 +5,19 @@ from __future__ import annotations import hashlib import os import textwrap +from pathlib import Path +from typing import Any import importlib_resources -import py import yaml _hashinfo_formatversion = 1 +definition: dict[str, Any] +fileinfo_dirty: set[str] -def _load_yml_definitions(): + +def _load_yml_definitions() -> None: """ Read dataset .yml files from definitions/ and hashinfo/ directories. This is done once during the module import stage. @@ -48,7 +52,7 @@ def _load_yml_definitions(): _load_yml_definitions() -def create_integrity_record(dataset_name): +def create_integrity_record(dataset_name) -> dict[str, Any]: """ Generate a dictionary for the integrity information of a specific dataset. """ @@ -58,7 +62,7 @@ def create_integrity_record(dataset_name): } -def repository_location(): +def repository_location() -> Path: """ Returns an appropriate location where the downloaded regression data should be stored. @@ -75,53 +79,51 @@ def repository_location(): """ if os.getenv("DIALS_DATA"): try: - repository = py.path.local(os.getenv("DIALS_DATA")) - repository.ensure(dir=1) + repository = Path(os.environ["DIALS_DATA"]) + repository.mkdir(parents=True, exist_ok=True) return repository - except Exception: + except (KeyError, TypeError, OSError): pass try: - repository = py.path.local("/dls/science/groups/scisoft/DIALS/dials_data") - if repository.check(dir=1): + repository = Path("/dls/science/groups/scisoft/DIALS/dials_data") + if repository.is_dir(): return repository - except Exception: + except OSError: pass if os.getenv("LIBTBX_BUILD"): try: - repository = py.path.local(os.getenv("LIBTBX_BUILD")).join("dials_data") - repository.ensure(dir=1) + repository = Path(os.environ["LIBTBX_BUILD"]) / "dials_data" + repository.mkdir(parents=True, exist_ok=True) return repository - except Exception: + except (KeyError, TypeError, OSError): pass - repository = ( - py.path.local(os.path.expanduser("~")).join(".cache").join("dials_data") - ) try: - repository.ensure(dir=1) + repository = Path.home() / ".cache" / "dials_data" + repository.mkdir(parents=True, exist_ok=True) return repository - except Exception: + except (TypeError, OSError): raise RuntimeError( "Could not determine regression data location. Use environment variable DIALS_DATA" ) -def get_resident_size(ds): +def get_resident_size(ds) -> int: if ds in fileinfo_dirty: return 0 return sum(item["size"] for item in definition[ds]["hashinfo"]["verify"]) -def _human_readable(num, suffix="B"): +def _human_readable(num: float, suffix: str = "B") -> str: for unit in ("", "k", "M", "G"): if num < 10: return f"{num:.1f}{unit}{suffix}" if num < 1024: return f"{num:.0f}{unit}{suffix}" num /= 1024.0 - return "{:.0f}{}{}".format(num, "T", suffix) + return f"{num:.0f}T{suffix}" -def list_known_definitions(ds_list, quiet=False): +def list_known_definitions(ds_list, quiet=False) -> None: indent = " " * 4 for shortname in sorted(ds_list): if quiet: diff --git a/dials_data/definitions/aluminium_standard.yml b/dials_data/definitions/aluminium_standard.yml new file mode 100644 index 0000000..46619d1 --- /dev/null +++ b/dials_data/definitions/aluminium_standard.yml @@ -0,0 +1,27 @@ +name: Aluminium standard +author: Elena Pascal, Yun Song (2021) +license: CC-BY 4.0 +description: > + Powder diffraction data collected by Yun Song at DLS on eBIC's Talos Arctica (m12). + The sample is a polycrystalline aluminium standard. + + imported.expt was generated with + $ dials.import $(dials.data get -q aluminium_standard)/0p67_5s_0000.mrc + + eyeballed.expt was generated with dials.powder_calibrate_widget + + calibrated.expt was generated with + starting_file = "eyeballed.expt" + test_args = [starting_file, "standard=Al", "eyeball=False"] + expt_parameters, user_arguments = parse_args(args=test_args) + calibrator = PowderCalibrator(expt_params=expt_parameters, user_args=user_arguments) + calibrator.calibrate_with_calibrant(verbose=True) + + Used for powder geometry calibration testing. + +data: + + - url: https://raw.githubusercontent.com/dials/data-files/35addc5660df90b3b9579bc932e95a9aa01389fe/aluminium_standard/0p67_5s_0000.mrc + - url: https://raw.githubusercontent.com/dials/data-files/35addc5660df90b3b9579bc932e95a9aa01389fe/aluminium_standard/imported.expt + - url: https://raw.githubusercontent.com/dials/data-files/35addc5660df90b3b9579bc932e95a9aa01389fe/aluminium_standard/eyeballed.expt + - url: https://raw.githubusercontent.com/dials/data-files/35addc5660df90b3b9579bc932e95a9aa01389fe/aluminium_standard/calibrated.expt diff --git a/dials_data/definitions/cunir_serial_processed.yml b/dials_data/definitions/cunir_serial_processed.yml index 9290066..d4a76c0 100644 --- a/dials_data/definitions/cunir_serial_processed.yml +++ b/dials_data/definitions/cunir_serial_processed.yml @@ -6,11 +6,13 @@ description: > Light Source, courtesy of Robin Owen and Mike Hough. 5 images were processed with DIALS v3.7.3, with and without a reference geometry, for testing indexing. - Images are located at Diamond Light Source at - /dls/mx-scratch/dials/i24-ssx/CuNIR/merlin/ + Images can be found in the 'cunir_serial' data set; file references are + laid out so that the images can be accessed if that data set is present. data: - - url: https://github.com/dials/data-files/raw/da38a5de509cb7cb22cc28f439058c66923333d1/ssx_CuNiR_test_data/imported_no_ref_5.expt - - url: https://github.com/dials/data-files/raw/da38a5de509cb7cb22cc28f439058c66923333d1/ssx_CuNiR_test_data/imported_with_ref_5.expt + - url: https://github.com/dials/data-files/raw/13e97f957175c3e31d34e567b7369b748cfa1d55/ssx_CuNiR_test_data/imported_no_ref_5.expt + - url: https://github.com/dials/data-files/raw/13e97f957175c3e31d34e567b7369b748cfa1d55/ssx_CuNiR_test_data/imported_with_ref_5.expt - url: https://github.com/dials/data-files/raw/da38a5de509cb7cb22cc28f439058c66923333d1/ssx_CuNiR_test_data/strong_1.refl - url: https://github.com/dials/data-files/raw/da38a5de509cb7cb22cc28f439058c66923333d1/ssx_CuNiR_test_data/strong_5.refl + - url: https://github.com/dials/data-files/raw/70d2cbb1bb5d0678b133039a7e009a623762f0ff/ssx_CuNiR_test_data/indexed.expt + - url: https://github.com/dials/data-files/raw/70d2cbb1bb5d0678b133039a7e009a623762f0ff/ssx_CuNiR_test_data/indexed.refl diff --git a/dials_data/definitions/four_circle_eiger.yml b/dials_data/definitions/four_circle_eiger.yml new file mode 100644 index 0000000..47ac681 --- /dev/null +++ b/dials_data/definitions/four_circle_eiger.yml @@ -0,0 +1,48 @@ +name: Single-crystal small-molecule X-ray diffractometry data in NXmx format, using a four-circle goniometer +author: Rebecca Scatena, Mark Warren, Ben Williams (2022) +license: CC-BY 4.0 +url: https://doi.org/10.5281/zenodo.6093230 +description: > + Example NXmx data of X-ray diffraction from a sample of + a small-molecule material, [Cu(HF₂)(pyrazine)₂]PF₆, on + a four-circle diffractometer. Data recorded on I19-2 + (beamline I19, experiments hutch 2) at Diamond Light + Source using a Dectris Eiger2 X 4M CdTe detector. + + Inventory of data: + + 01_CuHF2pyz2PF6b_Phi.tar.xz + A single 1750-image 350° φ rotation scan with detector + position 2θ = 0°. + + 02_CuHF2pyz2PF6b_2T.tar.xz + A single 1750-image 350° φ rotation scan with detector + position 2θ = 20°. + + 03_CuHF2pyz2PF6b_P_O.tar.xz + Two sequential rotation scans: + CuHF2pyz2PF6b_P_O_01.nxs — a 1750-image 350° φ scan. + CuHF2pyz2PF6b_P_O_02.nxs — a 600-image 120° ω scan. + +data: + - url: https://zenodo.org/record/6093231/files/01_CuHF2pyz2PF6b_Phi.tar.xz + files: + - 01_CuHF2pyz2PF6b_Phi/CuHF2pyz2PF6b_Phi_01_000001.h5 + - 01_CuHF2pyz2PF6b_Phi/CuHF2pyz2PF6b_Phi_01_000002.h5 + - 01_CuHF2pyz2PF6b_Phi/CuHF2pyz2PF6b_Phi_01_meta.h5 + - 01_CuHF2pyz2PF6b_Phi/CuHF2pyz2PF6b_Phi_01.nxs + - url: https://zenodo.org/record/6093231/files/02_CuHF2pyz2PF6b_2T.tar.xz + files: + - 02_CuHF2pyz2PF6b_2T/CuHF2pyz2PF6b_2T_01_000001.h5 + - 02_CuHF2pyz2PF6b_2T/CuHF2pyz2PF6b_2T_01_000002.h5 + - 02_CuHF2pyz2PF6b_2T/CuHF2pyz2PF6b_2T_01_meta.h5 + - 02_CuHF2pyz2PF6b_2T/CuHF2pyz2PF6b_2T_01.nxs + - url: https://zenodo.org/record/6093231/files/03_CuHF2pyz2PF6b_P_O.tar.xz + files: + - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_01_000001.h5 + - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_01_000002.h5 + - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_01_meta.h5 + - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_01.nxs + - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_02_000001.h5 + - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_02_meta.h5 + - 03_CuHF2pyz2PF6b_P_O/CuHF2pyz2PF6b_P_O_02.nxs diff --git a/dials_data/download.py b/dials_data/download.py index 26d2932..70d9ec1 100644 --- a/dials_data/download.py +++ b/dials_data/download.py @@ -1,14 +1,20 @@ from __future__ import annotations +import concurrent.futures import contextlib import errno +import functools +import hashlib import os import tarfile import warnings import zipfile from pathlib import Path +from typing import Any, Optional, Union from urllib.parse import urlparse -from urllib.request import urlopen + +import py.path +import requests import dials_data.datasets @@ -21,7 +27,6 @@ if os.name == "posix": def _platform_unlock(file_handle): fcntl.lockf(file_handle, fcntl.LOCK_UN) - elif os.name == "nt": import msvcrt @@ -41,7 +46,6 @@ elif os.name == "nt": file_handle.seek(0) msvcrt.locking(file_handle.fileno(), msvcrt.LK_UNLCK, 1) - else: def _platform_lock(file_handle): @@ -71,59 +75,51 @@ def _file_lock(file_handle): @contextlib.contextmanager -def download_lock(target_dir): +def download_lock(target_dir: Optional[Path]): """ Obtains a (cooperative) lock on a lockfile in a target directory, so only a single (cooperative) process can enter this context manager at any one time. If the lock is held this will block until the existing lock is released. """ - with target_dir.join(".lock").open(mode="w", ensure=True) as fh: + if not target_dir: + yield + return + target_dir.mkdir(parents=True, exist_ok=True) + with target_dir.joinpath(".lock").open(mode="w") as fh: with _file_lock(fh): yield -def _download_to_file(url, pyfile): +def _download_to_file(session: requests.Session, url: str, pyfile: Path): """ Downloads a single URL to a file. """ - with contextlib.closing(urlopen(url)) as socket: - file_size = socket.info().get("Content-Length") - if file_size: - file_size = int(file_size) - # There is no guarantee that the content-length header is set - received = 0 - block_size = 8192 - # Allow for writing the file immediately so we can empty the buffer - with pyfile.open(mode="wb", ensure=True) as f: - while True: - block = socket.read(block_size) - received += len(block) - f.write(block) - if not block: - break - - if file_size and file_size != received: - raise OSError( - "Error downloading {url}: received {received} bytes instead of expected {file_size} bytes".format( - file_size=file_size, received=received, url=url - ) - ) + with session.get(url, stream=True) as r: + r.raise_for_status() + pyfile.parent.mkdir(parents=True, exist_ok=True) + with pyfile.open(mode="wb") as f: + for chunk in r.iter_content(chunk_size=40960): + f.write(chunk) -def file_hash(path): +def file_hash(file_to_hash: Path) -> str: """Returns the SHA256 digest of a file.""" - return path.computehash(hashtype="sha256") + sha256_hash = hashlib.sha256() + with file_to_hash.open("rb") as f: + for block in iter(lambda: f.read(4096), b""): + sha256_hash.update(block) + return sha256_hash.hexdigest() def fetch_dataset( dataset, - ignore_hashinfo=False, - verify=False, - read_only=False, - verbose=False, - pre_scan=True, - download_lockdir=None, -): + ignore_hashinfo: bool = False, + verify: bool = False, + read_only: bool = False, + verbose: bool = False, + pre_scan: bool = True, + download_lockdir: Optional[Path] = None, +) -> Union[bool, Any]: """Check for the presence or integrity of the local copy of the specified test dataset. If the dataset is not available or out of date then attempt to download/update it transparently. @@ -144,8 +140,8 @@ def fetch_dataset( return False definition = dials_data.datasets.definition[dataset] - target_dir = dials_data.datasets.repository_location().join(dataset) - if read_only and not target_dir.check(dir=1): + target_dir: Path = dials_data.datasets.repository_location() / dataset + if read_only and not target_dir.is_dir(): return False integrity_info = definition.get("hashinfo") @@ -154,10 +150,10 @@ def fetch_dataset( if "verify" not in integrity_info: integrity_info["verify"] = [{} for _ in definition["data"]] - filelist = [ + filelist: list[dict[str, Any]] = [ { "url": source["url"], - "file": target_dir.join(os.path.basename(urlparse(source["url"]).path)), + "file": target_dir / os.path.basename(urlparse(source["url"]).path), "files": source.get("files"), "verify": hashinfo, } @@ -166,90 +162,86 @@ def fetch_dataset( if pre_scan or read_only: if all( - item["file"].check() + item["file"].is_file() and item["verify"].get("size") - and item["verify"]["size"] == item["file"].size() + and item["verify"]["size"] == item["file"].stat().st_size for item in filelist ): return True if read_only: return False - if download_lockdir: - # Acquire lock if required as files may be downloaded/written. - with download_lock(download_lockdir): - _fetch_filelist(filelist, file_hash) - else: - _fetch_filelist(filelist, file_hash) + # Acquire lock if required as files may be downloaded/written. + with download_lock(download_lockdir): + _fetch_filelist(filelist) return integrity_info -def _fetch_filelist(filelist, file_hash): - for source in filelist: # parallelize this - if source.get("type", "file") == "file": - valid = False - if source["file"].check(file=1): - # verify - valid = True - if source["verify"]: - if source["verify"]["size"] != source["file"].size(): - valid = False - print("size") - elif source["verify"]["hash"] != file_hash(source["file"]): - valid = False +def _fetch_filelist(filelist: list[dict[str, Any]]) -> None: + with requests.Session() as rs: + pool = concurrent.futures.ThreadPoolExecutor(max_workers=5) + pool.map(functools.partial(_fetch_file, rs), filelist) + + +def _fetch_file(session: requests.Session, source: dict[str, Any]) -> None: + valid = False + if source["file"].is_file(): + # verify + valid = True + if source["verify"]: + if source["verify"]["size"] != source["file"].stat().st_size: + valid = False + elif source["verify"]["hash"] != file_hash(source["file"]): + valid = False + + downloaded = False + if not valid: + print(f"Downloading {source['url']}") + _download_to_file(session, source["url"], source["file"]) + downloaded = True + + # verify + valid = True + if source["verify"]: + if source["verify"]["size"] != source["file"].stat().st_size: + print( + f"File size mismatch on {source['file']}: " + f"{source['file'].stat().st_size}, expected {source['verify']['size']}" + ) + elif source["verify"]["hash"] != file_hash(source["file"]): + print(f"File hash mismatch on {source['file']}") + else: + source["verify"]["size"] = source["file"].stat().st_size + source["verify"]["hash"] = file_hash(source["file"]) + + # If the file is a tar archive, then decompress + if source["files"]: + target_dir = source["file"].parent + if downloaded or not all((target_dir / f).is_file() for f in source["files"]): + # If the file has been (re)downloaded, or we don't have all the requested + # files from the archive, then we need to decompress the archive + print(f"Decompressing {source['file']}") + if source["file"].suffix == ".zip": + with zipfile.ZipFile(source["file"]) as zf: + try: + for f in source["files"]: + zf.extract(f, path=source["file"].parent) + except KeyError: print( - "hash", source["verify"]["hash"], file_hash(source["file"]) + f"Expected file {f} not present " + f"in zip archive {source['file']}" ) - - downloaded = False - if not valid: - print("Downloading {}".format(source["url"])) - _download_to_file(source["url"], source["file"]) - downloaded = True - - # verify - valid = True - fileinfo = { - "size": source["file"].size(), - "hash": file_hash(source["file"]), - } - if source["verify"]: - if source["verify"]["size"] != fileinfo["size"]: - valid = False - elif source["verify"]["hash"] != fileinfo["hash"]: - valid = False else: - source["verify"]["size"] = fileinfo["size"] - source["verify"]["hash"] = fileinfo["hash"] - - # If the file is a tar archive, then decompress - if source["files"]: - target_dir = source["file"].dirpath() - if downloaded or not all( - (target_dir / f).check(file=1) for f in source["files"] - ): - # If the file has been (re)downloaded, or we don't have all the requested - # files from the archive, then we need to decompress the archive - print("Decompressing {file}".format(file=source["file"])) - if source["file"].ext == ".zip": - with zipfile.ZipFile(source["file"].strpath) as zf: + with tarfile.open(source["file"]) as tar: + for f in source["files"]: try: - for f in source["files"]: - zf.extract(f, path=source["file"].dirname) + tar.extract(f, path=source["file"].parent) except KeyError: print( - f"Expected file {f} not present in zip archive {source['file']}" + f"Expected file {f} not present " + f"in tar archive {source['file']}" ) - else: - with tarfile.open(source["file"].strpath) as tar: - for f in source["files"]: - try: - tar.extract(f, path=source["file"].dirname) - except KeyError: - print( - f"Expected file {f} not present in tar archive {source['file']}" - ) class DataFetcher: @@ -259,7 +251,7 @@ class DataFetcher: df = DataFetcher() Then df('insulin') - returns a py.path object to the insulin data. If that data is not already + returns a Path object to the insulin data. If that data is not already on disk it is downloaded automatically. To disable all downloads: @@ -269,14 +261,14 @@ class DataFetcher: """ def __init__(self, read_only=False): - self._cache = {} - self._target_dir = dials_data.datasets.repository_location() - self._read_only = read_only and os.access(self._target_dir.strpath, os.W_OK) + self._cache: dict[str, Optional[Path]] = {} + self._target_dir: Path = dials_data.datasets.repository_location() + self._read_only: bool = read_only and os.access(self._target_dir, os.W_OK) - def __repr__(self): + def __repr__(self) -> str: return "<{}DataFetcher: {}>".format( "R/O " if self._read_only else "", - self._target_dir.strpath, + self._target_dir, ) def result_filter(self, result, **kwargs): @@ -288,7 +280,7 @@ class DataFetcher: """ return result - def __call__(self, test_data, pathlib=None, **kwargs): + def __call__(self, test_data: str, pathlib=None, **kwargs): """ Return the location of a dataset, transparently downloading it if necessary and possible. @@ -304,7 +296,7 @@ class DataFetcher: if the dataset is not available. """ if test_data not in self._cache: - self._cache[test_data] = self._attempt_fetch(test_data, **kwargs) + self._cache[test_data] = self._attempt_fetch(test_data) if pathlib is None: warnings.warn( "The DataFetcher currently returns py.path.local() objects. " @@ -314,15 +306,13 @@ class DataFetcher: DeprecationWarning, stacklevel=2, ) - if pathlib and self._cache[test_data]["result"]: - result = { - **self._cache[test_data], - "result": Path(self._cache[test_data]["result"]), - } - return self.result_filter(**result) - return self.result_filter(**self._cache[test_data]) - - def _attempt_fetch(self, test_data): + if not self._cache[test_data]: + return self.result_filter(result=False) + elif not pathlib: + return self.result_filter(result=py.path.local(self._cache[test_data])) + return self.result_filter(result=self._cache[test_data]) + + def _attempt_fetch(self, test_data: str) -> Optional[Path]: if self._read_only: data_available = fetch_dataset(test_data, pre_scan=True, read_only=True) else: @@ -333,6 +323,6 @@ class DataFetcher: download_lockdir=self._target_dir, ) if data_available: - return {"result": self._target_dir.join(test_data)} + return self._target_dir / test_data else: - return {"result": False} + return None diff --git a/dials_data/hashinfo/aluminium_standard.yml b/dials_data/hashinfo/aluminium_standard.yml new file mode 100644 index 0000000..bf6acaf --- /dev/null +++ b/dials_data/hashinfo/aluminium_standard.yml @@ -0,0 +1,11 @@ +definition: 4eaca4076872da390c5db4ca7e545159674de804ca8639cb38a99db05651e80b +formatversion: 1 +verify: +- hash: 1438d6a48e3d6c9086d0d5d24ab677b88564a7208276a146e543ff0309fd29b3 + size: 16779008 +- hash: 062d4e03c306a47898068d2e94c98201bdd2ac5be0e4b1862558571f81a49405 + size: 3534 +- hash: f7c20d958c5493929b4aac017e3d153f79ecfa18c750a469eeabf875c0a00ae8 + size: 3568 +- hash: 8517435c4a14d53d0b0f25f05a1d05b70cf52baf05b14c2b5a3b12519915ba79 + size: 3568 diff --git a/dials_data/hashinfo/cunir_serial_processed.yml b/dials_data/hashinfo/cunir_serial_processed.yml index 3840e27..65bcd64 100644 --- a/dials_data/hashinfo/cunir_serial_processed.yml +++ b/dials_data/hashinfo/cunir_serial_processed.yml @@ -1,11 +1,15 @@ -definition: 9140f0bd0ad5c90e7a3f36941ae88399dab2bec0a10746504836d7ceb683515c +definition: 16b3f8545250a8fb50634d426f23c4d914d1896fd7fab31420ae10854c22e30f formatversion: 1 verify: -- hash: 16a8e3c363bed6d22d6877738203039883d83cfecd9d142672a434815ae41461 - size: 6951 -- hash: e50bb7a02c11ebcb69d6788a6780b67ac25a4fbdc8d12f201926d2830baa642f - size: 7060 +- hash: dca8ecd27f43c0140fbbfc946e7fff5931a829a3ab9358f351d2908fce536806 + size: 6967 +- hash: b43b2058c93d0bf9a958fbbf1905252f8f8e2b54926875aa184a28a0d880c3ec + size: 7076 - hash: d8a33e241a2c8f96edcbfa7b7634ca7fee99b6a92b19b2a7d54affffe7e98211 size: 32769 - hash: 17243f85082e370f147a40236be3c05d35bf29071bde9ff94534115b8c22602c size: 152073 +- hash: cbd49cc528de4880d01ddfc99efdb4260a649162cdd7108b75934f6a9a60c06d + size: 22264 +- hash: 8336e3ebf9d472225c1aa4fa80afb380f3521265f15746a1742b65f5c1ead0c7 + size: 148385 diff --git a/dials_data/hashinfo/four_circle_eiger.yml b/dials_data/hashinfo/four_circle_eiger.yml new file mode 100644 index 0000000..5e8c2c8 --- /dev/null +++ b/dials_data/hashinfo/four_circle_eiger.yml @@ -0,0 +1,9 @@ +definition: 26dad0598c500ceb4a535234ce0515dc9562b861dd3573fd671785713ecdf44d +formatversion: 1 +verify: +- hash: 13a4088f838f20b4959e5607d799bb6f418d35f893f2f8b4effc01ecf0cf1c9c + size: 382763416 +- hash: 496f96c265a091eb151d8fc25e352e297be4257fbacda035403036094a8f0b4c + size: 376888156 +- hash: d4eda31c0a9285e4f064298ea7001a6d5531ddd00ab2b8cee644bb9520738cb5 + size: 509819364 diff --git a/dials_data/hashinfo/isis_sxd_example_data.yml b/dials_data/hashinfo/isis_sxd_example_data.yml index a8c213e..7c7ede6 100644 --- a/dials_data/hashinfo/isis_sxd_example_data.yml +++ b/dials_data/hashinfo/isis_sxd_example_data.yml @@ -1,4 +1,4 @@ -definition: e96c799b2b252663cb8c54c005eccb3911be7d3ec9469ba9993bf172c5872468 +definition: 63d2bcc84235e3d5773bbed0b16d8c597223f0aae74b84fc6482e66bd2b4fb41 formatversion: 1 verify: - hash: b11851993e5b33a047fc5a03e5f320836fd8d1f4bb2a64ca442ea73aebf1a074 diff --git a/dials_data/hashinfo/lcls_rayonix_kapton.yml b/dials_data/hashinfo/lcls_rayonix_kapton.yml index 83f0736..1c810ec 100644 --- a/dials_data/hashinfo/lcls_rayonix_kapton.yml +++ b/dials_data/hashinfo/lcls_rayonix_kapton.yml @@ -1,4 +1,4 @@ -definition: cceba9f63167c6ff5f568479dc42b26463d2a770a2cf1430263cebc8710e02b4 +definition: 29db1408c99173ed6fa64ae5ca5e5a091a00a88836e0d506a8dbea4c5d5f092b formatversion: 1 verify: - hash: 6fca790fe0845b7475d2db2c2f88c6c307551179ca51ddda00fa849ae235ceed diff --git a/dials_data/hashinfo/mpro_x0692.yml b/dials_data/hashinfo/mpro_x0692.yml index 550842f..8c35d39 100644 --- a/dials_data/hashinfo/mpro_x0692.yml +++ b/dials_data/hashinfo/mpro_x0692.yml @@ -1,4 +1,4 @@ -definition: 8139f395a31405c7a5fa4003c1dabf18ca4258a9e8fff12a97127d81af41f30a +definition: 51b7efa2411239a6499b499af66a1f7a7e3ba5a8eef98fee7607677a4cdc3eee formatversion: 1 verify: - hash: cd47b6f7ec61d7acf74fb4522ff9636ed5786c03d737d96b98e4a0df7e276af0 diff --git a/dials_data/py.typed b/dials_data/py.typed new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/dials_data/py.typed diff --git a/dials_data/pytest11.py b/dials_data/pytest11.py index d344289..8287a73 100644 --- a/dials_data/pytest11.py +++ b/dials_data/pytest11.py @@ -20,7 +20,7 @@ def pytest_addoption(parser): @pytest.fixture(scope="session") -def dials_data(request): +def dials_data(request) -> DataFetcher: """ Return the location of a regression dataset as py.path object. Download the files if they are not on disk already. diff --git a/docs/conf.py b/docs/conf.py index f95206c..749d655 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,7 +56,7 @@ author = "Markus Gerstel" # the built documents. # # The short X.Y version. -version = "2.3.0" +version = "2.4.0" # The full version, including alpha/beta/rc tags. release = version diff --git a/docs/installation.rst b/docs/installation.rst index 50d1223..5f2d412 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -26,11 +26,11 @@ Both are valid opinions. However you do not need to install ``dials_data`` from source. You can simply run:: - pip install -U dials_data + pip install -U dials-data or, in a conda environment:: - conda install -c conda-forge dials_data + conda install -c conda-forge dials-data This will install or update an existing installation of ``dials_data``. diff --git a/docs/why.rst b/docs/why.rst index 4b0622f..6accf9d 100644 --- a/docs/why.rst +++ b/docs/why.rst @@ -33,7 +33,7 @@ more files are added. This quickly becomes impractical in distributed testing environments. The disk space required for checkouts can be reduced by compressing the data, but then they need to be unpacked for using the data in tests. By its nature the internal SVN repository was -not publically accessible. The data files were too large to convert the +not publicly accessible. The data files were too large to convert the repository to a git repository to be hosted on Github, and in any case a git repository was not the best place either to store large amounts of data, as old versions of the data or retired datasets are kept @@ -48,7 +48,7 @@ first. With dxtbx_, dials_ and xia2_ moving to pytest_ we extended the xia2_regression_ concept into the regression_data_ fixture to provide a simple way to access the datasets in tests, but the data still -needed downloading separately and coult not easily be used outside +needed downloading separately and could not easily be used outside of the dials_ repository and not at all outside of a dials_ distribution. Adding data files was still a very involved process. diff --git a/pyproject.toml b/pyproject.toml index 121a39f..d1c90b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,7 @@ [build-system] requires = ["setuptools >= 40.6.0", "wheel"] build-backend = "setuptools.build_meta" + +[[tool.mypy.overrides]] +module = [ "py", "py.path", "pytest", "importlib_resources" ] +ignore_missing_imports = true diff --git a/requirements.txt b/requirements.txt index 5c2deea..2b32889 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ importlib_resources==5.4.0 py==1.11.0 -pytest==6.2.5 +pytest==7.0.1 pyyaml==6.0 +requests==2.27.1 diff --git a/requirements_dev.txt b/requirements_dev.txt index 34a0b2c..c4b6d93 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,7 +1,8 @@ collective.checkdocs==0.2 -coverage==6.2 +coverage==6.3.2 importlib_resources==5.4.0 py==1.11.0 -pytest==6.2.5 +pytest==7.0.1 pyyaml==6.0 -wheel==0.37.0 +requests==2.27.1 +wheel==0.37.1 diff --git a/requirements_doc.txt b/requirements_doc.txt index f097e9e..9a901e1 100644 --- a/requirements_doc.txt +++ b/requirements_doc.txt @@ -1,5 +1,5 @@ py==1.11.0 -pytest==6.2.5 +pytest==7.0.1 pyyaml==6.0 -Sphinx==4.3.1 +Sphinx==4.4.0 sphinx_rtd_theme==1.0.0 @@ -1,6 +1,6 @@ [metadata] name = dials_data -version = 2.3.0 +version = 2.4.0 url = https://github.com/dials/data project_urls = Bug Tracker = https://github.com/dials/data/issues @@ -31,6 +31,7 @@ install_requires = importlib_resources>=1.1 pytest pyyaml + requests # importlib; python_version == "2.6" packages = find: python_requires = >=3.7 @@ -46,6 +47,9 @@ libtbx.precommit = pytest11 = dials_data = dials_data.pytest11 +[options.package_data] +dials_data = py.typed + [flake8] # Black disagrees with flake8 on a few points. Ignore those. ignore = E203, E266, E501, W503 diff --git a/tests/test_dials_data.py b/tests/test_dials_data.py index 8905285..c0d5ffb 100644 --- a/tests/test_dials_data.py +++ b/tests/test_dials_data.py @@ -17,39 +17,39 @@ def test_all_datasets_can_be_parsed(): def test_repository_location(): rl = dials_data.datasets.repository_location() - assert rl.check(dir=1) + assert rl.is_dir() def test_fetching_undefined_datasets_does_not_crash(): df = dials_data.download.DataFetcher(read_only=True) - assert df("aardvark") is False + assert df("aardvark", pathlib=True) is False def test_requests_for_future_datasets_can_be_intercepted(): df = dials_data.download.DataFetcher(read_only=True) df.result_filter = mock.Mock() df.result_filter.return_value = False - assert df("aardvark") is False + assert df("aardvark", pathlib=True) is False df.result_filter.assert_called_once_with(result=False) @mock.patch("dials_data.datasets.repository_location") @mock.patch("dials_data.download.fetch_dataset") def test_datafetcher_constructs_py_path(fetcher, root): - root.return_value = py.path.local("/tmp/root") + root.return_value = pathlib.Path("/tmp/root") fetcher.return_value = True df = dials_data.download.DataFetcher(read_only=True) with pytest.warns(DeprecationWarning): ds = df("dataset") - assert ds == py.path.local("/tmp/root/dataset") + assert pathlib.Path(ds).resolve() == pathlib.Path("/tmp/root/dataset").resolve() assert isinstance(ds, py.path.local) fetcher.assert_called_once_with( "dataset", pre_scan=True, read_only=False, download_lockdir=mock.ANY ) ds = df("dataset", pathlib=False) - assert ds == py.path.local("/tmp/root/dataset") + assert pathlib.Path(ds).resolve() == pathlib.Path("/tmp/root/dataset").resolve() assert isinstance(ds, py.path.local) fetcher.assert_called_once() @@ -57,21 +57,22 @@ def test_datafetcher_constructs_py_path(fetcher, root): @mock.patch("dials_data.datasets.repository_location") @mock.patch("dials_data.download.fetch_dataset") def test_datafetcher_constructs_path(fetcher, root): - test_path = py.path.local("/tmp/root") + test_path = pathlib.Path("/tmp/root") root.return_value = test_path fetcher.return_value = True df = dials_data.download.DataFetcher(read_only=True) ds = df("dataset", pathlib=True) - assert ds == pathlib.Path(test_path) / "dataset" + assert ds == test_path / "dataset" assert isinstance(ds, pathlib.Path) fetcher.assert_called_once_with( "dataset", pre_scan=True, read_only=False, download_lockdir=mock.ANY ) - ds = df("dataset") - assert ds == pathlib.Path(test_path) / "dataset" + with pytest.warns(DeprecationWarning): + ds = df("dataset") + assert pathlib.Path(ds).resolve() == test_path.joinpath("dataset").resolve() assert not isinstance( ds, pathlib.Path ) # default is currently to return py.path.local() |