nixpkgs/pkgs/development/python-modules/dask/default.nix
Noah D. Brenowitz ceeaf2d066 python3Packages.dask: fix sandboxed builds
Importing dask.dataframe in a sandboxed build results in a TypeError like
this:

  File "/nix/store/nv60iri29bia4szhhcvsdxgsci4wxvp6-python3.8-dask-2021.03.0/lib/python3.8/site-packages/dask/dataframe/io/csv.py", line 392, in <module>
    AUTO_BLOCKSIZE = auto_blocksize(TOTAL_MEM, CPU_COUNT)
  File "/nix/store/nv60iri29bia4szhhcvsdxgsci4wxvp6-python3.8-dask-2021.03.0/lib/python3.8/site-packages/dask/dataframe/io/csv.py", line 382, in auto_blocksize
    blocksize = int(total_memory // cpu_count / memory_factor)
  TypeError: unsupported operand type(s) for //: 'int' and 'NoneType'

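This occurs because dask.dataframe has a non-deterministic component which
generates an automatic chunk-size at import time based on system information
(total memory and CPU count). In the sandbox the CPU count is reported as
None, so the division in auto_blocksize fails.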

This went unnoticed because the dask tests were disabled.

Changes:
- add a patch making the chunk-size inference more robust
- re-enable the tests

Resolves #120307
2021-04-23 17:57:01 -07:00

95 lines
2.1 KiB
Nix

# Package definition for dask, built from the upstream GitHub tag.
#
# Arguments are the usual nixpkgs python-modules inputs. `withExtraComplete`
# (default: false) additionally propagates `distributed`.
{ lib
, bokeh
, buildPythonPackage
, fetchpatch
, fetchFromGitHub
, fsspec
, pytestCheckHook
, pytest-rerunfailures
, pythonOlder
, cloudpickle
, numpy
, toolz
, dill
, pandas
, partd
, pytest-xdist
, withExtraComplete ? false
, distributed
}:

buildPythonPackage rec {
  pname = "dask";
  version = "2021.03.0";

  # dask dropped Python 3.5 in 2.0; this release declares python_requires >= 3.6,
  # so reject older interpreters at evaluation time instead of failing mid-build.
  disabled = pythonOlder "3.6";

  src = fetchFromGitHub {
    owner = "dask";
    repo = pname;
    rev = version;
    sha256 = "LACv7lWpQULQknNGX/9vH9ckLsypbqKDGnsNBgKT1eI=";
  };

  propagatedBuildInputs = [
    bokeh
    cloudpickle
    dill
    fsspec
    numpy
    pandas
    partd
    toolz
  ] ++ lib.optionals withExtraComplete [
    # only pulled in when the caller opts in via withExtraComplete
    distributed
  ];

  doCheck = true;

  checkInputs = [
    pytestCheckHook
    pytest-rerunfailures
    pytest-xdist
  ];

  # The test suite is driven by pytestCheckHook (see checkInputs), not by the
  # setuptools check phase.
  dontUseSetuptoolsCheck = true;

  patches = [
    # dask dataframe cannot be imported in sandboxed builds
    # See https://github.com/dask/dask/pull/7601
    (fetchpatch {
      url = "https://github.com/dask/dask/commit/9ce5b0d258cecb3ef38fd844135ad1f7ac3cea5f.patch";
      sha256 = "sha256-1EVRYwAdTSEEH9jp+UOnrijzezZN3iYR6q6ieYJM3kY=";
      name = "fix-dask-dataframe-imports-in-sandbox.patch";
    })
  ];

  postPatch = ''
    # versioneer hack to set version of github package
    echo "def get_versions(): return {'dirty': False, 'error': None, 'full-revisionid': None, 'version': '${version}'}" > dask/_version.py
    substituteInPlace setup.py \
      --replace "version=versioneer.get_version()," "version='${version}'," \
      --replace "cmdclass=versioneer.get_cmdclass()," ""
  '';

  # Run the tests in parallel (pytest-xdist), one worker per build core.
  pytestFlagsArray = [ "-n $NIX_BUILD_CORES" ];

  disabledTests = [
    # NOTE(review): the reason these two are disabled is not recorded here —
    # TODO confirm and document.
    "test_annotation_pack_unpack"
    "test_annotations_blockwise_unpack"
    # this test requires features of python3Packages.psutil that are
    # blocked in sandboxed-builds
    "test_auto_blocksize_csv"
  ];

  # Importing dask.dataframe is checked explicitly: it is the import that
  # used to fail in sandboxed builds (see patches above).
  pythonImportsCheck = [ "dask.dataframe" "dask" "dask.array" ];

  meta = with lib; {
    description = "Minimal task scheduling abstraction";
    homepage = "https://dask.org/";
    changelog = "https://docs.dask.org/en/latest/changelog.html";
    license = licenses.bsd3;
    maintainers = with maintainers; [ fridh ];
  };
}