From cbf635795d2c40b1f5c1983248e327e872c13f6e Mon Sep 17 00:00:00 2001
From: luftmensch-luftmensch
Date: Wed, 15 May 2024 09:18:29 +0200
Subject: [PATCH] datatrove: init at 0.2.0

---
 pkgs/by-name/da/datatrove/package.nix | 74 +++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 pkgs/by-name/da/datatrove/package.nix

diff --git a/pkgs/by-name/da/datatrove/package.nix b/pkgs/by-name/da/datatrove/package.nix
new file mode 100644
index 000000000000..61ac911cc8e4
--- /dev/null
+++ b/pkgs/by-name/da/datatrove/package.nix
@@ -0,0 +1,74 @@
+# datatrove: pipeline processing blocks for data processing, packaged from the
+# tagged release of the huggingface/datatrove GitHub repository.
+{
+  lib,
+  fetchFromGitHub,
+  python3Packages,
+}:
+let
+  version = "0.2.0";
+in
+python3Packages.buildPythonPackage {
+  pname = "datatrove";
+  inherit version;
+  pyproject = true;
+
+  src = fetchFromGitHub {
+    owner = "huggingface";
+    repo = "datatrove";
+    rev = "refs/tags/v${version}";
+    hash = "sha256-2NJja2yWeHOgo1pCuwHN6SgYnsimuZdK0jE8ucTH4r8=";
+  };
+
+  build-system = with python3Packages; [ setuptools ];
+
+  # Everything listed here is propagated to consumers of the package.
+  # NOTE(review): boto3, fasteners, nltk, s3fs and xxhash may only be optional
+  # extras upstream -- confirm against upstream's pyproject.toml before
+  # trimming the runtime closure any further.
+  dependencies = with python3Packages; [
+    boto3
+    dill
+    fasteners
+    fsspec
+    huggingface-hub
+    humanize
+    loguru
+    multiprocess
+    nltk
+    numpy
+    rich
+    s3fs
+    tokenizers
+    xxhash
+  ];
+
+  # Only needed while the test suite runs, so these are not propagated.
+  nativeCheckInputs = with python3Packages; [
+    moto
+    pytestCheckHook
+  ];
+
+  # NOTE(review): the reason these test modules are skipped is not recorded;
+  # presumably they need network access or unpackaged extras -- confirm.
+  disabledTestPaths = [
+    "tests/executor/test_local.py"
+    "tests/pipeline/test_filters.py"
+    "tests/pipeline/test_bloom_filter.py"
+    "tests/pipeline/test_minhash.py"
+    "tests/pipeline/test_sentence_deduplication.py"
+    "tests/pipeline/test_tokenization.py"
+    "tests/pipeline/test_exact_substrings.py"
+  ];
+
+  pythonImportsCheck = [ "datatrove" ];
+
+  meta = {
+    description = "Set of platform-agnostic customizable pipeline processing blocks for data processing";
+    homepage = "https://github.com/huggingface/datatrove";
+    changelog = "https://github.com/huggingface/datatrove/releases/tag/v${version}";
+    license = lib.licenses.asl20;
+    maintainers = with lib.maintainers; [ luftmensch-luftmensch ];
+    platforms = lib.platforms.all;
+  };
+}