Merge pull request #311867 from luftmensch-luftmensch/datatrove_0.2.0

datatrove: init at 0.2.0
This commit is contained in:
Sandro 2024-06-11 19:48:02 +02:00 committed by GitHub
commit 1d472db6a3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -0,0 +1,65 @@
# Package definition for datatrove: a pyproject-based Python package built
# from the upstream GitHub release tag.
{
  lib,
  fetchFromGitHub,
  python3Packages,
}:
let
  # Release version; reused for the source tag and the changelog URL below.
  version = "0.2.0";
in
python3Packages.buildPythonPackage {
  pname = "datatrove";
  inherit version;
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "datatrove";
    rev = "refs/tags/v${version}";
    hash = "sha256-2NJja2yWeHOgo1pCuwHN6SgYnsimuZdK0jE8ucTH4r8=";
  };

  # PEP 517 build backend.
  build-system = with python3Packages; [ setuptools ];

  # Runtime dependencies; these are propagated to consumers of the package.
  dependencies = with python3Packages; [
    dill
    fsspec
    huggingface-hub
    tokenizers
    humanize
    loguru
    multiprocess
    numpy
    rich
  ];

  # Only available during the check phase. These libraries were previously
  # listed under `dependencies` next to `propagatedBuildInputs`, which made
  # them part of the runtime closure even though they sit with the test
  # inputs (moto is an AWS mocking library).
  # NOTE(review): confirm against upstream's pyproject extras that none of
  # these are needed at runtime by default.
  nativeCheckInputs = with python3Packages; [
    pytestCheckHook
    boto3
    fasteners
    moto
    nltk
    s3fs
    xxhash
  ];

  # NOTE(review): the reason these test modules are skipped is not recorded
  # here; presumably they need network access or unpackaged optional
  # dependencies -- confirm before re-enabling.
  disabledTestPaths = [
    "tests/executor/test_local.py"
    "tests/pipeline/test_filters.py"
    "tests/pipeline/test_bloom_filter.py"
    "tests/pipeline/test_minhash.py"
    "tests/pipeline/test_sentence_deduplication.py"
    "tests/pipeline/test_tokenization.py"
    "tests/pipeline/test_exact_substrings.py"
  ];

  # Smoke test: the top-level module must be importable after installation.
  pythonImportsCheck = [ "datatrove" ];

  meta = {
    description = "Set of platform-agnostic customizable pipeline processing blocks for data processing";
    homepage = "https://github.com/huggingface/datatrove";
    changelog = "https://github.com/huggingface/datatrove/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ luftmensch-luftmensch ];
    platforms = lib.platforms.all;
  };
}