144 lines
3.4 KiB
Nix
144 lines
3.4 KiB
Nix
{ lib
|
|
, stdenv
|
|
, botocore
|
|
, buildPythonPackage
|
|
, cryptography
|
|
, cssselect
|
|
, fetchFromGitHub
|
|
, fetchpatch
|
|
, glibcLocales
|
|
, installShellFiles
|
|
, itemadapter
|
|
, itemloaders
|
|
, jmespath
|
|
, lxml
|
|
, parsel
|
|
, pillow
|
|
, protego
|
|
, pydispatcher
|
|
, pyopenssl
|
|
, pytest-twisted
|
|
, pytestCheckHook
|
|
, pythonOlder
|
|
, queuelib
|
|
, service-identity
|
|
, sybil
|
|
, testfixtures
|
|
, twisted
|
|
, w3lib
|
|
, zope_interface
|
|
}:
|
|
|
|
buildPythonPackage rec {
|
|
pname = "scrapy";
|
|
version = "2.5.0";
|
|
disabled = pythonOlder "3.6";
|
|
|
|
src = fetchFromGitHub {
|
|
owner = pname;
|
|
repo = pname;
|
|
rev = version;
|
|
sha256 = "09lxnjz1cw37i9bgk8sci2xxknj20gi2lq8l7i0b3xw7q8bxzp7h";
|
|
};
|
|
|
|
nativeBuildInputs = [
|
|
installShellFiles
|
|
];
|
|
|
|
propagatedBuildInputs = [
|
|
cryptography
|
|
cssselect
|
|
itemadapter
|
|
itemloaders
|
|
lxml
|
|
parsel
|
|
protego
|
|
pydispatcher
|
|
pyopenssl
|
|
queuelib
|
|
service-identity
|
|
twisted
|
|
w3lib
|
|
zope_interface
|
|
];
|
|
|
|
checkInputs = [
|
|
botocore
|
|
glibcLocales
|
|
jmespath
|
|
pytestCheckHook
|
|
sybil
|
|
testfixtures
|
|
];
|
|
|
|
patches = [
|
|
# Require setuptools, https://github.com/scrapy/scrapy/pull/5122
|
|
(fetchpatch {
|
|
name = "add-setuptools.patch";
|
|
url = "https://github.com/scrapy/scrapy/commit/4f500342c8ad4674b191e1fab0d1b2ac944d7d3e.patch";
|
|
sha256 = "14030sfv1cf7dy4yww02b49mg39cfcg4bv7ys1iwycfqag3xcjda";
|
|
})
|
|
# Make Twisted[http2] installation optional, https://github.com/scrapy/scrapy/pull/5113
|
|
(fetchpatch {
|
|
name = "remove-h2.patch";
|
|
url = "https://github.com/scrapy/scrapy/commit/c5b1ee810167266fcd259f263dbfc0fe0204761a.patch";
|
|
sha256 = "1gw28wg8qcb0al59rz214hm17smspi6j5kg62nr1r850pykyrsqk";
|
|
})
|
|
];
|
|
|
|
LC_ALL = "en_US.UTF-8";
|
|
|
|
# Disable doctest plugin because it causes pytest to hang
|
|
preCheck = ''
|
|
substituteInPlace pytest.ini --replace "--doctest-modules" ""
|
|
'';
|
|
|
|
disabledTestPaths = [
|
|
"tests/test_proxy_connect.py"
|
|
"tests/test_utils_display.py"
|
|
"tests/test_command_check.py"
|
|
# Don't test the documentation
|
|
"docs"
|
|
];
|
|
|
|
disabledTests = [
|
|
# It's unclear if the failures are related to libxml2, https://github.com/NixOS/nixpkgs/pull/123890
|
|
"test_nested_css"
|
|
"test_nested_xpath"
|
|
"test_flavor_detection"
|
|
# Requires network access
|
|
"FTPFeedStorageTest"
|
|
"FeedExportTest"
|
|
"test_custom_asyncio_loop_enabled_true"
|
|
"test_custom_loop_asyncio"
|
|
"test_custom_loop_asyncio_deferred_signal"
|
|
"FileFeedStoragePreFeedOptionsTest" # https://github.com/scrapy/scrapy/issues/5157
|
|
] ++ lib.optionals stdenv.isDarwin [
|
|
"test_xmliter_encoding"
|
|
"test_download"
|
|
];
|
|
|
|
postInstall = ''
|
|
installManPage extras/scrapy.1
|
|
install -m 644 -D extras/scrapy_bash_completion $out/share/bash-completion/completions/scrapy
|
|
install -m 644 -D extras/scrapy_zsh_completion $out/share/zsh/site-functions/_scrapy
|
|
'';
|
|
|
|
pythonImportsCheck = [ "scrapy" ];
|
|
|
|
__darwinAllowLocalNetworking = true;
|
|
|
|
meta = with lib; {
|
|
description = "High-level web crawling and web scraping framework";
|
|
longDescription = ''
|
|
Scrapy is a fast high-level web crawling and web scraping framework, used to crawl
|
|
websites and extract structured data from their pages. It can be used for a wide
|
|
range of purposes, from data mining to monitoring and automated testing.
|
|
'';
|
|
homepage = "https://scrapy.org/";
|
|
license = licenses.bsd3;
|
|
maintainers = with maintainers; [ drewkett marsam ];
|
|
platforms = platforms.unix;
|
|
};
|
|
}
|