nixpkgs/pkgs/applications/graphics/tesseract/default.nix

68 lines
2.2 KiB
Nix
Raw Normal View History

tesseract: 3.04.01 -> 3.05.00 Upstream changelog: * Made some fine tuning to the hOCR output. * Added TSV as another optional output format. * Fixed ABI break introduced in 3.04.00 with the AnalyseLayout() method. * text2image tool - Enable all OpenType ligatures available in a font. This feature requires Pango 1.38 or newer. * Training tools - Replaced asserts with tprintf() and exit(1). * Fixed Cygwin compatibility. * Improved multipage tiff processing. * Improved the embedded pdf font (pdf.ttf). * Enable selection of OCR engine mode from command line. * Changed tesseract command line parameter '-psm' to '--psm'. * Added new C API for orientation and script detection, removed the old one. * Increased minimum autoconf version to 2.59. * Removed dead code. * Fixed many compiler warning. * Fixed memory and resource leaks. * Fixed some issues with the 'Cube' OCR engine. * Fixed some openCL issues. * Added option to build Tesseract with CMake build system. * Implemented CPPAN support for easy Windows building. The upstream URL of the change log is: https://github.com/tesseract-ocr/tesseract/releases/tag/3.05.00 Tested by building against the following packages that directly depend on it: * vapoursynth (with ocrSupport = true) * pyocr (fails) * vobsub2srt Also tested against the following NixOS VM tests that have OCR enabled: * nixos/tests/chromium.nix -A stable * nixos/tests/emacs-daemon.nix * nixos/tests/installer.nix -A luksroot * nixos/tests/lightdm.nix * nixos/tests/plasma5.nix * nixos/tests/sddm.nix All of the packages and tests except pyocr build/succeed on x86_64-linux. Fixing pyocr is outside of the scope of this commit and will happen very soon. Signed-off-by: aszlig <aszlig@redmoonstudios.org>
2017-04-08 01:43:18 +01:00
{ stdenv
, fetchFromGitHub
, autoreconfHook
, pkgconfig
, leptonica
, libpng
, libtiff
, icu
, pango
, opencl-headers
  # Languages to install trained data for: either a list of language names
  # (each name `x' selects the file `x.traineddata') or `null' to install
  # every language that is available.
, enableLanguages ? null
  # Optional hash of the resulting tessdata output. When it is not `null',
  # tessdata is built as a fixed output derivation. The default only covers
  # the "all languages" case; a custom language list defaults to no hash.
, enableLanguagesHash ? (
    if enableLanguages == null # all languages
    then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
    else null
  )
}:
let
  # Trained language data for tesseract, copied out of the upstream
  # `tessdata' repository into $out/share/tessdata. Either every
  # `*.traineddata' file is installed (enableLanguages == null) or only the
  # files named by the enableLanguages list.
  tessdata = stdenv.mkDerivation ({
    name = "tessdata";
    src = fetchFromGitHub {
      owner = "tesseract-ocr";
      repo = "tessdata";
      rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
      # when updating don't forget to update the default value for enableLanguagesHash
      sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
    };
    buildCommand = ''
      # Create the output directory up front so that $out exists even when
      # enableLanguages is an empty list and the loop body never runs.
      mkdir -p "$out/share/tessdata"
      # The glob below is expanded relative to the source directory.
      cd "$src"
      for lang in ${
        if enableLanguages == null
        then "*.traineddata"
        else stdenv.lib.concatMapStringsSep " " (x: x + ".traineddata") enableLanguages
      }; do
        install -Dt "$out/share/tessdata" "$src/$lang"
      done
    '';
    preferLocalBuild = true;
  } // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
    # when a hash is given, we make this a fixed output derivation.
    outputHashMode = "recursive";
    outputHashAlgo = "sha256";
    outputHash = enableLanguagesHash;
  }));
in
# The tesseract OCR engine itself, built from the upstream GitHub release
# tag and populated with symlinks to the selected trained language data.
stdenv.mkDerivation rec {
  name = "tesseract-${version}";
  version = "3.05.00";

  src = fetchFromGitHub {
    owner = "tesseract-ocr";
    repo = "tesseract";
    # The upstream release tag is the bare version number.
    rev = version;
    sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30";
  };

  enableParallelBuilding = true;

  nativeBuildInputs = [ pkgconfig autoreconfHook ];
  buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];

  # Passed to the builder as an environment variable; presumably read by
  # tesseract's configure script to locate the leptonica headers
  # (TODO confirm against upstream configure.ac).
  LIBLEPT_HEADERSDIR = "${leptonica}/include";

  # Link every file of the tessdata derivation into this package's own
  # share/tessdata directory.
  postInstall = ''
    for i in ${tessdata}/share/tessdata/*; do
      ln -s "$i" "$out/share/tessdata"
    done
  '';

  meta = {
    description = "OCR engine";
    homepage = https://github.com/tesseract-ocr/tesseract;
    license = stdenv.lib.licenses.asl20;
    maintainers = with stdenv.lib.maintainers; [ viric ];
    platforms = with stdenv.lib.platforms; linux ++ darwin;
  };
}