pythonPackages.pyocr: 0.5.3 -> 0.7.2

This commit is contained in:
Symphorien Gibol 2020-03-10 12:00:00 +00:00
parent 99dfc57bce
commit bcb40a5f04
2 changed files with 272 additions and 62 deletions

View File

@ -1,10 +1,10 @@
{ lib, fetchFromGitLab, buildPythonPackage, pillow, six
, tesseract, cuneiform, isPy3k, substituteAll, pytest, tox
}:
{ lib, fetchFromGitLab, buildPythonPackage, pillow, setuptools_scm,
setuptools-scm-git-archive , tesseract, cuneiform, isPy3k, substituteAll,
pytest, tox }:
buildPythonPackage rec {
pname = "pyocr";
version = "0.5.3";
version = "0.7.2";
disabled = !isPy3k;
# Don't fetch from PYPI because it doesn't contain tests.
@ -14,7 +14,7 @@ buildPythonPackage rec {
owner = "OpenPaperwork";
repo = "pyocr";
rev = version;
sha256 = "1nihf0qmbpg3yj3yp11jp6hp5z5dqf39nz6j9lqbvgi1nqbs7x15";
sha256 = "09ab86bmizpv94w3mdvdqkjyyvk1vafw3jqhkiw5xx7p180xn3il";
};
patches = [ (substituteAll {
@ -23,38 +23,8 @@ buildPythonPackage rec {
})
];
postPatch = ''
echo 'version = "${version}"' > src/pyocr/_version.py
# Disable specific tests that are probably failing because of this issue:
# https://github.com/jflesch/pyocr/issues/52
for test in $disabledTests; do
file="''${test%%:*}"
fun="''${test#*:}"
echo "import pytest" >> "tests/tests_$file.py"
echo "$fun = pytest.mark.skip($fun)" >> "tests/tests_$file.py"
done
'';
disabledTests = [
"cuneiform:TestTxt.test_basic"
"cuneiform:TestTxt.test_european"
"cuneiform:TestTxt.test_french"
"cuneiform:TestWordBox.test_basic"
"cuneiform:TestWordBox.test_european"
"cuneiform:TestWordBox.test_french"
"libtesseract:TestBasicDoc.test_basic"
"libtesseract:TestDigitLineBox.test_digits"
"libtesseract:TestLineBox.test_japanese"
"libtesseract:TestTxt.test_japanese"
"libtesseract:TestWordBox.test_japanese"
"libtesseract:TestTxt.test_multi"
"tesseract:TestTxt.test_multi"
"tesseract:TestDigitLineBox.test_digits"
"tesseract:TestTxt.test_japanese"
];
propagatedBuildInputs = [ pillow six ];
buildInputs = [ setuptools_scm setuptools-scm-git-archive ];
propagatedBuildInputs = [ pillow ];
checkInputs = [ pytest tox ];
checkPhase = "pytest";

View File

@ -1,9 +1,9 @@
Index: current/src/pyocr/cuneiform.py
===================================================================
--- current.orig/src/pyocr/cuneiform.py
+++ current/src/pyocr/cuneiform.py
@@ -27,13 +27,9 @@ from . import error
from . import util
diff --git a/src/pyocr/cuneiform.py b/src/pyocr/cuneiform.py
index 2e5b717..35647e2 100644
--- a/src/pyocr/cuneiform.py
+++ b/src/pyocr/cuneiform.py
@@ -25,13 +25,9 @@ from . import builders
from .error import CuneiformError
-# CHANGE THIS IF CUNEIFORM IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
@ -18,25 +18,34 @@ Index: current/src/pyocr/cuneiform.py
LANGUAGES_LINE_PREFIX = "Supported languages: "
LANGUAGES_SPLIT_RE = re.compile("[^a-z]")
Index: current/src/pyocr/libtesseract/tesseract_raw.py
===================================================================
--- current.orig/src/pyocr/libtesseract/tesseract_raw.py
+++ current/src/pyocr/libtesseract/tesseract_raw.py
@@ -1,52 +1,13 @@
import ctypes
diff --git a/src/pyocr/libtesseract/tesseract_raw.py b/src/pyocr/libtesseract/tesseract_raw.py
index a068e73..9ebea5c 100644
--- a/src/pyocr/libtesseract/tesseract_raw.py
+++ b/src/pyocr/libtesseract/tesseract_raw.py
@@ -2,7 +2,6 @@ import ctypes
import locale
import logging
import os
-import sys
from ..error import TesseractError
@@ -10,48 +9,16 @@ from ..error import TesseractError
logger = logging.getLogger(__name__)
-TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None)
TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None)
-libnames = []
+if TESSDATA_PREFIX is None:
+ TESSDATA_PREFIX = '@tesseract@/share/tessdata'
+ os.environ['TESSDATA_PREFIX'] = TESSDATA_PREFIX
+
+
# 70 is the minimum credible dpi for tesseract and force it to compute an
# estimate of the image dpi
DPI_DEFAULT = 70
-
-if getattr(sys, 'frozen', False):
-if getattr(sys, 'frozen', False): # pragma: no cover
- # Pyinstaller integration
- libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")]
- libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")]
@ -51,7 +60,7 @@ Index: current/src/pyocr/libtesseract/tesseract_raw.py
- TESSDATA_PREFIX = tessdata
-
-
-if sys.platform[:3] == "win":
-if sys.platform[:3] == "win": # pragma: no cover
- libnames += [
- # Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on
- # Windows ?
@ -76,15 +85,16 @@ Index: current/src/pyocr/libtesseract/tesseract_raw.py
g_libtesseract = None
@@ -346,12 +307,11 @@ def init(lang=None):
@@ -364,12 +331,12 @@ def init(lang=None):
try:
if lang:
lang = lang.encode("utf-8")
- prefix = None
- if TESSDATA_PREFIX:
- if TESSDATA_PREFIX: # pragma: no cover
- prefix = TESSDATA_PREFIX.encode("utf-8")
+ prefix = os.getenv('TESSDATA_PREFIX', '@tesseract@/share/tessdata')
+ os.environ['TESSDATA_PREFIX'] = prefix
+
+ prefix = TESSDATA_PREFIX
+
g_libtesseract.TessBaseAPIInit3(
ctypes.c_void_p(handle),
- ctypes.c_char_p(prefix),
@ -92,11 +102,11 @@ Index: current/src/pyocr/libtesseract/tesseract_raw.py
ctypes.c_char_p(lang)
)
g_libtesseract.TessBaseAPISetVariable(
Index: current/src/pyocr/tesseract.py
===================================================================
--- current.orig/src/pyocr/tesseract.py
+++ current/src/pyocr/tesseract.py
@@ -31,8 +31,7 @@ from .builders import DigitBuilder # ba
diff --git a/src/pyocr/tesseract.py b/src/pyocr/tesseract.py
index 7c30852..44e8446 100644
--- a/src/pyocr/tesseract.py
+++ b/src/pyocr/tesseract.py
@@ -28,8 +28,7 @@ from .builders import DigitBuilder # backward compatibility
from .error import TesseractError # backward compatibility
from .util import digits_only
@ -106,3 +116,233 @@ Index: current/src/pyocr/tesseract.py
TESSDATA_EXTENSION = ".traineddata"
diff --git a/tests/tests_cuneiform.py b/tests/tests_cuneiform.py
index 45b7f6a..95f55c6 100644
--- a/tests/tests_cuneiform.py
+++ b/tests/tests_cuneiform.py
@@ -21,7 +21,7 @@ class TestCuneiform(BaseTest):
# XXX is it useful?
which.return_value = True
self.assertTrue(cuneiform.is_available())
- which.assert_called_once_with("cuneiform")
+ which.assert_called_once_with("@cuneiform@/bin/cuneiform")
@patch("subprocess.Popen")
def test_version(self, popen):
@@ -54,7 +54,7 @@ class TestCuneiform(BaseTest):
self.assertIn("eng", langs)
self.assertIn("fra", langs)
popen.assert_called_once_with(
- ["cuneiform", "-l"],
+ ["@cuneiform@/bin/cuneiform", "-l"],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
@@ -109,7 +109,7 @@ class TestCuneiformTxt(BaseTest):
output = cuneiform.image_to_string(self.image)
self.assertEqual(output, self._get_file_content("text").strip())
popen.assert_called_once_with(
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
@@ -125,7 +125,7 @@ class TestCuneiformTxt(BaseTest):
builder=self.builder)
self.assertEqual(output, self._get_file_content("text").strip())
popen.assert_called_once_with(
- ["cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
+ ["@cuneiform@/bin/cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
"-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
@@ -142,7 +142,7 @@ class TestCuneiformTxt(BaseTest):
builder=self.builder)
self.assertEqual(output, self._get_file_content("text").strip())
popen.assert_called_once_with(
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
@@ -173,7 +173,7 @@ class TestCuneiformTxt(BaseTest):
output = cuneiform.image_to_string(image, builder=self.builder)
self.assertEqual(output, self._get_file_content("text").strip())
popen.assert_called_once_with(
- ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
@@ -227,7 +227,7 @@ class TestCuneiformWordBox(BaseTest):
output = cuneiform.image_to_string(self.image,
builder=self.builder)
popen.assert_called_once_with(
- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
@@ -280,7 +280,7 @@ class TestCuneiformLineBox(BaseTest):
output = cuneiform.image_to_string(self.image,
builder=self.builder)
popen.assert_called_once_with(
- ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
+ ["@cuneiform@/bin/cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
diff --git a/tests/tests_libtesseract.py b/tests/tests_libtesseract.py
index ad7fdc9..57e7a60 100644
--- a/tests/tests_libtesseract.py
+++ b/tests/tests_libtesseract.py
@@ -165,7 +165,8 @@ class TestLibTesseractRaw(BaseTest):
args = libtess.TessBaseAPIInit3.call_args[0]
self.assertEqual(len(args), 3)
self.assertEqual(args[0].value, self.handle)
- self.assertEqual(args[1].value, None)
+ # we hardcode tesseract data, so we don't get None
+ #self.assertEqual(args[1].value, None)
self.assertEqual(args[2].value, lang.encode() if lang else None)
self.assertEqual(
@@ -201,7 +202,8 @@ class TestLibTesseractRaw(BaseTest):
args = libtess.TessBaseAPIInit3.call_args[0]
self.assertEqual(len(args), 3)
self.assertEqual(args[0].value, self.handle)
- self.assertEqual(args[1].value, None)
+ # we hardcode tesseract data, so we don't get None
+ #self.assertEqual(args[1].value, None)
self.assertEqual(args[2].value, lang.encode() if lang else None)
self.assertEqual(
diff --git a/tests/tests_tesseract.py b/tests/tests_tesseract.py
index 1a55567..a24d96f 100644
--- a/tests/tests_tesseract.py
+++ b/tests/tests_tesseract.py
@@ -36,7 +36,7 @@ class TestTesseract(BaseTest):
def test_available(self, which):
which.return_value = True
self.assertTrue(tesseract.is_available())
- which.assert_called_once_with("tesseract")
+ which.assert_called_once_with("@tesseract@/bin/tesseract")
@patch("subprocess.Popen")
def test_version_error(self, popen):
@@ -156,7 +156,7 @@ class TestTesseract(BaseTest):
for lang in ("eng", "fra", "jpn", "osd"):
self.assertIn(lang, langs)
popen.assert_called_once_with(
- ["tesseract", "--list-langs"],
+ ["@tesseract@/bin/tesseract", "--list-langs"],
startupinfo=None, creationflags=0,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
@@ -171,7 +171,7 @@ class TestTesseract(BaseTest):
self.assertEqual(te.exception.status, 1)
self.assertEqual("unable to get languages", te.exception.message)
popen.assert_called_once_with(
- ["tesseract", "--list-langs"],
+ ["@tesseract@/bin/tesseract", "--list-langs"],
startupinfo=None, creationflags=0,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
@@ -248,7 +248,7 @@ class TestTesseract(BaseTest):
self.assertEqual(status, 0)
self.assertEqual(error, message)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "output"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "output"],
cwd=tmpdir,
startupinfo=None,
creationflags=0,
@@ -271,7 +271,7 @@ class TestTesseract(BaseTest):
self.assertEqual(status, 0)
self.assertEqual(error, message)
popen.assert_called_with(
- ["tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"],
+ ["@tesseract@/bin/tesseract", "input2.bmp", "output2", "-l", "fra", "--psm", "3"],
cwd=tmpdir,
startupinfo=None,
creationflags=0,
@@ -302,7 +302,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -338,7 +338,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -371,7 +371,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout",
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout",
"--psm", "0", "-l", "osd"],
stdin=subprocess.PIPE,
shell=False,
@@ -399,7 +399,7 @@ class TestTesseract(BaseTest):
with self.assertRaises(tesseract.TesseractError) as te:
tesseract.detect_orientation(self.image)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -433,7 +433,7 @@ class TestTesseract(BaseTest):
with self.assertRaises(tesseract.TesseractError) as te:
tesseract.detect_orientation(self.image)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "--psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "--psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -467,7 +467,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -500,7 +500,7 @@ class TestTesseract(BaseTest):
self.assertEqual(result["angle"], 90)
self.assertEqual(result["confidence"], 9.30)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0", "-l", "fra"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -527,7 +527,7 @@ class TestTesseract(BaseTest):
with self.assertRaises(tesseract.TesseractError) as te:
tesseract.detect_orientation(self.image)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,
@@ -561,7 +561,7 @@ class TestTesseract(BaseTest):
with self.assertRaises(tesseract.TesseractError) as te:
tesseract.detect_orientation(self.image)
popen.assert_called_once_with(
- ["tesseract", "input.bmp", "stdout", "-psm", "0"],
+ ["@tesseract@/bin/tesseract", "input.bmp", "stdout", "-psm", "0"],
stdin=subprocess.PIPE,
shell=False,
startupinfo=None,