summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSean Whitton <spwhitton@spwhitton.name>2018-07-21 20:27:36 +0800
committerSean Whitton <spwhitton@spwhitton.name>2018-07-21 21:12:31 +0800
commit6c34eda357591482bec17e89c28d20e56389155b (patch)
tree342adf38582c8c8ae877ea090a5e2af07047df11
parent50c1d70bd5092a1aea3132ac6bf362e674aba6a9 (diff)
Commit Debian 3.0 (quilt) metadatadebian/6.2.2-2_bpo9+1archive/debian/6.2.2-2_bpo9+1
[dgit (5.10~bpo9+1) quilt-fixup]
-rw-r--r--debian/patches/debian-changes164
1 files changed, 164 insertions, 0 deletions
diff --git a/debian/patches/debian-changes b/debian/patches/debian-changes
index 03ff2cc2..b0a3332f 100644
--- a/debian/patches/debian-changes
+++ b/debian/patches/debian-changes
@@ -87,6 +87,15 @@ See dgit(1), dgit(7) and dgit-maint-merge(7) for more information.
+* The default "hocr" PDF renderer does not handle Asian fonts properly
--- ocrmypdf-6.2.2.orig/src/ocrmypdf/__main__.py
+++ ocrmypdf-6.2.2/src/ocrmypdf/__main__.py
+@@ -70,7 +70,7 @@ def complain(message):
+ if 'IDE_PROJECT_ROOTS' in os.environ:
+ os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
+
+-# --------
++# --------
+ # Critical environment tests
+
+ verify_python3_env()
@@ -138,17 +138,17 @@ your PDF, use --output-type pdf.
If OCRmyPDF is given an image file as input, it will attempt to convert the
@@ -110,3 +119,158 @@ See dgit(1), dgit(7) and dgit-maint-merge(7) for more information.
""")
+@@ -170,7 +170,7 @@ parser.add_argument(
+ '--image-dpi', metavar='DPI', type=int,
+ help="For input image instead of PDF, use this DPI instead of file's.")
+ parser.add_argument(
+- '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
++ '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
+ default='pdfa',
+ help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
+ "long term archiving (default, recommended) but may not suitable "
+@@ -314,7 +314,7 @@ advanced.add_argument(
+ help='Give up on OCR after the timeout, but copy the preprocessed page '
+ 'into the final output')
+ advanced.add_argument(
+- '--rotate-pages-threshold', default=14.0, type=numeric(float, max_=1000), metavar='CONFIDENCE',
++ '--rotate-pages-threshold', default=14.0, type=numeric(float, 1000), metavar='CONFIDENCE',
+ help="Only rotate pages when confidence is above this value (arbitrary "
+ "units reported by tesseract)")
+ advanced.add_argument(
+@@ -504,7 +504,7 @@ def check_options_advanced(options, log)
+ "--pdfa-image-compression argument has no effect when "
+ "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
+ )
+-
++
+ if tesseract.v4() and (options.user_words or options.user_patterns):
+ log.warning(
+ 'Tesseract 4.x ignores --user-words, so this has no effect')
+@@ -592,7 +592,7 @@ def do_ruffus_exception(ruffus_five_tupl
+ if exc_name == 'builtins.SystemExit':
+ match = re.search(r"\.(.+?)\)", exc_value)
+ exit_code_name = match.groups()[0]
+- exit_code = getattr(ExitCode, exit_code_name, 'other_error')
++ exit_code = getattr(ExitCode, exit_code_name, 'other_error')
+ elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
+ log.error(cleanup_ruffus_error_message(exc_value))
+ exit_code = ExitCode.input_file
+@@ -616,7 +616,7 @@ def do_ruffus_exception(ruffus_five_tupl
+ (exc_name == 'ocrmypdf.exceptions.EncryptedPdfError'):
+ log.error(textwrap.dedent("""\
+ Input PDF is encrypted. The encryption must be removed to
+- perform OCR.
++ perform OCR.
+
+ For information about this PDF's security use
+ qpdf --show-encryption infilename
+@@ -625,7 +625,7 @@ def do_ruffus_exception(ruffus_five_tupl
+ qpdf --decrypt [--password=[password]] infilename
+
+ """))
+- exit_code = ExitCode.encrypted_pdf
++ exit_code = ExitCode.encrypted_pdf
+ elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError':
+ log.error(textwrap.dedent("""\
+ Failed to merge PDF image layer with OCR layer
+@@ -656,31 +656,33 @@ def do_ruffus_exception(ruffus_five_tupl
+ return ExitCode.other_error
+
+
+-def traverse_ruffus_exception(exceptions, options, log):
+- """Traverse a RethrownJobError and output the exceptions
++def traverse_ruffus_exception(e_args, options, log):
++ """Walk through a RethrownJobError and find the first exception.
+
+- Ruffus presents exceptions as 5 element tuples. The RethrownJobException
+- has a list of exceptions like
+- e.job_exceptions = [(5-tuple), (5-tuple), ...]
+-
+- ruffus < 2.7.0 had a bug with exception marshalling that would give
+- different output whether the main or child process raised the exception.
+- We no longer support this.
+-
+- Attempting to log the exception itself will re-marshall it to the logger
+- which is normally running in another process. It's better to avoid re-
+- marshalling.
++ Ruffus flattens exception to 5 element tuples. Because of a bug
++ in <= 2.6.3 it may present either the single:
++ (task, job, exc, value, stack)
++ or something like:
++ [[(task, job, exc, value, stack)]]
++
++ Generally cross-process exception marshalling doesn't work well
++ and ruffus doesn't support because BaseException has its own
++ implementation of __reduce__ that attempts to reconstruct the
++ exception based on e.__init__(e.args).
++
++ Attempting to log the exception directly marshalls it to the logger
++ which is probably in another process, so it's better to log only
++ data from the exception at this point.
+
+ The exit code will be based on this, even if multiple exceptions occurred
+ at the same time."""
+
+- exit_codes = []
+- for exc in exceptions:
+- exit_code = do_ruffus_exception(exc, options, log)
+- exit_codes.append(exit_code)
+-
+- return exit_codes[0] # Multiple codes are rare so take the first one
+-
++ if isinstance(e_args, Sequence) and isinstance(e_args[0], str) and \
++ len(e_args) == 5:
++ return do_ruffus_exception(e_args, options, log)
++ elif is_iterable_notstr(e_args):
++ for exc in e_args:
++ return traverse_ruffus_exception(exc, options, log)
+
+
+ def check_closed_streams(options):
+@@ -765,7 +767,7 @@ def check_environ(options, _log):
+ for k in old_envvars:
+ if k in os.environ:
+ _log.warning(textwrap.dedent("""\
+- OCRmyPDF no longer uses the environment variable {}.
++ OCRmyPDF no longer uses the environment variable {}.
+ Change PATH to select alternate programs.""".format(k)))
+
+
+@@ -808,14 +810,14 @@ def report_output_file_size(options, _lo
+ ratio = output_size / input_size
+ if ratio < 1.35 or input_size < 25000:
+ return # Seems fine
+-
++
+ reasons = []
+ if not fitz:
+ reasons.append("The optional dependency PyMuPDF is not installed.")
+ image_preproc = {
+- 'deskew',
+- 'clean_final',
+- 'remove_background',
++ 'deskew',
++ 'clean_final',
++ 'remove_background',
+ 'oversample',
+ 'force_ocr'
+ }
+@@ -902,8 +904,7 @@ def run_pipeline():
+ except ruffus_exceptions.RethrownJobError as e:
+ if options.verbose:
+ _log.debug(str(e)) # stringify exception so logger doesn't have to
+- exceptions = e.job_exceptions
+- exitcode = traverse_ruffus_exception(exceptions, options, _log)
++ exitcode = traverse_ruffus_exception(e.args, options, _log)
+ if exitcode is None:
+ _log.error("Unexpected ruffus exception: " + str(e))
+ _log.error(repr(e))
+@@ -936,7 +937,7 @@ def run_pipeline():
+ _log.warning('Output file: The generated PDF is INVALID')
+ return ExitCode.invalid_output_pdf
+
+- report_output_file_size(options, _log, start_input_file,
++ report_output_file_size(options, _log, start_input_file,
+ options.output_file)
+
+ pdfinfo = context.get_pdfinfo()