Commit Debian 3.0 (quilt) metadata (debian/6.2.2-2_bpo9+1, archive/debian/6.2.2-2_bpo9+1)

[dgit (5.10~bpo9+1) quilt-fixup]
author: Sean Whitton <spwhitton@spwhitton.name> 2018-07-21 20:27:36 +0800
committer: Sean Whitton <spwhitton@spwhitton.name> 2018-07-21 21:12:31 +0800
commit: 6c34eda357591482bec17e89c28d20e56389155b (patch)
tree: 342adf38582c8c8ae877ea090a5e2af07047df11
parent: 50c1d70bd5092a1aea3132ac6bf362e674aba6a9 (diff)
1 files changed, 164 insertions, 0 deletions
diff --git a/debian/patches/debian-changes b/debian/patches/debian-changes
index 03ff2cc2..b0a3332f 100644
--- a/debian/patches/debian-changes
+++ b/debian/patches/debian-changes
@@ -87,6 +87,15 @@ See dgit(1), dgit(7) and dgit-maint-merge(7) for more information.
 +* The default "hocr" PDF renderer does not handle Asian fonts properly
 --- ocrmypdf-6.2.2.orig/src/ocrmypdf/__main__.py
 +++ ocrmypdf-6.2.2/src/ocrmypdf/__main__.py
+@@ -70,7 +70,7 @@ def complain(message):
+ if 'IDE_PROJECT_ROOTS' in os.environ:
+     os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
+ 
+-# --------
++# -------- 
+ # Critical environment tests
+ 
+ verify_python3_env()
 @@ -138,17 +138,17 @@ your PDF, use --output-type pdf.
  
  If OCRmyPDF is given an image file as input, it will attempt to convert the
@@ -110,3 +119,158 @@ See dgit(1), dgit(7) and dgit-maint-merge(7) for more information.
  
  """)
  
+@@ -170,7 +170,7 @@ parser.add_argument(
+     '--image-dpi', metavar='DPI', type=int,
+     help="For input image instead of PDF, use this DPI instead of file's.")
+ parser.add_argument(
+-    '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
++    '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], 
+     default='pdfa',
+     help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
+          "long term archiving (default, recommended) but may not suitable "
+@@ -314,7 +314,7 @@ advanced.add_argument(
+     help='Give up on OCR after the timeout, but copy the preprocessed page '
+          'into the final output')
+ advanced.add_argument(
+-    '--rotate-pages-threshold', default=14.0, type=numeric(float, max_=1000), metavar='CONFIDENCE',
++    '--rotate-pages-threshold', default=14.0, type=numeric(float, 1000), metavar='CONFIDENCE',
+     help="Only rotate pages when confidence is above this value (arbitrary "
+          "units reported by tesseract)")
+ advanced.add_argument(
+@@ -504,7 +504,7 @@ def check_options_advanced(options, log)
+             "--pdfa-image-compression argument has no effect when "
+             "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
+         )
+-
++    
+     if tesseract.v4() and (options.user_words or options.user_patterns):
+         log.warning(
+             'Tesseract 4.x ignores --user-words, so this has no effect')
+@@ -592,7 +592,7 @@ def do_ruffus_exception(ruffus_five_tupl
+     if exc_name == 'builtins.SystemExit':
+         match = re.search(r"\.(.+?)\)", exc_value)
+         exit_code_name = match.groups()[0]
+-        exit_code = getattr(ExitCode, exit_code_name, 'other_error')
++        exit_code = getattr(ExitCode, exit_code_name, 'other_error')        
+     elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
+         log.error(cleanup_ruffus_error_message(exc_value))
+         exit_code = ExitCode.input_file
+@@ -616,7 +616,7 @@ def do_ruffus_exception(ruffus_five_tupl
+             (exc_name == 'ocrmypdf.exceptions.EncryptedPdfError'):
+         log.error(textwrap.dedent("""\
+             Input PDF is encrypted. The encryption must be removed to
+-            perform OCR.
++            perform OCR. 
+ 
+             For information about this PDF's security use
+                 qpdf --show-encryption infilename
+@@ -625,7 +625,7 @@ def do_ruffus_exception(ruffus_five_tupl
+                 qpdf --decrypt [--password=[password]] infilename
+ 
+             """))
+-        exit_code = ExitCode.encrypted_pdf
++        exit_code = ExitCode.encrypted_pdf        
+     elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError':
+         log.error(textwrap.dedent("""\
+             Failed to merge PDF image layer with OCR layer
+@@ -656,31 +656,33 @@ def do_ruffus_exception(ruffus_five_tupl
+     return ExitCode.other_error
+ 
+ 
+-def traverse_ruffus_exception(exceptions, options, log):
+-    """Traverse a RethrownJobError and output the exceptions
++def traverse_ruffus_exception(e_args, options, log):
++    """Walk through a RethrownJobError and find the first exception.
+ 
+-    Ruffus presents exceptions as 5 element tuples. The RethrownJobException
+-    has a list of exceptions like
+-        e.job_exceptions = [(5-tuple), (5-tuple), ...]
+-
+-    ruffus < 2.7.0 had a bug with exception marshalling that would give
+-    different output whether the main or child process raised the exception.
+-    We no longer support this.
+-
+-    Attempting to log the exception itself will re-marshall it to the logger
+-    which is normally running in another process. It's better to avoid re-
+-    marshalling.
++    Ruffus flattens exception to 5 element tuples. Because of a bug
++    in <= 2.6.3 it may present either the single:
++      (task, job, exc, value, stack)
++    or something like:
++      [[(task, job, exc, value, stack)]]
++    
++    Generally cross-process exception marshalling doesn't work well
++    and ruffus doesn't support because BaseException has its own
++    implementation of __reduce__ that attempts to reconstruct the
++    exception based on e.__init__(e.args).
++    
++    Attempting to log the exception directly marshalls it to the logger
++    which is probably in another process, so it's better to log only
++    data from the exception at this point.
+ 
+     The exit code will be based on this, even if multiple exceptions occurred
+     at the same time."""
+ 
+-    exit_codes = []
+-    for exc in exceptions:
+-        exit_code = do_ruffus_exception(exc, options, log)
+-        exit_codes.append(exit_code)
+-
+-    return exit_codes[0]  # Multiple codes are rare so take the first one
+-
++    if isinstance(e_args, Sequence) and isinstance(e_args[0], str) and \
++            len(e_args) == 5:
++        return do_ruffus_exception(e_args, options, log)
++    elif is_iterable_notstr(e_args):
++        for exc in e_args:
++            return traverse_ruffus_exception(exc, options, log)
+ 
+ 
+ def check_closed_streams(options):
+@@ -765,7 +767,7 @@ def check_environ(options, _log):
+     for k in old_envvars:
+         if k in os.environ:
+             _log.warning(textwrap.dedent("""\
+-                OCRmyPDF no longer uses the environment variable {}.
++                OCRmyPDF no longer uses the environment variable {}. 
+                 Change PATH to select alternate programs.""".format(k)))
+ 
+ 
+@@ -808,14 +810,14 @@ def report_output_file_size(options, _lo
+     ratio = output_size / input_size
+     if ratio < 1.35 or input_size < 25000:
+         return  # Seems fine
+-
++    
+     reasons = []
+     if not fitz:
+         reasons.append("The optional dependency PyMuPDF is not installed.")
+     image_preproc = {
+-        'deskew',
+-        'clean_final',
+-        'remove_background',
++        'deskew', 
++        'clean_final', 
++        'remove_background', 
+         'oversample',
+         'force_ocr'
+     }
+@@ -902,8 +904,7 @@ def run_pipeline():
+     except ruffus_exceptions.RethrownJobError as e:
+         if options.verbose:
+             _log.debug(str(e))  # stringify exception so logger doesn't have to
+-        exceptions = e.job_exceptions
+-        exitcode = traverse_ruffus_exception(exceptions, options, _log)
++        exitcode = traverse_ruffus_exception(e.args, options, _log)
+         if exitcode is None:
+             _log.error("Unexpected ruffus exception: " + str(e))
+             _log.error(repr(e))
+@@ -936,7 +937,7 @@ def run_pipeline():
+             _log.warning('Output file: The generated PDF is INVALID')
+             return ExitCode.invalid_output_pdf
+ 
+-        report_output_file_size(options, _log, start_input_file,
++        report_output_file_size(options, _log, start_input_file, 
+                                 options.output_file)
+ 
+     pdfinfo = context.get_pdfinfo()
author	Sean Whitton <spwhitton@spwhitton.name>	2018-07-21 20:27:36 +0800
committer	Sean Whitton <spwhitton@spwhitton.name>	2018-07-21 21:12:31 +0800
commit	6c34eda357591482bec17e89c28d20e56389155b (patch)
tree	342adf38582c8c8ae877ea090a5e2af07047df11
parent	50c1d70bd5092a1aea3132ac6bf362e674aba6a9 (diff)