diff options
author | Sean Whitton <spwhitton@spwhitton.name> | 2018-07-21 20:27:36 +0800 |
---|---|---|
committer | Sean Whitton <spwhitton@spwhitton.name> | 2018-07-21 21:12:31 +0800 |
commit | 6c34eda357591482bec17e89c28d20e56389155b (patch) | |
tree | 342adf38582c8c8ae877ea090a5e2af07047df11 | |
parent | 50c1d70bd5092a1aea3132ac6bf362e674aba6a9 (diff) |
Commit Debian 3.0 (quilt) metadata debian/6.2.2-2_bpo9+1 archive/debian/6.2.2-2_bpo9+1
[dgit (5.10~bpo9+1) quilt-fixup]
-rw-r--r-- | debian/patches/debian-changes | 164 |
1 files changed, 164 insertions, 0 deletions
diff --git a/debian/patches/debian-changes b/debian/patches/debian-changes index 03ff2cc2..b0a3332f 100644 --- a/debian/patches/debian-changes +++ b/debian/patches/debian-changes @@ -87,6 +87,15 @@ See dgit(1), dgit(7) and dgit-maint-merge(7) for more information. +* The default "hocr" PDF renderer does not handle Asian fonts properly --- ocrmypdf-6.2.2.orig/src/ocrmypdf/__main__.py +++ ocrmypdf-6.2.2/src/ocrmypdf/__main__.py +@@ -70,7 +70,7 @@ def complain(message): + if 'IDE_PROJECT_ROOTS' in os.environ: + os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH'] + +-# -------- ++# -------- + # Critical environment tests + + verify_python3_env() @@ -138,17 +138,17 @@ your PDF, use --output-type pdf. If OCRmyPDF is given an image file as input, it will attempt to convert the @@ -110,3 +119,158 @@ See dgit(1), dgit(7) and dgit-maint-merge(7) for more information. """) +@@ -170,7 +170,7 @@ parser.add_argument( + '--image-dpi', metavar='DPI', type=int, + help="For input image instead of PDF, use this DPI instead of file's.") + parser.add_argument( +- '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], ++ '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], + default='pdfa', + help="Choose output type. 
'pdfa' creates a PDF/A-2b compliant file for " + "long term archiving (default, recommended) but may not suitable " +@@ -314,7 +314,7 @@ advanced.add_argument( + help='Give up on OCR after the timeout, but copy the preprocessed page ' + 'into the final output') + advanced.add_argument( +- '--rotate-pages-threshold', default=14.0, type=numeric(float, max_=1000), metavar='CONFIDENCE', ++ '--rotate-pages-threshold', default=14.0, type=numeric(float, 1000), metavar='CONFIDENCE', + help="Only rotate pages when confidence is above this value (arbitrary " + "units reported by tesseract)") + advanced.add_argument( +@@ -504,7 +504,7 @@ def check_options_advanced(options, log) + "--pdfa-image-compression argument has no effect when " + "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'" + ) +- ++ + if tesseract.v4() and (options.user_words or options.user_patterns): + log.warning( + 'Tesseract 4.x ignores --user-words, so this has no effect') +@@ -592,7 +592,7 @@ def do_ruffus_exception(ruffus_five_tupl + if exc_name == 'builtins.SystemExit': + match = re.search(r"\.(.+?)\)", exc_value) + exit_code_name = match.groups()[0] +- exit_code = getattr(ExitCode, exit_code_name, 'other_error') ++ exit_code = getattr(ExitCode, exit_code_name, 'other_error') + elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError': + log.error(cleanup_ruffus_error_message(exc_value)) + exit_code = ExitCode.input_file +@@ -616,7 +616,7 @@ def do_ruffus_exception(ruffus_five_tupl + (exc_name == 'ocrmypdf.exceptions.EncryptedPdfError'): + log.error(textwrap.dedent("""\ + Input PDF is encrypted. The encryption must be removed to +- perform OCR. ++ perform OCR. 
+ + For information about this PDF's security use + qpdf --show-encryption infilename +@@ -625,7 +625,7 @@ def do_ruffus_exception(ruffus_five_tupl + qpdf --decrypt [--password=[password]] infilename + + """)) +- exit_code = ExitCode.encrypted_pdf ++ exit_code = ExitCode.encrypted_pdf + elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError': + log.error(textwrap.dedent("""\ + Failed to merge PDF image layer with OCR layer +@@ -656,31 +656,33 @@ def do_ruffus_exception(ruffus_five_tupl + return ExitCode.other_error + + +-def traverse_ruffus_exception(exceptions, options, log): +- """Traverse a RethrownJobError and output the exceptions ++def traverse_ruffus_exception(e_args, options, log): ++ """Walk through a RethrownJobError and find the first exception. + +- Ruffus presents exceptions as 5 element tuples. The RethrownJobException +- has a list of exceptions like +- e.job_exceptions = [(5-tuple), (5-tuple), ...] +- +- ruffus < 2.7.0 had a bug with exception marshalling that would give +- different output whether the main or child process raised the exception. +- We no longer support this. +- +- Attempting to log the exception itself will re-marshall it to the logger +- which is normally running in another process. It's better to avoid re- +- marshalling. ++ Ruffus flattens exception to 5 element tuples. Because of a bug ++ in <= 2.6.3 it may present either the single: ++ (task, job, exc, value, stack) ++ or something like: ++ [[(task, job, exc, value, stack)]] ++ ++ Generally cross-process exception marshalling doesn't work well ++ and ruffus doesn't support because BaseException has its own ++ implementation of __reduce__ that attempts to reconstruct the ++ exception based on e.__init__(e.args). ++ ++ Attempting to log the exception directly marshalls it to the logger ++ which is probably in another process, so it's better to log only ++ data from the exception at this point. 
+ + The exit code will be based on this, even if multiple exceptions occurred + at the same time.""" + +- exit_codes = [] +- for exc in exceptions: +- exit_code = do_ruffus_exception(exc, options, log) +- exit_codes.append(exit_code) +- +- return exit_codes[0] # Multiple codes are rare so take the first one +- ++ if isinstance(e_args, Sequence) and isinstance(e_args[0], str) and \ ++ len(e_args) == 5: ++ return do_ruffus_exception(e_args, options, log) ++ elif is_iterable_notstr(e_args): ++ for exc in e_args: ++ return traverse_ruffus_exception(exc, options, log) + + + def check_closed_streams(options): +@@ -765,7 +767,7 @@ def check_environ(options, _log): + for k in old_envvars: + if k in os.environ: + _log.warning(textwrap.dedent("""\ +- OCRmyPDF no longer uses the environment variable {}. ++ OCRmyPDF no longer uses the environment variable {}. + Change PATH to select alternate programs.""".format(k))) + + +@@ -808,14 +810,14 @@ def report_output_file_size(options, _lo + ratio = output_size / input_size + if ratio < 1.35 or input_size < 25000: + return # Seems fine +- ++ + reasons = [] + if not fitz: + reasons.append("The optional dependency PyMuPDF is not installed.") + image_preproc = { +- 'deskew', +- 'clean_final', +- 'remove_background', ++ 'deskew', ++ 'clean_final', ++ 'remove_background', + 'oversample', + 'force_ocr' + } +@@ -902,8 +904,7 @@ def run_pipeline(): + except ruffus_exceptions.RethrownJobError as e: + if options.verbose: + _log.debug(str(e)) # stringify exception so logger doesn't have to +- exceptions = e.job_exceptions +- exitcode = traverse_ruffus_exception(exceptions, options, _log) ++ exitcode = traverse_ruffus_exception(e.args, options, _log) + if exitcode is None: + _log.error("Unexpected ruffus exception: " + str(e)) + _log.error(repr(e)) +@@ -936,7 +937,7 @@ def run_pipeline(): + _log.warning('Output file: The generated PDF is INVALID') + return ExitCode.invalid_output_pdf + +- report_output_file_size(options, _log, start_input_file, 
++ report_output_file_size(options, _log, start_input_file, + options.output_file) + + pdfinfo = context.get_pdfinfo() |