diff options
author | Johannes 'josch' Schauer <josch@debian.org> | 2018-08-05 21:10:32 +0200 |
---|---|---|
committer | Johannes 'josch' Schauer <josch@debian.org> | 2018-08-05 21:10:32 +0200 |
commit | 244f64ef12075bc9ad3a396e4cb1d510800140d3 (patch) | |
tree | 56e18cd7dc73edf3d80fec142153e035937c0a52 | |
parent | aef245f415aae671df75502700826d2bb682e257 (diff) |
Import upstream version 0.3.1
-rw-r--r-- | CHANGES.rst | 50 | ||||
-rw-r--r-- | LICENSE | 165 | ||||
-rw-r--r-- | MANIFEST.in | 1 | ||||
-rw-r--r-- | PKG-INFO | 227 | ||||
-rw-r--r-- | README.md | 223 | ||||
-rw-r--r-- | setup.py | 2 | ||||
-rw-r--r-- | src/img2pdf.egg-info/PKG-INFO | 227 | ||||
-rw-r--r-- | src/img2pdf.egg-info/SOURCES.txt | 1 | ||||
-rwxr-xr-x | src/img2pdf.py | 141 | ||||
-rw-r--r-- | src/tests/__init__.py | 21 | ||||
-rw-r--r-- | src/tests/input/mono.tif | bin | 262 -> 720 bytes | |||
-rw-r--r-- | src/tests/output/mono.tif.pdf | bin | 921 -> 915 bytes | |||
-rwxr-xr-x | test_comp.sh | 6 |
13 files changed, 722 insertions, 342 deletions
diff --git a/CHANGES.rst b/CHANGES.rst index d4476a8..4f5bee3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,42 +2,48 @@ CHANGES ======= -0.3.0 ------ +0.3.1 (2018-08-04) +------------------ + + - Directly copy data from CCITT Group 4 encoded TIFF images into the PDF + container without re-encoding + +0.3.0 (2018-06-18) +------------------ - Store non-jpeg images using PNG compression - Support arbitrarily large pages via PDF /UserUnit field - Disallow input with alpha channel as it cannot be preserved - Add option --pillow-limit-break to support very large input -0.2.4 ------ +0.2.4 (2017-05-23) +------------------ - Restore support for Python 2.7 - Add support for PyPy - Add support for testing using tox -0.2.3 ------ +0.2.3 (2017-01-20) +------------------ - version number bump for botched pypi upload... -0.2.2 ------ +0.2.2 (2017-01-20) +------------------ - automatic monochrome CCITT Group4 encoding via Pillow/libtiff -0.2.1 ------ +0.2.1 (2016-05-04) +------------------ - set img2pdf as /producer value - support multi-frame images like multipage TIFF and animated GIF - support for palette images like GIF - - support all colorspaces and imageformats knows by PIL + - support all colorspaces and imageformats known by PIL - read horizontal and vertical dpi from JPEG2000 files -0.2.0 ------ +0.2.0 (2015-05-10) +------------------ - now Python3 only - pep8 compliant code @@ -72,34 +78,34 @@ CHANGES - explicitly store date in UTC and allow parsing all date formats understood by dateutil and `date --date` -0.1.5 ------ +0.1.5 (2015-02-16) +------------------ - Enable support for CMYK images - Rework test suite - support file objects as input -0.1.4 ------ +0.1.4 (2015-01-21) +------------------ - add Python 3 support - make output reproducible by sorting and --nodate option -0.1.3 ------ +0.1.3 (2014-11-10) +------------------ - Avoid leaking file descriptors - Convert unrecognized colorspaces to RGB -0.1.1 ------ +0.1.1 (2014-09-07) +------------------ - allow 
running src/img2pdf.py standalone - license change from GPL to LGPL - Add pillow 2.4.0 support - add options to specify pdf dimensions in points -0.1.0 (unreleased) +0.1.0 (2014-03-14, unreleased) ------------------ - Initial PyPI release. @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/MANIFEST.in b/MANIFEST.in index 4ee2b37..9249d3f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include test_comp.sh include CHANGES.rst +include LICENSE recursive-include src *.jpg recursive-include src *.pdf recursive-include src *.png @@ -1,80 +1,61 @@ Metadata-Version: 1.1 Name: img2pdf -Version: 0.3.0 +Version: 0.3.1 Summary: Convert images to PDF via direct JPEG inclusion. Home-page: https://gitlab.mister-muffin.de/josch/img2pdf Author: Johannes 'josch' Schauer Author-email: josch@mister-muffin.de License: LGPL -Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.3.0 +Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.3.1 Description-Content-Type: UNKNOWN Description: img2pdf ======= - Losslessly convert raster images to PDF. The file size will not unnecessarily - increase. It can for example be used to create a PDF document from a number of - scans that are only available in JPEG format. Existing solutions would either - re-encode the input JPEG files (leading to quality loss) or store them in the - zip/flate format which results into the PDF becoming unnecessarily large in - terms of its file size. - - Background - ---------- - - Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to - PDF by embedding them into the PDF without re-encoding them. This is what - img2pdf does. It thus treats the PDF format merely as a container format for - storing one or more JPEGs or PNGs without re-encoding the images themselves. 
- - If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000 - images into a PDF container without recompression, please contact me so that I - can put this code into the garbage bin. - - Functionality - ------------- - - This program will take a list of raster images and produce a PDF file with the - images embedded in it. PNG, JPEG and JPEG2000 images will be included without - recompression and the resulting PDF will only be slightly larger than the input - images due to the overhead of the PDF container. Raster images in other - formats (like gif or tif) will be included using the lossless zip/flate - encoding using the PNG Paeth predictor. - - As a result, this tool is able to losslessly wrap raster images into a PDF - container with a quality to filesize ratio that is typically better (in case of - JPEG and JPEG2000 images) or equal (in case of other formats) than that of - existing tools. - - For example, imagemagick will re-encode the input JPEG image (thus changing - its content): - - $ convert img.jpg img.pdf - $ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression - $ compare -metric AE img.jpg img.extr-000.ppm null: - 1.6301e+06 - - If one wants to losslessly convert from any format to PDF with - imagemagick, one has to use zip compression: - - $ convert input.jpg -compress Zip output.pdf - $ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression - $ compare -metric AE img.jpg img.extr-000.ppm null: - 0 - - However, this approach will result in PDF files that are a few times larger - than the input JPEG or JPEG2000 file. - - Furthermore, when converting PNG images, popular tools like imagemagick use - flate encoding without a predictor. This means, that image file size ends up - being several orders of magnitude larger then necessary. 
- - img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF - container without additional overhead (aside from the PDF structure itself), - save other graphics formats using lossless zip compression, and produce - multi-page PDF files when more than one input image is given. - - Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with - img2pdf is several times faster than with other tools. + Lossless conversion of raster images to PDF. You should use img2pdf if your + priorities are (in this order): + + 1. **always lossless**: the image embedded in the PDF will always have the + exact same color information for every pixel as the input + 2. **small**: if possible, the difference in filesize between the input image + and the output PDF will only be the overhead of the PDF container itself + 3. **fast**: if possible, the input image is just pasted into the PDF document + as-is without any CPU hungry re-encoding of the pixel data + + Conventional conversion software (like ImageMagick) would either: + + 1. not be lossless because lossy re-encoding to JPEG + 2. not be small because using wasteful flate encoding of raw pixel data + 3. not be fast because input data gets re-encoded + + Another advantage of not having to re-encode the input (in most common + situations) is, that img2pdf is able to handle much larger input than other + software, because the raw pixel data never has to be loaded into memory. + + The following table shows how img2pdf handles different input depending on the + input file format and image color space. 
+ + | Format | Colorspace | Result | + | -------------------- | ------------------------------ | ------------- | + | JPEG | any | direct | + | JPEG2000 | any | direct | + | PNG (non-interlaced) | any | direct | + | TIFF (CCITT Group 4) | monochrome | direct | + | any | any except CMYK and monochrome | PNG Paeth | + | any | monochrome | CCITT Group 4 | + | any | CMYK | flate | + + For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4 + encoded data, img2pdf directly embeds the image data into the PDF without + re-encoding it. It thus treats the PDF format merely as a container format for + the image data. In these cases, img2pdf only increases the filesize by the size + of the PDF container (typically around 500 to 700 bytes). Since data is only + copied and not re-encoded, img2pdf is also typically faster than other + solutions for these input formats. + + For all other input types, img2pdf first has to transform the pixel data to + make it compatible with PDF. In most cases, the PNG Paeth filter is applied to + the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for + CMYK input no filter is applied before finally applying flate compression. Usage ----- @@ -85,44 +66,45 @@ Description: img2pdf If no output file is specified with the `-o`/`--output` option, output will be done to stdout. A typical invocation is: - img2pdf img1.png img2.jpg -o out.pdf + $ img2pdf img1.png img2.jpg -o out.pdf The detailed documentation can be accessed by running: - img2pdf --help - + $ img2pdf --help Bugs ---- - If you find a JPEG or JPEG2000 file that, when embedded cannot be read - by the Adobe Acrobat Reader, please contact me. - - I have not yet figured out how to determine the colorspace of JPEG2000 files. - Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 files with - other colorspaces, you must explicitly specify it using the `--colorspace` - option. 
+ - If you find a JPEG, JPEG2000, PNG or CCITT Group 4 encoded TIFF file that, + when embedded into the PDF cannot be read by the Adobe Acrobat Reader, + please contact me. - It might be possible to store transparency using masks but it is not clear - what the utility of such a functionality would be. + - I have not yet figured out how to determine the colorspace of JPEG2000 + files. Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 + files with other colorspaces, you must explicitly specify it using the + `--colorspace` option. - Most vector graphic formats can be losslessly turned into PDF (minus some of - the features unsupported by PDF) but img2pdf will currently turn vector - graphics into their lossy raster representations. For converting raster - graphics to PDF, use another tool like inkscape and then join the resulting - pages with a tool like pdftk. + - Input images with alpha channels are not allowed. PDF doesn't support alpha + channels in images and thus, the alpha channel of the input would have to be + discarded. But img2pdf will always be lossless and thus, input images must + not carry transparency information. - A configuration file could be used for default options. + - img2pdf uses PIL (or Pillow) to obtain image meta data and to convert the + input if necessary. To prevent decompression bomb denial of service attacks, + Pillow limits the maximum number of pixels an input image is allowed to + have. If you are sure that you know what you are doing, then you can disable + this safeguard by passing the `--pillow-limit-break` option to img2pdf. This + allows one to process even very large input images. 
Installation ------------ - On a Debian- and Ubuntu-based systems, dependencies may be installed - with the following command: + On a Debian- and Ubuntu-based systems, img2pdf can be installed from the + official repositories: - apt-get install python3 python3-pil python3-setuptools + $ apt install img2pdf - You can then install the package using: + If you want to install it using pip, you can run: $ pip3 install img2pdf @@ -176,6 +158,75 @@ Description: img2pdf with open("name.pdf","wb") as f: f.write(img2pdf.convert('test.jpg', layout_fun=layout_fun)) + Comparison to ImageMagick + ------------------------- + + Create a large test image: + + $ convert logo: -resize 8000x original.jpg + + Convert it into PDF using ImageMagick and img2pdf: + + $ time img2pdf original.jpg -o img2pdf.pdf + $ time convert original.jpg imagemagick.pdf + + Notice how ImageMagick took an order of magnitude longer to do the conversion + than img2pdf. It also used twice the memory. + + Now extract the image data from both PDF documents and compare it to the + original: + + $ pdfimages -all img2pdf.pdf tmp + $ compare -metric AE original.jpg tmp-000.jpg null: + 0 + $ pdfimages -all imagemagick.pdf tmp + $ compare -metric AE original.jpg tmp-000.jpg null: + 118716 + + To get lossless output with ImageMagick we can use Zip compression but that + unnecessarily increases the size of the output: + + $ convert original.jpg -compress Zip imagemagick.pdf + $ pdfimages -all imagemagick.pdf tmp + $ compare -metric AE original.jpg tmp-000.png null: + 0 + $ stat --format="%s %n" original.jpg img2pdf.pdf imagemagick.pdf + 1535837 original.jpg + 1536683 img2pdf.pdf + 9397809 imagemagick.pdf + + Comparison to pdfLaTeX + ---------------------- + + pdfLaTeX performs a lossless conversion from included images to PDF by default. + If the input is a JPEG, then it simply embeds the JPEG into the PDF in the same + way as img2pdf does it. 
But for other image formats it uses flate compression + of the plain pixel data and thus needlessly increases the output file size: + + $ convert logo: -resize 8000x original.png + $ cat << END > pdflatex.tex + \documentclass{article} + \usepackage{graphicx} + \begin{document} + \includegraphics{original.png} + \end{document} + END + $ pdflatex pdflatex.tex + $ stat --format="%s %n" original.png pdflatex.pdf + 4500182 original.png + 9318120 pdflatex.pdf + + Comparison to Tesseract OCR + --------------------------- + + Tesseract OCR comes closest to the functionality img2pdf provides. It is able + to convert JPEG and PNG input to PDF without needlessly increasing the filesize + and is at the same time lossless. So if your input is JPEG and PNG images, then + you should safely be able to use Tesseract instead of img2pdf. For other input, + Tesseract might not do a lossless conversion. For example it converts CMYK + input to RGB and removes the alpha channel from images with transparency. For + multipage TIFF or animated GIF, it will only convert the first frame. + Keywords: jpeg pdf converter Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable @@ -1,70 +1,51 @@ img2pdf ======= -Losslessly convert raster images to PDF. The file size will not unnecessarily -increase. It can for example be used to create a PDF document from a number of -scans that are only available in JPEG format. Existing solutions would either -re-encode the input JPEG files (leading to quality loss) or store them in the -zip/flate format which results into the PDF becoming unnecessarily large in -terms of its file size. - -Background ----------- - -Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to -PDF by embedding them into the PDF without re-encoding them. This is what -img2pdf does. It thus treats the PDF format merely as a container format for -storing one or more JPEGs or PNGs without re-encoding the images themselves. 
- -If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000 -images into a PDF container without recompression, please contact me so that I -can put this code into the garbage bin. - -Functionality -------------- - -This program will take a list of raster images and produce a PDF file with the -images embedded in it. PNG, JPEG and JPEG2000 images will be included without -recompression and the resulting PDF will only be slightly larger than the input -images due to the overhead of the PDF container. Raster images in other -formats (like gif or tif) will be included using the lossless zip/flate -encoding using the PNG Paeth predictor. - -As a result, this tool is able to losslessly wrap raster images into a PDF -container with a quality to filesize ratio that is typically better (in case of -JPEG and JPEG2000 images) or equal (in case of other formats) than that of -existing tools. - -For example, imagemagick will re-encode the input JPEG image (thus changing -its content): - - $ convert img.jpg img.pdf - $ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression - $ compare -metric AE img.jpg img.extr-000.ppm null: - 1.6301e+06 - -If one wants to losslessly convert from any format to PDF with -imagemagick, one has to use zip compression: - - $ convert input.jpg -compress Zip output.pdf - $ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression - $ compare -metric AE img.jpg img.extr-000.ppm null: - 0 - -However, this approach will result in PDF files that are a few times larger -than the input JPEG or JPEG2000 file. - -Furthermore, when converting PNG images, popular tools like imagemagick use -flate encoding without a predictor. This means, that image file size ends up -being several orders of magnitude larger then necessary. 
- -img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF -container without additional overhead (aside from the PDF structure itself), -save other graphics formats using lossless zip compression, and produce -multi-page PDF files when more than one input image is given. - -Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with -img2pdf is several times faster than with other tools. +Lossless conversion of raster images to PDF. You should use img2pdf if your +priorities are (in this order): + + 1. **always lossless**: the image embedded in the PDF will always have the + exact same color information for every pixel as the input + 2. **small**: if possible, the difference in filesize between the input image + and the output PDF will only be the overhead of the PDF container itself + 3. **fast**: if possible, the input image is just pasted into the PDF document + as-is without any CPU-hungry re-encoding of the pixel data + +Conventional conversion software (like ImageMagick) would either: + + 1. not be lossless because of lossy re-encoding to JPEG + 2. not be small because of wasteful flate encoding of raw pixel data + 3. not be fast because input data gets re-encoded + +Another advantage of not having to re-encode the input (in most common +situations) is that img2pdf is able to handle much larger input than other +software, because the raw pixel data never has to be loaded into memory. + +The following table shows how img2pdf handles different input depending on the +input file format and image color space. 
+ +| Format | Colorspace | Result | +| -------------------- | ------------------------------ | ------------- | +| JPEG | any | direct | +| JPEG2000 | any | direct | +| PNG (non-interlaced) | any | direct | +| TIFF (CCITT Group 4) | monochrome | direct | +| any | any except CMYK and monochrome | PNG Paeth | +| any | monochrome | CCITT Group 4 | +| any | CMYK | flate | + +For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4 +encoded data, img2pdf directly embeds the image data into the PDF without +re-encoding it. It thus treats the PDF format merely as a container format for +the image data. In these cases, img2pdf only increases the filesize by the size +of the PDF container (typically around 500 to 700 bytes). Since data is only +copied and not re-encoded, img2pdf is also typically faster than other +solutions for these input formats. + +For all other input types, img2pdf first has to transform the pixel data to +make it compatible with PDF. In most cases, the PNG Paeth filter is applied to +the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for +CMYK input no filter is applied before finally applying flate compression. Usage ----- @@ -75,44 +56,45 @@ descriptor. If no output file is specified with the `-o`/`--output` option, output will be done to stdout. A typical invocation is: - img2pdf img1.png img2.jpg -o out.pdf + $ img2pdf img1.png img2.jpg -o out.pdf The detailed documentation can be accessed by running: - img2pdf --help - + $ img2pdf --help Bugs ---- -If you find a JPEG or JPEG2000 file that, when embedded cannot be read -by the Adobe Acrobat Reader, please contact me. - -I have not yet figured out how to determine the colorspace of JPEG2000 files. -Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 files with -other colorspaces, you must explicitly specify it using the `--colorspace` -option. 
+ - If you find a JPEG, JPEG2000, PNG or CCITT Group 4 encoded TIFF file that, + when embedded into the PDF cannot be read by the Adobe Acrobat Reader, + please contact me. -It might be possible to store transparency using masks but it is not clear -what the utility of such a functionality would be. + - I have not yet figured out how to determine the colorspace of JPEG2000 + files. Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 + files with other colorspaces, you must explicitly specify it using the + `--colorspace` option. -Most vector graphic formats can be losslessly turned into PDF (minus some of -the features unsupported by PDF) but img2pdf will currently turn vector -graphics into their lossy raster representations. For converting raster -graphics to PDF, use another tool like inkscape and then join the resulting -pages with a tool like pdftk. + - Input images with alpha channels are not allowed. PDF doesn't support alpha + channels in images and thus, the alpha channel of the input would have to be + discarded. But img2pdf will always be lossless and thus, input images must + not carry transparency information. -A configuration file could be used for default options. + - img2pdf uses PIL (or Pillow) to obtain image meta data and to convert the + input if necessary. To prevent decompression bomb denial of service attacks, + Pillow limits the maximum number of pixels an input image is allowed to + have. If you are sure that you know what you are doing, then you can disable + this safeguard by passing the `--pillow-limit-break` option to img2pdf. This + allows one to process even very large input images. 
Installation ------------ -On a Debian- and Ubuntu-based systems, dependencies may be installed -with the following command: +On Debian- and Ubuntu-based systems, img2pdf can be installed from the +official repositories: - apt-get install python3 python3-pil python3-setuptools + $ apt install img2pdf -You can then install the package using: +If you want to install it using pip, you can run: $ pip3 install img2pdf @@ -165,3 +147,72 @@ The package can also be used as a library: layout_fun = img2pdf.get_layout_fun(a4inpt) with open("name.pdf","wb") as f: f.write(img2pdf.convert('test.jpg', layout_fun=layout_fun)) + +Comparison to ImageMagick +------------------------- + +Create a large test image: + + $ convert logo: -resize 8000x original.jpg + +Convert it into PDF using ImageMagick and img2pdf: + + $ time img2pdf original.jpg -o img2pdf.pdf + $ time convert original.jpg imagemagick.pdf + +Notice how ImageMagick took an order of magnitude longer to do the conversion +than img2pdf. It also used twice the memory. + +Now extract the image data from both PDF documents and compare it to the +original: + + $ pdfimages -all img2pdf.pdf tmp + $ compare -metric AE original.jpg tmp-000.jpg null: + 0 + $ pdfimages -all imagemagick.pdf tmp + $ compare -metric AE original.jpg tmp-000.jpg null: + 118716 + +To get lossless output with ImageMagick we can use Zip compression but that +unnecessarily increases the size of the output: + + $ convert original.jpg -compress Zip imagemagick.pdf + $ pdfimages -all imagemagick.pdf tmp + $ compare -metric AE original.jpg tmp-000.png null: + 0 + $ stat --format="%s %n" original.jpg img2pdf.pdf imagemagick.pdf + 1535837 original.jpg + 1536683 img2pdf.pdf + 9397809 imagemagick.pdf + +Comparison to pdfLaTeX +---------------------- + +pdfLaTeX performs a lossless conversion from included images to PDF by default. +If the input is a JPEG, then it simply embeds the JPEG into the PDF in the same +way as img2pdf does it. 
But for other image formats it uses flate compression +of the plain pixel data and thus needlessly increases the output file size: + + $ convert logo: -resize 8000x original.png + $ cat << END > pdflatex.tex + \documentclass{article} + \usepackage{graphicx} + \begin{document} + \includegraphics{original.png} + \end{document} + END + $ pdflatex pdflatex.tex + $ stat --format="%s %n" original.png pdflatex.pdf + 4500182 original.png + 9318120 pdflatex.pdf + +Comparison to Tesseract OCR +--------------------------- + +Tesseract OCR comes closest to the functionality img2pdf provides. It is able +to convert JPEG and PNG input to PDF without needlessly increasing the filesize +and is at the same time lossless. So if your input is JPEG and PNG images, then +you should safely be able to use Tesseract instead of img2pdf. For other input, +Tesseract might not do a lossless conversion. For example it converts CMYK +input to RGB and removes the alpha channel from images with transparency. For +multipage TIFF or animated GIF, it will only convert the first frame. @@ -3,7 +3,7 @@ from setuptools import setup PY3 = sys.version_info[0] >= 3 -VERSION = "0.3.0" +VERSION = "0.3.1" INSTALL_REQUIRES = ( 'Pillow', diff --git a/src/img2pdf.egg-info/PKG-INFO b/src/img2pdf.egg-info/PKG-INFO index e3ecf4b..975388d 100644 --- a/src/img2pdf.egg-info/PKG-INFO +++ b/src/img2pdf.egg-info/PKG-INFO @@ -1,80 +1,61 @@ Metadata-Version: 1.1 Name: img2pdf -Version: 0.3.0 +Version: 0.3.1 Summary: Convert images to PDF via direct JPEG inclusion. Home-page: https://gitlab.mister-muffin.de/josch/img2pdf Author: Johannes 'josch' Schauer Author-email: josch@mister-muffin.de License: LGPL -Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.3.0 +Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.3.1 Description-Content-Type: UNKNOWN Description: img2pdf ======= - Losslessly convert raster images to PDF. 
The file size will not unnecessarily - increase. It can for example be used to create a PDF document from a number of - scans that are only available in JPEG format. Existing solutions would either - re-encode the input JPEG files (leading to quality loss) or store them in the - zip/flate format which results into the PDF becoming unnecessarily large in - terms of its file size. - - Background - ---------- - - Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to - PDF by embedding them into the PDF without re-encoding them. This is what - img2pdf does. It thus treats the PDF format merely as a container format for - storing one or more JPEGs or PNGs without re-encoding the images themselves. - - If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000 - images into a PDF container without recompression, please contact me so that I - can put this code into the garbage bin. - - Functionality - ------------- - - This program will take a list of raster images and produce a PDF file with the - images embedded in it. PNG, JPEG and JPEG2000 images will be included without - recompression and the resulting PDF will only be slightly larger than the input - images due to the overhead of the PDF container. Raster images in other - formats (like gif or tif) will be included using the lossless zip/flate - encoding using the PNG Paeth predictor. - - As a result, this tool is able to losslessly wrap raster images into a PDF - container with a quality to filesize ratio that is typically better (in case of - JPEG and JPEG2000 images) or equal (in case of other formats) than that of - existing tools. 
- - For example, imagemagick will re-encode the input JPEG image (thus changing - its content): - - $ convert img.jpg img.pdf - $ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression - $ compare -metric AE img.jpg img.extr-000.ppm null: - 1.6301e+06 - - If one wants to losslessly convert from any format to PDF with - imagemagick, one has to use zip compression: - - $ convert input.jpg -compress Zip output.pdf - $ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression - $ compare -metric AE img.jpg img.extr-000.ppm null: - 0 - - However, this approach will result in PDF files that are a few times larger - than the input JPEG or JPEG2000 file. - - Furthermore, when converting PNG images, popular tools like imagemagick use - flate encoding without a predictor. This means, that image file size ends up - being several orders of magnitude larger then necessary. - - img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF - container without additional overhead (aside from the PDF structure itself), - save other graphics formats using lossless zip compression, and produce - multi-page PDF files when more than one input image is given. - - Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with - img2pdf is several times faster than with other tools. + Lossless conversion of raster images to PDF. You should use img2pdf if your + priorities are (in this order): + + 1. **always lossless**: the image embedded in the PDF will always have the + exact same color information for every pixel as the input + 2. **small**: if possible, the difference in filesize between the input image + and the output PDF will only be the overhead of the PDF container itself + 3. **fast**: if possible, the input image is just pasted into the PDF document + as-is without any CPU hungry re-encoding of the pixel data + + Conventional conversion software (like ImageMagick) would either: + + 1. 
not be lossless because lossy re-encoding to JPEG + 2. not be small because using wasteful flate encoding of raw pixel data + 3. not be fast because input data gets re-encoded + + Another advantage of not having to re-encode the input (in most common + situations) is, that img2pdf is able to handle much larger input than other + software, because the raw pixel data never has to be loaded into memory. + + The following table shows how img2pdf handles different input depending on the + input file format and image color space. + + | Format | Colorspace | Result | + | -------------------- | ------------------------------ | ------------- | + | JPEG | any | direct | + | JPEG2000 | any | direct | + | PNG (non-interlaced) | any | direct | + | TIFF (CCITT Group 4) | monochrome | direct | + | any | any except CMYK and monochrome | PNG Paeth | + | any | monochrome | CCITT Group 4 | + | any | CMYK | flate | + + For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4 + encoded data, img2pdf directly embeds the image data into the PDF without + re-encoding it. It thus treats the PDF format merely as a container format for + the image data. In these cases, img2pdf only increases the filesize by the size + of the PDF container (typically around 500 to 700 bytes). Since data is only + copied and not re-encoded, img2pdf is also typically faster than other + solutions for these input formats. + + For all other input types, img2pdf first has to transform the pixel data to + make it compatible with PDF. In most cases, the PNG Paeth filter is applied to + the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for + CMYK input no filter is applied before finally applying flate compression. Usage ----- @@ -85,44 +66,45 @@ Description: img2pdf If no output file is specified with the `-o`/`--output` option, output will be done to stdout. 
A typical invocation is: - img2pdf img1.png img2.jpg -o out.pdf + $ img2pdf img1.png img2.jpg -o out.pdf The detailed documentation can be accessed by running: - img2pdf --help - + $ img2pdf --help Bugs ---- - If you find a JPEG or JPEG2000 file that, when embedded cannot be read - by the Adobe Acrobat Reader, please contact me. - - I have not yet figured out how to determine the colorspace of JPEG2000 files. - Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 files with - other colorspaces, you must explicitly specify it using the `--colorspace` - option. + - If you find a JPEG, JPEG2000, PNG or CCITT Group 4 encoded TIFF file that, + when embedded into the PDF cannot be read by the Adobe Acrobat Reader, + please contact me. - It might be possible to store transparency using masks but it is not clear - what the utility of such a functionality would be. + - I have not yet figured out how to determine the colorspace of JPEG2000 + files. Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 + files with other colorspaces, you must explicitly specify it using the + `--colorspace` option. - Most vector graphic formats can be losslessly turned into PDF (minus some of - the features unsupported by PDF) but img2pdf will currently turn vector - graphics into their lossy raster representations. For converting raster - graphics to PDF, use another tool like inkscape and then join the resulting - pages with a tool like pdftk. + - Input images with alpha channels are not allowed. PDF doesn't support alpha + channels in images and thus, the alpha channel of the input would have to be + discarded. But img2pdf will always be lossless and thus, input images must + not carry transparency information. - A configuration file could be used for default options. + - img2pdf uses PIL (or Pillow) to obtain image meta data and to convert the + input if necessary. 
To prevent decompression bomb denial of service attacks, + Pillow limits the maximum number of pixels an input image is allowed to + have. If you are sure that you know what you are doing, then you can disable + this safeguard by passing the `--pillow-limit-break` option to img2pdf. This + allows one to process even very large input images. Installation ------------ - On a Debian- and Ubuntu-based systems, dependencies may be installed - with the following command: + On a Debian- and Ubuntu-based systems, img2pdf can be installed from the + official repositories: - apt-get install python3 python3-pil python3-setuptools + $ apt install img2pdf - You can then install the package using: + If you want to install it using pip, you can run: $ pip3 install img2pdf @@ -176,6 +158,75 @@ Description: img2pdf with open("name.pdf","wb") as f: f.write(img2pdf.convert('test.jpg', layout_fun=layout_fun)) + Comparison to ImageMagick + ------------------------- + + Create a large test image: + + $ convert logo: -resize 8000x original.jpg + + Convert it into PDF using ImageMagick and img2pdf: + + $ time img2pdf original.jpg -o img2pdf.pdf + $ time convert original.jpg imagemagick.pdf + + Notice how ImageMagick took an order of magnitude longer to do the conversion + than img2pdf. It also used twice the memory. 
+ + Now extract the image data from both PDF documents and compare it to the + original: + + $ pdfimages -all img2pdf.pdf tmp + $ compare -metric AE original.jpg tmp-000.jpg null: + 0 + $ pdfimages -all imagemagick.pdf tmp + $ compare -metric AE original.jpg tmp-000.jpg null: + 118716 + + To get lossless output with ImageMagick we can use Zip compression but that + unnecessarily increases the size of the output: + + $ convert original.jpg -compress Zip imagemagick.pdf + $ pdfimages -all imagemagick.pdf tmp + $ compare -metric AE original.jpg tmp-000.png null: + 0 + $ stat --format="%s %n" original.jpg img2pdf.pdf imagemagick.pdf + 1535837 original.jpg + 1536683 img2pdf.pdf + 9397809 imagemagick.pdf + + Comparison to pdfLaTeX + ---------------------- + + pdfLaTeX performs a lossless conversion from included images to PDF by default. + If the input is a JPEG, then it simply embeds the JPEG into the PDF in the same + way as img2pdf does it. But for other image formats it uses flate compression + of the plain pixel data and thus needlessly increases the output file size: + + $ convert logo: -resize 8000x original.png + $ cat << END > pdflatex.tex + \documentclass{article} + \usepackage{graphicx} + \begin{document} + \includegraphics{original.png} + \end{document} + END + $ pdflatex pdflatex.tex + $ stat --format="%s %n" original.png pdflatex.pdf + 4500182 original.png + 9318120 pdflatex.pdf + + Comparison to Tesseract OCR + --------------------------- + + Tesseract OCR comes closest to the functionality img2pdf provides. It is able + to convert JPEG and PNG input to PDF without needlessly increasing the filesize + and is at the same time lossless. So if your input is JPEG and PNG images, then + you should safely be able to use Tesseract instead of img2pdf. For other input, + Tesseract might not do a lossless conversion. For example it converts CMYK + input to RGB and removes the alpha channel from images with transparency. 
For + multipage TIFF or animated GIF, it will only convert the first frame. + Keywords: jpeg pdf converter Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable diff --git a/src/img2pdf.egg-info/SOURCES.txt b/src/img2pdf.egg-info/SOURCES.txt index ae6e816..3271401 100644 --- a/src/img2pdf.egg-info/SOURCES.txt +++ b/src/img2pdf.egg-info/SOURCES.txt @@ -1,4 +1,5 @@ CHANGES.rst +LICENSE MANIFEST.in README.md setup.cfg diff --git a/src/img2pdf.py b/src/img2pdf.py index 48ef964..7c1978e 100755 --- a/src/img2pdf.py +++ b/src/img2pdf.py @@ -22,7 +22,7 @@ import sys import os import zlib import argparse -from PIL import Image +from PIL import Image, TiffImagePlugin from datetime import datetime from jp2 import parsejp2 from enum import Enum @@ -32,7 +32,7 @@ import struct PY3 = sys.version_info[0] >= 3 -__version__ = "0.3.0" +__version__ = "0.3.1" default_dpi = 96.0 papersizes = { "letter": "8.5inx11in", @@ -62,7 +62,7 @@ PageOrientation = Enum('PageOrientation', 'portrait landscape') Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other') -ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG other') +ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG TIFF other') PageMode = Enum('PageMode', 'none outlines thumbs') @@ -277,7 +277,8 @@ if PY3: @classmethod def encode(cls, string, hextype=False): if hextype: - return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >' + return b'< ' + b' '.join( + ("%06x" % c).encode('ascii') for c in string) + b' >' else: try: string = string.encode('ascii') @@ -292,7 +293,8 @@ else: @classmethod def encode(cls, string, hextype=False): if hextype: - return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >' + return b'< ' + b' '.join( + ("%06x" % c).encode('ascii') for c in string) + b' >' else: # This mimics exactely to what pdfrw does. 
string = string.replace(b'\\', b'\\\\') @@ -374,7 +376,7 @@ class pdfdoc(object): def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata, imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth, - pageheight, userunit=None, palette=None): + pageheight, userunit=None, palette=None, inverted=False): if self.with_pdfrw: from pdfrw import PdfDict, PdfName, PdfObject, PdfString from pdfrw.py23_diffs import convert_load @@ -393,8 +395,11 @@ class pdfdoc(object): colorspace = PdfName.DeviceCMYK elif color == Colorspace.P: if self.with_pdfrw: - raise Exception("pdfrw does not support hex strings for palette image input, re-run with --without-pdfrw") - colorspace = [ PdfName.Indexed, PdfName.DeviceRGB, len(palette)-1, PdfString.encode(palette, hextype=True)] + raise Exception("pdfrw does not support hex strings for " + "palette image input, re-run with " + "--without-pdfrw") + colorspace = [PdfName.Indexed, PdfName.DeviceRGB, len(palette)-1, + PdfString.encode(palette, hextype=True)] else: raise UnsupportedColorspaceError("unsupported color space: %s" % color.name) @@ -440,15 +445,20 @@ class pdfdoc(object): if imgformat is ImageFormat.CCITTGroup4: decodeparms = PdfDict() + # The default for the K parameter is 0 which indicates Group 3 1-D + # encoding. We set it to -1 because we want Group 4 encoding. 
decodeparms[PdfName.K] = -1 - decodeparms[PdfName.BlackIs1] = PdfObject('true') + if inverted: + decodeparms[PdfName.BlackIs1] = PdfObject('false') + else: + decodeparms[PdfName.BlackIs1] = PdfObject('true') decodeparms[PdfName.Columns] = imgwidthpx decodeparms[PdfName.Rows] = imgheightpx image[PdfName.DecodeParms] = [decodeparms] elif imgformat is ImageFormat.PNG: decodeparms = PdfDict() decodeparms[PdfName.Predictor] = 15 - if color in [ Colorspace.P, Colorspace['1'], Colorspace.L ]: + if color in [Colorspace.P, Colorspace['1'], Colorspace.L]: decodeparms[PdfName.Colors] = 1 else: decodeparms[PdfName.Colors] = 3 @@ -642,13 +652,14 @@ def get_imgmetadata(imgdata, imgformat, default_dpi, colorspace, rawdata=None): ics = imgdata.mode if ics in ["LA", "PA", "RGBA"]: - logging.warning("Image contains transparency which cannot be retained in PDF.") + logging.warning("Image contains transparency which cannot be retained " + "in PDF.") logging.warning("img2pdf will not perform a lossy operation.") logging.warning("You can remove the alpha channel using imagemagick:") - logging.warning(" $ convert input.png -background white -alpha remove -alpha off output.png") + logging.warning(" $ convert input.png -background white -alpha " + "remove -alpha off output.png") raise Exception("Refusing to work on images with alpha channel") - # Since commit 07a96209597c5e8dfe785c757d7051ce67a980fb or release 4.1.0 # Pillow retrieves the DPI from EXIF if it cannot find the DPI in the JPEG # header. In that case it can happen that the horizontal and vertical DPI @@ -685,11 +696,33 @@ def get_imgmetadata(imgdata, imgformat, default_dpi, colorspace, rawdata=None): return (color, ndpi, imgwidthpx, imgheightpx) +def ccitt_payload_location_from_pil(img): + # If Pillow is passed an invalid compression argument it will ignore it; + # make sure the image actually got compressed. 
+ if img.info['compression'] != 'group4': + raise ValueError("Image not compressed with CCITT Group 4 but with: %s" + % img.info['compression']) + + # Read the TIFF tags to find the offset(s) of the compressed data strips. + strip_offsets = img.tag_v2[TiffImagePlugin.STRIPOFFSETS] + strip_bytes = img.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS] + rows_per_strip = img.tag_v2[TiffImagePlugin.ROWSPERSTRIP] + + # PIL always seems to create a single strip even for very large TIFFs when + # it saves images, so assume we only have to read a single strip. + # A test ~10 GPixel image was still encoded as a single strip. Just to be + # safe check throw an error if there is more than one offset. + if len(strip_offsets) != 1 or len(strip_bytes) != 1: + raise NotImplementedError("Transcoding multiple strips not supported") + + (offset, ), (length, ) = strip_offsets, strip_bytes + + return offset, length + + def transcode_monochrome(imgdata): """Convert the open PIL.Image imgdata to compressed CCITT Group4 data""" - from PIL import TiffImagePlugin - logging.debug("Converting monochrome to CCITT Group4") # Convert the image to Group 4 in memory. If libtiff is not installed and @@ -707,27 +740,11 @@ def transcode_monochrome(imgdata): newimgio.seek(0) newimg = Image.open(newimgio) - # If Pillow is passed an invalid compression argument it will ignore it; - # make sure the image actually got compressed. - if newimg.info['compression'] != 'group4': - raise ValueError("Image not compressed as expected") - - # Read the TIFF tags to find the offset(s) of the compressed data strips. - strip_offsets = newimg.tag_v2[TiffImagePlugin.STRIPOFFSETS] - strip_bytes = newimg.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS] - rows_per_strip = newimg.tag_v2[TiffImagePlugin.ROWSPERSTRIP] - - # PIL always seems to create a single strip even for very large TIFFs when - # it saves images, so assume we only have to read a single strip. - # A test ~10 GPixel image was still encoded as a single strip. 
Just to be - # safe check throw an error if there is more than one offset. - if len(strip_offsets) > 1: - raise NotImplementedError("Transcoding multiple strips not supported") + offset, length = ccitt_payload_location_from_pil(newimg) - newimgio.seek(strip_offsets[0]) - ccittdata = newimgio.read(strip_bytes[0]) + newimgio.seek(offset) + return newimgio.read(length) - return ccittdata def parse_png(rawdata): pngidat = b"" @@ -737,18 +754,20 @@ def parse_png(rawdata): # once we can require Python >= 3.2 we can use int.from_bytes() instead n, = struct.unpack('>I', rawdata[i-8:i-4]) if i + n > len(rawdata): - raise Exception("invalid png: %d %d %d"%(i, n, len(rawdata))) + raise Exception("invalid png: %d %d %d" % (i, n, len(rawdata))) if rawdata[i-4:i] == b"IDAT": pngidat += rawdata[i:i+n] elif rawdata[i-4:i] == b"PLTE": for j in range(i, i+n, 3): - # with int.from_bytes() we would not have to prepend extra zeroes + # with int.from_bytes() we would not have to prepend extra + # zeroes color, = struct.unpack('>I', b'\x00'+rawdata[j:j+3]) palette.append(color) i += n i += 12 return pngidat, palette + def read_images(rawdata, colorspace, first_frame_only=False): im = BytesIO(rawdata) im.seek(0) @@ -786,7 +805,8 @@ def read_images(rawdata, colorspace, first_frame_only=False): if color == Colorspace['RGBA']: raise JpegColorspaceError("jpeg can't have an alpha channel") im.close() - return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [])] + return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [], + False)] # We can directly embed the IDAT chunk of PNG images if the PNG is not # interlaced @@ -799,7 +819,30 @@ def read_images(rawdata, colorspace, first_frame_only=False): color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata( imgdata, imgformat, default_dpi, colorspace, rawdata) pngidat, palette = parse_png(rawdata) - return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette)] + im.close() + return [(color, ndpi, imgformat, 
pngidat, imgwidthpx, imgheightpx, + palette, False)] + + # We can directly copy the data out of a CCITT Group 4 encoded TIFF, if it + # only contains a single strip + if imgformat == ImageFormat.TIFF \ + and imgdata.info['compression'] == "group4" \ + and len(imgdata.tag_v2[TiffImagePlugin.STRIPOFFSETS]) == 1: + photo = imgdata.tag_v2[TiffImagePlugin.PHOTOMETRIC_INTERPRETATION] + inverted = False + if photo == 0: + inverted = True + elif photo != 1: + raise ValueError("unsupported photometric interpretation for " + "group4 tiff: %d" % photo) + color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata( + imgdata, imgformat, default_dpi, colorspace, rawdata) + offset, length = ccitt_payload_location_from_pil(imgdata) + im.seek(offset) + rawdata = im.read(length) + im.close() + return [(color, ndpi, ImageFormat.CCITTGroup4, rawdata, imgwidthpx, + imgheightpx, [], inverted)] # Everything else has to be encoded @@ -826,7 +869,7 @@ def read_images(rawdata, colorspace, first_frame_only=False): ccittdata = transcode_monochrome(imgdata) imgformat = ImageFormat.CCITTGroup4 result.append((color, ndpi, imgformat, ccittdata, - imgwidthpx, imgheightpx, [])) + imgwidthpx, imgheightpx, [], False)) img_page_count += 1 continue except Exception as e: @@ -839,13 +882,14 @@ def read_images(rawdata, colorspace, first_frame_only=False): logging.debug("Colorspace is OK: %s", color) newimg = imgdata else: - raise ValueError("unknown or unsupported colorspace: %s" % color.name) + raise ValueError("unknown or unsupported colorspace: %s" + % color.name) # the PNG format does not support CMYK, so we fall back to normal # compression if color in [Colorspace.CMYK, Colorspace["CMYK;I"]]: imggz = zlib.compress(newimg.tobytes()) result.append((color, ndpi, imgformat, imggz, imgwidthpx, - imgheightpx, [])) + imgheightpx, [], False)) else: # cheapo version to retrieve a PNG encoding of the payload is to # just save it with PIL. 
In the future this could be replaced by @@ -855,7 +899,7 @@ def read_images(rawdata, colorspace, first_frame_only=False): pngidat, palette = parse_png(pngbuffer.getvalue()) imgformat = ImageFormat.PNG result.append((color, ndpi, imgformat, pngidat, imgwidthpx, - imgheightpx, palette)) + imgheightpx, palette, False)) img_page_count += 1 # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the # close() method @@ -1164,14 +1208,14 @@ def convert(*images, **kwargs): try: with open(img, "rb") as f: rawdata = f.read() - except: + except Exception: # whatever the exception is (string could contain NUL # characters or the path could just not exist) it's not a file # name so we now try treating it as raw image content rawdata = img - for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, palette \ - in read_images( + for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, \ + palette, inverted in read_images( rawdata, kwargs['colorspace'], kwargs['first_frame_only']): pagewidth, pageheight, imgwidthpdf, imgheightpdf = \ kwargs['layout_fun'](imgwidthpx, imgheightpx, ndpi) @@ -1195,7 +1239,8 @@ def convert(*images, **kwargs): imgypdf = (pageheight - imgheightpdf)/2.0 pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat, imgdata, imgwidthpdf, imgheightpdf, imgxpdf, - imgypdf, pagewidth, pageheight, userunit, palette) + imgypdf, pagewidth, pageheight, userunit, + palette, inverted) if kwargs['outputstream']: pdf.tostream(kwargs['outputstream']) @@ -1542,7 +1587,7 @@ Fit options: Argument parsing: Argument long options can be abbreviated to a prefix if the abbreviation is - anambiguous. That is, the prefix must match a unique option. + unambiguous. That is, the prefix must match a unique option. Beware of your shell interpreting argument values as special characters (like the semicolon in the CMYK;I colorspace option). If in doubt, put the argument @@ -1667,7 +1712,7 @@ RGB.''') "to prevent decompression bomb denial of service attacks. 
If " "your input image contains more pixels than that, use this " "option to disable this safety measure during this run of img2pdf" - %Image.MAX_IMAGE_PIXELS) + % Image.MAX_IMAGE_PIXELS) sizeargs = parser.add_argument_group( title='Image and page size and layout arguments', diff --git a/src/tests/__init__.py b/src/tests/__init__.py index b1c1797..c9b85e3 100644 --- a/src/tests/__init__.py +++ b/src/tests/__init__.py @@ -592,10 +592,17 @@ def test_suite(): if imgprops.DecodeParms: if orig_img.format == 'PNG': pngidat, palette = img2pdf.parse_png(orig_imgdata) + elif orig_img.format == 'TIFF' \ + and orig_img.info['compression'] == "group4": + offset, length = \ + img2pdf.ccitt_payload_location_from_pil( + orig_img) + pngidat = orig_imgdata[offset:offset+length] else: pngbuffer = BytesIO() orig_img.save(pngbuffer, format="png") - pngidat, palette = img2pdf.parse_png(pngbuffer.getvalue()) + pngidat, palette = img2pdf.parse_png( + pngbuffer.getvalue()) self.assertEqual(zlib.decompress(pngidat), imgdata) else: colorspace = imgprops.ColorSpace @@ -607,17 +614,19 @@ def test_suite(): colorspace = 'CMYK' else: raise Exception("invalid colorspace") - im = Image.frombytes(colorspace, (int(imgprops.Width), - int(imgprops.Height)), + im = Image.frombytes(colorspace, + (int(imgprops.Width), + int(imgprops.Height)), imgdata) if orig_img.mode == '1': self.assertEqual(im.tobytes(), orig_img.convert("L").tobytes()) - elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"): + elif orig_img.mode not in ("RGB", "L", "CMYK", + "CMYK;I"): self.assertEqual(im.tobytes(), orig_img.convert("RGB").tobytes()) - # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not - # have the close() method + # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does + # not have the close() method try: im.close() except AttributeError: diff --git a/src/tests/input/mono.tif b/src/tests/input/mono.tif Binary files differindex 53e85bc..3718d52 100644 --- a/src/tests/input/mono.tif +++ 
b/src/tests/input/mono.tif diff --git a/src/tests/output/mono.tif.pdf b/src/tests/output/mono.tif.pdf Binary files differ index d23e65e..eda3ec7 100644 --- a/src/tests/output/mono.tif.pdf +++ b/src/tests/output/mono.tif.pdf diff --git a/test_comp.sh b/test_comp.sh index ae832e2..44edefd 100755 --- a/test_comp.sh +++ b/test_comp.sh @@ -16,17 +16,17 @@ for a in `convert -list compress`; do echo "encode:\t$a" convert "$1" -compress $a "`basename $1 .jpg`.pdf" pdfimages "`basename $1 .jpg`.pdf" "`basename $1 .jpg`" - /bin/echo -ne "diff:\t" + printf "diff:\t" diff=`compare -metric AE "$1" "\`basename $1 .jpg\`-000.ppm" null: 2>&1` if [ "$diff" != "0" ]; then echo "lossy" else echo "lossless" fi - /bin/echo -ne "size:\t" + printf "size:\t" pdfsize=`stat -c "%s" "\`basename $1 .jpg\`.pdf"` echo "scale=1;$pdfsize/$imsize" | bc - /bin/echo -ne "pdf:\t" + printf "pdf:\t" grep --max-count=1 --text /Filter "`basename $1 .jpg`.pdf" echo done |