Import upstream version 0.3.1

author: Johannes 'josch' Schauer <josch@debian.org> 2018-08-05 21:10:32 +0200
committer: Johannes 'josch' Schauer <josch@debian.org> 2018-08-05 21:10:32 +0200
commit: 244f64ef12075bc9ad3a396e4cb1d510800140d3 (patch)
tree: 56e18cd7dc73edf3d80fec142153e035937c0a52
parent: aef245f415aae671df75502700826d2bb682e257 (diff)
13 files changed, 722 insertions, 342 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index d4476a8..4f5bee3 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -2,42 +2,48 @@
 CHANGES
 =======
 
-0.3.0
------
+0.3.1 (2018-08-04)
+------------------
+
+ - Directly copy data from CCITT Group 4 encoded TIFF images into the PDF
+   container without re-encoding
+
+0.3.0 (2018-06-18)
+------------------
 
  - Store non-jpeg images using PNG compression
  - Support arbitrarily large pages via PDF /UserUnit field
  - Disallow input with alpha channel as it cannot be preserved
  - Add option --pillow-limit-break to support very large input
 
-0.2.4
------
+0.2.4 (2017-05-23)
+------------------
 
  - Restore support for Python 2.7
  - Add support for PyPy
  - Add support for testing using tox
 
-0.2.3
------
+0.2.3 (2017-01-20)
+------------------
 
  - version number bump for botched pypi upload...
 
-0.2.2
------
+0.2.2 (2017-01-20)
+------------------
 
  - automatic monochrome CCITT Group4 encoding via Pillow/libtiff
 
-0.2.1
------
+0.2.1 (2016-05-04)
+------------------
 
  - set img2pdf as /producer value
  - support multi-frame images like multipage TIFF and animated GIF
  - support for palette images like GIF
- - support all colorspaces and imageformats knows by PIL
+ - support all colorspaces and imageformats known by PIL
  - read horizontal and vertical dpi from JPEG2000 files
 
-0.2.0
------
+0.2.0 (2015-05-10)
+------------------
 
  - now Python3 only
  - pep8 compliant code
@@ -72,34 +78,34 @@ CHANGES
  - explicitly store date in UTC and allow parsing all date formats understood
    by dateutil and `date --date`
 
-0.1.5
------
+0.1.5 (2015-02-16)
+------------------
 
 - Enable support for CMYK images
 - Rework test suite
 - support file objects as input
 
-0.1.4
------
+0.1.4 (2015-01-21)
+------------------
 
 - add Python 3 support
 - make output reproducible by sorting and --nodate option
 
-0.1.3
------
+0.1.3 (2014-11-10)
+------------------
 
 - Avoid leaking file descriptors
 - Convert unrecognized colorspaces to RGB
 
-0.1.1
------
+0.1.1 (2014-09-07)
+------------------
 
 - allow running src/img2pdf.py standalone
 - license change from GPL to LGPL
 - Add pillow 2.4.0 support
 - add options to specify pdf dimensions in points
 
-0.1.0 (unreleased)
+0.1.0 (2014-03-14, unreleased)
 ------------------
 
 - Initial PyPI release.
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0a04128
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,165 @@
+                   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
diff --git a/MANIFEST.in b/MANIFEST.in
index 4ee2b37..9249d3f 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,7 @@
 include README.md
 include test_comp.sh
 include CHANGES.rst
+include LICENSE
 recursive-include src *.jpg
 recursive-include src *.pdf
 recursive-include src *.png
diff --git a/PKG-INFO b/PKG-INFO
index e3ecf4b..975388d 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,80 +1,61 @@
 Metadata-Version: 1.1
 Name: img2pdf
-Version: 0.3.0
+Version: 0.3.1
 Summary: Convert images to PDF via direct JPEG inclusion.
 Home-page: https://gitlab.mister-muffin.de/josch/img2pdf
 Author: Johannes 'josch' Schauer
 Author-email: josch@mister-muffin.de
 License: LGPL
-Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.3.0
+Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.3.1
 Description-Content-Type: UNKNOWN
 Description: img2pdf
         =======
         
-        Losslessly convert raster images to PDF. The file size will not unnecessarily
-        increase. It can for example be used to create a PDF document from a number of
-        scans that are only available in JPEG format. Existing solutions would either
-        re-encode the input JPEG files (leading to quality loss) or store them in the
-        zip/flate format which results into the PDF becoming unnecessarily large in
-        terms of its file size.
-        
-        Background
-        ----------
-        
-        Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to
-        PDF by embedding them into the PDF without re-encoding them. This is what
-        img2pdf does. It thus treats the PDF format merely as a container format for
-        storing one or more JPEGs or PNGs without re-encoding the images themselves.
-        
-        If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000
-        images into a PDF container without recompression, please contact me so that I
-        can put this code into the garbage bin.
-        
-        Functionality
-        -------------
-        
-        This program will take a list of raster images and produce a PDF file with the
-        images embedded in it. PNG, JPEG and JPEG2000 images will be included without
-        recompression and the resulting PDF will only be slightly larger than the input
-        images due to the overhead of the PDF container.  Raster images in other
-        formats (like gif or tif) will be included using the lossless zip/flate
-        encoding using the PNG Paeth predictor.
-        
-        As a result, this tool is able to losslessly wrap raster images into a PDF
-        container with a quality to filesize ratio that is typically better (in case of
-        JPEG and JPEG2000 images) or equal (in case of other formats) than that of
-        existing tools.
-        
-        For example, imagemagick will re-encode the input JPEG image (thus changing
-        its content):
-        
-        	$ convert img.jpg img.pdf
-        	$ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression
-        	$ compare -metric AE img.jpg img.extr-000.ppm null:
-        	1.6301e+06
-        
-        If one wants to losslessly convert from any format to PDF with
-        imagemagick, one has to use zip compression:
-        
-        	$ convert input.jpg -compress Zip output.pdf
-        	$ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression
-        	$ compare -metric AE img.jpg img.extr-000.ppm null:
-        	0
-        
-        However, this approach will result in PDF files that are a few times larger
-        than the input JPEG or JPEG2000 file.
-        
-        Furthermore, when converting PNG images, popular tools like imagemagick use
-        flate encoding without a predictor. This means, that image file size ends up
-        being several orders of magnitude larger then necessary.
-        
-        img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF
-        container without additional overhead (aside from the PDF structure itself),
-        save other graphics formats using lossless zip compression, and produce
-        multi-page PDF files when more than one input image is given.
-        
-        Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with
-        img2pdf is several times faster than with other tools.
+        Lossless conversion of raster images to PDF. You should use img2pdf if your
+        priorities are (in this order):
+        
+         1. **always lossless**: the image embedded in the PDF will always have the
+            exact same color information for every pixel as the input
+         2. **small**: if possible, the difference in filesize between the input image
+            and the output PDF will only be the overhead of the PDF container itself
+         3. **fast**: if possible, the input image is just pasted into the PDF document
+            as-is without any CPU hungry re-encoding of the pixel data
+        
+        Conventional conversion software (like ImageMagick) would either:
+        
+         1. not be lossless because of lossy re-encoding to JPEG
+         2. not be small because of wasteful flate encoding of raw pixel data
+         3. not be fast because input data gets re-encoded
+        
+        Another advantage of not having to re-encode the input (in most common
+        situations) is that img2pdf is able to handle much larger input than other
+        software, because the raw pixel data never has to be loaded into memory.
+        
+        The following table shows how img2pdf handles different input depending on the
+        input file format and image color space.
+        
+        | Format               | Colorspace                     | Result        |
+        | -------------------- | ------------------------------ | ------------- |
+        | JPEG                 | any                            | direct        |
+        | JPEG2000             | any                            | direct        |
+        | PNG (non-interlaced) | any                            | direct        |
+        | TIFF (CCITT Group 4) | monochrome                     | direct        |
+        | any                  | any except CMYK and monochrome | PNG Paeth     |
+        | any                  | monochrome                     | CCITT Group 4 |
+        | any                  | CMYK                           | flate         |
+        
+        For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4
+        encoded data, img2pdf directly embeds the image data into the PDF without
+        re-encoding it. It thus treats the PDF format merely as a container format for
+        the image data. In these cases, img2pdf only increases the filesize by the size
+        of the PDF container (typically around 500 to 700 bytes). Since data is only
+        copied and not re-encoded, img2pdf is also typically faster than other
+        solutions for these input formats.
+        
+        For all other input types, img2pdf first has to transform the pixel data to
+        make it compatible with PDF. In most cases, the PNG Paeth filter is applied to
+        the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for
+        CMYK input no filter is applied before finally applying flate compression.
         
         Usage
         -----
@@ -85,44 +66,45 @@ Description: img2pdf
         If no output file is specified with the `-o`/`--output` option, output will be
         done to stdout. A typical invocation is:
         
-        	img2pdf img1.png img2.jpg -o out.pdf
+        	$ img2pdf img1.png img2.jpg -o out.pdf
         
         The detailed documentation can be accessed by running:
         
-        	img2pdf --help
-        
+        	$ img2pdf --help
         
         Bugs
         ----
         
-        If you find a JPEG or JPEG2000 file that, when embedded cannot be read
-        by the Adobe Acrobat Reader, please contact me.
-        
-        I have not yet figured out how to determine the colorspace of JPEG2000 files.
-        Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 files with
-        other colorspaces, you must explicitly specify it using the `--colorspace`
-        option.
+         - If you find a JPEG, JPEG2000, PNG or CCITT Group 4 encoded TIFF file that,
+           when embedded into the PDF, cannot be read by the Adobe Acrobat Reader,
+           please contact me.
         
-        It might be possible to store transparency using masks but it is not clear
-        what the utility of such a functionality would be.
+         - I have not yet figured out how to determine the colorspace of JPEG2000
+           files.  Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000
+           files with other colorspaces, you must explicitly specify it using the
+           `--colorspace` option.
         
-        Most vector graphic formats can be losslessly turned into PDF (minus some of
-        the features unsupported by PDF) but img2pdf will currently turn vector
-        graphics into their lossy raster representations. For converting raster
-        graphics to PDF, use another tool like inkscape and then join the resulting
-        pages with a tool like pdftk.
+         - Input images with alpha channels are not allowed. PDF doesn't support alpha
+           channels in images and thus, the alpha channel of the input would have to be
+           discarded. But img2pdf will always be lossless and thus, input images must
+           not carry transparency information.
         
-        A configuration file could be used for default options.
+         - img2pdf uses PIL (or Pillow) to obtain image meta data and to convert the
+           input if necessary. To prevent decompression bomb denial of service attacks,
+           Pillow limits the maximum number of pixels an input image is allowed to
+           have. If you are sure that you know what you are doing, then you can disable
+           this safeguard by passing the `--pillow-limit-break` option to img2pdf. This
+           allows one to process even very large input images.
         
         Installation
         ------------
         
-        On a Debian- and Ubuntu-based systems, dependencies may be installed
-        with the following command:
+        On Debian- and Ubuntu-based systems, img2pdf can be installed from the
+        official repositories:
         
-        	apt-get install python3 python3-pil python3-setuptools
+        	$ apt install img2pdf
         
-        You can then install the package using:
+        If you want to install it using pip, you can run:
         
         	$ pip3 install img2pdf
         
@@ -176,6 +158,75 @@ Description: img2pdf
         	with open("name.pdf","wb") as f:
         		f.write(img2pdf.convert('test.jpg', layout_fun=layout_fun))
         
+        Comparison to ImageMagick
+        -------------------------
+        
+        Create a large test image:
+        
+        	$ convert logo: -resize 8000x original.jpg
+        
+        Convert it into PDF using ImageMagick and img2pdf:
+        
+        	$ time img2pdf original.jpg -o img2pdf.pdf
+        	$ time convert original.jpg imagemagick.pdf
+        
+        Notice how ImageMagick took an order of magnitude longer to do the conversion
+        than img2pdf. It also used twice the memory.
+        
+        Now extract the image data from both PDF documents and compare it to the
+        original:
+        
+        	$ pdfimages -all img2pdf.pdf tmp
+        	$ compare -metric AE original.jpg tmp-000.jpg null:
+        	0
+        	$ pdfimages -all imagemagick.pdf tmp
+        	$ compare -metric AE original.jpg tmp-000.jpg null:
+        	118716
+        
+        To get lossless output with ImageMagick we can use Zip compression but that
+        unnecessarily increases the size of the output:
+        
+        	$ convert original.jpg -compress Zip imagemagick.pdf
+        	$ pdfimages -all imagemagick.pdf tmp
+        	$ compare -metric AE original.jpg tmp-000.png null:
+        	0
+        	$ stat --format="%s %n" original.jpg img2pdf.pdf imagemagick.pdf
+        	1535837 original.jpg
+        	1536683 img2pdf.pdf
+        	9397809 imagemagick.pdf
+        
+        Comparison to pdfLaTeX
+        ----------------------
+        
+        pdfLaTeX performs a lossless conversion from included images to PDF by default.
+        If the input is a JPEG, then it simply embeds the JPEG into the PDF in the same
+        way as img2pdf does it. But for other image formats it uses flate compression
+        of the plain pixel data and thus needlessly increases the output file size:
+        
+        	$ convert logo: -resize 8000x original.png
+        	$ cat << END > pdflatex.tex
+        	\documentclass{article}
+        	\usepackage{graphicx}
+        	\begin{document}
+        	\includegraphics{original.png}
+        	\end{document}
+        	END
+        	$ pdflatex pdflatex.tex
+        	$ stat --format="%s %n" original.png pdflatex.pdf
+        	4500182 original.png
+        	9318120 pdflatex.pdf
+        
+        Comparison to Tesseract OCR
+        ---------------------------
+        
+        Tesseract OCR comes closest to the functionality img2pdf provides. It is able
+        to convert JPEG and PNG input to PDF without needlessly increasing the filesize
+        and is at the same time lossless. So if your input is JPEG and PNG images, then
+        you should safely be able to use Tesseract instead of img2pdf. For other input,
+        Tesseract might not do a lossless conversion. For example it converts CMYK
+        input to RGB and removes the alpha channel from images with transparency. For
+        multipage TIFF or animated GIF, it will only convert the first frame.
+        
 Keywords: jpeg pdf converter
 Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
diff --git a/README.md b/README.md
index 249abb8..ef25643 100644
--- a/README.md
+++ b/README.md
@@ -1,70 +1,51 @@
 img2pdf
 =======
 
-Losslessly convert raster images to PDF. The file size will not unnecessarily
-increase. It can for example be used to create a PDF document from a number of
-scans that are only available in JPEG format. Existing solutions would either
-re-encode the input JPEG files (leading to quality loss) or store them in the
-zip/flate format which results into the PDF becoming unnecessarily large in
-terms of its file size.
-
-Background
-----------
-
-Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to
-PDF by embedding them into the PDF without re-encoding them. This is what
-img2pdf does. It thus treats the PDF format merely as a container format for
-storing one or more JPEGs or PNGs without re-encoding the images themselves.
-
-If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000
-images into a PDF container without recompression, please contact me so that I
-can put this code into the garbage bin.
-
-Functionality
--------------
-
-This program will take a list of raster images and produce a PDF file with the
-images embedded in it. PNG, JPEG and JPEG2000 images will be included without
-recompression and the resulting PDF will only be slightly larger than the input
-images due to the overhead of the PDF container.  Raster images in other
-formats (like gif or tif) will be included using the lossless zip/flate
-encoding using the PNG Paeth predictor.
-
-As a result, this tool is able to losslessly wrap raster images into a PDF
-container with a quality to filesize ratio that is typically better (in case of
-JPEG and JPEG2000 images) or equal (in case of other formats) than that of
-existing tools.
-
-For example, imagemagick will re-encode the input JPEG image (thus changing
-its content):
-
-	$ convert img.jpg img.pdf
-	$ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression
-	$ compare -metric AE img.jpg img.extr-000.ppm null:
-	1.6301e+06
-
-If one wants to losslessly convert from any format to PDF with
-imagemagick, one has to use zip compression:
-
-	$ convert input.jpg -compress Zip output.pdf
-	$ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression
-	$ compare -metric AE img.jpg img.extr-000.ppm null:
-	0
-
-However, this approach will result in PDF files that are a few times larger
-than the input JPEG or JPEG2000 file.
-
-Furthermore, when converting PNG images, popular tools like imagemagick use
-flate encoding without a predictor. This means, that image file size ends up
-being several orders of magnitude larger then necessary.
-
-img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF
-container without additional overhead (aside from the PDF structure itself),
-save other graphics formats using lossless zip compression, and produce
-multi-page PDF files when more than one input image is given.
-
-Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with
-img2pdf is several times faster than with other tools.
+Lossless conversion of raster images to PDF. You should use img2pdf if your
+priorities are (in this order):
+
+ 1. **always lossless**: the image embedded in the PDF will always have the
+    exact same color information for every pixel as the input
+ 2. **small**: if possible, the difference in filesize between the input image
+    and the output PDF will only be the overhead of the PDF container itself
+ 3. **fast**: if possible, the input image is just pasted into the PDF document
+    as-is without any CPU hungry re-encoding of the pixel data
+
+Conventional conversion software (like ImageMagick) would either:
+
+ 1. not be lossless because of lossy re-encoding to JPEG
+ 2. not be small because of wasteful flate encoding of raw pixel data
+ 3. not be fast because input data gets re-encoded
+
+Another advantage of not having to re-encode the input (in most common
+situations) is that img2pdf is able to handle much larger input than other
+software, because the raw pixel data never has to be loaded into memory.
+
+The following table shows how img2pdf handles different input depending on the
+input file format and image color space.
+
+| Format               | Colorspace                     | Result        |
+| -------------------- | ------------------------------ | ------------- |
+| JPEG                 | any                            | direct        |
+| JPEG2000             | any                            | direct        |
+| PNG (non-interlaced) | any                            | direct        |
+| TIFF (CCITT Group 4) | monochrome                     | direct        |
+| any                  | any except CMYK and monochrome | PNG Paeth     |
+| any                  | monochrome                     | CCITT Group 4 |
+| any                  | CMYK                           | flate         |
+
+For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4
+encoded data, img2pdf directly embeds the image data into the PDF without
+re-encoding it. It thus treats the PDF format merely as a container format for
+the image data. In these cases, img2pdf only increases the filesize by the size
+of the PDF container (typically around 500 to 700 bytes). Since data is only
+copied and not re-encoded, img2pdf is also typically faster than other
+solutions for these input formats.
+
+For all other input types, img2pdf first has to transform the pixel data to
+make it compatible with PDF. In most cases, the PNG Paeth filter is applied to
+the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for
+CMYK input no filter is applied before finally applying flate compression.
 
 Usage
 -----
@@ -75,44 +56,45 @@ descriptor.
 If no output file is specified with the `-o`/`--output` option, output will be
 done to stdout. A typical invocation is:
 
-	img2pdf img1.png img2.jpg -o out.pdf
+	$ img2pdf img1.png img2.jpg -o out.pdf
 
 The detailed documentation can be accessed by running:
 
-	img2pdf --help
-
+	$ img2pdf --help
 
 Bugs
 ----
 
-If you find a JPEG or JPEG2000 file that, when embedded cannot be read
-by the Adobe Acrobat Reader, please contact me.
-
-I have not yet figured out how to determine the colorspace of JPEG2000 files.
-Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 files with
-other colorspaces, you must explicitly specify it using the `--colorspace`
-option.
+ - If you find a JPEG, JPEG2000, PNG or CCITT Group 4 encoded TIFF file that,
+   when embedded into the PDF, cannot be read by the Adobe Acrobat Reader,
+   please contact me.
 
-It might be possible to store transparency using masks but it is not clear
-what the utility of such a functionality would be.
+ - I have not yet figured out how to determine the colorspace of JPEG2000
+   files.  Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000
+   files with other colorspaces, you must explicitly specify it using the
+   `--colorspace` option.
 
-Most vector graphic formats can be losslessly turned into PDF (minus some of
-the features unsupported by PDF) but img2pdf will currently turn vector
-graphics into their lossy raster representations. For converting raster
-graphics to PDF, use another tool like inkscape and then join the resulting
-pages with a tool like pdftk.
+ - Input images with alpha channels are not allowed. PDF doesn't support alpha
+   channels in images and thus, the alpha channel of the input would have to be
+   discarded. But img2pdf will always be lossless and thus, input images must
+   not carry transparency information.
 
-A configuration file could be used for default options.
+ - img2pdf uses PIL (or Pillow) to obtain image meta data and to convert the
+   input if necessary. To prevent decompression bomb denial of service attacks,
+   Pillow limits the maximum number of pixels an input image is allowed to
+   have. If you are sure that you know what you are doing, then you can disable
+   this safeguard by passing the `--pillow-limit-break` option to img2pdf. This
+   allows one to process even very large input images.
 
 Installation
 ------------
 
-On a Debian- and Ubuntu-based systems, dependencies may be installed
-with the following command:
+On Debian- and Ubuntu-based systems, img2pdf can be installed from the
+official repositories:
 
-	apt-get install python3 python3-pil python3-setuptools
+	$ apt install img2pdf
 
-You can then install the package using:
+If you want to install it using pip, you can run:
 
 	$ pip3 install img2pdf
 
@@ -165,3 +147,72 @@ The package can also be used as a library:
 	layout_fun = img2pdf.get_layout_fun(a4inpt)
 	with open("name.pdf","wb") as f:
 		f.write(img2pdf.convert('test.jpg', layout_fun=layout_fun))
+
+Comparison to ImageMagick
+-------------------------
+
+Create a large test image:
+
+	$ convert logo: -resize 8000x original.jpg
+
+Convert it into PDF using ImageMagick and img2pdf:
+
+	$ time img2pdf original.jpg -o img2pdf.pdf
+	$ time convert original.jpg imagemagick.pdf
+
+Notice how ImageMagick took an order of magnitude longer to do the conversion
+than img2pdf. It also used twice the memory.
+
+Now extract the image data from both PDF documents and compare it to the
+original:
+
+	$ pdfimages -all img2pdf.pdf tmp
+	$ compare -metric AE original.jpg tmp-000.jpg null:
+	0
+	$ pdfimages -all imagemagick.pdf tmp
+	$ compare -metric AE original.jpg tmp-000.jpg null:
+	118716
+
+To get lossless output with ImageMagick we can use Zip compression but that
+unnecessarily increases the size of the output:
+
+	$ convert original.jpg -compress Zip imagemagick.pdf
+	$ pdfimages -all imagemagick.pdf tmp
+	$ compare -metric AE original.jpg tmp-000.png null:
+	0
+	$ stat --format="%s %n" original.jpg img2pdf.pdf imagemagick.pdf
+	1535837 original.jpg
+	1536683 img2pdf.pdf
+	9397809 imagemagick.pdf
+
+Comparison to pdfLaTeX
+----------------------
+
+pdfLaTeX performs a lossless conversion from included images to PDF by default.
+If the input is a JPEG, then it simply embeds the JPEG into the PDF in the same
+way as img2pdf does it. But for other image formats it uses flate compression
+of the plain pixel data and thus needlessly increases the output file size:
+
+	$ convert logo: -resize 8000x original.png
+	$ cat << END > pdflatex.tex
+	\documentclass{article}
+	\usepackage{graphicx}
+	\begin{document}
+	\includegraphics{original.png}
+	\end{document}
+	END
+	$ pdflatex pdflatex.tex
+	$ stat --format="%s %n" original.png pdflatex.pdf
+	4500182 original.png
+	9318120 pdflatex.pdf
+
+Comparison to Tesseract OCR
+---------------------------
+
+Tesseract OCR comes closest to the functionality img2pdf provides. It is able
+to convert JPEG and PNG input to PDF without needlessly increasing the filesize
+and is at the same time lossless. So if your input is JPEG and PNG images, then
+you should safely be able to use Tesseract instead of img2pdf. For other input,
+Tesseract might not do a lossless conversion. For example it converts CMYK
+input to RGB and removes the alpha channel from images with transparency. For
+multipage TIFF or animated GIF, it will only convert the first frame.
diff --git a/setup.py b/setup.py
index 56e9c4c..cc56301 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ from setuptools import setup
 
 PY3 = sys.version_info[0] >= 3
 
-VERSION = "0.3.0"
+VERSION = "0.3.1"
 
 INSTALL_REQUIRES = (
     'Pillow',
diff --git a/src/img2pdf.egg-info/PKG-INFO b/src/img2pdf.egg-info/PKG-INFO
index e3ecf4b..975388d 100644
--- a/src/img2pdf.egg-info/PKG-INFO
+++ b/src/img2pdf.egg-info/PKG-INFO
@@ -1,80 +1,61 @@
 Metadata-Version: 1.1
 Name: img2pdf
-Version: 0.3.0
+Version: 0.3.1
 Summary: Convert images to PDF via direct JPEG inclusion.
 Home-page: https://gitlab.mister-muffin.de/josch/img2pdf
 Author: Johannes 'josch' Schauer
 Author-email: josch@mister-muffin.de
 License: LGPL
-Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.3.0
+Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.3.1
 Description-Content-Type: UNKNOWN
 Description: img2pdf
         =======
         
-        Losslessly convert raster images to PDF. The file size will not unnecessarily
-        increase. It can for example be used to create a PDF document from a number of
-        scans that are only available in JPEG format. Existing solutions would either
-        re-encode the input JPEG files (leading to quality loss) or store them in the
-        zip/flate format which results into the PDF becoming unnecessarily large in
-        terms of its file size.
-        
-        Background
-        ----------
-        
-        Quality loss can be avoided when converting PNG, JPEG and JPEG2000 images to
-        PDF by embedding them into the PDF without re-encoding them. This is what
-        img2pdf does. It thus treats the PDF format merely as a container format for
-        storing one or more JPEGs or PNGs without re-encoding the images themselves.
-        
-        If you know an existing tool which allows one to embed PNG, JPEG and JPEG2000
-        images into a PDF container without recompression, please contact me so that I
-        can put this code into the garbage bin.
-        
-        Functionality
-        -------------
-        
-        This program will take a list of raster images and produce a PDF file with the
-        images embedded in it. PNG, JPEG and JPEG2000 images will be included without
-        recompression and the resulting PDF will only be slightly larger than the input
-        images due to the overhead of the PDF container.  Raster images in other
-        formats (like gif or tif) will be included using the lossless zip/flate
-        encoding using the PNG Paeth predictor.
-        
-        As a result, this tool is able to losslessly wrap raster images into a PDF
-        container with a quality to filesize ratio that is typically better (in case of
-        JPEG and JPEG2000 images) or equal (in case of other formats) than that of
-        existing tools.
-        
-        For example, imagemagick will re-encode the input JPEG image (thus changing
-        its content):
-        
-        	$ convert img.jpg img.pdf
-        	$ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression
-        	$ compare -metric AE img.jpg img.extr-000.ppm null:
-        	1.6301e+06
-        
-        If one wants to losslessly convert from any format to PDF with
-        imagemagick, one has to use zip compression:
-        
-        	$ convert input.jpg -compress Zip output.pdf
-        	$ pdfimages img.pdf img.extr # not using -j to be extra sure there is no recompression
-        	$ compare -metric AE img.jpg img.extr-000.ppm null:
-        	0
-        
-        However, this approach will result in PDF files that are a few times larger
-        than the input JPEG or JPEG2000 file.
-        
-        Furthermore, when converting PNG images, popular tools like imagemagick use
-        flate encoding without a predictor. This means, that image file size ends up
-        being several orders of magnitude larger then necessary.
-        
-        img2pdf is able to losslessly embed PNG, JPEG and JPEG2000 files into a PDF
-        container without additional overhead (aside from the PDF structure itself),
-        save other graphics formats using lossless zip compression, and produce
-        multi-page PDF files when more than one input image is given.
-        
-        Also, since PNG, JPEG and JPEG2000 images are not reencoded, conversion with
-        img2pdf is several times faster than with other tools.
+        Lossless conversion of raster images to PDF. You should use img2pdf if your
+        priorities are (in this order):
+        
+         1. **always lossless**: the image embedded in the PDF will always have the
+            exact same color information for every pixel as the input
+         2. **small**: if possible, the difference in filesize between the input image
+            and the output PDF will only be the overhead of the PDF container itself
+         3. **fast**: if possible, the input image is just pasted into the PDF document
+            as-is without any CPU hungry re-encoding of the pixel data
+        
+        Conventional conversion software (like ImageMagick) would either:
+        
+         1. not be lossless because of lossy re-encoding to JPEG
+         2. not be small because of wasteful flate encoding of raw pixel data
+         3. not be fast because input data gets re-encoded
+        
+        Another advantage of not having to re-encode the input (in most common
+        situations) is that img2pdf is able to handle much larger input than other
+        software, because the raw pixel data never has to be loaded into memory.
+        
+        The following table shows how img2pdf handles different input depending on the
+        input file format and image color space.
+        
+        | Format               | Colorspace                     | Result        |
+        | -------------------- | ------------------------------ | ------------- |
+        | JPEG                 | any                            | direct        |
+        | JPEG2000             | any                            | direct        |
+        | PNG (non-interlaced) | any                            | direct        |
+        | TIFF (CCITT Group 4) | monochrome                     | direct        |
+        | any                  | any except CMYK and monochrome | PNG Paeth     |
+        | any                  | monochrome                     | CCITT Group 4 |
+        | any                  | CMYK                           | flate         |
+        
+        For JPEG, JPEG2000, non-interlaced PNG and TIFF images with CCITT Group 4
+        encoded data, img2pdf directly embeds the image data into the PDF without
+        re-encoding it. It thus treats the PDF format merely as a container format for
+        the image data. In these cases, img2pdf only increases the filesize by the size
+        of the PDF container (typically around 500 to 700 bytes). Since data is only
+        copied and not re-encoded, img2pdf is also typically faster than other
+        solutions for these input formats.
+        
+        For all other input types, img2pdf first has to transform the pixel data to
+        make it compatible with PDF. In most cases, the PNG Paeth filter is applied to
+        the pixel data. For monochrome input, CCITT Group 4 is used instead. Only for
+        CMYK input no filter is applied before finally applying flate compression.
         
         Usage
         -----
@@ -85,44 +66,45 @@ Description: img2pdf
         If no output file is specified with the `-o`/`--output` option, output will be
         done to stdout. A typical invocation is:
         
-        	img2pdf img1.png img2.jpg -o out.pdf
+        	$ img2pdf img1.png img2.jpg -o out.pdf
         
         The detailed documentation can be accessed by running:
         
-        	img2pdf --help
-        
+        	$ img2pdf --help
         
         Bugs
         ----
         
-        If you find a JPEG or JPEG2000 file that, when embedded cannot be read
-        by the Adobe Acrobat Reader, please contact me.
-        
-        I have not yet figured out how to determine the colorspace of JPEG2000 files.
-        Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000 files with
-        other colorspaces, you must explicitly specify it using the `--colorspace`
-        option.
+         - If you find a JPEG, JPEG2000, PNG or CCITT Group 4 encoded TIFF file that,
+           when embedded into the PDF cannot be read by the Adobe Acrobat Reader,
+           please contact me.
         
-        It might be possible to store transparency using masks but it is not clear
-        what the utility of such a functionality would be.
+         - I have not yet figured out how to determine the colorspace of JPEG2000
+           files.  Therefore JPEG2000 files use DeviceRGB by default. For JPEG2000
+           files with other colorspaces, you must explicitly specify it using the
+           `--colorspace` option.
         
-        Most vector graphic formats can be losslessly turned into PDF (minus some of
-        the features unsupported by PDF) but img2pdf will currently turn vector
-        graphics into their lossy raster representations. For converting raster
-        graphics to PDF, use another tool like inkscape and then join the resulting
-        pages with a tool like pdftk.
+         - Input images with alpha channels are not allowed. PDF doesn't support alpha
+           channels in images and thus, the alpha channel of the input would have to be
+           discarded. But img2pdf will always be lossless and thus, input images must
+           not carry transparency information.
         
-        A configuration file could be used for default options.
+         - img2pdf uses PIL (or Pillow) to obtain image meta data and to convert the
+           input if necessary. To prevent decompression bomb denial of service attacks,
+           Pillow limits the maximum number of pixels an input image is allowed to
+           have. If you are sure that you know what you are doing, then you can disable
+           this safeguard by passing the `--pillow-limit-break` option to img2pdf. This
+           allows one to process even very large input images.
         
         Installation
         ------------
         
-        On a Debian- and Ubuntu-based systems, dependencies may be installed
-        with the following command:
+        On Debian- and Ubuntu-based systems, img2pdf can be installed from the
+        official repositories:
         
-        	apt-get install python3 python3-pil python3-setuptools
+        	$ apt install img2pdf
         
-        You can then install the package using:
+        If you want to install it using pip, you can run:
         
         	$ pip3 install img2pdf
         
@@ -176,6 +158,75 @@ Description: img2pdf
         	with open("name.pdf","wb") as f:
         		f.write(img2pdf.convert('test.jpg', layout_fun=layout_fun))
         
+        Comparison to ImageMagick
+        -------------------------
+        
+        Create a large test image:
+        
+        	$ convert logo: -resize 8000x original.jpg
+        
+        Convert it into PDF using ImageMagick and img2pdf:
+        
+        	$ time img2pdf original.jpg -o img2pdf.pdf
+        	$ time convert original.jpg imagemagick.pdf
+        
+        Notice how ImageMagick took an order of magnitude longer to do the conversion
+        than img2pdf. It also used twice the memory.
+        
+        Now extract the image data from both PDF documents and compare it to the
+        original:
+        
+        	$ pdfimages -all img2pdf.pdf tmp
+        	$ compare -metric AE original.jpg tmp-000.jpg null:
+        	0
+        	$ pdfimages -all imagemagick.pdf tmp
+        	$ compare -metric AE original.jpg tmp-000.jpg null:
+        	118716
+        
+        To get lossless output with ImageMagick we can use Zip compression but that
+        unnecessarily increases the size of the output:
+        
+        	$ convert original.jpg -compress Zip imagemagick.pdf
+        	$ pdfimages -all imagemagick.pdf tmp
+        	$ compare -metric AE original.jpg tmp-000.png null:
+        	0
+        	$ stat --format="%s %n" original.jpg img2pdf.pdf imagemagick.pdf
+        	1535837 original.jpg
+        	1536683 img2pdf.pdf
+        	9397809 imagemagick.pdf
+        
+        Comparison to pdfLaTeX
+        ----------------------
+        
+        pdfLaTeX performs a lossless conversion from included images to PDF by default.
+        If the input is a JPEG, then it simply embeds the JPEG into the PDF in the same
+        way as img2pdf does it. But for other image formats it uses flate compression
+        of the plain pixel data and thus needlessly increases the output file size:
+        
+        	$ convert logo: -resize 8000x original.png
+        	$ cat << END > pdflatex.tex
+        	\documentclass{article}
+        	\usepackage{graphicx}
+        	\begin{document}
+        	\includegraphics{original.png}
+        	\end{document}
+        	END
+        	$ pdflatex pdflatex.tex
+        	$ stat --format="%s %n" original.png pdflatex.pdf
+        	4500182 original.png
+        	9318120 pdflatex.pdf
+        
+        Comparison to Tesseract OCR
+        ---------------------------
+        
+        Tesseract OCR comes closest to the functionality img2pdf provides. It is able
+        to convert JPEG and PNG input to PDF without needlessly increasing the filesize
+        and is at the same time lossless. So if your input is JPEG and PNG images, then
+        you should safely be able to use Tesseract instead of img2pdf. For other input,
+        Tesseract might not do a lossless conversion. For example it converts CMYK
+        input to RGB and removes the alpha channel from images with transparency. For
+        multipage TIFF or animated GIF, it will only convert the first frame.
+        
 Keywords: jpeg pdf converter
 Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
diff --git a/src/img2pdf.egg-info/SOURCES.txt b/src/img2pdf.egg-info/SOURCES.txt
index ae6e816..3271401 100644
--- a/src/img2pdf.egg-info/SOURCES.txt
+++ b/src/img2pdf.egg-info/SOURCES.txt
@@ -1,4 +1,5 @@
 CHANGES.rst
+LICENSE
 MANIFEST.in
 README.md
 setup.cfg
diff --git a/src/img2pdf.py b/src/img2pdf.py
index 48ef964..7c1978e 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -22,7 +22,7 @@ import sys
 import os
 import zlib
 import argparse
-from PIL import Image
+from PIL import Image, TiffImagePlugin
 from datetime import datetime
 from jp2 import parsejp2
 from enum import Enum
@@ -32,7 +32,7 @@ import struct
 
 PY3 = sys.version_info[0] >= 3
 
-__version__ = "0.3.0"
+__version__ = "0.3.1"
 default_dpi = 96.0
 papersizes = {
     "letter": "8.5inx11in",
@@ -62,7 +62,7 @@ PageOrientation = Enum('PageOrientation', 'portrait landscape')
 
 Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other')
 
-ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG other')
+ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 PNG TIFF other')
 
 PageMode = Enum('PageMode', 'none outlines thumbs')
 
@@ -277,7 +277,8 @@ if PY3:
         @classmethod
         def encode(cls, string, hextype=False):
             if hextype:
-                return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >'
+                return b'< ' + b' '.join(
+                        ("%06x" % c).encode('ascii') for c in string) + b' >'
             else:
                 try:
                     string = string.encode('ascii')
@@ -292,7 +293,8 @@ else:
         @classmethod
         def encode(cls, string, hextype=False):
             if hextype:
-                return b'< ' + b' '.join(("%06x"%c).encode('ascii') for c in string) + b' >'
+                return b'< ' + b' '.join(
+                        ("%06x" % c).encode('ascii') for c in string) + b' >'
             else:
                 # This mimics exactely to what pdfrw does.
                 string = string.replace(b'\\', b'\\\\')
@@ -374,7 +376,7 @@ class pdfdoc(object):
 
     def add_imagepage(self, color, imgwidthpx, imgheightpx, imgformat, imgdata,
                       imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth,
-                      pageheight, userunit=None, palette=None):
+                      pageheight, userunit=None, palette=None, inverted=False):
         if self.with_pdfrw:
             from pdfrw import PdfDict, PdfName, PdfObject, PdfString
             from pdfrw.py23_diffs import convert_load
@@ -393,8 +395,11 @@ class pdfdoc(object):
             colorspace = PdfName.DeviceCMYK
         elif color == Colorspace.P:
             if self.with_pdfrw:
-                raise Exception("pdfrw does not support hex strings for palette image input, re-run with --without-pdfrw")
-            colorspace = [ PdfName.Indexed, PdfName.DeviceRGB, len(palette)-1, PdfString.encode(palette, hextype=True)]
+                raise Exception("pdfrw does not support hex strings for "
+                                "palette image input, re-run with "
+                                "--without-pdfrw")
+            colorspace = [PdfName.Indexed, PdfName.DeviceRGB, len(palette)-1,
+                          PdfString.encode(palette, hextype=True)]
         else:
             raise UnsupportedColorspaceError("unsupported color space: %s"
                                              % color.name)
@@ -440,15 +445,20 @@ class pdfdoc(object):
 
         if imgformat is ImageFormat.CCITTGroup4:
             decodeparms = PdfDict()
+            # The default for the K parameter is 0 which indicates Group 3 1-D
+            # encoding. We set it to -1 because we want Group 4 encoding.
             decodeparms[PdfName.K] = -1
-            decodeparms[PdfName.BlackIs1] = PdfObject('true')
+            if inverted:
+                decodeparms[PdfName.BlackIs1] = PdfObject('false')
+            else:
+                decodeparms[PdfName.BlackIs1] = PdfObject('true')
             decodeparms[PdfName.Columns] = imgwidthpx
             decodeparms[PdfName.Rows] = imgheightpx
             image[PdfName.DecodeParms] = [decodeparms]
         elif imgformat is ImageFormat.PNG:
             decodeparms = PdfDict()
             decodeparms[PdfName.Predictor] = 15
-            if color in [ Colorspace.P, Colorspace['1'], Colorspace.L ]:
+            if color in [Colorspace.P, Colorspace['1'], Colorspace.L]:
                 decodeparms[PdfName.Colors] = 1
             else:
                 decodeparms[PdfName.Colors] = 3
@@ -642,13 +652,14 @@ def get_imgmetadata(imgdata, imgformat, default_dpi, colorspace, rawdata=None):
         ics = imgdata.mode
 
     if ics in ["LA", "PA", "RGBA"]:
-        logging.warning("Image contains transparency which cannot be retained in PDF.")
+        logging.warning("Image contains transparency which cannot be retained "
+                        "in PDF.")
         logging.warning("img2pdf will not perform a lossy operation.")
         logging.warning("You can remove the alpha channel using imagemagick:")
-        logging.warning("  $ convert input.png -background white -alpha remove -alpha off output.png")
+        logging.warning("  $ convert input.png -background white -alpha "
+                        "remove -alpha off output.png")
         raise Exception("Refusing to work on images with alpha channel")
 
-
     # Since commit 07a96209597c5e8dfe785c757d7051ce67a980fb or release 4.1.0
     # Pillow retrieves the DPI from EXIF if it cannot find the DPI in the JPEG
     # header. In that case it can happen that the horizontal and vertical DPI
@@ -685,11 +696,33 @@ def get_imgmetadata(imgdata, imgformat, default_dpi, colorspace, rawdata=None):
     return (color, ndpi, imgwidthpx, imgheightpx)
 
 
+def ccitt_payload_location_from_pil(img):
+    # If Pillow is passed an invalid compression argument it will ignore it;
+    # make sure the image actually got compressed.
+    if img.info['compression'] != 'group4':
+        raise ValueError("Image not compressed with CCITT Group 4 but with: %s"
+                         % img.info['compression'])
+
+    # Read the TIFF tags to find the offset(s) of the compressed data strips.
+    strip_offsets = img.tag_v2[TiffImagePlugin.STRIPOFFSETS]
+    strip_bytes = img.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS]
+    rows_per_strip = img.tag_v2[TiffImagePlugin.ROWSPERSTRIP]
+
+    # PIL always seems to create a single strip even for very large TIFFs when
+    # it saves images, so assume we only have to read a single strip.
+    # A test ~10 GPixel image was still encoded as a single strip. Just to be
+    # safe, throw an error if there is more than one offset.
+    if len(strip_offsets) != 1 or len(strip_bytes) != 1:
+        raise NotImplementedError("Transcoding multiple strips not supported")
+
+    (offset, ), (length, ) = strip_offsets, strip_bytes
+
+    return offset, length
+
+
 def transcode_monochrome(imgdata):
     """Convert the open PIL.Image imgdata to compressed CCITT Group4 data"""
 
-    from PIL import TiffImagePlugin
-
     logging.debug("Converting monochrome to CCITT Group4")
 
     # Convert the image to Group 4 in memory. If libtiff is not installed and
@@ -707,27 +740,11 @@ def transcode_monochrome(imgdata):
     newimgio.seek(0)
     newimg = Image.open(newimgio)
 
-    # If Pillow is passed an invalid compression argument it will ignore it;
-    # make sure the image actually got compressed.
-    if newimg.info['compression'] != 'group4':
-        raise ValueError("Image not compressed as expected")
-
-    # Read the TIFF tags to find the offset(s) of the compressed data strips.
-    strip_offsets = newimg.tag_v2[TiffImagePlugin.STRIPOFFSETS]
-    strip_bytes = newimg.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS]
-    rows_per_strip = newimg.tag_v2[TiffImagePlugin.ROWSPERSTRIP]
-
-    # PIL always seems to create a single strip even for very large TIFFs when
-    # it saves images, so assume we only have to read a single strip.
-    # A test ~10 GPixel image was still encoded as a single strip. Just to be
-    # safe check throw an error if there is more than one offset.
-    if len(strip_offsets) > 1:
-        raise NotImplementedError("Transcoding multiple strips not supported")
+    offset, length = ccitt_payload_location_from_pil(newimg)
 
-    newimgio.seek(strip_offsets[0])
-    ccittdata = newimgio.read(strip_bytes[0])
+    newimgio.seek(offset)
+    return newimgio.read(length)
 
-    return ccittdata
 
 def parse_png(rawdata):
     pngidat = b""
@@ -737,18 +754,20 @@ def parse_png(rawdata):
         # once we can require Python >= 3.2 we can use int.from_bytes() instead
         n, = struct.unpack('>I', rawdata[i-8:i-4])
         if i + n > len(rawdata):
-            raise Exception("invalid png: %d %d %d"%(i, n, len(rawdata)))
+            raise Exception("invalid png: %d %d %d" % (i, n, len(rawdata)))
         if rawdata[i-4:i] == b"IDAT":
             pngidat += rawdata[i:i+n]
         elif rawdata[i-4:i] == b"PLTE":
             for j in range(i, i+n, 3):
-                # with int.from_bytes() we would not have to prepend extra zeroes
+                # with int.from_bytes() we would not have to prepend extra
+                # zeroes
                 color, = struct.unpack('>I', b'\x00'+rawdata[j:j+3])
                 palette.append(color)
         i += n
         i += 12
     return pngidat, palette
 
+
 def read_images(rawdata, colorspace, first_frame_only=False):
     im = BytesIO(rawdata)
     im.seek(0)
@@ -786,7 +805,8 @@ def read_images(rawdata, colorspace, first_frame_only=False):
         if color == Colorspace['RGBA']:
             raise JpegColorspaceError("jpeg can't have an alpha channel")
         im.close()
-        return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [])]
+        return [(color, ndpi, imgformat, rawdata, imgwidthpx, imgheightpx, [],
+                 False)]
 
     # We can directly embed the IDAT chunk of PNG images if the PNG is not
     # interlaced
@@ -799,7 +819,30 @@ def read_images(rawdata, colorspace, first_frame_only=False):
         color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
                 imgdata, imgformat, default_dpi, colorspace, rawdata)
         pngidat, palette = parse_png(rawdata)
-        return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx, palette)]
+        im.close()
+        return [(color, ndpi, imgformat, pngidat, imgwidthpx, imgheightpx,
+                 palette, False)]
+
+    # We can directly copy the data out of a CCITT Group 4 encoded TIFF, if it
+    # only contains a single strip
+    if imgformat == ImageFormat.TIFF \
+            and imgdata.info['compression'] == "group4" \
+            and len(imgdata.tag_v2[TiffImagePlugin.STRIPOFFSETS]) == 1:
+        photo = imgdata.tag_v2[TiffImagePlugin.PHOTOMETRIC_INTERPRETATION]
+        inverted = False
+        if photo == 0:
+            inverted = True
+        elif photo != 1:
+            raise ValueError("unsupported photometric interpretation for "
+                             "group4 tiff: %d" % photo)
+        color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
+                imgdata, imgformat, default_dpi, colorspace, rawdata)
+        offset, length = ccitt_payload_location_from_pil(imgdata)
+        im.seek(offset)
+        rawdata = im.read(length)
+        im.close()
+        return [(color, ndpi, ImageFormat.CCITTGroup4, rawdata, imgwidthpx,
+                 imgheightpx, [], inverted)]
 
     # Everything else has to be encoded
 
@@ -826,7 +869,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
                 ccittdata = transcode_monochrome(imgdata)
                 imgformat = ImageFormat.CCITTGroup4
                 result.append((color, ndpi, imgformat, ccittdata,
-                               imgwidthpx, imgheightpx, []))
+                               imgwidthpx, imgheightpx, [], False))
                 img_page_count += 1
                 continue
             except Exception as e:
@@ -839,13 +882,14 @@ def read_images(rawdata, colorspace, first_frame_only=False):
             logging.debug("Colorspace is OK: %s", color)
             newimg = imgdata
         else:
-            raise ValueError("unknown or unsupported colorspace: %s" % color.name)
+            raise ValueError("unknown or unsupported colorspace: %s"
+                             % color.name)
         # the PNG format does not support CMYK, so we fall back to normal
         # compression
         if color in [Colorspace.CMYK, Colorspace["CMYK;I"]]:
             imggz = zlib.compress(newimg.tobytes())
             result.append((color, ndpi, imgformat, imggz, imgwidthpx,
-                           imgheightpx, []))
+                           imgheightpx, [], False))
         else:
             # cheapo version to retrieve a PNG encoding of the payload is to
             # just save it with PIL. In the future this could be replaced by
@@ -855,7 +899,7 @@ def read_images(rawdata, colorspace, first_frame_only=False):
             pngidat, palette = parse_png(pngbuffer.getvalue())
             imgformat = ImageFormat.PNG
             result.append((color, ndpi, imgformat, pngidat, imgwidthpx,
-                           imgheightpx, palette))
+                           imgheightpx, palette, False))
         img_page_count += 1
     # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not have the
     # close() method
@@ -1164,14 +1208,14 @@ def convert(*images, **kwargs):
             try:
                 with open(img, "rb") as f:
                     rawdata = f.read()
-            except:
+            except Exception:
                 # whatever the exception is (string could contain NUL
                 # characters or the path could just not exist) it's not a file
                 # name so we now try treating it as raw image content
                 rawdata = img
 
-        for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, palette \
-                in read_images(
+        for color, ndpi, imgformat, imgdata, imgwidthpx, imgheightpx, \
+                palette, inverted in read_images(
                     rawdata, kwargs['colorspace'], kwargs['first_frame_only']):
             pagewidth, pageheight, imgwidthpdf, imgheightpdf = \
                 kwargs['layout_fun'](imgwidthpx, imgheightpx, ndpi)
@@ -1195,7 +1239,8 @@ def convert(*images, **kwargs):
             imgypdf = (pageheight - imgheightpdf)/2.0
             pdf.add_imagepage(color, imgwidthpx, imgheightpx, imgformat,
                               imgdata, imgwidthpdf, imgheightpdf, imgxpdf,
-                              imgypdf, pagewidth, pageheight, userunit, palette)
+                              imgypdf, pagewidth, pageheight, userunit,
+                              palette, inverted)
 
     if kwargs['outputstream']:
         pdf.tostream(kwargs['outputstream'])
@@ -1542,7 +1587,7 @@ Fit options:
 
 Argument parsing:
   Argument long options can be abbreviated to a prefix if the abbreviation is
-  anambiguous. That is, the prefix must match a unique option.
+  unambiguous. That is, the prefix must match a unique option.
 
   Beware of your shell interpreting argument values as special characters (like
   the semicolon in the CMYK;I colorspace option). If in doubt, put the argument
@@ -1667,7 +1712,7 @@ RGB.''')
              "to prevent decompression bomb denial of service attacks. If "
              "your input image contains more pixels than that, use this "
              "option to disable this safety measure during this run of img2pdf"
-             %Image.MAX_IMAGE_PIXELS)
+             % Image.MAX_IMAGE_PIXELS)
 
     sizeargs = parser.add_argument_group(
         title='Image and page size and layout arguments',
diff --git a/src/tests/__init__.py b/src/tests/__init__.py
index b1c1797..c9b85e3 100644
--- a/src/tests/__init__.py
+++ b/src/tests/__init__.py
@@ -592,10 +592,17 @@ def test_suite():
                     if imgprops.DecodeParms:
                         if orig_img.format == 'PNG':
                             pngidat, palette = img2pdf.parse_png(orig_imgdata)
+                        elif orig_img.format == 'TIFF' \
+                                and orig_img.info['compression'] == "group4":
+                            offset, length = \
+                                    img2pdf.ccitt_payload_location_from_pil(
+                                            orig_img)
+                            pngidat = orig_imgdata[offset:offset+length]
                         else:
                             pngbuffer = BytesIO()
                             orig_img.save(pngbuffer, format="png")
-                            pngidat, palette = img2pdf.parse_png(pngbuffer.getvalue())
+                            pngidat, palette = img2pdf.parse_png(
+                                    pngbuffer.getvalue())
                         self.assertEqual(zlib.decompress(pngidat), imgdata)
                     else:
                         colorspace = imgprops.ColorSpace
@@ -607,17 +614,19 @@ def test_suite():
                             colorspace = 'CMYK'
                         else:
                             raise Exception("invalid colorspace")
-                        im = Image.frombytes(colorspace, (int(imgprops.Width),
-                                                          int(imgprops.Height)),
+                        im = Image.frombytes(colorspace,
+                                             (int(imgprops.Width),
+                                              int(imgprops.Height)),
                                              imgdata)
                         if orig_img.mode == '1':
                             self.assertEqual(im.tobytes(),
                                              orig_img.convert("L").tobytes())
-                        elif orig_img.mode not in ("RGB", "L", "CMYK", "CMYK;I"):
+                        elif orig_img.mode not in ("RGB", "L", "CMYK",
+                                                   "CMYK;I"):
                             self.assertEqual(im.tobytes(),
                                              orig_img.convert("RGB").tobytes())
-                        # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does not
-                        # have the close() method
+                        # the python-pil version 2.3.0-1ubuntu3 in Ubuntu does
+                        # not have the close() method
                         try:
                             im.close()
                         except AttributeError:
diff --git a/src/tests/input/mono.tif b/src/tests/input/mono.tif
index 53e85bc..3718d52 100644
--- a/src/tests/input/mono.tif
+++ b/src/tests/input/mono.tif
diff --git a/src/tests/output/mono.tif.pdf b/src/tests/output/mono.tif.pdf
index d23e65e..eda3ec7 100644
--- a/src/tests/output/mono.tif.pdf
+++ b/src/tests/output/mono.tif.pdf
diff --git a/test_comp.sh b/test_comp.sh
index ae832e2..44edefd 100755
--- a/test_comp.sh
+++ b/test_comp.sh
@@ -16,17 +16,17 @@ for a in `convert -list compress`; do
 	echo "encode:\t$a"
 	convert "$1" -compress $a "`basename $1 .jpg`.pdf"
 	pdfimages "`basename $1 .jpg`.pdf" "`basename $1 .jpg`"
-	/bin/echo -ne "diff:\t"
+	printf "diff:\t"
 	diff=`compare -metric AE "$1" "\`basename $1 .jpg\`-000.ppm" null: 2>&1`
 	if [ "$diff" != "0" ]; then
 		echo "lossy"
 	else
 		echo "lossless"
 	fi
-	/bin/echo -ne "size:\t"
+	printf "size:\t"
 	pdfsize=`stat -c "%s" "\`basename $1 .jpg\`.pdf"`
 	echo "scale=1;$pdfsize/$imsize" | bc
-	/bin/echo -ne "pdf:\t"
+	printf "pdf:\t"
 	grep --max-count=1 --text /Filter "`basename $1 .jpg`.pdf"
 	echo
 done
author	Johannes 'josch' Schauer <josch@debian.org>	2018-08-05 21:10:32 +0200
committer	Johannes 'josch' Schauer <josch@debian.org>	2018-08-05 21:10:32 +0200
commit	244f64ef12075bc9ad3a396e4cb1d510800140d3 (patch)
tree	56e18cd7dc73edf3d80fec142153e035937c0a52
parent	aef245f415aae671df75502700826d2bb682e257 (diff)