Import upstream version 0.2.3

author: Johannes Schauer <josch@debian.org> 2017-01-20 05:49:31 +0100
committer: Johannes Schauer <josch@debian.org> 2017-01-20 05:49:31 +0100
commit: f71d3883871752e9ab72bb175c89a378df2af529 (patch)
tree: 98fdde17ba8a53ac5d03fe672289b4d3cdeba2cf /src
parent: d3481fe48afe150f38f331048abe6452b8389723 (diff)
7 files changed, 175 insertions, 51 deletions
diff --git a/src/img2pdf.egg-info/PKG-INFO b/src/img2pdf.egg-info/PKG-INFO
index b18e9d6..870fa2d 100644
--- a/src/img2pdf.egg-info/PKG-INFO
+++ b/src/img2pdf.egg-info/PKG-INFO
@@ -1,12 +1,12 @@
 Metadata-Version: 1.1
 Name: img2pdf
-Version: 0.2.1
+Version: 0.2.3
 Summary: Convert images to PDF via direct JPEG inclusion.
 Home-page: https://gitlab.mister-muffin.de/josch/img2pdf
 Author: Johannes 'josch' Schauer
 Author-email: josch@mister-muffin.de
 License: LGPL
-Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.2.1
+Download-URL: https://gitlab.mister-muffin.de/josch/img2pdf/repository/archive.tar.gz?ref=0.2.3
 Description: img2pdf
         =======
         
@@ -157,7 +157,7 @@ Classifier: Intended Audience :: Other Audience
 Classifier: Environment :: Console
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.4
+Classifier: Programming Language :: Python :: 3.5
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
 Classifier: Natural Language :: English
diff --git a/src/img2pdf.egg-info/SOURCES.txt b/src/img2pdf.egg-info/SOURCES.txt
index 192589d..add31f1 100644
--- a/src/img2pdf.egg-info/SOURCES.txt
+++ b/src/img2pdf.egg-info/SOURCES.txt
@@ -15,9 +15,11 @@ src/img2pdf.egg-info/top_level.txt
 src/img2pdf.egg-info/zip-safe
 src/tests/__init__.py
 src/tests/input/CMYK.jpg
+src/tests/input/mono.png
 src/tests/input/normal.jpg
 src/tests/input/normal.png
 src/tests/output/CMYK.jpg.pdf
 src/tests/output/CMYK.tif.pdf
+src/tests/output/mono.png.pdf
 src/tests/output/normal.jpg.pdf
 src/tests/output/normal.png.pdf
 \ No newline at end of file
diff --git a/src/img2pdf.py b/src/img2pdf.py
index 2042d13..20fe784 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -28,7 +28,7 @@ from enum import Enum
 from io import BytesIO
 import logging
 
-__version__ = "0.2.1"
+__version__ = "0.2.3"
 default_dpi = 96.0
 papersizes = {
     "letter": "8.5inx11in",
@@ -58,7 +58,7 @@ PageOrientation = Enum('PageOrientation', 'portrait landscape')
 
 Colorspace = Enum('Colorspace', 'RGB L 1 CMYK CMYK;I RGBA P other')
 
-ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 other')
+ImageFormat = Enum('ImageFormat', 'JPEG JPEG2000 CCITTGroup4 other')
 
 PageMode = Enum('PageMode', 'none outlines thumbs')
 
@@ -167,6 +167,8 @@ class MyPdfDict(object):
 class MyPdfName():
     def __getattr__(self, name):
         return b'/' + name.encode('ascii')
+
+
 MyPdfName = MyPdfName()
 
 
@@ -314,7 +316,7 @@ class pdfdoc(object):
             self.info[PdfName.Author] = PdfString.encode(author)
         if creator is not None:
             self.info[PdfName.Creator] = PdfString.encode(creator)
-        if producer is not None:
+        if producer is not None and producer != "":
             self.info[PdfName.Producer] = PdfString.encode(producer)
         if creationdate is not None:
             self.info[PdfName.CreationDate] = \
@@ -354,14 +356,15 @@ class pdfdoc(object):
                       imgwidthpdf, imgheightpdf, imgxpdf, imgypdf, pagewidth,
                       pageheight):
         if self.with_pdfrw:
-            from pdfrw import PdfDict, PdfName
+            from pdfrw import PdfDict, PdfName, PdfObject
             from pdfrw.py23_diffs import convert_load
         else:
             PdfDict = MyPdfDict
             PdfName = MyPdfName
+            PdfObject = MyPdfObject
             convert_load = my_convert_load
 
-        if color == Colorspace.L:
+        if color == Colorspace['1'] or color == Colorspace.L:
             colorspace = PdfName.DeviceGray
         elif color == Colorspace.RGB:
             colorspace = PdfName.DeviceRGB
@@ -372,11 +375,14 @@ class pdfdoc(object):
                                              % color.name)
 
         # either embed the whole jpeg or deflate the bitmap representation
+        logging.debug(imgformat)
         if imgformat is ImageFormat.JPEG:
             ofilter = [PdfName.DCTDecode]
         elif imgformat is ImageFormat.JPEG2000:
             ofilter = [PdfName.JPXDecode]
             self.writer.version = "1.5"  # jpeg2000 needs pdf 1.5
+        elif imgformat is ImageFormat.CCITTGroup4:
+            ofilter = [PdfName.CCITTFaxDecode]
         else:
             ofilter = [PdfName.FlateDecode]
 
@@ -389,12 +395,23 @@ class pdfdoc(object):
         image[PdfName.Height] = imgheightpx
         image[PdfName.ColorSpace] = colorspace
         # hardcoded as PIL doesn't provide bits for non-jpeg formats
-        image[PdfName.BitsPerComponent] = 8
+        if imgformat is ImageFormat.CCITTGroup4:
+            image[PdfName.BitsPerComponent] = 1
+        else:
+            image[PdfName.BitsPerComponent] = 8
 
         if color == Colorspace['CMYK;I']:
             # Inverts all four channels
             image[PdfName.Decode] = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]
 
+        if imgformat is ImageFormat.CCITTGroup4:
+            decodeparms = PdfDict()
+            decodeparms[PdfName.K] = -1
+            decodeparms[PdfName.BlackIs1] = PdfObject('true')
+            decodeparms[PdfName.Columns] = imgwidthpx
+            decodeparms[PdfName.Rows] = imgheightpx
+            image[PdfName.DecodeParms] = [decodeparms]
+
         text = ("q\n%0.4f 0 0 %0.4f %0.4f %0.4f cm\n/Im0 Do\nQ" %
                 (imgwidthpdf, imgheightpdf, imgxpdf, imgypdf)).encode("ascii")
 
@@ -594,6 +611,45 @@ def get_imgmetadata(imgdata, imgformat, default_dpi, colorspace, rawdata=None):
     return (color, ndpi, imgwidthpx, imgheightpx)
 
 
+def transcode_monochrome(imgdata):
+    """Convert the open PIL.Image imgdata to compressed CCITT Group4 data"""
+
+    from PIL import TiffImagePlugin
+
+    logging.debug("Converting monochrome to CCITT Group4")
+
+    # Convert the image to Group 4 in memory. If libtiff is not installed and
+    # Pillow is not compiled against it, .save() will raise an exception.
+    newimgio = BytesIO()
+    imgdata.save(newimgio, format='TIFF', compression='group4')
+
+    # Open new image in memory
+    newimgio.seek(0)
+    newimg = Image.open(newimgio)
+
+    # If Pillow is passed an invalid compression argument it will ignore it;
+    # make sure the image actually got compressed.
+    if newimg.info['compression'] != 'group4':
+        raise ValueError("Image not compressed as expected")
+
+    # Read the TIFF tags to find the offset(s) of the compressed data strips.
+    strip_offsets = newimg.tag_v2[TiffImagePlugin.STRIPOFFSETS]
+    strip_bytes = newimg.tag_v2[TiffImagePlugin.STRIPBYTECOUNTS]
+    rows_per_strip = newimg.tag_v2[TiffImagePlugin.ROWSPERSTRIP]
+
+    # PIL always seems to create a single strip even for very large TIFFs when
+    # it saves images, so assume we only have to read a single strip.
+    # A test ~10 GPixel image was still encoded as a single strip. Just to be
+    # safe, raise an error if there is more than one offset.
+    if len(strip_offsets) > 1:
+        raise NotImplementedError("Transcoding multiple strips not supported")
+
+    newimgio.seek(strip_offsets[0])
+    ccittdata = newimgio.read(strip_bytes[0])
+
+    return ccittdata
+
+
 def read_images(rawdata, colorspace, first_frame_only=False):
     im = BytesIO(rawdata)
     im.seek(0)
@@ -648,11 +704,20 @@ def read_images(rawdata, colorspace, first_frame_only=False):
             color, ndpi, imgwidthpx, imgheightpx = get_imgmetadata(
                     imgdata, imgformat, default_dpi, colorspace)
 
-            # because we do not support /CCITTFaxDecode
+            newimg = None
             if color == Colorspace['1']:
-                logging.debug("Converting colorspace 1 to L")
-                newimg = imgdata.convert('L')
-                color = Colorspace.L
+                try:
+                    ccittdata = transcode_monochrome(imgdata)
+                    imgformat = ImageFormat.CCITTGroup4
+                    result.append((color, ndpi, imgformat, ccittdata,
+                                   imgwidthpx, imgheightpx))
+                    img_page_count += 1
+                    continue
+                except Exception as e:
+                    logging.debug(e)
+                    logging.debug("Converting colorspace 1 to L")
+                    newimg = imgdata.convert('L')
+                    color = Colorspace.L
             elif color in [Colorspace.RGB, Colorspace.L, Colorspace.CMYK,
                            Colorspace["CMYK;I"]]:
                 logging.debug("Colorspace is OK: %s", color)
@@ -927,12 +992,22 @@ def convert(*images, title=None,
                  viewer_fit_window, viewer_center_window, viewer_fullscreen,
                  with_pdfrw)
 
+    # backwards compatibility with older img2pdf versions where the first
+    # argument to the function had to be given as a list
+    if len(images) == 1:
+        # if only one argument was given and it is a list, expand it
+        if isinstance(images[0], (list, tuple)):
+            images = images[0]
+
     for img in images:
         # img is allowed to be a path, a binary string representing image data
         # or a file-like object (really anything that implements read())
         try:
             rawdata = img.read()
         except AttributeError:
+            if not isinstance(img, (str, bytes)):
+                raise TypeError(
+                        "Neither implements read() nor is str or bytes")
             # the thing doesn't have a read() function, so try if we can treat
             # it as a file name
             try:
@@ -1256,10 +1331,11 @@ useful to convert JPEG and JPEG2000 images to PDF.
 The output is sent to standard output so that it can be redirected into a file
 or to another program as part of a shell pipe. To directly write the output
 into a file, use the -o or --output option.
+
+Options:
 ''',
             epilog='''\
-Colorspace
-
+Colorspace:
   Currently, the colorspace must be forced for JPEG 2000 images that are not in
   the RGB colorspace.  Available colorspace options are based on Python Imaging
   Library (PIL) short handles.
@@ -1270,8 +1346,7 @@ Colorspace
     CMYK     CMYK color
     CMYK;I   CMYK color with inversion (for CMYK JPEG files from Adobe)
 
-Paper sizes
-
+Paper sizes:
   You can specify the short hand paper size names shown in the first column in
   the table below as arguments to the --pagesize and --imgsize options.  The
   width and height they are mapping to is shown in the second column.  Giving
@@ -1282,8 +1357,7 @@ Paper sizes
 
 %s
 
-Fit options
-
+Fit options:
   The img2pdf options for the --fit argument are shown in the first column in
   the table below. The function of these options can be mapped to the geometry
   operators of imagemagick. For users who are familiar with imagemagick, the
@@ -1307,8 +1381,32 @@ Fit options
     enlarge | < | Y | Enlarges an image with dimensions smaller than the given
             |   |   | ones (and otherwise behaves like "into").
 
-Examples
+Argument parsing:
+  Argument long options can be abbreviated to a prefix if the abbreviation is
+  unambiguous. That is, the prefix must match a unique option.
+
+  Beware of your shell interpreting argument values as special characters (like
+  the semicolon in the CMYK;I colorspace option). If in doubt, put the argument
+  values in single quotes.
+
+  If you want an argument value to start with one or more minus characters, you
+  must use the long option name and join them with an equal sign like so:
+
+    $ img2pdf --author=--test--
+
+  If your input file name starts with one or more minus characters, either
+  separate the input files from the other arguments by two minus signs:
 
+    $ img2pdf -- --my-file-starts-with-two-minuses.jpg
+
+  Or be more explicit about its relative path by prepending a ./:
+
+    $ img2pdf ./--my-file-starts-with-two-minuses.jpg
+
+  The order of non-positional arguments (all arguments other than the input
+  images) does not matter.
+
+Examples:
   Lines starting with a dollar sign denote commands you can enter into your
   terminal. The dollar sign signifies your command prompt. It is not part of
   the command you type.
@@ -1340,31 +1438,9 @@ Examples
 
     $ img2pdf --output out.pdf --colorspace L input.jp2
 
-Argument parsing
-
-  Argument long options can be abbreviated to a prefix if the abbreviation is
-  anambiguous. That is, the prefix must match a unique option.
-
-  Beware of your shell interpreting argument values as special characters (like
-  the semicolon in the CMYK;I colorspace option). If in doubt, put the argument
-  values in single quotes.
-
-  If you want an argument value to start with one or more minus characters, you
-  must use the long option name and join them with an equal sign like so:
-
-    $ img2pdf --author=--test--
-
-  If your input file name starts with one or more minus characters, either
-  separate the input files from the other arguments by two minus signs:
-
-    $ img2pdf -- --my-file-starts-with-two-minuses.jpg
+Written by Johannes 'josch' Schauer <josch@mister-muffin.de>
 
-  Or be more explicit about its relative path by prepending a ./:
-
-    $ img2pdf ./--my-file-starts-with-two-minuses.jpg
-
-  The order of non-positional arguments (all arguments other than the input
-  images) does not matter.
+Report bugs at https://gitlab.mister-muffin.de/josch/img2pdf/issues
 ''' % rendered_papersizes)
 
     parser.add_argument(
@@ -1385,7 +1461,7 @@ Argument parsing
 
     outargs = parser.add_argument_group(
             title='General output arguments',
-            description='')
+            description='Arguments controlling the output format.')
 
     outargs.add_argument(
         '-o', '--output', metavar='out', type=argparse.FileType('wb'),
@@ -1428,8 +1504,7 @@ RGB.''')
     sizeargs = parser.add_argument_group(
         title='Image and page size and layout arguments',
         description='''\
-
-Every input image will be placed on its own page.  The image size is controlled
+Every input image will be placed on its own page. The image size is controlled
 by the dpi value of the input image or, if unset or missing, the default dpi of
 %.2f. By default, each page will have the same size as the image it shows.
 Thus, there will be no visible border between the image and the page border by
@@ -1518,8 +1593,10 @@ of the input image. If the orientation of a page gets flipped, then so do the
 values set via the --border option.
 ''')
 
-    metaargs = parser.add_argument_group(title='Arguments setting metadata',
-                                         description='')
+    metaargs = parser.add_argument_group(
+        title='Arguments setting metadata',
+        description='Options handling embedded timestamps, title and author '
+                    'information.')
     metaargs.add_argument(
         '--title', metavar='title', type=str,
         help='Sets the title metadata value')
@@ -1532,7 +1609,8 @@ values set via the --border option.
     metaargs.add_argument(
         '--producer', metavar='producer', type=str,
         default="img2pdf " + __version__,
-        help='Sets the producer metadata value (default is: img2pdf)')
+        help='Sets the producer metadata value '
+             '(default is: img2pdf ' + __version__ + ')')
     metaargs.add_argument(
         '--creationdate', metavar='creationdate', type=valid_date,
         help='Sets the UTC creation date metadata value in YYYY-MM-DD or '
@@ -1646,5 +1724,6 @@ values set via the --border option.
             traceback.print_exc(file=sys.stderr)
         exit(1)
 
+
 if __name__ == '__main__':
     main()
diff --git a/src/jp2.py b/src/jp2.py
index 7f61312..30edb7e 100644
--- a/src/jp2.py
+++ b/src/jp2.py
@@ -116,6 +116,7 @@ def parsejp2(data):
     # retrieving the dpi is optional so we do not error out if not present
     return (width, height, colorspace, hdpi, vdpi)
 
+
 if __name__ == "__main__":
     import sys
     width, height, colorspace = parsejp2(open(sys.argv[1]).read())
diff --git a/src/tests/__init__.py b/src/tests/__init__.py
index b668054..506fc48 100644
--- a/src/tests/__init__.py
+++ b/src/tests/__init__.py
@@ -4,6 +4,8 @@ import os
 import img2pdf
 import zlib
 from PIL import Image
+from io import BytesIO
+import struct
 
 HERE = os.path.dirname(__file__)
 
@@ -396,6 +398,29 @@ layout_test_cases = [
 ]
 
 
+def tiff_header_for_ccitt(width, height, img_size, ccitt_group=4):
+    # Quick and dirty TIFF header builder from
+    # https://stackoverflow.com/questions/2641770
+    tiff_header_struct = '<' + '2s' + 'h' + 'l' + 'h' + 'hhll' * 8 + 'h'
+    return struct.pack(
+        tiff_header_struct,
+        b'II',  # Byte order indication: Little endian
+        42,  # Version number (always 42)
+        8,  # Offset to first IFD
+        8,  # Number of tags in IFD
+        256, 4, 1, width,  # ImageWidth, LONG, 1, width
+        257, 4, 1, height,  # ImageLength, LONG, 1, length
+        258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
+        259, 3, 1, ccitt_group,  # Compression, SHORT, 1, 4 = CCITT Group 4
+        262, 3, 1, 1,  # PhotometricInterpretation, SHORT, 1, 1 = BlackIsZero
+        273, 4, 1, struct.calcsize(
+            tiff_header_struct),  # StripOffsets, LONG, 1, len of header
+        278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
+        279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image
+        0  # last IFD
+        )
+
+
 def test_suite():
     class TestImg2Pdf(unittest.TestCase):
         pass
@@ -485,7 +510,8 @@ def test_suite():
             # test if the filter is valid:
             self.assertIn(
                 imgprops.Filter, [[PdfName.DCTDecode], [PdfName.JPXDecode],
-                                  [PdfName.FlateDecode]])
+                                  [PdfName.FlateDecode],
+                                  [PdfName.CCITTFaxDecode]])
             # test if the colorspace is valid
             self.assertIn(
                 imgprops.ColorSpace, [PdfName.DeviceGray, PdfName.DeviceRGB,
@@ -500,6 +526,22 @@ def test_suite():
                 self.assertEqual(
                     x.Root.Pages.Kids[0].Resources.XObject.Im0.stream,
                     convert_load(orig_imgdata))
+            elif imgprops.Filter == [PdfName.CCITTFaxDecode]:
+                tiff_header = tiff_header_for_ccitt(
+                    int(imgprops.Width), int(imgprops.Height),
+                    int(imgprops.Length), 4)
+                imgio = BytesIO()
+                imgio.write(tiff_header)
+                imgio.write(convert_store(
+                    x.Root.Pages.Kids[0].Resources.XObject.Im0.stream))
+                imgio.seek(0)
+                im = Image.open(imgio)
+                self.assertEqual(im.tobytes(), orig_img.tobytes())
+                try:
+                    im.close()
+                except AttributeError:
+                    pass
+
             elif imgprops.Filter == [PdfName.FlateDecode]:
                 # otherwise, the data is flate encoded and has to be equal to
                 # the pixel data of the input image
diff --git a/src/tests/input/mono.png b/src/tests/input/mono.png
new file mode 100644
index 0000000..59b17ad
--- /dev/null
+++ b/src/tests/input/mono.png
diff --git a/src/tests/output/mono.png.pdf b/src/tests/output/mono.png.pdf
new file mode 100644
index 0000000..eda3ec7
--- /dev/null
+++ b/src/tests/output/mono.png.pdf
author	Johannes Schauer <josch@debian.org>	2017-01-20 05:49:31 +0100
committer	Johannes Schauer <josch@debian.org>	2017-01-20 05:49:31 +0100
commit	f71d3883871752e9ab72bb175c89a378df2af529 (patch)
tree	98fdde17ba8a53ac5d03fe672289b4d3cdeba2cf /src
parent	d3481fe48afe150f38f331048abe6452b8389723 (diff)