New upstream version 0.5.0

author: Johannes Schauer Marin Rodrigues <josch@debian.org> 2023-10-28 10:21:25 +0200
committer: Johannes Schauer Marin Rodrigues <josch@debian.org> 2023-10-28 10:21:25 +0200
commit: d6d4451cf308865725d044a4331bda9ba7ec066d (patch)
tree: 378aa1e2bc45e361ca5aedff0a841bd41a4d5341 /src/img2pdf.py
parent: 7abe2f2f089f38a0ba403da8f1459f5c6bf2ffa6 (diff)
1 files changed, 453 insertions, 53 deletions
diff --git a/src/img2pdf.py b/src/img2pdf.py
index 39a311b..036232b 100755
--- a/src/img2pdf.py
+++ b/src/img2pdf.py
@@ -22,7 +22,7 @@ import sys
 import os
 import zlib
 import argparse
-from PIL import Image, TiffImagePlugin, GifImagePlugin
+from PIL import Image, TiffImagePlugin, GifImagePlugin, ImageCms
 
 if hasattr(GifImagePlugin, "LoadingStrategy"):
     # Pillow 9.0.0 started emitting all frames but the first as RGB instead of
@@ -36,8 +36,8 @@ if hasattr(GifImagePlugin, "LoadingStrategy"):
 
 # TiffImagePlugin.DEBUG = True
 from PIL.ExifTags import TAGS
-from datetime import datetime
-from jp2 import parsejp2
+from datetime import datetime, timezone
+import jp2
 from enum import Enum
 from io import BytesIO
 import logging
@@ -45,6 +45,8 @@ import struct
 import platform
 import hashlib
 from itertools import chain
+import re
+import io
 
 logger = logging.getLogger(__name__)
 
@@ -60,7 +62,7 @@ try:
 except ImportError:
     have_pikepdf = False
 
-__version__ = "0.4.4"
+__version__ = "0.5.0"
 default_dpi = 96.0
 papersizes = {
     "letter": "8.5inx11in",
@@ -125,7 +127,9 @@ PageOrientation = Enum("PageOrientation", "portrait landscape")
 
 Colorspace = Enum("Colorspace", "RGB RGBA L LA 1 CMYK CMYK;I P PA other")
 
-ImageFormat = Enum("ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO other")
+ImageFormat = Enum(
+    "ImageFormat", "JPEG JPEG2000 CCITTGroup4 PNG GIF TIFF MPO MIFF other"
+)
 
 PageMode = Enum("PageMode", "none outlines thumbs")
 
@@ -442,7 +446,7 @@ class temp_attr:
         if hasattr(self.obj, self.field):
             self.exists = True
             self.old_value = getattr(self.obj, self.field)
-        print(f"setting {self.obj}.{self.field} = {self.value}")
+        logger.debug(f"setting {self.obj}.{self.field} = {self.value}")
         setattr(self.obj, self.field, self.value)
 
     def __exit__(self, exctype, excinst, exctb):
@@ -718,7 +722,7 @@ class pdfdoc(object):
             self.writer.docinfo = PdfDict(indirect=True)
 
         def datetime_to_pdfdate(dt):
-            return dt.strftime("%Y%m%d%H%M%SZ")
+            return dt.astimezone(tz=timezone.utc).strftime("%Y%m%d%H%M%SZ")
 
         for k in ["Title", "Author", "Creator", "Producer", "Subject"]:
             v = locals()[k.lower()]
@@ -728,7 +732,7 @@ class pdfdoc(object):
                 v = PdfString.encode(v)
             self.writer.docinfo[getattr(PdfName, k)] = v
 
-        now = datetime.now()
+        now = datetime.now().astimezone()
         for k in ["CreationDate", "ModDate"]:
             v = locals()[k.lower()]
             if v is None and nodate:
@@ -748,7 +752,7 @@ class pdfdoc(object):
                 )
 
         def datetime_to_xmpdate(dt):
-            return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
+            return dt.astimezone(tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
         self.xmp = b"""<?xpacket begin='\xef\xbb\xbf' id='W5M0MpCehiHzreSzNTczkc9d'?>
 <x:xmpmeta xmlns:x='adobe:ns:meta/' x:xmptk='XMP toolkit 2.9.1-13, framework 1.6'>
@@ -823,8 +827,10 @@ class pdfdoc(object):
         artborder=None,
         iccp=None,
     ):
-        assert (color != Colorspace.RGBA and color != Colorspace.LA) or (
-            imgformat == ImageFormat.PNG and smaskdata is not None
+        assert (
+            color not in [Colorspace.RGBA, Colorspace.LA]
+            or (imgformat == ImageFormat.PNG and smaskdata is not None)
+            or imgformat == ImageFormat.JPEG2000
         )
 
         if self.engine == Engine.pikepdf:
@@ -848,7 +854,13 @@ class pdfdoc(object):
         if color == Colorspace["1"] or color == Colorspace.L or color == Colorspace.LA:
             colorspace = PdfName.DeviceGray
         elif color == Colorspace.RGB or color == Colorspace.RGBA:
-            colorspace = PdfName.DeviceRGB
+            if color == Colorspace.RGBA and imgformat == ImageFormat.JPEG2000:
+                # there is no DeviceRGBA and for JPXDecode it is okay to have
+                # no colorspace as the pdf reader is supposed to get this info
+                # from the jpeg2000 payload itself
+                colorspace = None
+            else:
+                colorspace = PdfName.DeviceRGB
         elif color == Colorspace.CMYK or color == Colorspace["CMYK;I"]:
             colorspace = PdfName.DeviceCMYK
         elif color == Colorspace.P:
@@ -919,7 +931,8 @@ class pdfdoc(object):
         image[PdfName.Filter] = ofilter
         image[PdfName.Width] = imgwidthpx
         image[PdfName.Height] = imgheightpx
-        image[PdfName.ColorSpace] = colorspace
+        if colorspace is not None:
+            image[PdfName.ColorSpace] = colorspace
         image[PdfName.BitsPerComponent] = depth
 
         smask = None
@@ -1256,8 +1269,11 @@ class pdfdoc(object):
 
         # now write out the PDF
         if self.engine == Engine.pikepdf:
+            kwargs = {}
+            if pikepdf.__version__ >= "6.2.0":
+                kwargs["deterministic_id"] = True
             self.writer.save(
-                outputstream, min_version=self.output_version, linearize=True
+                outputstream, min_version=self.output_version, linearize=True, **kwargs
             )
         elif self.engine == Engine.pdfrw:
             self.writer.trailer.Info = self.writer.docinfo
@@ -1285,7 +1301,7 @@ def get_imgmetadata(
     if imgformat == ImageFormat.JPEG2000 and rawdata is not None and imgdata is None:
         # this codepath gets called if the PIL installation is not able to
         # handle JPEG2000 files
-        imgwidthpx, imgheightpx, ics, hdpi, vdpi = parsejp2(rawdata)
+        imgwidthpx, imgheightpx, ics, hdpi, vdpi, channels, bpp = jp2.parse(rawdata)
 
         if hdpi is None:
             hdpi = default_dpi
@@ -1305,7 +1321,7 @@ def get_imgmetadata(
         ics = imgdata.mode
 
     # GIF and PNG files with transparency are supported
-    if (imgformat == ImageFormat.PNG or imgformat == ImageFormat.GIF) and (
+    if imgformat in [ImageFormat.PNG, ImageFormat.GIF, ImageFormat.JPEG2000] and (
         ics in ["RGBA", "LA"] or "transparency" in imgdata.info
     ):
         # Must check the IHDR chunk for the bit depth, because PIL would lossily
@@ -1315,6 +1331,10 @@ def get_imgmetadata(
             if depth > 8:
                 logger.warning("Image with transparency and a bit depth of %d." % depth)
                 logger.warning("This is unsupported due to PIL limitations.")
+                logger.warning(
+                    "If you accept a lossy conversion, you can manually convert "
+                    "your images to 8 bit using `convert -depth 8` from imagemagick"
+                )
                 raise AlphaChannelError(
                     "Refusing to work with multiple >8bit channels."
                 )
@@ -1425,6 +1445,53 @@ def get_imgmetadata(
     iccp = None
     if "icc_profile" in imgdata.info:
         iccp = imgdata.info.get("icc_profile")
+    # GIMP saves bilevel TIFF images and palette PNG images with only black and
+    # white in the palette with an RGB ICC profile which is useless
+    # https://gitlab.gnome.org/GNOME/gimp/-/issues/3438
+    # and produces an error in Adobe Acrobat, so we ignore it with a warning.
+    # imagemagick also used to (wrongly) include an RGB ICC profile for bilevel
+    # images: https://github.com/ImageMagick/ImageMagick/issues/2070
+    if iccp is not None and (
+        (color == Colorspace["1"] and imgformat == ImageFormat.TIFF)
+        or (
+            imgformat == ImageFormat.PNG
+            and color == Colorspace.P
+            and rawdata is not None
+            and parse_png(rawdata)[1]
+            in [b"\x00\x00\x00\xff\xff\xff", b"\xff\xff\xff\x00\x00\x00"]
+        )
+    ):
+        with io.BytesIO(iccp) as f:
+            prf = ImageCms.ImageCmsProfile(f)
+        if (
+            prf.profile.model == "sRGB"
+            and prf.profile.manufacturer == "GIMP"
+            and prf.profile.profile_description == "GIMP built-in sRGB"
+        ):
+            if imgformat == ImageFormat.TIFF:
+                logger.warning(
+                    "Ignoring RGB ICC profile in bilevel TIFF produced by GIMP."
+                )
+            elif imgformat == ImageFormat.PNG:
+                logger.warning(
+                    "Ignoring RGB ICC profile in 2-color palette PNG produced by GIMP."
+                )
+            logger.warning("https://gitlab.gnome.org/GNOME/gimp/-/issues/3438")
+            iccp = None
+    # Old versions of SmartAlbums (seen with 2.2.6) export JPGs with only 1
+    # component but with an RGB ICC profile, which is useless.
+    # This produces an error in Adobe Acrobat, so we ignore it with a warning.
+    # Update: Found another case where the JPG was created by Adobe Photoshop,
+    # so we don't check the software anymore.
+    if iccp is not None and (
+        (color == Colorspace["L"] and imgformat == ImageFormat.JPEG)
+    ):
+        with io.BytesIO(iccp) as f:
+            prf = ImageCms.ImageCmsProfile(f)
+
+        if prf.profile.xcolor_space not in ("GRAY"):
+            logger.warning("Ignoring non-GRAY ICC profile in Grayscale JPG")
+            iccp = None
 
     logger.debug("width x height = %dpx x %dpx", imgwidthpx, imgheightpx)
 
@@ -1533,7 +1600,204 @@ def parse_png(rawdata):
     return pngidat, palette
 
 
-def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
+miff_re = re.compile(
+    r"""
+    [^\x00-\x20\x7f-\x9f] # the field name must not start with a control char or space
+    [^=]+                 # the field name can even contain spaces
+    =                     # field name and value are separated by an equal sign
+    (?:
+        [^\x00-\x20\x7f-\x9f{}] # either chars that are not braces and not control chars
+        |{[^}]*}                # or any kind of char surrounded by braces
+    )+""",
+    re.VERBOSE,
+)
+
+# https://imagemagick.org/script/miff.php
+# turn off black formatting until python 3.10 is available on more platforms
+# and we can use match/case
+# fmt: off
+def parse_miff(data):
+    results = []
+    header, rest = data.split(b":\x1a", 1)
+    header = header.decode("ISO-8859-1")
+    assert header.lower().startswith("id=imagemagick")
+    hdata = {}
+    for i, line in enumerate(re.findall(miff_re, header)):
+        if not line:
+            continue
+        k, v = line.split("=", 1)
+        if i == 0:
+            assert k.lower() == "id"
+            assert v.lower() == "imagemagick"
+        #match k.lower():
+        #    case "class":
+        if k.lower() == "class":
+                #match v:
+                #    case "DirectClass" | "PseudoClass":
+                if v in ["DirectClass", "PseudoClass"]:
+                        hdata["class"] = v
+                #    case _:
+                else:
+                        print("cannot understand class", v)
+        #    case "colorspace":
+        elif k.lower() == "colorspace":
+                # theoretically RGBA and CMYKA should be supported as well
+                # please teach me how to create such a MIFF file
+                #match v:
+                #    case "sRGB" | "CMYK" | "Gray":
+                if v in ["sRGB", "CMYK", "Gray"]:
+                        hdata["colorspace"] = v
+                #    case _:
+                else:
+                        print("cannot understand colorspace", v)
+        #    case "depth":
+        elif k.lower() == "depth":
+                #match v:
+                #    case "8" | "16" | "32":
+                if v in ["8", "16", "32"]:
+                        hdata["depth"] = int(v)
+                #    case _:
+                else:
+                        print("cannot understand depth", v)
+        #    case "colors":
+        elif k.lower() == "colors":
+                hdata["colors"] = int(v)
+        #    case "matte":
+        elif k.lower() == "matte":
+                #match v:
+                #    case "True":
+                if v == "True":
+                        hdata["matte"] = True
+                #    case "False":
+                elif v == "False":
+                        hdata["matte"] = False
+                #    case _:
+                else:
+                        print("cannot understand matte", v)
+        #    case "columns" | "rows":
+        elif k.lower() in ["columns", "rows"]:
+                hdata[k.lower()] = int(v)
+        #    case "compression":
+        elif k.lower() == "compression":
+                print("compression not yet supported")
+        #    case "profile":
+        elif k.lower() == "profile":
+                assert v in ["icc", "exif"]
+                hdata["profile"] = v
+        #    case "resolution":
+        elif k.lower() == "resolution":
+                dpix, dpiy = v.split("x", 1)
+                hdata["resolution"] = (float(dpix), float(dpiy))
+
+    assert "depth" in hdata
+    assert "columns" in hdata
+    assert "rows" in hdata
+    #match hdata["class"]:
+    #    case "DirectClass":
+    if hdata["class"] == "DirectClass":
+            if "colors" in hdata:
+                assert hdata["colors"] == 0
+            #match hdata["colorspace"]:
+            #    case "sRGB":
+            if hdata["colorspace"] == "sRGB":
+                    numchannels = 3
+                    colorspace = Colorspace.RGB
+            #    case "CMYK":
+            elif hdata["colorspace"] == "CMYK":
+                    numchannels = 4
+                    colorspace = Colorspace.CMYK
+            #    case "Gray":
+            elif hdata["colorspace"] == "Gray":
+                    numchannels = 1
+                    colorspace = Colorspace.L
+            if hdata.get("matte"):
+                numchannels += 1
+            if hdata.get("profile"):
+                # there is no key encoding the length of icc or exif data
+                # according to the docs, the profile-icc key is supposed to do this
+                print("FAIL: exif")
+            else:
+                lenimgdata = (
+                    hdata["depth"] // 8 * numchannels * hdata["columns"] * hdata["rows"]
+                )
+                assert len(rest) >= lenimgdata, (
+                    len(rest),
+                    hdata["depth"],
+                    numchannels,
+                    hdata["columns"],
+                    hdata["rows"],
+                    lenimgdata,
+                )
+                if colorspace == Colorspace.RGB and hdata["depth"] == 8:
+                    newimg = Image.frombytes("RGB", (hdata["columns"], hdata["rows"]), rest[:lenimgdata])
+                    imgdata, palette, depth = to_png_data(newimg)
+                    assert palette == b""
+                    assert depth == hdata["depth"]
+                    imgfmt = ImageFormat.PNG
+                else:
+                    imgdata = zlib.compress(rest[:lenimgdata])
+                    imgfmt = ImageFormat.MIFF
+                results.append(
+                    (
+                        colorspace,
+                        hdata.get("resolution") or (default_dpi, default_dpi),
+                        imgfmt,
+                        imgdata,
+                        None,  # smask
+                        hdata["columns"],
+                        hdata["rows"],
+                        [],  # palette
+                        False,  # inverted
+                        hdata["depth"],
+                        0,  # rotation
+                        None,  # icc profile
+                    )
+                )
+                if len(rest) > lenimgdata:
+                    # another image is here
+                    assert rest[lenimgdata:][:14].lower() == b"id=imagemagick"
+                    results.extend(parse_miff(rest[lenimgdata:]))
+    #    case "PseudoClass":
+    elif hdata["class"] == "PseudoClass":
+            assert "colors" in hdata
+            if hdata.get("matte"):
+                numchannels = 2
+            else:
+                numchannels = 1
+            lenpal = 3 * hdata["colors"] * hdata["depth"] // 8
+            lenimgdata = numchannels * hdata["rows"] * hdata["columns"]
+            assert len(rest) >= lenpal + lenimgdata, (len(rest), lenpal, lenimgdata)
+            results.append(
+                (
+                    Colorspace.RGB,
+                    hdata.get("resolution") or (default_dpi, default_dpi),
+                    ImageFormat.MIFF,
+                    zlib.compress(rest[lenpal : lenpal + lenimgdata]),
+                    None,  # FIXME: allow alpha channel smask
+                    hdata["columns"],
+                    hdata["rows"],
+                    rest[:lenpal],  # palette
+                    False,  # inverted
+                    hdata["depth"],
+                    0,  # rotation
+                    None,  # icc profile
+                )
+            )
+            if len(rest) > lenpal + lenimgdata:
+                # another image is here
+                assert rest[lenpal + lenimgdata :][:14].lower() == b"id=imagemagick", (
+                    len(rest),
+                    lenpal,
+                    lenimgdata,
+                )
+                results.extend(parse_miff(rest[lenpal + lenimgdata :]))
+    return results
+# fmt: on
+
+
+def read_images(
+    rawdata, colorspace, first_frame_only=False, rot=None, include_thumbnails=False
+):
     im = BytesIO(rawdata)
     im.seek(0)
     imgdata = None
@@ -1541,13 +1805,19 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
         imgdata = Image.open(im)
     except IOError as e:
         # test if it is a jpeg2000 image
-        if rawdata[:12] != b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
+        if rawdata[:12] == b"\x00\x00\x00\x0C\x6A\x50\x20\x20\x0D\x0A\x87\x0A":
+            # image is jpeg2000
+            imgformat = ImageFormat.JPEG2000
+        if rawdata[:14].lower() == b"id=imagemagick":
+            # image is in MIFF format
+            # this is useful for 16 bit CMYK because PNG cannot do CMYK and thus
+            # we need PIL but PIL cannot do 16 bit
+            imgformat = ImageFormat.MIFF
+        else:
             raise ImageOpenError(
                 "cannot read input image (not jpeg2000). "
                 "PIL: error reading image: %s" % e
             )
-        # image is jpeg2000
-        imgformat = ImageFormat.JPEG2000
     else:
         logger.debug("PIL format = %s", imgdata.format)
         imgformat = None
@@ -1581,10 +1851,13 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
             raise JpegColorspaceError("jpeg can't be monochrome")
         if color == Colorspace["P"]:
             raise JpegColorspaceError("jpeg can't have a color palette")
-        if color == Colorspace["RGBA"]:
+        if color == Colorspace["RGBA"] and imgformat != ImageFormat.JPEG2000:
             raise JpegColorspaceError("jpeg can't have an alpha channel")
         logger.debug("read_images() embeds a JPEG")
         cleanup()
+        depth = 8
+        if imgformat == ImageFormat.JPEG2000:
+            *_, depth = jp2.parse(rawdata)
         return [
             (
                 color,
@@ -1596,7 +1869,7 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
                 imgheightpx,
                 [],
                 False,
-                8,
+                depth,
                 rotation,
                 iccp,
             )
@@ -1613,6 +1886,77 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
     if imgformat == ImageFormat.MPO:
         result = []
         img_page_count = 0
+        assert len(imgdata._MpoImageFile__mpoffsets) == len(imgdata.mpinfo[0xB002])
+        num_frames = len(imgdata.mpinfo[0xB002])
+        # An MPO file can be a main image together with one or more thumbnails.
+        # If that is the case, then we only include all frames if the
+        # --include-thumbnails option is given. If it is not, such an MPO file
+        # will be embedded as is, so including its thumbnails but showing up
+        # as a single image page in the resulting PDF.
+        num_main_frames = 0
+        num_thumbnail_frames = 0
+        for i, mpent in enumerate(imgdata.mpinfo[0xB002]):
+            # check only the first frame for being the main image
+            if (
+                i == 0
+                and mpent["Attribute"]["DependentParentImageFlag"]
+                and not mpent["Attribute"]["DependentChildImageFlag"]
+                and mpent["Attribute"]["RepresentativeImageFlag"]
+                and mpent["Attribute"]["MPType"] == "Baseline MP Primary Image"
+            ):
+                num_main_frames += 1
+            elif (
+                not mpent["Attribute"]["DependentParentImageFlag"]
+                and mpent["Attribute"]["DependentChildImageFlag"]
+                and not mpent["Attribute"]["RepresentativeImageFlag"]
+                and mpent["Attribute"]["MPType"]
+                in [
+                    "Large Thumbnail (VGA Equivalent)",
+                    "Large Thumbnail (Full HD Equivalent)",
+                ]
+            ):
+                num_thumbnail_frames += 1
+        logger.debug(f"number of frames: {num_frames}")
+        logger.debug(f"number of main frames: {num_main_frames}")
+        logger.debug(f"number of thumbnail frames: {num_thumbnail_frames}")
+        # this MPO file is a main image plus zero or more thumbnails
+        # embed as-is unless the --include-thumbnails option was given
+        if num_frames == 1 or (
+            not include_thumbnails
+            and num_main_frames == 1
+            and num_thumbnail_frames + 1 == num_frames
+        ):
+            color, ndpi, imgwidthpx, imgheightpx, rotation, iccp = get_imgmetadata(
+                imgdata, imgformat, default_dpi, colorspace, rawdata, rot
+            )
+            if color == Colorspace["1"]:
+                raise JpegColorspaceError("jpeg can't be monochrome")
+            if color == Colorspace["P"]:
+                raise JpegColorspaceError("jpeg can't have a color palette")
+            if color == Colorspace["RGBA"]:
+                raise JpegColorspaceError("jpeg can't have an alpha channel")
+            logger.debug("read_images() embeds an MPO verbatim")
+            cleanup()
+            return [
+                (
+                    color,
+                    ndpi,
+                    ImageFormat.JPEG,
+                    rawdata,
+                    None,
+                    imgwidthpx,
+                    imgheightpx,
+                    [],
+                    False,
+                    8,
+                    rotation,
+                    iccp,
+                )
+            ]
+        # If the control flow reaches here, the MPO has more than a single
+        # frame but was not detected to be a main image followed by multiple
+        # thumbnails. We thus treat this MPO as we do other multi-frame images
+        # and include all its frames as individual pages.
         for offset, mpent in zip(
             imgdata._MpoImageFile__mpoffsets, imgdata.mpinfo[0xB002]
         ):
@@ -1710,6 +2054,9 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
                     )
                 ]
 
+    if imgformat == ImageFormat.MIFF:
+        return parse_miff(rawdata)
+
     # If our input is not JPEG or PNG, then we might have a format that
     # supports multiple frames (like TIFF or GIF), so we need a loop to
     # iterate through all frames of the image.
@@ -1875,7 +2222,16 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
                 )
             )
         else:
-            if (
+            if color in [Colorspace.P, Colorspace.PA] and iccp is not None:
+                # PDF does not support palette images with icc profile
+                if color == Colorspace.P:
+                    newcolor = Colorspace.RGB
+                    newimg = newimg.convert(mode="RGB")
+                elif color == Colorspace.PA:
+                    newcolor = Colorspace.RGBA
+                    newimg = newimg.convert(mode="RGBA")
+                smaskidat = None
+            elif (
                 color == Colorspace.RGBA
                 or color == Colorspace.LA
                 or color == Colorspace.PA
@@ -1889,25 +2245,21 @@ def read_images(rawdata, colorspace, first_frame_only=False, rot=None):
                     newcolor = color
                     l, a = newimg.split()
                     newimg = l
+                elif color == Colorspace.PA or (
+                    color == Colorspace.P and "transparency" in newimg.info
+                ):
+                    newcolor = color
+                    a = newimg.convert(mode="RGBA").split()[-1]
                 else:
                     newcolor = Colorspace.RGBA
                     r, g, b, a = newimg.convert(mode="RGBA").split()
                     newimg = Image.merge("RGB", (r, g, b))
 
-                smaskidat, _, _ = to_png_data(a)
+                smaskidat, *_ = to_png_data(a)
                 logger.warning(
                     "Image contains an alpha channel. Computing a separate "
                     "soft mask (/SMask) image to store transparency in PDF."
                 )
-            elif color in [Colorspace.P, Colorspace.PA] and iccp is not None:
-                # PDF does not support palette images with icc profile
-                if color == Colorspace.P:
-                    newcolor = Colorspace.RGB
-                    newimg = newimg.convert(mode="RGB")
-                elif color == Colorspace.PA:
-                    newcolor = Colorspace.RGBA
-                    newimg = newimg.convert(mode="RGBA")
-                smaskidat = None
             else:
                 newcolor = color
                 smaskidat = None
@@ -2249,7 +2601,6 @@ def find_scale(pagewidth, pageheight):
 # as a binary string representing the image content or as filenames to the
 # images.
 def convert(*images, **kwargs):
-
     _default_kwargs = dict(
         engine=None,
         title=None,
@@ -2279,6 +2630,7 @@ def convert(*images, **kwargs):
         artborder=None,
         pdfa=None,
         rotation=None,
+        include_thumbnails=False,
     )
     for kwname, default in _default_kwargs.items():
         if kwname not in kwargs:
@@ -2322,11 +2674,16 @@ def convert(*images, **kwargs):
     for img in images:
         # img is allowed to be a path, a binary string representing image data
         # or a file-like object (really anything that implements read())
-        try:
-            rawdata = img.read()
-        except AttributeError:
+        # or a pathlib.Path object (really anything that implements read_bytes())
+        rawdata = None
+        for fun in "read", "read_bytes":
+            try:
+                rawdata = getattr(img, fun)()
+            except AttributeError:
+                pass
+        if rawdata is None:
             if not isinstance(img, (str, bytes)):
-                raise TypeError("Neither implements read() nor is str or bytes")
+                raise TypeError("Neither read(), read_bytes() nor is str or bytes")
             # the thing doesn't have a read() function, so try if we can treat
             # it as a file name
             try:
@@ -2344,6 +2701,10 @@ def convert(*images, **kwargs):
                 rawdata = f.read()
                 f.close()
 
+        # md5 = hashlib.md5(rawdata).hexdigest()
+        # with open("./testdata/" + md5, "wb") as f:
+        #    f.write(rawdata)
+
         for (
             color,
             ndpi,
@@ -2362,6 +2723,7 @@ def convert(*images, **kwargs):
             kwargs["colorspace"],
             kwargs["first_frame_only"],
             kwargs["rotation"],
+            kwargs["include_thumbnails"],
         ):
             pagewidth, pageheight, imgwidthpdf, imgheightpdf = kwargs["layout_fun"](
                 imgwidthpx, imgheightpx, ndpi
@@ -2737,7 +3099,7 @@ def valid_date(string):
     else:
         try:
             return parser.parse(string)
-        except TypeError:
+        except:
             pass
     # as a last resort, try the local date utility
     try:
@@ -2750,7 +3112,7 @@ def valid_date(string):
         except subprocess.CalledProcessError:
             pass
         else:
-            return datetime.utcfromtimestamp(int(utime))
+            return datetime.fromtimestamp(int(utime))
     raise argparse.ArgumentTypeError("cannot parse date: %s" % string)
 
 
@@ -3452,7 +3814,18 @@ def gui():
     app.mainloop()
 
 
-def main(argv=sys.argv):
+def get_default_icc_profile():
+    for profile in [
+        "/usr/share/color/icc/sRGB.icc",
+        "/usr/share/color/icc/OpenICC/sRGB.icc",
+        "/usr/share/color/icc/colord/sRGB.icc",
+    ]:
+        if os.path.exists(profile):
+            return profile
+    return "/usr/share/color/icc/sRGB.icc"
+
+
+def get_main_parser():
     rendered_papersizes = ""
     for k, v in sorted(papersizes.items()):
         rendered_papersizes += "    %-8s %s\n" % (papernames[k], v)
@@ -3493,7 +3866,9 @@ Paper sizes:
   the value in the second column has the same effect as giving the short hand
   in the first column. Appending ^T (a caret/circumflex followed by the letter
   T) turns the paper size from portrait into landscape. The postfix thus
-  symbolizes the transpose. The values are case insensitive.
+  symbolizes the transpose. Note that on Windows cmd.exe the caret symbol is
+  the escape character, so you need to put quotes around the option value.
+  The values are case insensitive.
 
 %s
 
@@ -3560,7 +3935,7 @@ Examples:
   while preserving its aspect ratio and a print border of 2 cm on the top and
   bottom and 2.5 cm on the left and right hand side.
 
-    $ img2pdf --output out.pdf --pagesize A4^T --border 2cm:2.5cm *.jpg
+    $ img2pdf --output out.pdf --pagesize "A4^T" --border 2cm:2.5cm *.jpg
 
   On each A4 page, fit images into a 10 cm times 15 cm rectangle but keep the
   original image size if the image is smaller than that.
@@ -3696,6 +4071,17 @@ RGB.""",
     )
 
     outargs.add_argument(
+        "--include-thumbnails",
+        action="store_true",
+        help="Some multi-frame formats like MPO carry a main image and "
+        "one or more scaled-down copies of the main image (thumbnails). "
+        "In such a case, img2pdf will only include the main image and "
+        "not create additional pages for each of the thumbnails. If this "
+        "option is set, img2pdf will instead create one page per frame and "
+        "thus store each thumbnail on its own page.",
+    )
+
+    outargs.add_argument(
         "--pillow-limit-break",
         action="store_true",
         help="img2pdf uses the Python Imaging Library Pillow to read input "
@@ -3706,14 +4092,20 @@ RGB.""",
         % Image.MAX_IMAGE_PIXELS,
     )
 
-    outargs.add_argument(
-        "--pdfa",
-        nargs="?",
-        const="/usr/share/color/icc/sRGB.icc",
-        default=None,
-        help="Output a PDF/A-1b compliant document. By default, this will "
-        "embed /usr/share/color/icc/sRGB.icc as the color profile.",
-    )
+    if sys.platform == "win32":
+        pass
+    else:
+        outargs.add_argument(
+            "--pdfa",
+            nargs="?",
+            const=get_default_icc_profile(),
+            default=None,
+            help="Output a PDF/A-1b compliant document. By default, this will "
+            "embed either /usr/share/color/icc/sRGB.icc, "
+            "/usr/share/color/icc/OpenICC/sRGB.icc or "
+            "/usr/share/color/icc/colord/sRGB.icc as the color profile, whichever "
+            "is found to exist first.",
+        )
 
     sizeargs = parser.add_argument_group(
         title="Image and page size and layout arguments",
@@ -4002,8 +4394,11 @@ and left/right, respectively. It is not possible to specify asymmetric borders.
         action="store_true",
         help="Instruct the PDF viewer to open the PDF in fullscreen mode",
     )
+    return parser
 
-    args = parser.parse_args(argv[1:])
+
+def main(argv=sys.argv):
+    args = get_main_parser().parse_args(argv[1:])
 
     if args.verbose:
         logging.basicConfig(level=logging.DEBUG)
@@ -4027,7 +4422,11 @@ and left/right, respectively. It is not possible to specify asymmetric borders.
     elif len(args.images) == 0 and len(args.from_file) == 0:
         # if no positional arguments were supplied, read a single image from
         # standard input
-        logger.info("reading image from standard input")
+        print(
+            "Reading image from standard input...\n"
+            "Re-run with -h or --help for usage information.",
+            file=sys.stderr,
+        )
         try:
             images = [sys.stdin.buffer.read()]
         except KeyboardInterrupt:
@@ -4088,6 +4487,7 @@ and left/right, respectively. It is not possible to specify asymmetric borders.
             artborder=args.art_border,
             pdfa=args.pdfa,
             rotation=args.rotation,
+            include_thumbnails=args.include_thumbnails,
         )
     except Exception as e:
         logger.error("error: " + str(e))
author	Johannes Schauer Marin Rodrigues <josch@debian.org>	2023-10-28 10:21:25 +0200
committer	Johannes Schauer Marin Rodrigues <josch@debian.org>	2023-10-28 10:21:25 +0200
commit	d6d4451cf308865725d044a4331bda9ba7ec066d (patch)
tree	378aa1e2bc45e361ca5aedff0a841bd41a4d5341 /src/img2pdf.py
parent	7abe2f2f089f38a0ba403da8f1459f5c6bf2ffa6 (diff)