Diffstat (limited to 'synapse/rest/media')
-rw-r--r--  synapse/rest/media/v1/__init__.py              |  38
-rw-r--r--  synapse/rest/media/v1/oembed.py                |  28
-rw-r--r--  synapse/rest/media/v1/preview_url_resource.py  | 186
-rw-r--r--  synapse/rest/media/v1/thumbnailer.py           |  21
4 files changed, 174 insertions(+), 99 deletions(-)
diff --git a/synapse/rest/media/v1/__init__.py b/synapse/rest/media/v1/__init__.py
index 3dd16d4b..d5b74cdd 100644
--- a/synapse/rest/media/v1/__init__.py
+++ b/synapse/rest/media/v1/__init__.py
@@ -12,33 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import PIL.Image
+from PIL.features import check_codec
# check for JPEG support.
-try:
- PIL.Image._getdecoder("rgb", "jpeg", None)
-except OSError as e:
- if str(e).startswith("decoder jpeg not available"):
- raise Exception(
- "FATAL: jpeg codec not supported. Install pillow correctly! "
- " 'sudo apt-get install libjpeg-dev' then 'pip uninstall pillow &&"
- " pip install pillow --user'"
- )
-except Exception:
- # any other exception is fine
- pass
+if not check_codec("jpg"):
+ raise Exception(
+ "FATAL: jpeg codec not supported. Install pillow correctly! "
+ " 'sudo apt-get install libjpeg-dev' then 'pip uninstall pillow &&"
+ " pip install pillow --user'"
+ )
# check for PNG support.
-try:
- PIL.Image._getdecoder("rgb", "zip", None)
-except OSError as e:
- if str(e).startswith("decoder zip not available"):
- raise Exception(
- "FATAL: zip codec not supported. Install pillow correctly! "
- " 'sudo apt-get install libjpeg-dev' then 'pip uninstall pillow &&"
- " pip install pillow --user'"
- )
-except Exception:
- # any other exception is fine
- pass
+if not check_codec("zlib"):
+ raise Exception(
+ "FATAL: zip codec not supported. Install pillow correctly! "
+ " 'sudo apt-get install libjpeg-dev' then 'pip uninstall pillow &&"
+ " pip install pillow --user'"
+ )
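The replacement probes codec support through Pillow's public feature API instead of the private `Image._getdecoder` hook. A minimal standalone sketch of the same check, assuming only that Pillow is installed (the loop and the Debian package names are illustrative, not taken from this diff; "jpg" and "zlib" are Pillow's feature names for the JPEG and PNG/zip codec modules):

from PIL.features import check_codec

# check_codec returns False when the codec module was not compiled in.
for codec, package in (("jpg", "libjpeg-dev"), ("zlib", "zlib1g-dev")):
    if not check_codec(codec):
        raise Exception(
            f"FATAL: {codec} codec not supported. Install pillow correctly! "
            f"'sudo apt-get install {package}' then reinstall pillow"
        )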
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
index e04671fb..78b1603f 100644
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@@ -96,6 +96,32 @@ class OEmbedProvider:
# No match.
return None
+ def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+ """
+ Search an HTML document for oEmbed autodiscovery information.
+
+ Args:
+ tree: The parsed HTML body.
+
+ Returns:
+ The URL to use for oEmbed information, or None if no URL was found.
+ """
+ # Search for link elements with the proper rel and type attributes.
+ for tag in tree.xpath(
+ "//link[@rel='alternate'][@type='application/json+oembed']"
+ ):
+ if "href" in tag.attrib:
+ return tag.attrib["href"]
+
+ # Some providers (e.g. Flickr) use alternative instead of alternate.
+ for tag in tree.xpath(
+ "//link[@rel='alternative'][@type='application/json+oembed']"
+ ):
+ if "href" in tag.attrib:
+ return tag.attrib["href"]
+
+ return None
+
def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
"""
Parse the oEmbed response into an Open Graph response.
@@ -165,7 +191,7 @@ class OEmbedProvider:
except Exception as e:
# Trap any exception and let the code follow as usual.
- logger.warning(f"Error parsing oEmbed metadata from {url}: {e:r}")
+ logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
open_graph_response = {}
cache_age = None
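For reference, the autodiscovery added above boils down to two XPath queries over the parsed document. A self-contained sketch of the same lookup (the `find_oembed_url` wrapper is hypothetical; Synapse's method receives an already-parsed tree):

from typing import Optional
from lxml import etree

def find_oembed_url(html: bytes) -> Optional[str]:
    parser = etree.HTMLParser(recover=True, encoding="utf-8")
    tree = etree.fromstring(html, parser)
    if tree is None:
        return None
    # "alternative" is the Flickr-ism mentioned above; check both rel values.
    for rel in ("alternate", "alternative"):
        for tag in tree.xpath(
            f"//link[@rel='{rel}'][@type='application/json+oembed']"
        ):
            if "href" in tag.attrib:
                return tag.attrib["href"]
    return None

print(find_oembed_url(
    b'<html><head><link rel="alternate" '
    b'type="application/json+oembed" href="https://example.com/oembed">'
    b"</head></html>"
))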
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 79a42b24..1fe0fc8a 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -22,7 +22,7 @@ import re
import shutil
import sys
import traceback
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
from urllib import parse as urlparse
import attr
@@ -73,6 +73,7 @@ OG_TAG_VALUE_MAXLEN = 1000
ONE_HOUR = 60 * 60 * 1000
ONE_DAY = 24 * ONE_HOUR
+IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY
@attr.s(slots=True, frozen=True, auto_attribs=True)
@@ -295,22 +296,32 @@ class PreviewUrlResource(DirectServeJsonResource):
body = file.read()
encoding = get_html_media_encoding(body, media_info.media_type)
- og = decode_and_calc_og(body, media_info.uri, encoding)
-
- await self._precache_image_url(user, media_info, og)
-
- elif oembed_url and _is_json(media_info.media_type):
- # Handle an oEmbed response.
- with open(media_info.filename, "rb") as file:
- body = file.read()
-
- oembed_response = self._oembed.parse_oembed_response(url, body)
- og = oembed_response.open_graph_result
-
- # Use the cache age from the oEmbed result, instead of the HTTP response.
- if oembed_response.cache_age is not None:
- expiration_ms = oembed_response.cache_age
+ tree = decode_body(body, encoding)
+ if tree is not None:
+ # Check if this HTML document points to oEmbed information and
+ # defer to that.
+ oembed_url = self._oembed.autodiscover_from_html(tree)
+ og = {}
+ if oembed_url:
+ oembed_info = await self._download_url(oembed_url, user)
+ og, expiration_ms = await self._handle_oembed_response(
+ url, oembed_info, expiration_ms
+ )
+
+ # If there was no oEmbed URL (or oEmbed parsing failed), attempt
+ # to generate the Open Graph information from the HTML.
+ if not oembed_url or not og:
+ og = _calc_og(tree, media_info.uri)
+
+ await self._precache_image_url(user, media_info, og)
+ else:
+ og = {}
+ elif oembed_url:
+ # Handle the oEmbed information.
+ og, expiration_ms = await self._handle_oembed_response(
+ url, media_info, expiration_ms
+ )
await self._precache_image_url(user, media_info, og)
else:
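The control flow above is easier to see outside the diff: parse the body once, prefer oEmbed when the page advertises an endpoint, and fall back to scraping the tree when it doesn't (or when oEmbed parsing yields nothing). A simplified, hypothetical sketch; the real method also threads `expiration_ms` through, uses `media_info.uri`, and precaches images afterwards:

async def _preview_html(self, body: bytes, encoding: str, url: str, user) -> dict:
    og: dict = {}
    tree = decode_body(body, encoding)
    if tree is None:
        return og
    oembed_url = self._oembed.autodiscover_from_html(tree)
    if oembed_url:
        oembed_info = await self._download_url(oembed_url, user)
        og, _ = await self._handle_oembed_response(url, oembed_info, 0)
    if not og:  # no advertised oEmbed endpoint, or oEmbed parsing failed
        og = _calc_og(tree, url)
    return og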
@@ -478,6 +489,39 @@ class PreviewUrlResource(DirectServeJsonResource):
else:
del og["og:image"]
+ async def _handle_oembed_response(
+ self, url: str, media_info: MediaInfo, expiration_ms: int
+ ) -> Tuple[JsonDict, int]:
+ """
+ Parse the downloaded oEmbed info.
+
+ Args:
+ url: The URL which is being previewed (not the one which was
+ requested).
+ media_info: The media being previewed.
+ expiration_ms: The length of time, in milliseconds, the media is valid for.
+
+ Returns:
+ A tuple of:
+ The Open Graph dictionary, if the oEmbed info can be parsed.
+ The (possibly updated) length of time, in milliseconds, the media is valid for.
+ """
+ # If JSON was not returned, there's nothing to do.
+ if not _is_json(media_info.media_type):
+ return {}, expiration_ms
+
+ with open(media_info.filename, "rb") as file:
+ body = file.read()
+
+ oembed_response = self._oembed.parse_oembed_response(url, body)
+ open_graph_result = oembed_response.open_graph_result
+
+ # Use the cache age from the oEmbed result, if one was given.
+ if open_graph_result and oembed_response.cache_age is not None:
+ expiration_ms = oembed_response.cache_age
+
+ return open_graph_result, expiration_ms
+
def _start_expire_url_cache_data(self) -> Deferred:
return run_as_background_process(
"expire_url_cache_data", self._expire_url_cache_data
@@ -496,6 +540,27 @@ class PreviewUrlResource(DirectServeJsonResource):
logger.info("Still running DB updates; skipping expiry")
return
+ def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
+ """Attempt to remove the given chain of parent directories
+
+ Args:
+ dirs: The list of directory paths to delete, with children appearing
+ before their parents.
+ """
+ for dir in dirs:
+ try:
+ os.rmdir(dir)
+ except FileNotFoundError:
+ # Already deleted, continue with deleting the rest
+ pass
+ except OSError as e:
+ # Failed, skip deleting the rest of the parent dirs
+ if e.errno != errno.ENOTEMPTY:
+ logger.warning(
+ "Failed to remove media directory: %r: %s", dir, e
+ )
+ break
+
# First we delete expired url cache entries
media_ids = await self.store.get_expired_url_cache(now)
@@ -504,20 +569,16 @@ class PreviewUrlResource(DirectServeJsonResource):
fname = self.filepaths.url_cache_filepath(media_id)
try:
os.remove(fname)
+ except FileNotFoundError:
+ pass # If the path doesn't exist, meh
except OSError as e:
- # If the path doesn't exist, meh
- if e.errno != errno.ENOENT:
- logger.warning("Failed to remove media: %r: %s", media_id, e)
- continue
+ logger.warning("Failed to remove media: %r: %s", media_id, e)
+ continue
removed_media.append(media_id)
- try:
- dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
- for dir in dirs:
- os.rmdir(dir)
- except Exception:
- pass
+ dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
+ try_remove_parent_dirs(dirs)
await self.store.delete_url_cache(removed_media)
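A hedged usage sketch of the new helper, treating the closure above as a free function for illustration. It assumes the `dirs` list is ordered leaf-first, as the `*_dirs_to_delete` helpers return it:

import os
import tempfile

root = tempfile.mkdtemp()
leaf = os.path.join(root, "ab", "cd")
os.makedirs(leaf)

# Leaf-first ordering; the helper stops at the first non-empty parent
# instead of raising, so an ancestor shared with other media is left alone.
try_remove_parent_dirs([leaf, os.path.join(root, "ab")])
assert not os.path.exists(leaf)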
@@ -530,7 +591,7 @@ class PreviewUrlResource(DirectServeJsonResource):
# These may be cached for a bit on the client (i.e., they
# may have a room open with a preview url thing open).
# So we wait a couple of days before deleting, just in case.
- expire_before = now - 2 * ONE_DAY
+ expire_before = now - IMAGE_CACHE_EXPIRY_MS
media_ids = await self.store.get_url_cache_media_before(expire_before)
removed_media = []
@@ -538,36 +599,30 @@ class PreviewUrlResource(DirectServeJsonResource):
fname = self.filepaths.url_cache_filepath(media_id)
try:
os.remove(fname)
+ except FileNotFoundError:
+ pass # If the path doesn't exist, meh
except OSError as e:
- # If the path doesn't exist, meh
- if e.errno != errno.ENOENT:
- logger.warning("Failed to remove media: %r: %s", media_id, e)
- continue
+ logger.warning("Failed to remove media: %r: %s", media_id, e)
+ continue
- try:
- dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
- for dir in dirs:
- os.rmdir(dir)
- except Exception:
- pass
+ dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
+ try_remove_parent_dirs(dirs)
thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
try:
shutil.rmtree(thumbnail_dir)
+ except FileNotFoundError:
+ pass # If the path doesn't exist, meh
except OSError as e:
- # If the path doesn't exist, meh
- if e.errno != errno.ENOENT:
- logger.warning("Failed to remove media: %r: %s", media_id, e)
- continue
+ logger.warning("Failed to remove media: %r: %s", media_id, e)
+ continue
removed_media.append(media_id)
- try:
- dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
- for dir in dirs:
- os.rmdir(dir)
- except Exception:
- pass
+ dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
+ # Note that one of the directories to be deleted has already been
+ # removed by the `rmtree` above.
+ try_remove_parent_dirs(dirs)
await self.store.delete_url_cache_media(removed_media)
@@ -619,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
return "utf-8"
-def decode_and_calc_og(
- body: bytes, media_uri: str, request_encoding: Optional[str] = None
-) -> JsonDict:
+def decode_body(
+ body: bytes, request_encoding: Optional[str] = None
+) -> Optional["etree.Element"]:
"""
- Calculate metadata for an HTML document.
-
- This uses lxml to parse the HTML document into the OG response. If errors
- occur during processing of the document, an empty response is returned.
+ This uses lxml to parse the HTML document.
Args:
body: The HTML document, as bytes.
- media_url: The URI used to download the body.
request_encoding: The character encoding of the body, as a string.
Returns:
- The OG response as a dictionary.
+ The parsed HTML body, or None if an error occurred during processing.
"""
# If there's no body, nothing useful is going to be found.
if not body:
- return {}
+ return None
from lxml import etree
@@ -650,25 +701,22 @@ def decode_and_calc_og(
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create HTML parser: %s" % (e,))
- return {}
-
- def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
- # Attempt to parse the body. If this fails, log and return no metadata.
- tree = etree.fromstring(body_attempt, parser)
-
- # The data was successfully parsed, but no tree was found.
- if tree is None:
- return {}
+ return None
- return _calc_og(tree, media_uri)
+ def _attempt_decode_body(
+ body_attempt: Union[bytes, str]
+ ) -> Optional["etree.Element"]:
+ # Attempt to parse the body. Returns None if the body was successfully
+ # parsed, but no tree was found.
+ return etree.fromstring(body_attempt, parser)
# Attempt to parse the body. If this fails, log and return no metadata.
try:
- return _attempt_calc_og(body)
+ return _attempt_decode_body(body)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
- return _attempt_calc_og(body.decode("utf-8", "ignore"))
+ return _attempt_decode_body(body.decode("utf-8", "ignore"))
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
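With the split in place, callers decode first and compute Open Graph data only when a tree came back; a hedged two-line usage sketch (the URL is a placeholder):

tree = decode_body(b"<html><head><title>hi</title></head></html>")
og = _calc_og(tree, "http://example.com/") if tree is not None else {}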
diff --git a/synapse/rest/media/v1/thumbnailer.py b/synapse/rest/media/v1/thumbnailer.py
index df54a406..46701a8b 100644
--- a/synapse/rest/media/v1/thumbnailer.py
+++ b/synapse/rest/media/v1/thumbnailer.py
@@ -61,9 +61,19 @@ class Thumbnailer:
self.transpose_method = None
try:
# We don't use ImageOps.exif_transpose since it crashes with big EXIF
- image_exif = self.image._getexif()
+ #
+ # Ignore safety: Pillow seems to acknowledge that this method is
+ # "private, experimental, but generally widely used". Pillow 6
+ # includes a public getexif() method (no underscore) that we might
+ # consider using instead when we can bump that dependency.
+ #
+ # At the time of writing, Debian buster (currently oldstable)
+ # provides version 5.4.1. It's expected to EOL in mid-2022, see
+ # https://wiki.debian.org/DebianReleases#Production_Releases
+ image_exif = self.image._getexif() # type: ignore
if image_exif is not None:
image_orientation = image_exif.get(EXIF_ORIENTATION_TAG)
+ assert isinstance(image_orientation, int)
self.transpose_method = EXIF_TRANSPOSE_MAPPINGS.get(image_orientation)
except Exception as e:
# A lot of parsing errors can happen when parsing EXIF
@@ -76,7 +86,10 @@ class Thumbnailer:
A tuple containing the new image size in pixels as (width, height).
"""
if self.transpose_method is not None:
- self.image = self.image.transpose(self.transpose_method)
+ # Safety: `transpose` takes an int rather than e.g. an IntEnum.
+ # self.transpose_method is set above to be a value in
+ # EXIF_TRANSPOSE_MAPPINGS, and that only contains correct values.
+ self.image = self.image.transpose(self.transpose_method) # type: ignore[arg-type]
self.width, self.height = self.image.size
self.transpose_method = None
# We don't need EXIF any more
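The orientation handling relies on the module's `EXIF_TRANSPOSE_MAPPINGS` table, which this diff doesn't show. The sketch below reconstructs the standard EXIF orientation mapping and the two-step read/apply pattern; the mapping values are my reconstruction of the usual table, not copied from the file, and `_getexif` only exists on JPEG images (hence the try/except in the real code):

from PIL import Image

EXIF_ORIENTATION_TAG = 0x0112  # 274

# Standard mapping from EXIF orientation values to Pillow transposes.
EXIF_TRANSPOSE_MAPPINGS = {
    2: Image.FLIP_LEFT_RIGHT,
    3: Image.ROTATE_180,
    4: Image.FLIP_TOP_BOTTOM,
    5: Image.TRANSPOSE,
    6: Image.ROTATE_270,
    7: Image.TRANSVERSE,
    8: Image.ROTATE_90,
}

def normalise_orientation(image: Image.Image) -> Image.Image:
    exif = image._getexif()  # private but widely used; see comment above
    if exif is None:
        return image
    method = EXIF_TRANSPOSE_MAPPINGS.get(exif.get(EXIF_ORIENTATION_TAG))
    return image.transpose(method) if method is not None else image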
@@ -101,7 +114,7 @@ class Thumbnailer:
else:
return (max_height * self.width) // self.height, max_height
- def _resize(self, width: int, height: int) -> Image:
+ def _resize(self, width: int, height: int) -> Image.Image:
# 1-bit or 8-bit color palette images need converting to RGB
# otherwise they will be scaled using nearest neighbour which
# looks awful.
@@ -151,7 +164,7 @@ class Thumbnailer:
cropped = scaled_image.crop((crop_left, 0, crop_right, height))
return self._encode_image(cropped, output_type)
- def _encode_image(self, output_image: Image, output_type: str) -> BytesIO:
+ def _encode_image(self, output_image: Image.Image, output_type: str) -> BytesIO:
output_bytes_io = BytesIO()
fmt = self.FORMATS[output_type]
if fmt == "JPEG":
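The hunk cuts off inside `_encode_image`; for context, a minimal hedged sketch of encoding a Pillow image to an in-memory JPEG the way the JPEG branch does (the quality setting is an assumption, not taken from this diff):

from io import BytesIO
from PIL import Image

image = Image.new("RGB", (32, 32), "white")
output_bytes_io = BytesIO()
# JPEG has no alpha channel, so non-RGB modes must be converted first.
image.convert("RGB").save(output_bytes_io, "JPEG", quality=80)
output_bytes_io.seek(0)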