author    Andrej Shadura <andrewsh@debian.org>    2022-06-19 15:20:00 +0200
committer Andrej Shadura <andrewsh@debian.org>    2022-06-19 15:20:00 +0200
commit    6dc64c92c6991f09910f3e6db368e6eeb4b1981e (patch)
tree      d8bab73ee460e0a96bbda9c5988d8025dbbe2eb3 /synapse/rest/media
parent    c2d3cd76c24f663449bfa209ac920305f0501d3a (diff)
New upstream version 1.61.0
Diffstat (limited to 'synapse/rest/media')
-rw-r--r--  synapse/rest/media/v1/media_repository.py     | 358
-rw-r--r--  synapse/rest/media/v1/preview_html.py          |  64
-rw-r--r--  synapse/rest/media/v1/preview_url_resource.py  |  53
-rw-r--r--  synapse/rest/media/v1/thumbnailer.py           |  71
4 files changed, 370 insertions(+), 176 deletions(-)
diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py
index 3e5d6c62..7435fd91 100644
--- a/synapse/rest/media/v1/media_repository.py
+++ b/synapse/rest/media/v1/media_repository.py
@@ -65,7 +65,12 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
-UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000
+# How often to run the background job to update the "recently accessed"
+# attribute of local and remote media.
+UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 # 1 minute
+# How often to run the background job to check for local and remote media
+# that should be purged according to the configured media retention settings.
+MEDIA_RETENTION_CHECK_PERIOD_MS = 60 * 60 * 1000 # 1 hour
class MediaRepository:
@@ -122,11 +127,36 @@ class MediaRepository:
self._start_update_recently_accessed, UPDATE_RECENTLY_ACCESSED_TS
)
+ # Media retention configuration options
+ self._media_retention_local_media_lifetime_ms = (
+ hs.config.media.media_retention_local_media_lifetime_ms
+ )
+ self._media_retention_remote_media_lifetime_ms = (
+ hs.config.media.media_retention_remote_media_lifetime_ms
+ )
+
+ # Check whether local or remote media retention is configured
+ if (
+ hs.config.media.media_retention_local_media_lifetime_ms is not None
+ or hs.config.media.media_retention_remote_media_lifetime_ms is not None
+ ):
+ # Run the background job to apply media retention rules routinely,
+ # with the duration between runs dictated by the homeserver config.
+ self.clock.looping_call(
+ self._start_apply_media_retention_rules,
+ MEDIA_RETENTION_CHECK_PERIOD_MS,
+ )
+
def _start_update_recently_accessed(self) -> Deferred:
return run_as_background_process(
"update_recently_accessed_media", self._update_recently_accessed
)
+ def _start_apply_media_retention_rules(self) -> Deferred:
+ return run_as_background_process(
+ "apply_media_retention_rules", self._apply_media_retention_rules
+ )
+
async def _update_recently_accessed(self) -> None:
remote_media = self.recently_accessed_remotes
self.recently_accessed_remotes = set()
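The two lifetime options read from hs.config.media above arrive already converted to milliseconds. As a rough, hypothetical sketch (this is not Synapse's actual config parser), a duration string such as "14d" from the homeserver's media retention settings would map to these values like so:

    # Hypothetical helper, for illustration only (Synapse's real config
    # parser is more general): convert a duration string such as "14d"
    # into the millisecond lifetimes consumed by MediaRepository.
    UNIT_MS = {"s": 1_000, "m": 60_000, "h": 3_600_000, "d": 86_400_000}

    def duration_to_ms(duration: str) -> int:
        return int(duration[:-1]) * UNIT_MS[duration[-1]]

    assert duration_to_ms("14d") == 14 * 24 * 60 * 60 * 1000
    assert duration_to_ms("90d") == 90 * 24 * 60 * 60 * 1000

With either lifetime configured, the looping call registered above fires every MEDIA_RETENTION_CHECK_PERIOD_MS (hourly) and purges anything last accessed before now minus the configured lifetime.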
@@ -557,15 +587,16 @@ class MediaRepository:
)
return None
- t_byte_source = await defer_to_thread(
- self.hs.get_reactor(),
- self._generate_thumbnail,
- thumbnailer,
- t_width,
- t_height,
- t_method,
- t_type,
- )
+ with thumbnailer:
+ t_byte_source = await defer_to_thread(
+ self.hs.get_reactor(),
+ self._generate_thumbnail,
+ thumbnailer,
+ t_width,
+ t_height,
+ t_method,
+ t_type,
+ )
if t_byte_source:
try:
@@ -627,15 +658,16 @@ class MediaRepository:
)
return None
- t_byte_source = await defer_to_thread(
- self.hs.get_reactor(),
- self._generate_thumbnail,
- thumbnailer,
- t_width,
- t_height,
- t_method,
- t_type,
- )
+ with thumbnailer:
+ t_byte_source = await defer_to_thread(
+ self.hs.get_reactor(),
+ self._generate_thumbnail,
+ thumbnailer,
+ t_width,
+ t_height,
+ t_method,
+ t_type,
+ )
if t_byte_source:
try:
@@ -719,124 +751,182 @@ class MediaRepository:
)
return None
- m_width = thumbnailer.width
- m_height = thumbnailer.height
+ with thumbnailer:
+ m_width = thumbnailer.width
+ m_height = thumbnailer.height
- if m_width * m_height >= self.max_image_pixels:
- logger.info(
- "Image too large to thumbnail %r x %r > %r",
- m_width,
- m_height,
- self.max_image_pixels,
- )
- return None
-
- if thumbnailer.transpose_method is not None:
- m_width, m_height = await defer_to_thread(
- self.hs.get_reactor(), thumbnailer.transpose
- )
-
- # We deduplicate the thumbnail sizes by ignoring the cropped versions if
- # they have the same dimensions of a scaled one.
- thumbnails: Dict[Tuple[int, int, str], str] = {}
- for requirement in requirements:
- if requirement.method == "crop":
- thumbnails.setdefault(
- (requirement.width, requirement.height, requirement.media_type),
- requirement.method,
- )
- elif requirement.method == "scale":
- t_width, t_height = thumbnailer.aspect(
- requirement.width, requirement.height
+ if m_width * m_height >= self.max_image_pixels:
+ logger.info(
+ "Image too large to thumbnail %r x %r > %r",
+ m_width,
+ m_height,
+ self.max_image_pixels,
)
- t_width = min(m_width, t_width)
- t_height = min(m_height, t_height)
- thumbnails[
- (t_width, t_height, requirement.media_type)
- ] = requirement.method
-
- # Now we generate the thumbnails for each dimension, store it
- for (t_width, t_height, t_type), t_method in thumbnails.items():
- # Generate the thumbnail
- if t_method == "crop":
- t_byte_source = await defer_to_thread(
- self.hs.get_reactor(), thumbnailer.crop, t_width, t_height, t_type
+ return None
+
+ if thumbnailer.transpose_method is not None:
+ m_width, m_height = await defer_to_thread(
+ self.hs.get_reactor(), thumbnailer.transpose
)
- elif t_method == "scale":
- t_byte_source = await defer_to_thread(
- self.hs.get_reactor(), thumbnailer.scale, t_width, t_height, t_type
+
+ # We deduplicate the thumbnail sizes by ignoring the cropped versions if
+ # they have the same dimensions as a scaled one.
+ thumbnails: Dict[Tuple[int, int, str], str] = {}
+ for requirement in requirements:
+ if requirement.method == "crop":
+ thumbnails.setdefault(
+ (requirement.width, requirement.height, requirement.media_type),
+ requirement.method,
+ )
+ elif requirement.method == "scale":
+ t_width, t_height = thumbnailer.aspect(
+ requirement.width, requirement.height
+ )
+ t_width = min(m_width, t_width)
+ t_height = min(m_height, t_height)
+ thumbnails[
+ (t_width, t_height, requirement.media_type)
+ ] = requirement.method
+
+ # Now we generate the thumbnails for each dimension, store it
+ for (t_width, t_height, t_type), t_method in thumbnails.items():
+ # Generate the thumbnail
+ if t_method == "crop":
+ t_byte_source = await defer_to_thread(
+ self.hs.get_reactor(),
+ thumbnailer.crop,
+ t_width,
+ t_height,
+ t_type,
+ )
+ elif t_method == "scale":
+ t_byte_source = await defer_to_thread(
+ self.hs.get_reactor(),
+ thumbnailer.scale,
+ t_width,
+ t_height,
+ t_type,
+ )
+ else:
+ logger.error("Unrecognized method: %r", t_method)
+ continue
+
+ if not t_byte_source:
+ continue
+
+ file_info = FileInfo(
+ server_name=server_name,
+ file_id=file_id,
+ url_cache=url_cache,
+ thumbnail=ThumbnailInfo(
+ width=t_width,
+ height=t_height,
+ method=t_method,
+ type=t_type,
+ ),
)
- else:
- logger.error("Unrecognized method: %r", t_method)
- continue
-
- if not t_byte_source:
- continue
-
- file_info = FileInfo(
- server_name=server_name,
- file_id=file_id,
- url_cache=url_cache,
- thumbnail=ThumbnailInfo(
- width=t_width,
- height=t_height,
- method=t_method,
- type=t_type,
- ),
- )
- with self.media_storage.store_into_file(file_info) as (f, fname, finish):
- try:
- await self.media_storage.write_to_file(t_byte_source, f)
- await finish()
- finally:
- t_byte_source.close()
-
- t_len = os.path.getsize(fname)
-
- # Write to database
- if server_name:
- # Multiple remote media download requests can race (when
- # using multiple media repos), so this may throw a violation
- # constraint exception. If it does we'll delete the newly
- # generated thumbnail from disk (as we're in the ctx
- # manager).
- #
- # However: we've already called `finish()` so we may have
- # also written to the storage providers. This is preferable
- # to the alternative where we call `finish()` *after* this,
- # where we could end up having an entry in the DB but fail
- # to write the files to the storage providers.
+ with self.media_storage.store_into_file(file_info) as (
+ f,
+ fname,
+ finish,
+ ):
try:
- await self.store.store_remote_media_thumbnail(
- server_name,
- media_id,
- file_id,
- t_width,
- t_height,
- t_type,
- t_method,
- t_len,
- )
- except Exception as e:
- thumbnail_exists = await self.store.get_remote_media_thumbnail(
- server_name,
- media_id,
- t_width,
- t_height,
- t_type,
+ await self.media_storage.write_to_file(t_byte_source, f)
+ await finish()
+ finally:
+ t_byte_source.close()
+
+ t_len = os.path.getsize(fname)
+
+ # Write to database
+ if server_name:
+ # Multiple remote media download requests can race (when
+ # using multiple media repos), so this may throw a constraint
+ # violation exception. If it does we'll delete the newly
+ # generated thumbnail from disk (as we're in the ctx
+ # manager).
+ #
+ # However: we've already called `finish()` so we may have
+ # also written to the storage providers. This is preferable
+ # to the alternative where we call `finish()` *after* this,
+ # where we could end up having an entry in the DB but fail
+ # to write the files to the storage providers.
+ try:
+ await self.store.store_remote_media_thumbnail(
+ server_name,
+ media_id,
+ file_id,
+ t_width,
+ t_height,
+ t_type,
+ t_method,
+ t_len,
+ )
+ except Exception as e:
+ thumbnail_exists = (
+ await self.store.get_remote_media_thumbnail(
+ server_name,
+ media_id,
+ t_width,
+ t_height,
+ t_type,
+ )
+ )
+ if not thumbnail_exists:
+ raise e
+ else:
+ await self.store.store_local_thumbnail(
+ media_id, t_width, t_height, t_type, t_method, t_len
)
- if not thumbnail_exists:
- raise e
- else:
- await self.store.store_local_thumbnail(
- media_id, t_width, t_height, t_type, t_method, t_len
- )
return {"width": m_width, "height": m_height}
+ async def _apply_media_retention_rules(self) -> None:
+ """
+ Purge old local and remote media according to the media retention rules
+ defined in the homeserver config.
+ """
+ # Purge remote media
+ if self._media_retention_remote_media_lifetime_ms is not None:
+ # Calculate a threshold timestamp derived from the configured lifetime. Any
+ # media that has not been accessed since this timestamp will be removed.
+ remote_media_threshold_timestamp_ms = (
+ self.clock.time_msec() - self._media_retention_remote_media_lifetime_ms
+ )
+
+ logger.info(
+ "Purging remote media last accessed before"
+ f" {remote_media_threshold_timestamp_ms}"
+ )
+
+ await self.delete_old_remote_media(
+ before_ts=remote_media_threshold_timestamp_ms
+ )
+
+ # And now do the same for local media
+ if self._media_retention_local_media_lifetime_ms is not None:
+ # This works the same as the remote media threshold
+ local_media_threshold_timestamp_ms = (
+ self.clock.time_msec() - self._media_retention_local_media_lifetime_ms
+ )
+
+ logger.info(
+ "Purging local media last accessed before"
+ f" {local_media_threshold_timestamp_ms}"
+ )
+
+ await self.delete_old_local_media(
+ before_ts=local_media_threshold_timestamp_ms,
+ keep_profiles=True,
+ delete_quarantined_media=False,
+ delete_protected_media=False,
+ )
+
async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]:
- old_media = await self.store.get_remote_media_before(before_ts)
+ old_media = await self.store.get_remote_media_ids(
+ before_ts, include_quarantined_media=False
+ )
deleted = 0
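The deduplication map built earlier in this hunk deserves a standalone illustration: crop requirements are inserted with setdefault, while scale requirements assign directly, so when a crop and a scale land on the same (width, height, type) key the scaled variant wins.

    # Standalone sketch of the thumbnail-size deduplication rule.
    thumbnails: dict = {}
    thumbnails.setdefault((32, 32, "image/png"), "crop")  # crop never overwrites
    thumbnails[(32, 32, "image/png")] = "scale"           # scale always overwrites
    thumbnails.setdefault((32, 32, "image/png"), "crop")  # still "scale"
    assert thumbnails == {(32, 32, "image/png"): "scale"}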
@@ -889,6 +979,8 @@ class MediaRepository:
before_ts: int,
size_gt: int = 0,
keep_profiles: bool = True,
+ delete_quarantined_media: bool = False,
+ delete_protected_media: bool = False,
) -> Tuple[List[str], int]:
"""
Delete local or remote media from this server by size and timestamp. Removes
@@ -896,18 +988,22 @@ class MediaRepository:
Args:
before_ts: Unix timestamp in ms.
- Files that were last used before this timestamp will be deleted
- size_gt: Size of the media in bytes. Files that are larger will be deleted
+ Files that were last used before this timestamp will be deleted.
+ size_gt: Size of the media in bytes. Files that are larger will be deleted.
keep_profiles: Switch to delete also files that are still used in image data
- (e.g user profile, room avatar)
- If false these files will be deleted
+ (e.g. user profile, room avatar). If False, these files will be deleted.
+ delete_quarantined_media: If True, media marked as quarantined will be deleted.
+ delete_protected_media: If True, media marked as protected will be deleted.
+
Returns:
A tuple of (list of deleted media IDs, total deleted media IDs).
"""
- old_media = await self.store.get_local_media_before(
+ old_media = await self.store.get_local_media_ids(
before_ts,
size_gt,
keep_profiles,
+ include_quarantined_media=delete_quarantined_media,
+ include_protected_media=delete_protected_media,
)
return await self._remove_local_media_from_disk(old_media)
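Because the two new parameters default to False, existing callers of delete_old_local_media keep their behaviour, while the retention job passes them explicitly. A minimal, hypothetical caller (media_repo stands in for a real MediaRepository instance; the cutoff value is illustrative):

    # Sketch only: remove local media last used before cutoff_ms, sparing
    # profile images; quarantined and protected media survive thanks to
    # the new defaults.
    async def prune_local_media(media_repo, cutoff_ms: int) -> int:
        media_ids, total = await media_repo.delete_old_local_media(
            before_ts=cutoff_ms,
            keep_profiles=True,
        )
        return total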
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index ca73965f..ed8f21a4 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -30,6 +30,9 @@ _xml_encoding_match = re.compile(
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
+# Certain elements aren't meant for display.
+ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
+
def _normalise_encoding(encoding: str) -> Optional[str]:
"""Use the Python codec's name as the normalised entry."""
@@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
og: Dict[str, Optional[str]] = {}
- for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
- if "content" in tag.attrib:
- # if we've got more than 50 tags, someone is taking the piss
- if len(og) >= 50:
- logger.warning("Skipping OG for page with too many 'og:' tags")
- return {}
- og[tag.attrib["property"]] = tag.attrib["content"]
+ for tag in tree.xpath(
+ "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
+ ):
+ # if we've got more than 50 tags, someone is taking the piss
+ if len(og) >= 50:
+ logger.warning("Skipping OG for page with too many 'og:' tags")
+ return {}
+
+ og[tag.attrib["property"]] = tag.attrib["content"]
# TODO: grab article: meta tags too, e.g.:
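The tightened XPath above filters out og: meta tags with a missing or empty content attribute at query time rather than in Python. A standalone check with lxml (the library Synapse uses for preview parsing); the sample HTML is illustrative:

    # Demonstrates the new predicate: tags without content, or with empty
    # content, are no longer returned at all.
    from lxml import etree

    html = (
        "<html><head>"
        "<meta property='og:title' content='Example'/>"
        "<meta property='og:image' content=''/>"
        "<meta property='og:type'/>"
        "</head></html>"
    )
    tree = etree.HTML(html)
    tags = tree.xpath(
        "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
    )
    assert [t.attrib["property"] for t in tags] == ["og:title"]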
@@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
if "og:title" not in og:
- # do some basic spidering of the HTML
- title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
- if title and title[0].text is not None:
- og["og:title"] = title[0].text.strip()
+ # Attempt to find a title from the title tag, or the biggest header on the page.
+ title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
+ if title:
+ og["og:title"] = title[0].strip()
else:
og["og:title"] = None
if "og:image" not in og:
- # TODO: extract a favicon failing all else
meta_image = tree.xpath(
- "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
+ "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
)
+ # If a meta image is found, use it.
if meta_image:
og["og:image"] = meta_image[0]
else:
+ # Try to find images which are larger than 10px by 10px.
+ #
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
images = sorted(
@@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
),
)
+ # If no images were found, try to find *any* images.
if not images:
- images = tree.xpath("//img[@src]")
+ images = tree.xpath("//img[@src][1]")
if images:
og["og:image"] = images[0].attrib["src"]
+ # Finally, fallback to the favicon if nothing else.
+ else:
+ favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
+ if favicons:
+ og["og:image"] = favicons[0]
+
if "og:description" not in og:
+ # Check the first meta description tag for content.
meta_description = tree.xpath(
- "//*/meta"
- "[translate(@name, 'DESCRIPTION', 'description')='description']"
- "/@content"
+ "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
)
+ # If a meta description is found with content, use it.
if meta_description:
og["og:description"] = meta_description[0]
else:
@@ -246,7 +260,9 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
Grabs any text nodes which are inside the <body/> tag, unless they are within
an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or
- if they are within a <script/> or <style/> tag.
+ if they are within a <script/>, <svg/> or <style/> tag, or if they are within
+ a tag whose content is usually only shown to old browsers
+ (<iframe/>, <video/>, <canvas/>, <picture/>).
This is a very very very coarse approximation to a plain text render of the page.
@@ -268,6 +284,12 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"script",
"noscript",
"style",
+ "svg",
+ "iframe",
+ "video",
+ "canvas",
+ "img",
+ "picture",
etree.Comment,
)
@@ -281,7 +303,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
def _iterate_over_text(
- tree: "etree.Element", *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
+ tree: "etree.Element", *tags_to_ignore: Union[str, "etree.Comment"]
) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion,
skipping text nodes inside certain tags.
@@ -298,6 +320,10 @@ def _iterate_over_text(
if isinstance(el, str):
yield el
elif el.tag not in tags_to_ignore:
+ # If the element isn't meant for display, ignore it.
+ if el.get("role") in ARIA_ROLES_TO_IGNORE:
+ continue
+
# el.text is the text before the first child, so we can immediately
# return it if the text exists.
if el.text:
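The new role check means elements marked with navigation-style ARIA roles are dropped from the plain-text description even when their tag is otherwise allowed. A small standalone demonstration of the same filtering idea (note: the real _iterate_over_text prunes the entire subtree on continue; this flat version only illustrates the role check itself):

    # Mimics the new check: skip elements whose role marks them as
    # navigation/toolbar chrome rather than page content.
    from lxml import etree

    ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}

    tree = etree.HTML("<body><div role='menu'>Skip me</div><p>Keep me</p></body>")
    texts = [
        el.text
        for el in tree.iter()
        if isinstance(el.tag, str)  # skip comments
        and el.get("role") not in ARIA_ROLES_TO_IGNORE
        and el.text and el.text.strip()
    ]
    assert texts == ["Keep me"]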
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 50383bdb..54a849ea 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -586,12 +586,16 @@ class PreviewUrlResource(DirectServeJsonResource):
og: The Open Graph dictionary. This is modified with image information.
"""
# If there's no image or it is blank, there's nothing to do.
- if "og:image" not in og or not og["og:image"]:
+ if "og:image" not in og:
+ return
+
+ # Remove the raw image URL, this will be replaced with an MXC URL, if successful.
+ image_url = og.pop("og:image")
+ if not image_url:
return
# The image URL from the HTML might be relative to the previewed page,
# convert it to an URL which can be requested directly.
- image_url = og["og:image"]
url_parts = urlparse(image_url)
if url_parts.scheme != "data":
image_url = urljoin(media_info.uri, image_url)
@@ -599,7 +603,16 @@ class PreviewUrlResource(DirectServeJsonResource):
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
- image_info = await self._handle_url(image_url, user, allow_data_urls=True)
+ try:
+ image_info = await self._handle_url(image_url, user, allow_data_urls=True)
+ except Exception as e:
+ # Pre-caching the image failed, don't block the entire URL preview.
+ logger.warning(
+ "Pre-caching image failed during URL preview: %s errored with %s",
+ image_url,
+ e,
+ )
+ return
if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
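The try/except above turns a failed image fetch into a degraded preview instead of a failed one, and pairs with popping og:image up front so no cleanup is needed on the failure path. The shape of that pattern, as a self-contained sketch (names and URLs hypothetical, not Synapse's API):

    # Sketch: optional enrichment that logs and degrades on failure.
    import asyncio
    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger(__name__)

    async def fetch_image_info(url: str) -> dict:
        raise RuntimeError("connection reset")  # simulate a failed fetch

    async def build_preview(og: dict) -> dict:
        # The image URL is popped up front and only re-added on success.
        image_url = og.pop("og:image", None)
        if image_url:
            try:
                og["og:image:info"] = await fetch_image_info(image_url)
            except Exception as e:
                logger.warning(
                    "Pre-caching image failed during URL preview: %s errored with %s",
                    image_url,
                    e,
                )
        return og

    assert asyncio.run(
        build_preview({"og:title": "t", "og:image": "http://x/y.png"})
    ) == {"og:title": "t"}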
@@ -611,13 +624,11 @@ class PreviewUrlResource(DirectServeJsonResource):
og["og:image:width"] = dims["width"]
og["og:image:height"] = dims["height"]
else:
- logger.warning("Couldn't get dims for %s", og["og:image"])
+ logger.warning("Couldn't get dims for %s", image_url)
og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
og["og:image:type"] = image_info.media_type
og["matrix:image:size"] = image_info.media_length
- else:
- del og["og:image"]
async def _handle_oembed_response(
self, url: str, media_info: MediaInfo, expiration_ms: int
@@ -668,7 +679,7 @@ class PreviewUrlResource(DirectServeJsonResource):
logger.debug("Running url preview cache expiry")
if not (await self.store.db_pool.updates.has_completed_background_updates()):
- logger.info("Still running DB updates; skipping expiry")
+ logger.debug("Still running DB updates; skipping url preview cache expiry")
return
def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
@@ -688,7 +699,9 @@ class PreviewUrlResource(DirectServeJsonResource):
# Failed, skip deleting the rest of the parent dirs
if e.errno != errno.ENOTEMPTY:
logger.warning(
- "Failed to remove media directory: %r: %s", dir, e
+ "Failed to remove media directory while clearing url preview cache: %r: %s",
+ dir,
+ e,
)
break
@@ -703,7 +716,11 @@ class PreviewUrlResource(DirectServeJsonResource):
except FileNotFoundError:
pass # If the path doesn't exist, meh
except OSError as e:
- logger.warning("Failed to remove media: %r: %s", media_id, e)
+ logger.warning(
+ "Failed to remove media while clearing url preview cache: %r: %s",
+ media_id,
+ e,
+ )
continue
removed_media.append(media_id)
@@ -714,9 +731,11 @@ class PreviewUrlResource(DirectServeJsonResource):
await self.store.delete_url_cache(removed_media)
if removed_media:
- logger.info("Deleted %d entries from url cache", len(removed_media))
+ logger.debug(
+ "Deleted %d entries from url preview cache", len(removed_media)
+ )
else:
- logger.debug("No entries removed from url cache")
+ logger.debug("No entries removed from url preview cache")
# Now we delete old images associated with the url cache.
# These may be cached for a bit on the client (i.e., they
@@ -733,7 +752,9 @@ class PreviewUrlResource(DirectServeJsonResource):
except FileNotFoundError:
pass # If the path doesn't exist, meh
except OSError as e:
- logger.warning("Failed to remove media: %r: %s", media_id, e)
+ logger.warning(
+ "Failed to remove media from url preview cache: %r: %s", media_id, e
+ )
continue
dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
@@ -745,7 +766,9 @@ class PreviewUrlResource(DirectServeJsonResource):
except FileNotFoundError:
pass # If the path doesn't exist, meh
except OSError as e:
- logger.warning("Failed to remove media: %r: %s", media_id, e)
+ logger.warning(
+ "Failed to remove media from url preview cache: %r: %s", media_id, e
+ )
continue
removed_media.append(media_id)
@@ -758,9 +781,9 @@ class PreviewUrlResource(DirectServeJsonResource):
await self.store.delete_url_cache_media(removed_media)
if removed_media:
- logger.info("Deleted %d media from url cache", len(removed_media))
+ logger.debug("Deleted %d media from url preview cache", len(removed_media))
else:
- logger.debug("No media removed from url cache")
+ logger.debug("No media removed from url preview cache")
def _is_media(content_type: str) -> bool:
diff --git a/synapse/rest/media/v1/thumbnailer.py b/synapse/rest/media/v1/thumbnailer.py
index 390491eb..9b93b9b4 100644
--- a/synapse/rest/media/v1/thumbnailer.py
+++ b/synapse/rest/media/v1/thumbnailer.py
@@ -14,7 +14,8 @@
# limitations under the License.
import logging
from io import BytesIO
-from typing import Tuple
+from types import TracebackType
+from typing import Optional, Tuple, Type
from PIL import Image
@@ -45,6 +46,9 @@ class Thumbnailer:
Image.MAX_IMAGE_PIXELS = max_image_pixels
def __init__(self, input_path: str):
+ # Have we closed the image?
+ self._closed = False
+
try:
self.image = Image.open(input_path)
except OSError as e:
@@ -89,7 +93,8 @@ class Thumbnailer:
# Safety: `transpose` takes an int rather than e.g. an IntEnum.
# self.transpose_method is set above to be a value in
# EXIF_TRANSPOSE_MAPPINGS, and that only contains correct values.
- self.image = self.image.transpose(self.transpose_method) # type: ignore[arg-type]
+ with self.image:
+ self.image = self.image.transpose(self.transpose_method) # type: ignore[arg-type]
self.width, self.height = self.image.size
self.transpose_method = None
# We don't need EXIF any more
@@ -122,9 +127,11 @@ class Thumbnailer:
# If the image has transparency, use RGBA instead.
if self.image.mode in ["1", "L", "P"]:
if self.image.info.get("transparency", None) is not None:
- self.image = self.image.convert("RGBA")
+ with self.image:
+ self.image = self.image.convert("RGBA")
else:
- self.image = self.image.convert("RGB")
+ with self.image:
+ self.image = self.image.convert("RGB")
return self.image.resize((width, height), Image.ANTIALIAS)
def scale(self, width: int, height: int, output_type: str) -> BytesIO:
@@ -133,8 +140,8 @@ class Thumbnailer:
Returns:
BytesIO: the bytes of the encoded image ready to be written to disk
"""
- scaled = self._resize(width, height)
- return self._encode_image(scaled, output_type)
+ with self._resize(width, height) as scaled:
+ return self._encode_image(scaled, output_type)
def crop(self, width: int, height: int, output_type: str) -> BytesIO:
"""Rescales and crops the image to the given dimensions preserving
@@ -151,18 +158,21 @@ class Thumbnailer:
BytesIO: the bytes of the encoded image ready to be written to disk
"""
if width * self.height > height * self.width:
+ scaled_width = width
scaled_height = (width * self.height) // self.width
- scaled_image = self._resize(width, scaled_height)
crop_top = (scaled_height - height) // 2
crop_bottom = height + crop_top
- cropped = scaled_image.crop((0, crop_top, width, crop_bottom))
+ crop = (0, crop_top, width, crop_bottom)
else:
scaled_width = (height * self.width) // self.height
- scaled_image = self._resize(scaled_width, height)
+ scaled_height = height
crop_left = (scaled_width - width) // 2
crop_right = width + crop_left
- cropped = scaled_image.crop((crop_left, 0, crop_right, height))
- return self._encode_image(cropped, output_type)
+ crop = (crop_left, 0, crop_right, height)
+
+ with self._resize(scaled_width, scaled_height) as scaled_image:
+ with scaled_image.crop(crop) as cropped:
+ return self._encode_image(cropped, output_type)
def _encode_image(self, output_image: Image.Image, output_type: str) -> BytesIO:
output_bytes_io = BytesIO()
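The refactored crop computes the scaled size and crop box before touching any image, which is what lets the resize and crop be wrapped in with-blocks above. Worked through with illustrative numbers (a 400x300 source cropped to 100x100):

    # Same arithmetic as Thumbnailer.crop, inlined for a concrete case.
    t_w, t_h = 100, 100      # requested thumbnail size
    src_w, src_h = 400, 300  # source image size

    if t_w * src_h > t_h * src_w:  # 30000 > 40000 -> False here
        # Source is taller than the target aspect: scale to width, crop top/bottom.
        scaled_w, scaled_h = t_w, (t_w * src_h) // src_w
        top = (scaled_h - t_h) // 2
        crop = (0, top, t_w, t_h + top)
    else:
        # Source is wider than the target aspect: scale to height, crop the sides.
        scaled_w, scaled_h = (t_h * src_w) // src_h, t_h  # 133 x 100
        left = (scaled_w - t_w) // 2                      # 16
        crop = (left, 0, t_w + left, t_h)                 # (16, 0, 116, 100)

    assert (scaled_w, scaled_h) == (133, 100)
    assert crop == (16, 0, 116, 100)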
@@ -171,3 +181,42 @@ class Thumbnailer:
output_image = output_image.convert("RGB")
output_image.save(output_bytes_io, fmt, quality=80)
return output_bytes_io
+
+ def close(self) -> None:
+ """Closes the underlying image file.
+
+ Once closed no other functions can be called.
+
+ Can be called multiple times.
+ """
+
+ if self._closed:
+ return
+
+ self._closed = True
+
+ # Since we run this on the finalizer then we need to handle `__init__`
+ # raising an exception before it can define `self.image`.
+ image = getattr(self, "image", None)
+ if image is None:
+ return
+
+ image.close()
+
+ def __enter__(self) -> "Thumbnailer":
+ """Make `Thumbnailer` a context manager that calls `close` on
+ `__exit__`.
+ """
+ return self
+
+ def __exit__(
+ self,
+ type: Optional[Type[BaseException]],
+ value: Optional[BaseException],
+ traceback: Optional[TracebackType],
+ ) -> None:
+ self.close()
+
+ def __del__(self) -> None:
+ # Make sure we actually do close the image, rather than leak data.
+ self.close()
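With close, __enter__/__exit__, and the __del__ safety net in place, callers can treat Thumbnailer as an ordinary context manager. A hypothetical usage sketch (the file paths are illustrative):

    # Sketch: the underlying PIL image is closed on normal exit, on error,
    # and, as a last resort, when the object is garbage-collected.
    with Thumbnailer("/tmp/example-input.png") as thumbnailer:
        scaled = thumbnailer.scale(320, 240, "image/png")
        with open("/tmp/example-thumb.png", "wb") as f:
            f.write(scaled.getvalue())
    # thumbnailer.close() has run here, even if scale() raised.

This mirrors how media_repository.py now wraps its defer_to_thread calls in "with thumbnailer:" blocks, so a failed thumbnail generation no longer leaks the open image.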