summary | refs | log | tree | commit | diff
path: root/src/decompress.h
diff options
context:
space:
mode:
author: Colin Watson <cjwatson@debian.org> 2022-01-24 01:28:10 +0000
committer: Colin Watson <cjwatson@debian.org> 2022-01-24 01:28:11 +0000
commit: ad3d449064203e5f3348fe89871424fd5ecc089f (patch)
tree: adb61b39a371f73ad6ca2cd069b0fa99a9aea3c0 /src/decompress.h
parent: 2b26ed2b9fd84ea36ab132ce3f7ffe22f07de81e (diff)
Significantly improve mandb performance
Forking large numbers of subprocesses is slow. An ideal fix would involve work in libpipeline (e.g. generator functions or multithreading), but in the meantime we can do much better in the common case of moderately-sized pages compressed using zlib by doing in-process decompression.

On my test system, this takes `mandb -c` from 344 seconds (or 152 seconds with `MAN_DISABLE_SECCOMP=1` before 50200d151d, or 78 seconds with `MAN_DISABLE_SECCOMP=1` after 50200d151d) to 10 seconds.

Thanks to Steinar H. Gunderson for a proof of concept, which I reworked extensively.

Fixes Debian bugs #630799 and #1003089, and Ubuntu bug #1858777.

* NEWS.md: Document this. Bump version to 2.10.0.
* src/decompress.c (enum decompress_tag): Add DECOMPRESS_INPROCESS.
  (struct decompress_inprocess): New structure.
  (struct decompress): Add an inprocess element.
  (decompress_new_inprocess, decompress_try_zlib, decompress_is_pipeline, decompress_inprocess_len, decompress_inprocess_replace): New functions.
  (decompress_open): Add flags argument. Try in-process zlib decompression if requested via DECOMPRESS_ALLOW_INPROCESS. Update all callers to pass a flags argument (0 unless otherwise stated).
  (decompress_start, decompress_read, decompress_peek, decompress_peek_skip, decompress_readline, decompress_peekline, decompress_wait, decompress_free): Implement in-process mode.
* src/decompress.h (DECOMPRESS_ALLOW_INPROCESS): Define.
  (decompress_open): Update prototype.
  (decompress_fdopen): Update comment to indicate that this always uses pipeline-based decompression.
  (decompress_is_pipeline, decompress_inprocess_len, decompress_inprocess_replace): Add prototypes.
  (decompress_start, decompress_wait): Update comments to document behaviour for in-process decompressors.
* src/manconv.c (add_output): Add output buffer argument; append to output buffer if given, otherwise write to stdout as before.
  (try_iconv): Add and pass output buffer argument.
  (manconv): Likewise. Update all callers to pass output buffer argument (NULL unless otherwise stated).
* src/manconv.h (struct manconv_outbuf): New structure.
  (manconv): Update prototype.
* src/manconv_client.c (manconv_inprocess): New function.
* src/manconv_client.h (manconv_inprocess): Add prototype.
* src/lexgrog.l (find_name): Request in-process decompression unless operating on a cat page. Use manconv_inprocess rather than add_manconv in the in-process case.
* src/man.c (grep): Request in-process decompression.
* src/ult_src.c (ult_src): Likewise.
* src/zsoelim.l (try_compressed, zsoelim_open_file): Likewise.
Diffstat (limited to 'src/decompress.h')
-rw-r--r-- src/decompress.h 60
1 files changed, 54 insertions, 6 deletions
diff --git a/src/decompress.h b/src/decompress.h
index 858e2781..733fa34f 100644
--- a/src/decompress.h
+++ b/src/decompress.h
@@ -23,27 +23,74 @@
#ifndef MAN_DECOMPRESS_H
#define MAN_DECOMPRESS_H
+#include <stdbool.h>
+
#include "pipeline.h"
struct decompress;
typedef struct decompress decompress;
-/* Open a decompressor reading from FILENAME. The caller must start the
- * resulting pipeline.
+/* Flags, combined using bitwise-or. */
+enum {
+ /* Allow the resulting decompressor to be constructed by reading and
+ * buffering the decompressed file contents in-process, rather than
+ * by starting a subprocess and streaming the output. This is
+ * suitable if and only if the file contents are only going to be
+ * handled in-process rather than being passed as input to some
+ * other program, but if that is the case then this is a significant
+ * optimization.
+ */
+ DECOMPRESS_ALLOW_INPROCESS = 1
+};
+
+/* Open a decompressor reading from FILENAME. The caller must start the
+ * resulting decompressor. If the DECOMPRESS_ALLOW_INPROCESS flag is given,
+ * then the resulting decompressor may be in-process (in which case
+ * decompress_get_pipeline will fail).
*/
-decompress *decompress_open (const char *filename);
+decompress *decompress_open (const char *filename, int flags);
-/* Open a decompressor reading from file descriptor FD. The caller must
- * start the resulting pipeline.
+/* Open a decompressor reading from file descriptor FD. The caller must
+ * start the resulting decompressor. This always uses pipeline-based
+ * decompression, since if it attempted to decompress data in process it
+ * would be unable to recover if it found that the data was too large.
*/
decompress *decompress_fdopen (int fd);
+/* Return true if and only if this is a pipeline-based decompressor. */
+bool decompress_is_pipeline (decompress *d);
+
/* Get the pipeline corresponding to a decompressor. Raises an assertion
* failure if this is not a pipeline-based decompressor.
*/
pipeline *decompress_get_pipeline (decompress *d);
-/* Start the processes in a pipeline-based decompressor.
+/* Return the total number of uncompressed bytes stored in an in-process
+ * decompressor. Raises an assertion failure if this is not an in-process
+ * decompressor.
+ */
+size_t decompress_inprocess_len (decompress *d);
+
+/* Replace an in-process decompressor's entire buffered file contents.
+ *
+ * In-process decompression works by buffering the whole file in memory,
+ * which works because we constrain it to only ever dealing with small
+ * files, and allows us to emulate streaming without having to resort to
+ * subprocesses, threads, or coroutines. However, there are some cases
+ * (notably encoding conversion) where it's useful to be able to do some
+ * kind of processing on the file contents in a way that similarly looks
+ * like streaming to its consumers. To allow for this, we allow consumers
+ * of decompressed data to replace the buffered file contents and reset the
+ * read offset so that their consumers in turn get the same useful read/peek
+ * API.
+ *
+ * This is of course a hack, and wouldn't be a wise thing to include in a
+ * general-purpose library API, but this is only used within man-db.
+ */
+void decompress_inprocess_replace (decompress *d, char *buf, size_t len);
+
+/* Start the processes in a pipeline-based decompressor. Does nothing for
+ * in-process decompressors.
*/
void decompress_start (decompress *d);
@@ -74,6 +121,7 @@ const char *decompress_readline (decompress *d);
const char *decompress_peekline (decompress *d);
/* Wait for a decompressor to complete and return its combined exit status.
+ * For in-process decompressors, simply returns 0.
*/
int decompress_wait (decompress *d);