From dd2ebc7d03e10632646085802e119a00d575f590 Mon Sep 17 00:00:00 2001 From: Joerg Sonnenberger Date: Tue, 12 Aug 2025 18:29:24 +0200 Subject: [PATCH] Add sanitize mode for stripping invisible text OCR processes typically produce text objects with text render mode 3, so this option allows stripping OCR layers for recreating them. This can also simplify documents for print processing without visual impact. --- include/mupdf/pdf/document.h | 1 + include/mupdf/pdf/interpret.h | 1 + source/pdf/pdf-op-filter.c | 6 +++++- source/pdf/pdf-write.c | 15 +++++++++++---- source/tools/pdfclean.c | 4 +++- 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/mupdf/pdf/document.h b/include/mupdf/pdf/document.h index 0bad2fecfb..fd7052eb30 100644 --- a/include/mupdf/pdf/document.h +++ b/include/mupdf/pdf/document.h @@ -763,6 +763,7 @@ typedef struct int do_use_objstms; /* Use objstms if possible */ int compression_effort; /* 0 for default. 100 = max, 1 = min. */ int do_labels; /* Add labels to each object showing how it can be reached from the Root. */ + int do_strip_invisible_text; /* Strip invisible text (text render mode 3). */ } pdf_write_options; FZ_DATA extern const pdf_write_options pdf_default_write_options; diff --git a/include/mupdf/pdf/interpret.h b/include/mupdf/pdf/interpret.h index 79cb108558..a8c35996e3 100644 --- a/include/mupdf/pdf/interpret.h +++ b/include/mupdf/pdf/interpret.h @@ -362,6 +362,7 @@ typedef struct int (*text_filter)(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox); void (*after_text_object)(fz_context *ctx, void *opaque, pdf_document *doc, pdf_processor *chain, fz_matrix ctm); int (*culler)(fz_context *ctx, void *opaque, fz_rect bbox, fz_cull_type type); + int strip_invisible_text; } pdf_sanitize_filter_options; diff --git a/source/pdf/pdf-op-filter.c b/source/pdf/pdf-op-filter.c index 16efa7cde0..f39d79fab4 100644 --- a/source/pdf/pdf-op-filter.c +++ b/source/pdf/pdf-op-filter.c @@ -635,7 +635,11 @@ filter_show_char(fz_context *ctx, pdf_sanitize_processor *p, int cid, int *unico } *unicode = ucsbuf[0]; - if (p->options->text_filter || p->options->culler) + if (p->options->strip_invisible_text && gstate->pending.text.render == 3) + { + remove = 1; + } + else if (p->options->text_filter || p->options->culler) { fz_matrix ctm; fz_rect bbox; diff --git a/source/pdf/pdf-write.c b/source/pdf/pdf-write.c index b5450c08c5..b410c1430d 100644 --- a/source/pdf/pdf-write.c +++ b/source/pdf/pdf-write.c @@ -1721,7 +1721,7 @@ static void complete_signatures(fz_context *ctx, pdf_document *doc, pdf_write_st } } -static void clean_content_streams(fz_context *ctx, pdf_document *doc, int sanitize, int ascii, int newlines) +static void clean_content_streams(fz_context *ctx, pdf_document *doc, int sanitize, int ascii, int newlines, int strip_invisible_text) { int n = pdf_count_pages(ctx, doc); int i; @@ -1733,7 +1733,9 @@ static void clean_content_streams(fz_context *ctx, pdf_document *doc, int saniti options.recurse = 1; options.ascii = ascii; options.newlines = newlines; - options.filters = sanitize ? list : NULL; + options.filters = sanitize || strip_invisible_text ? list : NULL; + if (strip_invisible_text) + sopts.strip_invisible_text = 1; list[0].filter = pdf_new_sanitize_filter; list[0].options = &sopts; @@ -1916,6 +1918,8 @@ pdf_parse_write_options(fz_context *ctx, pdf_write_options *opts, const char *ar opts->do_clean = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "sanitize", &val)) opts->do_sanitize = fz_option_eq(val, "yes"); + if (fz_has_option(ctx, args, "strip-invisible-text", &val)) + opts->do_strip_invisible_text = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "incremental", &val)) opts->do_incremental = fz_option_eq(val, "yes"); if (fz_has_option(ctx, args, "objstms", &val)) @@ -1990,12 +1994,12 @@ prepare_for_save(fz_context *ctx, pdf_document *doc, const pdf_write_options *in fz_throw(ctx, FZ_ERROR_ARGUMENT, "annotations need resynthesis before saving"); /* Rewrite (and possibly sanitize) the operator streams */ - if (in_opts->do_clean || in_opts->do_sanitize) + if (in_opts->do_clean || in_opts->do_sanitize || in_opts->do_strip_invisible_text) { pdf_begin_operation(ctx, doc, "Clean content streams"); fz_try(ctx) { - clean_content_streams(ctx, doc, in_opts->do_sanitize, in_opts->do_ascii, in_opts->do_pretty); + clean_content_streams(ctx, doc, in_opts->do_sanitize, in_opts->do_ascii, in_opts->do_pretty, in_opts->do_strip_invisible_text); pdf_end_operation(ctx, doc); } fz_catch(ctx) @@ -2722,6 +2726,7 @@ void pdf_write_document(fz_context *ctx, pdf_document *doc, fz_output *out, cons in_opts->do_linear || in_opts->do_clean || in_opts->do_sanitize || + in_opts->do_strip_invisible_text || in_opts->do_appearance || in_opts->do_encrypt != PDF_ENCRYPT_KEEP) fz_throw(ctx, FZ_ERROR_ARGUMENT, "Can't use these options when snapshotting!"); @@ -2864,6 +2869,8 @@ pdf_format_write_options(fz_context *ctx, char *buffer, size_t buffer_len, const ADD_OPT("linearize=yes"); if (opts->do_clean) ADD_OPT("clean=yes"); + if (opts->do_strip_invisible_text) + ADD_OPT("strip-invisible-text=yes"); if (opts->do_sanitize) ADD_OPT("sanitize=yes"); if (opts->do_incremental) diff --git a/source/tools/pdfclean.c b/source/tools/pdfclean.c index 97186fda1c..ff0e91e51c 100644 --- a/source/tools/pdfclean.c +++ b/source/tools/pdfclean.c @@ -61,6 +61,7 @@ static int usage(void) "\t-i\tcompress image streams\n" "\t-c\tclean content streams\n" "\t-s\tsanitize content streams\n" + "\t-I\tstrip invisible text\n" "\t-t\tcompact object syntax\n" "\t-tt\tindented object syntax\n" "\t-L\twrite object labels\n" @@ -133,7 +134,7 @@ int pdfclean_main(int argc, char **argv) opts.write = pdf_default_write_options; opts.write.dont_regenerate_id = 1; - while ((c = fz_getopt_long(argc, argv, "ade:fgilmp:stczDAE:LO:U:P:SZ", longopts)) != -1) + while ((c = fz_getopt_long(argc, argv, "ade:fgilmp:stczDAE:ILO:U:P:SZ", longopts)) != -1) { switch (c) { @@ -149,6 +150,7 @@ int pdfclean_main(int argc, char **argv) case 'l': opts.write.do_linear += 1; break; case 'c': opts.write.do_clean += 1; break; case 's': opts.write.do_sanitize += 1; break; + case 'I': opts.write.do_strip_invisible_text += 1; break; case 't': pretty = (pretty < 0) ? 0 : 1; break; case 'A': opts.write.do_appearance += 1; break; case 'L': opts.write.do_labels = 1; break;