diff --git a/build.gradle.kts b/build.gradle.kts index cd0048c..dc45b61 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -13,7 +13,7 @@ plugins { allprojects { group = "org.grimmory" - version = "0.9.0" + version = "0.10.0" repositories { mavenCentral() diff --git a/src/main/java/org/grimmory/pdfium4j/PdfDocument.java b/src/main/java/org/grimmory/pdfium4j/PdfDocument.java index d7ed6cc..1d67bd8 100644 --- a/src/main/java/org/grimmory/pdfium4j/PdfDocument.java +++ b/src/main/java/org/grimmory/pdfium4j/PdfDocument.java @@ -51,7 +51,6 @@ public final class PdfDocument implements AutoCloseable { private final Thread ownerThread; private final List openPages; private volatile boolean closed = false; - private volatile boolean structurallyModified = false; private final Map pendingMetadata = new LinkedHashMap<>(); private String pendingXmpMetadata = null; @@ -429,7 +428,7 @@ public PdfPage page(int index) { ownerThread, policy.maxRenderPixels(), () -> unregisterPage(holder[0]), - () -> structurallyModified = true); + () -> {}); holder[0] = page; registerPage(page); return page; @@ -686,7 +685,6 @@ public void deletePage(int pageIndex) { } try { EditBindings.FPDFPage_Delete.invokeExact(handle, pageIndex); - structurallyModified = true; } catch (Throwable t) { throw new PdfiumException("Failed to delete page " + pageIndex, t); } @@ -717,7 +715,6 @@ public void insertBlankPage(int pageIndex, PageSize size) { } finally { ViewBindings.FPDF_ClosePage.invokeExact(pageSeg); } - structurallyModified = true; } catch (PdfiumException e) { throw e; } catch (Throwable t) { @@ -754,7 +751,6 @@ public void importPages(PdfDocument source, String pageRange, int insertIndex) { if (ok == 0) { throw new PdfiumException("FPDF_ImportPages failed for range: " + pageRange); } - structurallyModified = true; } catch (PdfiumException e) { throw e; } catch (Throwable t) { @@ -859,6 +855,14 @@ public Optional metadata(MetadataTag tag) { return (value == null || value.isEmpty()) ? Optional.empty() : Optional.of(value); } + return readNativeMetadata(tag); + } + + /** + * Read a metadata value directly from the native PDFium document handle, bypassing the pending + * metadata cache. Used internally to merge existing metadata with pending changes during save. + */ + private Optional readNativeMetadata(MetadataTag tag) { try (Arena arena = Arena.ofConfined()) { MemorySegment tagSeg = arena.allocateFrom(tag.pdfKey()); @@ -1231,19 +1235,14 @@ public List bookmarks() { /** * Save the document to a file. * - *

When only metadata has been modified (no page additions, deletions, or imports), this uses a - * fast path that reads the original PDF bytes and appends an incremental update - avoiding the - * expensive native {@code FPDF_SaveAsCopy} serialization entirely. + *

Always serializes through PDFium's native {@code FPDF_SaveAsCopy} to guarantee a + * structurally valid PDF. Metadata changes (Info dictionary, XMP) are applied as a validated + * incremental update on top of the native output. * * @param path output file path */ public void save(Path path) { - byte[] bytes; - if (structurallyModified) { - bytes = saveToBytes(); - } else { - bytes = saveMetadataOnly(); - } + byte[] bytes = saveToBytes(); try { Files.write(path, bytes); } catch (IOException e) { @@ -1258,31 +1257,35 @@ public void save(Path path) { */ public byte[] saveToBytes() { ensureOpen(); - return PdfSaver.saveToBytes(handle, pendingMetadata, pendingXmpMetadata); + Map mergedMetadata = buildMergedMetadata(); + return PdfSaver.saveToBytes(handle, mergedMetadata, pendingXmpMetadata); } /** - * Fast metadata-only save: reads original bytes and appends an incremental update with the - * pending Info Dictionary and/or XMP changes. Avoids the expensive {@code FPDF_SaveAsCopy} native - * serialization. + * Build a complete Info dictionary by merging existing metadata (read from PDFium) with pending + * changes. This ensures that setting a single tag (e.g., Title) does not discard other existing + * entries (e.g., Author, Subject) when the incremental update replaces the Info dictionary. */ - private byte[] saveMetadataOnly() { - ensureOpen(); - byte[] original; - if (sourceBytes != null) { - original = sourceBytes; - } else if (sourcePath != null) { - try { - original = Files.readAllBytes(sourcePath); - } catch (IOException e) { - throw new PdfiumException( - "Failed to read source PDF for metadata update: " + sourcePath, e); + private Map buildMergedMetadata() { + if (pendingMetadata.isEmpty()) { + return pendingMetadata; + } + + Map merged = new LinkedHashMap<>(); + // Read all existing metadata from the native PDFium document + for (MetadataTag tag : MetadataTag.values()) { + if (!pendingMetadata.containsKey(tag)) { + readNativeMetadata(tag).ifPresent(v -> merged.put(tag, v)); } - } else { - // No original bytes available, fall back to full save - return saveToBytes(); } - return PdfSaver.applyIncrementalUpdate(original, pendingMetadata, pendingXmpMetadata); + // Apply pending changes (overrides existing + adds new entries) + for (Map.Entry entry : pendingMetadata.entrySet()) { + if (entry.getValue() != null && !entry.getValue().isEmpty()) { + merged.put(entry.getKey(), entry.getValue()); + } + // Empty string means "clear this tag" - don't add it to merged + } + return merged; } /** @@ -1293,12 +1296,7 @@ private byte[] saveMetadataOnly() { * @throws PdfiumException if saving fails */ public void save(OutputStream out) { - byte[] bytes; - if (structurallyModified) { - bytes = saveToBytes(); - } else { - bytes = saveMetadataOnly(); - } + byte[] bytes = saveToBytes(); try { out.write(bytes); } catch (IOException e) { diff --git a/src/main/java/org/grimmory/pdfium4j/PdfSaver.java b/src/main/java/org/grimmory/pdfium4j/PdfSaver.java index 81d0653..03a4f9d 100644 --- a/src/main/java/org/grimmory/pdfium4j/PdfSaver.java +++ b/src/main/java/org/grimmory/pdfium4j/PdfSaver.java @@ -15,46 +15,81 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.grimmory.pdfium4j.exception.PdfiumException; +import org.grimmory.pdfium4j.internal.DocBindings; import org.grimmory.pdfium4j.internal.EditBindings; +import org.grimmory.pdfium4j.internal.FfmHelper; +import org.grimmory.pdfium4j.internal.ViewBindings; import org.grimmory.pdfium4j.model.MetadataTag; /** * Handles saving PDF documents. Uses PDFium's native FPDF_SaveAsCopy for the base save, then * applies pure-Java incremental updates for Info dictionary and XMP metadata. * + *

Why custom code is needed: PDFium's open-source public C API provides {@code + * FPDF_GetMetaText} for reading metadata but has no corresponding setter function (no {@code + * FPDF_SetMetaText}). The {@code FPDF_INCREMENTAL} save flag is non-functional in practice (it does + * not append modified object streams). Therefore, setting Info dictionary or XMP metadata requires + * a pure-Java incremental update appended after PDFium's native serialization. This is the only + * custom PDF byte manipulation in the library; all other operations delegate to PDFium. + * *

Incremental updates (PDF spec §7.5.6) append new objects, a new xref section, and a new * trailer at the end of the file - the original bytes are never modified. This is the standard * mechanism used by all PDF editors. + * + *

Corruption safety: All incremental updates are validated by re-opening the + * result with PDFium. If validation fails, the base (pre-metadata) bytes are returned instead, + * guaranteeing no corrupt output is ever produced. + * + * @see Missing FPDF_SetMetaText API + * @see FPDF_INCREMENTAL is broken */ final class PdfSaver { + private static final System.Logger LOG = System.getLogger(PdfSaver.class.getName()); + private static final ThreadLocal WRITE_BUFFER = new ThreadLocal<>(); private static final Pattern METADATA_REF_PATTERN = Pattern.compile("/Metadata\\s+\\d+\\s+\\d+\\s+R"); private static final Pattern ROOT_REF_PATTERN = Pattern.compile("/Root\\s+(\\d+\\s+\\d+\\s+R)"); private static final Pattern INFO_REF_PATTERN = Pattern.compile("/Info\\s+(\\d+\\s+\\d+\\s+R)"); - private static final Pattern OBJ_NUM_PATTERN = Pattern.compile("(\\d+)\\s+0\\s+obj\\b"); + private static final Pattern SIZE_PATTERN = Pattern.compile("/Size\\s+(\\d+)"); private static final Pattern FIRST_INT_PATTERN = Pattern.compile("(\\d+)"); + /** + * Maximum number of bytes from the end of the file to search for trailer/xref structures. PDF + * spec requires startxref within the last 1024 bytes, but we use a larger buffer to handle PDFs + * with comments or whitespace after %%EOF (common in incrementally updated files). + */ + private static final int TAIL_SEARCH_BYTES = 4096; + private PdfSaver() {} /** - * Apply an incremental update to existing PDF bytes without native serialization. This is the - * fast path for metadata-only changes - it reads the original file and appends new Info/XMP - * objects + xref + trailer at the end. + * Save a document to bytes using PDFium's native serialization, then apply metadata as a + * validated incremental update. + * + *

If the incremental update produces an invalid PDF (detected by re-opening with PDFium), the + * base bytes without metadata are returned instead. This guarantees the output is never corrupt. */ - static byte[] applyIncrementalUpdate( - byte[] originalPdf, Map pendingMetadata, String pendingXmp) { + static byte[] saveToBytes( + MemorySegment docHandle, Map pendingMetadata, String pendingXmp) { + byte[] baseBytes = nativeSave(docHandle); + boolean hasInfoUpdate = pendingMetadata != null && !pendingMetadata.isEmpty(); boolean hasXmpUpdate = pendingXmp != null && !pendingXmp.isEmpty(); if (!hasInfoUpdate && !hasXmpUpdate) { - return originalPdf; + return baseBytes; } - return appendIncrementalUpdate(originalPdf, pendingMetadata, pendingXmp); + + byte[] result = appendIncrementalUpdate(baseBytes, pendingMetadata, pendingXmp); + return validateOrFallback(result, baseBytes, pendingMetadata); } - static byte[] saveToBytes( - MemorySegment docHandle, Map pendingMetadata, String pendingXmp) { + /** + * Perform the native FPDF_SaveAsCopy and return raw PDF bytes. This is the only path that + * produces the base PDF - all metadata is applied on top via incremental update. + */ + private static byte[] nativeSave(MemorySegment docHandle) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); WRITE_BUFFER.set(baos); try (Arena arena = Arena.ofConfined()) { @@ -78,15 +113,7 @@ static byte[] saveToBytes( throw new PdfiumException("FPDF_SaveAsCopy failed"); } - byte[] result = baos.toByteArray(); - - // Apply incremental updates for metadata - boolean hasInfoUpdate = pendingMetadata != null && !pendingMetadata.isEmpty(); - boolean hasXmpUpdate = pendingXmp != null && !pendingXmp.isEmpty(); - if (hasInfoUpdate || hasXmpUpdate) { - result = appendIncrementalUpdate(result, pendingMetadata, pendingXmp); - } - return result; + return baos.toByteArray(); } catch (PdfiumException e) { throw e; } catch (Throwable t) { @@ -96,24 +123,112 @@ static byte[] saveToBytes( } } + /** + * Validate PDF bytes by attempting to open them with PDFium. Checks that: (1) the document can be + * loaded, (2) the page count is valid, and (3) metadata entries can be read back. If any check + * fails, return the fallback bytes instead. This guarantees we never produce corrupt output. + */ + private static byte[] validateOrFallback( + byte[] result, byte[] fallback, Map expectedMetadata) { + PdfiumLibrary.ensureInitialized(); + try (Arena arena = Arena.ofConfined()) { + MemorySegment seg = arena.allocateFrom(JAVA_BYTE, result); + MemorySegment doc = + (MemorySegment) + ViewBindings.FPDF_LoadMemDocument.invokeExact(seg, result.length, MemorySegment.NULL); + if (FfmHelper.isNull(doc)) { + LOG.log( + System.Logger.Level.WARNING, + "Incremental metadata update produced invalid PDF; " + + "falling back to base save without metadata"); + return fallback; + } + try { + int pages = (int) ViewBindings.FPDF_GetPageCount.invokeExact(doc); + if (pages < 0) { + LOG.log( + System.Logger.Level.WARNING, + "Incremental metadata update produced PDF with invalid page count; " + + "falling back to base save without metadata"); + return fallback; + } + + // Verify metadata can be read back from the saved document + if (expectedMetadata != null && !expectedMetadata.isEmpty()) { + if (!verifyMetadataReadback(doc, expectedMetadata, arena)) { + LOG.log( + System.Logger.Level.WARNING, + "Metadata written but could not be read back from saved PDF; " + + "falling back to base save without metadata"); + return fallback; + } + } + } finally { + try { + ViewBindings.FPDF_CloseDocument.invokeExact(doc); + } catch (Throwable ignored) { + } + } + } catch (Throwable t) { + LOG.log( + System.Logger.Level.WARNING, + "Failed to validate incremental metadata update; " + + "falling back to base save without metadata", + t); + return fallback; + } + return result; + } + + /** + * Verify that at least one metadata entry from the expected set can be read back from the saved + * document. This confirms the Info dictionary was correctly written. + */ + private static boolean verifyMetadataReadback( + MemorySegment doc, Map expected, Arena arena) { + for (Map.Entry entry : expected.entrySet()) { + try { + MemorySegment tagSeg = arena.allocateFrom(entry.getKey().pdfKey()); + long needed = + (long) DocBindings.FPDF_GetMetaText.invokeExact(doc, tagSeg, MemorySegment.NULL, 0L); + if (needed > 2) { + // At least one tag is readable - Info dictionary is intact + return true; + } + } catch (Throwable ignored) { + } + } + return false; + } + /** * Append a PDF incremental update containing new Info dictionary and/or XMP metadata objects. * This appends after the existing %%EOF - the original file bytes are untouched. + * + *

All parsing is restricted to the tail of the file where trailer/xref structures live, making + * it immune to false matches in binary stream data. */ private static byte[] appendIncrementalUpdate( byte[] pdf, Map metadata, String xmp) { - String text = new String(pdf, StandardCharsets.ISO_8859_1); + String tail = tailString(pdf); // Find previous startxref value (points to the current xref/xref-stream) - int prevXrefOffset = findLastStartxrefValue(text); - - // Parse existing trailer to get /Size and /Root - TrailerInfo trailer = parseTrailer(text); - - int maxObjNum = findMaxObjectNumber(text); - int nextObj = maxObjNum + 1; + int prevXrefOffset = findStartxrefInTail(tail); + + // Parse existing trailer to get /Size, /Root, /Info + TrailerInfo trailer = parseTrailerFromTail(pdf, tail); + + // Use /Size from trailer for next object number (per PDF spec, /Size = total entries in xref) + int nextObj = trailer.size; + if (nextObj <= 0) { + // Should not happen with well-formed PDFium output, but be defensive + LOG.log( + System.Logger.Level.WARNING, + "Could not determine /Size from trailer; skipping incremental update"); + return pdf; + } - // Track new objects: objNum -> (offset relative to start of appended bytes, content) + // Track new objects: objNum -> content Map newObjects = new LinkedHashMap<>(); int infoObjNum = 0; int xmpObjNum = 0; @@ -149,7 +264,7 @@ private static byte[] appendIncrementalUpdate( } // Build the incremental update - int baseOffset = pdf.length; // offset where appended bytes start + int baseOffset = pdf.length; StringBuilder update = new StringBuilder(); update.append('\n'); // separator after %%EOF @@ -160,30 +275,40 @@ private static byte[] appendIncrementalUpdate( update.append(entry.getValue()); } - // Write xref table for the new objects - int xrefOffset = baseOffset + update.length(); - update.append("xref\n"); - // Write one subsection per object (or contiguous ranges) - for (Map.Entry entry : objOffsets.entrySet()) { - update.append(entry.getKey()).append(" 1\n"); - // PDF spec requires literal \r\n in xref entries, not platform line separator - update.append(String.format("%010d", entry.getValue())).append(" 00000 n \r\n"); - } - - // Write new trailer - update.append("trailer\n"); - update.append("<< /Size ").append(nextObj); - update.append(" /Root ").append(trailer.rootRef); - if (infoObjNum > 0) { - update.append(" /Info ").append(infoObjNum).append(" 0 R"); - } else if (trailer.infoRef != null) { - update.append(" /Info ").append(trailer.infoRef); + boolean xrefStream = usesXrefStreams(pdf, tail); + if (xrefStream) { + int xrefStreamObjNum = nextObj++; + String infoRef = (infoObjNum > 0) ? (infoObjNum + " 0 R") : trailer.infoRef; + appendXrefStreamSection( + update, + xrefStreamObjNum, + objOffsets, + trailer.rootRef, + infoRef, + nextObj, + prevXrefOffset, + baseOffset); + } else { + int xrefOffset = baseOffset + update.length(); + update.append("xref\n"); + for (Map.Entry entry : objOffsets.entrySet()) { + update.append(entry.getKey()).append(" 1\n"); + update.append(String.format("%010d", entry.getValue())).append(" 00000 n \r\n"); + } + update.append("trailer\n"); + update.append("<< /Size ").append(nextObj); + update.append(" /Root ").append(trailer.rootRef); + if (infoObjNum > 0) { + update.append(" /Info ").append(infoObjNum).append(" 0 R"); + } else if (trailer.infoRef != null) { + update.append(" /Info ").append(trailer.infoRef); + } + update.append(" /Prev ").append(prevXrefOffset); + update.append(" >>\n"); + update.append("startxref\n"); + update.append(xrefOffset).append("\n"); + update.append("%%EOF\n"); } - update.append(" /Prev ").append(prevXrefOffset); - update.append(" >>\n"); - update.append("startxref\n"); - update.append(xrefOffset).append("\n"); - update.append("%%EOF\n"); byte[] updateBytes = update.toString().getBytes(StandardCharsets.ISO_8859_1); byte[] result = new byte[pdf.length + updateBytes.length]; @@ -202,12 +327,15 @@ private static byte[] appendIncrementalUpdate( * Append another incremental update that rewrites the Catalog object with a /Metadata reference. */ private static byte[] appendCatalogUpdate( - byte[] pdf, TrailerInfo trailer, int metadataObjNum, int sizeBase) { - String text = new String(pdf, StandardCharsets.ISO_8859_1); - - // Find the Catalog object and its contents - int catalogObjNum = extractObjNum(trailer.rootRef); - String catalogDict = findObjectDict(text, catalogObjNum); + byte[] pdf, TrailerInfo originalTrailer, int metadataObjNum, int sizeBase) { + // Re-parse the tail of the (already modified) PDF to get current state + String tail = tailString(pdf); + boolean xrefStream = usesXrefStreams(pdf, tail); + + // Find the Catalog object - search entire file for the specific object + int catalogObjNum = extractObjNum(originalTrailer.rootRef); + String fullText = new String(pdf, StandardCharsets.ISO_8859_1); + String catalogDict = findObjectDict(fullText, catalogObjNum); if (catalogDict == null) return pdf; // Create a new Catalog object (same obj number, incremental update replaces it) @@ -217,7 +345,6 @@ private static byte[] appendCatalogUpdate( // Remove existing /Metadata if present, add our new one String dict = catalogDict; dict = METADATA_REF_PATTERN.matcher(dict).replaceFirst(""); - // Insert /Metadata ref before the closing >> int closeIdx = dict.lastIndexOf(">>"); if (closeIdx >= 0) { dict = @@ -237,27 +364,40 @@ private static byte[] appendCatalogUpdate( int catalogOffset = baseOffset + update.length(); update.append(newCatalog); - int actualPrev = findLastStartxrefValue(text); - - int xrefOffset = baseOffset + update.length(); - update.append("xref\n"); - update.append(catalogObjNum).append(" 1\n"); - // PDF spec requires literal \r\n in xref entries, not platform line separator - update.append(String.format("%010d", catalogOffset)).append(" 00000 n \r\n"); - - update.append("trailer\n"); - update.append("<< /Size ").append(sizeBase); - update.append(" /Root ").append(trailer.rootRef); - // Carry forward /Info from previous trailer - String prevTrailerInfo = findTrailerEntry(text, "Info"); - if (prevTrailerInfo != null) { - update.append(" /Info ").append(prevTrailerInfo); + int actualPrev = findStartxrefInTail(tail); + TrailerInfo currentTrailer = parseTrailerFromTail(pdf, tail); + + if (xrefStream) { + int xrefStreamObjNum = currentTrailer.size; + Map offsets = new LinkedHashMap<>(); + offsets.put(catalogObjNum, catalogOffset); + appendXrefStreamSection( + update, + xrefStreamObjNum, + offsets, + originalTrailer.rootRef, + currentTrailer.infoRef, + xrefStreamObjNum + 1, + actualPrev, + baseOffset); + } else { + int xrefOffset = baseOffset + update.length(); + update.append("xref\n"); + update.append(catalogObjNum).append(" 1\n"); + update.append(String.format("%010d", catalogOffset)).append(" 00000 n \r\n"); + + update.append("trailer\n"); + update.append("<< /Size ").append(sizeBase); + update.append(" /Root ").append(originalTrailer.rootRef); + if (currentTrailer.infoRef != null) { + update.append(" /Info ").append(currentTrailer.infoRef); + } + update.append(" /Prev ").append(actualPrev); + update.append(" >>\n"); + update.append("startxref\n"); + update.append(xrefOffset).append("\n"); + update.append("%%EOF\n"); } - update.append(" /Prev ").append(actualPrev); - update.append(" >>\n"); - update.append("startxref\n"); - update.append(xrefOffset).append("\n"); - update.append("%%EOF\n"); byte[] updateBytes = update.toString().getBytes(StandardCharsets.ISO_8859_1); byte[] result = new byte[pdf.length + updateBytes.length]; @@ -267,66 +407,113 @@ private static byte[] appendCatalogUpdate( } /** - * Write or replace the XMP metadata packet in the PDF by in-place replacement (without - * incremental update - used when XMP packet already exists with padding). + * Append a cross-reference stream section (§7.5.8) instead of a traditional xref table + trailer. + * The xref stream object dictionary serves as both xref header and trailer. + * + *

Uses W=[1,4,0]: 1-byte type (always 1 = in-use) + 4-byte big-endian byte offset + generation + * number implicit 0. Stream data is uncompressed (no /Filter). */ - static byte[] replaceXmpPacketInPlace(byte[] pdf, String xmpPacket) { - byte[] xmpBytes = xmpPacket.getBytes(StandardCharsets.UTF_8); - byte[] beginMarker = "".getBytes(StandardCharsets.US_ASCII), endPos); - if (endTagClose < 0) return null; - - int packetEnd = endTagClose + 2; - int oldLen = packetEnd - beginPos; - - byte[] paddedXmp = padXmpToLength(xmpBytes, oldLen); - if (paddedXmp.length != oldLen) { - return null; // can't fit in-place, caller should use incremental update + private static void appendXrefStreamSection( + StringBuilder update, + int xrefStreamObjNum, + Map objOffsets, + String rootRef, + String infoRef, + int size, + int prevXref, + int baseOffset) { + byte[] streamData = new byte[objOffsets.size() * 5]; + StringBuilder indexParts = new StringBuilder(); + int dataIdx = 0; + for (Map.Entry entry : objOffsets.entrySet()) { + if (indexParts.length() > 0) indexParts.append(' '); + indexParts.append(entry.getKey()).append(" 1"); + streamData[dataIdx++] = 1; // type 1 = in-use uncompressed object + int offset = entry.getValue(); + streamData[dataIdx++] = (byte) ((offset >> 24) & 0xFF); + streamData[dataIdx++] = (byte) ((offset >> 16) & 0xFF); + streamData[dataIdx++] = (byte) ((offset >> 8) & 0xFF); + streamData[dataIdx++] = (byte) (offset & 0xFF); } - byte[] result = new byte[pdf.length]; - System.arraycopy(pdf, 0, result, 0, pdf.length); - System.arraycopy(paddedXmp, 0, result, beginPos, paddedXmp.length); - return result; + int xrefStreamOffset = baseOffset + update.length(); + update.append(xrefStreamObjNum).append(" 0 obj\n"); + update.append("<< /Type /XRef"); + update.append(" /Size ").append(size); + update.append(" /Root ").append(rootRef); + if (infoRef != null) { + update.append(" /Info ").append(infoRef); + } + update.append(" /Prev ").append(prevXref); + update.append(" /W [1 4 0]"); + update.append(" /Index [").append(indexParts).append(']'); + update.append(" /Length ").append(streamData.length); + update.append(" >>\nstream\n"); + // Append binary data as ISO-8859-1 characters (byte-transparent encoding) + for (byte b : streamData) { + update.append((char) (b & 0xFF)); + } + update.append("\nendstream\nendobj\n"); + update.append("startxref\n").append(xrefStreamOffset).append('\n'); + update.append("%%EOF\n"); } - // --- Trailer parsing --- + // --- Tail-based parsing (only parses end of file, immune to binary stream false matches) --- - private record TrailerInfo(String rootRef, String infoRef) {} + /** + * Extract the tail of the PDF as an ISO-8859-1 string. Only this portion is parsed for + * trailer/xref structures, making parsing immune to false matches in binary stream data. + */ + private static String tailString(byte[] pdf) { + int tailLen = Math.min(TAIL_SEARCH_BYTES, pdf.length); + return new String(pdf, pdf.length - tailLen, tailLen, StandardCharsets.ISO_8859_1); + } - private static TrailerInfo parseTrailer(String text) { - // Try traditional trailer first - String rootRef = findTrailerEntry(text, "Root"); - String infoRef = findTrailerEntry(text, "Info"); + private record TrailerInfo(String rootRef, String infoRef, int size) {} - if (rootRef != null) { - return new TrailerInfo(rootRef, infoRef); + /** + * Parse trailer information from the tail of the file. Extracts /Root, /Info, and /Size entries. + */ + private static TrailerInfo parseTrailerFromTail(byte[] pdf, String tail) { + // Try traditional trailer first (search tail for "trailer" keyword) + String rootRef = findTrailerEntryInTail(tail, "Root"); + String infoRef = findTrailerEntryInTail(tail, "Info"); + int size = findTrailerSizeInTail(tail); + + if (rootRef != null && size > 0) { + return new TrailerInfo(rootRef, infoRef, size); } - // For cross-reference streams, find the last xref stream object - // which contains /Root in its dictionary - Matcher m = ROOT_REF_PATTERN.matcher(text); - String lastRoot = null; - String lastInfo = null; - while (m.find()) { - lastRoot = m.group(1); - } - Matcher m2 = INFO_REF_PATTERN.matcher(text); - while (m2.find()) { - lastInfo = m2.group(1); + // For cross-reference streams, parse the dictionary at the startxref offset + int startxref = findStartxrefInTail(tail); + if (startxref > 0 && startxref < pdf.length) { + // Read a small window around the xref stream object + int windowStart = startxref; + int windowEnd = Math.min(startxref + 512, pdf.length); + String window = + new String(pdf, windowStart, windowEnd - windowStart, StandardCharsets.ISO_8859_1); + String dict = findDictInWindow(window); + if (dict != null) { + Matcher rootM = ROOT_REF_PATTERN.matcher(dict); + if (rootM.find()) rootRef = rootM.group(1); + Matcher infoM = INFO_REF_PATTERN.matcher(dict); + if (infoM.find()) infoRef = infoM.group(1); + Matcher sizeM = SIZE_PATTERN.matcher(dict); + if (sizeM.find()) size = Integer.parseInt(sizeM.group(1)); + if (rootRef != null && size > 0) { + return new TrailerInfo(rootRef, infoRef, size); + } + } } - return new TrailerInfo(lastRoot != null ? lastRoot : "1 0 R", lastInfo); + + // Ultimate fallback: scan tail for the last /Root reference + Matcher m = ROOT_REF_PATTERN.matcher(tail); + while (m.find()) rootRef = m.group(1); + return new TrailerInfo(rootRef != null ? rootRef : "1 0 R", null, Math.max(size, 1)); } - private static String findTrailerEntry(String text, String key) { + /** Search tail for a trailer entry (/Root or /Info reference). */ + private static String findTrailerEntryInTail(String tail, String key) { Pattern p = switch (key) { case "Root" -> ROOT_REF_PATTERN; @@ -334,19 +521,18 @@ private static String findTrailerEntry(String text, String key) { default -> Pattern.compile("/" + key + "\\s+(\\d+\\s+\\d+\\s+R)"); }; - // Search backwards from end for the latest trailer - int searchFrom = text.length(); + int searchFrom = tail.length(); while (true) { - int trailerIdx = text.lastIndexOf("trailer", searchFrom); + int trailerIdx = tail.lastIndexOf("trailer", searchFrom); if (trailerIdx < 0) break; - int dictStart = text.indexOf("<<", trailerIdx); + int dictStart = tail.indexOf("<<", trailerIdx); if (dictStart < 0) break; - int dictEnd = text.indexOf(">>", dictStart); + int dictEnd = tail.indexOf(">>", dictStart); if (dictEnd < 0) break; - String dict = text.substring(dictStart, dictEnd + 2); + String dict = tail.substring(dictStart, dictEnd + 2); Matcher m = p.matcher(dict); if (m.find()) { return m.group(1); @@ -357,10 +543,35 @@ private static String findTrailerEntry(String text, String key) { return null; } - private static int findLastStartxrefValue(String text) { - int idx = text.lastIndexOf("startxref"); + /** Extract /Size value from the trailer in the tail. */ + private static int findTrailerSizeInTail(String tail) { + int searchFrom = tail.length(); + while (true) { + int trailerIdx = tail.lastIndexOf("trailer", searchFrom); + if (trailerIdx < 0) break; + + int dictStart = tail.indexOf("<<", trailerIdx); + if (dictStart < 0) break; + + int dictEnd = tail.indexOf(">>", dictStart); + if (dictEnd < 0) break; + + String dict = tail.substring(dictStart, dictEnd + 2); + Matcher m = SIZE_PATTERN.matcher(dict); + if (m.find()) { + return Integer.parseInt(m.group(1)); + } + searchFrom = trailerIdx - 1; + if (searchFrom < 0) break; + } + return 0; + } + + /** Find the startxref value from the tail of the file. */ + private static int findStartxrefInTail(String tail) { + int idx = tail.lastIndexOf("startxref"); if (idx < 0) return 0; - String after = text.substring(idx + "startxref".length()).trim(); + String after = tail.substring(idx + "startxref".length()).trim(); try { return Integer.parseInt(after.lines().findFirst().orElse("0").trim()); } catch (NumberFormatException e) { @@ -368,15 +579,59 @@ private static int findLastStartxrefValue(String text) { } } + private static final Pattern XREF_STREAM_HEADER = Pattern.compile("\\d+\\s+0\\s+obj\\b"); + + /** + * Detect whether the PDF uses cross-reference streams (§7.5.8). Reads only the bytes at the + * startxref offset, not the entire file. + */ + private static boolean usesXrefStreams(byte[] pdf, String tail) { + int startxref = findStartxrefInTail(tail); + if (startxref <= 0 || startxref >= pdf.length) return false; + int peekEnd = Math.min(startxref + 200, pdf.length); + String peek = + new String(pdf, startxref, peekEnd - startxref, StandardCharsets.ISO_8859_1).stripLeading(); + if (peek.startsWith("xref")) return false; + return XREF_STREAM_HEADER.matcher(peek).lookingAt() + && peek.contains("/Type") + && peek.contains("/XRef"); + } + + /** + * Find and extract the first dictionary {@code << ... >>} in a window string. Handles nested + * dictionaries. + */ + private static String findDictInWindow(String window) { + int dictStart = window.indexOf("<<"); + if (dictStart < 0) return null; + int depth = 0; + int pos = dictStart; + while (pos < window.length() - 1) { + if (window.charAt(pos) == '<' && window.charAt(pos + 1) == '<') { + depth++; + pos += 2; + } else if (window.charAt(pos) == '>' && window.charAt(pos + 1) == '>') { + depth--; + if (depth == 0) return window.substring(dictStart, pos + 2); + pos += 2; + } else { + pos++; + } + } + return null; + } + private static int extractObjNum(String ref) { - // "N 0 R" -> N Matcher m = FIRST_INT_PATTERN.matcher(ref); if (m.find()) return Integer.parseInt(m.group(1)); return 1; } + /** + * Find the dictionary for a specific object by number. Searches backwards from the end of the + * file to find the latest version of the object (incremental updates append newer versions). + */ private static String findObjectDict(String text, int objNum) { - // Find "N 0 obj" then extract the dictionary << ... >> String marker = objNum + " 0 obj"; int idx = text.lastIndexOf(marker); if (idx < 0) return null; @@ -384,7 +639,6 @@ private static String findObjectDict(String text, int objNum) { int dictStart = text.indexOf("<<", idx); if (dictStart < 0) return null; - // Find matching >> (handle nested <<>>) int depth = 0; int pos = dictStart; while (pos < text.length() - 1) { @@ -404,35 +658,7 @@ private static String findObjectDict(String text, int objNum) { return null; } - // --- Helpers --- - - private static int findMaxObjectNumber(String pdfText) { - Matcher matcher = OBJ_NUM_PATTERN.matcher(pdfText); - int max = 0; - while (matcher.find()) { - int num = Integer.parseInt(matcher.group(1)); - if (num > max) max = num; - } - return max; - } - - private static byte[] padXmpToLength(byte[] xmp, int targetLen) { - if (xmp.length >= targetLen) { - return xmp; - } - byte[] endPI = "Flags: 0 = full save, FPDF_INCREMENTAL (1) = incremental, FPDF_NO_INCREMENTAL (2) = full + * save removing old versions, FPDF_REMOVE_SECURITY (4) = remove encryption. + * + *

Warning: FPDF_INCREMENTAL (flag 1) is non-functional in the open-source + * PDFium build — it does not properly append modified objects, producing corrupt output. Always + * use flag 0 (full save). See: https://groups.google.com/g/pdfium/c/6SklEc2lYNM */ public static final MethodHandle FPDF_SaveAsCopy = downcall("FPDF_SaveAsCopy", FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_INT)); diff --git a/src/test/java/org/grimmory/pdfium4j/PdfDocumentTest.java b/src/test/java/org/grimmory/pdfium4j/PdfDocumentTest.java index 53d048f..a67d302 100644 --- a/src/test/java/org/grimmory/pdfium4j/PdfDocumentTest.java +++ b/src/test/java/org/grimmory/pdfium4j/PdfDocumentTest.java @@ -1121,39 +1121,37 @@ private static byte[] minimalPdfWithText() { return pdf.getBytes(StandardCharsets.US_ASCII); } - // --- Metadata-only fast save tests --- + // --- Metadata save correctness tests --- @Test @EnabledIf("pdfiumAvailable") - void metadataOnlySaveIsFasterThanFullSave(@TempDir Path tempDir) throws IOException { + void saveWithAndWithoutMetadataProducesValidPdfs(@TempDir Path tempDir) throws IOException { Path testPdf = getTestPdf(); if (testPdf == null) return; Path outNoChanges = tempDir.resolve("no-changes.pdf"); Path outFast = tempDir.resolve("fast-save.pdf"); - // Save with no changes - should use fast path, returning original bytes + // Save with no changes - always goes through PDFium native serialization try (PdfDocument doc = PdfDocument.open(testPdf)) { doc.save(outNoChanges); } - // Metadata-only save (fast path) + // Metadata save (always through PDFium + validated incremental update) try (PdfDocument doc = PdfDocument.open(testPdf)) { doc.setMetadata(MetadataTag.TITLE, "Fast Save Title"); doc.save(outFast); } - // Verify both produce valid PDFs + // Verify both produce valid PDFs that can be re-opened assertTrue(Files.size(outNoChanges) > 0); assertTrue(Files.size(outFast) > 0); - // No-changes save should produce same size as original - assertEquals( - Files.size(testPdf), - Files.size(outNoChanges), - "Save with no changes should preserve original file size exactly"); + try (PdfDocument doc = PdfDocument.open(outNoChanges)) { + assertTrue(doc.pageCount() > 0, "No-changes save should produce valid PDF"); + } - // Verify the fast-save file contains the metadata + // Verify the saved file contains the metadata try (PdfDocument doc = PdfDocument.open(outFast)) { assertEquals("Fast Save Title", doc.metadata(MetadataTag.TITLE).orElse("")); } @@ -1359,14 +1357,18 @@ private String buildBookloreXmp(String title, String author) { @Test @EnabledIf("pdfiumAvailable") - void fileSizeStableAfterMetadataSave(@TempDir Path tempDir) throws IOException { + void repeatedMetadataSavesProduceValidPdfs(@TempDir Path tempDir) throws IOException { Path testPdf = getTestPdf(); if (testPdf == null) return; - long originalSize = Files.size(testPdf); Path pdf = tempDir.resolve("stable.pdf"); Files.copy(testPdf, pdf); + int originalPageCount; + try (PdfDocument doc = PdfDocument.open(testPdf)) { + originalPageCount = doc.pageCount(); + } + // Save with metadata + XMP (simulates grimmory's write path) try (PdfDocument doc = PdfDocument.open(pdf)) { doc.setMetadata(MetadataTag.TITLE, "Test Title"); @@ -1375,11 +1377,12 @@ void fileSizeStableAfterMetadataSave(@TempDir Path tempDir) throws IOException { doc.save(pdf); } - long afterFirst = Files.size(pdf); - // Incremental update should add at most 20 KB for metadata + xref + trailer - assertTrue( - afterFirst - originalSize < 20_000, - "First save should add at most 20 KB, but grew by " + (afterFirst - originalSize)); + // Verify first save is valid and contains metadata + try (PdfDocument doc = PdfDocument.open(pdf)) { + assertEquals(originalPageCount, doc.pageCount()); + assertEquals("Test Title", doc.metadata(MetadataTag.TITLE).orElse("")); + assertEquals("Test Author", doc.metadata(MetadataTag.AUTHOR).orElse("")); + } // Save again (simulates second metadata update) try (PdfDocument doc = PdfDocument.open(pdf)) { @@ -1389,37 +1392,36 @@ void fileSizeStableAfterMetadataSave(@TempDir Path tempDir) throws IOException { doc.save(pdf); } - long afterSecond = Files.size(pdf); - // Second save grows by another incremental update - assertTrue( - afterSecond - afterFirst < 20_000, - "Second save should add at most 20 KB, but grew by " + (afterSecond - afterFirst)); - - // Total size should be within 40 KB of original - assertTrue( - afterSecond < originalSize + 40_000, - "After two saves, file should be within 40 KB of original (" - + originalSize - + "), but is " - + afterSecond); + // Verify second save is valid and contains updated metadata + try (PdfDocument doc = PdfDocument.open(pdf)) { + assertEquals(originalPageCount, doc.pageCount()); + assertEquals("Updated Title", doc.metadata(MetadataTag.TITLE).orElse("")); + assertEquals("Updated Author", doc.metadata(MetadataTag.AUTHOR).orElse("")); + } } @Test @EnabledIf("pdfiumAvailable") - void saveWithNoChangesPreservesSize(@TempDir Path tempDir) throws IOException { + void saveWithNoChangesProducesValidPdf(@TempDir Path tempDir) throws IOException { Path testPdf = getTestPdf(); if (testPdf == null) return; - long originalSize = Files.size(testPdf); Path pdf = tempDir.resolve("unchanged.pdf"); Files.copy(testPdf, pdf); - // Save with no changes + int originalPageCount; try (PdfDocument doc = PdfDocument.open(pdf)) { + originalPageCount = doc.pageCount(); doc.save(pdf); } - assertEquals(originalSize, Files.size(pdf), "Save with no changes should not change file size"); + // Save always goes through PDFium native serialization; the output must be + // a valid PDF with the same number of pages + assertTrue(Files.size(pdf) > 0, "Saved file should not be empty"); + try (PdfDocument doc = PdfDocument.open(pdf)) { + assertEquals( + originalPageCount, doc.pageCount(), "Save with no changes should preserve page count"); + } } @Test @@ -1428,24 +1430,163 @@ void xmpOnlySaveUsesIncrementalUpdate(@TempDir Path tempDir) throws IOException Path testPdf = getTestPdf(); if (testPdf == null) return; - long originalSize = Files.size(testPdf); Path pdf = tempDir.resolve("xmp-only.pdf"); Files.copy(testPdf, pdf); - // Only XMP, no setMetadata - should still use fast path + // Only XMP, no setMetadata - always uses PDFium native save + validated incremental update try (PdfDocument doc = PdfDocument.open(pdf)) { doc.setXmpMetadata(buildBookloreXmp("XMP Only Title", "XMP Author")); doc.save(pdf); } long afterSave = Files.size(pdf); - assertTrue( - afterSave - originalSize < 20_000, - "XMP-only save should add at most 20 KB, but grew by " + (afterSave - originalSize)); + assertTrue(afterSave > 0, "Saved file should not be empty"); try (PdfDocument doc = PdfDocument.open(pdf)) { String xmp = doc.xmpMetadataString(); assertTrue(xmp.contains("XMP Only Title"), "XMP should be present"); } } + + // --- Cross-reference stream (§7.5.8) tests --- + + /** + * Create a minimal valid PDF that uses a cross-reference stream instead of a traditional xref + * table. This is the format used by many modern PDF generators (e.g., Stirling-PDF, Chrome + * print). + */ + @SuppressWarnings("PMD.UnusedAssignment") + private static byte[] minimalXrefStreamPdf() { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + List offsets = new java.util.ArrayList<>(); + + writeBytes(out, "%PDF-1.5\n"); + + offsets.add(out.size()); + writeBytes(out, "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"); + + offsets.add(out.size()); + writeBytes(out, "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n"); + + offsets.add(out.size()); + writeBytes(out, "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\n"); + + int xrefStreamOffset = out.size(); + + // W=[1,4,0]: type(1 byte) + offset(4 bytes big-endian) + gen(0, implicit 0) + // Index=[0 5]: objects 0–4 contiguous + byte[] xrefData = new byte[5 * 5]; // 5 entries × 5 bytes + int di = 0; + // Object 0: free entry + xrefData[di++] = 0; + xrefData[di++] = 0; + xrefData[di++] = 0; + xrefData[di++] = 0; + xrefData[di++] = 0; + // Objects 1–3: in-use + for (int off : offsets) { + xrefData[di++] = 1; + xrefData[di++] = (byte) ((off >> 24) & 0xFF); + xrefData[di++] = (byte) ((off >> 16) & 0xFF); + xrefData[di++] = (byte) ((off >> 8) & 0xFF); + xrefData[di++] = (byte) (off & 0xFF); + } + // Object 4: the xref stream itself + xrefData[di++] = 1; + xrefData[di++] = (byte) ((xrefStreamOffset >> 24) & 0xFF); + xrefData[di++] = (byte) ((xrefStreamOffset >> 16) & 0xFF); + xrefData[di++] = (byte) ((xrefStreamOffset >> 8) & 0xFF); + xrefData[di++] = (byte) (xrefStreamOffset & 0xFF); + + writeBytes( + out, + "4 0 obj\n<< /Type /XRef /Size 5 /Root 1 0 R" + + " /W [1 4 0] /Index [0 5] /Length " + + xrefData.length + + " >>\nstream\n"); + out.write(xrefData, 0, xrefData.length); + writeBytes(out, "\nendstream\nendobj\n"); + writeBytes(out, "startxref\n" + xrefStreamOffset + "\n%%EOF\n"); + + return out.toByteArray(); + } + + private static void writeBytes(ByteArrayOutputStream out, String s) { + byte[] b = s.getBytes(StandardCharsets.ISO_8859_1); + out.write(b, 0, b.length); + } + + @Test + @EnabledIf("pdfiumAvailable") + void metadataSaveWithXrefStreamPdf(@TempDir Path tempDir) throws IOException { + byte[] xrefStreamPdf = minimalXrefStreamPdf(); + Path source = tempDir.resolve("xref-stream.pdf"); + Files.write(source, xrefStreamPdf); + + try (PdfDocument doc = PdfDocument.open(source)) { + assertEquals(1, doc.pageCount()); + } + + Path output = tempDir.resolve("xref-stream-meta.pdf"); + try (PdfDocument doc = PdfDocument.open(source)) { + doc.setMetadata(MetadataTag.TITLE, "XRef Stream Test"); + doc.setMetadata(MetadataTag.AUTHOR, "Test Author"); + doc.save(output); + } + + try (PdfDocument doc = PdfDocument.open(output)) { + assertEquals("XRef Stream Test", doc.metadata(MetadataTag.TITLE).orElse("")); + assertEquals("Test Author", doc.metadata(MetadataTag.AUTHOR).orElse("")); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void xmpMetadataSaveWithXrefStreamPdf(@TempDir Path tempDir) throws IOException { + byte[] xrefStreamPdf = minimalXrefStreamPdf(); + Path source = tempDir.resolve("xref-stream.pdf"); + Files.write(source, xrefStreamPdf); + + Path output = tempDir.resolve("xref-stream-xmp.pdf"); + try (PdfDocument doc = PdfDocument.open(source)) { + doc.setMetadata(MetadataTag.TITLE, "XRef Stream XMP"); + doc.setXmpMetadata(buildBookloreXmp("XRef Stream XMP", "Test Author")); + doc.save(output); + } + + try (PdfDocument doc = PdfDocument.open(output)) { + assertEquals("XRef Stream XMP", doc.metadata(MetadataTag.TITLE).orElse("")); + String xmp = doc.xmpMetadataString(); + assertTrue(xmp.contains("XRef Stream XMP"), "XMP should contain title"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void xrefStreamPdfDoubleMetadataWrite(@TempDir Path tempDir) throws IOException { + byte[] xrefStreamPdf = minimalXrefStreamPdf(); + Path source = tempDir.resolve("xref-stream.pdf"); + Files.write(source, xrefStreamPdf); + + Path first = tempDir.resolve("first.pdf"); + try (PdfDocument doc = PdfDocument.open(source)) { + doc.setMetadata(MetadataTag.TITLE, "First Title"); + doc.setXmpMetadata(buildBookloreXmp("First Title", "First Author")); + doc.save(first); + } + + Path second = tempDir.resolve("second.pdf"); + try (PdfDocument doc = PdfDocument.open(first)) { + doc.setMetadata(MetadataTag.TITLE, "Second Title"); + doc.setXmpMetadata(buildBookloreXmp("Second Title", "Second Author")); + doc.save(second); + } + + try (PdfDocument doc = PdfDocument.open(second)) { + assertEquals("Second Title", doc.metadata(MetadataTag.TITLE).orElse("")); + XmpMetadata parsed = XmpMetadataParser.parse(doc.xmpMetadata()); + assertEquals("Second Title", parsed.title().orElse("")); + assertEquals(List.of("Second Author"), parsed.creators()); + } + } }