diff --git a/build.gradle.kts b/build.gradle.kts index 31b68e3..cd0048c 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -13,7 +13,7 @@ plugins { allprojects { group = "org.grimmory" - version = "0.6.0" + version = "0.9.0" repositories { mavenCentral() diff --git a/src/main/java/org/grimmory/pdfium4j/PdfDocument.java b/src/main/java/org/grimmory/pdfium4j/PdfDocument.java index 4bc8671..d7ed6cc 100644 --- a/src/main/java/org/grimmory/pdfium4j/PdfDocument.java +++ b/src/main/java/org/grimmory/pdfium4j/PdfDocument.java @@ -9,7 +9,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; -import java.util.stream.IntStream; import org.grimmory.pdfium4j.exception.PdfCorruptException; import org.grimmory.pdfium4j.exception.PdfPasswordException; import org.grimmory.pdfium4j.exception.PdfiumException; @@ -227,6 +226,27 @@ public static Optional koReaderPartialMd5(byte[] data) { return KoReaderChecksum.calculate(data); } + /** + * Create a new empty PDF document. + * + * @throws PdfiumException if the document cannot be created + */ + public static PdfDocument create() { + PdfiumLibrary.ensureInitialized(); + try { + MemorySegment handle = (MemorySegment) EditBindings.FPDF_CreateNewDocument.invokeExact(); + if (handle.equals(MemorySegment.NULL)) { + throw new PdfiumException("Failed to create new PDF document"); + } + return new PdfDocument( + handle, null, null, null, PdfProcessingPolicy.defaultPolicy(), Thread.currentThread()); + } catch (PdfiumException e) { + throw e; + } catch (Throwable t) { + throw new PdfiumException("Failed to create new PDF document", t); + } + } + /** * Open a PDF from a file path. * @@ -405,6 +425,7 @@ public PdfPage page(int index) { PdfPage page = new PdfPage( pageSeg, + handle, ownerThread, policy.maxRenderPixels(), () -> unregisterPage(holder[0]), @@ -569,8 +590,14 @@ public Map renderPages(int startIndex, int endIndex, int + " pages"); } - List indices = IntStream.rangeClosed(startIndex, endIndex).boxed().toList(); - return renderPages(indices, dpi); + ensureOpen(); + Map results = new LinkedHashMap<>(endIndex - startIndex + 1); + for (int i = startIndex; i <= endIndex; i++) { + try (PdfPage page = page(i)) { + results.put(i, page.render(dpi)); + } + } + return Collections.unmodifiableMap(results); } /** @@ -580,8 +607,44 @@ public Map renderPages(int startIndex, int endIndex, int * @return map of page index to render result, in iteration order */ public Map renderAllPages(int dpi) { - List indices = IntStream.range(0, pageCount()).boxed().toList(); - return renderPages(indices, dpi); + return renderPages(0, pageCount() - 1, dpi); + } + + /** + * Render a single page and return encoded image bytes. This is a convenience method that handles + * page opening, rendering, encoding, and resource cleanup in a single call. + * + * @param pageIndex 0-based page index + * @param dpi render resolution (e.g. 150 for thumbnails, 300 for high quality) + * @param format image format: "jpeg" or "png" + * @return encoded image bytes + * @throws IllegalArgumentException if format is not "jpeg" or "png", or pageIndex is invalid + */ + public byte[] renderPageToBytes(int pageIndex, int dpi, String format) { + return renderPageToBytes(pageIndex, dpi, format, 0.85f); + } + + /** + * Render a single page and return encoded image bytes with configurable JPEG quality. + * + * @param pageIndex 0-based page index + * @param dpi render resolution + * @param format image format: "jpeg" or "png" + * @param jpegQuality JPEG quality from 0.0 to 1.0 (ignored for PNG) + * @return encoded image bytes + * @throws IllegalArgumentException if format is not "jpeg" or "png", or pageIndex is invalid + */ + public byte[] renderPageToBytes(int pageIndex, int dpi, String format, float jpegQuality) { + Objects.requireNonNull(format, "format"); + String fmt = format.toLowerCase(java.util.Locale.ROOT); + if (!fmt.equals("jpeg") && !fmt.equals("png")) { + throw new IllegalArgumentException("Format must be 'jpeg' or 'png', got: " + format); + } + + try (PdfPage page = page(pageIndex)) { + RenderResult result = page.render(dpi); + return fmt.equals("png") ? result.toPngBytes() : result.toJpegBytes(jpegQuality); + } } /** @@ -818,6 +881,43 @@ public Optional metadata(MetadataTag tag) { } } + /** + * Get a metadata value by an arbitrary Info Dictionary key string. This allows reading + * non-standard keys like "EBX_PUBLISHER" that are not covered by {@link MetadataTag}. + * + * @param key the raw Info Dictionary key name (e.g. "Title", "EBX_PUBLISHER") + * @return the value, or empty if not present + */ + public Optional metadata(String key) { + ensureOpen(); + Objects.requireNonNull(key, "key"); + + MetadataTag standardTag = MetadataTag.fromKey(key); + if (standardTag != null && pendingMetadata.containsKey(standardTag)) { + String value = pendingMetadata.get(standardTag); + return (value == null || value.isEmpty()) ? Optional.empty() : Optional.of(value); + } + + try (Arena arena = Arena.ofConfined()) { + MemorySegment keySeg = arena.allocateFrom(key); + + long needed = + (long) DocBindings.FPDF_GetMetaText.invokeExact(handle, keySeg, MemorySegment.NULL, 0L); + if (needed <= 2) return Optional.empty(); + + MemorySegment buf = arena.allocate(needed); + long written = (long) DocBindings.FPDF_GetMetaText.invokeExact(handle, keySeg, buf, needed); + if (written <= 2) { + return Optional.empty(); + } + + String value = FfmHelper.fromWideString(buf, needed); + return value.isEmpty() ? Optional.empty() : Optional.of(value); + } catch (Throwable t) { + throw new PdfiumException("Failed to read metadata: " + key, t); + } + } + /** Get all standard metadata as a map. Only non-empty values are included. */ public Map metadata() { Map map = new LinkedHashMap<>(); @@ -994,7 +1094,19 @@ private static byte[] extractXmpPacketFromFile(Path path) { System.arraycopy(buf, available - carry, buf, 0, carry); } - if (lastBeginFilePos < 0) return new byte[0]; + if (lastBeginFilePos < 0) { + // Fallback: read entire file and scan for ... + channel.position(0); + byte[] allBytes = new byte[(int) fileSize]; + var allBuf = java.nio.ByteBuffer.wrap(allBytes); + int totalRead = 0; + while (totalRead < fileSize) { + int n = channel.read(allBuf, totalRead); + if (n < 0) break; + totalRead += n; + } + return extractXmpmetaFallback(allBytes); + } // Phase 2: find after the last begin marker offset = lastBeginFilePos; @@ -1051,16 +1163,41 @@ private static byte[] extractXmpPacket(byte[] pdf) { lastBeginPos = pos; searchFrom = pos + 1; } - if (lastBeginPos < 0) return new byte[0]; - int endPos = indexOf(pdf, endMarker, lastBeginPos); - if (endPos < 0) return new byte[0]; + if (lastBeginPos >= 0) { + int endPos = indexOf(pdf, endMarker, lastBeginPos); + if (endPos >= 0) { + int endTagClose = + indexOf(pdf, "?>".getBytes(java.nio.charset.StandardCharsets.US_ASCII), endPos); + if (endTagClose >= 0) { + int packetEnd = endTagClose + 2; + byte[] xmp = new byte[packetEnd - lastBeginPos]; + System.arraycopy(pdf, lastBeginPos, xmp, 0, xmp.length); + return xmp; + } + } + } - int endTagClose = - indexOf(pdf, "?>".getBytes(java.nio.charset.StandardCharsets.US_ASCII), endPos); - if (endTagClose < 0) return new byte[0]; - int packetEnd = endTagClose + 2; + // Fallback: scan for ... (no xpacket wrapper) + return extractXmpmetaFallback(pdf); + } + private static byte[] extractXmpmetaFallback(byte[] pdf) { + byte[] beginTag = " 0; } + /** + * Check if this page appears to be blank — no text and no embedded images. This is a lightweight + * heuristic useful for detecting filler pages in scanned books. + * + * @return true if the page has no extractable text and no embedded images + */ + public boolean isBlank() { + return charCount() == 0 && imageCount() == 0; + } + /** * Get all annotations on this page. * @@ -503,6 +516,167 @@ public List webLinks() { }); } + /** + * Get the number of image objects embedded on this page. This counts inline images and image + * XObjects, not rendered visuals. + * + * @return number of embedded images, or 0 if none + */ + public int imageCount() { + ensureOpen(); + try { + int total = (int) EditBindings.FPDFPage_CountObjects.invokeExact(handle); + int count = 0; + for (int i = 0; i < total; i++) { + MemorySegment obj = (MemorySegment) EditBindings.FPDFPage_GetObject.invokeExact(handle, i); + if (!FfmHelper.isNull(obj)) { + int type = (int) EditBindings.FPDFPageObj_GetType.invokeExact(obj); + if (type == EditBindings.FPDF_PAGEOBJ_IMAGE) { + count++; + } + } + } + return count; + } catch (PdfiumException e) { + throw e; + } catch (Throwable t) { + throw new PdfiumException("Failed to count images", t); + } + } + + /** + * Get metadata about all embedded images on this page. This is a lightweight method that returns + * image dimensions and DPI without extracting pixel data. + * + * @return list of embedded image metadata, or empty list if none + */ + public List embeddedImages() { + ensureOpen(); + try { + int total = (int) EditBindings.FPDFPage_CountObjects.invokeExact(handle); + List images = new ArrayList<>(); + int imageIndex = 0; + + for (int i = 0; i < total; i++) { + MemorySegment obj = (MemorySegment) EditBindings.FPDFPage_GetObject.invokeExact(handle, i); + if (FfmHelper.isNull(obj)) continue; + + int type = (int) EditBindings.FPDFPageObj_GetType.invokeExact(obj); + if (type != EditBindings.FPDF_PAGEOBJ_IMAGE) continue; + + try (Arena arena = Arena.ofConfined()) { + MemorySegment meta = arena.allocate(EditBindings.IMAGE_METADATA_LAYOUT); + int ok = (int) EditBindings.FPDFImageObj_GetImageMetadata.invokeExact(obj, handle, meta); + if (ok != 0) { + int w = meta.get(ValueLayout.JAVA_INT, 0); + int h = meta.get(ValueLayout.JAVA_INT, 4); + float hdpi = meta.get(ValueLayout.JAVA_FLOAT, 8); + float vdpi = meta.get(ValueLayout.JAVA_FLOAT, 12); + int bpp = meta.get(ValueLayout.JAVA_INT, 16); + images.add(new EmbeddedImage(imageIndex, w, h, bpp, hdpi, vdpi)); + } else { + images.add(new EmbeddedImage(imageIndex, 0, 0, 0, 0f, 0f)); + } + } + imageIndex++; + } + return List.copyOf(images); + } catch (PdfiumException e) { + throw e; + } catch (Throwable t) { + throw new PdfiumException("Failed to get embedded images", t); + } + } + + /** + * Render a specific embedded image to pixel data. The image is rendered with its original + * dimensions. Use the index from {@link EmbeddedImage#index()}. + * + * @param imageIndex the image index among image objects on this page (0-based) + * @return rendered pixel data + * @throws IllegalArgumentException if imageIndex is invalid + */ + public RenderResult renderEmbeddedImage(int imageIndex) { + ensureOpen(); + try { + int total = (int) EditBindings.FPDFPage_CountObjects.invokeExact(handle); + int currentImageIndex = 0; + + for (int i = 0; i < total; i++) { + MemorySegment obj = (MemorySegment) EditBindings.FPDFPage_GetObject.invokeExact(handle, i); + if (FfmHelper.isNull(obj)) continue; + + int type = (int) EditBindings.FPDFPageObj_GetType.invokeExact(obj); + if (type != EditBindings.FPDF_PAGEOBJ_IMAGE) continue; + + if (currentImageIndex == imageIndex) { + try (Arena arena = Arena.ofConfined()) { + MemorySegment meta = arena.allocate(EditBindings.IMAGE_METADATA_LAYOUT); + int metaOk = + (int) EditBindings.FPDFImageObj_GetImageMetadata.invokeExact(obj, handle, meta); + if (metaOk != 0) { + int metaW = meta.get(ValueLayout.JAVA_INT, 0); + int metaH = meta.get(ValueLayout.JAVA_INT, 4); + ensureRenderBudget(metaW, metaH); + } + } + + MemorySegment bitmap = MemorySegment.NULL; + try { + bitmap = + (MemorySegment) + EditBindings.FPDFImageObj_GetRenderedBitmap.invokeExact( + documentHandle, handle, obj); + if (FfmHelper.isNull(bitmap)) { + throw new PdfiumException( + "FPDFImageObj_GetRenderedBitmap failed for image " + imageIndex); + } + + int w = (int) BitmapBindings.FPDFBitmap_GetWidth.invokeExact(bitmap); + int h = (int) BitmapBindings.FPDFBitmap_GetHeight.invokeExact(bitmap); + int stride = (int) BitmapBindings.FPDFBitmap_GetStride.invokeExact(bitmap); + MemorySegment buffer = + (MemorySegment) BitmapBindings.FPDFBitmap_GetBuffer.invokeExact(bitmap); + + byte[] rgba = buffer.reinterpret((long) stride * h).toArray(ValueLayout.JAVA_BYTE); + + if (stride != w * BYTES_PER_PIXEL) { + byte[] packed = new byte[w * h * BYTES_PER_PIXEL]; + for (int row = 0; row < h; row++) { + System.arraycopy( + rgba, row * stride, packed, row * w * BYTES_PER_PIXEL, w * BYTES_PER_PIXEL); + } + rgba = packed; + } + + // FPDFImageObj_GetRenderedBitmap returns BGRA; convert to RGBA + for (int px = 0; px < rgba.length; px += BYTES_PER_PIXEL) { + byte tmp = rgba[px]; + rgba[px] = rgba[px + 2]; + rgba[px + 2] = tmp; + } + + return new RenderResult(w, h, rgba); + } finally { + if (!FfmHelper.isNull(bitmap)) { + try { + BitmapBindings.FPDFBitmap_Destroy.invokeExact(bitmap); + } catch (Throwable ignored) { + } + } + } + } + currentImageIndex++; + } + throw new IllegalArgumentException( + "Image index " + imageIndex + " not found, page has " + currentImageIndex + " images"); + } catch (PdfiumException | IllegalArgumentException e) { + throw e; + } catch (Throwable t) { + throw new PdfiumException("Failed to render embedded image " + imageIndex, t); + } + } + private static String getWebLinkUrl(MemorySegment pageLink, int linkIndex) { try (Arena arena = Arena.ofConfined()) { int charCount = diff --git a/src/main/java/org/grimmory/pdfium4j/PdfSaver.java b/src/main/java/org/grimmory/pdfium4j/PdfSaver.java index 987597a..81d0653 100644 --- a/src/main/java/org/grimmory/pdfium4j/PdfSaver.java +++ b/src/main/java/org/grimmory/pdfium4j/PdfSaver.java @@ -29,7 +29,12 @@ final class PdfSaver { private static final ThreadLocal WRITE_BUFFER = new ThreadLocal<>(); - private static final Pattern PATTERN = Pattern.compile("/Metadata\\s+\\d+\\s+\\d+\\s+R"); + private static final Pattern METADATA_REF_PATTERN = + Pattern.compile("/Metadata\\s+\\d+\\s+\\d+\\s+R"); + private static final Pattern ROOT_REF_PATTERN = Pattern.compile("/Root\\s+(\\d+\\s+\\d+\\s+R)"); + private static final Pattern INFO_REF_PATTERN = Pattern.compile("/Info\\s+(\\d+\\s+\\d+\\s+R)"); + private static final Pattern OBJ_NUM_PATTERN = Pattern.compile("(\\d+)\\s+0\\s+obj\\b"); + private static final Pattern FIRST_INT_PATTERN = Pattern.compile("(\\d+)"); private PdfSaver() {} @@ -211,7 +216,7 @@ private static byte[] appendCatalogUpdate( // Remove existing /Metadata if present, add our new one String dict = catalogDict; - dict = PATTERN.matcher(dict).replaceFirst(""); + dict = METADATA_REF_PATTERN.matcher(dict).replaceFirst(""); // Insert /Metadata ref before the closing >> int closeIdx = dict.lastIndexOf(">>"); if (closeIdx >= 0) { @@ -232,8 +237,7 @@ private static byte[] appendCatalogUpdate( int catalogOffset = baseOffset + update.length(); update.append(newCatalog); - // Find actual previous xref from the current file end - int actualPrev = findLastStartxrefValue(new String(pdf, StandardCharsets.ISO_8859_1)); + int actualPrev = findLastStartxrefValue(text); int xrefOffset = baseOffset + update.length(); update.append("xref\n"); @@ -245,7 +249,7 @@ private static byte[] appendCatalogUpdate( update.append("<< /Size ").append(sizeBase); update.append(" /Root ").append(trailer.rootRef); // Carry forward /Info from previous trailer - String prevTrailerInfo = findTrailerEntry(new String(pdf, StandardCharsets.ISO_8859_1), "Info"); + String prevTrailerInfo = findTrailerEntry(text, "Info"); if (prevTrailerInfo != null) { update.append(" /Info ").append(prevTrailerInfo); } @@ -309,15 +313,13 @@ private static TrailerInfo parseTrailer(String text) { // For cross-reference streams, find the last xref stream object // which contains /Root in its dictionary - Pattern rootPattern = Pattern.compile("/Root\\s+(\\d+\\s+\\d+\\s+R)"); - Matcher m = rootPattern.matcher(text); + Matcher m = ROOT_REF_PATTERN.matcher(text); String lastRoot = null; String lastInfo = null; while (m.find()) { lastRoot = m.group(1); } - Pattern infoPattern = Pattern.compile("/Info\\s+(\\d+\\s+\\d+\\s+R)"); - Matcher m2 = infoPattern.matcher(text); + Matcher m2 = INFO_REF_PATTERN.matcher(text); while (m2.find()) { lastInfo = m2.group(1); } @@ -325,6 +327,13 @@ private static TrailerInfo parseTrailer(String text) { } private static String findTrailerEntry(String text, String key) { + Pattern p = + switch (key) { + case "Root" -> ROOT_REF_PATTERN; + case "Info" -> INFO_REF_PATTERN; + default -> Pattern.compile("/" + key + "\\s+(\\d+\\s+\\d+\\s+R)"); + }; + // Search backwards from end for the latest trailer int searchFrom = text.length(); while (true) { @@ -338,7 +347,6 @@ private static String findTrailerEntry(String text, String key) { if (dictEnd < 0) break; String dict = text.substring(dictStart, dictEnd + 2); - Pattern p = Pattern.compile("/" + key + "\\s+(\\d+\\s+\\d+\\s+R)"); Matcher m = p.matcher(dict); if (m.find()) { return m.group(1); @@ -362,7 +370,7 @@ private static int findLastStartxrefValue(String text) { private static int extractObjNum(String ref) { // "N 0 R" -> N - Matcher m = Pattern.compile("(\\d+)").matcher(ref); + Matcher m = FIRST_INT_PATTERN.matcher(ref); if (m.find()) return Integer.parseInt(m.group(1)); return 1; } @@ -399,8 +407,7 @@ private static String findObjectDict(String text, int objNum) { // --- Helpers --- private static int findMaxObjectNumber(String pdfText) { - Pattern objPattern = Pattern.compile("(\\d+)\\s+0\\s+obj\\b"); - Matcher matcher = objPattern.matcher(pdfText); + Matcher matcher = OBJ_NUM_PATTERN.matcher(pdfText); int max = 0; while (matcher.find()) { int num = Integer.parseInt(matcher.group(1)); diff --git a/src/main/java/org/grimmory/pdfium4j/internal/EditBindings.java b/src/main/java/org/grimmory/pdfium4j/internal/EditBindings.java index 1e3df66..0acc692 100644 --- a/src/main/java/org/grimmory/pdfium4j/internal/EditBindings.java +++ b/src/main/java/org/grimmory/pdfium4j/internal/EditBindings.java @@ -68,6 +68,10 @@ private static MethodHandle downcallCritical(String name, FunctionDescriptor des public static final FunctionDescriptor WRITE_BLOCK_DESC = FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_LONG); + /** Create a new empty document. Returns FPDF_DOCUMENT handle (NULL on failure). */ + public static final MethodHandle FPDF_CreateNewDocument = + downcall("FPDF_CreateNewDocument", FunctionDescriptor.of(ADDRESS)); + /** * Save the document to an FPDF_FILEWRITE sink. Flags: 0 = full save, 1 = incremental. Returns 1 * on success. @@ -99,4 +103,58 @@ private static MethodHandle downcallCritical(String name, FunctionDescriptor des public static final MethodHandle FPDF_ImportPages = downcall( "FPDF_ImportPages", FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, ADDRESS, JAVA_INT)); + + // -- Page object API (fpdf_edit.h) -- + + /** Get the number of page objects on a page. */ + public static final MethodHandle FPDFPage_CountObjects = + downcallCritical("FPDFPage_CountObjects", FunctionDescriptor.of(JAVA_INT, ADDRESS)); + + /** Get a page object by index. Returns FPDF_PAGEOBJECT handle (do NOT free individually). */ + public static final MethodHandle FPDFPage_GetObject = + downcallCritical("FPDFPage_GetObject", FunctionDescriptor.of(ADDRESS, ADDRESS, JAVA_INT)); + + /** Get the type of a page object. Returns FPDF_PAGEOBJ_* constant. */ + public static final MethodHandle FPDFPageObj_GetType = + downcallCritical("FPDFPageObj_GetType", FunctionDescriptor.of(JAVA_INT, ADDRESS)); + + /** FPDF_PAGEOBJ_IMAGE constant (image XObject). */ + public static final int FPDF_PAGEOBJ_IMAGE = 3; + + /** + * Render an image object's content to a bitmap (respecting transforms). Returns FPDF_BITMAP + * handle (caller must destroy). Parameters: document, page, imageObject. + */ + public static final MethodHandle FPDFImageObj_GetRenderedBitmap = + downcall( + "FPDFImageObj_GetRenderedBitmap", + FunctionDescriptor.of(ADDRESS, ADDRESS, ADDRESS, ADDRESS)); + + /** + * Get the image metadata for an image object. Writes into an FPDF_IMAGEOBJ_METADATA struct. + * Returns 1 on success. + * + *

FPDF_IMAGEOBJ_METADATA layout: { unsigned int width; unsigned int height; float + * horizontal_dpi; float vertical_dpi; unsigned int bits_per_pixel; int colorspace; int + * marked_content_id; } + */ + public static final MethodHandle FPDFImageObj_GetImageMetadata = + downcall( + "FPDFImageObj_GetImageMetadata", + FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, ADDRESS)); + + /** + * FPDF_IMAGEOBJ_METADATA struct layout. Fields: width (uint), height (uint), horizontal_dpi + * (float), vertical_dpi (float), bits_per_pixel (uint), colorspace (int), marked_content_id + * (int). + */ + public static final StructLayout IMAGE_METADATA_LAYOUT = + MemoryLayout.structLayout( + JAVA_INT.withName("width"), + JAVA_INT.withName("height"), + JAVA_FLOAT.withName("horizontal_dpi"), + JAVA_FLOAT.withName("vertical_dpi"), + JAVA_INT.withName("bits_per_pixel"), + JAVA_INT.withName("colorspace"), + JAVA_INT.withName("marked_content_id")); } diff --git a/src/main/java/org/grimmory/pdfium4j/model/EmbeddedImage.java b/src/main/java/org/grimmory/pdfium4j/model/EmbeddedImage.java new file mode 100644 index 0000000..8fec937 --- /dev/null +++ b/src/main/java/org/grimmory/pdfium4j/model/EmbeddedImage.java @@ -0,0 +1,21 @@ +package org.grimmory.pdfium4j.model; + +/** + * Metadata about an image embedded within a PDF page. The image can be rendered to pixel data via + * {@link org.grimmory.pdfium4j.PdfPage#renderEmbeddedImage(int)}. + * + * @param index the image's index among image objects on the page (0-based) + * @param width image width in pixels + * @param height image height in pixels + * @param bitsPerPixel bits per pixel (e.g. 8, 24, 32) + * @param horizontalDpi horizontal resolution in DPI + * @param verticalDpi vertical resolution in DPI + */ +public record EmbeddedImage( + int index, int width, int height, int bitsPerPixel, float horizontalDpi, float verticalDpi) { + + /** Total pixel count for this image. */ + public long pixelCount() { + return (long) width * height; + } +} diff --git a/src/main/java/org/grimmory/pdfium4j/model/PdfBookMetadata.java b/src/main/java/org/grimmory/pdfium4j/model/PdfBookMetadata.java index f84544e..e75860d 100644 --- a/src/main/java/org/grimmory/pdfium4j/model/PdfBookMetadata.java +++ b/src/main/java/org/grimmory/pdfium4j/model/PdfBookMetadata.java @@ -42,9 +42,9 @@ public record PdfBookMetadata( implements BookMetadata { private static final System.Logger LOG = System.getLogger(PdfBookMetadata.class.getName()); - private static final Pattern PATTERN = Pattern.compile("^(\\d)\\1{9,12}$"); - private static final Pattern REGEX = Pattern.compile("[^0-9Xx]"); - private static final Pattern REGEXP = Pattern.compile("[,;]"); + private static final Pattern UNIFORM_DIGIT_PATTERN = Pattern.compile("^(\\d)\\1{9,12}$"); + private static final Pattern NON_ISBN_CHARS_PATTERN = Pattern.compile("[^0-9Xx]"); + private static final Pattern SEPARATOR_PATTERN = Pattern.compile("[,;]"); public PdfBookMetadata { authors = Collections.unmodifiableList(authors); @@ -100,7 +100,7 @@ public static PdfBookMetadata from(PdfDocument document) { List subjects = new ArrayList<>(xmp.subjects()); Optional keywordsOpt = document.metadata(MetadataTag.KEYWORDS); if (keywordsOpt.isPresent()) { - for (String part : REGEXP.split(keywordsOpt.get())) { + for (String part : SEPARATOR_PATTERN.split(keywordsOpt.get())) { String trimmed = part.trim(); if (!trimmed.isBlank() && !subjects.contains(trimmed)) { subjects.add(trimmed); @@ -155,7 +155,7 @@ private static List extractAuthors(XmpMetadata xmp, PdfDocument document .metadata(MetadataTag.AUTHOR) .ifPresent( author -> { - String[] parts = REGEXP.split(author); + String[] parts = SEPARATOR_PATTERN.split(author); for (String part : parts) { String trimmed = part.trim(); if (!trimmed.isBlank() && !authors.contains(trimmed)) { @@ -175,7 +175,7 @@ private static Optional extractIsbn(XmpMetadata xmp) { } for (String id : xmp.identifiers()) { - if (id.toLowerCase().contains("isbn")) { + if (id.toLowerCase(Locale.ROOT).contains("isbn")) { String cleaned = cleanIsbn(id); if (cleaned != null) { return Optional.of(cleaned); @@ -188,6 +188,7 @@ private static Optional extractIsbn(XmpMetadata xmp) { private static final Pattern PDF_DATE_PATTERN = Pattern.compile("^D:(\\d{4})(\\d{2})?(\\d{2})?(\\d{2})?(\\d{2})?(\\d{2})?"); + private static final Pattern FOUR_DIGIT_YEAR_PATTERN = Pattern.compile("\\b(\\d{4})\\b"); private static final List DATE_FORMATS = List.of( @@ -244,7 +245,7 @@ private static Optional parseDate(String dateStr) { } // Last resort: extract 4-digit year - Matcher yearMatcher = Pattern.compile("\\b(\\d{4})\\b").matcher(dateStr); + Matcher yearMatcher = FOUR_DIGIT_YEAR_PATTERN.matcher(dateStr); if (yearMatcher.find()) { int year = Integer.parseInt(yearMatcher.group(1)); if (year >= 1000 && year <= 9999) { @@ -297,10 +298,10 @@ private static Map extractCustomFields(XmpMetadata xmp, PdfDocum private static String cleanIsbn(String id) { if (id == null) return null; - String cleaned = REGEX.matcher(id).replaceAll("").toUpperCase(); + String cleaned = NON_ISBN_CHARS_PATTERN.matcher(id).replaceAll("").toUpperCase(); // Reject uniform sequences like 0000000000 or 1111111111111 (fake ISBNs) - if (PATTERN.matcher(cleaned).matches()) return null; + if (UNIFORM_DIGIT_PATTERN.matcher(cleaned).matches()) return null; if (cleaned.length() == 10 && isValidIsbn10(cleaned)) return cleaned; if (cleaned.length() == 13 && isValidIsbn13(cleaned)) return cleaned; diff --git a/src/main/java/org/grimmory/pdfium4j/model/RenderResult.java b/src/main/java/org/grimmory/pdfium4j/model/RenderResult.java index 640a2a2..18a2900 100644 --- a/src/main/java/org/grimmory/pdfium4j/model/RenderResult.java +++ b/src/main/java/org/grimmory/pdfium4j/model/RenderResult.java @@ -1,6 +1,15 @@ package org.grimmory.pdfium4j.model; +import java.awt.Graphics2D; import java.awt.image.BufferedImage; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.UncheckedIOException; +import javax.imageio.IIOImage; +import javax.imageio.ImageIO; +import javax.imageio.ImageWriteParam; +import javax.imageio.ImageWriter; +import javax.imageio.stream.ImageOutputStream; /** * Result of rendering a PDF page: raw pixel data plus dimensions. @@ -53,4 +62,78 @@ public BufferedImage toBufferedImage() { img.setRGB(0, 0, width, height, pixels, 0, width); return img; } + + /** + * Encode this render result as JPEG bytes with the specified quality. + * + * @param quality JPEG quality from 0.0 (worst) to 1.0 (best) + * @return JPEG-encoded bytes + * @throws UncheckedIOException if encoding fails + */ + public byte[] toJpegBytes(float quality) { + if (quality < 0f || quality > 1f) { + throw new IllegalArgumentException( + "JPEG quality must be between 0.0 and 1.0, got: " + quality); + } + BufferedImage img = toBufferedImage(); + try { + // Convert ARGB to RGB (JPEG doesn't support alpha) + BufferedImage rgb = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB); + Graphics2D g = rgb.createGraphics(); + try { + g.drawImage(img, 0, 0, null); + } finally { + g.dispose(); + } + img.flush(); + + ImageWriter writer = ImageIO.getImageWritersByFormatName("JPEG").next(); + try { + ImageWriteParam param = writer.getDefaultWriteParam(); + param.setCompressionMode(ImageWriteParam.MODE_EXPLICIT); + param.setCompressionQuality(quality); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (ImageOutputStream ios = ImageIO.createImageOutputStream(baos)) { + writer.setOutput(ios); + writer.write(null, new IIOImage(rgb, null, null), param); + } + return baos.toByteArray(); + } finally { + writer.dispose(); + rgb.flush(); + } + } catch (IOException e) { + throw new UncheckedIOException("Failed to encode JPEG", e); + } + } + + /** + * Encode this render result as JPEG bytes with default quality (0.85). + * + * @return JPEG-encoded bytes + * @throws UncheckedIOException if encoding fails + */ + public byte[] toJpegBytes() { + return toJpegBytes(0.85f); + } + + /** + * Encode this render result as PNG bytes (lossless, with alpha). + * + * @return PNG-encoded bytes + * @throws UncheckedIOException if encoding fails + */ + public byte[] toPngBytes() { + BufferedImage img = toBufferedImage(); + try { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ImageIO.write(img, "PNG", baos); + return baos.toByteArray(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to encode PNG", e); + } finally { + img.flush(); + } + } } diff --git a/src/test/java/org/grimmory/pdfium4j/PdfDocumentTest.java b/src/test/java/org/grimmory/pdfium4j/PdfDocumentTest.java index ba3bcbc..53d048f 100644 --- a/src/test/java/org/grimmory/pdfium4j/PdfDocumentTest.java +++ b/src/test/java/org/grimmory/pdfium4j/PdfDocumentTest.java @@ -857,6 +857,204 @@ void setMetadataClearValue(@TempDir Path tempDir) throws IOException { } } + // --- Tests for new APIs (metadata(String), renderPageToBytes, RenderResult encoding, image + // extraction, isBlank) --- + + @Test + @EnabledIf("pdfiumAvailable") + void metadataByStringKeyReadsStandardTag(@TempDir Path tempDir) throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + Path output = tempDir.resolve("string-key-meta.pdf"); + try (PdfDocument doc = PdfDocument.open(testPdf)) { + doc.setMetadata(MetadataTag.TITLE, "StringKeyTest"); + doc.save(output); + } + + try (PdfDocument doc = PdfDocument.open(output)) { + // Read via String key (same underlying FPDF_GetMetaText call) + Optional title = doc.metadata("Title"); + assertTrue(title.isPresent(), "Title should be readable via string key"); + assertEquals("StringKeyTest", title.get()); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void metadataByStringKeyReturnsEmptyForMissing() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf)) { + Optional result = doc.metadata("NonExistentCustomKey"); + assertTrue(result.isEmpty(), "Non-existent key should return empty"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void renderResultToJpegBytesProducesValidJpeg() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf); + PdfPage page = doc.page(0)) { + RenderResult result = page.render(72); + byte[] jpeg = result.toJpegBytes(); + + assertTrue(jpeg.length > 0, "JPEG bytes should not be empty"); + // JPEG magic bytes: FF D8 FF + assertEquals((byte) 0xFF, jpeg[0], "JPEG should start with 0xFF"); + assertEquals((byte) 0xD8, jpeg[1], "JPEG byte 2 should be 0xD8"); + assertEquals((byte) 0xFF, jpeg[2], "JPEG byte 3 should be 0xFF"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void renderResultToJpegBytesWithQuality() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf); + PdfPage page = doc.page(0)) { + RenderResult result = page.render(72); + byte[] lowQuality = result.toJpegBytes(0.1f); + byte[] highQuality = result.toJpegBytes(0.95f); + + assertTrue(lowQuality.length > 0, "Low quality JPEG should not be empty"); + assertTrue(highQuality.length > 0, "High quality JPEG should not be empty"); + assertTrue( + highQuality.length > lowQuality.length, + "High quality JPEG should be larger than low quality"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void renderResultToPngBytesProducesValidPng() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf); + PdfPage page = doc.page(0)) { + RenderResult result = page.render(72); + byte[] png = result.toPngBytes(); + + assertTrue(png.length > 0, "PNG bytes should not be empty"); + // PNG magic bytes: 89 50 4E 47 0D 0A 1A 0A + assertEquals((byte) 0x89, png[0], "PNG byte 1"); + assertEquals((byte) 0x50, png[1], "PNG byte 2 (P)"); + assertEquals((byte) 0x4E, png[2], "PNG byte 3 (N)"); + assertEquals((byte) 0x47, png[3], "PNG byte 4 (G)"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void renderPageToBytesJpeg() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf)) { + byte[] jpeg = doc.renderPageToBytes(0, 150, "jpeg"); + + assertTrue(jpeg.length > 0, "Rendered JPEG should not be empty"); + assertEquals((byte) 0xFF, jpeg[0], "Should be JPEG format"); + assertEquals((byte) 0xD8, jpeg[1], "Should be JPEG format"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void renderPageToBytesPng() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf)) { + byte[] png = doc.renderPageToBytes(0, 150, "png"); + + assertTrue(png.length > 0, "Rendered PNG should not be empty"); + assertEquals((byte) 0x89, png[0], "Should be PNG format"); + assertEquals((byte) 0x50, png[1], "Should be PNG format"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void renderPageToBytesInvalidFormat() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf)) { + assertThrows( + IllegalArgumentException.class, + () -> doc.renderPageToBytes(0, 150, "bmp"), + "Should reject unsupported format"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void pageIsBlankOnTextPage() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf); + PdfPage page = doc.page(0)) { + // Our test PDF has "Hello World" text + assertFalse(page.isBlank(), "Page with text should not be blank"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void pageIsBlankOnBlankPage(@TempDir Path tempDir) throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + Path blankPdf = tempDir.resolve("blank.pdf"); + + try (PdfDocument doc = PdfDocument.open(testPdf)) { + doc.insertBlankPage(doc.pageCount(), PageSize.A4); + doc.save(blankPdf); + } + + try (PdfDocument doc = PdfDocument.open(blankPdf)) { + // The last page is the blank one we inserted + try (PdfPage page = doc.page(doc.pageCount() - 1)) { + assertTrue(page.isBlank(), "Blank page with no text or images should be blank"); + } + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void pageImageCountOnTextOnlyPage() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf); + PdfPage page = doc.page(0)) { + // Minimal text-only PDF has no images + assertEquals(0, page.imageCount(), "Text-only page should have 0 images"); + } + } + + @Test + @EnabledIf("pdfiumAvailable") + void pageEmbeddedImagesOnTextOnlyPage() throws IOException { + Path testPdf = getTestPdf(); + if (testPdf == null) return; + + try (PdfDocument doc = PdfDocument.open(testPdf); + PdfPage page = doc.page(0)) { + List images = page.embeddedImages(); + assertTrue(images.isEmpty(), "Text-only page should have no embedded images"); + } + } + private Path getTestPdf() { var url = getClass().getResource("/test.pdf"); if (url != null) {