diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java index bce65b9604..41a4b029b3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java @@ -59,6 +59,8 @@ class XPSPageContentHandler extends DefaultHandler { private static final String BIDI_LEVEL = "BidiLevel"; private static final String INDICES = "Indices"; private static final String NAME = "Name"; + private static final String FONT_RENDERING_EM_SIZE = "FontRenderingEmSize"; + private static final String FONT_URI = "FontUri"; private static final String PATH = "Path"; private static final String NAVIGATE_URI = "FixedPage.NavigateUri"; private static final String IMAGE_SOURCE = "ImageSource"; @@ -73,6 +75,18 @@ class XPSPageContentHandler extends DefaultHandler { private static final String P = "p"; private static final String HREF = "href"; private static final String A = "a"; + + private static final char[] SPACE = new char[]{' '}; + + // Estimate width of glyph when better information is not available, measured in em + private static final float ESTIMATE_GLYPH_WIDTH = 0.5f; + + // The threshold for the horizontal distance between glyph runs to insert a whitespace, measured in em + private static final float WHITESPACE_THRESHOLD = 0.3f; + + // The threshold for the vertical distance between glyph runs to be considered on the same row, measured in em + private static final float 
ROW_COMBINE_THRESHOLD = 0.5f; + //sort based on y coordinate of first element in each row //this requires every row to have at least one element private static Comparator> ROW_SORTER = @@ -84,6 +98,18 @@ class XPSPageContentHandler extends DefaultHandler { } return 0; }; + private static Comparator LTR_SORTER = new Comparator() { + @Override + public int compare(GlyphRun a, GlyphRun b) { + return Float.compare(a.left(), b.left()); + } + }; + private static Comparator RTL_SORTER = new Comparator() { + @Override + public int compare(GlyphRun a, GlyphRun b) { + return Float.compare(b.left(), a.left()); + } + }; private final XHTMLContentHandler xhml; private final Map embeddedInfos; //path in zip file for an image rendered on this page @@ -93,7 +119,7 @@ class XPSPageContentHandler extends DefaultHandler { //buffer for the glyph runs within a given canvas //in insertion order private Map> canvases = new LinkedHashMap<>(); - private Set urls = new LinkedHashSet(); + private Set urls = new LinkedHashSet(); private Stack canvasStack = new Stack<>(); public XPSPageContentHandler(XHTMLContentHandler xhtml, Map embeddedInfos) { @@ -140,7 +166,9 @@ public void startElement(String uri, String localName, String qName, Attributes Float originY = null; String unicodeString = null; int bidilevel = 1; - String indicesString = null; + List indices = null; + float fontSize = 0; + String fontUri = null; for (int i = 0; i < atts.getLength(); i++) { String lName = atts.getLocalName(i); @@ -149,13 +177,13 @@ public void startElement(String uri, String localName, String qName, Attributes if (ORIGIN_X.equals(lName) && value.length() > 0) { try { - originX = Float.parseFloat(atts.getValue(i)); + originX = Float.parseFloat(value); } catch (NumberFormatException e) { throw new SAXException(e); } } else if (ORIGIN_Y.equals(lName) && value.length() > 0) { try { - originY = Float.parseFloat(atts.getValue(i)); + originY = Float.parseFloat(value); } catch (NumberFormatException e) { throw new 
SAXException(e); } @@ -163,14 +191,18 @@ public void startElement(String uri, String localName, String qName, Attributes unicodeString = atts.getValue(i); } else if (BIDI_LEVEL.equals(lName) && value.length() > 0) { try { - bidilevel = Integer.parseInt(atts.getValue(i)); + bidilevel = Integer.parseInt(value); } catch (NumberFormatException e) { throw new SAXException(e); } } else if (INDICES.equals(lName)) { - indicesString = atts.getValue(i); + indices = parseIndicesString(value); } else if (NAME.equals(lName)) { name = value; + } else if (FONT_RENDERING_EM_SIZE.equals(lName)) { + fontSize = Float.parseFloat(value); + } else if (FONT_URI.equals(lName)) { + fontUri = value; } } if (unicodeString != null) { @@ -181,10 +213,38 @@ public void startElement(String uri, String localName, String qName, Attributes if (runs == null) { runs = new ArrayList<>(); } - runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indicesString)); + runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indices, fontSize, fontUri)); canvases.put(currentCanvasClip, runs); } + } + // Parses an indices string into a list of GlyphIndex + private static List parseIndicesString(String indicesString) throws SAXException { + try { + ArrayList indices = new ArrayList<>(); + for (String indexString : indicesString.split(";", -1)) { + if (indexString.isEmpty()) { + indices.add(new GlyphIndex(0, 0.0f)); + continue; + } + int commaIndex = indexString.indexOf(','); + if (commaIndex == -1) { + int glyphIndex = Integer.parseInt(indexString); + indices.add(new GlyphIndex(glyphIndex, 0.0f)); + } else { + int glyphIndex = 0; + if (commaIndex > 0) { + glyphIndex = Integer.parseInt(indexString.substring(0, commaIndex)); + } + // Advance is measured in hundredths so divide by 100 + float advance = Float.parseFloat(indexString.substring(commaIndex + 1)) / 100.0f; + indices.add(new GlyphIndex(glyphIndex, advance)); + } + } + return indices; + } catch (NumberFormatException e) { 
+ throw new SAXException(e); + } } @Override @@ -234,7 +294,6 @@ private final void writePage() throws SAXException { } for (Map.Entry> e : canvases.entrySet()) { - String clip = e.getKey(); List runs = e.getValue(); if (runs.size() == 0) { continue; @@ -263,36 +322,45 @@ private final void writePage() throws SAXException { } private void writeRow(List row) throws SAXException { -/* - int rtl = 0; - int ltr = 0; - //if the row is entirely rtl, sort all as rtl - //otherwise sort ltr - for (GlyphRun r : row) { - //ignore directionality of pure spaces - if (r.unicodeString == null || r.unicodeString.trim().length() == 0) { - continue; - } - if (r.direction == GlyphRun.DIRECTION.RTL) { - rtl++; - } else { - ltr++; - } - } - if (rtl > 0 && ltr == 0) { - Collections.sort(row, GlyphRun.RTL_COMPARATOR); - } else { - Collections.sort(row, GlyphRun.LTR_COMPARATOR); - }*/ + sortRow(row); xhml.startElement(P); + GlyphRun previous = null; for (GlyphRun run : row) { - //figure out if you need to add a space + if (previous != null) { + float distanceFromPrevious = run.left() - previous.right(); + float averageFontSize = (run.fontSize + previous.fontSize) / 2f; + if (distanceFromPrevious > averageFontSize * WHITESPACE_THRESHOLD) { + xhml.ignorableWhitespace(SPACE, 0, SPACE.length); + } + } xhml.characters(run.unicodeString); + previous = run; } xhml.endElement(P); } + private static void sortRow(List row) { + boolean allRTL = true; + for (GlyphRun run : row) { + if (run.unicodeString.trim().length() == 0) { + // ignore whitespace for all RTL check + continue; + } + if (run.direction == GlyphRun.DIRECTION.LTR) { + allRTL = false; + break; + } + } + if (allRTL) { + // If all the text in a row is RTL then sort it in reverse + java.util.Collections.sort(row, RTL_SORTER); + } else { + // Otherwise sort it from left to right + java.util.Collections.sort(row, LTR_SORTER); + } + } + //returns a List of rows (where a row is a list of glyphruns) //the List is sorted in increasing order of 
the first y of each row private List> buildRows(List glyphRuns) { @@ -308,9 +376,9 @@ private List> buildRows(List glyphRuns) { boolean addedNewRow = false; //can rely on the last row having the highest y List row = rows.get(rows.size() - 1); - //0.5 is a purely heuristic/magical number that should be derived - //from the data, not made up. TODO: fix this - if (Math.abs(glyphRun.originY - row.get(0).originY) < 0.5) { + GlyphRun lastRun = row.get(row.size() - 1); + float averageFontSize = (glyphRun.fontSize + lastRun.fontSize) / 2f; + if (Math.abs(glyphRun.originY - lastRun.originY) < averageFontSize * ROW_COMBINE_THRESHOLD) { row.add(glyphRun); } else { row = new ArrayList<>(); @@ -339,19 +407,23 @@ final static class GlyphRun { private final String name; private final float originY; private final float originX; - //not currently used, but could be used for bidi text calculations private final String unicodeString; - private final String indicesString; - //not used yet + private final List indices; private final DIRECTION direction; -//not currently used, but could be used for width calculations + // Fonts em-size + private final float fontSize; + // Not used currently + private final String fontUri; private GlyphRun(String name, float originY, float originX, String unicodeString, - Integer bidiLevel, String indicesString) { + Integer bidiLevel, List indices, float fontSize, String fontUri) { this.name = name; this.unicodeString = unicodeString; this.originY = originY; this.originX = originX; + this.fontSize = fontSize; + this.fontUri = fontUri; + this.indices = indices; if (bidiLevel == null) { direction = DIRECTION.LTR; } else { @@ -361,12 +433,60 @@ private GlyphRun(String name, float originY, float originX, String unicodeString direction = DIRECTION.RTL; } } - this.indicesString = indicesString; } private enum DIRECTION { LTR, RTL } + + private float left() { + if (direction == DIRECTION.LTR) { + return originX; + } else { + return originX - width(); + } + } + 
+ private float right() { + if (direction == DIRECTION.LTR) { + return originX + width(); + } else { + return originX; + } + } + + private float width() { + float width = 0.0f; + for (int i = 0; i < indices.size(); i++) { + if (indices.get(i).advance == 0.0) { + if (i == 0) { + // If this is the first glyph use hard coded estimate + width += ESTIMATE_GLYPH_WIDTH; + } else { + // If advance is 0.0 it is probably the last glyph in the run; we don't know how wide it is, so we use the average of the previous widths as an estimate + width += width / i; + } + } else { + width += indices.get(i).advance; + } + } + return width * fontSize; + } + } + + final static class GlyphIndex { + // The index of the glyph in the font + private final int index; + // The placement of the glyph that follows relative to the origin of the current glyph. Measured as a multiple of the font's em-size. + // Should be multiplied by the font em-size to get a value that can be compared across GlyphRuns + // Will be zero for the last glyph in a glyph run + private final float advance; + + private GlyphIndex(int index, float advance) { + this.index = index; + this.advance = advance; + } + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java index b9a12700b0..60f206a829 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSParserTest.java @@ -48,8 +48,7 @@ public void testBasic() throws Exception { assertContains("

Attachment Test

", content); assertContains("

Different", content); - //I'd want this to be "tika content", but copy+paste in Windows yields tikacontent - assertContains("tikacontent", content); + assertContains("tika content", content); assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE)); @@ -104,14 +103,14 @@ public void testXPSWithDataDescriptor() throws Exception { XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor.xps") .toURI()); //test both path and stream based - List metadataList = getRecursiveMetadata(path, true); + List metadataList = getRecursiveMetadata(path); assertEquals(2, metadataList.size()); assertContains("This is my XPS document test", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); ByteArrayOutputStream bos = new ByteArrayOutputStream(); Files.copy(path, bos); - metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true); + metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), false); assertEquals(2, metadataList.size()); assertContains("This is my XPS document test", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); @@ -125,17 +124,36 @@ public void testOpenXPSWithDataDescriptor() throws Exception { Path path = Paths.get( XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor2.xps") .toURI()); - List metadataList = getRecursiveMetadata(path, true); + List metadataList = getRecursiveMetadata(path); assertEquals(2, metadataList.size()); assertContains("How was I supposed to know", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); ByteArrayOutputStream bos = new ByteArrayOutputStream(); Files.copy(path, bos); - metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true); + metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), false); assertEquals(2, metadataList.size()); assertContains("How was I supposed to know", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); } + @Test + 
public void testSpreadsheetXPS() throws Exception { + Path path = Paths.get(XPSParserTest.class.getResource("/test-documents/testXLSX.xps").toURI()); + List metadataList = getRecursiveMetadata(path); + String content = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); + assertContains("abcd efg", content); + assertContains("foo bar baz", content); + assertContains("spaced out", content); + } + + @Test + public void testTextDocumentXPS() throws Exception { + Path path = Paths.get(XPSParserTest.class.getResource("/test-documents/test_text.xps").toURI()); + List metadataList = getRecursiveMetadata(path); + String content = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT); + assertContains("Rainbow", content); + assertContains("Large font size", content); + assertContains("Parts of this are in italics and bold.", content); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXLSX.xps b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXLSX.xps new file mode 100644 index 0000000000..ed50059e2a Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testXLSX.xps differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_text.xps b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_text.xps new file mode 100644 index 0000000000..d49d39d189 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test_text.xps differ