Skip to content

Commit

Permalink
TIKA-4303: Handle OneNotePropertyEnum.CachedTitleString as RichEditTextUnicode (#2098)
Browse files (browse the repository at this point in the history)
  • Loading branch information
sunluman authored Jan 27, 2025
1 parent 1b8c1e2 commit 7f94520
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,8 @@ private List<Map<String, Object>> processPropertySet(PropertySet propertySet,
/**
 * Reports whether the given property is one of the enum constants compared below.
 * Presumably these are the properties whose values are treated as raw binary data;
 * confirm against the callers, which are not visible here.
 * <p>
 * NOTE(review): this listing is a rendered diff hunk with its +/- markers stripped, so
 * the pre-change terminator line and the post-change lines both appear in the body.
 *
 * @param property the property enum constant to test
 * @return {@code true} if {@code property} equals one of the listed constants
 */
private boolean propertyIsBinary(OneNotePropertyEnum property) {
return property == OneNotePropertyEnum.RgOutlineIndentDistance ||
property == OneNotePropertyEnum.NotebookManagementEntityGuid ||
// Pre-change line (removed by this commit): the comparison chain used to end here.
property == OneNotePropertyEnum.RichEditTextUnicode;
// Post-change lines (added by this commit): CachedTitleString joins the chain.
property == OneNotePropertyEnum.RichEditTextUnicode ||
property == OneNotePropertyEnum.CachedTitleString;
}

/**
Expand Down Expand Up @@ -508,7 +509,9 @@ private Map<String, Object> processPropertyValue(PropertyValue propertyValue,
dif.size());
}
if (propertyValue.propertyId.propertyEnum ==
OneNotePropertyEnum.RichEditTextUnicode) {
OneNotePropertyEnum.RichEditTextUnicode
|| propertyValue.propertyId.propertyEnum ==
OneNotePropertyEnum.CachedTitleString) {
if (!options.isOnlyLatestRevision()
|| (parentPropertyId != null &&
parentPropertyId.propertyEnum != OneNotePropertyEnum.ElementChildNodesOfVersionHistory)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ public class OneNoteTreeWalkerOptions implements Serializable {
private boolean crawlAllFileNodesFromRoot = true;
private boolean onlyLatestRevision = true;
private Set<OneNotePropertyEnum> utf16PropertiesToPrint = new HashSet<>(
Arrays.asList(OneNotePropertyEnum.ImageFilename, OneNotePropertyEnum.Author,
OneNotePropertyEnum.CachedTitleString));
Arrays.asList(OneNotePropertyEnum.ImageFilename, OneNotePropertyEnum.Author));

/**
* Do this to ignore revisions and just parse all file nodes from the root recursively.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,4 +288,14 @@ public void testDupeText() throws Exception {

assertEquals(1, StringUtils.countMatches(txt, "Sunday morning"));
}

/**
 * TIKA-4303: the XML extracted from the Chinese test notebook must contain the
 * Chinese title text wrapped in its own paragraph element.
 */
@Test
public void testExtractChinese() throws Exception {
    XMLResult extracted = getXML("test-tika-4303-Chinese-notes.one", new Metadata());
    String expectedTitleParagraph = "<p>中文标题</p>";
    assertContains(expectedTitleParagraph, extracted.xml);
}
}
Binary file not shown.

0 comments on commit 7f94520

Please sign in to comment.