Skip to content

Commit

Permalink
TIKA-4363: refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
THausherr committed Jan 15, 2025
1 parent 185714e commit 657e75b
Showing 1 changed file with 7 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ public void close() {
throw new TikaException("Unable to extract PDF content", e);
}
}
if (pdfMarkedContent2XHTML.exceptions.size() > 0) {
if (!pdfMarkedContent2XHTML.exceptions.isEmpty()) {
//throw the first
throw new TikaException("Unable to extract PDF content",
pdfMarkedContent2XHTML.exceptions.get(0));
Expand Down Expand Up @@ -192,15 +192,15 @@ private static void findPages(COSBase kidsObj, List<ObjectRef> pageRefs) {
}

@Override
protected void processPages(PDPageTree pages) throws IOException {
protected void processPages(PDPageTree pageTree) throws IOException {

//this is a 0-indexed list of object refs for each page
//we need this to map the mcids later...
//TODO: is there a better way of getting these/doing the mapping?

List<ObjectRef> pageRefs = new ArrayList<>();
//STEP 1: get the page refs
findPages(pdDocument.getPages().getCOSObject().getDictionaryObject(COSName.KIDS), pageRefs);
findPages(pageTree.getCOSObject().getDictionaryObject(COSName.KIDS), pageRefs);
//confirm the right number of pages was found
if (pageRefs.size() != pdDocument.getNumberOfPages()) {
throw new IOException(new TikaException(
Expand All @@ -215,7 +215,7 @@ protected void processPages(PDPageTree pages) throws IOException {
Map<String, HtmlTag> roleMap = loadRoleMap(structureTreeRoot.getRoleMap());

//STEP 3: load all of the text, mapped to MCIDs
Map<MCID, String> paragraphs = loadTextByMCID(pageRefs);
Map<MCID, String> paragraphs = loadTextByMCID(pageTree, pageRefs);

//STEP 4: now recurse through the structure tree root and output the structure
//and the text bits from paragraphs
Expand Down Expand Up @@ -254,7 +254,7 @@ protected void processPages(PDPageTree pages) throws IOException {
//TODO: figure out when we're crossing page boundaries during the recursion
// step above and do the page by page processing then...rather than dumping this
// all here.
for (PDPage page : pdDocument.getPages()) {
for (PDPage page : pageTree) {
startPage(page);
endPage(page);
}
Expand Down Expand Up @@ -410,10 +410,10 @@ private HtmlTag getTag(String name, Map<String, HtmlTag> roleMap) {
return roleMap.get(name);
}

private Map<MCID, String> loadTextByMCID(List<ObjectRef> pageRefs) throws IOException {
private Map<MCID, String> loadTextByMCID(PDPageTree pageTree, List<ObjectRef> pageRefs) throws IOException {
int pageCount = 1;
Map<MCID, String> paragraphs = new HashMap<>();
for (PDPage page : pdDocument.getPages()) {
for (PDPage page : pageTree) {
ObjectRef pageRef = pageRefs.get(pageCount - 1);
PDFMarkedContentExtractor ex = new PDFMarkedContentExtractor();
try {
Expand Down

0 comments on commit 657e75b

Please sign in to comment.