Skip to content

Commit a1f9bfc

Browse files
committed
retrieve text from the discarded part of the figures
1 parent 17250ff commit a1f9bfc

File tree

2 files changed

+25
-7
lines changed

2 files changed

+25
-7
lines changed

grobid-core/src/main/java/org/grobid/core/document/Document.java

+24-6
Original file line numberDiff line numberDiff line change
@@ -926,6 +926,11 @@ public void postProcessTables() {
926926
}
927927

928928
public void assignGraphicObjectsToFigures() {
929+
/**
930+
* This method assigns graphic objects to figures based on the proximity of the graphic object to the figure caption.
931+
* It also modifies captions and textarea for existing figures
932+
* The figures are modified in place; this method is void and returns nothing.
933+
*/
929934
Multimap<Integer, Figure> figureMap = HashMultimap.create();
930935

931936
for (Figure f : figures) {
@@ -960,7 +965,7 @@ public void assignGraphicObjectsToFigures() {
960965
List<GraphicObject> vectorBoxGraphicObjects =
961966
Lists.newArrayList(Iterables.filter(imagesPerPage.get(pageNum), Figure.VECTOR_BOX_GRAPHIC_OBJECT_PREDICATE));
962967

963-
// case where figure caption is covered almost precisely but the vector graphics box -- filter those out - they are covered by caption anyways
968+
// case where the figure caption is covered almost precisely by the vector graphics box -- filter those out, as they are covered by the caption anyway
964969
vectorBoxGraphicObjects = vectorBoxGraphicObjects.stream().filter(go -> {
965970
for (Figure f : pageFigures) {
966971
BoundingBox intersection = BoundingBoxCalculator.calculateOneBox(f.getLayoutTokens(), true).boundingBoxIntersection(go.getBoundingBox());
@@ -1255,11 +1260,15 @@ protected List<LayoutToken> getFigureLayoutTokens(Figure f) {
12551260

12561261
Block figBlock = getBlocks().get(blockPtr);
12571262
String norm = LayoutTokensUtil.toText(figBlock.getTokens()).trim().toLowerCase();
1258-
if (norm.startsWith("fig") || norm.startsWith("abb") || norm.startsWith("scheme") || norm.startsWith("photo")
1259-
|| norm.startsWith("gambar") || norm.startsWith("quadro")
1260-
|| norm.startsWith("wykres")
1261-
|| norm.startsWith("fuente")
1262-
) {
1263+
if (norm.startsWith("fig")
1264+
|| norm.startsWith("abb")
1265+
|| norm.startsWith("scheme")
1266+
|| norm.startsWith("photo")
1267+
|| norm.startsWith("gambar")
1268+
|| norm.startsWith("quadro")
1269+
|| norm.startsWith("wykres")
1270+
|| norm.startsWith("fuente")
1271+
) {
12631272
result.addAll(figBlock.getTokens());
12641273

12651274
while (it.hasNext()) {
@@ -1270,11 +1279,20 @@ protected List<LayoutToken> getFigureLayoutTokens(Figure f) {
12701279
result.addAll(b.getTokens());
12711280
figBlock = b;
12721281
} else {
1282+
// TEMPORARY trick: iterate over all the following blocks
1283+
// and place them into the discarded token list of the figure
1284+
f.addDiscardedPieceTokens(b.getTokens());
1285+
while (it.hasNext()) {
1286+
blockPtr = it.next();
1287+
figBlock = getBlocks().get(blockPtr);
1288+
f.addDiscardedPieceTokens(figBlock.getTokens());
1289+
}
12731290
break;
12741291
}
12751292
}
12761293
break;
12771294
} else {
1295+
f.addDiscardedPieceTokens(figBlock.getTokens());
12781296
// LOGGER.info("BAD_FIGIRE_LABEL: " + norm);
12791297
}
12801298
}

grobid-core/src/main/java/org/grobid/core/engines/FigureParser.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class FigureParser extends AbstractParser {
3434
public Figure processing(List<LayoutToken> tokenizationFigure, String featureVector) {
3535
String res;
3636
try {
37-
res = label(featureVector);;
37+
res = label(featureVector);
3838
} catch (Exception e) {
3939
throw new GrobidException("Sequence labeling with figure model fails.", e);
4040
}

0 commit comments

Comments
 (0)