@@ -926,6 +926,11 @@ public void postProcessTables() {
926
926
}
927
927
928
928
public void assignGraphicObjectsToFigures () {
929
+ /**
930
+ * This method assigns graphic objects to figures based on the proximity of the graphic object to the figure caption.
931
+ * It also modifies captions and textarea for existing figures
932
+ * The figures are updated in place; this method is void and returns nothing.
933
+ */
929
934
Multimap <Integer , Figure > figureMap = HashMultimap .create ();
930
935
931
936
for (Figure f : figures ) {
@@ -960,7 +965,7 @@ public void assignGraphicObjectsToFigures() {
960
965
List <GraphicObject > vectorBoxGraphicObjects =
961
966
Lists .newArrayList (Iterables .filter (imagesPerPage .get (pageNum ), Figure .VECTOR_BOX_GRAPHIC_OBJECT_PREDICATE ));
962
967
963
- // case where figure caption is covered almost precisely but the vector graphics box -- filter those out - they are covered by caption anyways
968
+ // case where the figure caption is covered almost precisely by the vector graphics box -- filter those out - they are covered by the caption anyway
964
969
vectorBoxGraphicObjects = vectorBoxGraphicObjects .stream ().filter (go -> {
965
970
for (Figure f : pageFigures ) {
966
971
BoundingBox intersection = BoundingBoxCalculator .calculateOneBox (f .getLayoutTokens (), true ).boundingBoxIntersection (go .getBoundingBox ());
@@ -1255,11 +1260,15 @@ protected List<LayoutToken> getFigureLayoutTokens(Figure f) {
1255
1260
1256
1261
Block figBlock = getBlocks ().get (blockPtr );
1257
1262
String norm = LayoutTokensUtil .toText (figBlock .getTokens ()).trim ().toLowerCase ();
1258
- if (norm .startsWith ("fig" ) || norm .startsWith ("abb" ) || norm .startsWith ("scheme" ) || norm .startsWith ("photo" )
1259
- || norm .startsWith ("gambar" ) || norm .startsWith ("quadro" )
1260
- || norm .startsWith ("wykres" )
1261
- || norm .startsWith ("fuente" )
1262
- ) {
1263
+ if (norm .startsWith ("fig" )
1264
+ || norm .startsWith ("abb" )
1265
+ || norm .startsWith ("scheme" )
1266
+ || norm .startsWith ("photo" )
1267
+ || norm .startsWith ("gambar" )
1268
+ || norm .startsWith ("quadro" )
1269
+ || norm .startsWith ("wykres" )
1270
+ || norm .startsWith ("fuente" )
1271
+ ) {
1263
1272
result .addAll (figBlock .getTokens ());
1264
1273
1265
1274
while (it .hasNext ()) {
@@ -1270,11 +1279,20 @@ protected List<LayoutToken> getFigureLayoutTokens(Figure f) {
1270
1279
result .addAll (b .getTokens ());
1271
1280
figBlock = b ;
1272
1281
} else {
1282
+ // A TEMPORARY trick: iterate over all the following blocks
1283
+ // and place them into the discarded token list of the figure
1284
+ f .addDiscardedPieceTokens (b .getTokens ());
1285
+ while (it .hasNext ()) {
1286
+ blockPtr = it .next ();
1287
+ figBlock = getBlocks ().get (blockPtr );
1288
+ f .addDiscardedPieceTokens (figBlock .getTokens ());
1289
+ }
1273
1290
break ;
1274
1291
}
1275
1292
}
1276
1293
break ;
1277
1294
} else {
1295
+ f .addDiscardedPieceTokens (figBlock .getTokens ());
1278
1296
// LOGGER.info("BAD_FIGIRE_LABEL: " + norm);
1279
1297
}
1280
1298
}
0 commit comments