@@ -1264,7 +1264,7 @@ protected void recalculateVectorBoxCoords(Figure f, GraphicObject g) {
1264
1264
}
1265
1265
1266
1266
/**
1267
- * This method assigns graphic objects to tables based on the proximity of the graphic object to the table caption.
1267
+ * This method assigns graphic objects to figures based on the proximity of the graphic object to the figure caption.
1268
1268
* In addition, it removes blocks of layout tokens that are at a distance greater than a threshold from the figure caption.
1269
1269
* The method returns the updated list of layout tokens, and the list of layout tokens that have been discarded.
1270
1270
*/
@@ -1275,10 +1275,10 @@ protected org.apache.commons.lang3.tuple.Pair<List<LayoutToken>, List<List<Layou
1275
1275
Iterator <Integer > it = f .getBlockPtrs ().iterator ();
1276
1276
1277
1277
while (it .hasNext ()) {
1278
- Integer blockPtr = it .next ();
1278
+ Integer newBlockPtr = it .next ();
1279
1279
1280
- Block figBlock = getBlocks ().get (blockPtr );
1281
- String norm = LayoutTokensUtil .toText (figBlock .getTokens ()).trim ().toLowerCase ();
1280
+ Block previousBlock = getBlocks ().get (newBlockPtr );
1281
+ String norm = LayoutTokensUtil .toText (previousBlock .getTokens ()).trim ().toLowerCase ();
1282
1282
if (norm .startsWith ("fig" )
1283
1283
|| norm .startsWith ("abb" )
1284
1284
|| norm .startsWith ("scheme" )
@@ -1289,32 +1289,84 @@ protected org.apache.commons.lang3.tuple.Pair<List<LayoutToken>, List<List<Layou
1289
1289
|| norm .startsWith ("fuente" )
1290
1290
|| norm .startsWith ("video" )
1291
1291
) {
1292
- result .addAll (figBlock .getTokens ());
1292
+ result .addAll (previousBlock .getTokens ());
1293
1293
1294
1294
while (it .hasNext ()) {
1295
- BoundingBox prevBlock = BoundingBox .fromPointAndDimensions (figBlock .getPageNumber (), figBlock .getX (), figBlock .getY (), figBlock .getWidth (), figBlock .getHeight ());
1296
- blockPtr = it .next ();
1297
- Block b = getBlocks ().get (blockPtr );
1298
- if (BoundingBox .fromPointAndDimensions (b .getPageNumber (), b .getX (), b .getY (), b .getWidth (), b .getHeight ()).distanceTo (prevBlock ) < 15 ) {
1299
- result .addAll (b .getTokens ());
1300
- figBlock = b ;
1295
+ BoundingBox prevBlockCoords = BoundingBox .fromPointAndDimensions (previousBlock .getPageNumber (), previousBlock .getX (), previousBlock .getY (), previousBlock .getWidth (), previousBlock .getHeight ());
1296
+ newBlockPtr = it .next ();
1297
+ Block newBlock = getBlocks ().get (newBlockPtr );
1298
+ BoundingBox newBlockCoords = BoundingBox .fromPointAndDimensions (newBlock .getPageNumber (), newBlock .getX (), newBlock .getY (), newBlock .getWidth (), newBlock .getHeight ());
1299
+ if (newBlockCoords .distanceTo (prevBlockCoords ) < 15 ) {
1300
+ result .addAll (newBlock .getTokens ());
1301
+ previousBlock = newBlock ;
1301
1302
} else {
1302
1303
// A TEMPORARY trick would be to iterate over all the following blocks
1303
1304
// and place them into the discarded token list of the figure
1304
1305
// f.addDiscardedPieceTokens(b.getTokens());
1305
- discardedPieces .add (b .getTokens ());
1306
+ List <LayoutToken > newBlockTrimmed = newBlock .getTokens ();
1307
+
1308
+ String figureLayoutTokens = LayoutTokensUtil .toText (f .getLayoutTokens ());
1309
+ if (!figureLayoutTokens .contains (newBlock .getText ())) {
1310
+ // Keep only the leading tokens of the block that match the end of the figure layout tokens; we assume the block overruns them
1311
+ int subListSize = newBlock .getTokens ().size ();
1312
+
1313
+ while (!figureLayoutTokens .endsWith (LayoutTokensUtil .toText (newBlock .getTokens ().subList (0 , subListSize )))
1314
+ && subListSize > 0 ) {
1315
+ subListSize -= 1 ;
1316
+ }
1317
+
1318
+ if (subListSize > 0 ) {
1319
+ newBlockTrimmed = newBlock .getTokens ().subList (0 , subListSize );
1320
+ } else {
1321
+ // If no leading tokens of the block match the end of the figure layout tokens, we discard the current block and all the following blocks
1322
+ f .addDiscardedPieceTokens (newBlock .getTokens ());
1323
+ while (it .hasNext ()) {
1324
+ newBlockPtr = it .next ();
1325
+ newBlock = getBlocks ().get (newBlockPtr );
1326
+ Iterables .getLast (f .getDiscardedPiecesTokens ()).addAll (newBlock .getTokens ());
1327
+ }
1328
+ break ;
1329
+ }
1330
+
1331
+ }
1332
+ discardedPieces .add (newBlockTrimmed );
1333
+
1306
1334
while (it .hasNext ()) {
1307
- blockPtr = it .next ();
1308
- figBlock = getBlocks ().get (blockPtr );
1309
- // Iterables.getLast(f.getDiscardedPiecesTokens()).addAll(figBlock.getTokens());
1310
- Iterables .getLast (discardedPieces ).addAll (figBlock .getTokens ());
1335
+ newBlockPtr = it .next ();
1336
+ newBlock = getBlocks ().get (newBlockPtr );
1337
+ // Iterables.getLast(f.getDiscardedPiecesTokens()).addAll(figBlock.getTokens());
1338
+
1339
+ newBlockTrimmed = newBlock .getTokens ();
1340
+ if (!figureLayoutTokens .contains (newBlock .getText ())) {
1341
+ // Keep only the leading tokens of the block that match the end of the figure layout tokens; we assume the block overruns them
1342
+ int subListSize = newBlock .getTokens ().size ();
1343
+
1344
+ while (!figureLayoutTokens .endsWith (LayoutTokensUtil .toText (newBlock .getTokens ().subList (0 , subListSize )))
1345
+ && subListSize > 0 ) {
1346
+ subListSize -= 1 ;
1347
+ }
1348
+ if (subListSize > 0 ) {
1349
+ newBlockTrimmed = newBlock .getTokens ().subList (0 , subListSize );
1350
+ } else {
1351
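+ // If no leading tokens of the block match the end of the figure layout tokens, we discard all the following blocks
+ // NOTE(review): the call below passes previousBlock, whereas the equivalent branch earlier in this method passes newBlock - confirm which is intended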
1352
+ f .addDiscardedPieceTokens (previousBlock .getTokens ());
1353
+ while (it .hasNext ()) {
1354
+ newBlockPtr = it .next ();
1355
+ newBlock = getBlocks ().get (newBlockPtr );
1356
+ Iterables .getLast (f .getDiscardedPiecesTokens ()).addAll (newBlock .getTokens ());
1357
+ }
1358
+ break ;
1359
+ }
1360
+ }
1361
+ Iterables .getLast (discardedPieces ).addAll (newBlockTrimmed );
1311
1362
}
1312
1363
break ;
1364
+
1313
1365
}
1314
1366
}
1315
1367
break ;
1316
1368
} else {
1317
- f .addDiscardedPieceTokens (figBlock .getTokens ());
1369
+ f .addDiscardedPieceTokens (previousBlock .getTokens ());
1318
1370
// LOGGER.info("BAD_FIGIRE_LABEL: " + norm);
1319
1371
}
1320
1372
}
0 commit comments