Skip to content

Commit 68ded09

Browse files
committed
update rules for properly collecting and reverting text
1 parent 63945a3 commit 68ded09

File tree

2 files changed

+136
-18
lines changed

2 files changed

+136
-18
lines changed

grobid-core/src/main/java/org/grobid/core/document/Document.java

+69-17
Original file line numberDiff line numberDiff line change
@@ -1264,7 +1264,7 @@ protected void recalculateVectorBoxCoords(Figure f, GraphicObject g) {
12641264
}
12651265

12661266
/**
1267-
* This method assigns graphic objects to tables based on the proximity of the graphic object to the table caption.
1267+
* This method assigns graphic objects to figures based on the proximity of the graphic object to the figure caption.
12681268
* In addition, it removes blocks of layout tokens that are at a distance greater than a threshold from the figure caption.
12691269
* The method returns the updated list of layout tokens, and the list of layout tokens that have been discarded.
12701270
*/
@@ -1275,10 +1275,10 @@ protected org.apache.commons.lang3.tuple.Pair<List<LayoutToken>, List<List<Layou
12751275
Iterator<Integer> it = f.getBlockPtrs().iterator();
12761276

12771277
while (it.hasNext()) {
1278-
Integer blockPtr = it.next();
1278+
Integer newBlockPtr = it.next();
12791279

1280-
Block figBlock = getBlocks().get(blockPtr);
1281-
String norm = LayoutTokensUtil.toText(figBlock.getTokens()).trim().toLowerCase();
1280+
Block previousBlock = getBlocks().get(newBlockPtr);
1281+
String norm = LayoutTokensUtil.toText(previousBlock.getTokens()).trim().toLowerCase();
12821282
if (norm.startsWith("fig")
12831283
|| norm.startsWith("abb")
12841284
|| norm.startsWith("scheme")
@@ -1289,32 +1289,84 @@ protected org.apache.commons.lang3.tuple.Pair<List<LayoutToken>, List<List<Layou
12891289
|| norm.startsWith("fuente")
12901290
|| norm.startsWith("video")
12911291
) {
1292-
result.addAll(figBlock.getTokens());
1292+
result.addAll(previousBlock.getTokens());
12931293

12941294
while (it.hasNext()) {
1295-
BoundingBox prevBlock = BoundingBox.fromPointAndDimensions(figBlock.getPageNumber(), figBlock.getX(), figBlock.getY(), figBlock.getWidth(), figBlock.getHeight());
1296-
blockPtr = it.next();
1297-
Block b = getBlocks().get(blockPtr);
1298-
if (BoundingBox.fromPointAndDimensions(b.getPageNumber(), b.getX(), b.getY(), b.getWidth(), b.getHeight()).distanceTo(prevBlock) < 15) {
1299-
result.addAll(b.getTokens());
1300-
figBlock = b;
1295+
BoundingBox prevBlockCoords = BoundingBox.fromPointAndDimensions(previousBlock.getPageNumber(), previousBlock.getX(), previousBlock.getY(), previousBlock.getWidth(), previousBlock.getHeight());
1296+
newBlockPtr = it.next();
1297+
Block newBlock = getBlocks().get(newBlockPtr);
1298+
BoundingBox newBlockCoords = BoundingBox.fromPointAndDimensions(newBlock.getPageNumber(), newBlock.getX(), newBlock.getY(), newBlock.getWidth(), newBlock.getHeight());
1299+
if (newBlockCoords.distanceTo(prevBlockCoords) < 15) {
1300+
result.addAll(newBlock.getTokens());
1301+
previousBlock = newBlock;
13011302
} else {
13021303
// A TEMPORARY trick would be to iterate over all the following blocks
13031304
// and place them into the discarded token list of the figure
13041305
// f.addDiscardedPieceTokens(b.getTokens());
1305-
discardedPieces.add(b.getTokens());
1306+
List<LayoutToken> newBlockTrimmed = newBlock.getTokens();
1307+
1308+
String figureLayoutTokens = LayoutTokensUtil.toText(f.getLayoutTokens());
1309+
if (!figureLayoutTokens.contains(newBlock.getText())) {
1310+
// Keep only the leading tokens of the block that match the end of the figure layout tokens; we assume the block overruns the figure layout tokens
1311+
int subListSize = newBlock.getTokens().size();
1312+
1313+
while (!figureLayoutTokens.endsWith(LayoutTokensUtil.toText(newBlock.getTokens().subList(0, subListSize)))
1314+
&& subListSize > 0) {
1315+
subListSize -= 1;
1316+
}
1317+
1318+
if (subListSize > 0) {
1319+
newBlockTrimmed = newBlock.getTokens().subList(0, subListSize);
1320+
} else {
1321+
// If no matching prefix is found, we discard the current block and all the following ones
1322+
f.addDiscardedPieceTokens(newBlock.getTokens());
1323+
while (it.hasNext()) {
1324+
newBlockPtr = it.next();
1325+
newBlock = getBlocks().get(newBlockPtr);
1326+
Iterables.getLast(f.getDiscardedPiecesTokens()).addAll(newBlock.getTokens());
1327+
}
1328+
break;
1329+
}
1330+
1331+
}
1332+
discardedPieces.add(newBlockTrimmed);
1333+
13061334
while (it.hasNext()) {
1307-
blockPtr = it.next();
1308-
figBlock = getBlocks().get(blockPtr);
1309-
// Iterables.getLast(f.getDiscardedPiecesTokens()).addAll(figBlock.getTokens());
1310-
Iterables.getLast(discardedPieces).addAll(figBlock.getTokens());
1335+
newBlockPtr = it.next();
1336+
newBlock = getBlocks().get(newBlockPtr);
1337+
// Iterables.getLast(f.getDiscardedPiecesTokens()).addAll(figBlock.getTokens());
1338+
1339+
newBlockTrimmed = newBlock.getTokens();
1340+
if (!figureLayoutTokens.contains(newBlock.getText())) {
1341+
// Keep only the leading tokens of the block that match the end of the figure layout tokens; we assume the block overruns the figure layout tokens
1342+
int subListSize = newBlock.getTokens().size();
1343+
1344+
while (!figureLayoutTokens.endsWith(LayoutTokensUtil.toText(newBlock.getTokens().subList(0, subListSize)))
1345+
&& subListSize > 0) {
1346+
subListSize -= 1;
1347+
}
1348+
if (subListSize > 0) {
1349+
newBlockTrimmed = newBlock.getTokens().subList(0, subListSize);
1350+
} else {
1351+
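// If no matching prefix is found, we discard the current block and all the following ones. NOTE(review): the call below passes previousBlock, while the equivalent branch earlier in this method passes newBlock - confirm which is intended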
1352+
f.addDiscardedPieceTokens(previousBlock.getTokens());
1353+
while (it.hasNext()) {
1354+
newBlockPtr = it.next();
1355+
newBlock = getBlocks().get(newBlockPtr);
1356+
Iterables.getLast(f.getDiscardedPiecesTokens()).addAll(newBlock.getTokens());
1357+
}
1358+
break;
1359+
}
1360+
}
1361+
Iterables.getLast(discardedPieces).addAll(newBlockTrimmed);
13111362
}
13121363
break;
1364+
13131365
}
13141366
}
13151367
break;
13161368
} else {
1317-
f.addDiscardedPieceTokens(figBlock.getTokens());
1369+
f.addDiscardedPieceTokens(previousBlock.getTokens());
13181370
// LOGGER.info("BAD_FIGURE_LABEL: " + norm);
13191371
}
13201372
}

grobid-core/src/test/kotlin/org/grobid/core/document/DocumentTest.kt

+67-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import org.grobid.core.utilities.GrobidProperties
88
import org.grobid.core.utilities.LayoutTokensUtil
99
import org.hamcrest.CoreMatchers.`is`
1010
import org.hamcrest.MatcherAssert.assertThat
11+
import org.hamcrest.Matchers.hasSize
1112
import org.junit.Before
1213
import org.junit.BeforeClass
1314
import org.junit.Test
@@ -92,10 +93,72 @@ class DocumentTest {
9293
figure.layoutTokens = tokens
9394
figure.captionLayoutTokens = captionLayoutTokens
9495

96+
val output = doc.getFigureLayoutTokens(figure)
97+
98+
assertThat(LayoutTokensUtil.toText(output.left), `is`("Figure 1: This is a caption."))
99+
assertThat(output.right, hasSize(1))
100+
assertThat(
101+
LayoutTokensUtil.toText(output.right[0]),
102+
`is`(LayoutTokensUtil.toText(block5.tokens))
103+
)
104+
}
105+
106+
@Test
107+
@Throws(Exception::class)
108+
fun testGetFigureLayoutTokens_paragraphFarFromCaptionWithBlockNotMatching_shouldRemoveParagraph() {
109+
val text = "This is some garbage that comes before the figure..\n" +
110+
"d\n" +
111+
"d\n" +
112+
"d\n" +
113+
"sss\n" +
114+
"Figure 1: This is a caption.\n" +
115+
"d\n" +
116+
"and a paragraph we want to keep or revert back into the fulltext.\n"
117+
118+
val tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text)
119+
val figure = Figure()
120+
121+
val block1 = Block()
122+
block1.tokens = tokens.subList(0, 20)
123+
block1.boundingBox = BoundingBox.fromPointAndDimensions(1, 10.0, 10.0, 10.0, 10.0)
124+
val block2 = Block()
125+
block2.tokens = tokens.subList(20, 25)
126+
block2.boundingBox = BoundingBox.fromPointAndDimensions(1, 10.0, 20.0, 10.0, 10.0)
127+
val block3 = Block()
128+
block3.tokens = tokens.subList(25, 28)
129+
block3.boundingBox = BoundingBox.fromPointAndDimensions(1, 10.0, 30.0, 10.0, 10.0)
130+
val block4 = Block()
131+
block4.tokens = tokens.subList(28, 41)
132+
block4.boundingBox = BoundingBox.fromPointAndDimensions(1, 10.0, 50.0, 10.0, 10.0)
133+
val block5 = Block()
134+
val block5OriginalTokens= tokens.subList(41, 71)
135+
block5.tokens = block5OriginalTokens + GrobidAnalyzer.getInstance().tokenizeWithLayoutToken("Some additional text.")
136+
block5.boundingBox = BoundingBox.fromPointAndDimensions(1, 10.0, 80.0, 10.0, 10.0)
137+
138+
val doc = Document()
139+
doc.blocks = Arrays.asList(
140+
block1,
141+
block2,
142+
block3,
143+
block4,
144+
block5
145+
)
146+
147+
figure.blockPtrs = TreeSet(listOf(0, 1, 2, 3, 4))
148+
149+
val captionString = "This is a caption."
150+
val captionLayoutTokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(captionString)
151+
figure.setCaption(StringBuilder(captionString))
152+
figure.layoutTokens = tokens
153+
figure.captionLayoutTokens = captionLayoutTokens
95154

96155
val output = doc.getFigureLayoutTokens(figure)
97156

98157
assertThat(LayoutTokensUtil.toText(output.left), `is`("Figure 1: This is a caption."))
158+
assertThat(
159+
LayoutTokensUtil.toText(output.right[0]),
160+
`is`(LayoutTokensUtil.toText(block5OriginalTokens))
161+
)
99162
}
100163

101164
@Test
@@ -144,7 +207,10 @@ class DocumentTest {
144207

145208
val output = doc.getFigureLayoutTokens(figure)
146209

147-
assertThat(LayoutTokensUtil.toText(output.left), `is`("Figure 1: This is a caption.\nd\nand a paragraph we want to keep or revert back into the fulltext.\n"))
210+
assertThat(
211+
LayoutTokensUtil.toText(output.left),
212+
`is`("Figure 1: This is a caption.\nd\nand a paragraph we want to keep or revert back into the fulltext.\n")
213+
)
148214
}
149215

150216
companion object {

0 commit comments

Comments
 (0)