Skip to content

Commit 1f5e151

Browse files
committed
Restrict term length to never exceed Lucene's fixed term length
Change-Id: Icc3be552e95ca15967b544168e0c3be4d533d00f
1 parent 846e896 commit 1f5e151

File tree

4 files changed

+27
-1
lines changed

4 files changed

+27
-1
lines changed

Changes

+2
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,8 @@
33
- [feature] Make VC cache location customizable (margaretha)
44
- [bugfix] Improve handling of C2 #IN frames serialization
55
(diewald)
6+
- [bugfix] Restrict term length to never exceed Lucene
7+
boundaries (diewald)
68

79
0.62.2 2024-02-20
810
- [feature] Support MMap directory parameters directly

src/main/java/de/ids_mannheim/korap/index/MultiTerm.java

+8-1
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,8 @@ public class MultiTerm implements Comparable<MultiTerm> {
5656

5757
private static short i, l;
5858

59+
private static final int MAX_TERM_LENGTH = 1024;
60+
5961
// This advises the Java compiler to ignore all logging
6062
public static final boolean DEBUG = false;
6163
private final Logger log = LoggerFactory
@@ -122,6 +124,11 @@ public String getTerm () {
122124
* @return The {@link MultiTerm} object for chaining.
123125
*/
124126
public MultiTerm setTerm (String term) {
127+
if (term.length() > (MAX_TERM_LENGTH - 2)) {
128+
term = term.substring(0, MAX_TERM_LENGTH - 2);
129+
log.warn("Term %s... exceeds %i cahracters - cutted", term, MAX_TERM_LENGTH);
130+
}
131+
125132
this.term = term;
126133
return this;
127134
};
@@ -468,7 +475,7 @@ private void _fromString (String term) throws CorpusDataException {
468475
+ termSurface[0]);
469476
};
470477
};
471-
this.term = _unescape(stringOffset[0]);
478+
this.setTerm(_unescape(stringOffset[0]));
472479
};
473480

474481

src/test/java/de/ids_mannheim/korap/TestIndexer.java

+17
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ public class TestIndexer {
2424
private final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
2525
private String info = "usage: Krill indexer";
2626
private File outputDirectory = new File("test-index");
27+
private File outputDirectory2 = new File("test-index2");
2728

2829
@Test
2930
public void testArguments () throws IOException {
@@ -91,6 +92,17 @@ public void testMissingInput () throws IOException {
9192
assertEquals(true, outputStream.toString().startsWith(info));
9293
}
9394

95+
@Test
96+
public void testUnicodeProblem () throws IOException {
97+
Indexer.main(new String[] {
98+
"-c", "src/test/resources/krill.properties",
99+
"-i", "src/test/resources/bug",
100+
"-o", "test-index2"
101+
});
102+
logger.info(outputStream.toString());
103+
assertEquals(outputStream.toString(), "Added 1 file.\n");
104+
}
105+
94106
@Before
95107
public void setOutputStream () {
96108
System.setOut(new PrintStream(outputStream));
@@ -107,6 +119,11 @@ public void cleanOutputDirectory () {
107119
if (outputDirectory.exists()) {
108120
logger.debug("Output directory exists");
109121
deleteFile(outputDirectory);
122+
deleteFile(outputDirectory2);
123+
}
124+
if (outputDirectory2.exists()) {
125+
logger.debug("Output directory 2 exists");
126+
deleteFile(outputDirectory2);
110127
}
111128
}
112129

Binary file not shown.

0 commit comments

Comments
 (0)