Skip to content

Commit d2ac41b

Browse files
committed
reading line-up corpus
1 parent 75cfe49 commit d2ac41b

File tree

4 files changed

+209
-0
lines changed

4 files changed

+209
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
package lineup;
2+
3+
/**
4+
* Thrown if the format of a corpus being read is invalid.
5+
*
6+
* @author Markus Kahl
7+
*/
8+
public class CorpusFormatException extends Exception {

    /** Exceptions are serializable; pin the id so compiler changes do not alter it. */
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception describing what is wrong with the corpus.
     *
     * @param msg human-readable description of the format problem
     */
    public CorpusFormatException(String msg) {
        super(msg);
    }

    /**
     * Creates an exception describing what is wrong with the corpus while
     * keeping the underlying failure attached for diagnosis.
     *
     * @param msg   human-readable description of the format problem
     * @param cause the failure that led to this exception, may be {@code null}
     */
    public CorpusFormatException(String msg, Throwable cause) {
        super(msg, cause);
    }
}
+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package lineup;
2+
3+
import java.io.Reader;
4+
import java.util.List;
5+
6+
/**
7+
* Reads a sentence-aligned corpus.
8+
*
9+
* @author Markus Kahl
10+
*/
11+
public interface CorpusReader {
12+
List<Translation> readCorpus(Reader reader);
13+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
package lineup;
2+
3+
import java.io.BufferedReader;
4+
import java.io.FileNotFoundException;
5+
import java.io.IOException;
6+
import java.io.Reader;
7+
import java.util.LinkedList;
8+
import java.util.List;
9+
10+
/**
11+
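* Reads a sentence-aligned corpus from plain text in which source and target lines alternate; blank lines and lines starting with '#' are ignored.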
12+
*
13+
* @author Markus Kahl
14+
*/
15+
public class LineupCorpusReader implements CorpusReader {
16+
17+
private String sourceLanguage;
18+
private String targetLanguage;
19+
20+
public List<Translation> readCorpus(String file) throws FileNotFoundException {
21+
return readCorpus(new java.io.FileReader(file));
22+
}
23+
24+
public List<Translation> readCorpus(Reader reader) {
25+
List<Translation> result = new LinkedList<Translation>();
26+
BufferedReader in = new BufferedReader(reader);
27+
String source = null, target = null, line;
28+
29+
try {
30+
while ((line = in.readLine()) != null) {
31+
line = line.trim();
32+
if (line.startsWith("#") || line.isEmpty()) continue;
33+
34+
if (source == null) {
35+
source = line;
36+
} else if (target == null) {
37+
target = line;
38+
}
39+
if (source != null && target != null) {
40+
parseLanguage(source, true);
41+
parseLanguage(target, false);
42+
43+
Translation trans = new Translation(getSourceLanguage(), getTargetLanguage());
44+
trans.getSourceSentences().addAll(parseSentences(source));
45+
trans.getTargetSentences().addAll(parseSentences(target));
46+
47+
result.add(trans);
48+
49+
source = null;
50+
target = null;
51+
}
52+
}
53+
} catch (IOException e) {
54+
System.err.println("Could not read corpus: " + e.getMessage() +
55+
" (" + e.getClass().getName() + ")");
56+
} catch (CorpusFormatException e) {
57+
System.err.println("Invalid corpus format: " + e.getMessage());
58+
}
59+
60+
return result;
61+
}
62+
63+
protected void parseLanguage(String line, boolean source) throws CorpusFormatException {
64+
if (line.matches("[a-z]{2} \\d.*")) {
65+
String lang = line.substring(0, line.indexOf(" "));
66+
if (source) {
67+
if (getSourceLanguage() == null) {
68+
sourceLanguage = lang;
69+
} else if (!lang.equals(getSourceLanguage())) {
70+
throw new CorpusFormatException("Unexpected source language (expected "
71+
+ getSourceLanguage() + ") in: " + line);
72+
}
73+
} else if (!source) {
74+
if (getTargetLanguage() == null) {
75+
targetLanguage = lang;
76+
} else if (!lang.equals(getTargetLanguage())) {
77+
throw new CorpusFormatException("Unexpected target language (expected "
78+
+ getTargetLanguage() + ") in: " + line);
79+
} else if (getTargetLanguage().equals(getSourceLanguage())) {
80+
throw new CorpusFormatException("Source and target language must be different.");
81+
}
82+
}
83+
} else {
84+
throw new CorpusFormatException("Expected language, got: " + line);
85+
}
86+
}
87+
88+
protected List<String> parseSentences(String line) throws CorpusFormatException {
89+
String[] tokens = line.split(":|\\|{2}]");
90+
if (tokens.length >= 2 && tokens[0].matches("[a-z]{2} \\d")) {
91+
List<String> result = new LinkedList<String>();
92+
for (int i = 1; i < tokens.length; ++i) {
93+
result.add(tokens[i].trim());
94+
}
95+
return result;
96+
} else {
97+
throw new CorpusFormatException("Could not parse sentences: " + line);
98+
}
99+
}
100+
101+
public String getSourceLanguage() {
102+
return sourceLanguage;
103+
}
104+
105+
public String getTargetLanguage() {
106+
return targetLanguage;
107+
}
108+
}

src/main/java/lineup/Translation.java

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package lineup;
2+
3+
import java.util.LinkedList;
4+
import java.util.List;
5+
6+
/**
7+
* A translation maps one or more sentences in one language to one or more
8+
* sentences in another language. Those sentences express the same idea.
9+
*
10+
* @author Markus Kahl
11+
*/
12+
public class Translation {

    /** Longest first sentence (in characters) that {@link #toString()} prints unabridged. */
    private static final int MAX_PREVIEW_LENGTH = 20;

    private final String sourceLanguage;
    private final String targetLanguage;

    private final List<String> sourceSentences = new LinkedList<String>();
    private final List<String> targetSentences = new LinkedList<String>();

    /**
     * Creates a translation without any sentences; add them through
     * {@link #getSourceSentences()} and {@link #getTargetSentences()}.
     *
     * @param sourceLanguage language of the source sentences
     * @param targetLanguage language of the target sentences
     */
    public Translation(String sourceLanguage, String targetLanguage) {
        this.sourceLanguage = sourceLanguage;
        this.targetLanguage = targetLanguage;
    }

    /**
     * Returns a short one-line summary showing both languages and a preview of
     * the first sentence on each side. A side without sentences is shown with
     * an empty preview.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("Translation(");

        sb.append(getSourceLanguage());
        sb.append(": ");
        appendPreview(sb, getSourceSentences());

        sb.append(" | ");
        sb.append(getTargetLanguage());
        sb.append(": ");
        appendPreview(sb, getTargetSentences());
        sb.append(")");

        return sb.toString();
    }

    /**
     * Appends a preview of the first sentence: shortened and followed by "..."
     * if it is too long, otherwise in full and followed by " ..." if more
     * sentences exist. Appends nothing for an empty list.
     */
    private static void appendPreview(StringBuilder sb, List<String> sentences) {
        if (sentences.isEmpty()) {
            return; // nothing to preview; avoids get(0) on an empty list
        }
        String first = sentences.get(0);
        if (first.length() > MAX_PREVIEW_LENGTH) {
            sb.append(first.substring(0, MAX_PREVIEW_LENGTH - 1));
            sb.append("...");
        } else {
            sb.append(first);
            if (sentences.size() > 1) {
                sb.append(" ...");
            }
        }
    }

    /** @return the language of the source sentences */
    public String getSourceLanguage() {
        return sourceLanguage;
    }

    /** @return the language of the target sentences */
    public String getTargetLanguage() {
        return targetLanguage;
    }

    /** @return the live, modifiable list of source sentences */
    public List<String> getSourceSentences() {
        return sourceSentences;
    }

    /** @return the live, modifiable list of target sentences */
    public List<String> getTargetSentences() {
        return targetSentences;
    }
}

0 commit comments

Comments
 (0)