|
| 1 | +package lineup; |
| 2 | + |
| 3 | +import java.io.BufferedReader; |
| 4 | +import java.io.FileNotFoundException; |
| 5 | +import java.io.IOException; |
| 6 | +import java.io.Reader; |
| 7 | +import java.util.LinkedList; |
| 8 | +import java.util.List; |
| 9 | + |
| 10 | +/** |
| 11 | + * Description goes here. |
| 12 | + * |
| 13 | + * @author Markus Kahl |
| 14 | + */ |
| 15 | +public class LineupCorpusReader implements CorpusReader { |
| 16 | + |
| 17 | + private String sourceLanguage; |
| 18 | + private String targetLanguage; |
| 19 | + |
| 20 | + public List<Translation> readCorpus(String file) throws FileNotFoundException { |
| 21 | + return readCorpus(new java.io.FileReader(file)); |
| 22 | + } |
| 23 | + |
| 24 | + public List<Translation> readCorpus(Reader reader) { |
| 25 | + List<Translation> result = new LinkedList<Translation>(); |
| 26 | + BufferedReader in = new BufferedReader(reader); |
| 27 | + String source = null, target = null, line; |
| 28 | + |
| 29 | + try { |
| 30 | + while ((line = in.readLine()) != null) { |
| 31 | + line = line.trim(); |
| 32 | + if (line.startsWith("#") || line.isEmpty()) continue; |
| 33 | + |
| 34 | + if (source == null) { |
| 35 | + source = line; |
| 36 | + } else if (target == null) { |
| 37 | + target = line; |
| 38 | + } |
| 39 | + if (source != null && target != null) { |
| 40 | + parseLanguage(source, true); |
| 41 | + parseLanguage(target, false); |
| 42 | + |
| 43 | + Translation trans = new Translation(getSourceLanguage(), getTargetLanguage()); |
| 44 | + trans.getSourceSentences().addAll(parseSentences(source)); |
| 45 | + trans.getTargetSentences().addAll(parseSentences(target)); |
| 46 | + |
| 47 | + result.add(trans); |
| 48 | + |
| 49 | + source = null; |
| 50 | + target = null; |
| 51 | + } |
| 52 | + } |
| 53 | + } catch (IOException e) { |
| 54 | + System.err.println("Could not read corpus: " + e.getMessage() + |
| 55 | + " (" + e.getClass().getName() + ")"); |
| 56 | + } catch (CorpusFormatException e) { |
| 57 | + System.err.println("Invalid corpus format: " + e.getMessage()); |
| 58 | + } |
| 59 | + |
| 60 | + return result; |
| 61 | + } |
| 62 | + |
| 63 | + protected void parseLanguage(String line, boolean source) throws CorpusFormatException { |
| 64 | + if (line.matches("[a-z]{2} \\d.*")) { |
| 65 | + String lang = line.substring(0, line.indexOf(" ")); |
| 66 | + if (source) { |
| 67 | + if (getSourceLanguage() == null) { |
| 68 | + sourceLanguage = lang; |
| 69 | + } else if (!lang.equals(getSourceLanguage())) { |
| 70 | + throw new CorpusFormatException("Unexpected source language (expected " |
| 71 | + + getSourceLanguage() + ") in: " + line); |
| 72 | + } |
| 73 | + } else if (!source) { |
| 74 | + if (getTargetLanguage() == null) { |
| 75 | + targetLanguage = lang; |
| 76 | + } else if (!lang.equals(getTargetLanguage())) { |
| 77 | + throw new CorpusFormatException("Unexpected target language (expected " |
| 78 | + + getTargetLanguage() + ") in: " + line); |
| 79 | + } else if (getTargetLanguage().equals(getSourceLanguage())) { |
| 80 | + throw new CorpusFormatException("Source and target language must be different."); |
| 81 | + } |
| 82 | + } |
| 83 | + } else { |
| 84 | + throw new CorpusFormatException("Expected language, got: " + line); |
| 85 | + } |
| 86 | + } |
| 87 | + |
| 88 | + protected List<String> parseSentences(String line) throws CorpusFormatException { |
| 89 | + String[] tokens = line.split(":|\\|{2}]"); |
| 90 | + if (tokens.length >= 2 && tokens[0].matches("[a-z]{2} \\d")) { |
| 91 | + List<String> result = new LinkedList<String>(); |
| 92 | + for (int i = 1; i < tokens.length; ++i) { |
| 93 | + result.add(tokens[i].trim()); |
| 94 | + } |
| 95 | + return result; |
| 96 | + } else { |
| 97 | + throw new CorpusFormatException("Could not parse sentences: " + line); |
| 98 | + } |
| 99 | + } |
| 100 | + |
| 101 | + public String getSourceLanguage() { |
| 102 | + return sourceLanguage; |
| 103 | + } |
| 104 | + |
| 105 | + public String getTargetLanguage() { |
| 106 | + return targetLanguage; |
| 107 | + } |
| 108 | +} |
0 commit comments