Skip to content

Commit 3f3866e

Browse files
committed
started script for Europarl3 corpus download
1 parent 8c066fe commit 3f3866e

File tree

5 files changed

+51
-0
lines changed

5 files changed

+51
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.gradle
2+
build
3+
target

README.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
11
# line-up
22

33
A Java library for the alignment of interlinear texts.
4+
5+
## Build
6+
7+
Build with SBT 0.11.3.

build.sbt

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name := "line-up"
2+
3+
version := "0.1"
4+
5+
// Scala version of the build; the dependency below hard-codes the matching "_2.10" suffix.
scalaVersion := "2.10.0"
6+
7+
// Single % with the Scala suffix written into the artifact name by hand, so this must be
// kept in step with scalaVersion above.
libraryDependencies += "org.scalaj" % "scalaj-http_2.10" % "0.3.7"

src/main/java/lineup/Test.java

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package lineup;
2+
3+
import static java.lang.System.out;
4+
5+
/**
 * Trivial entry point that prints a fixed greeting to standard output.
 */
public class Test {
    /**
     * Writes "Hallo Welt" followed by a line break to standard output.
     *
     * @param args command-line arguments; not used
     */
    public static void main(final String[] args) {
        final String greeting = "Hallo Welt";
        out.println(greeting);
    }
}

src/main/scala/opus-europarl3.scala

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package opus
2+
3+
import scalaj.http.Http
4+
import sys.process._
5+
6+
/**
 * Helpers for listing and downloading Europarl3 corpus files from the OPUS server.
 *
 * File names are scraped from the HTML that the server returns for a language
 * directory: every anchor whose text contains ".xml.gz" contributes its `href` value.
 */
object Europarl3 {
  /** Matches one anchor element, provided no '/' occurs before its closing tag. */
  val link: scala.util.matching.Regex = "<a[^/]*</a>".r

  /**
   * Captures the double-quoted value of an `href` attribute in group 1.
   * Restricted to `href` so that other quoted attributes of an anchor are not
   * mistaken for file names.
   */
  val href: scala.util.matching.Regex = "href\\s*=\\s*\"([^\"]*)\"".r

  /** URL (with trailing slash) of the XML directory for `lang`. */
  def url(lang: String): String = s"http://opus.lingfil.uu.se/Europarl3/xml/$lang/"

  /** Requests `url(lang)` via scalaj-http and returns the response body as a string. */
  def markup(lang: String): String = Http(url(lang)).asString

  /**
   * Lists the `href` targets of all anchors on the index page of `lang` that mention
   * ".xml.gz".
   *
   * @param lang language directory name, e.g. "de"
   * @return a single-pass iterator over the extracted `href` values
   */
  def fetchFiles(lang: String): Iterator[String] =
    link.findAllIn(markup(lang))
      .filter(_.contains(".xml.gz"))
      .flatMap(anchor => href.findAllMatchIn(anchor).map(_.group(1)))

  /** Lazily fetched file listings; each listing is requested at most once. */
  object files {
    lazy val de: List[String] = fetchFiles("de").toList
    lazy val en: List[String] = fetchFiles("en").toList

    /** Files listed for both German and English, in the order of the German listing. */
    lazy val de_en: List[String] = de intersect en
  }

  /**
   * Downloads the German and English versions of the first three shared files with
   * `wget`, saving each as `<name>.<lang>.xml.gz` in the current working directory.
   *
   * A failed download is reported on standard error and does not stop the remaining ones.
   */
  def download(): Unit = {
    for {
      file <- files.de_en.take(3) // only a small sample is fetched for now
      lang <- Seq("de", "en")
    } {
      val source = url(lang) + file
      val target = file.replace(".xml.gz", s".$lang.xml.gz")
      // Pass arguments as a Seq so that no whitespace splitting of a command string
      // is involved and URL/file name reach wget as single arguments.
      val exitCode = Seq("wget", "-q", source, "-O", target).!
      if (exitCode != 0)
        Console.err.println(s"wget exited with code $exitCode for $source")
    }
  }
}

0 commit comments

Comments
 (0)