mgormley
diff --git a/‎LICENSE.txt
+339 b/‎LICENSE.txt
+339
diff --git a/‎README.txt
+107 b/‎README.txt
+107
diff --git a/‎build.xml
+79 b/‎build.xml
+79
diff --git a/‎setupenv.sh
+2 b/‎setupenv.sh
+2
diff --git a/‎src/edu/jhu/agiga/AgigaConstants.java
+71 b/‎src/edu/jhu/agiga/AgigaConstants.java
+71
diff --git a/‎src/edu/jhu/agiga/AgigaCoref.java
+30 b/‎src/edu/jhu/agiga/AgigaCoref.java
+30
@@ -0,0 +1,107 @@
+Annotated Gigaword API and Command Line Tools v1.0 - July 21, 2012
+------------------------------------------------------------------
+
+This release includes a Java API and command line tools for reading
+the Annotated Gigaword dataset XML files. 
+
+-------------------
+Project Hosting   :
+-------------------
+
+For the latest version, go to:
+http://code.google.com/p/agiga
+
+-------------------
+Command Line Tools:
+-------------------
+
+The command line tools provide a convenient way to print human
+readable versions of the XML annotations. The entry point is
+edu.jhu.agiga.AgigaPrinter and it has the following usage.
+
+usage: java edu.jhu.agiga.AgigaPrinter <type> <gzipped input file>
+  where <type> is one of:
+    words                     (Words only, one sentence per line)
+    lemmas                    (Lemmas only, one sentence per line)
+    pos                       (Part-of-speech tags)
+    ner                       (Named entity types)
+    basic-deps                (Basic dependency parses in CoNLL-X format)
+    col-deps                  (Collapsed dependency parses in CoNLL-X format)
+    col-ccproc-deps           (Collapsed and propagated dependency parses in CoNLL-X format)
+    phrase-structure          (Phrase structure parses)
+    coref                     (Coreference resolution as SGML similar to MUC)
+    stanford-deps             (toString() methods of Stanford dependency parse annotations)
+    stanford-phrase-structure (toString() method of Stanford phrase structure parses)
+    for-testing-only          (**For use in testing this API only**)
+  and where <gzipped input file> is an .xml.gz file
+  from Annotated Gigaword
+
+For example, to print part-of-speech tags for the file
+nyt_eng_199911.xml.gz, we could run:
+
+java -cp build/agiga-1.0.jar:lib/* edu.jhu.agiga.AgigaPrinter pos annotated_gigaword/nyt_eng_199911.xml.gz
+
+-------------------
+Java API          :
+-------------------
+
+The Java API provides streaming access to the documents in the .xml.gz
+files. Two iterators are provided: StreamingDocumentReader and
+StreamingSentenceReader. Both of these take as input the path to an
+Annotated Gigaword file and an AgigaPrefs object. 
+
+By default, the AgigaPrefs constructor will ensure that every
+annotation in the XML is read in and that the resulting objects are
+fully populated. However, by turning off certain options, it's
+possible to skip the reading and creation of objects corresponding to
+unused annotations.
+
+StreamingDocumentReader is an iterator over AgigaDocument objects. The
+AgigaDocument class gives access to the coreference resolution (via
+AgigaCoref objects) annotations and the sentences (via AgigaSentence
+objects).
+
+StreamingSentenceReader is an iterator over AgigaSentence
+objects. This bypasses the document level annotations such as coref
+and the document ids and provides direct access to the sentence
+annotations only.
+
+AgigaPrinter provides examples of how to use these iterators and set
+the AgigaPrefs object so that only the necessary annotations are read.
+Examples of how to use the Agiga objects can also be found in the
+AgigaDocument.write* and AgigaSentence.write* methods.
+
+----------------------
+One- vs. Zero-Indexing:
+----------------------
+
+In the XML, the sentences and tokens are given Ids that are
+one-indexed. However, we find it to be more convenient to work with
+zero-indexed **indices** in the Java API. Accordingly, the Java API
+does not provide access to these original Ids but instead provides
+access to indices. These indices are accessed via methods named
+get*Idx(), such as AgigaSentence.getIdx() and
+AgigaMention.getSentenceIdx() -- or AgigaToken.getIdx() and
+AgigaDependency.getGovIdx(). These indices also correspond to the
+ordered elements in the Lists used throughout the API.
+
+Of course, the original Ids from the XML can be recovered by adding
+one to the indices in the API. However, we didn't want to confuse the
+issue by providing API calls for both.
+
+-------------------
+Building          :
+-------------------
+
+A build.xml is provided for building with Apache Ant.  Example
+commands are below and should be run from the top level directory that
+contains the build.xml.
+
+# To compile: 
+ant
+
+# To clean and compile
+ant clean compile
+
+# To build jars of classes and sources:
+ant jar
@@ -0,0 +1,79 @@
+<?xml version="1.0"?>
+<!--
+  Ant build for the agiga project.
+
+  Targets:
+    compile (default) - compile ${source.path} into ${classes.path}
+    jar               - package class and source jars into ${build.path}
+    clean             - delete everything under ${classes.path} and ${build.path}
+    all               - clean, then compile
+-->
+<project name="agiga" default="compile" basedir=".">
+
+  <!-- Directory layout and artifact names. -->
+  <property name="classes.path"        value="${basedir}/classes" />
+  <property name="source.path"         value="${basedir}/src" />
+  <property name="build.path"          value="${basedir}/build" />
+  <property name="version"             value="1.0" />
+  <property name="app.jar.path"        value="${build.path}/agiga-${version}.jar" />
+  <property name="source.jar.path"     value="${build.path}/agiga-sources-${version}.jar" />
+
+  <!-- Compiler settings, all consumed by the javac task in the "compile" target. -->
+  <property name="compile.debug"       value="true"/>
+  <property name="compile.deprecation" value="false"/>
+  <property name="compile.optimize"    value="true"/>
+  <property name="compile.source"      value="1.6" />
+  <property name="compile.target"      value="1.6" />
+  <property name="compile.encoding"    value="utf-8" />
+
+  <target name="classpath" description="Sets the classpath">
+    <echo message="${ant.project.name}" />
+    <!-- Every jar directly under lib/ is placed on the compile classpath. -->
+    <path id="classpath">
+      <fileset dir="${basedir}/lib">
+        <include name="*.jar"/>
+      </fileset>
+    </path>
+    <path id="classes">
+      <pathelement location="${classes.path}" />
+    </path>
+  </target>
+
+  <target name="clean" description="Delete built files">
+    <echo message="${ant.project.name}" />
+    <!-- failonerror="false" so that cleaning a tree that was never built does not abort. -->
+    <delete includeemptydirs="true" failonerror="false">
+      <fileset dir="${classes.path}/" includes="**/*"/>
+      <fileset dir="${build.path}/" includes="**/*"/>
+    </delete>
+  </target>
+
+  <target name="build-dir" description="Create build output directories">
+    <echo message="${ant.project.name}" />
+    <mkdir dir="${classes.path}" />
+    <mkdir dir="${build.path}" />
+  </target>
+
+  <target name="compile" depends="classpath,build-dir"
+          description="Compile source files">
+    <echo message="${ant.project.name}" />
+    <!-- The encoding comes from compile.encoding, like the other compiler settings. -->
+    <javac srcdir="${source.path}"
+           destdir="${classes.path}"
+           debug="${compile.debug}"
+           encoding="${compile.encoding}"
+           deprecation="${compile.deprecation}"
+           optimize="${compile.optimize}"
+           source="${compile.source}"
+           target="${compile.target}"
+           includeantruntime="false">
+      <classpath refid="classpath" />
+      <compilerarg value="-Xlint"/>
+    </javac>
+  </target>
+
+  <target name="jar" depends="compile"
+          description="Creates jar files of the classes and sources">
+    <echo message="${ant.project.name}" />
+    <!-- Test classes and test sources are left out of both jars. -->
+    <jar destfile="${app.jar.path}">
+      <fileset dir="${classes.path}"
+               excludes="**/*Test.class"/>
+    </jar>
+    <jar destfile="${source.jar.path}">
+      <fileset dir="${source.path}"
+               includes="**/*.java"
+               excludes="**/*Test.java"/>
+    </jar>
+  </target>
+
+  <target name="all" depends="clean,compile"
+          description="Clean and re-compile." />
+
+</project>
@@ -0,0 +1,2 @@
+# Puts the compiled classes and every jar in lib/ on the Java classpath,
+# relative to the current working directory.
+# NOTE(review): presumably meant to be sourced from the project root, since an
+# exported variable does not outlive a child shell - confirm.
+ROOT_DIR="$(pwd)"
+# Quoted so that a path containing spaces survives and the trailing '*' reaches
+# the JVM literally (Java expands the classpath wildcard itself).
+export CLASSPATH="$ROOT_DIR/classes:$ROOT_DIR/lib/*"
@@ -0,0 +1,71 @@
+package edu.jhu.agiga;
+
+/**
+ * Names of the XML tags and attributes used in Annotated Gigaword .xml.gz
+ * files.
+ * <p>
+ * This class only holds constants: its sole constructor is private and the
+ * class is final, so it can be neither instantiated nor extended.
+ *
+ * @author mgormley
+ *
+ */
+public final class AgigaConstants {
+
+    /**
+     * The dependency parse forms, each paired with the name of its XML tag.
+     */
+    public enum DependencyForm {
+        BASIC_DEPS("basic-dependencies"),
+        COL_DEPS("collapsed-dependencies"),
+        COL_CCPROC_DEPS("collapsed-ccprocessed-dependencies");
+
+        // Assigned once in the constructor and never changed afterwards.
+        private final String xmlTag;
+
+        DependencyForm(String xmlTag) {
+            this.xmlTag = xmlTag;
+        }
+
+        /**
+         * @return the XML tag name associated with this dependency form
+         */
+        public String getXmlTag() {
+            return xmlTag;
+        }
+    }
+
+    // XML Tag names
+    public static final String FILE = "FILE";
+    public static final String FILE_ID = "id";
+
+    public static final String DOC = "DOC";
+
+    public static final String SENTENCES = "sentences";
+    public static final String SENTENCE = "sentence";
+    public static final String TOKEN = "token";
+    public static final String TOKEN_ID = "id";
+    public static final String WORD = "word";
+    public static final String LEMMA = "lemma";
+    public static final String POS = "POS";
+    public static final String NER = "NER";
+    public static final String NORM_NER = "NormNER";
+    public static final String PARSE = "parse";
+    public static final String DEP = "dep";
+    public static final String DEP_TYPE = "type";
+    public static final String GOVERNOR = "governor";
+    public static final String DEPENDENT = "dependent";
+
+    public static final String COREFERENCES = "coreferences";
+    public static final String COREFERENCE = "coreference";
+    public static final String MENTION = "mention";
+    public static final String M_SENTENCE = "sentence";
+    public static final String START = "start";
+    public static final String END = "end";
+    public static final String HEAD = "head";
+
+    // XML Attribute names
+    public static final String DOC_ID = "id";
+    public static final String DOC_TYPE = "type";
+
+    public static final String CHARACTER_OFFSET_BEGIN = "CharacterOffsetBegin";
+    public static final String CHARACTER_OFFSET_END = "CharacterOffsetEnd";
+
+    public static final String MENTION_REPRESENTATIVE = "representative";
+
+    private AgigaConstants() {
+        // private constructor: this class is never instantiated
+    }
+
+}
@@ -0,0 +1,30 @@
+package edu.jhu.agiga;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Holds the coreference annotations for a single entity in a document, as a
+ * list of that entity's coref mentions (AgigaMention objects).
+ *
+ * @author mgormley
+ *
+ */
+public class AgigaCoref {
+
+    // Mentions of this entity, in the order they were added.
+    private final List<AgigaMention> mentions = new ArrayList<AgigaMention>();
+
+    /**
+     * Creates a coref entity that has no mentions yet.
+     */
+    public AgigaCoref() {
+        // nothing to do: the mention list starts out empty
+    }
+
+    /**
+     * @return the list of mentions held by this object (the list itself, not
+     *         a copy)
+     */
+    public List<AgigaMention> getMentions() {
+        return this.mentions;
+    }
+
+    /**
+     * Appends a mention to the end of this entity's mention list.
+     *
+     * @param mention the mention to append
+     */
+    public void add(AgigaMention mention) {
+        this.mentions.add(mention);
+    }
+
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	+ROOT_DIR=`pwd`
	`2`	`+export CLASSPATH=$ROOT_DIR/classes:$ROOT_DIR/lib/*`