diff --git a/.gitignore b/.gitignore index f1a9de7..8837ff5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,8 @@ /wikipedia-miner-core/target/ -/wikipedia-miner-extract/target/ \ No newline at end of file +/wikipedia-miner-extract/target/ +/wikipedia-miner-web/target/ +/wikipedia-miner-web/nb-configuration.xml +/wikipedia-miner-web/src/main/webapp/WEB-INF/web.xml +/target/ +/wikipedia-miner-core/nbactions.xml +/wikipedia-miner-core/nb-configuration.xml \ No newline at end of file diff --git a/README.md b/README.md index 053f5c0..8aa61ed 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,53 @@ wikipediaminer ============== -An open source toolkit for mining Wikipedia +An open source toolkit for mining Wikipedia forked from: https://github.com/dnmilne/wikipediaminer + +Contain some improvements in the WebServices and a lot of bugfixes to Milne's sources. + + Documentation at : https://github.com/dnmilne/wikipediaminer/wiki + + +TODO: +```list +Add support for live snapshots of wikipedia (DBPedia approach) to stay updated +Implement other disambigation approaches like http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6354382 +Support for binary data on the webServices, (Thrift for example) to avoid problems with UTF8 characters. +``` + + +Add this repository to your POM.xml. + +```xml + + + galan-maven-repo + galan-maven-repo-releases + http://galan.ehu.es/artifactory/ext-release-local + + +``` + +Then... + +```xml + + + galan-maven-repo + galan-maven-repo-releases + http://galan.ehu.es/artifactory/ext-release-local + + +``` + +Then add the required subproyect, for example... +```xml + + org.wikipedia-miner + wikipedia-miner-core + 1.2.4 + + + + + diff --git a/configs/hub-template.xml b/configs/hub-template.xml index 92a219d..88dc480 100644 --- a/configs/hub-template.xml +++ b/configs/hub-template.xml @@ -8,7 +8,14 @@ + path/to/conf/file diff --git a/configs/languages.xml b/configs/languages.xml index f4d26df..7d73ac5 100644 --- a/configs/languages.xml +++ b/configs/languages.xml @@ -55,5 +55,25 @@ WEITERLEITUNG + + + + Artículos + + Wikipedia:Desambiguación + + desambiguación + des + desambiguacion + disambig + REDIRECT + des + otros usos + redirige aquí + ico-des + REDIRECCIÓN + REDIRECCION + + diff --git a/models/compare/artCompare_es_In.model b/models/compare/artCompare_es_ln.model similarity index 99% rename from models/compare/artCompare_es_In.model rename to models/compare/artCompare_es_ln.model index 7592053..c316895 100644 Binary files a/models/compare/artCompare_es_In.model and b/models/compare/artCompare_es_ln.model differ diff --git a/models/compare/labelCompare_es_In.model b/models/compare/labelCompare_es_ln.model similarity index 99% rename from models/compare/labelCompare_es_In.model rename to models/compare/labelCompare_es_ln.model index ba49c95..660ba92 100644 Binary files a/models/compare/labelCompare_es_In.model and b/models/compare/labelCompare_es_ln.model differ diff --git a/models/compare/labelDisambig_es_In.model b/models/compare/labelDisambig_es_In.model index 6d687c8..f0651b8 100644 Binary files a/models/compare/labelDisambig_es_In.model and b/models/compare/labelDisambig_es_In.model differ diff --git a/pom.xml b/pom.xml index dc8ae58..5581c84 100644 --- a/pom.xml +++ b/pom.xml @@ -1,44 +1,51 @@ - 4.0.0 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + 4.0.0 + org.wikipedia-miner + wikipedia-miner + 1.2.4 + pom + wikipedia-miner + http://maven.apache.org + + UTF-8 + + + + + false + + central + bintray-plugins + http://jcenter.bintray.com + + + + + junit + junit + 3.8.1 + test + + + + wikipedia-miner-core + wikipedia-miner-extract + - org.wikipedia-miner - wikipedia-miner - 0.0.1-SNAPSHOT - pom - - wikipedia-miner - http://maven.apache.org - - - UTF-8 - - - - - junit - junit - 3.8.1 - test - - - - wikipedia-miner-core - wikipedia-miner-extract - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.0 - - 1.6 - 1.6 - - - - + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.0 + + 1.6 + 1.6 + + + + + \ No newline at end of file diff --git a/wikipedia-miner-core/pom.xml b/wikipedia-miner-core/pom.xml index 2b31c50..9053b1f 100644 --- a/wikipedia-miner-core/pom.xml +++ b/wikipedia-miner-core/pom.xml @@ -1,104 +1,114 @@ - 4.0.0 - - org.wikipedia-miner - wikipedia-miner - 0.0.1-SNAPSHOT - + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" + xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + 4.0.0 + + org.wikipedia-miner + wikipedia-miner + 1.2.4 + - wikipedia-miner-core + wikipedia-miner-core - wikipedia-miner-core - http://maven.apache.org - - UTF-8 - - + wikipedia-miner-core + 1.2.4 - - com.sleepycat - je - 5.0.73 - + http://maven.apache.org + + UTF-8 + + + - - net.sf.trove4j - trove4j - 3.0.3 - + + com.sleepycat + je + 5.0.73 + + + + net.sf.trove4j + trove4j + 3.0.3 + - - junit - junit - 3.8.1 - test - - - - log4j - log4j - 1.2.17 - + + junit + junit + 3.8.1 + test + - - org.apache.hadoop - hadoop-core - 1.2.1 - + + log4j + log4j + 1.2.17 + - - org.apache.opennlp - opennlp-tools - 1.5.3 - + + org.apache.hadoop + hadoop-core + 1.2.1 + - - org.dmilne - weka-wrapper - 0.0.1 - + + org.apache.opennlp + opennlp-tools + 1.5.3 + - - org.apache.commons - commons-math - 2.2 - + + org.dmilne + weka-wrapper + 0.0.1 + - + + org.apache.commons + commons-math + 2.2 + + + org.apache.commons + commons-compress + 1.8.1 + jar + + - - - - maven-assembly-plugin - - - - org.wikipedia.miner.util.EnvironmentBuilder - - - - jar-with-dependencies - - - + + + + maven-assembly-plugin + + + + org.wikipedia.miner.util.EnvironmentBuilder + + + + jar-with-dependencies + + + - - org.apache.maven.plugins - maven-compiler-plugin - 3.0 - - 1.6 - 1.6 - - + + org.apache.maven.plugins + maven-compiler-plugin + 3.0 + + 1.6 + 1.6 + + + - - + + + diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/Disambiguator.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/Disambiguator.java index d7e3e2e..2125de7 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/Disambiguator.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/Disambiguator.java @@ -43,7 +43,6 @@ import weka.classifiers.*; import weka.classifiers.meta.Bagging; import weka.core.* ; -import weka.filters.supervised.instance.Resample ; /** * A machine-learned disambiguator. Given a term and a sense, it can identify how valid that sense is. @@ -62,7 +61,7 @@ public class Disambiguator { private NGrammer nGrammer ; private double minSenseProbability ; - private int maxLabelLength = 20 ; + private final int maxLabelLength = 20 ; private double minLinkProbability ; private int maxContextSize ; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.java index b4fca02..0b00c76 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/preprocessing/WikiPreprocessor.java @@ -23,7 +23,6 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; -import java.util.Vector; import java.util.regex.*; import org.wikipedia.miner.annotation.preprocessing.PreprocessedDocument.RegionTag; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/tagging/DocumentTagger.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/tagging/DocumentTagger.java index c16df3f..9c76729 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/tagging/DocumentTagger.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/tagging/DocumentTagger.java @@ -89,7 +89,7 @@ public String tag(PreprocessedDocument doc, Collection topics, RepeatMode ArrayList references = resolveCollisions(topics) ; String originalText = doc.getOriginalText() ; - StringBuffer wikifiedText = new StringBuffer() ; + StringBuilder wikifiedText = new StringBuilder() ; int lastIndex = 0 ; HashSet doneIds = new HashSet() ; @@ -149,7 +149,7 @@ private ArrayList resolveCollisions(Collection topics) { double outerWeight = topicWeights.get(outerRef.getTopicId()); //identify references overlapped by this one, and their total weight - Vector innerReferences = new Vector() ; + List innerReferences = new ArrayList() ; double maxInnerWeight = 0 ; for (int j=i+1 ; j decider ; + private final Decider decider ; private Dataset dataset ; int linksConsidered = 0 ; @@ -123,6 +123,7 @@ public int getLinksConsidered() { * @return an ArrayList of the same topics, where the weight of each topic is the probability that it is a link. * @throws Exception if the link detector has not yet been trained */ + @Override public HashMap getTopicWeights(Collection topics) throws Exception { if (!decider.isReady()) diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/weighting/SimpleDocumentIndexer.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/weighting/SimpleDocumentIndexer.java index 85c9a9c..fd6df9a 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/weighting/SimpleDocumentIndexer.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/annotation/weighting/SimpleDocumentIndexer.java @@ -20,9 +20,7 @@ package org.wikipedia.miner.annotation.weighting; -import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import org.wikipedia.miner.annotation.Topic; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparer.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparer.java index 9bac5cf..39648b8 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparer.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparer.java @@ -54,7 +54,7 @@ private enum LinkDirection{In, Out} ; int wikipediaArticleCount ; Double m ; - private long articlesCompared = 0 ; + private final long articlesCompared = 0 ; enum Attributes { inLinkGoogleMeasure, @@ -255,7 +255,6 @@ public void buildClassifier(Classifier classifier) throws Exception { /** * * - * @param classifier * @throws Exception */ public void buildDefaultClassifier() throws Exception { diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparison.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparison.java index 4f3e7df..bca9890 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparison.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ArticleComparison.java @@ -4,8 +4,8 @@ public class ArticleComparison { - private Article articleA ; - private Article articleB ; + private final Article articleA ; + private final Article articleB ; private boolean inLinkFeaturesSet= false ; private Double inLinkGoogleMeasure ; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ComparisonDataSet.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ComparisonDataSet.java index 6cf3727..0cab671 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ComparisonDataSet.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ComparisonDataSet.java @@ -183,7 +183,7 @@ public double getRelatedness() { @Override public String toString() { - StringBuffer sb = new StringBuffer() ; + StringBuilder sb = new StringBuilder() ; sb.append(termA) ; sb.append(",") ; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippet.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippet.java index 676983b..efa060a 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippet.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippet.java @@ -9,21 +9,21 @@ public class ConnectionSnippet implements Comparable { private String _markup ; - private String _plainText ; + private final String _plainText ; - private Article _source ; - private Article _topic1 ; - private Article _topic2 ; + private final Article _source ; + private final Article _topic1 ; + private final Article _topic2 ; - private int _sentenceIndex ; + private final int _sentenceIndex ; private boolean _followsHeading = false ; private boolean _isListItem = false ; private Double _weight ; - private static Pattern _headingPattern = Pattern.compile("\\s*={2,}(.*?)={2,}(.*)") ; - private static Pattern _listPattern = Pattern.compile("\\s*[*#]+(.*)") ; - private static MarkupStripper _stripper = new MarkupStripper() ; + private static final Pattern _headingPattern = Pattern.compile("\\s*={2,}(.*?)={2,}(.*)") ; + private static final Pattern _listPattern = Pattern.compile("\\s*[*#]+(.*)") ; + private static final MarkupStripper _stripper = new MarkupStripper() ; public ConnectionSnippet(int sentenceIndex, Article source, Article topic1, Article topic2) { _sentenceIndex = sentenceIndex ; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippetWeighter.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippetWeighter.java index 2af4a5a..59a2139 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippetWeighter.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/ConnectionSnippetWeighter.java @@ -41,10 +41,10 @@ enum Attributes { } - private Wikipedia wikipedia ; - private ArticleComparer cmp ; + private final Wikipedia wikipedia ; + private final ArticleComparer cmp ; - private Decider snippetWeighter ; + private final Decider snippetWeighter ; private Dataset trainingDataset ; @@ -88,7 +88,7 @@ public double getWeight(ConnectionSnippet snippet) throws Exception { } } - public void train(Vector weightedSnippets) throws Exception { + public void train(List weightedSnippets) throws Exception { trainingDataset = snippetWeighter.createNewDataset() ; @@ -104,7 +104,7 @@ public void train(Vector weightedSnippets) throws Exception { } } - public double test(Vector weightedSnippets) throws Exception { + public double test(List weightedSnippets) throws Exception { List manualWeights = new ArrayList() ; List autoWeights = new ArrayList() ; @@ -183,7 +183,6 @@ public void buildClassifier(Classifier classifier) throws Exception { /** * * - * @param classifier * @throws Exception */ public void buildDefaultClassifier() throws Exception { diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/LabelComparer.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/LabelComparer.java index 15d2ec8..3c7f993 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/LabelComparer.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/comparison/LabelComparer.java @@ -5,9 +5,6 @@ import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Collections; -import java.util.TreeSet; - -import org.wikipedia.miner.annotation.Disambiguator; import org.wikipedia.miner.model.Label; import org.wikipedia.miner.model.Wikipedia; import org.wikipedia.miner.util.CorrelationCalculator; @@ -17,9 +14,7 @@ import weka.classifiers.Classifier; import weka.classifiers.functions.GaussianProcesses; -import weka.classifiers.functions.SMO; import weka.classifiers.meta.Bagging; -import weka.classifiers.meta.FilteredClassifier; import weka.core.Instance; import weka.core.Utils; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/MarkupDatabase.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/MarkupDatabase.java index d924222..ffd5b7a 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/MarkupDatabase.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/MarkupDatabase.java @@ -20,6 +20,14 @@ import com.sleepycat.bind.tuple.StringBinding; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseEntry; +import java.io.BufferedInputStream; +import java.io.InputStreamReader; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.tools.bzip2.* ; @@ -69,9 +77,10 @@ public void loadFromCsvFile(File dataFile, boolean overwrite, ProgressTracker tr * @param tracker an optional progress tracker (may be null) * @throws IOException if there is a problem reading or deserialising the given data file. * @throws XMLStreamException if the XML within the data file cannot be parsed. + * @throws org.apache.commons.compress.compressors.CompressorException */ - public void loadFromXmlFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException, XMLStreamException { - + public void loadFromXmlFile(File dataFile, boolean overwrite, ProgressTracker tracker) throws IOException, XMLStreamException, CompressorException { +// overwrite=true; if (exists() && !overwrite) return ; @@ -85,15 +94,24 @@ public void loadFromXmlFile(File dataFile, boolean overwrite, ProgressTracker tr StringBuffer characters = new StringBuffer() ; InputStream reader ; - - if (dataFile.getName().endsWith(".bz2")) - reader = new CBZip2InputStream(new FileInputStream(dataFile)) ; - else + CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + if (dataFile.getName().endsWith(".bz2")){ + + FileInputStream fin=new FileInputStream(dataFile); + BufferedInputStream bis=new BufferedInputStream(fin); + CompressorInputStream input=new CompressorStreamFactory().createCompressorInputStream(bis); + reader = input; + } else{ reader = new FileInputStream(dataFile) ; - + } XMLInputFactory xmlStreamFactory = XMLInputFactory.newInstance() ; CountingInputStream countingReader = new CountingInputStream(reader) ; - XMLStreamReader xmlStreamReader = xmlStreamFactory.createXMLStreamReader(countingReader, "UTF-8") ; + // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader. + XMLStreamReader xmlStreamReader = xmlStreamFactory.createXMLStreamReader(new InputStreamReader(countingReader,decoder)) ; + System.out.println("Parser class: " + xmlStreamReader.getClass().toString()); int pageTotal = 0 ; long charTotal = 0 ; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/WEnvironment.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/WEnvironment.java index a7fb2af..b81139c 100755 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/WEnvironment.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/db/WEnvironment.java @@ -11,6 +11,7 @@ import javax.xml.stream.XMLStreamException; import com.sleepycat.je.*; +import org.apache.commons.compress.compressors.CompressorException; import org.apache.log4j.Logger; import org.wikipedia.miner.db.WDatabase.DatabaseType; @@ -334,7 +335,6 @@ public WEnvironment(WikipediaConfiguration conf, boolean threaded) throws Enviro envConf.setAllowCreate(false) ; envConf.setReadOnly(true) ; envConf.setCachePercent(10) ; - env = new Environment(conf.getDatabaseDirectory(), envConf) ; initDatabases() ; @@ -672,7 +672,7 @@ public void finalize() { * @throws IOException if any of the required files cannot be read * @throws XMLStreamException if the XML dump of wikipedia cannot be parsed */ - public static void buildEnvironment(WikipediaConfiguration conf, File dataDirectory, boolean overwrite) throws IOException, XMLStreamException { + public static void buildEnvironment(WikipediaConfiguration conf, File dataDirectory, boolean overwrite) throws IOException, XMLStreamException, CompressorException { //check all files exist and are readable before doing anything @@ -739,14 +739,14 @@ public static void buildEnvironment(WikipediaConfiguration conf, File dataDirect env.close(); - TextProcessor tp = conf.getDefaultTextProcessor() ; - if (tp != null) { - File tmpDir = new File(conf.getDataDirectory() + File.separator + "tmp" + tp.getName()) ; - tmpDir.mkdir() ; - tmpDir.deleteOnExit() ; - - prepareTextProcessor(tp, conf, tmpDir, overwrite, 5) ; - } +// TextProcessor tp = conf.getDefaultTextProcessor() ; +// if (tp != null) { +// File tmpDir = new File(conf.getDataDirectory() + File.separator + "tmp" + tp.getName()) ; +// tmpDir.mkdir() ; +// tmpDir.deleteOnExit() ; +// +// prepareTextProcessor(tp, conf, tmpDir, overwrite, 5) ; +// } } /** diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Article.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Article.java index 436b99c..98b2b1b 100755 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Article.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Article.java @@ -336,6 +336,7 @@ public Integer[] getSentenceIndexesMentioning(Article art) { DbLinkLocation key = new DbLinkLocation(id, null) ; int index = Collections.binarySearch(tmpLinks.getLinkLocations(), key, new Comparator(){ + @Override public int compare(DbLinkLocation a, DbLinkLocation b) { return new Integer(a.getLinkId()).compareTo(b.getLinkId()) ; } @@ -401,14 +402,14 @@ public Integer[] getSentenceIndexesMentioning(ArrayList
arts) { */ public class Label { - private String text ; + private final String text ; - private long linkDocCount ; - private long linkOccCount ; + private final long linkDocCount ; + private final long linkOccCount ; - private boolean fromTitle ; - private boolean fromRedirect ; - private boolean isPrimary ; + private final boolean fromTitle ; + private final boolean fromRedirect ; + private final boolean isPrimary ; protected Label(DbLabelForPage l) { diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Label.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Label.java index 891f628..dbaba8e 100755 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Label.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Label.java @@ -19,8 +19,8 @@ public class Label { //properties ============================================================= - private String text ; - private TextProcessor textProcessor ; + private final String text ; + private final TextProcessor textProcessor ; private long linkDocCount = 0 ; private long linkOccCount = 0 ; @@ -150,11 +150,11 @@ public Sense[] getSenses() { public class Sense extends Article { - private long sLinkDocCount ; - private long sLinkOccCount ; + private final long sLinkDocCount ; + private final long sLinkOccCount ; - private boolean fromTitle ; - private boolean fromRedirect ; + private final boolean fromTitle ; + private final boolean fromRedirect ; //constructor ============================================================= diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Page.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Page.java index aee6f8d..208e676 100755 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Page.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Page.java @@ -141,6 +141,7 @@ public boolean equals(Page p) { * @param p the Page to be compared * @return see above. */ + @Override public int compareTo(Page p) { if (p.id == id) @@ -163,6 +164,7 @@ public int compareTo(Page p) { * * @return a string representation of the page */ + @Override public String toString() { String s = getId() + ": " + getTitle() ; return s ; diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Wikipedia.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Wikipedia.java index 8890653..0f1f659 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Wikipedia.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/model/Wikipedia.java @@ -390,7 +390,15 @@ public void close() { @Override public void finalize() { - if (this.env != null) - Logger.getLogger(WIterator.class).warn("Unclosed wikipedia. You may be causing a memory leak.") ; + try { + if (this.env != null) + Logger.getLogger(WIterator.class).warn("Unclosed wikipedia. You may be causing a memory leak.") ; + } finally { + try { + super.finalize(); + } catch (Throwable ex) { + Logger.getLogger(WIterator.class).warn("Unclosed wikipedia. You may be causing a memory leak.") ; + } + } } } diff --git a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/util/EmphasisResolver.java b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/util/EmphasisResolver.java index d75b62b..06b17ca 100644 --- a/wikipedia-miner-core/src/main/java/org/wikipedia/miner/util/EmphasisResolver.java +++ b/wikipedia-miner-core/src/main/java/org/wikipedia/miner/util/EmphasisResolver.java @@ -20,9 +20,6 @@ package org.wikipedia.miner.util; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -35,7 +32,7 @@ public class EmphasisResolver { public String resolveEmphasis(String text) { - StringBuffer sb = new StringBuffer() ; + StringBuilder sb = new StringBuilder() ; for (String line:text.split("\n")) { sb.append(resolveLine(line)) ; @@ -150,9 +147,7 @@ private String resolveLine(String line) { } - // Now let's actually convert our apostrophic mush to HTML! - - StringBuffer output = new StringBuffer() ; + StringBuilder output = new StringBuilder() ; StringBuffer buffer = new StringBuffer() ; String state = "" ; int i = 0 ; @@ -284,7 +279,7 @@ private String[] getSplits(String text) { private String getFilledString(int length) { - StringBuffer sb = new StringBuffer() ; + StringBuilder sb = new StringBuilder() ; for (int i=0 ; i{ +public class LabelIterator implements Iterator