Skip to content

Commit

Permalink
Merged ubertagger branch into main
Browse files Browse the repository at this point in the history
git-svn-id: https://pet.opendfki.de/repos/pet/main@946 4200e16c-5112-0410-ac55-d7fb557a720a
  • Loading branch information
beki01 committed Jul 9, 2018
1 parent c754d01 commit e0694b3
Show file tree
Hide file tree
Showing 13 changed files with 1,805 additions and 410 deletions.
9 changes: 7 additions & 2 deletions cheap/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@ endif
include $(top_srcdir)/common/Makefile.common

# Add cheap-specific CPPFLAGS/CXXFLAGS/LDFLAGS/LIBS as determined by configure
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/fspp -I$(top_srcdir)/cheap/repp @CHEAPCPPFLAGS@
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/fspp -Irepp @CHEAPCPPFLAGS@
CXXFLAGS += @CHEAPCXXFLAGS@
LDFLAGS += @CHEAPLDFLAGS@
LIBS += @CHEAPLIBS@
DEFS += -DDYNAMIC_SYMBOLS

# Adding trigram lexical pruning capabilities
CPPFLAGS += -Itrigram

# Further CPPFLAGS/CXXFLAGS/LDFLAGS/LIBS should be determined by configure.ac
# if they are system-specific or passed as a command-line option to configure
# if they are requested by the user (this is the whole point of configure).
Expand Down Expand Up @@ -86,6 +89,9 @@ cheaplibsources = \
repp/tdl_options.cpp repp/tdl_options.h \
repp/repp_from_pet.cpp repp/repp_from_pet.h \
tagger.cpp tagger.h \
trigram/trigram.cpp trigram/trigram.h trigram/lattice.h \
trigram/lexprune.cpp trigram/lexprune.h \
trigram/ut_from_pet.cpp trigram/ut_from_pet.h \
$(top_srcdir)/common/bitcode.cpp \
$(top_srcdir)/common/chunk-alloc.cpp \
$(top_srcdir)/common/configs.cpp \
Expand Down Expand Up @@ -207,4 +213,3 @@ EXTRA_DIST = dumpgram.cpp mtest.cpp pet.cpp psqltest.c

profclean:
rm -f *.gcov gmon.out *.bb *.bbg *.da

44 changes: 44 additions & 0 deletions cheap/chart.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ class chart {
friend class chart_iter_adj_active;
friend class chart_iter_adj_passive;
friend class chart_iter_filtered;
friend class chart_iter_end_passive;
};

std::ostream &operator<<(std::ostream &out, const chart &ch) ;
Expand Down Expand Up @@ -401,6 +402,49 @@ class chart_iter_adj_active {

item_iter _curr;
};
/** Iterator over all passive items of a chart that end at one specific
 * position.
 * \attention iterators must return items in order of `stamp', so the
 * `excursion' works.
 */
class chart_iter_end_passive {
public:
  /** Set up an iteration over the passive items of \a C ending at \a i
   * (chart given by pointer).
   */
  inline chart_iter_end_passive(chart *C, int i)
    : _LI(C->_Cp_end[i]), _curr(_LI.begin()) { }

  /** Set up an iteration over the passive items of \a C ending at \a i
   * (chart given by reference).
   */
  inline chart_iter_end_passive(chart &C, int i)
    : _LI(C._Cp_end[i]), _curr(_LI.begin()) { }

  /** Step to the next item and return this iterator. */
  inline chart_iter_end_passive &operator++() {
    ++_curr;
    return *this;
  }

  /** \c true while the iterator has not moved past the last item. */
  inline bool valid() const {
    return _curr != _LI.end();
  }

  /** The item the iterator currently points at, or \c NULL once the
   * iterator is no longer valid().
   */
  inline tItem *current() {
    if(!valid())
      return 0;
    return *_curr;
  }

private:
  // _LI has to stay declared before _curr: _curr is initialized from
  // _LI.begin() and members are initialized in declaration order.
  item_list &_LI;
  item_iter _curr;
};


//
Expand Down
45 changes: 41 additions & 4 deletions cheap/grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
#include "dagprinter.h"
#include <fstream>

#include "ut_from_pet.h"

using namespace std;

static int init();
Expand All @@ -65,6 +67,13 @@ static int init() {
managed_opt("opt_chart_pruning_strategy",
"determines the chart pruning strategy: 0=all tasks; 1=all successful tasks; 2=all passive items (default)",
2);
managed_opt("opt_ut",
"Request ubertagging, with settings in file argument",
std::string(""));
managed_opt("opt_lpthreshold",
"probability threshold for discarding lexical items",
-1.0);

return true;
}

Expand Down Expand Up @@ -436,7 +445,7 @@ undump_dags(dumper *f) {
tGrammar::tGrammar(const char * filename)
: _properties(), _root_insts(0), _generics(0),
_deleted_daughters(0), _packing_restrictor(0),
_sm(0), _lexsm(0), _pcfgsm(0), _gm(0)
_sm(0), _lexsm(0), _pcfgsm(0), _gm(0), _lpsm(0)
{
#ifdef HAVE_ICU
initialize_encoding_converter(cheap_settings->req_value("encoding"));
Expand Down Expand Up @@ -602,7 +611,7 @@ tGrammar::tGrammar(const char * filename)
if(get_opt_string("opt_preprocess_only").empty()) {
//
// a parse selection model can be supplied on the command line or through
// the settings file. and furthermore, even when a setting is present,
// the settings file. and furthermore, even when a setting is present,
// the command line can take precedence, including disabling parse
// ranking by virtue of a special `null' model.
//
Expand All @@ -625,8 +634,8 @@ tGrammar::tGrammar(const char * filename)
if (pcfg_file != 0) {
try {
_pcfgsm = new tPCFG(this, pcfg_file, filename);
// delete pcfgsm;
// only pcfg rules are loaded, not their weights
// delete pcfgsm;
// only pcfg rules are loaded, not their weights
// TODO: what was happening here?
} catch (tError &e) {
LOG(logGrammar, ERROR, e.getMessage());
Expand All @@ -653,6 +662,33 @@ tGrammar::tGrammar(const char * filename)
_lexsm = 0;
}
}

} // if
const std::string opt_ut = get_opt_string("opt_ut");
if (!opt_ut.empty()) { //ut requested
if (opt_ut != "null") {
settings *ut_settings = new settings(opt_ut, cheap_settings->base(),
"reading");
if (!ut_settings->valid())
throw tError("Unable to read UT configuration '" + opt_ut + "'.");
cheap_settings->install(ut_settings);
}
try {
double threshold;
get_opt("opt_lpthreshold", threshold);
if (threshold < 0) { //and hence wasn't set on commandline
if (cheap_settings->lookup("ut-threshold") != NULL){
set_opt("opt_lpthreshold",
strtod(cheap_settings->value("ut-threshold"), NULL));
} else
set_opt("opt_lpthreshold", 0);
}
_lpsm = createTrigramModel(cheap_settings);
}
catch(tError &e) {
LOG(logGrammar, ERROR, e.getMessage());
_lpsm = 0;
}
} // if

// check validity of cm-specific parameters:
Expand Down Expand Up @@ -905,6 +941,7 @@ tGrammar::~tGrammar()
delete _sm;
delete _pcfgsm;
delete _lexsm;
delete _lpsm;

#ifdef CONSTRAINT_CACHE
free_constraint_cache(nstatictypes);
Expand Down
51 changes: 29 additions & 22 deletions cheap/grammar.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/

/** \file grammar.h
/** \file grammar.h
* grammar rules, grammar
*/

Expand All @@ -32,6 +32,7 @@
#include "types.h"
#include "fs.h"
#include "lexicon.h"
#include "trigram.h"

#include <string>
#include <list>
Expand All @@ -56,7 +57,7 @@ enum affix_trait { NONE, PREFIX, INFLECTION, SUFFIX };
class grammar_rule
{
public:
/** Constructor for grammar rules.
/** Constructor for grammar rules.
* \return If the feature structure of the given type is not a valid rule
* (no or empty \c ARGS path), this method returns \c NULL, a grammar rule
* for the given type otherwise.
Expand All @@ -76,7 +77,7 @@ class grammar_rule
*/
inline int nextarg() const { return first(_tofill); }
/** Does the rule extend to the left or to the right?
* \todo Remove the current restriction to binary rules.
* \todo Remove the current restriction to binary rules.
*/
inline bool left_extending() { return first(_tofill) == 1; }

Expand Down Expand Up @@ -111,15 +112,15 @@ class grammar_rule

/** Return the type of the next argument in PCFG rule */
inline type_t nextarg_pcfg() { return _pcfg_args[first(_tofill) - 1]; }

/** Return all of the arguments but the current one in the order in which
* they should be filled.
*/
inline list_int *restargs() { return rest(_tofill); }

/** Return all of the arguments in the order in which they should be
* filled.
*/
*/
inline list_int *allargs() { return _tofill; }

/** Return the quick check vector for argument \a arg */
Expand All @@ -130,15 +131,15 @@ class grammar_rule
/** Should this rule be treated special when using hyperactive parsing?
* Rules whose active items are seldom reused should be made hyperactive
* because one dag copying operation is much more expensive than several
* unsuccessful unifications.
* unsuccessful unifications.
*/
inline bool hyperactive() { return _hyper; }

/** Return \c true if the items using this rule should always span the whole
* chart
* chart
*/
inline bool spanningonly() { return _spanningonly; }

/** Return the type of the nth argument in the pcfg rule */
inline type_t nth_pcfg_arg(int n) { return _pcfg_args[n - 1]; }

Expand All @@ -165,7 +166,7 @@ class grammar_rule

fs _f_restriced; // The feature structure corresponding to this rule
// with the packing restrictor applied.

qc_vec *_qc_vector_unif;
void init_qc_vector_unif();

Expand All @@ -186,16 +187,16 @@ class rulefilter {
private:
int _nrules;
char * _filtermatrix;

// Return a pointer to the filter matrix cell for the rule pair
// (mother, daughter). The assert requires valid() to hold and both rule ids
// to be smaller than _nrules.
inline char * access(grammar_rule *mother, grammar_rule *daughter) {
assert(valid() && daughter->id() < _nrules && mother->id() < _nrules);
// The matrix is stored flat: the mother id selects a run of _nrules cells,
// the daughter id selects the cell inside that run.
return _filtermatrix + daughter->id() + _nrules * mother->id();
}

public:
/** create a rulefilter */
rulefilter() : _nrules(0), _filtermatrix(0) { }

void resize(int n) {
_nrules = n;
delete _filtermatrix;
Expand Down Expand Up @@ -258,7 +259,7 @@ class tGrammar {
* the returned string will be empty.
*/
std::string property(std::string key);

/** Return the map containing the grammar properties */
inline std::map<std::string, std::string> &properties()
{ return _properties; }
Expand Down Expand Up @@ -291,7 +292,7 @@ class tGrammar {
*/
inline bool filter_compatible(grammar_rule *mother, int arg,
grammar_rule *daughter) {
return ! _filter.valid()
return ! _filter.valid()
|| (daughter == NULL) || _filter.get(mother, daughter, arg);
}

Expand All @@ -316,11 +317,11 @@ class tGrammar {
inline const rulelist &lexrules() { return _lex_rules; }

/** Return list of PCFG robust parsing rules in this grammar */
inline rulelist &pcfg_rules() { return _pcfg_rules; }
inline rulelist &pcfg_rules() { return _pcfg_rules; }

/** Return the number of hyperactive rules in this grammar */
int nhyperrules();

/** Return the number of stem entries in the grammar */
inline int nstems() { return _lexicon.size(); }
/** return a pointer to the lex_stem with type id \a inst_key, or NULL if it
Expand All @@ -334,7 +335,7 @@ class tGrammar {
extDictionary *extDict() { return _extDict; }
void clear_dynamic_stems();
#endif

// _fix_me_ becomes obsolete when yy.cpp does
//std::list<full_form> lookup_form(const std::string form);

Expand All @@ -349,7 +350,7 @@ class tGrammar {

/** Return the statistic maxent model of this grammar */
inline class tSM *sm() { return _sm; }

inline void sm(tSM* m) { _sm = m; }

/** Return the lexical type predictor ME model */
Expand All @@ -359,13 +360,16 @@ class tGrammar {
inline class tSM *pcfgsm() { return _pcfgsm; }

/** Return the generative model for agenda manipulation */
inline class tGM *gm() { return _gm; }
inline class tGM *gm() { return _gm; }

/** Return the lexical pruning trigram model */
inline class tTrigramModel *lpsm() { return _lpsm; }

/** deactivate all rules */
void deactivate_all_rules() {
_rules.clear();
}

/** activate all (and only) lexical and inflection rules */
void activate_lex_rules() {
deactivate_all_rules();
Expand Down Expand Up @@ -470,8 +474,11 @@ class tGrammar {
// Robust PCFG parsing model.
class tSM *_pcfgsm;

// Generative model for agenda manipulation.
class tGM *_gm;
// Generative model for agenda manipulation.
class tGM *_gm;

// Trigram model for lexical pruning
class tTrigramModel *_lpsm;

void undump_properties(dumper *f);
void init_parameters();
Expand Down
Loading

0 comments on commit e0694b3

Please sign in to comment.