-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature_index.h
159 lines (137 loc) · 4.39 KB
/
feature_index.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
//
// CRF++ -- Yet Another CRF toolkit
//
// $Id: feature_index.h 1588 2007-02-12 09:03:39Z taku $;
//
// Copyright(C) 2005-2007 Taku Kudo <[email protected]>
//
#ifndef CRFPP_FEATURE_INDEX_H_
#define CRFPP_FEATURE_INDEX_H_
#include <iostream>
#include <map>
#include <vector>
#include "common.h"
#include "scoped_ptr.h"
#include "feature_cache.h"
#include "path.h"
#include "node.h"
#include "freelist.h"
#include "mmap.h"
#include "darts.h"
namespace CRFPP {
class TaggerImpl;
// Allocation helper handed to the feature-index code. It bundles a
// FeatureCache, a char free list and arrays of Path/Node free lists.
// Only declarations live in this header; the notes below about behaviour
// are assumptions to be checked against the implementation file.
class Allocator {
 public:
  // Construction / destruction.
  Allocator();
  explicit Allocator(size_t thread_num);
  virtual ~Allocator();

  // Read-only accessors.
  size_t thread_num() const;
  FeatureCache *feature_cache() const;

  // Allocation entry points.
  // NOTE(review): presumably these hand out storage from the free lists
  // below, with thread_id selecting the per-thread list -- confirm.
  char *strdup(const char *str);
  Node *newNode(size_t thread_id);
  Path *newPath(size_t thread_id);

  // Reset operations.
  // NOTE(review): clear_freelist() takes a thread id while clear() takes
  // none; presumably per-thread vs. global reset -- confirm.
  void clear_freelist(size_t thread_id);
  void clear();

 private:
  // Shared set-up routine (declared only; body is not in this header).
  void init();

  size_t thread_num_;
  scoped_ptr<FeatureCache> feature_cache_;
  scoped_ptr<FreeList<char> > char_freelist_;
  // Arrays (scoped_array) of free lists.
  // NOTE(review): presumably thread_num_ entries each -- confirm.
  scoped_array< FreeList<Path> > path_freelist_;
  scoped_array< FreeList<Node> > node_freelist_;
};
// Common base of EncoderFeatureIndex and DecoderFeatureIndex. It stores the
// label strings, the unigram/bigram feature templates and pointers to the
// weight vectors; subclasses implement getID() to map a feature string to
// an integer id.
class FeatureIndex {
 public:
  static const unsigned int version = MODEL_VERSION;

  // Starts empty: no feature functions, no weight vectors attached, and a
  // cost factor of 1.0.
  explicit FeatureIndex(): maxid_(0), alpha_(0), alpha_float_(0),
                           cost_factor_(1.0), xsize_(0),
                           check_max_xsize_(false), max_xsize_(0) {}
  // Empty on purpose: this class does not free alpha_ / alpha_float_.
  virtual ~FeatureIndex() {}

  // Number of feature functions (maxid_).
  size_t size() const { return maxid_; }
  // Number of columns in the training file, excluding the label column.
  size_t xsize() const { return xsize_; }
  // Number of labels.
  size_t ysize() const { return y_.size(); }
  // C string of the i-th label. No bounds check is performed, so the
  // caller must ensure i < ysize().
  const char* y(size_t i) const { return y_[i].c_str(); }

  // Attaches the double-precision weight vector. Only the pointer is
  // stored; nothing is copied.
  void set_alpha(double *alpha) { alpha_ = alpha; }
  double *alpha() const { return alpha_; }
  // Single-precision weight vector. Declared const, like alpha(), so that
  // it can be read through a const FeatureIndex as well.
  float *alpha_float() const { return alpha_float_; }

  void set_cost_factor(double cost_factor) { cost_factor_ = cost_factor; }
  double cost_factor() const { return cost_factor_; }

  // Declared here, defined elsewhere.
  // NOTE(review): presumably these fill in the cost of a node / path from
  // the attached weights and cost_factor_ -- confirm in the .cpp.
  void calcCost(Node *node) const;
  void calcCost(Path *path) const;

  // Declared here, defined elsewhere. Both are const yet may update the
  // mutable members (maxid_, max_xsize_).
  // NOTE(review): confirm what a false return from buildFeatures() means.
  bool buildFeatures(TaggerImpl *tagger) const;
  void rebuildFeatures(TaggerImpl *tagger) const;

  // Text currently held by the what_ log. Left non-const because the
  // constness of whatlog::str() is not visible from this header.
  const char* what() { return what_.str(); }

  // NOTE(review): presumably returns the template text kept in templs_ --
  // confirm in the .cpp.
  const char *getTemplate() const;

#ifdef USE_MPI
  // Read-only view of the label list; const because it does not modify
  // the index.
  const std::vector<std::string> &getY() const { return y_; }
  // Replaces the label list with a copy of y.
  void setY(const std::vector<std::string> &y) { y_ = y; }
#endif  // USE_MPI

 protected:
  // Maps a feature string to its id; implemented by each subclass.
  virtual int getID(const char *str) const = 0;

  // Template-expansion helpers (declared only). getIndex() takes p by
  // reference, so it can move the caller's cursor.
  const char *getIndex(const char *&p,
                       size_t pos,
                       const TaggerImpl &tagger) const;
  bool applyRule(string_buffer *os,
                 const char *pattern,
                 size_t pos, const TaggerImpl &tagger) const;

  // number of feature functions
  mutable unsigned int maxid_;
  // parameters to learn
  double *alpha_;
  float *alpha_float_;
  double cost_factor_;
  // number of columns in the training file, excluding the label
  unsigned int xsize_;
  bool check_max_xsize_;
  // number of columns used by the feature template file
  mutable unsigned int max_xsize_;
  std::vector<std::string> unigram_templs_;
  std::vector<std::string> bigram_templs_;
  // labels
  std::vector<std::string> y_;
  std::string templs_;
  whatlog what_;
};
// Per-feature record: where the feature's functions start and how often
// the feature occurred.
struct FeatureInfo {
  // start id of the 'feature functions' associated with this 'feature'
  int id_;
  // feature frequency
  unsigned int freq_;

  // A default-constructed record carries id -1 and frequency 0.
  FeatureInfo() { set(-1, 0); }

  // Builds a record holding the given id and frequency.
  FeatureInfo(int id, unsigned int freq) { set(id, freq); }

  // Overwrites both fields at once.
  void set(int id, unsigned int freq) {
    freq_ = freq;
    id_ = id;
  }
};
// FeatureIndex variant backed by an in-memory std::map from feature text
// to FeatureInfo. Only declarations live in this header; the behavioural
// notes below are assumptions to be checked against the implementation.
class EncoderFeatureIndex: public FeatureIndex {
 public:
  // Opens the index from a template file and a training file.
  // NOTE(review): presumably returns false on failure with details
  // available via what() -- confirm.
  bool open(const char *template_filename,
            const char *train_filename);

  // NOTE(review): judging by the parameter names, converts a text model
  // file into a binary one -- confirm.
  bool convert(const char *text_filename,
               const char *binary_filename);

  // Writes the model to filename; emit_textmodelfile selects whether a
  // text model file is emitted as well (per the parameter name -- confirm).
  bool save(const char *filename, bool emit_textmodelfile);

  // NOTE(review): presumably drops features whose frequency is below
  // freq -- confirm the exact comparison in the .cpp.
  void shrink(size_t freq, Allocator *allocator);

  void dump();

#ifdef USE_MPI
  const std::map<std::string, FeatureInfo> &getFeatureIndex();
  void clear();
#endif  // USE_MPI

 private:
  // key: 'feature' text, value: feature info.
  // mutable so that the const getID() is able to modify it.
  mutable std::map<std::string, FeatureInfo> dic_;

  bool openTemplate(const char *filename);
  bool openTagSet(const char *filename);

  // Implements FeatureIndex::getID().
  int getID(const char *str) const;
};
// FeatureIndex variant that is opened from an existing model, either by
// file name or from a caller-supplied buffer. Only declarations live in
// this header.
class DecoderFeatureIndex: public FeatureIndex {
 public:
  // Opens a model held in the buffer [buf, buf + size).
  // NOTE(review): whether buf must outlive this object is not visible
  // here -- confirm before freeing the buffer.
  bool openFromArray(const char *buf, size_t size);

  // Opens the model stored in model_filename.
  // NOTE(review): presumably via mmap_ -- confirm.
  bool open(const char *model_filename);

 private:
  // Implements FeatureIndex::getID().
  // NOTE(review): presumably a lookup in da_ -- confirm.
  int getID(const char *str) const;

  Mmap <char> mmap_;
  Darts::DoubleArray da_;
};
}
#endif