Skip to content

Commit

Permalink
add comments
Browse files Browse the repository at this point in the history
  • Loading branch information
liudragonfly committed Aug 9, 2015
1 parent 5b4cd87 commit 6b0a252
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 46 deletions.
16 changes: 9 additions & 7 deletions gbdt/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def __init__(self, filename): # just for csv data format
line_cnt += 1

def _construct_instance(self, fields):
"""构建一个新的样本"""
instance = dict()
for i in range(0, len(fields)):
field_name = self.field_names[i]
Expand Down Expand Up @@ -65,39 +66,40 @@ def describe(self):
print(info)

def get_instances_idset(self):
    """Return the set of all instance ids in this dataset."""
    # Iterating a dict yields its keys directly, so .keys() was redundant.
    return set(self.instances)

def is_real_type_field(self, name):
    """Return True when the field holds real (numeric) values.

    A field is real-typed when no categorical values were recorded
    for it in ``self.field_type``.
    """
    if name not in self.field_names:
        raise ValueError(" field name not in the dictionary of dataset")
    # An empty type set means no categorical values were seen -> real type.
    return not self.field_type[name]

def get_label_size(self, name="label"):
    """Return the number of distinct values of the label field.

    NOTE: when the label column parses as numeric, ``field_type[name]``
    is empty, so we fall back to ``distinct_valueset[name]`` — relying
    on ``field_type`` alone would be a bug.
    """
    if name not in self.field_names:
        raise ValueError(" there is no class label field!")
    categorical_count = len(self.field_type[name])
    if categorical_count:
        return categorical_count
    return len(self.distinct_valueset[name])

def get_label_valueset(self, name="label"):
    """Return the set of concrete label values.

    Prefers the categorical value set; falls back to the distinct
    value set when the label column is numeric.
    """
    if name not in self.field_names:
        raise ValueError(" there is no class label field!")
    categorical_values = self.field_type[name]
    if categorical_values:
        return categorical_values
    return self.distinct_valueset[name]

def size(self):
    """Return how many instances the dataset contains."""
    instance_count = len(self.instances)
    return instance_count

def get_instance(self, Id):
    """Look up and return the instance stored under *Id*.

    Raises ValueError when the id is unknown.
    """
    if Id in self.instances:
        return self.instances[Id]
    raise ValueError("Id not in the instances dict of dataset")

def get_attributes(self):
    """Return the names of all feature fields (everything except "label")."""
    return tuple(name for name in self.field_names if name != "label")

Expand Down
34 changes: 5 additions & 29 deletions gbdt/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def update_f_value(self, f, tree, leaf_nodes, subset, dataset, learn_rate, label
for node in leaf_nodes:
for id in node.get_idset():
f[id][label] += learn_rate*node.get_predict_value()
# for id not in subset, we have to predict by retrive the tree
# 更新OOB的样本
for id in data_idset-subset:
f[id][label] += learn_rate*tree.get_predict_value(dataset.get_instance(id))

Expand Down Expand Up @@ -195,7 +195,7 @@ def fit(self, dataset, train_data):
# 用损失函数的负梯度作为回归问题提升树的残差近似值
residual = self.loss.compute_residual(dataset, subset, f)
for label in label_valueset:
# 挂叶子节点的下的各种样本,只有到迭代的max-depth才会使用
# 挂在叶子节点下的各种样本,只有到迭代的max-depth才会使用
# 存放的各个叶子节点,注意叶子节点存放的是各个条件下的样本集点
leaf_nodes = []
targets = {}
Expand All @@ -204,17 +204,14 @@ def fit(self, dataset, train_data):
# 对某一个具体的label-K分类,选择max-depth个特征构造决策树
tree = construct_decision_tree(dataset, subset, targets, 0, leaf_nodes, self.max_depth, self.loss, self.split_points)
self.trees[iter][label] = tree
# self.update_f_value(f, tree, leaf_nodes, subset, dataset, label)
self.loss.update_f_value(f, tree, leaf_nodes, subset, dataset, self.learn_rate, label)
train_loss = self.compute_loss(dataset, train_data, f)
print("iter%d : average train_loss=%f" % (iter, train_loss))

else:
if self.loss_type == 'binary-classification':
print('binary')
self.loss = BinomialDeviance(n_classes=dataset.get_label_size())
elif self.loss_type == 'regression':
print('regression')
self.loss = LeastSquaresError(n_classes=1)

f = dict() # 记录F_{m-1}的值
Expand All @@ -223,7 +220,6 @@ def fit(self, dataset, train_data):
subset = train_data
if 0 < self.sample_rate < 1:
subset = sample(subset, int(len(subset)*self.sample_rate))
# self.trees[iter] = dict()
# 用损失函数的负梯度作为回归问题提升树的残差近似值
residual = self.loss.compute_residual(dataset, subset, f)
leaf_nodes = []
Expand All @@ -249,7 +245,6 @@ def compute_loss(self, dataset, subset, f):
loss -= ((1+y_i)*log(p_1)/2) + ((1-y_i)*log(1-p_1)/2)
except ValueError as e:
print(y_i, p_1)

else:
for id in dataset.get_instances_idset():
instance = dataset.get_instance(id)
Expand All @@ -264,25 +259,8 @@ def compute_loss(self, dataset, subset, f):
loss -= log(probs[instance["label"]])
return loss/dataset.size()

# def update_f_value(self, f, tree, leaf_nodes, subset, dataset, label):
# data_idset = set(dataset.get_instances_idset())
# subset = set(subset)
# if self.loss.K == 1:
# for node in leaf_nodes:
# for id in node.get_idset():
# f[id] += self.learn_rate*node.get_predict_value()
# for id in data_idset-subset:
# f[id] += self.learn_rate*tree.get_predict_value(dataset.get_instance(id))
# else:
# for node in leaf_nodes:
# for id in node.get_idset():
# f[id][label] += self.learn_rate*node.get_predict_value()
# # for id not in subset, we have to predict by retrive the tree
# for id in data_idset-subset:
# f[id][label] = f[id][label]+self.learn_rate*tree.get_predict_value(dataset.get_instance(id))

# 在进行预测的时候,得到某一个具体的instance样本点在K个分类下的浮点值
def compute_instance_f_value(self, instance):
"""计算样本的f值"""
if self.loss.K == 1:
f_value = 0.0
for iter in self.trees:
Expand All @@ -298,15 +276,13 @@ def compute_instance_f_value(self, instance):
f_value[label] += self.learn_rate*tree.get_predict_value(instance)
return f_value

def predict(self, instance):
    """Predict the f value(s) for *instance*.

    Regression and binary classification: a single f value.
    Multi-class classification: a mapping of class label -> f value.
    """
    f_value = self.compute_instance_f_value(instance)
    return f_value

# 预测概率
def predict_prob(self, instance):
"""为了统一二元分类和多元分类,返回属于每个类别的概率"""
if isinstance(self.loss, RegressionLossFunction):
Expand All @@ -328,15 +304,15 @@ def predict_prob(self, instance):
probs[label] = exp_values[label]/exp_sum
return probs

# 预测label
def predict_label(self, instance):
"""预测标签"""
predict_label = None
if isinstance(self.loss, BinomialDeviance):
probs = self.predict_prob(instance)
predict_label = 1 if probs[1] >= probs[-1] else -1
else:
probs = self.predict_prob(instance)
# 选出K分类中,概率值最大的label,返回预测分类的label概率值list
# 选出K分类中,概率值最大的label
for label in probs:
if not predict_label or probs[label] > probs[predict_label]:
predict_label = label
Expand Down
18 changes: 8 additions & 10 deletions gbdt/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ def __init__(self):
self.split_feature = None
self.leftTree = None
self.rightTree = None
# condition for real value is < , for category value is =
# is for the left-path tree
# 对于real value的条件为<,对于类别值得条件为=
# 将满足条件的放入左树
self.real_value_feature = True
self.conditionValue = None
self.leafNode = None

def get_predict_value(self, instance):
if self.leafNode: # we are in the leaf node
if self.leafNode: # 到达叶子节点
return self.leafNode.get_predict_value()
if not self.split_feature:
raise ValueError("the tree is null")
Expand Down Expand Up @@ -77,8 +77,6 @@ def FriedmanMSE(left_values, right_values):
(weighted_n_left + weighted_n_right))


# if split_points is larger than 0, we just random choice split_points to evalute minLoss
# when consider real-value split
def construct_decision_tree(dataset, remainedSet, targets, depth, leaf_nodes, max_depth, loss, criterion='MSE', split_points=0):
if depth < max_depth:
# todo 通过修改这里可以实现选择多少特征训练
Expand All @@ -89,18 +87,18 @@ def construct_decision_tree(dataset, remainedSet, targets, depth, leaf_nodes, ma
selectedLeftIdSet = []
selectedRightIdSet = []
for attribute in attributes:
# print "start process attribute=",attribute;
is_real_type = dataset.is_real_type_field(attribute)
attrValues = dataset.get_distinct_valueset(attribute)
if is_real_type and split_points > 0 and len(attrValues) > split_points: # need subsample split points to speed up
if is_real_type and split_points > 0 and len(attrValues) > split_points:
attrValues = sample(attrValues, split_points)
for attrValue in attrValues:
leftIdSet = []
rightIdSet = []
for Id in remainedSet:
instance = dataset.get_instance(Id)
value = instance[attribute]
if (is_real_type and value < attrValue)or(not is_real_type and value == attrValue): # fall into the left
# 将满足条件的放入左子树
if (is_real_type and value < attrValue)or(not is_real_type and value == attrValue):
leftIdSet.append(Id)
else:
rightIdSet.append(Id)
Expand All @@ -122,10 +120,10 @@ def construct_decision_tree(dataset, remainedSet, targets, depth, leaf_nodes, ma
tree.leftTree = construct_decision_tree(dataset, selectedLeftIdSet, targets, depth+1, leaf_nodes, max_depth, loss)
tree.rightTree = construct_decision_tree(dataset, selectedRightIdSet, targets, depth+1, leaf_nodes, max_depth, loss)
return tree
else: # is a leaf node
else: # 是叶子节点
node = LeafNode(remainedSet)
node.update_predict_value(targets, loss)
leaf_nodes.append(node) # add a leaf node
leaf_nodes.append(node)
tree = Tree()
tree.leafNode = node
return tree

0 comments on commit 6b0a252

Please sign in to comment.