Lightgbm #254

Open
wants to merge 21 commits into base: main
Commits (21)
89dd680
Got LightGBM binary classifier working
calebmadrigal Sep 14, 2018
68164a0
Got basic unit test working
calebmadrigal Sep 17, 2018
e8d67a9
Cleaned up; binary lightgbm classifier seems to be working
calebmadrigal Sep 17, 2018
00bdc16
More unit tests
calebmadrigal Sep 17, 2018
421b822
Got LightGBM regressor working
calebmadrigal Sep 18, 2018
47ff2ea
Got multiclass classification working, cleanup
calebmadrigal Sep 18, 2018
cbf2143
Comment cleanup
calebmadrigal Sep 18, 2018
56d4a50
Removed old comment
calebmadrigal Sep 18, 2018
ca5e07f
Got working with the LGBMClassifier and LGBMRegressor objects
calebmadrigal Sep 19, 2018
7b8cbbd
Addressed a few review comments
calebmadrigal Sep 24, 2018
98c874e
Cleanup, added lightgbm to cmd line script
calebmadrigal Sep 28, 2018
ca4dff0
Readme update to mention the new LightGBM support
calebmadrigal Sep 28, 2018
99fd9d7
Changed classifier/regressor differentiation logic per review comment.
calebmadrigal Oct 1, 2018
f8cb2c9
Addressed review comments - cleaned up
calebmadrigal Oct 2, 2018
8dee14d
One more review commit fix
calebmadrigal Oct 2, 2018
4899d4c
Fixed a few issues with inputs/outputs
calebmadrigal Oct 2, 2018
b6de8d1
Removed deprecated comments
calebmadrigal Oct 2, 2018
5534676
Made lightgbm converter in/out field names optional
calebmadrigal Oct 3, 2018
ccfac61
Removed superfluous line
calebmadrigal Oct 8, 2018
904092d
Fixed python2-compatibility problems
calebmadrigal Oct 24, 2018
bb1d8c4
Added the lightgbm support in the coremltools wheel
Dec 2, 2018
9 changes: 8 additions & 1 deletion README.md
@@ -3,7 +3,7 @@ Core ML Community Tools

Core ML community tools contains all supporting tools for CoreML model
conversion and validation. This includes Scikit Learn, LIBSVM, Caffe,
Keras and XGBoost.
Keras, XGBoost, and LightGBM.


We recommend using virtualenv to use, install, or build coremltools. Be
@@ -57,6 +57,7 @@ you are converting models of these formats:
- Xgboost (0.7+)
- scikit-learn (0.17+)
- libSVM
- LightGBM (2.1.0+)


Building from source
@@ -137,6 +138,12 @@ If you'd like to use the old keras version, you can:
pip install keras==1.2.2 tensorflow
```

To install LightGBM (version >= 2.1.0):

```shell
pip install lightgbm
```

Finally, to run the most important unit tests, you can use:

```shell
7 changes: 7 additions & 0 deletions coremltools/_deps/__init__.py
@@ -52,6 +52,13 @@ def __get_sklearn_version(version):
except:
HAS_XGBOOST = False

# ---------------------------------------------------------------------------------------
HAS_LIGHTGBM = True
try:
import lightgbm
except:
HAS_LIGHTGBM = False

# ---------------------------------------------------------------------------------------
HAS_KERAS_TF = True
HAS_KERAS2_TF = True
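For reference, a minimal sketch of how this optional-dependency flag is meant to be consumed; the new converter module performs the same check before importing lightgbm, and the error text below mirrors the one used there:

```python
# Illustrative guard only: downstream code checks the soft-dependency flag
# before importing lightgbm or enabling the conversion API.
from coremltools._deps import HAS_LIGHTGBM

if HAS_LIGHTGBM:
    import lightgbm
else:
    raise RuntimeError('lightgbm not found. lightgbm conversion API is disabled.')
```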
15 changes: 14 additions & 1 deletion coremltools/_scripts/converter.py
@@ -80,6 +80,19 @@ def _convert(args):
print('error: coremlconverter: %s.' % str(e))
return 1 # error
return 0
elif args.srcModelFormat == 'lightgbm':
try:
if not args.inputNames:
args.inputNames = 'data'
if not args.outputNames:
args.outputNames = 'target'
model = args.srcModelPath
model = converters.lightgbm.convert(model, args.inputNames, args.outputNames)
model.save(args.dstModelPath)
except Exception as e:
print('error: coremlconverter: %s.' % str(e))
return 1 # error
return 0
else:
print('error: coremlconverter: Invalid srcModelFormat specified.')
return 1
@@ -88,7 +101,7 @@ def _main():
import argparse

parser = argparse.ArgumentParser(description='Convert other model file formats to MLKit format (.mlmodel).')
parser.add_argument('--srcModelFormat', type=unicode, choices=['auto', 'caffe', 'keras'], default='auto', help='Format of model at srcModelPath (default is to auto-detect).')
parser.add_argument('--srcModelFormat', type=unicode, choices=['auto', 'caffe', 'keras', 'lightgbm'], default='auto', help='Format of model at srcModelPath (default is to auto-detect).')
parser.add_argument('--srcModelPath', type=unicode, required=True, help='Path to the model file of the external tool (e.g caffe weights proto binary, keras h5 binary')
parser.add_argument('--dstModelPath', type=unicode, required=True, help='Path to save the model in format .mlmodel')
parser.add_argument('--caffeProtoTxtPath', type=unicode, default='', help='Path to the .prototxt file if network differs from the source file (optional)')
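Roughly, the new CLI branch above is equivalent to the following direct API call; the file paths are placeholders, and the command name in the comment is assumed from the script's error messages:

```python
# Approximate programmatic equivalent of:
#   coremlconverter --srcModelFormat lightgbm --srcModelPath model.pkl --dstModelPath model.mlmodel
# 'model.pkl' and 'model.mlmodel' are placeholder paths; the .pkl file is assumed
# to contain a pickled LightGBM model, per the converter's string-path branch.
from coremltools import converters

coreml_model = converters.lightgbm.convert('model.pkl', 'data', 'target')  # CLI default names
coreml_model.save('model.mlmodel')
```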
1 change: 1 addition & 0 deletions coremltools/converters/__init__.py
@@ -9,3 +9,4 @@
from . import xgboost
from . import keras
from . import caffe
from . import lightgbm
7 changes: 7 additions & 0 deletions coremltools/converters/lightgbm/__init__.py
@@ -0,0 +1,7 @@
# Created by Caleb Madrigal
# Copyright (c) 2018, FireEye Inc. All rights reserved.
#
# Use of this source code is governed by a BSD-3-clause license that can be
# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause

from ._tree import convert
53 changes: 53 additions & 0 deletions coremltools/converters/lightgbm/_tree.py
@@ -0,0 +1,53 @@
# Created by Caleb Madrigal
# Copyright (c) 2018, FireEye Inc. All rights reserved.
#
# Use of this source code is governed by a BSD-3-clause license that can be
# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause

from ._tree_ensemble import convert_tree_ensemble as _convert_tree_ensemble
from ...models import MLModel as _MLModel


def convert(model, feature_names = None, target = None):
"""
Convert a trained LightGBM model to Core ML format.

Parameters
----------
model : Booster
A trained LightGBM tree model.

feature_names: [str] | str
Names of input features that will be exposed in the Core ML model
interface. If not specified, the feature names are taken from the model.

Can be set to one of the following:

- None to use the feature names from the model.
- A list of input feature names to expose in the Core ML model interface,
in the same order as in the LightGBM model.
- A single string, to expose all of the model's features under one input name.

target: str
Name of the output feature exposed in the Core ML model. If not
specified, defaults to 'predicted_class' for regressors and 'classLabel'
for classifiers.

Returns
-------
model: MLModel
Returns an MLModel instance representing a Core ML model.

Examples
--------
.. sourcecode:: python

# Convert it with default input and output names
>>> import coremltools
>>> coreml_model = coremltools.converters.lightgbm.convert(model)

# Saving the Core ML model to a file.
>>> coreml_model.save('my_model.mlmodel')
"""
return _MLModel(_convert_tree_ensemble(model, feature_names, target))
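A slightly fuller usage sketch, assuming scikit-learn is available and using made-up feature and output names:

```python
# End-to-end sketch (example data and names only; not taken from the PR).
import lightgbm
import coremltools
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
lgbm_model = lightgbm.LGBMClassifier(n_estimators=20).fit(X, y)

# Expose the four iris features under explicit names and call the output 'species'.
coreml_model = coremltools.converters.lightgbm.convert(
    lgbm_model,
    feature_names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid'],
    target='species')
coreml_model.save('iris_lightgbm.mlmodel')
```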

221 changes: 221 additions & 0 deletions coremltools/converters/lightgbm/_tree_ensemble.py
@@ -0,0 +1,221 @@
# Created by Caleb Madrigal
# Copyright (c) 2018, FireEye Inc. All rights reserved.
#
# Use of this source code is governed by a BSD-3-clause license that can be
# found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause

from ...models.tree_ensemble import TreeEnsembleRegressor, TreeEnsembleClassifier
from ..._deps import HAS_LIGHTGBM as _HAS_LIGHTGBM

if _HAS_LIGHTGBM:
import lightgbm as _lightgbm

LIGHTGBM_DECISION_TYPE_MAP = {
'<=': 'BranchOnValueLessThanEqual',
'<': 'BranchOnValueLessThan',
'>=': 'BranchOnValueGreaterThanEqual',
'>': 'BranchOnValueGreaterThan',
'=': 'BranchOnValueEqual',
'!=': 'BranchOnValueNotEqual'
}


def recurse_tree(coreml_tree, lgbm_tree_dict, tree_id, node_id, current_global_node_id, class_id=None):
"""Traverse through the tree and append to the tree spec."""
relative_hit_rate = lgbm_tree_dict.get('internal_count', None)

# Branch node
if 'leaf_value' not in lgbm_tree_dict:
decision_type_str = lgbm_tree_dict['decision_type']
branch_mode = LIGHTGBM_DECISION_TYPE_MAP[decision_type_str]

feature_index = lgbm_tree_dict['split_feature']
feature_value = lgbm_tree_dict['threshold']

if 'left_child' in lgbm_tree_dict:
left_child = lgbm_tree_dict['left_child']
current_global_node_id[0] += 1
left_child_id = current_global_node_id[0]
else:
left_child = None
left_child_id = None

if 'right_child' in lgbm_tree_dict:
right_child = lgbm_tree_dict['right_child']
current_global_node_id[0] += 1
right_child_id = current_global_node_id[0]
else:
right_child = None
right_child_id = None

if lgbm_tree_dict['default_left']: # If left is the 'true' branch
(true_child_id, false_child_id) = (left_child_id, right_child_id)
else:
(true_child_id, false_child_id) = (right_child_id, left_child_id)

missing_value_tracks_true_child = True

coreml_tree.add_branch_node(tree_id, node_id, feature_index,
feature_value, branch_mode, true_child_id, false_child_id,
relative_hit_rate = relative_hit_rate,
missing_value_tracks_true_child = missing_value_tracks_true_child)

# Recurse
if left_child:
recurse_tree(coreml_tree, lgbm_tree_dict['left_child'], tree_id,
left_child_id, current_global_node_id, class_id = class_id)
if right_child:
recurse_tree(coreml_tree, lgbm_tree_dict['right_child'], tree_id,
right_child_id, current_global_node_id, class_id = class_id)

# Leaf node
else:
value = lgbm_tree_dict['leaf_value']
if class_id:
value = {class_id: value}

coreml_tree.add_leaf_node(tree_id, node_id, value, relative_hit_rate = relative_hit_rate)


def _is_classifier(lightgbm_model):
"""Determines if the lightgbm model is a classifier or regressor.
This is not pretty, but I didn't see a better way to discriminate between the two.

The reason for this is tracked here: https://github.com/Microsoft/LightGBM/issues/1700
"""

if isinstance(lightgbm_model, _lightgbm.LGBMClassifier):
return True

elif isinstance(lightgbm_model, _lightgbm.LGBMRegressor):
return False

# If lightgbm.basic.Booster, it's more difficult to differentiate between classifiers and regressors...
regressor_eval_algorithms = {'l1', 'l2', 'l2_root', 'quantile', 'mape', 'huber', 'fair', 'poisson',
'gamma', 'gamma_deviance', 'tweedie'}
inner_eval_list = set(lightgbm_model._Booster__name_inner_eval)
# This is a classifier only if none of the regressor eval algorithms appear in _Booster__name_inner_eval
return regressor_eval_algorithms & inner_eval_list == set()


def convert_tree_ensemble(model, feature_names, target):
"""Convert a generic tree model to the protobuf spec.

This currently supports:
* Classifier
* Regressor

Parameters
----------
model: str | lightgbm.basic.Booster | lightgbm.LGBMClassifier | lightgbm.LGBMRegressor
Lightgbm model object or path on disk to pickled model object.

feature_names : list of strings or None
Names of each of the features. When set to None, the feature names are
extracted from the model.

target: str or None
Name of the output column.

Returns
-------
model_spec: An object of type Model_pb.
Protobuf representation of the model
"""
if not(_HAS_LIGHTGBM):
raise RuntimeError('lightgbm not found. lightgbm conversion API is disabled.')

import pickle

# If str, assume path to pickled model
if isinstance(model, str):
with open(model, 'rb') as f:
model = pickle.load(f)

if isinstance(model, (_lightgbm.LGBMClassifier, _lightgbm.LGBMRegressor)):
lgbm_model_dict = model._Booster.dump_model() # Produces a python dict representing the model

elif isinstance(model, _lightgbm.Booster):
lgbm_model_dict = model.dump_model() # Produces a python dict representing the model

else:
raise ValueError('Model object not recognized; must be one of: lightgbm.Booster, lightgbm.LGBMClassifier, '
'lightgbm.LGBMRegressor, or string path to pickled model on disk.')

trees = lgbm_model_dict['tree_info']
num_dimensions = len(lgbm_model_dict['feature_names'])

if feature_names:
if isinstance(feature_names, str):
features = {feature_names: tuple(range(num_dimensions))}
elif isinstance(feature_names, list) and len(feature_names) == num_dimensions:
features = tuple(feature_names)
else:
raise ValueError('List of feature_names does not match the dimensionality of the model.')

# If no feature_names specified, extract them from the model
else:
features = tuple([str(feat_name) for feat_name in lgbm_model_dict['feature_names']])

# Handle classifier model
if _is_classifier(model):
# Determine class labels
num_classes = lgbm_model_dict['num_class']

# num_class=1 is a special case indicating binary classification (which really means 2 classes)
if num_classes == 1:
num_classes = 2

class_labels = range(num_classes)

coreml_tree = TreeEnsembleClassifier(features, class_labels=class_labels, output_features=target)

# LightGBM uses a 0 default_prediction_value
if num_classes == 2:
# Binary classification
coreml_tree.set_default_prediction_value(0.0)

# LightGBM appears to always use a Logistic transformer for classifiers
coreml_tree.set_post_evaluation_transform('Regression_Logistic')
else:
# Multiclass classification. This is also how we inform the model of the number of classes.
coreml_tree.set_default_prediction_value([0.0] * num_classes)

# LightGBM multiclass uses SoftMax
coreml_tree.set_post_evaluation_transform('Classification_SoftMax')

# Actually build the tree
for lgbm_tree_id, lgbm_tree_dict in enumerate(trees):
if num_classes == 2:
class_id = None
else:
# If multiclass classification, the value needs to indicate which class is being acted upon,
# so it must be {class_id: value}. LightGBM trains multiclass models as a series of
# one-vs-all trees and cycles through the classes: with 4 classes, tree 0 scores class 0,
# tree 1 scores class 1, tree 2 scores class 2, tree 3 scores class 3, tree 4 scores
# class 0 again, and so on. Each tree is effectively a binary decision between
# "this class" and "any other class".
class_id = lgbm_tree_id % num_classes

recurse_tree(coreml_tree, lgbm_tree_dict['tree_structure'], lgbm_tree_id, node_id=0,
current_global_node_id=[0], class_id = class_id)

# Handle regressor model
else:
coreml_tree = TreeEnsembleRegressor(features, target)

# LightGBM uses a 0 default_prediction_value
coreml_tree.set_default_prediction_value(0.0)

# LightGBM appears to always use no transform for regressors
coreml_tree.set_post_evaluation_transform('NoTransform')

# Actually build the tree
for lgbm_tree_id, lgbm_tree_dict in enumerate(trees):
recurse_tree(coreml_tree, lgbm_tree_dict['tree_structure'], lgbm_tree_id, node_id=0,
current_global_node_id=[0])

return coreml_tree.spec
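The raw Booster and pickled-path branches above accept models that were not trained through the scikit-learn wrappers; a small sketch with made-up data and file name:

```python
# Sketch of the lightgbm.Booster and pickled-model input paths (assumed file name).
import pickle
import numpy as np
import lightgbm
import coremltools

X = np.random.rand(200, 5)
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)
booster = lightgbm.train({'objective': 'binary'},
                         lightgbm.Dataset(X, label=y),
                         num_boost_round=10)

# Convert the in-memory Booster directly...
coreml_model = coremltools.converters.lightgbm.convert(booster)

# ...or pickle it to disk and pass the path; the converter unpickles it itself.
with open('lgbm_booster.pkl', 'wb') as f:
    pickle.dump(booster, f)
coreml_model = coremltools.converters.lightgbm.convert('lgbm_booster.pkl')
```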