apache
diff --git a/‎flink-ml-lib/pom.xml
Lines changed: 6 additions & 0 deletions b/‎flink-ml-lib/pom.xml
Lines changed: 6 additions & 0 deletions
diff --git a/‎flink-ml-lib/src/main/java/org/apache/flink/ml/classification/logisticregression/LogisticRegressionModel.java
Lines changed: 11 additions & 4 deletions b/‎flink-ml-lib/src/main/java/org/apache/flink/ml/classification/logisticregression/LogisticRegressionModel.java
Lines changed: 11 additions & 4 deletions
diff --git a/‎flink-ml-lib/src/main/java/org/apache/flink/ml/classification/logisticregression/LogisticRegressionModelDataUtil.java
Lines changed: 11 additions & 2 deletions b/‎flink-ml-lib/src/main/java/org/apache/flink/ml/classification/logisticregression/LogisticRegressionModelDataUtil.java
Lines changed: 11 additions & 2 deletions
diff --git a/‎flink-ml-lib/src/main/java/org/apache/flink/ml/classification/logisticregression/LogisticRegressionWithFtrl.java
Lines changed: 271 additions & 0 deletions b/‎flink-ml-lib/src/main/java/org/apache/flink/ml/classification/logisticregression/LogisticRegressionWithFtrl.java
Lines changed: 271 additions & 0 deletions
@@ -138,6 +138,12 @@ under the License.
       <scope>test</scope>
       <type>test-jar</type>
     </dependency>
+    <dependency>
+      <groupId>fastutil</groupId>
+      <artifactId>fastutil</artifactId>
+      <version>5.0.9</version>
+    </dependency>
+
   </dependencies>
 
   <build>
 
@@ -43,6 +43,7 @@
 import java.io.IOException;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 /** A Model which classifies data using the model data computed by {@link LogisticRegression}. */
@@ -147,10 +148,16 @@ public PredictLabelFunction(String broadcastModelKey, Map<Param<?>, Object> para
         @Override
         public Row map(Row dataPoint) {
             if (servable == null) {
-                LogisticRegressionModelData modelData =
-                        (LogisticRegressionModelData)
-                                getRuntimeContext().getBroadcastVariable(broadcastModelKey).get(0);
-                servable = new LogisticRegressionModelServable(modelData);
+                List<LogisticRegressionModelData> modelData =
+                        getRuntimeContext().getBroadcastVariable(broadcastModelKey);
+
+                if (modelData.size() == 1) {
+                    servable = new LogisticRegressionModelServable(modelData.get(0));
+                } else {
+                    LogisticRegressionModelData mergedModel =
+                            LogisticRegressionModelServable.mergePieces(modelData);
+                    servable = new LogisticRegressionModelServable(mergedModel);
+                }
                 ParamUtils.updateExistingParams(servable, params);
             }
             Vector features = (Vector) dataPoint.getField(servable.getFeaturesCol());
 
@@ -89,7 +89,13 @@ public static DataStream<LogisticRegressionModelData> getModelDataStream(Table m
         StreamTableEnvironment tEnv =
                 (StreamTableEnvironment) ((TableImpl) modelData).getTableEnvironment();
         return tEnv.toDataStream(modelData)
-                .map(x -> new LogisticRegressionModelData(x.getFieldAs(0), x.getFieldAs(1)));
+                .map(
+                        x ->
+                                new LogisticRegressionModelData(
+                                        x.getFieldAs(0),
+                                        x.getFieldAs(1),
+                                        x.getFieldAs(2),
+                                        x.getFieldAs(3)));
     }
 
     /**
@@ -107,7 +113,10 @@ public static DataStream<byte[]> getModelDataByteStream(Table modelDataTable) {
                         x -> {
                             LogisticRegressionModelData modelData =
                                     new LogisticRegressionModelData(
-                                            x.getFieldAs(0), x.getFieldAs(1));
+                                            x.getFieldAs(0),
+                                            x.getFieldAs(1),
+                                            x.getFieldAs(2),
+                                            x.getFieldAs(3));
 
                             ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
                             modelData.encode(outputStream);
 
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.ml.classification.logisticregression;
+
+import org.apache.flink.api.common.functions.MapFunction;
+import org.apache.flink.api.common.functions.ReduceFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.api.java.tuple.Tuple3;
+import org.apache.flink.api.java.typeutils.ListTypeInfo;
+import org.apache.flink.ml.api.Estimator;
+import org.apache.flink.ml.common.datastream.DataStreamUtils;
+import org.apache.flink.ml.common.feature.LabeledLargePointWithWeight;
+import org.apache.flink.ml.common.lossfunc.BinaryLogisticLoss;
+import org.apache.flink.ml.common.lossfunc.LossFunc;
+import org.apache.flink.ml.common.ps.training.IterationStageList;
+import org.apache.flink.ml.common.ps.training.ProcessStage;
+import org.apache.flink.ml.common.ps.training.PullStage;
+import org.apache.flink.ml.common.ps.training.PushStage;
+import org.apache.flink.ml.common.ps.training.TrainingContext;
+import org.apache.flink.ml.common.ps.training.TrainingUtils;
+import org.apache.flink.ml.common.updater.FTRL;
+import org.apache.flink.ml.linalg.Vectors;
+import org.apache.flink.ml.param.Param;
+import org.apache.flink.ml.util.ParamUtils;
+import org.apache.flink.ml.util.ReadWriteUtils;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.table.api.Table;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+import org.apache.flink.table.api.internal.TableImpl;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.Preconditions;
+
+import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap;
+import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * An Estimator which implements the large scale logistic regression algorithm using FTRL optimizer.
+ *
+ * <p>Each input row supplies its features as a {@code Tuple2<long[], double[]>} (feature indices
+ * and the matching values), a numeric label that must be 0 or 1, and optionally a numeric weight
+ * (1.0 is used when no weight column is configured).
+ *
+ * <p>See https://en.wikipedia.org/wiki/Logistic_regression.
+ */
+public class LogisticRegressionWithFtrl
+        implements Estimator<LogisticRegressionWithFtrl, LogisticRegressionModel>,
+                LogisticRegressionWithFtrlParams<LogisticRegressionWithFtrl> {
+
+    private final Map<Param<?>, Object> paramMap = new HashMap<>();
+
+    /** Creates an estimator whose params are initialized to their default values. */
+    public LogisticRegressionWithFtrl() {
+        ParamUtils.initializeMapWithDefaultValues(paramMap, this);
+    }
+
+    /**
+     * Trains a binary logistic regression model on the given table.
+     *
+     * <p>Only the "auto" and "binomial" classification types are accepted, and every label must
+     * be 0 or 1; a row with any other label makes the job fail with a {@link RuntimeException}.
+     *
+     * @param inputs exactly one table holding the training data
+     * @return a {@link LogisticRegressionModel} carrying the trained model data and the params of
+     *     this estimator
+     */
+    @Override
+    public LogisticRegressionModel fit(Table... inputs) {
+        Preconditions.checkArgument(inputs.length == 1);
+        String classificationType = getMultiClass();
+        Preconditions.checkArgument(
+                "auto".equals(classificationType) || "binomial".equals(classificationType),
+                "Multinomial classification is not supported yet. Supported options: [auto, binomial].");
+        StreamTableEnvironment tEnv =
+                (StreamTableEnvironment) ((TableImpl) inputs[0]).getTableEnvironment();
+
+        // Read the column names once. The per-record function below then captures three strings
+        // instead of this estimator, and does not repeat the param lookups for every row.
+        final String featuresCol = getFeaturesCol();
+        final String labelCol = getLabelCol();
+        final String weightCol = getWeightCol();
+
+        DataStream<LabeledLargePointWithWeight> trainData =
+                tEnv.toDataStream(inputs[0])
+                        .map(
+                                (MapFunction<Row, LabeledLargePointWithWeight>)
+                                        dataPoint -> {
+                                            double weight =
+                                                    weightCol == null
+                                                            ? 1.0
+                                                            : ((Number)
+                                                                            dataPoint.getField(
+                                                                                    weightCol))
+                                                                    .doubleValue();
+                                            double label =
+                                                    ((Number) dataPoint.getField(labelCol))
+                                                            .doubleValue();
+                                            boolean isBinomial =
+                                                    Double.compare(0., label) == 0
+                                                            || Double.compare(1., label) == 0;
+                                            if (!isBinomial) {
+                                                throw new RuntimeException(
+                                                        "Multinomial classification is not supported yet. Supported options: [auto, binomial].");
+                                            }
+                                            Tuple2<long[], double[]> features =
+                                                    dataPoint.getFieldAs(featuresCol);
+                                            return new LabeledLargePointWithWeight(
+                                                    features, label, weight);
+                                        });
+
+        DataStream<Long> modelDim;
+        if (getModelDim() > 0) {
+            // The dimension was given explicitly.
+            modelDim = trainData.getExecutionEnvironment().fromElements(getModelDim());
+        } else {
+            // Infer the dimension from the data: (largest last feature index) + 1.
+            modelDim =
+                    DataStreamUtils.reduce(
+                                    trainData.map(
+                                            (MapFunction<LabeledLargePointWithWeight, Long>)
+                                                    x -> lastFeatureIndex(x.features.f0)),
+                                    (ReduceFunction<Long>) Math::max)
+                            .map((MapFunction<Long, Long>) value -> value + 1);
+        }
+
+        // One iteration: pick the batch and its indices, pull the matching model values,
+        // compute the gradients and push them back.
+        IterationStageList<LabeledLargePointWithWeight> iterationStages =
+                new IterationStageList<>();
+        iterationStages
+                .addTrainingStage(new ComputeIndices())
+                .addTrainingStage(new PullStage("pullIndices"))
+                .addTrainingStage(new ComputeGradients(BinaryLogisticLoss.INSTANCE))
+                .addTrainingStage(new PushStage("pushGradient"))
+                .setTerminationCriteria(context -> context.getCurrentIterationId() >= getMaxIter());
+
+        FTRL ftrl = new FTRL(getAlpha(), getBeta(), getReg(), getElasticNet());
+
+        DataStream<Tuple3<Long, Long, double[]>> rawModelData =
+                TrainingUtils.<LabeledLargePointWithWeight>train(
+                        modelDim,
+                        trainData,
+                        ftrl,
+                        iterationStages,
+                        getGlobalBatchSize(),
+                        getNumServers(),
+                        getNumServerCores());
+
+        final long modelVersion = 0L;
+
+        // Each raw tuple becomes one model data record: f2 holds the coefficients, f0 and f1 are
+        // passed through unchanged (presumably the index range covered by the piece; confirm
+        // against TrainingUtils#train).
+        DataStream<LogisticRegressionModelData> modelData =
+                rawModelData.map(
+                        tuple3 ->
+                                new LogisticRegressionModelData(
+                                        Vectors.dense(tuple3.f2),
+                                        tuple3.f0,
+                                        tuple3.f1,
+                                        modelVersion));
+
+        LogisticRegressionModel model =
+                new LogisticRegressionModel().setModelData(tEnv.fromDataStream(modelData));
+        ParamUtils.updateExistingParams(model, paramMap);
+        return model;
+    }
+
+    /**
+     * Saves the metadata of this estimator to the given path.
+     *
+     * @param path the location to write to
+     * @throws IOException if the metadata cannot be written
+     */
+    @Override
+    public void save(String path) throws IOException {
+        ReadWriteUtils.saveMetadata(this, path);
+    }
+
+    /**
+     * Loads an estimator whose params were previously saved at the given path.
+     *
+     * @param tEnv the table environment; not used by this method
+     * @param path the location to read from
+     * @return the restored estimator
+     * @throws IOException if the saved params cannot be read
+     */
+    public static LogisticRegressionWithFtrl load(StreamTableEnvironment tEnv, String path)
+            throws IOException {
+        return ReadWriteUtils.loadStageParam(path);
+    }
+
+    @Override
+    public Map<Param<?>, Object> getParamMap() {
+        return paramMap;
+    }
+
+    /**
+     * Returns the last entry of the given feature indices, or -1 when there are none.
+     *
+     * <p>The last entry is treated as the largest index of the sample, which assumes the indices
+     * are sorted in ascending order. Returning -1 for a sample without features keeps the
+     * inferred model dimension unaffected by it instead of failing on an empty array.
+     */
+    private static long lastFeatureIndex(long[] featureIndices) {
+        return featureIndices.length == 0 ? -1L : featureIndices[featureIndices.length - 1];
+    }
+}
+
+/**
+ * A stage that fetches the next batch of training data and derives from it the feature indices
+ * that are needed to compute gradients.
+ */
+class ComputeIndices extends ProcessStage<LabeledLargePointWithWeight> {
+    /**
+     * Reads the next batch from the context and stores both the batch (key "batchData") and its
+     * sorted distinct feature indices (key "pullIndices") back into the context.
+     */
+    @Override
+    public void process(TrainingContext<LabeledLargePointWithWeight> context) throws Exception {
+        final List<LabeledLargePointWithWeight> nextBatch = context.getNextBatchData();
+        final long[] batchIndices = computeIndices(nextBatch);
+
+        // Keeps the batch itself so that it can be read back from the context later.
+        context.put(
+                "batchData",
+                nextBatch,
+                new ListTypeInfo<>(TypeInformation.of(LabeledLargePointWithWeight.class)));
+        // Saves the indices for pull.
+        context.put("pullIndices", batchIndices);
+    }
+
+    /**
+     * Collects the distinct feature indices used by the given data points.
+     *
+     * @param dataPoints the data points to scan
+     * @return the distinct indices in ascending order
+     */
+    public static long[] computeIndices(List<LabeledLargePointWithWeight> dataPoints) {
+        final LongOpenHashSet distinct = new LongOpenHashSet();
+        for (LabeledLargePointWithWeight point : dataPoints) {
+            final long[] pointIndices = point.features.f0;
+            for (int k = 0; k < pointIndices.length; k++) {
+                distinct.add(pointIndices[k]);
+            }
+        }
+
+        // NOTE(review): this copy goes through a boxed iterator; a primitive copy of the set
+        // would avoid the boxing, but is left as is here.
+        final long[] result = new long[distinct.size()];
+        int pos = 0;
+        for (Iterator<Long> it = distinct.iterator(); it.hasNext(); pos++) {
+            result[pos] = it.next();
+        }
+        Arrays.sort(result);
+        return result;
+    }
+}
+
+/**
+ * A Stage that uses the pulled model parameters and batch data to compute the gradients. The
+ * gradients are stored in context for later push.
+ */
+class ComputeGradients extends ProcessStage<LabeledLargePointWithWeight> {
+
+    private final LossFunc lossFunc;
+
+    public ComputeGradients(LossFunc lossFunc) {
+        this.lossFunc = lossFunc;
+    }
+
+    @Override
+    @SuppressWarnings("unchecked")
+    public void process(TrainingContext<LabeledLargePointWithWeight> context) {
+        List<LabeledLargePointWithWeight> batchData =
+                (List<LabeledLargePointWithWeight>) context.get("batchData");
+
+        long[] indices = ComputeIndices.computeIndices(batchData);
+        double[] pulledModelValues = (double[]) context.get("pullIndices");
+        double[] gradients = computeGradient(batchData, indices, pulledModelValues);
+
+        // Saves the gradient for push.
+        context.put("pushGradient", Tuple2.of(indices, gradients));
+    }
+
+    private double[] computeGradient(
+            List<LabeledLargePointWithWeight> batchData,
+            long[] sortedBatchIndices,
+            double[] pulledModelValues) {
+        Long2DoubleOpenHashMap coefficient = new Long2DoubleOpenHashMap(sortedBatchIndices.length);
+        for (int i = 0; i < sortedBatchIndices.length; i++) {
+            coefficient.put(sortedBatchIndices[i], pulledModelValues[i]);
+        }
+        Long2DoubleOpenHashMap cumGradients = new Long2DoubleOpenHashMap(sortedBatchIndices.length);
+
+        for (LabeledLargePointWithWeight dataPoint : batchData) {
+            double dot = dot(dataPoint.features, coefficient);
+            lossFunc.computeGradientWithDot(dataPoint, coefficient, cumGradients, dot);
+        }
+        double[] cumGradientValues = new double[sortedBatchIndices.length];
+        for (int i = 0; i < sortedBatchIndices.length; i++) {
+            cumGradientValues[i] = cumGradients.get(sortedBatchIndices[i]);
+        }
+        return cumGradientValues;
+    }
+
+    private static double dot(
+            Tuple2<long[], double[]> features, Long2DoubleOpenHashMap coefficient) {
+        double dot = 0;
+        for (int i = 0; i < features.f0.length; i++) {
+            dot += features.f1[i] * coefficient.get(features.f0[i]);
+        }
+        return dot;
+    }
+}