From aec64860e68c16cdacd2ea02fc86c936d01e3def Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Mon, 10 Feb 2020 23:03:45 -0800 Subject: [PATCH 01/11] test commit --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d9b9a4cc..be4eadac 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Photon Machine Learning (Photon ML) + [![Build Status](https://travis-ci.org/linkedin/photon-ml.svg?branch=master)](https://travis-ci.org/linkedin/photon-ml) **Check out our [hands-on tutorial](https://github.com/linkedin/photon-ml/wiki/Photon-ML-Tutorial).** From 0df72a003e8b920697dda5e56a25c39bb299f94e Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Tue, 11 Feb 2020 16:41:19 -0800 Subject: [PATCH 02/11] Migrate to Dataframe Training datasets will be created directly before training a coordinate. FixedEffectDataset is merged into FixedEffectCoordinate; RandomEffectDataset is merged into RandomEffectCoordinate; Random effect vector projection will be disabled --- .../com/linkedin/photon/ml/Constants.scala | 36 + .../ml/algorithm/CoordinateFactory.scala | 53 +- .../ml/algorithm/FixedEffectCoordinate.scala | 137 ++-- .../FixedEffectModelCoordinate.scala | 4 +- .../photon/ml/algorithm/ModelProjection.scala | 85 --- .../ml/algorithm/RandomEffectCoordinate.scala | 225 +++--- .../RandomEffectModelCoordinate.scala | 9 +- .../photon/ml/data/FixedEffectDataset.scala | 156 ----- .../photon/ml/data/GameConverters.scala | 13 +- .../photon/ml/data/LocalDataset.scala | 289 +------- .../photon/ml/data/RandomEffectDataset.scala | 647 ------------------ .../photon/ml/estimators/GameEstimator.scala | 151 +--- .../DistributedOptimizationProblem.scala | 38 +- ...GeneralizedLinearOptimizationProblem.scala | 11 +- .../RandomEffectOptimizationTracker.scala | 6 +- .../SingleNodeOptimizationProblem.scala | 24 +- .../RandomEffectOptimizationProblem.scala | 43 +- .../com/linkedin/photon/ml/util/Utils.scala | 23 +- .../game/training/GameTrainingDriver.scala | 22 +- 
.../photon/ml/algorithm/Coordinate.scala | 95 +-- .../photon/ml/algorithm/ModelCoordinate.scala | 13 +- .../com/linkedin/photon/ml/data/Dataset.scala | 32 - .../photon/ml/optimization/Optimizer.scala | 4 +- .../photon/ml/sampling/DownSampler.scala | 4 +- 24 files changed, 444 insertions(+), 1676 deletions(-) create mode 100644 photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala delete mode 100644 photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/ModelProjection.scala delete mode 100644 photon-api/src/main/scala/com/linkedin/photon/ml/data/FixedEffectDataset.scala delete mode 100644 photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDataset.scala rename {photon-client => photon-api}/src/main/scala/com/linkedin/photon/ml/util/Utils.scala (92%) delete mode 100644 photon-lib/src/main/scala/com/linkedin/photon/ml/data/Dataset.scala diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala new file mode 100644 index 00000000..2da28880 --- /dev/null +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala @@ -0,0 +1,36 @@ +package com.linkedin.photon.ml + +import org.joda.time.DateTimeZone + +import com.linkedin.photon.ml.util.Utils + +/** + * Some commonly used String constants. + */ +object Constants { + + /** + * Delimiter used to concatenate feature name and term into feature key. + * + * WARNING: This is not visible in println! + */ + val DELIMITER = "\u0001" + + /** + * Wildcard character used for specifying the feature constraints. Only the term is allowed to be a wildcard normally + * unless one wants to apply bounds to all features in which case both name and term can be specified as wildcards. + * Currently, we do not support wildcards in name alone. 
+ */ + val WILDCARD = "*" + + val INTERCEPT_NAME = "(INTERCEPT)" + val INTERCEPT_TERM = "" + val INTERCEPT_KEY = Utils.getFeatureKey(INTERCEPT_NAME, INTERCEPT_TERM) + + /** + * Default time zone for relative date calculations + */ + val DEFAULT_TIME_ZONE = DateTimeZone.UTC + + val UNIQUE_SAMPLE_ID = "uniqueId" +} \ No newline at end of file diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala index 95d8bb28..943c16bd 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala @@ -14,9 +14,12 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.{Dataset, FixedEffectDataset, RandomEffectDataset} -import com.linkedin.photon.ml.function.ObjectiveFunctionHelper.{DistributedObjectiveFunctionFactory, ObjectiveFunctionFactoryFactory, SingleNodeObjectiveFunctionFactory} +import org.apache.spark.sql.{DataFrame, SparkSession} + +import com.linkedin.photon.ml.Types.{FeatureShardId, REType} +import com.linkedin.photon.ml.data.InputColumnsNames import com.linkedin.photon.ml.function.ObjectiveFunction +import com.linkedin.photon.ml.function.ObjectiveFunctionHelper.{DistributedObjectiveFunctionFactory, ObjectiveFunctionFactoryFactory, SingleNodeObjectiveFunctionFactory} import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.normalization.NormalizationContext import com.linkedin.photon.ml.optimization.DistributedOptimizationProblem @@ -34,11 +37,12 @@ import com.linkedin.photon.ml.util.PhotonBroadcast object CoordinateFactory { /** - * Creates a [[Coordinate]] of the appropriate type, given the input [[Dataset]], + * Creates a [[Coordinate]] of the appropriate type, given the input data set, * [[CoordinateOptimizationConfiguration]], and [[ObjectiveFunction]]. 
* - * @tparam D Some type of [[Dataset]] * @param dataset The input data to use for training + * @param featureShardId The ID of the feature shard whose features are used by the coordinate + * @param inputColumnsNames The names of the input data columns (e.g. response, offset, weight) * @param coordinateOptConfig The optimization settings for training * @param lossFunctionFactoryConstructor A constructor for the loss function factory function * @param glmConstructor A constructor for the type of [[GeneralizedLinearModel]] being trained @@ -46,61 +50,76 @@ object CoordinateFactory { * @param normalizationContext The [[NormalizationContext]] * @param varianceComputationType Should the trained coefficient variances be computed in addition to the means? * @param interceptIndexOpt The index of the intercept, if one is present - * @return A [[Coordinate]] for the [[Dataset]] of type [[D]] + * @param rETypeOpt The random effect type, if building a random effect coordinate (None for a fixed effect) + * @return A [[Coordinate]] instance */ - def build[D <: Dataset[D]]( - dataset: D, + def build( + dataset: DataFrame, + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames, coordinateOptConfig: CoordinateOptimizationConfiguration, lossFunctionFactoryConstructor: ObjectiveFunctionFactoryFactory, glmConstructor: Coefficients => GeneralizedLinearModel, downSamplerFactory: DownSamplerFactory, normalizationContext: NormalizationContext, varianceComputationType: VarianceComputationType, - interceptIndexOpt: Option[Int]): Coordinate[D] = { + interceptIndexOpt: Option[Int], + rETypeOpt: Option[REType]): Coordinate = { val lossFunctionFactory = lossFunctionFactoryConstructor(coordinateOptConfig) - (dataset, coordinateOptConfig, lossFunctionFactory) match { + val datasetName: String = if (rETypeOpt.isDefined) "random-effect" else "fixed-effect" + (rETypeOpt, coordinateOptConfig, lossFunctionFactory) match { case ( - fEDataset: FixedEffectDataset, + None, fEOptConfig: FixedEffectOptimizationConfiguration, distributedLossFunctionFactory: DistributedObjectiveFunctionFactory) => + // Fixed effect: no random effect type was given val downSamplerOpt = if (DownSampler.isValidDownSamplingRate(fEOptConfig.downSamplingRate)) {
Some(downSamplerFactory(fEOptConfig.downSamplingRate)) } else { None } - val normalizationPhotonBroadcast = PhotonBroadcast(fEDataset.sparkContext.broadcast(normalizationContext)) + val normalizationPhotonBroadcast = PhotonBroadcast( + SparkSession.builder.getOrCreate.sparkContext + .broadcast(normalizationContext)) new FixedEffectCoordinate( - fEDataset, + dataset, DistributedOptimizationProblem( fEOptConfig, distributedLossFunctionFactory(interceptIndexOpt), downSamplerOpt, glmConstructor, normalizationPhotonBroadcast, - varianceComputationType)).asInstanceOf[Coordinate[D]] + varianceComputationType), + featureShardId, + inputColumnsNames).asInstanceOf[Coordinate] case ( - rEDataset: RandomEffectDataset, + Some(rEType), rEOptConfig: RandomEffectOptimizationConfiguration, singleNodeLossFunctionFactory: SingleNodeObjectiveFunctionFactory) => + // Random effect: rEType names the random effect type + RandomEffectCoordinate( - rEDataset, + dataset, + rEType, + featureShardId, + inputColumnsNames, rEOptConfig, singleNodeLossFunctionFactory, glmConstructor, normalizationContext, varianceComputationType, - interceptIndexOpt).asInstanceOf[Coordinate[D]] + interceptIndexOpt).asInstanceOf[Coordinate] case _ => throw new UnsupportedOperationException( s"""Cannot build coordinate for the following input class combination: - | ${dataset.getClass.getName} + | ${datasetName} | ${coordinateOptConfig.getClass.getName} | ${lossFunctionFactory.getClass.getName}""".stripMargin) } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala index 201691f4..02ba85ab 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala @@ -14,54 +14,57 @@ */ package com.linkedin.photon.ml.algorithm -import org.apache.spark.rdd.RDD +import
org.apache.spark.ml.linalg.{Vector => SparkVector} +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.types.{DataTypes, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.storage.StorageLevel +import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} import com.linkedin.photon.ml.data._ import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.function.DistributedObjectiveFunction import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} import com.linkedin.photon.ml.optimization.{DistributedOptimizationProblem, FixedEffectOptimizationTracker, OptimizationTracker} +import com.linkedin.photon.ml.util.VectorUtils + /** * The optimization problem coordinate for a fixed effect model. * * @tparam Objective The type of objective function used to solve the fixed effect optimization problem - * @param dataset The training dataset + * @param rawData The raw training data * @param optimizationProblem The fixed effect optimization problem */ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunction]( - override protected val dataset: FixedEffectDataset, - optimizationProblem: DistributedOptimizationProblem[Objective]) - extends Coordinate[FixedEffectDataset](dataset) { - - /** - * Update the coordinate with a new dataset. - * - * @param dataset The updated dataset - * @return A new coordinate with the updated dataset - */ - override protected[algorithm] def updateCoordinateWithDataset( - dataset: FixedEffectDataset): FixedEffectCoordinate[Objective] = - new FixedEffectCoordinate[Objective](dataset, optimizationProblem) - - /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. 
- * - * @return A (updated model, optimization state tracking information) tuple - */ - override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) = { - - val updatedFixedEffectModel = FixedEffectCoordinate.trainModel( - dataset.labeledPoints, - optimizationProblem, - dataset.featureShardId, - None) - val optimizationTracker = new FixedEffectOptimizationTracker(optimizationProblem.getStatesTracker) - - (updatedFixedEffectModel, optimizationTracker) + rawData: DataFrame, + optimizationProblem: DistributedOptimizationProblem[Objective], + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames) + extends Coordinate { + + var dataset: DataFrame = + rawData.select(featureShardId, inputColumnsNames(InputColumnsNames.RESPONSE)) + + + override protected def updateDataset(scores: CoordinateDataScores) = { + // TODO: change scores to dataframe + val schemaFields = Array[StructField]( + StructField(Constants.UNIQUE_SAMPLE_ID, DataTypes.LongType, nullable = false), + StructField("score", DataTypes.DoubleType, nullable = false)) + dataset = SparkSession + .builder + .getOrCreate + .createDataFrame(scores.scoresRdd.map(Row.fromTuple(_)), new StructType(schemaFields)) + .join(rawData, Constants.UNIQUE_SAMPLE_ID) + // TODO: WHAT IF OFFSET DOESN'T EXIST + //.withColumnRenamed("score", inputColumnsNames(InputColumnsNames.OFFSET)) + .withColumn(inputColumnsNames(InputColumnsNames.OFFSET), + col(inputColumnsNames(InputColumnsNames.OFFSET)) + col("score")) } + /** * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as * a starting point. 
@@ -72,20 +75,18 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct override protected[algorithm] def trainModel(model: DatumScoringModel): (DatumScoringModel, OptimizationTracker) = model match { case fixedEffectModel: FixedEffectModel => - val updatedFixedEffectModel = FixedEffectCoordinate.trainModel( - dataset.labeledPoints, + FixedEffectCoordinate.trainModel( + dataset, optimizationProblem, - dataset.featureShardId, - Some(fixedEffectModel)) - val optimizationTracker = new FixedEffectOptimizationTracker(optimizationProblem.getStatesTracker) - - (updatedFixedEffectModel, optimizationTracker) + featureShardId, + Some(fixedEffectModel)) case _ => throw new UnsupportedOperationException( s"Training model of type ${model.getClass} in ${this.getClass} is not supported") } + /** * Compute scores for the coordinate dataset using the given model. * @@ -95,12 +96,21 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = model match { case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.score(dataset, fixedEffectModel) + FixedEffectCoordinate.score(dataset, fixedEffectModel) case _ => throw new UnsupportedOperationException( s"Scoring with model of type ${model.getClass} in ${this.getClass} is not supported") } + + + /** + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. + * + * @return A (updated model, optimization state tracking information) tuple + */ + override protected def trainModel(): (DatumScoringModel, OptimizationTracker) = + FixedEffectCoordinate.trainModel(dataset, optimizationProblem, featureShardId, None) } object FixedEffectCoordinate { @@ -108,7 +118,7 @@ object FixedEffectCoordinate { /** * Train a new [[FixedEffectModel]] (i.e. run model optimization).
* - * @param input The training dataset + * @param dataset The training dataset * @param optimizationProblem The optimization problem * @param featureShardId The ID of the feature shard for the training data * @param initialFixedEffectModelOpt An optional existing [[FixedEffectModel]] to use as a starting point for @@ -116,23 +126,36 @@ object FixedEffectCoordinate { * @return A new [[FixedEffectModel]] */ private def trainModel[Function <: DistributedObjectiveFunction]( - input: RDD[(UniqueSampleId, LabeledPoint)], - optimizationProblem: DistributedOptimizationProblem[Function], - featureShardId: FeatureShardId, - initialFixedEffectModelOpt: Option[FixedEffectModel]): FixedEffectModel = { + dataset: DataFrame, + optimizationProblem: DistributedOptimizationProblem[Function], + featureShardId: FeatureShardId, + initialFixedEffectModelOpt: Option[FixedEffectModel]): (FixedEffectModel, OptimizationTracker) = { + + val rdd = dataset + .rdd + .map { row => + val features = row.getAs[SparkVector](0) + val label = row.getDouble(1) + + LabeledPoint(label, VectorUtils.mlToBreeze(features)) + } + rdd.persist(StorageLevel.MEMORY_ONLY) - val newModel = initialFixedEffectModelOpt + val (glm, stateTracker) = initialFixedEffectModelOpt .map { initialFixedEffectModel => - optimizationProblem.runWithSampling(input, initialFixedEffectModel.model) + optimizationProblem.runWithSampling(rdd, initialFixedEffectModel.model) } - .getOrElse(optimizationProblem.runWithSampling(input)) - val updatedModelBroadcast = input.sparkContext.broadcast(newModel) + .getOrElse(optimizationProblem.runWithSampling(rdd)) + + rdd.unpersist() - new FixedEffectModel(updatedModelBroadcast, featureShardId) + (new FixedEffectModel(SparkSession.builder.getOrCreate.sparkContext.broadcast(glm), featureShardId), + new FixedEffectOptimizationTracker(stateTracker)) } + /** - * Score a [[FixedEffectDataset]] using a given [[FixedEffectModel]]. + * Score a dataset using a given [[FixedEffectModel]]. 
* * @note The score is the dot product of the model coefficients with the feature values (i.e., it does not go * through a non-linear link function). @@ -141,14 +164,12 @@ object FixedEffectCoordinate { * @return The computed scores */ protected[algorithm] def score( - fixedEffectDataset: FixedEffectDataset, - fixedEffectModel: FixedEffectModel): CoordinateDataScores = { - - val modelBroadcast = fixedEffectModel.modelBroadcast - val scores = fixedEffectDataset.labeledPoints.mapValues { case LabeledPoint(_, features, _, _) => - modelBroadcast.value.computeScore(features) - } + fixedEffectDataset: DataFrame, + fixedEffectModel: FixedEffectModel): CoordinateDataScores = { - new CoordinateDataScores(scores) + //val modelBroadcast = fixedEffectModel.modelBroadcast + //val scores = fixedEffectDataset.mapValues { features => modelBroadcast.value.computeScore(features)} + //new CoordinateDataScores(scores) + null } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala index 19355f0f..1316357d 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala @@ -14,7 +14,7 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.FixedEffectDataset +import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} @@ -23,7 +23,7 @@ import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} * * @param dataset The training dataset */ -class FixedEffectModelCoordinate(dataset: FixedEffectDataset) extends ModelCoordinate(dataset) { +class FixedEffectModelCoordinate(dataset: DataFrame) extends ModelCoordinate { /** * Score the effect-specific dataset in 
the coordinate with the input model. diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/ModelProjection.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/ModelProjection.scala deleted file mode 100644 index 503df9d3..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/ModelProjection.scala +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.algorithm - -import com.linkedin.photon.ml.data.RandomEffectDataset -import com.linkedin.photon.ml.model.{Coefficients, RandomEffectModel} - -/** - * Trait to encapsulate [[RandomEffectModel]] projection. Needed as the random effects have their feature space - * collapsed to reduce the amount of memory used and training time. - */ -trait ModelProjection extends Coordinate[RandomEffectDataset] { - - /** - * Project a [[RandomEffectModel]] from the original space to the projected space. 
- * - * @param randomEffectModel The [[RandomEffectModel]] in the original space - * @return The same [[RandomEffectModel]] in the projected space - */ - protected[algorithm] def projectModelForward(randomEffectModel: RandomEffectModel): RandomEffectModel = { - - // Left join the models to projectors for cases where we have a prior model but no new model (and hence no - // projectors) - val linearSubspaceProjectorsRDD = dataset.projectors - val newModels = randomEffectModel - .modelsRDD - .leftOuterJoin(linearSubspaceProjectorsRDD) - .mapValues { case (model, projectorOpt) => - projectorOpt - .map { projector => - val oldCoefficients = model.coefficients - val newCoefficients = Coefficients( - projector.projectForward(oldCoefficients.means), - oldCoefficients.variancesOption.map(projector.projectForward)) - - model.updateCoefficients(newCoefficients) - } - .getOrElse(model) - } - - randomEffectModel.update(newModels) - } - - /** - * Project a [[RandomEffectModel]] from the projected space to the original space. 
- * - * @param randomEffectModel The [[RandomEffectModel]] in the projected space - * @return The same [[RandomEffectModel]] in the original space - */ - protected[algorithm] def projectModelBackward(randomEffectModel: RandomEffectModel): RandomEffectModel = { - - // Left join the models to projectors for cases where we have a prior model but no new model (and hence no - // projectors) - val linearSubspaceProjectorsRDD = dataset.projectors - val newModels = randomEffectModel - .modelsRDD - .leftOuterJoin(linearSubspaceProjectorsRDD) - .mapValues { case (model, projectorOpt) => - projectorOpt - .map { projector => - val oldCoefficients = model.coefficients - val newCoefficients = Coefficients( - projector.projectBackward(oldCoefficients.means), - oldCoefficients.variancesOption.map(projector.projectBackward)) - - model.updateCoefficients(newCoefficients) - } - .getOrElse(model) - } - - randomEffectModel.update(newModels) - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala index 8afa2949..669b7804 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala @@ -14,60 +14,80 @@ */ package com.linkedin.photon.ml.algorithm +import scala.collection.mutable + import org.apache.spark.SparkContext +import org.apache.spark.ml.linalg.{Vector => SparkVector} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{DataTypes, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession, functions} import org.apache.spark.storage.StorageLevel - +import org.apache.spark.sql.functions.col +import com.linkedin.photon.ml.normalization.NormalizationContext +import com.linkedin.photon.ml.Constants +import com.linkedin.photon.ml.Types.{FeatureShardId, REType} import 
com.linkedin.photon.ml.data._ import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.function.SingleNodeObjectiveFunction import com.linkedin.photon.ml.model.{Coefficients, DatumScoringModel, RandomEffectModel} -import com.linkedin.photon.ml.normalization.NormalizationContext +import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType import com.linkedin.photon.ml.optimization.game.{RandomEffectOptimizationConfiguration, RandomEffectOptimizationProblem} import com.linkedin.photon.ml.optimization._ -import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType import com.linkedin.photon.ml.spark.RDDLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel - +import com.linkedin.photon.ml.util.VectorUtils /** * The optimization problem coordinate for a random effect model. * + * @param rEType + * @param rawData The raw training dataframe * @tparam Objective The type of objective function used to solve individual random effect optimization problems - * @param dataset The training dataset * @param optimizationProblem The random effect optimization problem + * @param inputColumnsNames */ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunction]( - override protected val dataset: RandomEffectDataset, - protected val optimizationProblem: RandomEffectOptimizationProblem[Objective]) - extends Coordinate[RandomEffectDataset](dataset) - with ModelProjection + rEType: REType, + rawData: DataFrame, + optimizationProblem: RandomEffectOptimizationProblem[Objective], + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames) + extends Coordinate with RDDLike { + /* Get the training data from raw data */ + var dataset: DataFrame = { + val label = inputColumnsNames(InputColumnsNames.RESPONSE) + val offset = inputColumnsNames(InputColumnsNames.OFFSET) + val weight = inputColumnsNames(InputColumnsNames.WEIGHT) + + rawData + 
.select(rEType, featureShardId, label, offset, weight) + .groupBy(rEType) + .agg( + functions.collect_list(featureShardId), + functions.collect_list(label), + functions.collect_list(offset), + functions.collect_list(weight)) + } + // // Coordinate functions // - - /** - * Update the coordinate with a new [[RandomEffectDataset]]. - * - * @param dataset The updated [[RandomEffectDataset]] - * @return A new coordinate with the updated [[RandomEffectDataset]] - */ - override protected[algorithm] def updateCoordinateWithDataset( - dataset: RandomEffectDataset): RandomEffectCoordinate[Objective] = - new RandomEffectCoordinate(dataset, optimizationProblem) - - - /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. - * - * @return A (updated model, optional optimization tracking information) tuple - */ - override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) = { - - val (newModel, optimizationTracker) = RandomEffectCoordinate.trainModel(dataset, optimizationProblem, None) - - (projectModelBackward(newModel), optimizationTracker) + override protected def updateDataset(scores: CoordinateDataScores) = { + + // TODO: change scores to dataframe + val schemaFields = Array[StructField]( + StructField(Constants.UNIQUE_SAMPLE_ID, DataTypes.LongType, nullable = false), + StructField("score", DataTypes.DoubleType, nullable = false)) + dataset = SparkSession + .builder + .getOrCreate + .createDataFrame(scores.scoresRdd.map(Row.fromTuple(_)), new StructType(schemaFields)) + .join(rawData, Constants.UNIQUE_SAMPLE_ID) + // TODO: WHAT IF OFFSET DOESN'T EXIST + //.withColumnRenamed("score", inputColumnsNames(InputColumnsNames.OFFSET)) + .withColumn(inputColumnsNames(InputColumnsNames.OFFSET), + col(inputColumnsNames(InputColumnsNames.OFFSET)) + col("score")) } /** @@ -84,16 +104,34 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct case randomEffectModel: RandomEffectModel => val 
(newModel, optimizationTracker) = RandomEffectCoordinate.trainModel( dataset, + rEType, + featureShardId, optimizationProblem, - Some(projectModelForward(randomEffectModel))) - - (projectModelBackward(newModel), optimizationTracker) + Some(randomEffectModel)) + (newModel, optimizationTracker) case _ => throw new UnsupportedOperationException( s"Updating model of type ${model.getClass} in ${this.getClass} is not supported") } + /** + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. + * + * @return A (updated model, optimization state tracking information) tuple + */ + override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) = { + + val (newModel, optimizationTracker) = RandomEffectCoordinate.trainModel( + dataset, + rEType, + featureShardId, + optimizationProblem, + None) + + (newModel, optimizationTracker) + } + /** * Compute scores for the coordinate data using a given model. * @@ -101,9 +139,8 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct * @return The dataset scores */ override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = model match { - case randomEffectModel: RandomEffectModel => - RandomEffectCoordinate.score(dataset, projectModelForward(randomEffectModel)) + RandomEffectCoordinate.score(dataset, randomEffectModel) case _ => throw new UnsupportedOperationException( @@ -172,6 +209,7 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct this } + } object RandomEffectCoordinate { @@ -179,9 +217,11 @@ object RandomEffectCoordinate { /** * Helper function to construct [[RandomEffectCoordinate]] objects. 
* - * @tparam RandomEffectObjective The type of objective function used to solve individual random effect optimization - * problems - * @param randomEffectDataset The data on which to run the optimization algorithm + * @tparam RandomEffectObjective The type of objective function used to solve individual random effect optimization problems + * @param data The data on which to run the optimization algorithm + * @param rEType + * @param featureShardId + * @param inputColumnsNames * @param configuration The optimization problem configuration * @param objectiveFunctionFactory The objective function to optimize * @param glmConstructor The function to use for producing GLMs from trained coefficients @@ -191,17 +231,21 @@ object RandomEffectCoordinate { * @return A new [[RandomEffectCoordinate]] object */ protected[ml] def apply[RandomEffectObjective <: SingleNodeObjectiveFunction]( - randomEffectDataset: RandomEffectDataset, - configuration: RandomEffectOptimizationConfiguration, - objectiveFunctionFactory: Option[Int] => RandomEffectObjective, - glmConstructor: Coefficients => GeneralizedLinearModel, - normalizationContext: NormalizationContext, - varianceComputationType: VarianceComputationType = VarianceComputationType.NONE, - interceptIndexOpt: Option[Int] = None): RandomEffectCoordinate[RandomEffectObjective] = { + data: DataFrame, + rEType: REType, + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames, + configuration: RandomEffectOptimizationConfiguration, + objectiveFunctionFactory: Option[Int] => RandomEffectObjective, + glmConstructor: Coefficients => GeneralizedLinearModel, + normalizationContext: NormalizationContext, + varianceComputationType: VarianceComputationType, + interceptIndexOpt: Option[Int] = None): RandomEffectCoordinate[RandomEffectObjective] = { // Generate parameters of ProjectedRandomEffectCoordinate val randomEffectOptimizationProblem = RandomEffectOptimizationProblem( - randomEffectDataset.projectors, + data, + rEType, 
configuration, objectiveFunctionFactory, glmConstructor, @@ -209,7 +253,7 @@ object RandomEffectCoordinate { varianceComputationType, interceptIndexOpt) - new RandomEffectCoordinate(randomEffectDataset, randomEffectOptimizationProblem) + new RandomEffectCoordinate(rEType, data, randomEffectOptimizationProblem, featureShardId, inputColumnsNames) } /** @@ -217,26 +261,52 @@ object RandomEffectCoordinate { * * @tparam Function The type of objective function used to solve individual random effect optimization problems * @param randomEffectDataset The training dataset + * @param randomEffectType + * @param featureShardId * @param randomEffectOptimizationProblem The per-entity optimization problems * @param initialRandomEffectModelOpt An optional existing [[RandomEffectModel]] to use as a starting point for * optimization * @return A (new [[RandomEffectModel]], optional optimization stats) tuple */ protected[algorithm] def trainModel[Function <: SingleNodeObjectiveFunction]( - randomEffectDataset: RandomEffectDataset, - randomEffectOptimizationProblem: RandomEffectOptimizationProblem[Function], - initialRandomEffectModelOpt: Option[RandomEffectModel]): (RandomEffectModel, RandomEffectOptimizationTracker) = { + randomEffectDataset: DataFrame, + randomEffectType:REType, + featureShardId: FeatureShardId, + randomEffectOptimizationProblem: RandomEffectOptimizationProblem[Function], + initialRandomEffectModelOpt: Option[RandomEffectModel]): (RandomEffectModel, RandomEffectOptimizationTracker) = { + + val rdd = randomEffectDataset + .rdd + .map { row => + val reid = row.getInt(0).toString + val features = row.getList[SparkVector](1) + val labels = row.getList[Double](2) + val offsets = row.getList[Double](3) + val weights = row.getList[Double](4) + + val fIter = features.iterator() + val lIter = labels.iterator() + val oIter = offsets.iterator() + val wIter = weights.iterator() + + require(features.size == labels.size) + require(features.size == offsets.size) + 
require(features.size == weights.size) + + val result = new mutable.ArrayBuffer[LabeledPoint](features.size) + + (0 until features.size).map { _ => + result += LabeledPoint(lIter.next(), VectorUtils.mlToBreeze(fIter.next()), oIter.next(), wIter.next()) + } - // All 3 RDDs involved in the joins below use the same partitioner + (reid, LocalDataset(result.toArray)) + } - // Optimization problems are created for each entity with a projector, and thus guaranteed to match active data - // exactly (see RandomEffectDataset.apply) - val dataAndOptimizationProblems = randomEffectDataset - .activeData - .join(randomEffectOptimizationProblem.optimizationProblems) + // TODO: remove pre-REID optimization problems + // All 3 RDDs involved in these joins use the same partitioner + val dataAndOptimizationProblems = rdd.join(randomEffectOptimizationProblem.optimizationProblems) - // Left join the models to the (data, optimization problem) tuple for cases where we have a prior model but no new - // data + // Left join the models to data and optimization problems for cases where we have a prior model but no new data val (newModels, randomEffectOptimizationTracker) = initialRandomEffectModelOpt .map { randomEffectModel => val modelsAndTrackers = randomEffectModel @@ -244,9 +314,8 @@ object RandomEffectCoordinate { .leftOuterJoin(dataAndOptimizationProblems) .mapValues { case (localModel, Some((localDataset, optimizationProblem))) => - val trainingLabeledPoints = localDataset.dataPoints.map(_._2) - val updatedModel = optimizationProblem.run(trainingLabeledPoints, localModel) - val stateTrackers = optimizationProblem.getStatesTracker + val trainingLabeledPoints = localDataset.dataPoints + val (updatedModel, stateTrackers) = optimizationProblem.run(trainingLabeledPoints, localModel) (updatedModel, Some(stateTrackers)) @@ -261,45 +330,44 @@ object RandomEffectCoordinate { (models, optimizationTracker) } .getOrElse { - val modelsAndTrackers = dataAndOptimizationProblems.mapValues { case 
(localDataset, optimizationProblem) => - val trainingLabeledPoints = localDataset.dataPoints.map(_._2) - val newModel = optimizationProblem.run(trainingLabeledPoints) - val stateTrackers = optimizationProblem.getStatesTracker - - (newModel, stateTrackers) - } - modelsAndTrackers.persist(StorageLevel.MEMORY_ONLY_SER) + val modelsAndTrackers = dataAndOptimizationProblems + .mapValues { case (localDataset, optimizationProblem) => + val trainingLabeledPoints = localDataset.dataPoints + optimizationProblem.run(trainingLabeledPoints) + } + modelsAndTrackers.persist(StorageLevel.MEMORY_AND_DISK_SER) val models = modelsAndTrackers.mapValues(_._1) val optimizationTracker = RandomEffectOptimizationTracker(modelsAndTrackers.map(_._2._2)) - (models, optimizationTracker) } val newRandomEffectModel = new RandomEffectModel( newModels, - randomEffectDataset.randomEffectType, - randomEffectDataset.featureShardId) + randomEffectType, + featureShardId) (newRandomEffectModel, randomEffectOptimizationTracker) } + /** - * Score a [[RandomEffectDataset]] using a given [[RandomEffectModel]]. + * Score a dataset using a given [[RandomEffectModel]]. * - * For information about the differences between active and passive data, see the [[RandomEffectDataset]] + * For information about the differences between active and passive data * documentation. * * @note The score is the raw dot product of the model coefficients and the feature values - it does not go through a * non-linear link function. - * @param randomEffectDataset The [[RandomEffectDataset]] to score + * @param randomEffectDataset The data set to score * @param randomEffectModel The [[RandomEffectModel]] with which to score * @return The computed scores */ protected[algorithm] def score( - randomEffectDataset: RandomEffectDataset, + randomEffectDataset: DataFrame, randomEffectModel: RandomEffectModel): CoordinateDataScores = { + /* // There may be more models than active data. 
However, since we're computing residuals for future coordinates, no // data means no residual. Therefore, we use an inner join. Note that the active data and models use the same // partitioner, but scores need to use GameDatum partitioner. @@ -329,5 +397,8 @@ object RandomEffectCoordinate { } new CoordinateDataScores(activeScores ++ passiveScores) + + */ + return null } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala index 58543b38..b30bf030 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala @@ -14,7 +14,7 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.RandomEffectDataset +import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.{DatumScoringModel, RandomEffectModel} @@ -23,9 +23,8 @@ import com.linkedin.photon.ml.model.{DatumScoringModel, RandomEffectModel} * * @param dataset The training dataset */ -class RandomEffectModelCoordinate(dataset: RandomEffectDataset) - extends ModelCoordinate(dataset) - with ModelProjection { +class RandomEffectModelCoordinate(dataset: DataFrame) + extends ModelCoordinate { /** * Score the effect-specific dataset in the coordinate with the input model. 
@@ -36,7 +35,7 @@ class RandomEffectModelCoordinate(dataset: RandomEffectDataset) override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = { model match { case randomEffectModel: RandomEffectModel => - RandomEffectCoordinate.score(dataset, projectModelForward(randomEffectModel)) + RandomEffectCoordinate.score(dataset, randomEffectModel) case _ => throw new UnsupportedOperationException( diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/FixedEffectDataset.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/FixedEffectDataset.scala deleted file mode 100644 index 5c7154e1..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/FixedEffectDataset.scala +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.storage.StorageLevel - -import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores -import com.linkedin.photon.ml.spark.RDDLike - -/** - * Dataset implementation for fixed effect datasets. 
- * - * @param labeledPoints The input data - * @param featureShardId The feature shard id - */ -protected[ml] class FixedEffectDataset( - val labeledPoints: RDD[(UniqueSampleId, LabeledPoint)], - val featureShardId: FeatureShardId) - extends Dataset[FixedEffectDataset] - with RDDLike { - - lazy val numFeatures: Int = labeledPoints.first()._2.features.length - - /** - * Add scores to data offsets. - * - * @param scores The scores used throughout the coordinate descent algorithm - * @return An updated dataset with scores added to offsets - */ - override def addScoresToOffsets(scores: CoordinateDataScores): FixedEffectDataset = { - - // It's possible that other coordinates did not score some data. Since we're trying to add scores to the offset and - // the default score is 0, the result of a left join vs. an inner join is the same. However, an inner join will drop - // data which does not have a score. Thus, we need a left join. - val updatedLabeledPoints = labeledPoints - .leftOuterJoin(scores.scoresRdd) - .mapValues { case (LabeledPoint(label, features, offset, weight), scoreOpt) => - LabeledPoint(label, features, offset + scoreOpt.getOrElse(MathConst.DEFAULT_SCORE), weight) - } - - new FixedEffectDataset(updatedLabeledPoints, featureShardId) - } - - /** - * Get the Spark context. - * - * @return The Spark context - */ - override def sparkContext: SparkContext = labeledPoints.sparkContext - - /** - * Assign a given name to [[labeledPoints]]. - * - * @note Not used to reference models in the logic of photon-ml, only used for logging currently. - * @param name The parent name for all [[RDD]]s in this class - * @return This object with the name of [[labeledPoints]] assigned - */ - override def setName(name: String): FixedEffectDataset = { - - labeledPoints.setName(name) - - this - } - - /** - * Set the storage level of [[labeledPoints]], and persist their values across the cluster the first time they are - * computed. 
- * - * @param storageLevel The storage level - * @return This object with the storage level of [[labeledPoints]] set - */ - override def persistRDD(storageLevel: StorageLevel): FixedEffectDataset = { - - if (!labeledPoints.getStorageLevel.isValid) labeledPoints.persist(storageLevel) - - this - } - - /** - * Mark [[labeledPoints]] as non-persistent, and remove all blocks for them from memory and disk. - * - * @return This object with [[labeledPoints]] marked non-persistent - */ - override def unpersistRDD(): FixedEffectDataset = { - - if (labeledPoints.getStorageLevel.isValid) labeledPoints.unpersist() - - this - } - - /** - * Materialize [[labeledPoints]] (Spark [[RDD]]s are lazy evaluated: this method forces them to be evaluated). - * - * @return This object with [[labeledPoints]] materialized - */ - override def materialize(): FixedEffectDataset = { - - labeledPoints.count() - - this - } - - /** - * Build a summary string for the dataset. - * - * @return A String representation of the dataset - */ - override def toSummaryString: String = { - - val numSamples = labeledPoints.count() - val weightSum = labeledPoints.values.map(_.weight).sum() - val responseSum = labeledPoints.values.map(_.label).sum() - val featureStats = labeledPoints.values.map(_.features.activeSize).stats() - - s"numSamples: $numSamples\n" + - s"weightSum: $weightSum\n" + - s"responseSum: $responseSum\n" + - s"numFeatures: $numFeatures\n" + - s"featureStats: $featureStats" - } -} - -object FixedEffectDataset { - - /** - * Build an instance of a fixed effect dataset for the given feature shard. 
- * - * @param gameDataset The input dataset - * @param featureShardId The feature shard ID - * @return A new dataset with given configuration - */ - protected[ml] def apply( - gameDataset: RDD[(UniqueSampleId, GameDatum)], - featureShardId: FeatureShardId): FixedEffectDataset = { - - val labeledPoints = gameDataset.mapValues(_.generateLabeledPointWithFeatureShardId(featureShardId)) - - new FixedEffectDataset(labeledPoints, featureShardId) - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala index 71d80498..d27c0030 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala @@ -18,7 +18,9 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.linalg.SparseVector import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.monotonically_increasing_id +import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} import com.linkedin.photon.ml.util.VectorUtils @@ -46,7 +48,7 @@ object GameConverters { featureShards: Set[FeatureShardId], idTagSet: Set[String], isResponseRequired: Boolean, - inputColumnsNames: InputColumnsNames = InputColumnsNames()): RDD[(UniqueSampleId, GameDatum)] = { + inputColumnsNames: InputColumnsNames = InputColumnsNames()): DataFrame = { val colNamesSet = inputColumnsNames.getNames @@ -55,14 +57,7 @@ object GameConverters { idTagSet.intersect(colNamesSet).isEmpty, s"Cannot use required columns (${colNamesSet.mkString(", ")}) for random effect/validation grouping.") - val inputColumnsNamesBroadcast = data.sqlContext.sparkContext.broadcast(inputColumnsNames) - - data - .rdd - .zipWithUniqueId - .map { case (row, index) => - (index, getGameDatumFromRow(row, featureShards, idTagSet, isResponseRequired, 
inputColumnsNamesBroadcast)) - } + data.withColumn(Constants.UNIQUE_SAMPLE_ID, monotonically_increasing_id) } /** diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala index 487d7ceb..312e4957 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala @@ -14,14 +14,6 @@ */ package com.linkedin.photon.ml.data -import scala.collection.mutable - -import breeze.linalg.Vector - -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.util.VectorUtils - /** * Local dataset implementation. * @@ -31,7 +23,7 @@ import com.linkedin.photon.ml.util.VectorUtils * * @param dataPoints Local data points consists of (globalId, labeledPoint) pairs */ -protected[ml] case class LocalDataset(dataPoints: Array[(UniqueSampleId, LabeledPoint)]) { +protected[ml] case class LocalDataset(dataPoints: Array[LabeledPoint]) { require( dataPoints.length > 0, @@ -40,283 +32,6 @@ protected[ml] case class LocalDataset(dataPoints: Array[(UniqueSampleId, Labeled val numDataPoints: Int = dataPoints.length val numFeatures: Int = dataPoints .head - ._2 .features .length - - /** - * - * @return - */ - def getLabels: Array[(UniqueSampleId, Double)] = dataPoints.map { case (uid, labeledPoint) => - (uid, labeledPoint.label) - } - - /** - * - * @return - */ - def getWeights: Array[(UniqueSampleId, Double)] = dataPoints.map { case (uid, labeledPoint) => - (uid, labeledPoint.weight) - } - - /** - * - * @return - */ - def getOffsets: Array[(UniqueSampleId, Double)] = dataPoints.map { case (uid, labeledPoint) => - (uid, labeledPoint.offset) - } - - /** - * - * @return - */ - def getUniqueIds: Array[UniqueSampleId] = dataPoints.map(_._1) - - /** - * Add the residual scores to the offsets. 
- * - * @param residualScores The residual scores - * @return The [[LocalDataset]] with updated offsets - */ - def addScoresToOffsets(residualScores: Array[(UniqueSampleId, Double)]): LocalDataset = { - - val updatedDataPoints = dataPoints - .zip(residualScores) - .map { case ((dataId, LabeledPoint(label, features, offset, weight)), (residualScoreId, residualScoreDatum)) => - - require(residualScoreId == dataId, s"residual score Id ($residualScoreId) and data Id ($dataId) don't match!") - - (dataId, LabeledPoint(label, features, residualScoreDatum + offset, weight)) - } - - LocalDataset(updatedDataPoints) - } - - /** - * Filter features by Pearson correlation score. - * - * @param numFeaturesToKeep The number of features to keep - * @return The filtered dataset - */ - def filterFeaturesByPearsonCorrelationScore(numFeaturesToKeep: Int): LocalDataset = { - - val numActiveFeatures: Int = dataPoints.flatMap(_._2.features.activeKeysIterator).toSet.size - - if (numFeaturesToKeep < numActiveFeatures) { - val labelAndFeatures = dataPoints.map { case (_, labeledPoint) => (labeledPoint.label, labeledPoint.features) } - val pearsonScores = LocalDataset.stableComputePearsonCorrelationScore(labelAndFeatures) - - val filteredFeaturesIndexSet = pearsonScores - .toArray - .sortBy { case (_, score) => math.abs(score) } - .takeRight(numFeaturesToKeep) - .map(_._1) - .toSet - - val filteredActivities = dataPoints.map { case (id, LabeledPoint(label, features, offset, weight)) => - - val filteredFeatures = LocalDataset.filterFeaturesWithFeatureIndexSet(features, filteredFeaturesIndexSet) - - (id, LabeledPoint(label, filteredFeatures, offset, weight)) - } - - LocalDataset(filteredActivities) - } else { - this - } - } -} - -object LocalDataset { - - /** - * Factory method for LocalDataset. 
- * - * @param dataPoints The array of underlying data - * @param isSortedByFirstIndex Whether or not to sort the data by global ID - * @return A new LocalDataset - */ - protected[ml] def apply( - dataPoints: Array[(UniqueSampleId, LabeledPoint)], - isSortedByFirstIndex: Boolean): LocalDataset = { - - if (isSortedByFirstIndex) { - LocalDataset(dataPoints) - } else { - LocalDataset(dataPoints.sortBy(_._1)) - } - } - - /** - * Filter features by feature index. - * - * @param features The original feature set - * @param featureIndexSet The feature index set - * @return The filtered feature vector - */ - private def filterFeaturesWithFeatureIndexSet( - features: Vector[Double], - featureIndexSet: Set[Int]): Vector[Double] = { - - val result = VectorUtils.zeroOfSameType(features) - - features.activeIterator.foreach { case (key, value) => - if (featureIndexSet.contains(key)) { - result(key) = value - } - } - - result - } - - /** - * Compute Pearson correlation scores using a numerically stable algorithm. 
- * - * @param labelAndFeatures An array of (label, feature) tuples - * @return The Pearson correlation scores for each tuple - */ - protected[ml] def stableComputePearsonCorrelationScore( - labelAndFeatures: Array[(Double, Vector[Double])]): Map[Int, Double] = { - - val featureMeans = mutable.Map[Int, Double]() - val featureUnscaledVars = mutable.Map[Int, Double]() - var labelMean = 0.0 - var labelUnscaledVariance = 0.0 - val unscaledCovariances = mutable.Map[Int, Double]() - var interceptAdded = false - var numSamples = 0 - - labelAndFeatures.foreach { case (label, features) => - numSamples += 1 - - val deltaLabel = label - labelMean - labelMean += deltaLabel / numSamples - labelUnscaledVariance += deltaLabel * (label - labelMean) - - // Note that, if there is duplicated keys in the feature vector, then the following Pearson correlation scores - // calculation will screw up - features.iterator.foreach { case (key, value) => - val prevFeatureMean = featureMeans.getOrElse(key, 0.0) - val deltaFeature = value - prevFeatureMean - val featureMean = prevFeatureMean + deltaFeature / numSamples - - val prevFeatureUnscaledVar = featureUnscaledVars.getOrElse(key, 0.0) - val featureUnscaledVar = prevFeatureUnscaledVar + deltaFeature * (value - featureMean) - - val prevCovariance = unscaledCovariances.getOrElse(key, 0.0) - val unscaledCovariance = prevCovariance + deltaFeature * deltaLabel * (numSamples - 1) / numSamples - - featureMeans.update(key, featureMean) - featureUnscaledVars.update(key, featureUnscaledVar) - unscaledCovariances.update(key, unscaledCovariance) - } - } - - val labelStd = math.sqrt(labelUnscaledVariance) - - featureMeans - .iterator - .map { case (key, featureMean) => - val featureStd = math.sqrt(featureUnscaledVars(key)) - val covariance = unscaledCovariances(key) - - // When the standard deviation of the feature is close to 0 we treat it as the intercept term. 
- val score = if (featureStd < math.sqrt(numSamples) * MathConst.EPSILON) { - // Note that if the mean and standard deviation are equal to zero, it either means that the feature is constant - if (featureMean == 1.0 && !interceptAdded) { - interceptAdded = true - 1.0 - } else { - 0.0 - } - } else { - covariance / (labelStd * featureStd + MathConst.EPSILON) - } - - require(math.abs(score) <= 1 + MathConst.EPSILON, - s"Computed pearson correlation score is $score, while the score's magnitude should be less than 1. " + - s"(Diagnosis:\n" + - s"featureKey=$key\n" + - s"featureStd=$featureStd\n" + - s"labelStd=$labelStd\n" + - s"covariance=$covariance\n" + - s"numSamples=$numSamples\n" + - s"labelAndFeatures used to compute Pearson correlation score:\n${labelAndFeatures.mkString("\n")}})") - - (key, score) - } - .toMap - } - - /** - * Compute Pearson correlation scores. - * - * @param labelAndFeatures An array of (label, feature) tuples - * @return The Pearson correlation scores for each tuple - */ - protected[ml] def computePearsonCorrelationScore( - labelAndFeatures: Array[(Double, Vector[Double])]): Map[Int, Double] = { - - val featureLabelProductSums = mutable.Map[Int, Double]() - val featureFirstOrderSums = mutable.Map[Int, Double]() - val featureSecondOrderSums = mutable.Map[Int, Double]() - var labelFirstOrderSum = 0.0 - var labelSecondOrderSum = 0.0 - var numSamples = 0 - var interceptAdded = false - - labelAndFeatures.foreach { case (label, features) => - numSamples += 1 - labelFirstOrderSum += label - labelSecondOrderSum += label * label - // Note that, if there is duplicated keys in the feature vector, then the following Pearson correlation scores - // calculation will screw up - features.activeIterator.foreach { case (key, value) => - featureFirstOrderSums.update(key, featureFirstOrderSums.getOrElse(key, 0.0) + value) - featureSecondOrderSums.update(key, featureSecondOrderSums.getOrElse(key, 0.0) + value * value) - featureLabelProductSums.update(key, 
featureLabelProductSums.getOrElse(key, 0.0) + value * label) - } - } - - featureFirstOrderSums - .keySet - .map { key => - val featureFirstOrderSum = featureFirstOrderSums(key) - val featureSecondOrderSum = featureSecondOrderSums(key) - val featureLabelProductSum = featureLabelProductSums(key) - val numerator = numSamples * featureLabelProductSum - featureFirstOrderSum * labelFirstOrderSum - val std = math.sqrt(math.abs(numSamples * featureSecondOrderSum - featureFirstOrderSum * featureFirstOrderSum)) - val denominator = std * math.sqrt(numSamples * labelSecondOrderSum - labelFirstOrderSum * labelFirstOrderSum) - - // When the standard deviation of the feature is close to 0, we treat it as the intercept term - val score = if (std < MathConst.EPSILON) { - if (interceptAdded) { - 0.0 - } else { - interceptAdded = true - 1.0 - } - } else { - numerator / (denominator + MathConst.EPSILON) - } - - require(math.abs(score) <= 1 + MathConst.EPSILON, - s"Computed pearson correlation score is $score, while the score's magnitude should be less than 1. " + - s"(Diagnosis:\n" + - s"numerator=$numerator\n" + - s"denominator=$denominator\n" + - s"numSamples=$numSamples\n" + - s"featureFirstOrderSum=$featureFirstOrderSum\n" + - s"featureSecondOrderSum=$featureSecondOrderSum\n" + - s"featureLabelProductSum=$featureLabelProductSum\n" + - s"labelFirstOrderSum=$labelFirstOrderSum\n" + - s"labelSecondOrderSum=$labelSecondOrderSum\n" + - s"labelAndFeatures used to compute Pearson correlation score:\n${labelAndFeatures.mkString("\n")}})") - - (key, score) - } - .toMap - } -} +} \ No newline at end of file diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDataset.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDataset.scala deleted file mode 100644 index f59e3653..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDataset.scala +++ /dev/null @@ -1,647 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. 
All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import scala.collection.mutable -import scala.util.hashing.byteswap64 - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession -import org.apache.spark.storage.StorageLevel -import org.apache.spark.{Partitioner, SparkContext} - -import com.linkedin.photon.ml.Types.{FeatureShardId, REId, REType, UniqueSampleId} -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores -import com.linkedin.photon.ml.projector.LinearSubspaceProjector -import com.linkedin.photon.ml.spark.{BroadcastLike, RDDLike} -import com.linkedin.photon.ml.util.VectorUtils - -/** - * Dataset implementation for random effect data. - * - * All of the training data for a single random effect must fit into on Spark partition. The size limit of a single - * Spark partition is 2 GB. If the size of (samples * features) exceeds the maximum size of a single Spark partition, - * the data is split into two sections: active and passive data. - * - * activeData + passiveData = full data set - * - * Active data is used for both training and scoring (to determine residuals for partial score). Passive data is used - * only for scoring. In the vast majority of cases, all data is active data. 
- * - * @param activeData Per-entity datasets used to train per-entity models and to compute residuals - * @param passiveData Per-entity datasets used only to compute residuals - * @param activeUniqueIdToRandomEffectIds Map of unique sample id to random effect id for active data samples - * @param projectors The per-entity [[LinearSubspaceProjector]] objects used to compress the per-entity feature spaces - * @param randomEffectType The random effect type (e.g. "memberId") - * @param featureShardId The ID of the data feature shard used by this dataset - */ -protected[ml] class RandomEffectDataset( - val activeData: RDD[(REId, LocalDataset)], - val passiveData: RDD[(UniqueSampleId, (REId, LabeledPoint))], - val activeUniqueIdToRandomEffectIds: RDD[(UniqueSampleId, REId)], - val projectors: RDD[(REId, LinearSubspaceProjector)], - val randomEffectType: REType, - val featureShardId: FeatureShardId) - extends Dataset[RandomEffectDataset] - with BroadcastLike - with RDDLike { - - lazy val passiveDataREIds: Broadcast[Set[REId]] = SparkSession - .builder() - .getOrCreate() - .sparkContext - .broadcast(passiveData.map(_._2._1).distinct().collect().toSet) - val randomEffectIdPartitioner: Partitioner = activeData.partitioner.get - val uniqueIdPartitioner: Partitioner = passiveData.partitioner.get - - // - // Dataset functions - // - - /** - * Add residual scores to the data offsets. - * - * @param scores The residual scores - * @return The dataset with updated offsets - */ - override def addScoresToOffsets(scores: CoordinateDataScores): RandomEffectDataset = { - - // It's possible that other coordinates did not score some data. Since we're trying to add scores to the offset and - // the default score is 0, the result of a left join vs. an inner join is the same. However, an inner join will drop - // data which does not have a score. Thus, we need a left join. 
- val scoresGroupedByRandomEffectId = activeUniqueIdToRandomEffectIds - .leftOuterJoin(scores.scoresRdd, uniqueIdPartitioner) - .map { case (uniqueId, (reId, scoreOpt)) => - (reId, (uniqueId, scoreOpt.getOrElse(MathConst.DEFAULT_SCORE))) - } - .groupByKey(randomEffectIdPartitioner) - .mapValues(_.toArray.sortBy(_._1)) - - // Since we use a left join above, we're guaranteed to have each random effect entity from the active data present - // and thus use an inner join - val updatedActiveData = activeData - .join(scoresGroupedByRandomEffectId, randomEffectIdPartitioner) - .mapValues { case (localData, localScore) => localData.addScoresToOffsets(localScore) } - - // The resultant dataset is only used for training a new model, thus only the active data needs to have scores added - new RandomEffectDataset( - updatedActiveData, - passiveData, - activeUniqueIdToRandomEffectIds, - projectors, - randomEffectType, - featureShardId) - } - - // - // BroadcastLike Functions - // - - /** - * Asynchronously delete cached copies of [[passiveDataREIds]] on all executors. - * - * @return This [[RandomEffectDataset]] with [[passiveDataREIds]] unpersisted - */ - override protected[ml] def unpersistBroadcast(): RandomEffectDataset = { - - passiveDataREIds.unpersist() - - this - } - - // - // RDDLike Functions - // - - /** - * Get the Spark context. - * - * @return The Spark context - */ - override def sparkContext: SparkContext = activeData.sparkContext - - /** - * Assign a given name to [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]]. - * - * @note Not used to reference models in the logic of photon-ml, only used for logging currently. 
- * @param name The parent name for all [[RDD]]s in this class - * @return This object with the names [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] - * assigned - */ - override def setName(name: String): RandomEffectDataset = { - - activeData.setName(s"$name - Active Data") - passiveData.setName(s"$name - Passive Data") - activeUniqueIdToRandomEffectIds.setName(s"$name - UID to REID") - projectors.setName(s"$name - Projectors") - - this - } - - /** - * Set the storage level of [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]], and persist - * their values across the cluster the first time they are computed. - * - * @param storageLevel The storage level - * @return This object with the storage level of [[activeData]], [[activeUniqueIdToRandomEffectIds]], and - * [[passiveData]] set - */ - override def persistRDD(storageLevel: StorageLevel): RandomEffectDataset = { - - if (!activeData.getStorageLevel.isValid) activeData.persist(storageLevel) - if (!passiveData.getStorageLevel.isValid) passiveData.persist(storageLevel) - if (!activeUniqueIdToRandomEffectIds.getStorageLevel.isValid) activeUniqueIdToRandomEffectIds.persist(storageLevel) - if (!projectors.getStorageLevel.isValid) projectors.persist(storageLevel) - - this - } - - /** - * Mark [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] as non-persistent, and remove all - * blocks for them from memory and disk. 
- * - * @return This object with [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] marked - * non-persistent - */ - override def unpersistRDD(): RandomEffectDataset = { - - if (activeData.getStorageLevel.isValid) activeData.unpersist() - if (passiveData.getStorageLevel.isValid) passiveData.unpersist() - if (activeUniqueIdToRandomEffectIds.getStorageLevel.isValid) activeUniqueIdToRandomEffectIds.unpersist() - if (projectors.getStorageLevel.isValid) projectors.unpersist() - - this - } - - /** - * Materialize [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] (Spark [[RDD]]s are lazy - * evaluated: this method forces them to be evaluated). - * - * @return This object with [[activeData]], [[activeUniqueIdToRandomEffectIds]], and [[passiveData]] materialized - */ - override def materialize(): RandomEffectDataset = { - - activeData.count() - passiveData.count() - activeUniqueIdToRandomEffectIds.count() - projectors.count() - - this - } - - // - // Summarizable Functions - // - - /** - * Build a human-readable summary for [[RandomEffectDataset]]. 
- * - * @return A summary of the object in string representation - */ - override def toSummaryString: String = { - - val stringBuilder = new StringBuilder("Random Effect Data Set:") - - val activeDataValues = activeData.values.persist(StorageLevel.MEMORY_ONLY_SER) - - val numActiveSamples = activeUniqueIdToRandomEffectIds.count() - val activeSampleWeightSum = activeDataValues.map(_.getWeights.map(_._2).sum).sum() - val activeSampleResponseSum = activeDataValues.map(_.getLabels.map(_._2).sum).sum() - val numPassiveSamples = passiveData.count() - val passiveSampleResponsesSum = passiveData.values.map(_._2.label).sum() - val numAllSamples = numActiveSamples + numPassiveSamples - val numActiveSamplesStats = activeDataValues.map(_.numDataPoints).stats() - val activeSamplerResponseSumStats = activeDataValues.map(_.getLabels.map(_._2).sum).stats() - val numFeaturesStats = activeDataValues.map(_.numFeatures).stats() - - activeDataValues.unpersist() - - // TODO: Need more descriptive text than just the variable name - stringBuilder.append(s"\nnumActiveSamples: $numActiveSamples") - stringBuilder.append(s"\nactiveSampleWeightSum: $activeSampleWeightSum") - stringBuilder.append(s"\nactiveSampleResponseSum: $activeSampleResponseSum") - stringBuilder.append(s"\nnumPassiveSamples: $numPassiveSamples") - stringBuilder.append(s"\npassiveSampleResponsesSum: $passiveSampleResponsesSum") - stringBuilder.append(s"\nnumAllSamples: $numAllSamples") - stringBuilder.append(s"\nnumActiveSamplesStats: $numActiveSamplesStats") - stringBuilder.append(s"\nactiveSamplerResponseSumStats: $activeSamplerResponseSumStats") - stringBuilder.append(s"\nnumFeaturesStats: $numFeaturesStats") - - stringBuilder.toString() - } -} - -object RandomEffectDataset { - - /** - * Build a new [[RandomEffectDataset]] from the raw data using the given configuration. 
- * - * @param gameDataset The [[RDD]] of [[GameDatum]] used to generate the random effect dataset - * @param randomEffectDataConfiguration The data configuration for the random effect dataset - * @param randomEffectPartitioner A specialized partitioner to co-locate all data from a single entity, while keeping - * the data distribution equal amongst partitions - * @param existingModelKeysRddOpt Optional set of entities that have existing models - * @return A new [[RandomEffectDataset]] - */ - def apply( - gameDataset: RDD[(UniqueSampleId, GameDatum)], - randomEffectDataConfiguration: RandomEffectDataConfiguration, - randomEffectPartitioner: RandomEffectDatasetPartitioner, - existingModelKeysRddOpt: Option[RDD[REId]], - storageLevel: StorageLevel): RandomEffectDataset = { - - val uniqueIdPartitioner = gameDataset.partitioner.get - - // - // Generate RDDs - // - - val keyedGameDataset = generateKeyedGameDataset(gameDataset, randomEffectDataConfiguration) - keyedGameDataset.persist(StorageLevel.MEMORY_ONLY_SER).count - - // In this RDD, there is a projector for every entity (even those which may later be filtered by the lower bound) - val unfilteredProjectors = generateLinearSubspaceProjectors(keyedGameDataset, randomEffectPartitioner) - unfilteredProjectors.persist(storageLevel).count - - val projectedKeyedGameDataset = generateProjectedDataset(keyedGameDataset, unfilteredProjectors, randomEffectPartitioner) - projectedKeyedGameDataset.persist(StorageLevel.MEMORY_ONLY_SER).count - - val unfilteredActiveData = generateGroupedActiveData( - projectedKeyedGameDataset, - randomEffectDataConfiguration, - randomEffectPartitioner) - - val (activeData, passiveData, uniqueIdToRandomEffectIds, projectors) = - randomEffectDataConfiguration.numActiveDataPointsLowerBound match { - - case Some(activeDataLowerBound) => - - unfilteredActiveData.persist(StorageLevel.MEMORY_ONLY_SER) - - // Filter entities which do not meet active data lower bound threshold - val filteredActiveData = 
filterActiveData( - unfilteredActiveData, - activeDataLowerBound, - existingModelKeysRddOpt) - filteredActiveData.persist(storageLevel).count - - val passiveData = generatePassiveData( - projectedKeyedGameDataset, - generateIdMap(unfilteredActiveData, uniqueIdPartitioner)) - passiveData.persist(storageLevel).count - - val uniqueIdToRandomEffectIds = generateIdMap(filteredActiveData, uniqueIdPartitioner) - uniqueIdToRandomEffectIds.persist(storageLevel).count - - val filteredProjectors = filterProjectors(unfilteredProjectors, filteredActiveData) - filteredProjectors.persist(storageLevel).count - - unfilteredActiveData.unpersist() - unfilteredProjectors.unpersist() - - (filteredActiveData, passiveData, uniqueIdToRandomEffectIds, filteredProjectors) - - case None => - - unfilteredActiveData.persist(storageLevel).count - - val uniqueIdToRandomEffectIds = generateIdMap(unfilteredActiveData, uniqueIdPartitioner) - uniqueIdToRandomEffectIds.persist(storageLevel).count - - val passiveData = generatePassiveData(projectedKeyedGameDataset, uniqueIdToRandomEffectIds) - passiveData.persist(storageLevel).count - - (unfilteredActiveData, passiveData, uniqueIdToRandomEffectIds, unfilteredProjectors) - } - - // - // Unpersist component RDDs - // - - keyedGameDataset.unpersist() - projectedKeyedGameDataset.unpersist() - - // - // Return new dataset - // - - new RandomEffectDataset( - activeData, - passiveData, - uniqueIdToRandomEffectIds, - projectors, - randomEffectDataConfiguration.randomEffectType, - randomEffectDataConfiguration.featureShardId) - } - - /** - * Process the raw data to be keyed by the [[REId]]s for the given [[REType]], and filter the feature vector for only - * the given shard. 
- * - * @param gameDataset The [[RDD]] of [[GameDatum]] used to generate the random effect dataset - * @param randomEffectDataConfiguration The data configuration for the random effect dataset - * @return The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]] - */ - protected[data] def generateKeyedGameDataset( - gameDataset: RDD[(UniqueSampleId, GameDatum)], - randomEffectDataConfiguration: RandomEffectDataConfiguration): RDD[(REId, (UniqueSampleId, LabeledPoint))] = { - - val randomEffectType = randomEffectDataConfiguration.randomEffectType - val featureShardId = randomEffectDataConfiguration.featureShardId - - gameDataset - .map { case (uniqueId, gameData) => - val randomEffectId = gameData.idTagToValueMap(randomEffectType) - val labeledPoint = gameData.generateLabeledPointWithFeatureShardId(featureShardId) - - (randomEffectId, (uniqueId, labeledPoint)) - } - } - - /** - * Generate the [[LinearSubspaceProjector]] objects used to compress the feature vectors for each per-entity dataset. 
- * - * @param keyedGameDataset The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]] - * @param randomEffectPartitioner A specialized partitioner to co-locate all data from a single entity, while keeping - * the data distribution equal amongst partitions - * @return An [[RDD]] of per-entity [[LinearSubspaceProjector]] objects - */ - protected[data] def generateLinearSubspaceProjectors( - keyedGameDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - randomEffectPartitioner: RandomEffectDatasetPartitioner): RDD[(REId, LinearSubspaceProjector)] = { - - val originalSpaceDimension = keyedGameDataset - .take(1) - .head - ._2 - ._2 - .features - .length - - keyedGameDataset - .mapValues { case (_, labeledPoint) => - VectorUtils.getActiveIndices(labeledPoint.features) - } - .foldByKey(mutable.Set[Int](), randomEffectPartitioner)(_.union(_)) - .mapValues(activeIndices => new LinearSubspaceProjector(activeIndices.toSet, originalSpaceDimension)) - } - - /** - * Project the per-entity datasets to a linear subspace - thus reducing the size of their feature vectors (for faster - * optimization). 
- * - * @param keyedGameDataset The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]] - * @param projectors An [[RDD]] of per-entity [[LinearSubspaceProjector]] objects - * @param randomEffectPartitioner A specialized partitioner to co-locate all data from a single entity, while keeping - * the data distribution equal amongst partitions - * @return The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]], with feature vectors - * reduced to the smallest linear subspace possible without loss - */ - protected[data] def generateProjectedDataset( - keyedGameDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - projectors: RDD[(REId, LinearSubspaceProjector)], - randomEffectPartitioner: RandomEffectDatasetPartitioner): RDD[(REId, (UniqueSampleId, LabeledPoint))] = - - keyedGameDataset - .partitionBy(randomEffectPartitioner) - .zipPartitions(projectors) { case (dataIt, projectorsIt) => - - val projectorLookupTable = projectorsIt.toMap - - dataIt.map { case (rEID, (uID, LabeledPoint(label, features, offset, weight))) => - - val projector = projectorLookupTable(rEID) - val projectedFeatures = projector.projectForward(features) - - (rEID, (uID, LabeledPoint(label, projectedFeatures, offset, weight))) - } - } - - /** - * Generate active data, down-sampling using reservoir sampling if the data for any entity exceeds the upper bound. 
- * - * @param projectedKeyedDataset The input data, keyed by entity ID - * @param randomEffectDataConfiguration The random effect data configuration - * @param randomEffectPartitioner A specialized partitioner to co-locate all data from a single entity, while keeping - * the data distribution equal amongst partitions - * @return The input data, grouped by entity ID, and down-sampled if necessary - */ - protected[data] def generateGroupedActiveData( - projectedKeyedDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - randomEffectDataConfiguration: RandomEffectDataConfiguration, - randomEffectPartitioner: Partitioner): RDD[(REId, LocalDataset)] = { - - // Filter data using reservoir sampling if active data size is bounded - val groupedActiveData = randomEffectDataConfiguration - .numActiveDataPointsUpperBound - .map { activeDataUpperBound => - groupDataByKeyAndSample( - projectedKeyedDataset, - randomEffectPartitioner, - activeDataUpperBound, - randomEffectDataConfiguration.randomEffectType) - } - .getOrElse(projectedKeyedDataset.groupByKey(randomEffectPartitioner)) - .mapValues { iterable => - LocalDataset(iterable.toArray, isSortedByFirstIndex = false) - } - - // Filter features if feature dimension of active data is bounded - featureSelectionOnActiveData(groupedActiveData, randomEffectDataConfiguration.numFeaturesToSamplesRatioUpperBound) - } - - /** - * Generate a dataset grouped by random effect ID and limited to a maximum number of samples selected via reservoir - * sampling. - * - * The 'Min Heap' reservoir sampling algorithm is used for two reasons: - * 1. The exact sampling must be reproducible so that [[RDD]] partitions can be recovered - * 2. 
The linear algorithm is non-trivial to combine in a distributed manner - * - * @param projectedKeyedDataset The raw dataset, with samples keyed by random effect ID - * @param partitioner The partitioner - * @param sampleCap The sample cap - * @param randomEffectType The type of random effect - * @return An [[RDD]] of data grouped by individual ID - */ - private def groupDataByKeyAndSample( - projectedKeyedDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - partitioner: Partitioner, - sampleCap: Int, - randomEffectType: REType): RDD[(REId, Iterable[(UniqueSampleId, LabeledPoint)])] = { - - // Helper class for defining a constant ordering between data samples (necessary for RDD re-computation) - case class ComparableLabeledPointWithId(comparableKey: Int, uniqueId: UniqueSampleId, labeledPoint: LabeledPoint) - extends Comparable[ComparableLabeledPointWithId] { - - override def compareTo(comparableLabeledPointWithId: ComparableLabeledPointWithId): Int = { - if (comparableKey - comparableLabeledPointWithId.comparableKey > 0) { - 1 - } else { - -1 - } - } - } - - val createCombiner = - (comparableLabeledPointWithId: ComparableLabeledPointWithId) => { - new MinHeapWithFixedCapacity[ComparableLabeledPointWithId](sampleCap) += comparableLabeledPointWithId - } - - val mergeValue = ( - minHeapWithFixedCapacity: MinHeapWithFixedCapacity[ComparableLabeledPointWithId], - comparableLabeledPointWithId: ComparableLabeledPointWithId) => { - minHeapWithFixedCapacity += comparableLabeledPointWithId - } - - val mergeCombiners = ( - minHeapWithFixedCapacity1: MinHeapWithFixedCapacity[ComparableLabeledPointWithId], - minHeapWithFixedCapacity2: MinHeapWithFixedCapacity[ComparableLabeledPointWithId]) => { - minHeapWithFixedCapacity1 ++= minHeapWithFixedCapacity2 - } - - // The reservoir sampling algorithm is fault tolerant, assuming that the uniqueId for a sample is recovered after - // node failure. 
We attempt to maximize the likelihood of successful recovery through RDD replication, however there - // is a non-zero possibility of massive failure. If this becomes an issue, we may need to resort to check-pointing - // the raw data RDD after uniqueId assignment. - projectedKeyedDataset - .mapValues { case (uniqueId, labeledPoint) => - val comparableKey = (byteswap64(randomEffectType.hashCode) ^ byteswap64(uniqueId)).hashCode() - ComparableLabeledPointWithId(comparableKey, uniqueId, labeledPoint) - } - .combineByKey[MinHeapWithFixedCapacity[ComparableLabeledPointWithId]]( - createCombiner, - mergeValue, - mergeCombiners, - partitioner) - .mapValues { minHeapWithFixedCapacity => - val count = minHeapWithFixedCapacity.getCount - val data = minHeapWithFixedCapacity.getData - val weightMultiplierOpt = if (count > sampleCap) Some(1D * count / sampleCap) else None - - data.map { case ComparableLabeledPointWithId(_, uniqueId, LabeledPoint(label, features, offset, weight)) => - (uniqueId, LabeledPoint(label, features, offset, weightMultiplierOpt.map(_ * weight).getOrElse(weight))) - } - } - } - - /** - * Filter out entities with less data than a given threshold. 
- * - * @param groupedActiveData An [[RDD]] of data grouped by entity ID - * @param numActiveDataPointsLowerBound Threshold for number of data points require to receive a per-entity model - * @param existingModelKeysRddOpt Optional set of entities that have existing models - * @return The input data with entities that did not meet the minimum sample threshold removed - */ - protected[data] def filterActiveData( - groupedActiveData: RDD[(REId, LocalDataset)], - numActiveDataPointsLowerBound: Int, - existingModelKeysRddOpt: Option[RDD[REId]]): RDD[(REId, LocalDataset)] = - - existingModelKeysRddOpt match { - case Some(existingModelKeysRdd) => - groupedActiveData.zipPartitions(existingModelKeysRdd, preservesPartitioning = true) { (dataIt, existingKeysIt) => - - val lookupTable = existingKeysIt.toSet - - dataIt.filter { case (key, data) => - (data.numDataPoints >= numActiveDataPointsLowerBound) || !lookupTable.contains(key) - } - } - - case None => - groupedActiveData.filter { case (_, data) => - data.numDataPoints >= numActiveDataPointsLowerBound - } - } - - /** - * Reduce active data feature dimension for entities with few samples. The maximum feature dimension is limited to - * the number of samples multiplied by the feature dimension ratio. Features are chosen by greatest Pearson - * correlation score. 
- * - * @param activeData An [[RDD]] of data grouped by entity ID - * @param numFeaturesToSamplesRatioUpperBoundOpt Optional ratio of samples to feature dimension - * @return The input data with feature dimension reduced for entities whose feature dimension greatly exceeded the - * number of available samples - */ - private def featureSelectionOnActiveData( - activeData: RDD[(REId, LocalDataset)], - numFeaturesToSamplesRatioUpperBoundOpt: Option[Double]): RDD[(REId, LocalDataset)] = - numFeaturesToSamplesRatioUpperBoundOpt - .map { numFeaturesToSamplesRatioUpperBound => - activeData.mapValues { localDataset => - - var numFeaturesToKeep = math.ceil(numFeaturesToSamplesRatioUpperBound * localDataset.numDataPoints).toInt - // In case the above product overflows - if (numFeaturesToKeep < 0) numFeaturesToKeep = Int.MaxValue - - localDataset.filterFeaturesByPearsonCorrelationScore(numFeaturesToKeep) - } - } - .getOrElse(activeData) - - /** - * Generate a map of unique sample id to random effect id for active data samples. - * - * @param activeData The active dataset - * @param partitioner The [[Partitioner]] to use for the [[RDD]] of unique sample ID to random effect ID - * @return A map of unique sample id to random effect id for active data samples - */ - protected[data] def generateIdMap( - activeData: RDD[(REId, LocalDataset)], - partitioner: Partitioner): RDD[(UniqueSampleId, REId)] = - activeData - .flatMap { case (individualId, localDataset) => - localDataset.getUniqueIds.map((_, individualId)) - } - .partitionBy(partitioner) - - /** - * Generate passive dataset. 
- * - * @param projectedKeyedDataset The data for the given feature shard, keyed by the [[REId]]s for the given [[REType]] - * @param activeUniqueIDs The unique IDs of the active dataset - * @return The passive dataset - */ - protected[data] def generatePassiveData( - projectedKeyedDataset: RDD[(REId, (UniqueSampleId, LabeledPoint))], - activeUniqueIDs: RDD[(UniqueSampleId, REId)]): RDD[(UniqueSampleId, (REId, LabeledPoint))] = { - - val passiveDataPool = projectedKeyedDataset.map { case (rEID, (uniqueID, labeledPoint)) => - (uniqueID, (rEID, labeledPoint)) - } - - passiveDataPool.subtractByKey(activeUniqueIDs) - } - - /** - * Filter out projectors for entities which were filtered out. - * - * @param unfilteredProjectors The unfiltered projectors - * @param filteredActiveData The filtered active data - * @return [[unfilteredProjectors]] with all projectors for entities not in [[filteredActiveData]] removed - */ - protected[data] def filterProjectors( - unfilteredProjectors: RDD[(REId, LinearSubspaceProjector)], - filteredActiveData: RDD[(REId, LocalDataset)]): RDD[(REId, LinearSubspaceProjector)] = - // Both RDDs use the same partitioner, thus there should be no shuffle. Use inner join to drop projectors for - // filtered entities. 
- filteredActiveData - .join(unfilteredProjectors) - .map { case (rEId, (_, projector)) => (rEId, projector) } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala index c78d51d3..627ae6c5 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala @@ -318,39 +318,12 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P // Verify valid function input validateInput(optimizationConfigurations) - // Group additional columns to include in GameDatum - val randomEffectIdCols: Set[String] = getRequiredParam(coordinateDataConfigurations) - .flatMap { case (_, config) => - config match { - case reConfig: RandomEffectDataConfiguration => Some(reConfig.randomEffectType) - case _ => None - } - } - .toSet - val evaluatorCols = get(validationEvaluators).map(MultiEvaluatorType.getMultiEvaluatorIdTags).getOrElse(Set()) - val additionalCols = randomEffectIdCols ++ evaluatorCols - - // Gather the names of the feature shards used by the coordinates - val featureShards = getRequiredParam(coordinateDataConfigurations) - .map { case (_, coordinateDataConfig) => - coordinateDataConfig.featureShardId - } - .toSet - - // Transform the GAME training data set into fixed and random effect specific datasets - val gameDataset = Timed("Process training data from raw DataFrame to RDD of samples") { - prepareGameDataset(data, featureShards, additionalCols) - } - val trainingDatasets = Timed("Prepare training data") { - prepareTrainingDatasets(gameDataset) - } - // Transform the GAME validation data set into fixed and random effect specific data sets val validationDatasetAndEvaluationSuiteOpt = Timed("Prepare validation data, if any") { prepareValidationDatasetAndEvaluators( validationData, - featureShards, - additionalCols) + 
featureShards, // TO BE CORRECTED + additionalCols) // TO BE CORRECTED } val coordinateDescent = new CoordinateDescent( @@ -370,8 +343,9 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P optimizationConfigurations.map { optimizationConfiguration => val (gameModel, evaluations) = train( + data, optimizationConfiguration, - trainingDatasets, + //trainingDatasets, coordinateDescent, prevGameModel) @@ -381,19 +355,6 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P } } - // Purge the raw GAME data, training data, validation data, and normalization contexts in reverse order of - // definition - gameDataset.unpersist() - trainingDatasets.foreach { case (_, dataset) => - dataset match { - case rddLike: RDDLike => rddLike.unpersistRDD() - case _ => - } - dataset match { - case broadcastLike: BroadcastLike => broadcastLike.unpersistBroadcast() - case _ => - } - } validationDatasetAndEvaluationSuiteOpt.map { case (validationDataset, evaluationSuite) => validationDataset.unpersist() evaluationSuite.unpersistRDD() @@ -438,96 +399,6 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P } } - /** - * Construct a [[RDD]] of data processed into GAME format from a raw [[DataFrame]]. 
- * - * @param data The raw [[DataFrame]] - * @param featureShards The IDs of the feature shards to keep - * @param additionalCols The names of fields containing information necessary for random effects or evaluation - * @return A [[RDD]] of data processed into GAME format - */ - protected def prepareGameDataset( - data: DataFrame, - featureShards: Set[FeatureShardId], - additionalCols: Set[String]): RDD[(UniqueSampleId, GameDatum)] = - GameConverters - .getGameDatasetFromDataFrame( - data, - featureShards, - additionalCols, - isResponseRequired = true, - getOrDefault(inputColumnNames)) - .partitionBy(new LongHashPartitioner(data.rdd.getNumPartitions)) - .setName("GAME training data") - .persist(StorageLevel.DISK_ONLY) - - /** - * Construct one or more [[Dataset]]s from an [[RDD]] of samples. - * - * @param gameDataset The training data samples - * @return A map of coordinate ID to training [[Dataset]] - */ - protected def prepareTrainingDatasets( - gameDataset: RDD[(UniqueSampleId, GameDatum)]): Map[CoordinateId, D forSome { type D <: Dataset[D] }] = { - - val coordinateDataConfigs = getRequiredParam(coordinateDataConfigurations) - - coordinateDataConfigs.map { case (coordinateId, config) => - - val result = config match { - - case feConfig: FixedEffectDataConfiguration => - - val fixedEffectDataset = FixedEffectDataset(gameDataset, feConfig.featureShardId) - .setName(s"Fixed Effect Dataset: $coordinateId") - .persistRDD(StorageLevel.DISK_ONLY) - - if (logger.isDebugEnabled) { - // Eval this only in debug mode, because the call to "toSummaryString" can be very expensive - logger.debug( - s"Summary of fixed effect dataset with coordinate ID '$coordinateId':\n" + - s"${fixedEffectDataset.toSummaryString}") - } - - (coordinateId, fixedEffectDataset) - - case reConfig: RandomEffectDataConfiguration => - - val rePartitioner = RandomEffectDatasetPartitioner.fromGameDataset(gameDataset, reConfig) - val existingModelKeysRddOpt = if 
(getOrDefault(ignoreThresholdForNewModels)) { - getRequiredParam(initialModel).getModel(coordinateId).map { - case rem: RandomEffectModel => - rem.modelsRDD.partitionBy(rePartitioner).keys - - case other => - throw new IllegalArgumentException( - s"Model type mismatch: expected Random Effect Model but found '${other.getClass}'") - } - } else { - None - } - - val randomEffectDataset = RandomEffectDataset( - gameDataset, - reConfig, - rePartitioner, - existingModelKeysRddOpt, - StorageLevel.DISK_ONLY) - randomEffectDataset.setName(s"Random Effect Data Set: $coordinateId") - - if (logger.isDebugEnabled) { - // Eval this only in debug mode, because the call to "toSummaryString" can be very expensive - logger.debug( - s"Summary of random effect dataset with coordinate ID $coordinateId:\n" + - s"${randomEffectDataset.toSummaryString}\n") - } - - (coordinateId, randomEffectDataset) - } - - result.asInstanceOf[(CoordinateId, D forSome { type D <: Dataset[D] })] - } - } /** * Optionally construct an [[RDD]] of validation data samples, and an [[EvaluationSuite]] to compute evaluation metrics @@ -620,14 +491,13 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P * of the previous 'coordinates'. 
* * @param configuration The configuration for the GAME optimization problem - * @param trainingDatasets The training datasets for each coordinate of the GAME optimization problem * @param coordinateDescent The coordinate descent driver * @param initialModelOpt An optional existing GAME model who's components should be used to warm-start training * @return A trained GAME model */ protected def train( + data: DataFrame, configuration: GameOptimizationConfiguration, - trainingDatasets: Map[CoordinateId, D forSome { type D <: Dataset[D] }], coordinateDescent: CoordinateDescent, initialModelOpt: Option[GameModel] = None): (GameModel, Option[EvaluationResults]) = Timed(s"Train model:") { @@ -638,6 +508,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P val task = getRequiredParam(trainingTask) val updateSequence = getRequiredParam(coordinateUpdateSequence) + val dataConfigs = getRequiredParam(coordinateDataConfigurations) val normalizationContexts = get(coordinateNormalizationContexts).getOrElse(Map()) val variance = getOrDefault(varianceComputationType) val lossFunctionFactoryFactory = ObjectiveFunctionHelper.buildFactory(task, getOrDefault(treeAggregateDepth)) @@ -664,14 +535,20 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P } } else { CoordinateFactory.build( - trainingDatasets(coordinateId), + data, + dataConfigs(coordinateId).featureShardId, + getOrDefault(inputColumnNames), configuration(coordinateId), lossFunctionFactoryFactory, glmConstructor, downSamplerFactory, normalizationContexts.getOrElse(coordinateId, NoNormalization()), variance, - interceptIndices.get(coordinateId)) + interceptIndices.get(coordinateId), + dataConfigs(coordinateId) match { + case redc: RandomEffectDataConfiguration => Some(redc.randomEffectType) + case _: FixedEffectDataConfiguration => None + }) } (coordinateId, coordinate) diff --git 
a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala index 2573bf3b..eb1ba1c1 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala @@ -14,7 +14,8 @@ */ package com.linkedin.photon.ml.optimization -import breeze.linalg.{Vector, cholesky, diag} +import breeze.linalg.{Vector => BVector} +import breeze.linalg.{cholesky, diag} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -81,7 +82,7 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @param coefficients The feature coefficients means * @return An optional feature coefficient variances vector */ - override def computeVariances(input: RDD[LabeledPoint], coefficients: Vector[Double]): Option[Vector[Double]] = { + override def computeVariances(input: RDD[LabeledPoint], coefficients: BVector[Double]): Option[BVector[Double]] = { val result = (objectiveFunction, varianceComputation) match { case (twiceDiffFunc: TwiceDiffFunction, VarianceComputationType.SIMPLE) => @@ -110,24 +111,15 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @param input The training data * @return The learned [[GeneralizedLinearModel]] */ - override def run(input: RDD[LabeledPoint]): GeneralizedLinearModel = + override def run(input: RDD[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = { run(input, initializeZeroModel(input.first.features.size)) - /** - * Run the algorithm with the configured parameters, starting from the initial model provided - * (warm start in iterations over the regularization weights for hyperparameter tuning). 
- * - * @param input The training data - * @param initialModel The initial model from which to begin optimization - * @return The learned [[GeneralizedLinearModel]] - */ - override def run(input: RDD[LabeledPoint], initialModel: GeneralizedLinearModel): GeneralizedLinearModel = { - - val normalizationContext = optimizer.getNormalizationContext - val (optimizedCoefficients, _) = optimizer.optimize(objectiveFunction, initialModel.coefficients.means)(input) - val optimizedVariances = computeVariances(input, optimizedCoefficients) + val (optimizedCoefficients, stateTracker) = optimizer.optimize( + objectiveFunction, + BVector.zeros[Double](input.first.features.length))( + input) - createModel(normalizationContext, optimizedCoefficients, optimizedVariances) + (createModel(optimizedCoefficients), stateTracker) } /** @@ -137,8 +129,8 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @param input The training data * @return The learned [[GeneralizedLinearModel]] */ - def runWithSampling(input: RDD[(UniqueSampleId, LabeledPoint)]): GeneralizedLinearModel = - runWithSampling(input, initializeZeroModel(input.first._2.features.size)) + def runWithSampling(input: RDD[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = + runWithSampling(input, initializeZeroModel(input.first.features.size)) /** * Run the algorithm with the configured parameters, starting from the initial model provided, and down-sample the @@ -149,12 +141,12 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @return The learned [[GeneralizedLinearModel]] */ def runWithSampling( - input: RDD[(UniqueSampleId, LabeledPoint)], - initialModel: GeneralizedLinearModel): GeneralizedLinearModel = { + input: RDD[LabeledPoint], + initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) = { val data = (samplerOption match { - case Some(sampler) => sampler.downSample(input).values - case None => 
input.values + case Some(sampler) => sampler.downSample(input) + case None => input }) .setName("In memory fixed effect training dataset") .persist(StorageLevel.MEMORY_AND_DISK) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala index 4766cc2a..09577c87 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/GeneralizedLinearOptimizationProblem.scala @@ -41,13 +41,6 @@ protected[ml] abstract class GeneralizedLinearOptimizationProblem[Objective <: O glmConstructor: Coefficients => GeneralizedLinearModel, varianceComputation: VarianceComputationType) extends Logging { - /** - * Get the optimization state trackers for the optimization problems solved - * - * @return Some(OptimizationStatesTracker) if optimization states were tracked, otherwise None - */ - def getStatesTracker: OptimizationStatesTracker = optimizer.getStateTracker - /** * Create a default generalized linear model with 0-valued coefficients * @@ -98,7 +91,7 @@ protected[ml] abstract class GeneralizedLinearOptimizationProblem[Objective <: O * @param input The training data * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight */ - def run(input: objectiveFunction.Data): GeneralizedLinearModel + def run(input: objectiveFunction.Data): (GeneralizedLinearModel, OptimizationStatesTracker) /** * Run the optimization algorithm on the input data, starting from the initial model provided. 
@@ -107,7 +100,7 @@ protected[ml] abstract class GeneralizedLinearOptimizationProblem[Objective <: O * @param initialModel The initial model from which to begin optimization * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight */ - def run(input: objectiveFunction.Data, initialModel: GeneralizedLinearModel): GeneralizedLinearModel + def run(input: objectiveFunction.Data, initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) /** * Compute the regularization term value diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/RandomEffectOptimizationTracker.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/RandomEffectOptimizationTracker.scala index f36c0296..bc53ce61 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/RandomEffectOptimizationTracker.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/RandomEffectOptimizationTracker.scala @@ -61,7 +61,11 @@ object RandomEffectOptimizationTracker{ val convergenceReasons = optimizationStatesTrackers .map { optimizationStatesTracker => - (optimizationStatesTracker.convergenceReason.getOrElse(DidNotConverge), 1) + val reason = optimizationStatesTracker + .convergenceReason + .getOrElse(DidNotConverge) + + (reason, 1) } .reduceByKey(_ + _) .collectAsMap() diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala index c5875a8b..97d03c36 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala @@ -15,7 +15,6 @@ package com.linkedin.photon.ml.optimization import breeze.linalg.{Vector, cholesky, diag} - import 
com.linkedin.photon.ml.constants.MathConst import com.linkedin.photon.ml.data.LabeledPoint import com.linkedin.photon.ml.function._ @@ -74,28 +73,17 @@ protected[ml] class SingleNodeOptimizationProblem[Objective <: SingleNodeObjecti } /** - * Run the optimization algorithm on the input data, starting from an initial model of all-0 coefficients. - * - * @param input The training data - * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight - */ - override def run(input: Iterable[LabeledPoint]): GeneralizedLinearModel = - run(input, initializeZeroModel(input.head.features.size)) - - /** - * Run the optimization algorithm on the input data, starting from the initial model provided. + * Run the algorithm with the configured parameters, starting from the initial model provided + * (warm start in iterations over the regularization weights for hyperparameter tuning). * * @param input The training data - * @param initialModel The initial model from which to begin optimization - * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight + * @return The learned [[GeneralizedLinearModel]] */ - override def run(input: Iterable[LabeledPoint], initialModel: GeneralizedLinearModel): GeneralizedLinearModel = { + override def run(input: Iterable[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = { - val normalizationContext = optimizer.getNormalizationContext - val (optimizedCoefficients, _) = optimizer.optimize(objectiveFunction, initialModel.coefficients.means)(input) - val optimizedVariances = computeVariances(input, optimizedCoefficients) + val (optimizedCoefficients, stateTracker) = optimizer.optimize(objectiveFunction, Vector.zeros[Double](input.head.features.length))(input) - createModel(normalizationContext, optimizedCoefficients, optimizedVariances) + (createModel(optimizedCoefficients), stateTracker) } } diff --git 
a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala index c0a0201e..691e5cc8 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala @@ -16,9 +16,11 @@ package com.linkedin.photon.ml.optimization.game import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Types.REId +import com.linkedin.photon.ml.Constants +import com.linkedin.photon.ml.Types.{REId, REType} import com.linkedin.photon.ml.function.SingleNodeObjectiveFunction import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.normalization.NormalizationContext @@ -124,8 +126,6 @@ object RandomEffectOptimizationProblem { * * @tparam RandomEffectObjective The type of objective function used to solve individual random effect optimization * problems - * @param linearSubspaceProjectorsRDD The per-entity [[LinearSubspaceProjector]] objects used to compress the - * per-entity feature spaces * @param configuration The optimization problem configuration * @param objectiveFunctionFactory The objective function to optimize * @param glmConstructor The function to use for producing GLMs from trained coefficients @@ -135,7 +135,8 @@ object RandomEffectOptimizationProblem { * @return A new [[RandomEffectOptimizationProblem]] object */ def apply[RandomEffectObjective <: SingleNodeObjectiveFunction]( - linearSubspaceProjectorsRDD: RDD[(REId, LinearSubspaceProjector)], + data: DataFrame, + rEType: REType, configuration: RandomEffectOptimizationConfiguration, objectiveFunctionFactory: Option[Int] => RandomEffectObjective, glmConstructor: Coefficients => 
GeneralizedLinearModel, @@ -143,30 +144,26 @@ object RandomEffectOptimizationProblem { varianceComputationType: VarianceComputationType = VarianceComputationType.NONE, interceptIndexOpt: Option[Int]): RandomEffectOptimizationProblem[RandomEffectObjective] = { + val factors = normalizationContext.factorsOpt + val shiftsAndIntercept = normalizationContext.shiftsAndInterceptOpt + val projectedNormalizationContext = new NormalizationContext(factors, shiftsAndIntercept) + // Generate new NormalizationContext and SingleNodeOptimizationProblem objects - val optimizationProblems = linearSubspaceProjectorsRDD - .mapValues { projector => - val factors = normalizationContext.factorsOpt.map(factors => projector.projectForward(factors)) - val shiftsAndIntercept = normalizationContext - .shiftsAndInterceptOpt - .map { case (shifts, intercept) => - val newShifts = projector.projectForward(shifts) - val newIntercept = projector.originalToProjectedSpaceMap(intercept) - - (newShifts, newIntercept) - } - val projectedNormalizationContext = new NormalizationContext(factors, shiftsAndIntercept) - val projectedInterceptOpt = interceptIndexOpt.map { interceptIndex => - projector.originalToProjectedSpaceMap(interceptIndex) - } - - // TODO: Broadcast arguments to SingleNodeOptimizationProblem? 
- SingleNodeOptimizationProblem( + val optimizationProblems = data + .select(rEType, Constants.UNIQUE_SAMPLE_ID) + .groupBy(rEType) + .count + .rdd + .map { row => + val reid = row.getInt(0).toString + val problem = SingleNodeOptimizationProblem( configuration, - objectiveFunctionFactory(projectedInterceptOpt), + objectiveFunctionFactory(interceptIndexOpt), glmConstructor, PhotonNonBroadcast(projectedNormalizationContext), varianceComputationType) + + (reid, problem) } new RandomEffectOptimizationProblem(optimizationProblems, glmConstructor) diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/util/Utils.scala similarity index 92% rename from photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala rename to photon-api/src/main/scala/com/linkedin/photon/ml/util/Utils.scala index 76f92a23..5096b19c 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/util/Utils.scala @@ -1,33 +1,12 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- */ package com.linkedin.photon.ml.util -import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Number => JNumber, Object => JObject, String => JString} - -import scala.collection.JavaConverters._ - import org.apache.avro.generic.GenericRecord import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.evaluation.EvaluatorType._ -import com.linkedin.photon.ml.evaluation.{MultiAUC, MultiPrecisionAtK, EvaluatorType} - -// TODO: Better documentation. +import com.linkedin.photon.ml.evaluation.{EvaluatorType, MultiAUC, MultiPrecisionAtK} /** * Some useful functions diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala index 94ca91e1..17e52425 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala @@ -21,6 +21,7 @@ import org.apache.spark.ml.linalg.{Vector => SparkMLVector} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators, Params} import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel +import org.apache.spark.sql.functions.monotonically_increasing_id import com.linkedin.photon.ml._ import com.linkedin.photon.ml.HyperparameterTunerName.HyperparameterTunerName @@ -359,9 +360,16 @@ object GameTrainingDriver extends GameDriver { val (trainingData, featureIndexMapLoaders) = Timed(s"Read training data") { readTrainingData(avroDataReader, featureIndexMapLoadersOpt) } + val gameTrainingData = Timed("Prepare GAME training data") { + trainingData.withColumn(Constants.UNIQUE_SAMPLE_ID, monotonically_increasing_id) + } + val validationData = Timed(s"Read validation data") { 
readValidationData(avroDataReader, featureIndexMapLoaders) } + val gameValidationData = Timed("Prepare GAME validation data") { + validationData.map(_.withColumn(Constants.UNIQUE_SAMPLE_ID, monotonically_increasing_id)) + } val interceptIndices = featureIndexMapLoaders.flatMap { case (coordinateId, indexMap) => indexMap.indexMapForDriver().getIndex(Constants.INTERCEPT_KEY) match { @@ -371,8 +379,8 @@ object GameTrainingDriver extends GameDriver { } } - trainingData.persist(StorageLevel.DISK_ONLY) - validationData.map(_.persist(StorageLevel.DISK_ONLY)) + gameTrainingData.persist(StorageLevel.DISK_ONLY) + gameValidationData.map(_.persist(StorageLevel.DISK_ONLY)) val modelOpt = get(modelInputDirectory).map { modelDir => Timed("Load model for warm-start training") { @@ -420,7 +428,7 @@ object GameTrainingDriver extends GameDriver { getOrDefault(inputColumnNames), getRequiredParam(featureShardConfigurations).keySet) - validationData match { + gameValidationData match { case Some(x) => DataValidators.sanityCheckDataFrameForTraining( x, getRequiredParam(trainingTask), @@ -470,17 +478,17 @@ object GameTrainingDriver extends GameDriver { } val explicitModels = Timed("Fit models") { - gameEstimator.fit(trainingData, validationData, gameOptimizationConfigs) + gameEstimator.fit(gameTrainingData, gameValidationData, gameOptimizationConfigs) } val tunedModels = Timed("Tune hyperparameters") { // Disable warm start for autotuning gameEstimator.setUseWarmStart(false) - runHyperparameterTuning(gameEstimator, trainingData, validationData, explicitModels) + runHyperparameterTuning(gameEstimator, gameTrainingData, gameValidationData, explicitModels) } - trainingData.unpersist() - validationData.map(_.unpersist()) + gameTrainingData.unpersist() + gameValidationData.map(_.unpersist()) val (outputModels, bestModel) = selectModels(explicitModels, tunedModels) diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala 
b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala index d20e1af5..088a71e0 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala @@ -14,71 +14,74 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.Dataset +import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.DatumScoringModel import com.linkedin.photon.ml.optimization.OptimizationTracker /** - * The optimization problem coordinate for each effect model. - * - * @tparam D The training dataset type - * @param dataset The training dataset - */ -protected[ml] abstract class Coordinate[D <: Dataset[D]](protected val dataset: D) { + * The optimization problem coordinate for each effect model. + * + */ +protected[ml] abstract class Coordinate { /** - * Update the coordinate with a new dataset. - * - * @param dataset The updated dataset - * @return A new coordinate with the updated dataset - */ - protected[algorithm] def updateCoordinateWithDataset(dataset: D): Coordinate[D] - - /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. - * - * @return A (updated model, optimization state tracking information) tuple - */ + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. + * + * @return A (updated model, optimization state tracking information) tuple + */ protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset with residuals from other - * coordinates. 
- * - * @param score The combined scores for each record of the other coordinates - * @return A (updated model, optimization state tracking information) tuple - */ - protected[algorithm] def trainModel(score: CoordinateDataScores): (DatumScoringModel, OptimizationTracker) = - updateCoordinateWithDataset(dataset.addScoresToOffsets(score)).trainModel() + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset with residuals from other + * coordinates. + * + * @param score The combined scores for each record of the other coordinates + * @return A (updated model, optimization state tracking information) tuple + */ + protected[algorithm] def trainModel(score: CoordinateDataScores): (DatumScoringModel, OptimizationTracker) = { + updateDataset(score) + trainModel() + } /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as - * a starting point. - * - * @param model The model to use as a starting point - * @return A (updated model, optimization state tracking information) tuple - */ + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as + * a starting point. + * + * @param model The model to use as a starting point + * @return A (updated model, optimization state tracking information) tuple + */ protected[algorithm] def trainModel(model: DatumScoringModel): (DatumScoringModel, OptimizationTracker) /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as - * a starting point and with residuals from other coordinates. - * - * @param model The existing model - * @param score The combined scores for each record of the other coordinates - * @return A (updated model, optimization state tracking information) tuple - */ + * Compute an optimized model (i.e. 
run the coordinate optimizer) for the current dataset using an existing model as + * a starting point and with residuals from other coordinates. + * + * @param model The existing model + * @param score The combined scores for each record of the other coordinates + * @return A (updated model, optimization state tracking information) tuple + */ protected[algorithm] def trainModel( - model: DatumScoringModel, - score: CoordinateDataScores): (DatumScoringModel, OptimizationTracker) = - updateCoordinateWithDataset(dataset.addScoresToOffsets(score)).trainModel(model) + model: DatumScoringModel, + score: CoordinateDataScores): (DatumScoringModel, OptimizationTracker) = { + updateDataset(score) + trainModel(model) + } /** - * Compute scores for the coordinate data using a given model. + * Generate a new dataset with updated offset. * - * @param model The input model - * @return The dataset scores + * @param scores The score dataset + * @return A new dataset with the updated offsets */ + protected def updateDataset(scores: CoordinateDataScores) + + /** + * Compute scores for the coordinate data using a given model. 
+ * + * @param model The input model + * @return The dataset scores + */ protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores } + diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala index bdf9bf2e..bde880c8 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala @@ -14,7 +14,6 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.Dataset import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.DatumScoringModel import com.linkedin.photon.ml.optimization.OptimizationTracker @@ -22,18 +21,10 @@ import com.linkedin.photon.ml.optimization.OptimizationTracker /** * The optimization problem coordinate for a pre-trained model. * - * @tparam D The training dataset type - * @param dataset The training dataset */ -abstract class ModelCoordinate[D <: Dataset[D]](dataset: D) extends Coordinate(dataset) { +abstract class ModelCoordinate extends Coordinate { - /** - * Update the coordinate with a new dataset. - * - * @param dataset The updated dataset - * @return A new coordinate with the updated dataset - */ - override protected[algorithm] def updateCoordinateWithDataset(dataset: D): Coordinate[D] = + override protected[algorithm] def updateDataset(scores: CoordinateDataScores) = throw new UnsupportedOperationException("Attempted to update model coordinate.") /** diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/Dataset.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/Dataset.scala deleted file mode 100644 index 0217b5af..00000000 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/Dataset.scala +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. 
- * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores -import com.linkedin.photon.ml.util.Summarizable - -/** - * Interface for GAME dataset implementations. - */ -protected[ml] trait Dataset[D <: Dataset[D]] extends Summarizable { - - /** - * Add residual scores to the data offsets. - * - * @param keyScore The residual scores - * @return The dataset with updated offsets - */ - def addScoresToOffsets(keyScore: CoordinateDataScores): D -} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/optimization/Optimizer.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/optimization/Optimizer.scala index 5266e03a..4ee7b1d9 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/optimization/Optimizer.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/optimization/Optimizer.scala @@ -162,7 +162,7 @@ abstract class Optimizer[-Function <: ObjectiveFunction]( protected[ml] def optimize( objectiveFunction: Function, initialCoefficients: Vector[Double])( - data: objectiveFunction.Data): (Vector[Double], Double) = { + data: objectiveFunction.Data): (Vector[Double], OptimizationStatesTracker) = { val normalizedInitialCoefficients = normalizationContext.value.modelToTransformedSpace(initialCoefficients) @@ -183,7 +183,7 @@ abstract class Optimizer[-Function <: ObjectiveFunction]( statesTracker.convergenceReason = getConvergenceReason val 
currState = getCurrentState.get - (currState.coefficients, currState.loss) + (currState.coefficients, statesTracker) } /** diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala index 4d5e1221..e5b23550 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala @@ -43,8 +43,8 @@ protected[ml] trait DownSampler { * @return The down-sampled dataset */ def downSample( - labeledPoints: RDD[(UniqueSampleId, LabeledPoint)], - seed: Long = getSeed): RDD[(UniqueSampleId, LabeledPoint)] + labeledPoints: RDD[LabeledPoint], + seed: Long = getSeed): RDD[LabeledPoint] } object DownSampler { From 4b373c64860843799fbacb7a058e6618e5041eb4 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Tue, 11 Feb 2020 20:43:41 -0800 Subject: [PATCH 03/11] fix compilation bugs --- .../ml/algorithm/CoordinateFactory.scala | 6 +-- .../ml/algorithm/FixedEffectCoordinate.scala | 6 +-- .../DistributedOptimizationProblem.scala | 30 ++++++++---- .../RandomEffectOptimizationTracker.scala | 6 +-- .../SingleNodeOptimizationProblem.scala | 18 +++++-- .../RandomEffectOptimizationProblem.scala | 6 +-- .../com/linkedin/photon/ml/Constants.scala | 48 ------------------- 7 files changed, 40 insertions(+), 80 deletions(-) delete mode 100644 photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala index 943c16bd..3104f00a 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateFactory.scala @@ -68,14 +68,12 @@ object CoordinateFactory { val lossFunctionFactory = 
lossFunctionFactoryConstructor(coordinateOptConfig) - var datasetName : String = "" (rETypeOpt, coordinateOptConfig, lossFunctionFactory) match { case ( None, fEOptConfig: FixedEffectOptimizationConfiguration, distributedLossFunctionFactory: DistributedObjectiveFunctionFactory) => - datasetName = "fixed-effect" val downSamplerOpt = if (DownSampler.isValidDownSamplingRate(fEOptConfig.downSamplingRate)) { Some(downSamplerFactory(fEOptConfig.downSamplingRate)) } else { @@ -102,8 +100,6 @@ object CoordinateFactory { rEOptConfig: RandomEffectOptimizationConfiguration, singleNodeLossFunctionFactory: SingleNodeObjectiveFunctionFactory) => - datasetName = "random-effect" - RandomEffectCoordinate( dataset, rEType, @@ -119,7 +115,7 @@ object CoordinateFactory { case _ => throw new UnsupportedOperationException( s"""Cannot build coordinate for the following input class combination: - | ${datasetName} + | ${rETypeOpt.getOrElse("fixed-effect")} | ${coordinateOptConfig.getClass.getName} | ${lossFunctionFactory.getClass.getName}""".stripMargin) } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala index 02ba85ab..3369045a 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala @@ -96,7 +96,7 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = model match { case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.score(dataset, fixedEffectModel) + FixedEffectCoordinate.score(dataset, fixedEffectModel) case _ => throw new UnsupportedOperationException( @@ -167,9 +167,7 @@ object FixedEffectCoordinate { fixedEffectDataset: DataFrame, fixedEffectModel: FixedEffectModel): 
CoordinateDataScores = { - //val modelBroadcast = fixedEffectModel.modelBroadcast - //val scores = fixedEffectDataset.mapValues { features => modelBroadcast.value.computeScore(features)} - //new CoordinateDataScores(scores) + // TODO: to move model to data frame null } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala index eb1ba1c1..b1a5a788 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala @@ -14,12 +14,10 @@ */ package com.linkedin.photon.ml.optimization -import breeze.linalg.{Vector => BVector} -import breeze.linalg.{cholesky, diag} +import breeze.linalg.{Vector, cholesky, diag} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Types.UniqueSampleId import com.linkedin.photon.ml.constants.MathConst import com.linkedin.photon.ml.data.LabeledPoint import com.linkedin.photon.ml.function.{DistributedObjectiveFunction, L2Regularization, TwiceDiffFunction} @@ -82,7 +80,7 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @param coefficients The feature coefficients means * @return An optional feature coefficient variances vector */ - override def computeVariances(input: RDD[LabeledPoint], coefficients: BVector[Double]): Option[BVector[Double]] = { + override def computeVariances(input: RDD[LabeledPoint], coefficients: Vector[Double]): Option[Vector[Double]] = { val result = (objectiveFunction, varianceComputation) match { case (twiceDiffFunc: TwiceDiffFunction, VarianceComputationType.SIMPLE) => @@ -111,15 +109,27 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @param input The training data * @return The learned 
[[GeneralizedLinearModel]] */ - override def run(input: RDD[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = { + override def run(input: RDD[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = run(input, initializeZeroModel(input.first.features.size)) - val (optimizedCoefficients, stateTracker) = optimizer.optimize( - objectiveFunction, - BVector.zeros[Double](input.first.features.length))( - input) + /** + * Run the algorithm with the configured parameters, starting from the initial model provided + * (warm start in iterations over the regularization weights for hyperparameter tuning). + * + * @param input The training data + * @param initialModel The initial model from which to begin optimization + * @return The learned [[GeneralizedLinearModel]] + */ + override def run( + input: RDD[LabeledPoint], + initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) = { + + val normalizationContext = optimizer.getNormalizationContext + val (optimizedCoefficients, stateTracker) = optimizer + .optimize(objectiveFunction, initialModel.coefficients.means)(input) + val optimizedVariances = computeVariances(input, optimizedCoefficients) - (createModel(optimizedCoefficients), stateTracker) + (createModel(normalizationContext, optimizedCoefficients, optimizedVariances), stateTracker) } /** diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/RandomEffectOptimizationTracker.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/RandomEffectOptimizationTracker.scala index bc53ce61..f36c0296 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/RandomEffectOptimizationTracker.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/RandomEffectOptimizationTracker.scala @@ -61,11 +61,7 @@ object RandomEffectOptimizationTracker{ val convergenceReasons = optimizationStatesTrackers .map { optimizationStatesTracker => - val reason = 
optimizationStatesTracker - .convergenceReason - .getOrElse(DidNotConverge) - - (reason, 1) + (optimizationStatesTracker.convergenceReason.getOrElse(DidNotConverge), 1) } .reduceByKey(_ + _) .collectAsMap() diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala index 97d03c36..aea3db19 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/SingleNodeOptimizationProblem.scala @@ -72,6 +72,16 @@ protected[ml] class SingleNodeOptimizationProblem[Objective <: SingleNodeObjecti None } + /** + * Run the optimization algorithm on the input data, starting from an initial model of all-0 coefficients. + * + * @param input The training data + * @return The learned GLM for the given optimization problem, data, regularization type, and regularization weight + */ + override def run(input: Iterable[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = + run(input, initializeZeroModel(input.head.features.size)) + + /** * Run the algorithm with the configured parameters, starting from the initial model provided * (warm start in iterations over the regularization weights for hyperparameter tuning). 
@@ -79,11 +89,13 @@ protected[ml] class SingleNodeOptimizationProblem[Objective <: SingleNodeObjecti * @param input The training data * @return The learned [[GeneralizedLinearModel]] */ - override def run(input: Iterable[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = { + override def run(input: Iterable[LabeledPoint], initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) = { - val (optimizedCoefficients, stateTracker) = optimizer.optimize(objectiveFunction, Vector.zeros[Double](input.head.features.length))(input) + val normalizationContext = optimizer.getNormalizationContext + val (optimizedCoefficients, stateTracker) = optimizer.optimize(objectiveFunction, initialModel.coefficients.means)(input) + val optimizedVariances = computeVariances(input, optimizedCoefficients) - (createModel(optimizedCoefficients), stateTracker) + (createModel(normalizationContext, optimizedCoefficients, optimizedVariances), stateTracker) } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala index 691e5cc8..0d8d7c8c 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala @@ -144,10 +144,6 @@ object RandomEffectOptimizationProblem { varianceComputationType: VarianceComputationType = VarianceComputationType.NONE, interceptIndexOpt: Option[Int]): RandomEffectOptimizationProblem[RandomEffectObjective] = { - val factors = normalizationContext.factorsOpt - val shiftsAndIntercept = normalizationContext.shiftsAndInterceptOpt - val projectedNormalizationContext = new NormalizationContext(factors, shiftsAndIntercept) - // Generate new NormalizationContext and SingleNodeOptimizationProblem objects 
val optimizationProblems = data .select(rEType, Constants.UNIQUE_SAMPLE_ID) @@ -160,7 +156,7 @@ object RandomEffectOptimizationProblem { configuration, objectiveFunctionFactory(interceptIndexOpt), glmConstructor, - PhotonNonBroadcast(projectedNormalizationContext), + PhotonNonBroadcast(normalizationContext), varianceComputationType) (reid, problem) diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala deleted file mode 100644 index c8bbf120..00000000 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml - -import org.joda.time.DateTimeZone - -import com.linkedin.photon.ml.util.Utils - -/** - * Some commonly used String constants. - */ -object Constants { - - /** - * Delimiter used to concatenate feature name and term into feature key. - * - * WARNING: This is not visible in println! - */ - val DELIMITER = "\u0001" - - /** - * Wildcard character used for specifying the feature constraints. Only the term is allowed to be a wildcard normally - * unless one wants to apply bounds to all features in which case both name and term can be specified as wildcards. - * Currently, we do not support wildcards in name alone. 
- */ - val WILDCARD = "*" - - val INTERCEPT_NAME = "(INTERCEPT)" - val INTERCEPT_TERM = "" - val INTERCEPT_KEY = Utils.getFeatureKey(INTERCEPT_NAME, INTERCEPT_TERM) - - /** - * Default time zone for relative date calculations - */ - val DEFAULT_TIME_ZONE = DateTimeZone.UTC -} From ace606f77303a995e788e31c8f94298067a4c418 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Tue, 11 Feb 2020 22:06:32 -0800 Subject: [PATCH 04/11] Fix errors in CoordinateDescent --- .../com/linkedin/photon/ml/Constants.scala | 17 +++- .../linkedin/photon/ml/ModelTraining.scala | 8 +- .../ml/algorithm/RandomEffectCoordinate.scala | 2 +- .../ml/data/CoordinateDataConfiguration.scala | 4 +- .../photon/ml/estimators/GameEstimator.scala | 92 ++++++------------- .../ml/algorithm/CoordinateDescent.scala | 51 +++++----- 6 files changed, 80 insertions(+), 94 deletions(-) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala index 2da28880..ee61acd4 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala @@ -1,3 +1,18 @@ +/* + * Copyright 2017 LinkedIn Corp. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. You may obtain a + * copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ */ + package com.linkedin.photon.ml import org.joda.time.DateTimeZone @@ -33,4 +48,4 @@ object Constants { val DEFAULT_TIME_ZONE = DateTimeZone.UTC val UNIQUE_SAMPLE_ID = "uniqueId" -} \ No newline at end of file +} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/ModelTraining.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/ModelTraining.scala index 8c3457ad..81a8370f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/ModelTraining.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/ModelTraining.scala @@ -184,7 +184,7 @@ object ModelTraining extends Logging { // Initialize the list with the result from the first regularization weight optimizationProblem.updateRegularizationWeight(currentWeight) - val glm = if (numWarmStartModels == 0) { + val (glm, stateTracker) = if (numWarmStartModels == 0) { logger.info(s"No warm start model found; beginning training with a 0-coefficients model") @@ -199,14 +199,14 @@ object ModelTraining extends Logging { optimizationProblem.run(trainingData, warmStartModels(maxLambda)) } - List((currentWeight, glm, optimizationProblem.getStatesTracker)) + List((currentWeight, glm, stateTracker)) case (latestWeightsModelsAndTrackers, currentWeight) => optimizationProblem.updateRegularizationWeight(currentWeight) // Train the rest of the models - val glm = if (useWarmStart) { + val (glm, stateTracker) = if (useWarmStart) { val previousModel = latestWeightsModelsAndTrackers.head._2 logger.info(s"Training model with regularization weight $currentWeight started (warm start)") @@ -219,7 +219,7 @@ object ModelTraining extends Logging { optimizationProblem.run(trainingData) } - (currentWeight, glm, optimizationProblem.getStatesTracker) +: latestWeightsModelsAndTrackers + (currentWeight, glm, stateTracker) +: latestWeightsModelsAndTrackers } broadcastNormalizationContext.unpersist() diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala 
b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala index 669b7804..06708f87 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala @@ -270,7 +270,7 @@ object RandomEffectCoordinate { */ protected[algorithm] def trainModel[Function <: SingleNodeObjectiveFunction]( randomEffectDataset: DataFrame, - randomEffectType:REType, + randomEffectType: REType, featureShardId: FeatureShardId, randomEffectOptimizationProblem: RandomEffectOptimizationProblem[Function], initialRandomEffectModelOpt: Option[RandomEffectModel]): (RandomEffectModel, RandomEffectOptimizationTracker) = { diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/CoordinateDataConfiguration.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/CoordinateDataConfiguration.scala index f1a2642c..1a06a51b 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/CoordinateDataConfiguration.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/data/CoordinateDataConfiguration.scala @@ -28,7 +28,7 @@ sealed trait CoordinateDataConfiguration { } /** - * Configuration needed in order to generate a [[com.linkedin.photon.ml.data.FixedEffectDataset]]. + * Configuration needed in order to generate a FixedEffectCoordinate. * * @param featureShardId Key of the feature shard used to generate the dataset * @param minNumPartitions Minimum number of data partitions @@ -39,7 +39,7 @@ case class FixedEffectDataConfiguration( extends CoordinateDataConfiguration /** - * Configurations needed in order to generate a [[com.linkedin.photon.ml.data.RandomEffectDataset]]. + * Configurations needed in order to generate a RandomEffectCoordinate. 
* * @param randomEffectType The corresponding random effect type of the dataset * @param featureShardId Key of the feature shard used to generate the dataset diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala index 627ae6c5..3fd795e2 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala @@ -15,6 +15,7 @@ package com.linkedin.photon.ml.estimators import scala.language.existentials +import scala.util.Random import org.apache.commons.cli.MissingArgumentException import org.apache.spark.SparkContext @@ -22,6 +23,7 @@ import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{udf, col} import org.apache.spark.storage.StorageLevel import org.slf4j.Logger @@ -302,14 +304,14 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P * Fits a GAME model to the training dataset, once per configuration. 
* * @param data The training set - * @param validationData Optional validation set for per-iteration validation + * @param validationDataOpt Optional validation set for per-iteration validation * @param optimizationConfigurations A set of GAME optimization configurations * @return A set of (trained GAME model, optional evaluation results, GAME model configuration) tuples, one for each * configuration */ def fit( data: DataFrame, - validationData: Option[DataFrame], + validationDataOpt: Option[DataFrame], optimizationConfigurations: Seq[GameOptimizationConfiguration]): Seq[GameResult] = { // Verify valid GameEstimator settings @@ -319,17 +321,15 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P validateInput(optimizationConfigurations) // Transform the GAME validation data set into fixed and random effect specific data sets - val validationDatasetAndEvaluationSuiteOpt = Timed("Prepare validation data, if any") { - prepareValidationDatasetAndEvaluators( - validationData, - featureShards, // TO BE CORRECTED - additionalCols) // TO BE CORRECTED + val evaluationSuiteOpt = Timed("Prepare validation data, if any") { + validationDataOpt.map { case validData => prepareValidationEvaluators(validData) } } val coordinateDescent = new CoordinateDescent( getRequiredParam(coordinateUpdateSequence), getOrDefault(coordinateDescentIterations), - validationDatasetAndEvaluationSuiteOpt, + validationDataOpt, + evaluationSuiteOpt, getOrDefault(partialRetrainLockedCoordinates), logger) @@ -345,7 +345,6 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P val (gameModel, evaluations) = train( data, optimizationConfiguration, - //trainingDatasets, coordinateDescent, prevGameModel) @@ -355,8 +354,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P } } - validationDatasetAndEvaluationSuiteOpt.map { case (validationDataset, evaluationSuite) => - validationDataset.unpersist() + 
evaluationSuiteOpt.map { case evaluationSuite => evaluationSuite.unpersistRDD() } @@ -399,55 +397,22 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P } } - - /** - * Optionally construct an [[RDD]] of validation data samples, and an [[EvaluationSuite]] to compute evaluation metrics - * over the validation data. - * - * @param dataOpt Optional [[DataFrame]] of validation data - * @param featureShards The feature shard columns to import from the [[DataFrame]] - * @param additionalCols A set of additional columns whose values should be maintained for validation evaluation - * @return An optional ([[RDD]] of validation data, validation metric [[EvaluationSuite]]) tuple - */ - protected def prepareValidationDatasetAndEvaluators( - dataOpt: Option[DataFrame], - featureShards: Set[FeatureShardId], - additionalCols: Set[String]): Option[(RDD[(UniqueSampleId, GameDatum)], EvaluationSuite)] = - - dataOpt.map { data => - val partitioner = new LongHashPartitioner(data.rdd.partitions.length) - val gameDataset = Timed("Convert validation data from raw DataFrame to processed RDD of GAME data") { - GameConverters - .getGameDatasetFromDataFrame( - data, - featureShards, - additionalCols, - isResponseRequired = true, - getOrDefault(inputColumnNames)) - .partitionBy(partitioner) - .setName("Validation Game dataset") - .persist(StorageLevel.DISK_ONLY) - } - val evaluationSuite = Timed("Prepare validation metric evaluators") { - prepareValidationEvaluators(gameDataset) - } - - (gameDataset, evaluationSuite) - } - /** * Construct the validation [[EvaluationSuite]]. 
* - * @param gameDataset An [[RDD]] of validation data samples + * @param dataset An [[RDD]] of validation data samples * @return [[EvaluationSuite]] containing one or more validation metric [[Evaluator]] objects */ - protected def prepareValidationEvaluators(gameDataset: RDD[(UniqueSampleId, GameDatum)]): EvaluationSuite = { + protected def prepareValidationEvaluators(dataset: DataFrame): EvaluationSuite = { + + val columnsNames = getOrDefault(inputColumnNames) + val response = columnsNames(InputColumnsNames.RESPONSE) + val offset = columnsNames(InputColumnsNames.OFFSET) + val weight = columnsNames(InputColumnsNames.WEIGHT) + val validatingLabelsAndOffsetsAndWeights = dataset.select(response, offset, weight) - val validatingLabelsAndOffsetsAndWeights = gameDataset.mapValues { gameData => - (gameData.response, gameData.offset, gameData.weight) - } val evaluators = get(validationEvaluators) - .map(_.map(EvaluatorFactory.buildEvaluator(_, gameDataset))) + .map(_.map(EvaluatorFactory.buildEvaluator(_, dataset))) //TODO: fix the errors .getOrElse { // Get default evaluators given the task type val taskType = getRequiredParam(trainingTask) @@ -460,16 +425,18 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P Seq(defaultEvaluator) } - val evaluationSuite = EvaluationSuite(evaluators, validatingLabelsAndOffsetsAndWeights) + val evaluationSuite = EvaluationSuite(evaluators, validatingLabelsAndOffsetsAndWeights) //TODO: fix the errors .setName(s"Evaluation: validation data labels, offsets, and weights") .persistRDD(StorageLevel.MEMORY_AND_DISK) if (logger.isDebugEnabled) { - val randomScores = gameDataset.mapValues(_ => math.random).persist() + val randUdf = udf({() => Random.nextInt()}) + val randomScores = dataset.withColumn("score", randUdf()).select("score") + randomScores.persist() evaluationSuite - .evaluate(randomScores) + .evaluate(randomScores) //TODO: fix the errors .evaluations .foreach { case (evaluator, evaluation) => 
logger.debug(s"Random guessing baseline for evaluation metric '${evaluator.name}': $evaluation") @@ -490,6 +457,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P * with the most general 'coordinates' and end with the least general - each successive update learning the residuals * of the previous 'coordinates'. * + * @param data Input training data set * @param configuration The configuration for the GAME optimization problem * @param coordinateDescent The coordinate descent driver * @param initialModelOpt An optional existing GAME model who's components should be used to warm-start training @@ -524,14 +492,14 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P val interceptIndices = getOrDefault(coordinateInterceptIndices) // Create the optimization coordinates for each component model - val coordinates: Map[CoordinateId, C forSome { type C <: Coordinate[_] }] = + val coordinates: Map[CoordinateId, C forSome { type C <: Coordinate }] = updateSequence .map { coordinateId => - val coordinate: C forSome { type C <: Coordinate[_] } = if (lockedCoordinates.contains(coordinateId)) { - trainingDatasets(coordinateId) match { - case feDataset: FixedEffectDataset => new FixedEffectModelCoordinate(feDataset) - case reDataset: RandomEffectDataset => new RandomEffectModelCoordinate(reDataset) - case dataset => throw new UnsupportedOperationException(s"Unsupported dataset type: ${dataset.getClass}") + val coordinate: C forSome { type C <: Coordinate } = if (lockedCoordinates.contains(coordinateId)) { + dataConfigs(coordinateId) match { + case _: FixedEffectDataConfiguration => new FixedEffectModelCoordinate(data) + case _: RandomEffectDataConfiguration => new RandomEffectModelCoordinate(data) + case oConfig => throw new UnsupportedOperationException(s"Unsupported coordinate type: ${oConfig.getClass}") } } else { CoordinateFactory.build( diff --git 
a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala index 109af0c6..9b028901 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala @@ -16,13 +16,11 @@ package com.linkedin.photon.ml.algorithm import scala.collection.mutable -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.storage.StorageLevel import org.slf4j.Logger import com.linkedin.photon.ml.Types.{CoordinateId, UniqueSampleId} -import com.linkedin.photon.ml.data.GameDatum import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.evaluation.{EvaluationResults, EvaluationSuite, EvaluatorType} import com.linkedin.photon.ml.model.{DatumScoringModel, GameModel} @@ -35,15 +33,17 @@ import com.linkedin.photon.ml.util.Timed * * @param updateSequence The order in which to update coordinates * @param descentIterations Number of coordinate descent iterations (updates to each coordinate in order) - * @param validationDataAndEvaluationSuiteOpt Optional validation data and [[EvaluationSuite]] of validation metric - * [[com.linkedin.photon.ml.evaluation.Evaluator]] objects + * @param validationOpt Optional validation data + * @param evaluationSuiteOpt Optional [[EvaluationSuite]] of validation metric + * [[com.linkedin.photon.ml.evaluation.Evaluator]] objects * @param lockedCoordinates Set of locked coordinates within the initial model for performing partial retraining * @param logger A logger instance */ class CoordinateDescent( updateSequence: Seq[CoordinateId], descentIterations: Int, - validationDataAndEvaluationSuiteOpt: Option[(RDD[(UniqueSampleId, GameDatum)], EvaluationSuite)], + validationOpt: Option[DataFrame], + evaluationSuiteOpt: 
Option[EvaluationSuite], lockedCoordinates: Set[CoordinateId], implicit private val logger: Logger) { @@ -98,7 +98,7 @@ class CoordinateDescent( * @param initialModelsOpt An optional map of existing models */ private def checkInput( - coordinates: Map[CoordinateId, Coordinate[_]], + coordinates: Map[CoordinateId, Coordinate], initialModelsOpt: Option[Map[CoordinateId, DatumScoringModel]]): Unit = { // All coordinates in the update sequence must be passed as input @@ -130,7 +130,7 @@ class CoordinateDescent( * at the conclusion of coordinate descent). */ def run( - coordinates: Map[CoordinateId, Coordinate[_]], + coordinates: Map[CoordinateId, Coordinate], initialModelsOpt: Option[Map[CoordinateId, DatumScoringModel]]): (GameModel, Option[EvaluationResults]) = { checkInput(coordinates, initialModelsOpt) @@ -145,10 +145,12 @@ class CoordinateDescent( coordinateId, coordinates(coordinateId), initialModels.get(coordinateId), - validationDataAndEvaluationSuiteOpt) + validationOpt, + evaluationSuiteOpt) - } else if (validationDataAndEvaluationSuiteOpt.isDefined) { - val (validationData, evaluationSuite) = validationDataAndEvaluationSuiteOpt.get + } else if (validationOpt.isDefined && evaluationSuiteOpt.isDefined) { + val validationData = validationOpt.get + val evaluationSuite = evaluationSuiteOpt.get val (model, evaluationsResults) = descendWithValidation( coordinates, updateSequence, @@ -182,7 +184,7 @@ object CoordinateDescent { */ protected[algorithm] def trainCoordinateModel( coordinateId: CoordinateId, - coordinate: Coordinate[_], + coordinate: Coordinate, iteration: Int, initialModelOpt: Option[DatumScoringModel], residualsOpt: Option[CoordinateDataScores])( @@ -279,7 +281,7 @@ object CoordinateDescent { */ protected[algorithm] def trainOrFetchCoordinateModel( coordinateId: CoordinateId, - coordinate: Coordinate[_], + coordinate: Coordinate, coordinatesToTrain: Seq[CoordinateId], initialModelOpt: Option[DatumScoringModel], residualsOpt: 
Option[CoordinateDataScores])( @@ -311,12 +313,12 @@ object CoordinateDescent { */ protected[algorithm] def evaluateModel( modelToEvaluate: DatumScoringModel, - validationData: RDD[(UniqueSampleId, GameDatum)], + validationData: DataFrame, evaluationSuite: EvaluationSuite)( implicit logger: Logger): EvaluationResults = Timed("Validate GAME model") { val validatingScores = Timed(s"Compute validation scores") { - modelToEvaluate.scoreForCoordinateDescent(validationData) + modelToEvaluate.scoreForCoordinateDescent(validationData) // TODO: to fix the error } Timed(s"Compute evaluation metrics") { @@ -371,7 +373,7 @@ object CoordinateDescent { * @return A new [[GameModel]] */ private def descend( - coordinates: Map[CoordinateId, Coordinate[_]], + coordinates: Map[CoordinateId, Coordinate], updateSequence: Seq[CoordinateId], coordinatesToTrain: Seq[CoordinateId], iterations: Int, @@ -491,12 +493,12 @@ object CoordinateDescent { * @return A (new [[GameModel]], model [[EvaluationResults]]) tuple */ private def descendWithValidation( - coordinates: Map[CoordinateId, Coordinate[_]], + coordinates: Map[CoordinateId, Coordinate], updateSequence: Seq[CoordinateId], coordinatesToTrain: Seq[CoordinateId], iterations: Int, initialModels: Map[CoordinateId, DatumScoringModel], - validationData: RDD[(UniqueSampleId, GameDatum)], + validationData: DataFrame, evaluationSuite: EvaluationSuite)( implicit logger: Logger): (GameModel, EvaluationResults) = { @@ -645,24 +647,25 @@ object CoordinateDescent { * @param coordinateId The ID of the single coordinate for which to train a new model * @param coordinate The [[Coordinate]] for which to train a new model * @param initialModelOpt An optional existing model to use for warm-start training - * @param validationDataAndEvaluationSuiteOpt An optional (validation data, set of evaluation metrics to compute) - * tuple + * @param validationOpt An optional validation data + * @param evaluationSuiteOpt An optional set of evaluation metrics to 
compute tuple * @param logger An implicit logger * @return A (new [[GameModel]], optional model [[EvaluationResults]]) tuple */ private def descendSingleCoordinate( coordinateId: CoordinateId, - coordinate: Coordinate[_], + coordinate: Coordinate, initialModelOpt: Option[DatumScoringModel], - validationDataAndEvaluationSuiteOpt: Option[(RDD[(UniqueSampleId, GameDatum)], EvaluationSuite)])( + validationOpt: Option[DataFrame], + evaluationSuiteOpt: Option[EvaluationSuite])( implicit logger: Logger): (GameModel, Option[EvaluationResults]) = { val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, residualsOpt = None) persistModel(newModel, coordinateId, iteration = 1) - val evaluationResultsOpt = validationDataAndEvaluationSuiteOpt.map { case (validationData, evaluationSuite) => - evaluateModel(newModel, validationData, evaluationSuite) + val evaluationResultsOpt = validationOpt.map { case validationData => + evaluateModel(newModel, validationData, evaluationSuiteOpt.get) } (new GameModel(Map(coordinateId -> newModel)), evaluationResultsOpt) From 0ce3fd2103b528229cc9d9f81cb0e074edeabca3 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Wed, 12 Feb 2020 10:26:06 -0800 Subject: [PATCH 05/11] 1. Models are changed to use Dataframe 2. Scores are changed to use Dataframe 3. Residuals will be computed by using a UDF on the training DataFrame. For random effects, the per-entity models will first need to be joined to the DataFrame by REID. A single UDF will do all scoring for fixed and random effects at once. 
--- .../com/linkedin/photon/ml/Constants.scala | 4 +- .../ml/algorithm/FixedEffectCoordinate.scala | 54 ++--- .../FixedEffectModelCoordinate.scala | 7 +- .../ml/algorithm/RandomEffectCoordinate.scala | 70 ++---- .../photon/ml/estimators/GameEstimator.scala | 8 +- .../photon/ml/model/FixedEffectModel.scala | 53 ++--- .../photon/ml/model/RandomEffectModel.scala | 204 ++++++++---------- .../model/GeneralizedLinearModel.scala | 33 +++ .../ml/algorithm/CoordinateDescent.scala | 10 +- .../photon/ml/constants/DataConst.scala | 23 ++ .../data/scoring/CoordinateDataScores.scala | 63 ++++-- .../photon/ml/data/scoring/DataScores.scala | 73 +++---- .../ml/data/scoring/ModelDataScores.scala | 106 --------- .../photon/ml/model/Coefficients.scala | 2 +- .../photon/ml/model/DatumScoringModel.scala | 16 +- .../linkedin/photon/ml/model/GameModel.scala | 12 +- 16 files changed, 308 insertions(+), 430 deletions(-) create mode 100644 photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala delete mode 100644 photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/ModelDataScores.scala diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala index ee61acd4..95ee568c 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala @@ -16,7 +16,7 @@ package com.linkedin.photon.ml import org.joda.time.DateTimeZone - +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.util.Utils /** @@ -47,5 +47,5 @@ object Constants { */ val DEFAULT_TIME_ZONE = DateTimeZone.UTC - val UNIQUE_SAMPLE_ID = "uniqueId" + val UNIQUE_SAMPLE_ID = DataConst.ID } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala index 3369045a..0310bfa5 100644 --- 
a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala @@ -15,27 +15,28 @@ package com.linkedin.photon.ml.algorithm import org.apache.spark.ml.linalg.{Vector => SparkVector} -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.types.{DataTypes, StructField, StructType} -import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.storage.StorageLevel import com.linkedin.photon.ml.Constants -import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} +import com.linkedin.photon.ml.Types.FeatureShardId +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.function.DistributedObjectiveFunction import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} import com.linkedin.photon.ml.optimization.{DistributedOptimizationProblem, FixedEffectOptimizationTracker, OptimizationTracker} +import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.util.VectorUtils - /** * The optimization problem coordinate for a fixed effect model. 
* * @tparam Objective The type of objective function used to solve the fixed effect optimization problem * @param rawData The raw training data * @param optimizationProblem The fixed effect optimization problem + * @param inputColumnsNames */ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunction]( rawData: DataFrame, @@ -45,23 +46,16 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct extends Coordinate { var dataset: DataFrame = - rawData.select(featureShardId, inputColumnsNames(InputColumnsNames.RESPONSE)) + rawData + .select(Constants.UNIQUE_SAMPLE_ID, featureShardId, inputColumnsNames(InputColumnsNames.RESPONSE)) + .withColumn(inputColumnsNames(InputColumnsNames.OFFSET), lit(0.0)) override protected def updateDataset(scores: CoordinateDataScores) = { - // TODO: change scores to dataframe - val schemaFields = Array[StructField]( - StructField(Constants.UNIQUE_SAMPLE_ID, DataTypes.LongType, nullable = false), - StructField("score", DataTypes.DoubleType, nullable = false)) - dataset = SparkSession - .builder - .getOrCreate - .createDataFrame(scores.scoresRdd.map(Row.fromTuple(_)), new StructType(schemaFields)) + dataset = scores.scores .join(rawData, Constants.UNIQUE_SAMPLE_ID) - // TODO: WHAT IF OFFSET DOESN'T EXIST - //.withColumnRenamed("score", inputColumnsNames(InputColumnsNames.OFFSET)) .withColumn(inputColumnsNames(InputColumnsNames.OFFSET), - col(inputColumnsNames(InputColumnsNames.OFFSET)) + col("score")) + col(inputColumnsNames(InputColumnsNames.OFFSET)) + col(DataConst.SCORE)) } @@ -79,7 +73,7 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct dataset, optimizationProblem, featureShardId, - Some(model)) + Some(fixedEffectModel)) case _ => throw new UnsupportedOperationException( @@ -96,7 +90,7 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct override protected[algorithm] def score(model: DatumScoringModel): 
CoordinateDataScores = model match { case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.score(dataset, fixedEffectModel) + FixedEffectCoordinate.score(dataset, fixedEffectModel, featureShardId) case _ => throw new UnsupportedOperationException( @@ -153,21 +147,19 @@ object FixedEffectCoordinate { new FixedEffectOptimizationTracker(stateTracker)) } - /** - * Score a dataset using a given [[FixedEffectModel]]. + * Compute scores given a training dataset and a fixed effect model * - * @note The score is the dot product of the model coefficients with the feature values (i.e., it does not go - * through a non-linear link function). - * @param fixedEffectDataset The dataset to score - * @param fixedEffectModel The model used to score the dataset + * @param dataset The dataset to score + * @param fixedEffectModel The model used to score the dataset + * @param featureShardId The ID of the feature shard for the training data * @return The computed scores */ - protected[algorithm] def score( - fixedEffectDataset: DataFrame, - fixedEffectModel: FixedEffectModel): CoordinateDataScores = { - - // TODO: to move model to data frame - null + def score(dataset: DataFrame, fixedEffectModel: FixedEffectModel, featureShardId: FeatureShardId): CoordinateDataScores = { + val cofs = VectorUtils.breezeToMl(fixedEffectModel.model.coefficients.means) + val scores = dataset + .withColumn(DataConst.SCORE, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) + .select(Constants.UNIQUE_SAMPLE_ID, DataConst.SCORE) + new CoordinateDataScores(scores) } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala index 1316357d..9a78b2e7 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala +++ 
b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala @@ -15,6 +15,8 @@ package com.linkedin.photon.ml.algorithm import org.apache.spark.sql.DataFrame + +import com.linkedin.photon.ml.Types.FeatureShardId import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} @@ -22,8 +24,9 @@ import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} * The optimization problem coordinate for a pre-trained fixed effect model. * * @param dataset The training dataset + * @param featureShardId The ID of the feature shard for the training data */ -class FixedEffectModelCoordinate(dataset: DataFrame) extends ModelCoordinate { +class FixedEffectModelCoordinate(dataset: DataFrame, featureShardId: FeatureShardId) extends ModelCoordinate { /** * Score the effect-specific dataset in the coordinate with the input model. @@ -34,7 +37,7 @@ class FixedEffectModelCoordinate(dataset: DataFrame) extends ModelCoordinate { override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = { model match { case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.score(dataset, fixedEffectModel) + FixedEffectCoordinate.score(dataset, fixedEffectModel, featureShardId) case _ => throw new UnsupportedOperationException( diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala index 06708f87..fc977515 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala @@ -19,29 +19,30 @@ import scala.collection.mutable import org.apache.spark.SparkContext import org.apache.spark.ml.linalg.{Vector => SparkVector} import org.apache.spark.rdd.RDD -import 
org.apache.spark.sql.types.{DataTypes, StructField, StructType} -import org.apache.spark.sql.{DataFrame, Row, SparkSession, functions} -import org.apache.spark.storage.StorageLevel import org.apache.spark.sql.functions.col -import com.linkedin.photon.ml.normalization.NormalizationContext +import org.apache.spark.sql.{DataFrame, functions} +import org.apache.spark.storage.StorageLevel + import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.Types.{FeatureShardId, REType} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.function.SingleNodeObjectiveFunction import com.linkedin.photon.ml.model.{Coefficients, DatumScoringModel, RandomEffectModel} +import com.linkedin.photon.ml.normalization.NormalizationContext import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType -import com.linkedin.photon.ml.optimization.game.{RandomEffectOptimizationConfiguration, RandomEffectOptimizationProblem} import com.linkedin.photon.ml.optimization._ +import com.linkedin.photon.ml.optimization.game.{RandomEffectOptimizationConfiguration, RandomEffectOptimizationProblem} import com.linkedin.photon.ml.spark.RDDLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.util.VectorUtils /** * The optimization problem coordinate for a random effect model. 
* - * @param rEType - * @param rawData The raw training dataframe * @tparam Objective The type of objective function used to solve individual random effect optimization problems + * @param rEType The random effect type + * @param rawData The raw training dataframe * @param optimizationProblem The random effect optimization problem * @param inputColumnsNames */ @@ -74,20 +75,10 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct // Coordinate functions // override protected def updateDataset(scores: CoordinateDataScores) = { - - // TODO: change scores to dataframe - val schemaFields = Array[StructField]( - StructField(Constants.UNIQUE_SAMPLE_ID, DataTypes.LongType, nullable = false), - StructField("score", DataTypes.DoubleType, nullable = false)) - dataset = SparkSession - .builder - .getOrCreate - .createDataFrame(scores.scoresRdd.map(Row.fromTuple(_)), new StructType(schemaFields)) + dataset = scores.scores .join(rawData, Constants.UNIQUE_SAMPLE_ID) - // TODO: WHAT IF OFFSET DOESN'T EXIST - //.withColumnRenamed("score", inputColumnsNames(InputColumnsNames.OFFSET)) .withColumn(inputColumnsNames(InputColumnsNames.OFFSET), - col(inputColumnsNames(InputColumnsNames.OFFSET)) + col("score")) + col(inputColumnsNames(InputColumnsNames.OFFSET)) + col(DataConst.SCORE)) } /** @@ -309,8 +300,8 @@ object RandomEffectCoordinate { // Left join the models to data and optimization problems for cases where we have a prior model but no new data val (newModels, randomEffectOptimizationTracker) = initialRandomEffectModelOpt .map { randomEffectModel => - val modelsAndTrackers = randomEffectModel - .modelsRDD + val modelsRdd = randomEffectModel.toRDD() + val modelsAndTrackers = modelsRdd .leftOuterJoin(dataAndOptimizationProblems) .mapValues { case (localModel, Some((localDataset, optimizationProblem))) => @@ -343,7 +334,7 @@ object RandomEffectCoordinate { } val newRandomEffectModel = new RandomEffectModel( - newModels, + 
RandomEffectModel.toDataFrame(newModels), randomEffectType, featureShardId) @@ -366,39 +357,6 @@ object RandomEffectCoordinate { protected[algorithm] def score( randomEffectDataset: DataFrame, randomEffectModel: RandomEffectModel): CoordinateDataScores = { - - /* - // There may be more models than active data. However, since we're computing residuals for future coordinates, no - // data means no residual. Therefore, we use an inner join. Note that the active data and models use the same - // partitioner, but scores need to use GameDatum partitioner. - val activeScores = randomEffectDataset - .activeData - .join(randomEffectModel.modelsRDD) - .flatMap { case (_, (localDataset, model)) => - localDataset.dataPoints.map { case (uniqueId, labeledPoint) => - (uniqueId, model.computeScore(labeledPoint.features)) - } - } - .partitionBy(randomEffectDataset.uniqueIdPartitioner) - - // Passive data already uses the GameDatum partitioner. Note that this code assumes few (if any) entities have a - // passive dataset. 
- val passiveDataREIds = randomEffectDataset.passiveDataREIds - val modelsForPassiveData = randomEffectModel - .modelsRDD - .filter { case (reId, _) => - passiveDataREIds.value.contains(reId) - } - .collectAsMap() - val passiveScores = randomEffectDataset - .passiveData - .mapValues { case (randomEffectId, labeledPoint) => - modelsForPassiveData(randomEffectId).computeScore(labeledPoint.features) - } - - new CoordinateDataScores(activeScores ++ passiveScores) - - */ - return null + randomEffectModel.score(randomEffectDataset) } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala index 3fd795e2..b8c151bd 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala @@ -23,19 +23,19 @@ import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{udf, col} +import org.apache.spark.sql.functions.udf import org.apache.spark.storage.StorageLevel import org.slf4j.Logger import com.linkedin.photon.ml.TaskType import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.{CoordinateId, FeatureShardId, UniqueSampleId} +import com.linkedin.photon.ml.Types.CoordinateId import com.linkedin.photon.ml.algorithm._ import com.linkedin.photon.ml.data._ import com.linkedin.photon.ml.evaluation._ import com.linkedin.photon.ml.function.ObjectiveFunctionHelper import com.linkedin.photon.ml.function.glm._ -import com.linkedin.photon.ml.model.{GameModel, RandomEffectModel} +import com.linkedin.photon.ml.model.GameModel import com.linkedin.photon.ml.normalization._ import com.linkedin.photon.ml.optimization.VarianceComputationType import 
com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType @@ -497,7 +497,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P .map { coordinateId => val coordinate: C forSome { type C <: Coordinate } = if (lockedCoordinates.contains(coordinateId)) { dataConfigs(coordinateId) match { - case _: FixedEffectDataConfiguration => new FixedEffectModelCoordinate(data) + case _: FixedEffectDataConfiguration => new FixedEffectModelCoordinate(data, dataConfigs(coordinateId).featureShardId) case _: RandomEffectDataConfiguration => new RandomEffectModelCoordinate(data) case oConfig => throw new UnsupportedOperationException(s"Unsupported coordinate type: ${oConfig.getClass}") } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala index da2dae64..d8deb093 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala @@ -15,14 +15,17 @@ package com.linkedin.photon.ml.model import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, lit} +import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores, ModelDataScores} +import com.linkedin.photon.ml.Types.FeatureShardId +import com.linkedin.photon.ml.constants.DataConst +import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.spark.BroadcastLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel +import com.linkedin.photon.ml.util.VectorUtils /** * Representation of a fixed effect model. 
@@ -52,25 +55,12 @@ class FixedEffectModel( * @param dataPoints The dataset to score * @return The computed scores */ - override def score(dataPoints: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores = - FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId, ModelDataScores.toScore, ModelDataScores.apply) - - /** - * Compute the scores for the GAME dataset, and store the scores only. - * - * @note Use a static method to avoid serializing entire model object during RDD operations. - * @param dataPoints The dataset to score - * @return The computed scores - */ - override protected[ml] def scoreForCoordinateDescent(dataPoints: RDD[(UniqueSampleId, GameDatum)]): CoordinateDataScores = - FixedEffectModel.score( - dataPoints, - modelBroadcast, - featureShardId, - CoordinateDataScores.toScore, - CoordinateDataScores.apply) + override def score(dataPoints: DataFrame): CoordinateDataScores = { + val scores = FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId) + new CoordinateDataScores(scores) + } - /** + /** * Build a summary string for the coefficients. * * @return String representation @@ -115,22 +105,21 @@ object FixedEffectModel { /** * Compute the scores for the dataset. 
* - * @param dataPoints The dataset to score + * @param dataset The dataset to score * @param modelBroadcast The model to use for scoring * @param featureShardId The feature shard id * @return The scores */ - private def score[T, V]( - dataPoints: RDD[(UniqueSampleId, GameDatum)], + private def score( + dataset: DataFrame, modelBroadcast: Broadcast[GeneralizedLinearModel], - featureShardId: FeatureShardId, - toScore: (GameDatum, Double) => T, - toResult: RDD[(UniqueSampleId, T)] => V): V = { + featureShardId: FeatureShardId): DataFrame = { - val scores = dataPoints.mapValues { gameDatum => - toScore(gameDatum, modelBroadcast.value.computeScore(gameDatum.featureShardContainer(featureShardId))) - } + val cofs = VectorUtils.breezeToMl(modelBroadcast.value.coefficients.means) + val scores = dataset + .withColumn(DataConst.SCORE, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) + .select(Constants.UNIQUE_SAMPLE_ID, DataConst.SCORE) - toResult(scores) + scores } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala index 7ce23a66..4ed00514 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala @@ -14,33 +14,40 @@ */ package com.linkedin.photon.ml.model +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.rdd.RDD._ +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.functions.col import org.apache.spark.storage.StorageLevel -import org.apache.spark.{HashPartitioner, SparkContext} +import org.apache.spark.ml.linalg.{Vector => SparkVector} +import com.linkedin.photon.ml.{Constants, TaskType} import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.{UniqueSampleId, REId, REType, FeatureShardId} +import 
com.linkedin.photon.ml.Types.{FeatureShardId, REType} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores, ModelDataScores} +import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.spark.RDDLike +import com.linkedin.photon.ml.supervised.classification.{LogisticRegressionModel, SmoothedHingeLossLinearSVMModel} import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel +import com.linkedin.photon.ml.supervised.regression.{LinearRegressionModel, PoissonRegressionModel} +import com.linkedin.photon.ml.util.VectorUtils /** * Representation of a random effect model. * - * @param modelsRDD The models, one for each unique random effect value + * @param models The models, one for each unique random effect value * @param randomEffectType The random effect type * @param featureShardId The feature shard id */ class RandomEffectModel( - val modelsRDD: RDD[(REId, GeneralizedLinearModel)], + val models: DataFrame, val randomEffectType: REType, val featureShardId: FeatureShardId) extends DatumScoringModel with RDDLike { - override val modelType: TaskType = RandomEffectModel.determineModelType(modelsRDD) + override val modelType: TaskType = RandomEffectModel.determineModelType(models) // // RandomEffectModel functions @@ -49,11 +56,11 @@ class RandomEffectModel( /** * Create a new [[RandomEffectModel]] with new underlying models. 
* - * @param newModelsRdd The new underlying models, one per entity + * @param newModels The new underlying models, one per entity * @return A new [[RandomEffectModel]] */ - def update(newModelsRdd: RDD[(REId, GeneralizedLinearModel)]): RandomEffectModel = - new RandomEffectModel(newModelsRdd, randomEffectType, featureShardId) + def update(newModels: DataFrame): RandomEffectModel = + new RandomEffectModel(newModels, randomEffectType, featureShardId) // // DatumScoringModel functions @@ -63,35 +70,19 @@ class RandomEffectModel( * Compute the score for the dataset. * * @note Use a static method to avoid serializing entire model object during RDD operations. - * @param dataPoints The dataset to score (Note that the Long in the RDD is a unique identifier for the paired + * @param dataset The dataset to score (Note that the Long in the RDD is a unique identifier for the paired * [[GameDatum]] object, referred to in the GAME code as the "unique id") * @return The computed scores */ - override def score(dataPoints: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores = - RandomEffectModel.score( - dataPoints, - modelsRDD, - randomEffectType, - featureShardId, - ModelDataScores.toScore, - ModelDataScores.apply) + override def score(dataset: DataFrame): CoordinateDataScores = { - /** - * Compute the scores for the GAME dataset, and store the scores only. - * - * @note Use a static method to avoid serializing entire model object during RDD operations. 
- * @param dataPoints The dataset to score (Note that the Long in the RDD is a unique identifier for the paired - * [[GameDatum]] object, referred to in the GAME code as the "unique id") - * @return The computed scores - */ - override def scoreForCoordinateDescent(dataPoints: RDD[(UniqueSampleId, GameDatum)]): CoordinateDataScores = - RandomEffectModel.score( - dataPoints, - modelsRDD, + val scores = RandomEffectModel.score( + dataset, + models, randomEffectType, - featureShardId, - CoordinateDataScores.toScore, - CoordinateDataScores.apply) + featureShardId) + new CoordinateDataScores(scores) + } // // Summarizable functions @@ -108,11 +99,11 @@ class RandomEffectModel( stringBuilder.append(s"\nRandom Effect Type: '$randomEffectType'") stringBuilder.append(s"\nFeature Shard ID: '$featureShardId'") - stringBuilder.append(s"\nLength: ${modelsRDD.values.map(_.coefficients.means.length).stats()}") - stringBuilder.append(s"\nMean: ${modelsRDD.values.map(_.coefficients.meansL2Norm).stats()}") - if (modelsRDD.first()._2.coefficients.variancesOption.isDefined) { - stringBuilder.append(s"\nVariance: ${modelsRDD.values.map(_.coefficients.variancesL2NormOption.get).stats()}") - } + //stringBuilder.append(s"\nLength: ${modelsRDD.values.map(_.coefficients.means.length).stats()}") + //stringBuilder.append(s"\nMean: ${modelsRDD.values.map(_.coefficients.meansL2Norm).stats()}") + //if (modelsRDD.first()._2.coefficients.variancesOption.isDefined) { + // stringBuilder.append(s"\nVariance: ${modelsRDD.values.map(_.coefficients.variancesL2NormOption.get).stats()}") + //} stringBuilder.toString() } @@ -126,56 +117,47 @@ class RandomEffectModel( * * @return The Spark context */ - override protected[ml] def sparkContext: SparkContext = modelsRDD.sparkContext + override protected[ml] def sparkContext: SparkContext = SparkSession.builder.getOrCreate.sparkContext - /** - * Assign a given name to [[modelsRDD]]. 
- * - * @note Not used to reference models in the logic of photon-ml, only used for logging currently. - * @param name The parent name for all [[RDD]]s in this class - * @return This object with the name of [[modelsRDD]] assigned - */ override protected[ml] def setName(name: String): RandomEffectModel = { - modelsRDD.setName(name) - this } /** - * Set the storage level of [[modelsRDD]], and persist their values across the cluster the first time they are + * Set the storage level of [[models]], and persist their values across the cluster the first time they are * computed. * * @param storageLevel The storage level - * @return This object with the storage level of [[modelsRDD]] set + * @return This object with the storage level of [[models]] set */ override protected[ml] def persistRDD(storageLevel: StorageLevel): RandomEffectModel = { - if (!modelsRDD.getStorageLevel.isValid) modelsRDD.persist(storageLevel) + models.persist(storageLevel) this } /** - * Mark [[modelsRDD]] as non-persistent, and remove all blocks for them from memory and disk. + * Mark [[models]] as non-persistent, and remove all blocks for them from memory and disk. * - * @return This object with [[modelsRDD]] marked non-persistent + * @return This object with [[models]] marked non-persistent */ override protected[ml] def unpersistRDD(): RandomEffectModel = { - if (modelsRDD.getStorageLevel.isValid) modelsRDD.unpersist() + models.unpersist() this } /** - * Materialize [[modelsRDD]] (Spark [[RDD]]s are lazy evaluated: this method forces them to be evaluated). + * Materialize [[models]] (Spark data are lazy evaluated: this method forces them to be evaluated). 
* - * @return This object with [[modelsRDD]] materialized + * @return This object with [[models]] materialized */ override protected[ml] def materialize(): RandomEffectModel = { - modelsRDD.count() + models.count() this } @@ -194,18 +176,12 @@ class RandomEffectModel( val areTypesEqual = this.randomEffectType == other.randomEffectType val areShardsEqual = this.featureShardId == other.featureShardId lazy val areAllModelsEqual = this - .modelsRDD - .fullOuterJoin(other.modelsRDD) - .mapPartitions { iterator => - - val areModelsEqual = iterator.forall { - case (_, (Some(model1), Some(model2))) => model1.equals(model2) - case _ => false - } - - Iterator.single(areModelsEqual) - } - .fold(true)(_ && _) + .models + .withColumnRenamed(DataConst.COEFFICIENTS, "s1") + .join(other.models.withColumnRenamed(DataConst.COEFFICIENTS, "s2"), col(DataConst.ID), "fullouter") + .filter("s1 is null or s2 is null or s1 != s2") //TODO: add udf to compare two vectors + .head(1) + .isEmpty areTypesEqual && areShardsEqual && areAllModelsEqual @@ -213,6 +189,33 @@ class RandomEffectModel( false } + /** + * Convert models from dataframe to RDD + * @return + */ + def toRDD(): RDD[(REType, GeneralizedLinearModel)] = { + models + .select(randomEffectType, DataConst.MODEL_TYPE, DataConst.COEFFICIENTS) + .rdd + .map { row => + val reid = row.getInt(0).toString + val modelType: TaskType = TaskType.withName(row.getString(1)) + val coefficients = Coefficients(VectorUtils.mlToBreeze(row.getAs[SparkVector](2))) + + val model = modelType match { + case TaskType.LINEAR_REGRESSION => + LinearRegressionModel(coefficients) + case TaskType.LOGISTIC_REGRESSION => + LogisticRegressionModel(coefficients) + case TaskType.POISSON_REGRESSION => + PoissonRegressionModel(coefficients) + case TaskType.SMOOTHED_HINGE_LOSS_LINEAR_SVM => + SmoothedHingeLossLinearSVMModel(coefficients) + } + (reid, model) + } + } + /** * Returns a hash code value for the object. 
* @@ -233,67 +236,44 @@ object RandomEffectModel { * that type - it will be faster for large numbers of random effect models. Note that it may still be a * bottleneck if we check each time a new RandomEffectModel is created. * - * @param modelsRDD The random effect models + * @param models The random effect models * @return The GAME model type */ - protected def determineModelType(modelsRDD: RDD[(REId, GeneralizedLinearModel)]): TaskType = { + protected def determineModelType(models: DataFrame): TaskType = { - val modelTypes = modelsRDD.values.map(_.modelType).distinct().collect() + val modelTypes = models.select(GeneralizedLinearModel.MODEL_TYPE).head(1) require( modelTypes.length == 1, - s"${modelsRDD.name} has multiple model types:\n${modelTypes.mkString(", ")}") + s"models has multiple model types:\n${modelTypes.mkString(", ")}") - modelTypes.head + TaskType.withName(modelTypes(0).getString(0)) } /** * Compute the scores for a dataset, using random effect models. * - * @param dataPoints The dataset to score - * @param modelsRDD The individual random effect models to use for scoring + * @param dataset The dataset to score + * @param models The individual random effect models to use for scoring * @param randomEffectType The random effect type * @param featureShardId The feature shard id * @return The scores */ - private def score[T, V]( - dataPoints: RDD[(UniqueSampleId, GameDatum)], - modelsRDD: RDD[(REId, GeneralizedLinearModel)], + private def score ( + dataset: DataFrame, + models: DataFrame, randomEffectType: REType, - featureShardId: FeatureShardId, - toScore: (GameDatum, Double) => T, - toResult: RDD[(UniqueSampleId, T)] => V): V = { - - val hashPartitioner = new HashPartitioner(dataPoints.getNumPartitions) - - /* - * We perform a replicated partitioned hash join here under the assumption that we can fit the per partition - * random effect models in memory. We first partition both relations using the same partitioner and then zip them. 
- * This ensures that the same keys from both relations go in the same partition. Given above, we can now perform the - * join by doing the following operations per partition: - * 1. Load the random effect models in memory - * 2. Iterate over the data points - * 3. For each data point, look up the corresponding random effect model in the in memory map and score - */ - val scores = dataPoints - .map { case (uniqueId, gameDatum) => - (gameDatum.idTagToValueMap(randomEffectType), (uniqueId, gameDatum)) - } - .partitionBy(hashPartitioner) - .zipPartitions(modelsRDD.partitionBy(hashPartitioner)) { (dataIt, modelIt) => - - val lookupTable = modelIt.toMap + featureShardId: FeatureShardId): DataFrame = { - dataIt.map { case (id, (uid, datum)) => - val score = lookupTable - .get(id) - .map(_.computeScore(datum.featureShardContainer(featureShardId))) - .getOrElse(0.0) + val scores: DataFrame = dataset + .join(models, randomEffectType) + .withColumn(DataConst.SCORE, GeneralizedLinearModel.scoreUdf(col(DataConst.COEFFICIENTS), col(featureShardId))) + .select(Constants.UNIQUE_SAMPLE_ID, DataConst.SCORE) - (uid, toScore(datum, score)) - } - } + scores + } - toResult(scores) + def toDataFrame(input: RDD[(REType, GeneralizedLinearModel)]): DataFrame = { + null } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala index 8f55fbf0..14c9cc1c 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala @@ -15,7 +15,10 @@ package com.linkedin.photon.ml.supervised.model import breeze.linalg.Vector +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector => SparkVector} + import org.apache.spark.rdd.RDD +import org.apache.spark.sql.functions.udf import 
com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.model.Coefficients @@ -165,4 +168,34 @@ object GeneralizedLinearModel { broadcastModel.unpersist() result } + + val MODEL_TYPE = "modelType" + + /** + * A UDF to compute scores given a linear model and a feature vector + * @return The score which is the dot product of model coefficients and features + */ + def scoreUdf = udf({(coefficients: SparkVector, features: SparkVector) => + require( + coefficients.size == features.size, + s"Coefficients.size = ${coefficients.size} and features.size = ${features.size}") + + val score = coefficients match { + case (dCoef: DenseVector) => + val array = dCoef.toArray + var s = 0.0 + features.foreachActive((i, v) => s += v * array(i)) + s + case (sCoef: SparseVector) => + val array = features.toArray + var s = 0.0 + sCoef.foreachActive((i, v) => s += v * array(i)) + s + case _ => throw new UnsupportedOperationException( + s"Coefficients type ${coefficients.getClass} is not supported.") + + } + score + }) + } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala index 9b028901..98376575 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.storage.StorageLevel import org.slf4j.Logger -import com.linkedin.photon.ml.Types.{CoordinateId, UniqueSampleId} +import com.linkedin.photon.ml.Types.CoordinateId import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.evaluation.{EvaluationResults, EvaluationSuite, EvaluatorType} import com.linkedin.photon.ml.model.{DatumScoringModel, GameModel} @@ -318,11 +318,11 @@ object CoordinateDescent { implicit logger: Logger): EvaluationResults = 
Timed("Validate GAME model") { val validatingScores = Timed(s"Compute validation scores") { - modelToEvaluate.scoreForCoordinateDescent(validationData) // TODO: to fix the error + modelToEvaluate.score(validationData) } Timed(s"Compute evaluation metrics") { - val results = evaluationSuite.evaluate(validatingScores.scoresRdd) + val results = evaluationSuite.evaluate(validatingScores.scores) //todo: to fix it results .evaluations @@ -397,7 +397,7 @@ object CoordinateDescent { var previousScores = firstCoordinate.score(firstCoordinateModel) var summedScores: CoordinateDataScores = - CoordinateDataScores(SparkSession.builder().getOrCreate().sparkContext.emptyRDD) + CoordinateDataScores(SparkSession.builder().getOrCreate().emptyDataFrame) val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = mutable.Map(firstCoordinateId -> firstCoordinateModel) val currentScores: mutable.Map[CoordinateId, CoordinateDataScores] = @@ -521,7 +521,7 @@ object CoordinateDescent { var previousScores = firstCoordinate.score(firstCoordinateModel) var summedScores: CoordinateDataScores = - CoordinateDataScores(SparkSession.builder().getOrCreate().sparkContext.emptyRDD) + CoordinateDataScores(SparkSession.builder().getOrCreate().emptyDataFrame) val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = mutable.Map(firstCoordinateId -> firstCoordinateModel) val currentScores: mutable.Map[CoordinateId, CoordinateDataScores] = diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala new file mode 100644 index 00000000..40b4a8c4 --- /dev/null +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala @@ -0,0 +1,23 @@ +/* + * Copyright 2017 LinkedIn Corp. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
You may obtain a + * copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + */ +package com.linkedin.photon.ml.constants + +object DataConst { + val ID = "uniqueId" + val SCORE = "score" + + val MODEL_TYPE = "modelType" + val COEFFICIENTS = "coefficients" +} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala index 311be33a..d6edd2fe 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala @@ -15,20 +15,21 @@ package com.linkedin.photon.ml.data.scoring import org.apache.spark.rdd.RDD -import org.apache.spark.rdd.RDD.rddToPairRDDFunctions +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.functions.{col, udf} -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.constants.MathConst import com.linkedin.photon.ml.data.GameDatum +import com.linkedin.photon.ml.constants.{DataConst, MathConst} /** * The class used to track scored data points throughout training. The score objects are scores only, with no additional * information. * - * @param scoresRdd The scores consist of (unique ID, score) pairs as explained above. + * @param scores The scores dataframe consist of (unique ID, score) pairs as explained above. 
*/ -protected[ml] class CoordinateDataScores(override val scoresRdd: RDD[(UniqueSampleId, Double)]) - extends DataScores[Double, CoordinateDataScores](scoresRdd) { +protected[ml] class CoordinateDataScores(override val scores: DataFrame) + extends DataScores[CoordinateDataScores](scores) { /** * Generic method to combine two [[CoordinateDataScores]] objects. @@ -37,15 +38,16 @@ protected[ml] class CoordinateDataScores(override val scoresRdd: RDD[(UniqueSamp * @param that The [[CoordinateDataScores]] instance to merge with this instance * @return A merged [[CoordinateDataScores]] */ - private def joinAndApply(op: (Double, Double) => Double, that: CoordinateDataScores): CoordinateDataScores = - // Use fullOuterJoin: it's possible for some data to not be scored by a model + private def joinAndApply(op: UserDefinedFunction, that: CoordinateDataScores): CoordinateDataScores = new CoordinateDataScores( this - .scoresRdd - .fullOuterJoin(that.scoresRdd) - .mapValues { case (thisScoreOpt, thatScoreOpt) => - op(thisScoreOpt.getOrElse(MathConst.DEFAULT_SCORE), thatScoreOpt.getOrElse(MathConst.DEFAULT_SCORE)) - }) + .scores + .withColumnRenamed(DataConst.SCORE, "s1") + // use fullOuterJoin: it's possible for some data to not be scored by a model + .join(that.scores.withColumnRenamed(DataConst.SCORE, "s2"), col(DataConst.ID), "fullouter") + .withColumn("newScore", op(col("s1"), col("s2"))) + .select(DataConst.ID, "newScore") + .withColumnRenamed("newScore", DataConst.SCORE)) /** * The addition operation for [[CoordinateDataScores]]. 
@@ -54,7 +56,21 @@ protected[ml] class CoordinateDataScores(override val scoresRdd: RDD[(UniqueSamp * @param that The [[CoordinateDataScores]] instance to add to this instance * @return A new [[CoordinateDataScores]] instance encapsulating the accumulated values */ - override def +(that: CoordinateDataScores): CoordinateDataScores = joinAndApply((a, b) => a + b, that) + override def +(that: CoordinateDataScores): CoordinateDataScores = { + + val op = udf((a1: Double, a2: Double) => { + val s1 = Option(a1) match { + case Some(v) => v + case _ => MathConst.DEFAULT_SCORE + } + val s2 = Option(a2) match { + case Some(v) => v + case _ => MathConst.DEFAULT_SCORE + } + s1 + s2 + }) + joinAndApply(op, that) + } /** * The minus operation for [[CoordinateDataScores]]. @@ -63,7 +79,22 @@ protected[ml] class CoordinateDataScores(override val scoresRdd: RDD[(UniqueSamp * @param that The [[CoordinateDataScores]] instance to subtract from this instance * @return A new [[CoordinateDataScores]] instance encapsulating the subtracted values */ - override def -(that: CoordinateDataScores): CoordinateDataScores = joinAndApply((a, b) => a - b, that) + override def -(that: CoordinateDataScores): CoordinateDataScores = { + + val op = udf((a1: Double, a2: Double) => { + val s1 = Option(a1) match { + case Some(v) => v + case _ => MathConst.DEFAULT_SCORE + } + val s2 = Option(a2) match { + case Some(v) => v + case _ => MathConst.DEFAULT_SCORE + } + + s1 - s2 + }) + joinAndApply(op, that) + } /** * Method used to define equality on multiple class levels while conforming to equality contract. Defines under @@ -83,7 +114,7 @@ object CoordinateDataScores { * @param scores The scores, consisting of (unique ID, score) pairs. 
* @return A new [[CoordinateDataScores]] object */ - def apply(scores: RDD[(UniqueSampleId, Double)]): CoordinateDataScores = new CoordinateDataScores(scores) + def apply(scores: DataFrame): CoordinateDataScores = new CoordinateDataScores(scores) /** * Convert a [[GameDatum]] and a raw score into a score object. For [[CoordinateDataScores]] this is the raw score. diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala index eb140632..ac3d4d9e 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala @@ -14,24 +14,23 @@ */ package com.linkedin.photon.ml.data.scoring -import scala.reflect.ClassTag - import org.apache.spark.SparkContext -import org.apache.spark.rdd.RDD -import org.apache.spark.rdd.RDD.rddToPairRDDFunctions +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.functions.col import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Types.UniqueSampleId import com.linkedin.photon.ml.spark.RDDLike +import com.linkedin.photon.ml.constants.DataConst /** - * A base class for tracking scored data points, where the scores are stored in an [[RDD]] which associates the unique + * A base class for tracking scored data points, where the scores are stored in an [[DataFrame]] + * which associates the unique * ID of a data point with a score object. 
* - * @param scoresRdd Data point scores, as described above + * @param scores Data point scores, as described above */ -abstract protected[ml] class DataScores[T : ClassTag, D <: DataScores[T, D]]( - val scoresRdd: RDD[(UniqueSampleId, T)]) +abstract protected[ml] class DataScores[D <: DataScores[D]]( + val scores: DataFrame) extends RDDLike { /** @@ -57,54 +56,47 @@ abstract protected[ml] class DataScores[T : ClassTag, D <: DataScores[T, D]]( * * @return The Spark context */ - override def sparkContext: SparkContext = scoresRdd.sparkContext - - /** - * Set the name of [[scoresRdd]]. - * - * @param name The parent name for all [[RDD]]s in this class - * @return This object with the name of [[scoresRdd]] assigned - */ - override def setName(name: String): RDDLike = { + override def sparkContext: SparkContext = SparkSession.builder.getOrCreate.sparkContext - scoresRdd.setName(name) + /* RDDLike methods */ + override def setName(name: String): RDDLike = { this } /** - * Set the storage level of [[scoresRdd]]. + * Set the storage level of [[scores]]. * * @param storageLevel The storage level - * @return This object with the storage level of [[scoresRdd]] set + * @return This object with the storage level of [[scores]] set */ override def persistRDD(storageLevel: StorageLevel): RDDLike = { - if (!scoresRdd.getStorageLevel.isValid) scoresRdd.persist(storageLevel) + scores.persist(storageLevel) this } /** - * Mark [[scoresRdd]] as non-persistent, and remove all blocks for them from memory and disk. + * Mark [[scores]] as non-persistent, and remove all blocks for them from memory and disk. * - * @return This object with [[scoresRdd]] marked non-persistent + * @return This object with [[scores]] marked non-persistent */ override def unpersistRDD(): RDDLike = { - if (scoresRdd.getStorageLevel.isValid) scoresRdd.unpersist() + scores.unpersist() this } /** - * Materialize [[scoresRdd]] (Spark [[RDD]]s are lazy evaluated: this method forces them to be evaluated). 
+ * Materialize [[scores]] (Spark data are lazy evaluated: this method forces them to be evaluated). * - * @return This object with [[scoresRdd]] materialized + * @return This object with [[scores]] materialized */ override def materialize(): RDDLike = { - scoresRdd.count() + scores.count() this } @@ -116,7 +108,7 @@ abstract protected[ml] class DataScores[T : ClassTag, D <: DataScores[T, D]]( * @param other Some other object * @return Whether this object can equal the other object */ - def canEqual(other: Any): Boolean = other.isInstanceOf[DataScores[T, D]] + def canEqual(other: Any): Boolean = other.isInstanceOf[DataScores[D]] /** * Compare two [[DataScores]]s objects. @@ -126,22 +118,16 @@ abstract protected[ml] class DataScores[T : ClassTag, D <: DataScores[T, D]]( */ override def equals(other: Any): Boolean = other match { - case that: DataScores[T, D] => + case that: DataScores[D] => val canEqual = this.canEqual(that) lazy val areEqual = this - .scoresRdd - .fullOuterJoin(that.scoresRdd) - .mapPartitions { iterator => - - val areScoresEqual = iterator.forall { - case (_, (Some(thisScore), Some(thatScore))) => thisScore.equals(thatScore) - case _ => false - } - - Iterator.single(areScoresEqual) - } - .fold(true)(_ && _) + .scores + .withColumnRenamed(DataConst.SCORE, "s1") + .join(that.scores.withColumnRenamed(DataConst.SCORE, "s2"), col(DataConst.ID), "fullouter") + .filter("s1 is null or s2 is null or s1 != s2") + .head(1) + .isEmpty canEqual && areEqual @@ -154,5 +140,6 @@ abstract protected[ml] class DataScores[T : ClassTag, D <: DataScores[T, D]]( * * @return An [[Int]] hash code */ - override def hashCode: Int = scoresRdd.hashCode() + override def hashCode: Int = scores.hashCode() + } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/ModelDataScores.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/ModelDataScores.scala deleted file mode 100644 index 798770e6..00000000 --- 
a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/ModelDataScores.scala +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data.scoring - -import org.apache.spark.rdd.RDD -import org.apache.spark.rdd.RDD.rddToPairRDDFunctions - -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.constants.MathConst -import com.linkedin.photon.ml.data.GameDatum - -/** - * The class used to track scored data points throughout scoring and validation. The score objects are - * [[ScoredGameDatum]], full data points with score information. - * - * @param scoresRdd Data point scores, as described above - */ -class ModelDataScores(override val scoresRdd: RDD[(UniqueSampleId, ScoredGameDatum)]) - extends DataScores[ScoredGameDatum, ModelDataScores](scoresRdd) { - - /** - * Generic method to combine two [[ModelDataScores]] objects. 
- * - * @param op The operator to combine two [[ModelDataScores]] - * @param that The [[ModelDataScores]] instance to merge with this instance - * @return A merged [[ModelDataScores]] - */ - private def joinAndApply( - op: (ScoredGameDatum, ScoredGameDatum) => ScoredGameDatum, - that: ModelDataScores): ModelDataScores = - // Use fullOuterJoin: it's possible for some data to not be scored by a model - new ModelDataScores( - this - .scoresRdd - .fullOuterJoin(that.scoresRdd) - .mapValues { case (thisScoreOpt, thatScoreOpt) => - // Currently acceptable to drop op if one value is missing, since the currently existing operations are - // commutative and the default value is the 0 value - (thisScoreOpt, thatScoreOpt) match { - case (Some(thisScore), Some(thatScore)) => op(thisScore, thatScore) - case (Some(thisScore), None) => op(thisScore, thisScore.copy(score = MathConst.DEFAULT_SCORE)) - case (None, Some(thatScore)) => op(thatScore.copy(score = MathConst.DEFAULT_SCORE), thatScore) - } - }) - - /** - * The addition operation for [[ModelDataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[ModelDataScores]] instance to add to this instance - * @return A new [[ModelDataScores]] instance encapsulating the accumulated values - */ - override def +(that: ModelDataScores): ModelDataScores = - joinAndApply((a, b) => a.copy(score = a.score + b.score), that) - - /** - * The minus operation for [[ModelDataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[ModelDataScores]] instance to subtract from this instance - * @return A new [[ModelDataScores]] instance encapsulating the subtracted values - */ - override def -(that: ModelDataScores): ModelDataScores = - joinAndApply((a, b) => a.copy(score = a.score - b.score), that) - - /** - * Method used to define equality on multiple class levels while conforming to equality contract. Defines under - * what circumstances this class can equal another class. 
- * - * @param other Some other object - * @return Whether this object can equal the other object - */ - override def canEqual(other: Any): Boolean = other.isInstanceOf[ModelDataScores] -} - -object ModelDataScores { - - /** - * A factory method to create a [[ModelDataScores]] object from an [[RDD]] of scores. - * - * @param scores The scores, consisting of (unique ID, scored datum) pairs. - * @return A new [[ModelDataScores]] object - */ - def apply(scores: RDD[(Long, ScoredGameDatum)]): ModelDataScores = new ModelDataScores(scores) - - /** - * Convert a [[GameDatum]] and a raw score into a score object. For [[CoordinateDataScores]] this is the raw score. - * - * @param datum The datum which was scored - * @param score The raw score for the datum - * @return The score object - */ - protected[ml] def toScore(datum: GameDatum, score: Double): ScoredGameDatum = datum.toScoredGameDatum(score) -} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/Coefficients.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/Coefficients.scala index 38dc40c1..07ab0516 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/Coefficients.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/Coefficients.scala @@ -14,7 +14,7 @@ */ package com.linkedin.photon.ml.model -import breeze.linalg.{DenseVector, SparseVector, Vector, norm} +import breeze.linalg.{Vector, norm} import breeze.stats.meanAndVariance import com.linkedin.photon.ml.constants.MathConst diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala index 5b6aff04..4fe2650c 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala @@ -14,12 +14,10 @@ */ package com.linkedin.photon.ml.model -import org.apache.spark.rdd.RDD +import 
org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores, ModelDataScores} +import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores} import com.linkedin.photon.ml.util.Summarizable /** @@ -39,14 +37,6 @@ trait DatumScoringModel extends Summarizable { * @param dataPoints The dataset to score * @return The computed scores */ - def score(dataPoints: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores + def score(dataPoints: DataFrame): CoordinateDataScores - /** - * Compute the scores for the GAME dataset, and store the scores only. - * - * @note "score" = features * coefficients (Before link function in the case of logistic regression, for example) - * @param dataPoints The dataset to score - * @return The computed scores - */ - protected[ml] def scoreForCoordinateDescent(dataPoints: RDD[(UniqueSampleId, GameDatum)]): CoordinateDataScores } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala index 417ba0e2..f2d38d1c 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala @@ -16,12 +16,12 @@ package com.linkedin.photon.ml.model import scala.collection.SortedMap -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.{CoordinateId, UniqueSampleId} +import com.linkedin.photon.ml.Types.CoordinateId import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores, ModelDataScores} +import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.util.ClassUtils /** @@ -96,12 +96,9 @@ class GameModel 
(private val gameModels: Map[CoordinateId, DatumScoringModel]) e * [[GameDatum]] object, referred to in the GAME code as the "unique id") * @return The computed scores */ - override def score(dataPoints: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores = + override def score(dataPoints: DataFrame): CoordinateDataScores = gameModels.values.map(_.score(dataPoints)).reduce(_ + _) - override protected[ml] def scoreForCoordinateDescent(dataPoints: RDD[(UniqueSampleId, GameDatum)]): CoordinateDataScores = - gameModels.values.map(_.scoreForCoordinateDescent(dataPoints)).reduce(_ + _) - /** * Summarize this GAME model. * @@ -137,6 +134,7 @@ class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) e * @return An [[Int]] hash code */ override def hashCode(): Int = super.hashCode() + } object GameModel { From 26c62105718baacf3054af2dab851a70d5c6deda Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Wed, 12 Feb 2020 10:29:07 -0800 Subject: [PATCH 06/11] reset the changes in README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index be4eadac..d9b9a4cc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # Photon Machine Learning (Photon ML) - [![Build Status](https://travis-ci.org/linkedin/photon-ml.svg?branch=master)](https://travis-ci.org/linkedin/photon-ml) **Check out our [hands-on tutorial](https://github.com/linkedin/photon-ml/wiki/Photon-ML-Tutorial).** From 05b3e01803d1819e1deeb3ffde4bc85b4451da33 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Tue, 18 Feb 2020 10:19:03 -0800 Subject: [PATCH 07/11] Address Alex's comments --- .../photon/ml/SparkSessionConfiguration.scala | 3 +- .../ml/algorithm/FixedEffectCoordinate.scala | 103 ++++----- .../FixedEffectModelCoordinate.scala | 5 +- .../ml/algorithm/RandomEffectCoordinate.scala | 182 +++++++-------- .../RandomEffectModelCoordinate.scala | 5 +- .../photon/ml/data/GameConverters.scala | 168 -------------- .../photon/ml/data/LocalDataset.scala | 37 ---- 
.../data/RandomEffectDatasetPartitioner.scala | 171 -------------- .../photon/ml/estimators/GameEstimator.scala | 6 +- .../photon/ml/model/FixedEffectModel.scala | 26 +-- .../photon/ml/model/RandomEffectModel.scala | 90 ++------ .../RandomEffectOptimizationProblem.scala | 7 +- .../model/GeneralizedLinearModel.scala | 62 ++---- .../ml/transformers/GameTransformer.scala | 3 +- .../linkedin/photon/ml/util/ApiUtils.scala | 33 +++ .../com/linkedin/photon/ml/Constants.scala | 4 - .../cli/game/scoring/GameScoringDriver.scala | 1 - .../game/training/GameTrainingDriver.scala | 16 +- .../com/linkedin/photon/ml/util/Utils.scala | 23 +- .../photon/ml/algorithm/Coordinate.scala | 67 ++---- .../ml/algorithm/CoordinateDescent.scala | 209 ++++++------------ .../photon/ml/algorithm/ModelCoordinate.scala | 6 +- .../data/scoring/CoordinateDataScores.scala | 127 ----------- .../photon/ml/data/scoring/DataScores.scala | 145 ------------ .../ml/evaluation/EvaluationSuite.scala | 3 +- .../photon/ml/model/DatumScoringModel.scala | 3 +- .../linkedin/photon/ml/model/GameModel.scala | 7 +- .../photon/ml/sampling/DownSampler.scala | 1 - 28 files changed, 329 insertions(+), 1184 deletions(-) delete mode 100644 photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala delete mode 100644 photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala delete mode 100644 photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDatasetPartitioner.scala create mode 100644 photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala rename {photon-api => photon-client}/src/main/scala/com/linkedin/photon/ml/Constants.scala (94%) rename {photon-api => photon-client}/src/main/scala/com/linkedin/photon/ml/util/Utils.scala (94%) delete mode 100644 photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala delete mode 100644 photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala diff --git 
a/photon-api/src/main/scala/com/linkedin/photon/ml/SparkSessionConfiguration.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/SparkSessionConfiguration.scala index f1b3df63..e6556eb3 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/SparkSessionConfiguration.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/SparkSessionConfiguration.scala @@ -21,7 +21,7 @@ import org.apache.spark.serializer.KryoSerializer import org.apache.spark.sql.SparkSession import org.apache.spark.SparkConf -import com.linkedin.photon.ml.data.{GameDatum, LabeledPoint, LocalDataset} +import com.linkedin.photon.ml.data.{GameDatum, LabeledPoint} import com.linkedin.photon.ml.function._ import com.linkedin.photon.ml.function.glm.{HessianVectorAggregator, ValueAndGradientAggregator} import com.linkedin.photon.ml.model.Coefficients @@ -57,7 +57,6 @@ object SparkSessionConfiguration { classOf[LabeledPoint], classOf[LBFGS], classOf[LinearRegressionModel], - classOf[LocalDataset], classOf[LogisticRegressionModel], classOf[Matrix[Double]], classOf[NormalizationContext], diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala index 0310bfa5..7c505279 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala @@ -15,50 +15,41 @@ package com.linkedin.photon.ml.algorithm import org.apache.spark.ml.linalg.{Vector => SparkVector} -import org.apache.spark.sql.functions.{col, lit} -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.col import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.Types.FeatureShardId -import com.linkedin.photon.ml.constants.DataConst import 
com.linkedin.photon.ml.data._ -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.function.DistributedObjectiveFunction import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} import com.linkedin.photon.ml.optimization.{DistributedOptimizationProblem, FixedEffectOptimizationTracker, OptimizationTracker} -import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel -import com.linkedin.photon.ml.util.VectorUtils +import com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} /** * The optimization problem coordinate for a fixed effect model. * * @tparam Objective The type of objective function used to solve the fixed effect optimization problem - * @param rawData The raw training data + * @param dataset The raw training data * @param optimizationProblem The fixed effect optimization problem * @param inputColumnsNames */ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunction]( - rawData: DataFrame, + var dataset: DataFrame, optimizationProblem: DistributedOptimizationProblem[Objective], featureShardId: FeatureShardId, inputColumnsNames: InputColumnsNames) extends Coordinate { - var dataset: DataFrame = - rawData - .select(Constants.UNIQUE_SAMPLE_ID, featureShardId, inputColumnsNames(InputColumnsNames.RESPONSE)) - .withColumn(inputColumnsNames(InputColumnsNames.OFFSET), lit(0.0)) - - - override protected def updateDataset(scores: CoordinateDataScores) = { - dataset = scores.scores - .join(rawData, Constants.UNIQUE_SAMPLE_ID) - .withColumn(inputColumnsNames(InputColumnsNames.OFFSET), - col(inputColumnsNames(InputColumnsNames.OFFSET)) + col(DataConst.SCORE)) + override protected def updateOffset(model: DatumScoringModel) = { + model match { + case fixedEffectModel: FixedEffectModel => + dataset = FixedEffectCoordinate.updateOffset(dataset, fixedEffectModel, featureShardId, inputColumnsNames) + case _ => + throw new UnsupportedOperationException(s"Unsupported model type: 
${model.modelType}") + } } - /** * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as * a starting point. @@ -81,23 +72,6 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct } - /** - * Compute scores for the coordinate dataset using the given model. - * - * @param model The input model - * @return The dataset scores - */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = model match { - - case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.score(dataset, fixedEffectModel, featureShardId) - - case _ => - throw new UnsupportedOperationException( - s"Scoring with model of type ${model.getClass} in ${this.getClass} is not supported") - } - - /** * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. * @@ -105,10 +79,13 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct */ override protected def trainModel(): (DatumScoringModel, OptimizationTracker) = FixedEffectCoordinate.trainModel(dataset, optimizationProblem, featureShardId, None) + } object FixedEffectCoordinate { + def SCORE_FIELD = "fixed_score" + /** * Train a new [[FixedEffectModel]] (i.e. run model optimization). 
* @@ -136,30 +113,44 @@ object FixedEffectCoordinate { rdd.persist(StorageLevel.MEMORY_ONLY) val (glm, stateTracker) = initialFixedEffectModelOpt - .map { initialFixedEffectModel => + .map ( initialFixedEffectModel => optimizationProblem.runWithSampling(rdd, initialFixedEffectModel.model) - } + ) .getOrElse(optimizationProblem.runWithSampling(rdd)) rdd.unpersist() - (new FixedEffectModel(SparkSession.builder.getOrCreate.sparkContext.broadcast(glm), featureShardId), - new FixedEffectOptimizationTracker(stateTracker)) + (FixedEffectModel(glm, featureShardId), new FixedEffectOptimizationTracker(stateTracker)) } - /** - * Compute scores given a training dataset and a fixed effect model - * - * @param dataset The dataset to score - * @param fixedEffectModel The model used to score the dataset - * @param featureShardId The ID of the feature shard for the training data - * @return The computed scores - */ - def score(dataset: DataFrame, fixedEffectModel: FixedEffectModel, featureShardId: FeatureShardId): CoordinateDataScores = { - val cofs = VectorUtils.breezeToMl(fixedEffectModel.model.coefficients.means) - val scores = dataset - .withColumn(DataConst.SCORE, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) - .select(Constants.UNIQUE_SAMPLE_ID, DataConst.SCORE) - new CoordinateDataScores(scores) + def updateOffset( + dataset: DataFrame, fixedEffectModel: FixedEffectModel, featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames): DataFrame = { + + require( + featureShardId == fixedEffectModel.featureShardId, + s"Fixed effect coordinate featureShardId ${featureShardId} != model.featureShardId ${ + fixedEffectModel + .featureShardId + }") + + val offset = inputColumnsNames(InputColumnsNames.OFFSET) + val hasOffsetField = ApiUtils.hasColumn(dataset, offset) + val hasCoordinateScoreField = ApiUtils.hasColumn(dataset, SCORE_FIELD) + + if (hasOffsetField && hasCoordinateScoreField) { + // offset = offset - old_coordinateScore + new_coordinateScore 
+ dataset.withColumn(offset, col(offset) - col(SCORE_FIELD)) + fixedEffectModel.computeScore(dataset, SCORE_FIELD) + .withColumn(offset, col(offset) + col(SCORE_FIELD)) + } else if (!hasOffsetField && !hasCoordinateScoreField) { + fixedEffectModel.computeScore(dataset, SCORE_FIELD) + .withColumn(offset, col(SCORE_FIELD)) + } else if (hasOffsetField && !hasCoordinateScoreField) { + fixedEffectModel.computeScore(dataset, SCORE_FIELD) + .withColumn(offset, col(offset) + col(SCORE_FIELD)) + } else { + throw new UnsupportedOperationException("It shouldn't happen!") + } } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala index 9a78b2e7..7aab7369 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala @@ -17,7 +17,6 @@ package com.linkedin.photon.ml.algorithm import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.Types.FeatureShardId -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} /** @@ -34,10 +33,10 @@ class FixedEffectModelCoordinate(dataset: DataFrame, featureShardId: FeatureShar * @param model The input model * @return The output scores */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = { + override protected def updateOffset(model: DatumScoringModel) = { model match { case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.score(dataset, fixedEffectModel, featureShardId) + FixedEffectCoordinate.updateOffset(dataset, fixedEffectModel, featureShardId) case _ => throw new UnsupportedOperationException( diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala 
b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala index fc977515..bf88d3f3 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala @@ -16,27 +16,22 @@ package com.linkedin.photon.ml.algorithm import scala.collection.mutable -import org.apache.spark.SparkContext import org.apache.spark.ml.linalg.{Vector => SparkVector} -import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.col import org.apache.spark.sql.{DataFrame, functions} import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.Types.{FeatureShardId, REType} -import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.function.SingleNodeObjectiveFunction import com.linkedin.photon.ml.model.{Coefficients, DatumScoringModel, RandomEffectModel} import com.linkedin.photon.ml.normalization.NormalizationContext import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType import com.linkedin.photon.ml.optimization._ import com.linkedin.photon.ml.optimization.game.{RandomEffectOptimizationConfiguration, RandomEffectOptimizationProblem} -import com.linkedin.photon.ml.spark.RDDLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel -import com.linkedin.photon.ml.util.VectorUtils +import com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} + /** * The optimization problem coordinate for a random effect model. 
* @@ -48,20 +43,22 @@ import com.linkedin.photon.ml.util.VectorUtils */ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunction]( rEType: REType, - rawData: DataFrame, + var rawData: DataFrame, optimizationProblem: RandomEffectOptimizationProblem[Objective], featureShardId: FeatureShardId, inputColumnsNames: InputColumnsNames) - extends Coordinate - with RDDLike { + extends Coordinate { /* Get the training data from raw data */ - var dataset: DataFrame = { - val label = inputColumnsNames(InputColumnsNames.RESPONSE) + var dataset: DataFrame = null + + protected def updateDataset(): Unit = { + + val label = inputColumnsNames(InputColumnsNames.RESPONSE) val offset = inputColumnsNames(InputColumnsNames.OFFSET) val weight = inputColumnsNames(InputColumnsNames.WEIGHT) - rawData + dataset = rawData .select(rEType, featureShardId, label, offset, weight) .groupBy(rEType) .agg( @@ -74,11 +71,19 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct // // Coordinate functions // - override protected def updateDataset(scores: CoordinateDataScores) = { - dataset = scores.scores - .join(rawData, Constants.UNIQUE_SAMPLE_ID) - .withColumn(inputColumnsNames(InputColumnsNames.OFFSET), - col(inputColumnsNames(InputColumnsNames.OFFSET)) + col(DataConst.SCORE)) + override protected def updateOffset(model: DatumScoringModel) = { + + model match { + case randomEffectModel: RandomEffectModel => + rawData = RandomEffectCoordinate.updateOffset( + rawData, randomEffectModel, featureShardId, + rEType, inputColumnsNames) + + updateDataset() + + case _ => + throw new UnsupportedOperationException(s"Unsupported model type: ${model.modelType}") + } } /** @@ -89,7 +94,11 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct * @return A (updated model, optional optimization tracking information) tuple */ override protected[algorithm] def trainModel( - model: DatumScoringModel): (DatumScoringModel, 
OptimizationTracker) = + model: DatumScoringModel): (DatumScoringModel, OptimizationTracker) = { + + if (dataset == null) { + updateDataset() + } model match { case randomEffectModel: RandomEffectModel => @@ -105,6 +114,7 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct throw new UnsupportedOperationException( s"Updating model of type ${model.getClass} in ${this.getClass} is not supported") } + } /** * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. @@ -112,6 +122,9 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct * @return A (updated model, optimization state tracking information) tuple */ override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) = { + if (dataset == null) { + updateDataset() + } val (newModel, optimizationTracker) = RandomEffectCoordinate.trainModel( dataset, @@ -123,84 +136,6 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct (newModel, optimizationTracker) } - /** - * Compute scores for the coordinate data using a given model. - * - * @param model The input model - * @return The dataset scores - */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = model match { - case randomEffectModel: RandomEffectModel => - RandomEffectCoordinate.score(dataset, randomEffectModel) - - case _ => - throw new UnsupportedOperationException( - s"Scoring with model of type ${model.getClass} in ${this.getClass} is not supported") - } - - // - // RDDLike Functions - // - - /** - * Get the Spark context. - * - * @return The Spark context - */ - override def sparkContext: SparkContext = optimizationProblem.sparkContext - - /** - * Assign a given name to the [[optimizationProblem]] [[RDD]]. 
- * - * @param name The parent name for all [[RDD]] objects in this class - * @return This object with the name of the [[optimizationProblem]] [[RDD]] assigned - */ - override def setName(name: String): RandomEffectCoordinate[Objective] = { - - optimizationProblem.setName(name) - - this - } - - /** - * Set the persistence storage level of the [[optimizationProblem]] [[RDD]]. - * - * @param storageLevel The storage level - * @return This object with the storage level of the [[optimizationProblem]] [[RDD]] set - */ - override def persistRDD(storageLevel: StorageLevel): RandomEffectCoordinate[Objective] = { - - optimizationProblem.persistRDD(storageLevel) - - this - } - - /** - * Mark the [[optimizationProblem]] [[RDD]] as unused, and asynchronously remove all blocks for it from memory and - * disk. - * - * @return This object with the [[optimizationProblem]] [[RDD]] unpersisted - */ - override def unpersistRDD(): RandomEffectCoordinate[Objective] = { - - optimizationProblem.unpersistRDD() - - this - } - - /** - * Materialize the [[optimizationProblem]] [[RDD]] (Spark [[RDD]]s are lazy evaluated: this method forces them to be - * evaluated). 
- * - * @return This object with the [[optimizationProblem]] [[RDD]] materialized - */ - override def materialize(): RandomEffectCoordinate[Objective] = { - - optimizationProblem.materialize() - - this - } - } object RandomEffectCoordinate { @@ -290,7 +225,7 @@ object RandomEffectCoordinate { result += LabeledPoint(lIter.next(), VectorUtils.mlToBreeze(fIter.next()), oIter.next(), wIter.next()) } - (reid, LocalDataset(result.toArray)) + (reid, result.toArray) } // TODO: remove pre-REID optimization problems @@ -305,7 +240,7 @@ object RandomEffectCoordinate { .leftOuterJoin(dataAndOptimizationProblems) .mapValues { case (localModel, Some((localDataset, optimizationProblem))) => - val trainingLabeledPoints = localDataset.dataPoints + val trainingLabeledPoints = localDataset val (updatedModel, stateTrackers) = optimizationProblem.run(trainingLabeledPoints, localModel) (updatedModel, Some(stateTrackers)) @@ -323,7 +258,7 @@ object RandomEffectCoordinate { .getOrElse { val modelsAndTrackers = dataAndOptimizationProblems .mapValues { case (localDataset, optimizationProblem) => - val trainingLabeledPoints = localDataset.dataPoints + val trainingLabeledPoints = localDataset optimizationProblem.run(trainingLabeledPoints) } modelsAndTrackers.persist(StorageLevel.MEMORY_AND_DISK_SER) @@ -341,6 +276,9 @@ object RandomEffectCoordinate { (newRandomEffectModel, randomEffectOptimizationTracker) } + def getScoreFieldName(rEType: REType): String = { + s"${rEType}_score" + } /** * Score a dataset using a given [[RandomEffectModel]]. @@ -350,13 +288,47 @@ object RandomEffectCoordinate { * * @note The score is the raw dot product of the model coefficients and the feature values - it does not go through a * non-linear link function.
- * @param randomEffectDataset The data set to score + * @param dataset The data set to score * @param randomEffectModel The [[RandomEffectModel]] with which to score * @return The computed scores */ - protected[algorithm] def score( - randomEffectDataset: DataFrame, - randomEffectModel: RandomEffectModel): CoordinateDataScores = { - randomEffectModel.score(randomEffectDataset) + def updateOffset( + dataset: DataFrame, randomEffectModel: RandomEffectModel, featureShardId: FeatureShardId, + rEType: REType, + inputColumnsNames: InputColumnsNames): DataFrame = { + + require( + featureShardId == randomEffectModel.featureShardId, + s"Random effect coordinate featureShardId ${featureShardId} != model.featureShardId ${ + randomEffectModel + .featureShardId + }") + + require( + rEType == randomEffectModel.randomEffectType, + s"Random effect coordinate randomEffectType ${rEType} != model.randomEffectType ${ + randomEffectModel + .randomEffectType + }") + + val scoreField = getScoreFieldName(rEType) + val offset = inputColumnsNames(InputColumnsNames.OFFSET) + val hasOffsetField = ApiUtils.hasColumn(dataset, offset) + val hasCoordinateScoreField = ApiUtils.hasColumn(dataset, scoreField) + + if (hasOffsetField && hasCoordinateScoreField) { + // offset = offset - old_coordinateScore + new_coordinateScore + val datasetWithoutOldScore = dataset.withColumn(offset, col(offset) - col(scoreField)) + randomEffectModel.computeScore(datasetWithoutOldScore, scoreField) + .withColumn(offset, col(offset) + col(scoreField)) + } else if (!hasOffsetField && !hasCoordinateScoreField) { + randomEffectModel.computeScore(dataset, scoreField) + .withColumn(offset, col(scoreField)) + } else if (hasOffsetField && !hasCoordinateScoreField) { + randomEffectModel.computeScore(dataset, scoreField) + .withColumn(offset, col(offset) + col(scoreField)) + } else { + throw new UnsupportedOperationException("It shouldn't happen!") + } } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala
b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala index b30bf030..ec09da8e 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala @@ -15,7 +15,6 @@ package com.linkedin.photon.ml.algorithm import org.apache.spark.sql.DataFrame -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.{DatumScoringModel, RandomEffectModel} /** @@ -32,10 +31,10 @@ class RandomEffectModelCoordinate(dataset: DataFrame) * @param model The input model * @return The output scores */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores = { + override protected def updateOffset(model: DatumScoringModel) = { model match { case randomEffectModel: RandomEffectModel => - RandomEffectCoordinate.score(dataset, randomEffectModel) + RandomEffectCoordinate.updateOffset(dataset, randomEffectModel) case _ => throw new UnsupportedOperationException( diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala deleted file mode 100644 index d27c0030..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/GameConverters.scala +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.ml.linalg.SparseVector -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.functions.monotonically_increasing_id - -import com.linkedin.photon.ml.Constants -import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} -import com.linkedin.photon.ml.util.VectorUtils - -/** - * A collection of utility functions for converting to and from GAME datasets. - */ -object GameConverters { - - /** - * Converts a [[DataFrame]] into an [[RDD]] of type [[GameDatum]]. - * - * @note We "decode" the map of column names into an Array[String] which we broadcast for performance. The - * "inputColumnNames" contains the user-specified custom names of columns required by GAME, with default names - * for the unspecified columns. - * @param data The source [[DataFrame]] - * @param featureShards A set of feature shard ids - * @param idTagSet The set of columns/metadata fields expected for each [[Row]] in the [[DataFrame]] - * @param isResponseRequired Whether a response column is mandatory. For example: [[GameDatum]] used for training - * require a response for each [[Row]]; [[GameDatum]] used for scoring do not. 
- * @param inputColumnsNames User-supplied input column names to read the input data - * @return An [[RDD]] of type [[GameDatum]] - */ - protected[ml] def getGameDatasetFromDataFrame( - data: DataFrame, - featureShards: Set[FeatureShardId], - idTagSet: Set[String], - isResponseRequired: Boolean, - inputColumnsNames: InputColumnsNames = InputColumnsNames()): DataFrame = { - - val colNamesSet = inputColumnsNames.getNames - - // Cannot use response, offset, weight, or uid fields as fields for grouping random effects or queries - require( - idTagSet.intersect(colNamesSet).isEmpty, - s"Cannot use required columns (${colNamesSet.mkString(", ")}) for random effect/validation grouping.") - - data.withColumn(Constants.UNIQUE_SAMPLE_ID, monotonically_increasing_id) - } - - /** - * Build a [[GameDatum]] from a [[DataFrame]] [[Row]]. - * - * @param row The source [[DataFrame]] [[Row]] (must contain [[SparseVector]] instances) - * @param featureShards A set of feature shard ids - * @param idTagSet The set of columns/metadata fields expected for the [[Row]] - * @param isResponseRequired Whether a response column is mandatory. For example: [[GameDatum]] used for training - * require a response for the [[Row]]; [[GameDatum]] used for scoring do not. 
- * @param columnsBroadcast The names of the columns to look for in the input rows, in order - * @return A [[GameDatum]] - */ - protected[data] def getGameDatumFromRow( - row: Row, - featureShards: Set[String], - idTagSet: Set[String], - isResponseRequired: Boolean, - columnsBroadcast: Broadcast[InputColumnsNames]): GameDatum = { - - val columns = columnsBroadcast.value - - val featureShardContainer = featureShards.map { shardId => - val features = row.getAs[SparseVector](shardId) - (shardId, VectorUtils.mlToBreeze(features)) - }.toMap - - val response = if (isResponseRequired) { - row.getAs[Number](columns(InputColumnsNames.RESPONSE)).doubleValue - } else { - if (row.schema.fieldNames.contains(columns(InputColumnsNames.RESPONSE))) { - row.getAs[Number](columns(InputColumnsNames.RESPONSE)).doubleValue - } else { - Double.NaN - } - } - - val offset = if (row.schema.fieldNames.contains(columns(InputColumnsNames.OFFSET))) { - Option(row.getAs[Number](columns(InputColumnsNames.OFFSET))).map(_.doubleValue) - } else { - None - } - - val weight = if (row.schema.fieldNames.contains(columns(InputColumnsNames.WEIGHT))) { - Option(row.getAs[Number](columns(InputColumnsNames.WEIGHT))).map(_.doubleValue) - } else { - None - } - - val idTagToValueMap = - // TODO: find a better way to handle the field "uid", which is used in ScoringResult - if (row.schema.fieldNames.contains(columns(InputColumnsNames.UID)) - && row.getAs[Any](columns(InputColumnsNames.UID)) != null) { - getIdTagToValueMapFromRow(row, idTagSet, columns) + - (InputColumnsNames.UID.toString -> row.getAs[Any](columns(InputColumnsNames.UID)).toString) - } else { - getIdTagToValueMapFromRow(row, idTagSet, columns) - } - - new GameDatum( - response, - offset, - weight, - featureShardContainer, - idTagToValueMap) - } - - /** - * Given a [[DataFrame]] [[Row]], build a map of ID tag to ID value. 
- * - * @param row The source DataFrame row - * @param idTagSet The set of columns/metadata fields expected for the [[Row]] - * @return The map of ID tag to ID value map for the [[Row]] - */ - protected[data] def getIdTagToValueMapFromRow( - row: Row, - idTagSet: Set[String], - columns: InputColumnsNames = InputColumnsNames()): Map[String, String] = { - - val metaMap: Option[Map[String, String]] = if (row.schema.fieldNames.contains(columns(InputColumnsNames.META_DATA_MAP))) { - Some(row.getAs[Map[String, String]](columns(InputColumnsNames.META_DATA_MAP))) - } else { - None - } - - idTagSet - .map { idTag => - val idFromRow: Option[String] = if (row.schema.fieldNames.contains(idTag)) { - Some(row.getAs[Any](idTag).toString) - } else { - None - } - - val id = idFromRow - .orElse { - metaMap.flatMap(_.get(idTag)) - } - .getOrElse( - throw new IllegalArgumentException( - s"Cannot find id in either record field: $idTag or in metadataMap with key: #$idTag")) - - // random effect group name -> random effect group id value - // random effect types are assumed to be strings - (idTag, id) - } - .toMap - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala deleted file mode 100644 index 312e4957..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/LocalDataset.scala +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data - -/** - * Local dataset implementation. - * - * @note One design concern is whether to store the local data as a [[Map]] or an [[Array]] (high sort cost, but low - * merge cost vs. no sort cost but high merge cost). Currently, we use an [[Array]] since the data is only sorted - * once, and used as the base for all other data/score [[Array]]s. - * - * @param dataPoints Local data points consists of (globalId, labeledPoint) pairs - */ -protected[ml] case class LocalDataset(dataPoints: Array[LabeledPoint]) { - - require( - dataPoints.length > 0, - "Cannot create LocalDataset with empty data array") - - val numDataPoints: Int = dataPoints.length - val numFeatures: Int = dataPoints - .head - .features - .length -} \ No newline at end of file diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDatasetPartitioner.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDatasetPartitioner.scala deleted file mode 100644 index fcc81e13..00000000 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/data/RandomEffectDatasetPartitioner.scala +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. 
- */ -package com.linkedin.photon.ml.data - -import scala.collection.{Map, immutable, mutable} - -import org.apache.spark.broadcast.Broadcast -import org.apache.spark.rdd.RDD -import org.apache.spark.{HashPartitioner, Partitioner} - -import com.linkedin.photon.ml.Types.REId -import com.linkedin.photon.ml.spark.BroadcastLike - -/** - * Partitioner implementation for random effect datasets. - * - * In GAME, we can improve on Spark default partitioning by using domain-specific knowledge in two ways. First, we can - * reduce time spent in shuffle operations by leveraging training record keys (helping joins). Second, we assume that - * each random effect has less than the maximum partition size of associated training data, i.e. that all the training - * data for a given RE will fit within a single Spark data partition. So we can group the training records so that they - * all land in the same partition for a given RE, which is what RandomEffectDatasetPartitioner is about. - * - * RandomEffectDatasetPartitioner also makes sure that partitions are as equally balanced as possible, to equalize the - * workload of the executors: because we assume the data for each random effect is small, it will usually not even fill - * a Spark data partition, so we fill up the partition (i.e. add (id/partition) records to idToPartitionMap with data - * for multiple random effects). However, since idToPartitionMap is eventually broadcast to the executors, we also want - * to keep the size of that Map under control (see parameter partitionerCapacity below). 
- * - * @param numPartitions Number of partitions across which to split random effects - * @param idToPartitionMap Random effect type to partition map - */ -protected[ml] class RandomEffectDatasetPartitioner( - val numPartitions: Int, - private val idToPartitionMap: Broadcast[Map[REId, Int]]) - extends Partitioner - with BroadcastLike { - - // Backup partitioner for random effect IDs not found in the primary assignment Map - lazy private val backupPartitioner: HashPartitioner = new HashPartitioner(numPartitions) - - /** - * Asynchronously delete cached copies of this broadcast on the executors. - * - * @return This object with all its broadcast variables unpersisted - */ - override def unpersistBroadcast(): this.type = { - idToPartitionMap.unpersist() - this - } - - /** - * Compares two [[RandomEffectDatasetPartitioner]] objects. - * - * @param that Some other object - * @return True if the two partitioners have the same idToPartitionMap, false otherwise - */ - override def equals(that: Any): Boolean = - that match { - case other: RandomEffectDatasetPartitioner => this.idToPartitionMap.value.equals(other.idToPartitionMap.value) - case _ => false - } - - /** - * Returns a hash code value for the object. - * - * @return An [[Int]] hash code - */ - override def hashCode: Int = idToPartitionMap.hashCode() - - /** - * For a given key, get the corresponding partition id. If the key is not in any partition, we randomly assign - * the training vector to a partition (with Spark's HashPartitioner). - * - * @param key A training vector key (String). - * @return The partition id to which the training vector belongs. 
- */ - def getPartition(key: Any): Int = key match { - case reId: REId => - idToPartitionMap.value.getOrElse(reId, backupPartitioner.getPartition(reId)) - - case any => - throw new IllegalArgumentException(s"Expected key of ${this.getClass} is String, but ${any.getClass} found") - } -} - -object RandomEffectDatasetPartitioner { - - /** - * Generate a partitioner for one random effect model. - * - * Multiple random effect models, one per random effect ID (e.g. "user123"), are instantiated for a single random - * effect type (e.g. "per-user"), and each of these instantiations is trained with training vectors marked for that - * random effect ID. We collect the training vector ids that correspond to the random effect type, then build an id - * to partition map. Data should be distributed across partitions as equally as possible. Since some items have more - * data points than others, this partitioner uses simple 'bin packing' for distributing data load across partitions - * (using minHeap). - * - * We stop filling in idToPartitionMap at partitionerCapacity records, because this map is passed to the executors - * and we therefore wish to control/limit its size. 
- * - * @param gameDataset The GAME training dataset - * @param reConfig The random effect data configuration options - * @param partitionerCapacity The partitioner capacity - * @return A partitioner for one random effect model - */ - def fromGameDataset( - gameDataset: RDD[(Long, GameDatum)], - reConfig: RandomEffectDataConfiguration, - partitionerCapacity: Int = 10000): RandomEffectDatasetPartitioner = { - - val numPartitions = reConfig.minNumPartitions - val randomEffectType = reConfig.randomEffectType - val activeDataUpperBoundOpt = reConfig.numActiveDataPointsUpperBound - - require(numPartitions > 0, s"Number of partitions ($numPartitions) has to be larger than 0.") - - val rawSortedRandomEffectTypes = gameDataset - .values - .filter(_.idTagToValueMap.contains(randomEffectType)) - .map(gameData => (gameData.idTagToValueMap(randomEffectType), 1)) - .reduceByKey(_ + _) - .collect() - .sortBy(_._2 * -1) - .take(partitionerCapacity) - - // If the number of active samples is bounded, we can partition them better by using the bound as the count - val sortedRandomEffectTypes = activeDataUpperBoundOpt match { - case Some(bound) => - rawSortedRandomEffectTypes.map { case (reId, count) => - - val newCount = if (count > bound) bound else count - - (reId, newCount) - } - - case None => - rawSortedRandomEffectTypes - } - - val ordering = new Ordering[(Int, Int)] { - def compare(pair1: (Int, Int), pair2: (Int, Int)): Int = pair2._2 compare pair1._2 - } - - val minHeap = mutable.PriorityQueue.newBuilder[(Int, Int)](ordering) - minHeap ++= Array.tabulate[(Int, Int)](numPartitions)(i => (i, 0)) - val idToPartitionMapBuilder = immutable.Map.newBuilder[String, Int] - idToPartitionMapBuilder.sizeHint(numPartitions) - - sortedRandomEffectTypes.foreach { case (id, size) => - val (partition, currentSize) = minHeap.dequeue() - idToPartitionMapBuilder += id -> partition - minHeap.enqueue((partition, currentSize + size)) - } - - new RandomEffectDatasetPartitioner( - numPartitions, - 
gameDataset.sparkContext.broadcast(idToPartitionMapBuilder.result())) - } -} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala index b8c151bd..8772f0e5 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala @@ -322,7 +322,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P // Transform the GAME validation data set into fixed and random effect specific data sets val evaluationSuiteOpt = Timed("Prepare validation data, if any") { - validationDataOpt.map { case validData => prepareValidationEvaluators(validData) } + validationDataOpt.map(validationData => prepareValidationEvaluators(validationData)) } val coordinateDescent = new CoordinateDescent( @@ -354,9 +354,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P } } - evaluationSuiteOpt.map { case evaluationSuite => - evaluationSuite.unpersistRDD() - } + evaluationSuiteOpt.map(_.unpersistRDD()) // Return the trained models, along with validation information (if any), and model configuration results diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala index d8deb093..5d6cc9a4 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala @@ -15,14 +15,11 @@ package com.linkedin.photon.ml.model import org.apache.spark.broadcast.Broadcast -import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.{DataFrame, SparkSession} -import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.TaskType.TaskType import 
com.linkedin.photon.ml.Types.FeatureShardId -import com.linkedin.photon.ml.constants.DataConst -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.spark.BroadcastLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.util.VectorUtils @@ -53,11 +50,11 @@ class FixedEffectModel( * * @note Use a static method to avoid serializing entire model object during RDD operations. * @param dataPoints The dataset to score + * @param scoreField The name of the score field * @return The computed scores */ - override def score(dataPoints: DataFrame): CoordinateDataScores = { - val scores = FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId) - new CoordinateDataScores(scores) + override def computeScore(dataPoints: DataFrame, scoreField: String): DataFrame = { + FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId, scoreField) } /** @@ -98,10 +95,15 @@ class FixedEffectModel( * @return An [[Int]] hash code */ override def hashCode: Int = featureShardId.hashCode + model.hashCode + } object FixedEffectModel { + def apply(glm: GeneralizedLinearModel, featureShardId: FeatureShardId): FixedEffectModel = { + new FixedEffectModel(SparkSession.builder.getOrCreate.sparkContext.broadcast(glm), featureShardId) + } + /** * Compute the scores for the dataset. 
* @@ -113,13 +115,11 @@ object FixedEffectModel { private def score( dataset: DataFrame, modelBroadcast: Broadcast[GeneralizedLinearModel], - featureShardId: FeatureShardId): DataFrame = { + featureShardId: FeatureShardId, + scoreField: String): DataFrame = { val cofs = VectorUtils.breezeToMl(modelBroadcast.value.coefficients.means) - val scores = dataset - .withColumn(DataConst.SCORE, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) - .select(Constants.UNIQUE_SAMPLE_ID, DataConst.SCORE) - - scores + dataset + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala index 4ed00514..d17d4f6f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala @@ -14,20 +14,15 @@ */ package com.linkedin.photon.ml.model -import org.apache.spark.SparkContext +import org.apache.spark.ml.linalg.{Vector => SparkVector} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col -import org.apache.spark.storage.StorageLevel -import org.apache.spark.ml.linalg.{Vector => SparkVector} -import com.linkedin.photon.ml.{Constants, TaskType} +import com.linkedin.photon.ml.TaskType import com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.Types.{FeatureShardId, REType} import com.linkedin.photon.ml.constants.DataConst -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores -import com.linkedin.photon.ml.spark.RDDLike import com.linkedin.photon.ml.supervised.classification.{LogisticRegressionModel, SmoothedHingeLossLinearSVMModel} import 
com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.supervised.regression.{LinearRegressionModel, PoissonRegressionModel} @@ -44,8 +39,7 @@ class RandomEffectModel( val models: DataFrame, val randomEffectType: REType, val featureShardId: FeatureShardId) - extends DatumScoringModel - with RDDLike { + extends DatumScoringModel { override val modelType: TaskType = RandomEffectModel.determineModelType(models) @@ -70,18 +64,17 @@ class RandomEffectModel( * Compute the score for the dataset. * * @note Use a static method to avoid serializing entire model object during RDD operations. - * @param dataset The dataset to score (Note that the Long in the RDD is a unique identifier for the paired - * [[GameDatum]] object, referred to in the GAME code as the "unique id") + * @param dataset The dataset to score * @return The computed scores */ - override def score(dataset: DataFrame): CoordinateDataScores = { + override def computeScore(dataset: DataFrame, scoreField: String): DataFrame = { - val scores = RandomEffectModel.score( + RandomEffectModel.score( dataset, models, randomEffectType, - featureShardId) - new CoordinateDataScores(scores) + featureShardId, + scoreField) } // @@ -108,60 +101,6 @@ class RandomEffectModel( stringBuilder.toString() } - // - // RDDLike functions - // - - /** - * Get the Spark context. - * - * @return The Spark context - */ - override protected[ml] def sparkContext: SparkContext = SparkSession.builder.getOrCreate.sparkContext - - override protected[ml] def setName(name: String): RandomEffectModel = { - - this - } - - /** - * Set the storage level of [[models]], and persist their values across the cluster the first time they are - * computed. 
- * - * @param storageLevel The storage level - * @return This object with the storage level of [[models]] set - */ - override protected[ml] def persistRDD(storageLevel: StorageLevel): RandomEffectModel = { - - models.persist(storageLevel) - - this - } - - /** - * Mark [[models]] as non-persistent, and remove all blocks for them from memory and disk. - * - * @return This object with [[models]] marked non-persistent - */ - override protected[ml] def unpersistRDD(): RandomEffectModel = { - - models.unpersist() - - this - } - - /** - * Materialize [[models]] (Spark data are lazy evaluated: this method forces them to be evaluated). - * - * @return This object with [[models]] materialized - */ - override protected[ml] def materialize(): RandomEffectModel = { - - models.count() - - this - } - /** * Compares two [[RandomEffectModel]] objects. * @@ -224,6 +163,7 @@ class RandomEffectModel( * @return An [[Int]] hash code */ override def hashCode(): Int = super.hashCode() + } object RandomEffectModel { @@ -263,14 +203,12 @@ object RandomEffectModel { dataset: DataFrame, models: DataFrame, randomEffectType: REType, - featureShardId: FeatureShardId): DataFrame = { + featureShardId: FeatureShardId, + scoreField: String): DataFrame = { - val scores: DataFrame = dataset + dataset .join(models, randomEffectType) - .withColumn(DataConst.SCORE, GeneralizedLinearModel.scoreUdf(col(DataConst.COEFFICIENTS), col(featureShardId))) - .select(Constants.UNIQUE_SAMPLE_ID, DataConst.SCORE) - - scores + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(col(DataConst.COEFFICIENTS), col(featureShardId))) } def toDataFrame(input: RDD[(REType, GeneralizedLinearModel)]): DataFrame = { diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala index 0d8d7c8c..25de24d7 100644 --- 
a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/game/RandomEffectOptimizationProblem.scala @@ -19,14 +19,13 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.Types.{REId, REType} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.function.SingleNodeObjectiveFunction import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.normalization.NormalizationContext -import com.linkedin.photon.ml.optimization.{SingleNodeOptimizationProblem, VarianceComputationType} import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType -import com.linkedin.photon.ml.projector.LinearSubspaceProjector +import com.linkedin.photon.ml.optimization.{SingleNodeOptimizationProblem, VarianceComputationType} import com.linkedin.photon.ml.spark.RDDLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.util.PhotonNonBroadcast @@ -146,7 +145,7 @@ object RandomEffectOptimizationProblem { // Generate new NormalizationContext and SingleNodeOptimizationProblem objects val optimizationProblems = data - .select(rEType, Constants.UNIQUE_SAMPLE_ID) + .select(rEType, DataConst.ID) .groupBy(rEType) .count .rdd diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala index 14c9cc1c..57822bc4 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala @@ -53,25 +53,6 @@ abstract class GeneralizedLinearModel(val coefficients: 
Coefficients) extends Se */ protected[ml] def computeMean(features: Vector[Double], offset: Double): Double - /** - * Compute the score for the given features. - * - * @note "score" = coefficients * features (no link function in the case of logistic regression: see above) - * - * @param features The input data point's feature - * @return The score for the passed features - */ - def computeScore(features: Vector[Double]): Double = coefficients.computeScore(features) - - /** - * Compute the value of the mean function of the generalized linear model given one data point using the estimated - * coefficients. - * - * @param features Vector representing a single data point's features - * @return Computed mean function value - */ - def computeMeanFunction(features: Vector[Double]): Double = computeMeanFunctionWithOffset(features, 0.0) - /** * Compute the value of the mean function of the generalized linear model given one data point using the estimated * coefficients. @@ -173,29 +154,30 @@ object GeneralizedLinearModel { /** * A UDF to compute scores given a linear model and a feature vector + * * @return The score which is the dot product of model coefficients and features */ - def scoreUdf = udf({(coefficients: SparkVector, features: SparkVector) => - require( - coefficients.size == features.size, - s"Coefficients.size = ${coefficients.size} and features.size = ${features.size}") - - val score = coefficients match { - case (dCoef: DenseVector) => - val array = dCoef.toArray - var s = 0.0 - features.foreachActive((i, v) => s += v * array(i)) - s - case (sCoef: SparseVector) => - val array = features.toArray - var s = 0.0 - sCoef.foreachActive((i, v) => s += v * array(i)) - s - case _ => throw new UnsupportedOperationException( - s"Coefficients type ${coefficients.getClass} is not supported.") + def scoreUdf = udf[Double, SparseVector, SparseVector]( + { (coefficients: SparkVector, features: SparkVector) => + require( + coefficients.size == features.size, + 
s"Coefficients.size = ${coefficients.size} and features.size = ${features.size}") + + var score = 0D + + coefficients match { + case denseCoef: DenseVector => + features.foreachActive { case (index, value) => + score += value * denseCoef(index) + } + + case sparseCoef: SparseVector => + sparseCoef.foreachActive { case (index, coefficient) => + score += coefficient * features(index) + } + } - } - score - }) + score + }) } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala index 82fb6c39..0efc4775 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala @@ -24,8 +24,7 @@ import org.apache.spark.storage.StorageLevel import org.slf4j.Logger import com.linkedin.photon.ml.Types.{FeatureShardId, REType, UniqueSampleId} -import com.linkedin.photon.ml.data.scoring.ModelDataScores -import com.linkedin.photon.ml.data.{GameConverters, GameDatum, InputColumnsNames} +import com.linkedin.photon.ml.data.{GameDatum, InputColumnsNames} import com.linkedin.photon.ml.evaluation._ import com.linkedin.photon.ml.model.{FixedEffectModel, GameModel, RandomEffectModel} import com.linkedin.photon.ml.util._ diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala new file mode 100644 index 00000000..6c396adc --- /dev/null +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala @@ -0,0 +1,33 @@ +/* + * Copyright 2017 LinkedIn Corp. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
You may obtain a + * copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + */ + +package com.linkedin.photon.ml.util + +import scala.util.Try +import org.apache.spark.sql.DataFrame + +object ApiUtils { + + def randomString(length: Int): String = { + val r = new scala.util.Random + val sb = new StringBuilder + for (i <- 1 to length) { + sb.append(r.nextPrintableChar) + } + sb.toString + } + + def hasColumn(df: DataFrame, path: String): Boolean = Try(df(path)).isSuccess +} diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala similarity index 94% rename from photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala rename to photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala index 95ee568c..d8e504a0 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/Constants.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala @@ -12,11 +12,9 @@ * License for the specific language governing permissions and limitations * under the License. 
*/ - package com.linkedin.photon.ml import org.joda.time.DateTimeZone -import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.util.Utils /** @@ -46,6 +44,4 @@ object Constants { * Default time zone for relative date calculations */ val DEFAULT_TIME_ZONE = DateTimeZone.UTC - - val UNIQUE_SAMPLE_ID = DataConst.ID } diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala index 55f0a458..fc339256 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala @@ -25,7 +25,6 @@ import com.linkedin.photon.ml.{Constants, DataValidationType, SparkSessionConfig import com.linkedin.photon.ml.Types.FeatureShardId import com.linkedin.photon.ml.cli.game.GameDriver import com.linkedin.photon.ml.data.avro._ -import com.linkedin.photon.ml.data.scoring.ModelDataScores import com.linkedin.photon.ml.data.{DataValidators, InputColumnsNames} import com.linkedin.photon.ml.index.IndexMapLoader import com.linkedin.photon.ml.io.scopt.game.ScoptGameScoringParametersParser diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala index 17e52425..20938a30 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/training/GameTrainingDriver.scala @@ -20,24 +20,25 @@ import org.apache.spark.SparkContext import org.apache.spark.ml.linalg.{Vector => SparkMLVector} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators, Params} import org.apache.spark.sql.DataFrame -import org.apache.spark.storage.StorageLevel import 
org.apache.spark.sql.functions.monotonically_increasing_id +import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml._ import com.linkedin.photon.ml.HyperparameterTunerName.HyperparameterTunerName import com.linkedin.photon.ml.HyperparameterTuningMode.HyperparameterTuningMode import com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.Types._ +import com.linkedin.photon.ml._ import com.linkedin.photon.ml.cli.game.GameDriver -import com.linkedin.photon.ml.data.{DataValidators, FixedEffectDataConfiguration, InputColumnsNames, RandomEffectDataConfiguration} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data.avro.{AvroDataReader, ModelProcessingUtils} +import com.linkedin.photon.ml.data.{DataValidators, FixedEffectDataConfiguration, InputColumnsNames, RandomEffectDataConfiguration} import com.linkedin.photon.ml.estimators.GameEstimator.GameOptimizationConfiguration import com.linkedin.photon.ml.estimators.{GameEstimator, GameEstimatorEvaluationFunction} import com.linkedin.photon.ml.hyperparameter.tuner.HyperparameterTunerFactory import com.linkedin.photon.ml.index.{IndexMap, IndexMapLoader} -import com.linkedin.photon.ml.io.{CoordinateConfiguration, ModelOutputMode, RandomEffectCoordinateConfiguration} import com.linkedin.photon.ml.io.ModelOutputMode.ModelOutputMode import com.linkedin.photon.ml.io.scopt.game.ScoptGameTrainingParametersParser +import com.linkedin.photon.ml.io.{CoordinateConfiguration, ModelOutputMode, RandomEffectCoordinateConfiguration} import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel, RandomEffectModel} import com.linkedin.photon.ml.normalization.NormalizationType.NormalizationType import com.linkedin.photon.ml.normalization.{NormalizationContext, NormalizationType} @@ -46,8 +47,7 @@ import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceCompu import com.linkedin.photon.ml.optimization.game.CoordinateOptimizationConfiguration import 
com.linkedin.photon.ml.stat.FeatureDataStatistics import com.linkedin.photon.ml.util.Implicits._ -import com.linkedin.photon.ml.util.Utils -import com.linkedin.photon.ml.util._ +import com.linkedin.photon.ml.util.{Utils, _} /** * This object is the entry point and driver for GAME training. There is a separate driver object for scoring. @@ -361,14 +361,14 @@ object GameTrainingDriver extends GameDriver { readTrainingData(avroDataReader, featureIndexMapLoadersOpt) } val gameTrainingData = Timed("Prepare GAME training data") { - trainingData.withColumn(Constants.UNIQUE_SAMPLE_ID, monotonically_increasing_id) + trainingData.withColumn(DataConst.ID, monotonically_increasing_id) } val validationData = Timed(s"Read validation data") { readValidationData(avroDataReader, featureIndexMapLoaders) } val gameValidationData = Timed("Prepare GAME validation data") { - validationData.map(_.withColumn(Constants.UNIQUE_SAMPLE_ID, monotonically_increasing_id)) + validationData.map(_.withColumn(DataConst.ID, monotonically_increasing_id)) } val interceptIndices = featureIndexMapLoaders.flatMap { case (coordinateId, indexMap) => diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/util/Utils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala similarity index 94% rename from photon-api/src/main/scala/com/linkedin/photon/ml/util/Utils.scala rename to photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala index 5096b19c..cd0486c7 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/util/Utils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala @@ -1,9 +1,24 @@ +/* + * Copyright 2017 LinkedIn Corp. All rights reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
You may obtain a + * copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + */ package com.linkedin.photon.ml.util import org.apache.avro.generic.GenericRecord import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.evaluation.EvaluatorType._ import com.linkedin.photon.ml.evaluation.{EvaluatorType, MultiAUC, MultiPrecisionAtK} @@ -103,9 +118,9 @@ object Utils { * @return A java map of String -> Object */ def getMapAvro( - record: GenericRecord, - key: String, - isNullOK: Boolean = false): Map[String, JObject] = { + record: GenericRecord, + key: String, + isNullOK: Boolean = false): Map[String, JObject] = { type T = java.util.Map[Any, JObject] // to avoid type erasure warning record.get(key) match { @@ -311,4 +326,4 @@ object Utils { * @return Some[T] if p or None */ def filter[T](p: => Boolean)(f: => T): Option[T] = if (p) Some(f) else None -} +} \ No newline at end of file diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala index 088a71e0..de1acae6 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala @@ -14,74 +14,37 @@ */ package com.linkedin.photon.ml.algorithm -import org.apache.spark.sql.DataFrame -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.DatumScoringModel import 
com.linkedin.photon.ml.optimization.OptimizationTracker /** - * The optimization problem coordinate for each effect model. - * - */ + * The optimization problem coordinate for each effect model. + * + */ protected[ml] abstract class Coordinate { /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. - * - * @return A (updated model, optimization state tracking information) tuple - */ + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. + * + * @return A (updated model, optimization state tracking information) tuple + */ protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset with residuals from other - * coordinates. - * - * @param score The combined scores for each record of the other coordinates - * @return A (updated model, optimization state tracking information) tuple - */ - protected[algorithm] def trainModel(score: CoordinateDataScores): (DatumScoringModel, OptimizationTracker) = { - updateDataset(score) - trainModel() - } - - /** - * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as - * a starting point. - * - * @param model The model to use as a starting point - * @return A (updated model, optimization state tracking information) tuple - */ + * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset using an existing model as + * a starting point. + * + * @param model The model to use as a starting point + * @return A (updated model, optimization state tracking information) tuple + */ protected[algorithm] def trainModel(model: DatumScoringModel): (DatumScoringModel, OptimizationTracker) - /** - * Compute an optimized model (i.e. 
run the coordinate optimizer) for the current dataset using an existing model as - * a starting point and with residuals from other coordinates. - * - * @param model The existing model - * @param score The combined scores for each record of the other coordinates - * @return A (updated model, optimization state tracking information) tuple - */ - protected[algorithm] def trainModel( - model: DatumScoringModel, - score: CoordinateDataScores): (DatumScoringModel, OptimizationTracker) = { - updateDataset(score) - trainModel(model) - } - /** * Generate a new dataset with updated offset. * - * @param scores The score dataset + * @param model The model of previous coordinate * @return A new dataset with the updated offsets */ - protected def updateDataset(scores: CoordinateDataScores) - - /** - * Compute scores for the coordinate data using a given model. - * - * @param model The input model - * @return The dataset scores - */ - protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores + def updateOffset(model: DatumScoringModel) } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala index 98376575..aee16d54 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala @@ -16,12 +16,11 @@ package com.linkedin.photon.ml.algorithm import scala.collection.mutable -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel import org.slf4j.Logger import com.linkedin.photon.ml.Types.CoordinateId -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.evaluation.{EvaluationResults, EvaluationSuite, EvaluatorType} import com.linkedin.photon.ml.model.{DatumScoringModel, GameModel} import 
com.linkedin.photon.ml.optimization.OptimizationTracker @@ -178,7 +177,6 @@ object CoordinateDescent { * @param iteration The current iteration of coordinate descent (for logging purposes) * @param initialModelOpt An optional initial model whose coefficients should be used as a starting point for * optimization - * @param residualsOpt Optional residual scores to add to the training data offsets * @param logger An implicit logger * @return The new model trained for the coordinate */ @@ -187,14 +185,24 @@ object CoordinateDescent { coordinate: Coordinate, iteration: Int, initialModelOpt: Option[DatumScoringModel], - residualsOpt: Option[CoordinateDataScores])( + prevModelOpt: Option[DatumScoringModel])( implicit logger: Logger): DatumScoringModel = Timed(s"Optimizing coordinate '$coordinateId' for iteration $iteration") { logger.debug(s"Updating coordinate of class ${coordinate.getClass}") - val (model, tracker) = (initialModelOpt, residualsOpt) match { + prevModelOpt.map(model => coordinate.updateOffset(model)) + + val (model, tracker) = initialModelOpt.map( + initialModel => Timed(s"Train new model using existing model as starting point") { + coordinate.trainModel(initialModel) + }).getOrElse( + Timed(s"Train new model") { + coordinate.trainModel() + }) + + /*(initialModelOpt, residualsOpt) match { case (Some(initialModel), Some(residuals)) => Timed(s"Train new model with residuals using existing model as starting point") { coordinate.trainModel(initialModel, residuals) @@ -214,7 +222,7 @@ object CoordinateDescent { Timed(s"Train new model") { coordinate.trainModel() } - } + }*/ logOptimizationSummary(logger, coordinateId, model, tracker) @@ -275,7 +283,7 @@ object CoordinateDescent { * @param coordinatesToTrain A list of coordinates for which to train new models * @param initialModelOpt An optional initial model whose coefficients should be used as a starting point for * optimization - * @param residualsOpt Optional residual scores to add to the training data 
offsets +// * @param residualsOpt Optional residual scores to add to the training data offsets * @param logger An implicit logger * @return The locked model if a new model should not be trained for this coordinate, a newly trained model otherwise. */ @@ -284,14 +292,15 @@ object CoordinateDescent { coordinate: Coordinate, coordinatesToTrain: Seq[CoordinateId], initialModelOpt: Option[DatumScoringModel], - residualsOpt: Option[CoordinateDataScores])( + prevModelOpt: Option[DatumScoringModel])( implicit logger: Logger): DatumScoringModel = if (coordinatesToTrain.contains(coordinateId)) { - val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, residualsOpt) + prevModelOpt.map(coordinate.updateOffset(_)) + val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, prevModelOpt) - persistModel(newModel, coordinateId, iteration = 1) + //persistModel(newModel, coordinateId, iteration = 1) newModel @@ -312,7 +321,7 @@ object CoordinateDescent { * [[com.linkedin.photon.ml.evaluation.Evaluator]] */ protected[algorithm] def evaluateModel( - modelToEvaluate: DatumScoringModel, + modelToEvaluate: GameModel, validationData: DataFrame, evaluationSuite: EvaluationSuite)( implicit logger: Logger): EvaluationResults = Timed("Validate GAME model") { @@ -322,7 +331,7 @@ object CoordinateDescent { } Timed(s"Compute evaluation metrics") { - val results = evaluationSuite.evaluate(validatingScores.scores) //todo: to fix it + val results = evaluationSuite.evaluate(validatingScores) //todo: to fix it results .evaluations @@ -334,14 +343,6 @@ object CoordinateDescent { } } - /** - * Cache summed residual scores to memory/disk. 
- * - * @param coordinateDataScores The residual scores to cache - */ - protected[algorithm] def persistSummedScores(coordinateDataScores: CoordinateDataScores): Unit = - coordinateDataScores.setName(s"Summed scores").persistRDD(StorageLevel.MEMORY_AND_DISK_SER).materialize() - /** * Remove a cached model from cache. * @@ -380,41 +381,15 @@ object CoordinateDescent { initialModels: Map[CoordinateId, DatumScoringModel])( implicit logger: Logger): GameModel = { - var i: Int = 2 - - // - // First coordinate, first iteration - // - - val firstCoordinateId = updateSequence.head - val firstCoordinate = coordinates(firstCoordinateId) - val firstCoordinateModel = trainOrFetchCoordinateModel( - firstCoordinateId, - firstCoordinate, - coordinatesToTrain, - initialModels.get(firstCoordinateId), - residualsOpt = None) - - var previousScores = firstCoordinate.score(firstCoordinateModel) - var summedScores: CoordinateDataScores = - CoordinateDataScores(SparkSession.builder().getOrCreate().emptyDataFrame) - val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = - mutable.Map(firstCoordinateId -> firstCoordinateModel) - val currentScores: mutable.Map[CoordinateId, CoordinateDataScores] = - mutable.Map(firstCoordinateId -> previousScores) - - previousScores.persistRDD(StorageLevel.DISK_ONLY) + var i: Int = 1 + val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = mutable.Map() + // The optional model of previous coordinate + var prevModelOpt: Option[DatumScoringModel] = None // - // Subsequent coordinates, first iteration + // First iteration // - - updateSequence.tail.foreach { coordinateId => - - val newSummedScores = previousScores + summedScores - persistSummedScores(newSummedScores) - summedScores.unpersistRDD() - summedScores = newSummedScores + updateSequence.foreach { coordinateId => val coordinate = coordinates(coordinateId) val newModel = trainOrFetchCoordinateModel( @@ -422,52 +397,36 @@ object CoordinateDescent { coordinate, coordinatesToTrain, 
initialModels.get(coordinateId), - Some(summedScores)) - - val scores = coordinate.score(newModel) - scores.persistRDD(StorageLevel.DISK_ONLY) + prevModelOpt) + // persist the new model + persistModel(newModel, coordinateId, 1) currentModels.put(coordinateId, newModel) - currentScores.put(coordinateId, scores) - previousScores = scores + prevModelOpt = Option.apply(newModel) } // - // Subsequent coordinates, subsequent iterations + // Subsequent iterations // - - while (i <= iterations) { + while (i < iterations) { coordinatesToTrain.foreach { coordinateId => - val oldScores = currentScores(coordinateId) - val newSummedScores = summedScores - oldScores + previousScores - persistSummedScores(newSummedScores) - summedScores.unpersistRDD() - oldScores.unpersistRDD() - summedScores = newSummedScores - val coordinate = coordinates(coordinateId) val oldModelOpt = currentModels.get(coordinateId) - val newModel = trainCoordinateModel(coordinateId, coordinate, i, oldModelOpt, Some(summedScores)) + val newModel = trainCoordinateModel(coordinateId, coordinate, i, oldModelOpt, prevModelOpt) persistModel(newModel, coordinateId, i) - unpersistModel(oldModelOpt.get) - - val scores = coordinate.score(newModel) - scores.persistRDD(StorageLevel.DISK_ONLY) - currentModels.put(coordinateId, newModel) - currentScores.put(coordinateId, scores) - previousScores = scores + unpersistModel(oldModelOpt.get) + prevModelOpt = Option.apply(newModel) } i += 1 } - summedScores.unpersistRDD() - currentScores.foreach { case (_, scores) => - scores.unpersistRDD() + currentModels.foreach { case (_, model) => + unpersistModel(model) } new GameModel(currentModels.toMap) @@ -502,48 +461,21 @@ object CoordinateDescent { evaluationSuite: EvaluationSuite)( implicit logger: Logger): (GameModel, EvaluationResults) = { - val evaluatorType: EvaluatorType = evaluationSuite.primaryEvaluator.evaluatorType - - var i: Int = 2 - - // - // First coordinate, first iteration - // + var i: Int = 1 + val currentModels: 
mutable.Map[CoordinateId, DatumScoringModel] = mutable.Map() + // The optional model of previous coordinate + var prevModelOpt: Option[DatumScoringModel] = Option.empty - val firstCoordinateId = updateSequence.head - val firstCoordinate = coordinates(firstCoordinateId) - val firstCoordinateModel = trainOrFetchCoordinateModel( - firstCoordinateId, - firstCoordinate, - coordinatesToTrain, - initialModels.get(firstCoordinateId), - residualsOpt = None) - - var previousScores = firstCoordinate.score(firstCoordinateModel) - var summedScores: CoordinateDataScores = - CoordinateDataScores(SparkSession.builder().getOrCreate().emptyDataFrame) - val currentModels: mutable.Map[CoordinateId, DatumScoringModel] = - mutable.Map(firstCoordinateId -> firstCoordinateModel) - val currentScores: mutable.Map[CoordinateId, CoordinateDataScores] = - mutable.Map(firstCoordinateId -> previousScores) - var bestModels: Map[CoordinateId, DatumScoringModel] = currentModels.toMap - var bestEvaluationResults: EvaluationResults = evaluateModel( - firstCoordinateModel, - validationData, - evaluationSuite) - - previousScores.persistRDD(StorageLevel.DISK_ONLY) + val evaluatorType: EvaluatorType = evaluationSuite.primaryEvaluator.evaluatorType + var bestEvaluationResults: EvaluationResults = null // - // Subsequent coordinates, first iteration + // First iteration // - updateSequence.tail.foreach { coordinateId => + updateSequence.foreach { coordinateId => - val newSummedScores = previousScores + summedScores - persistSummedScores(newSummedScores) - summedScores.unpersistRDD() - summedScores = newSummedScores +// summedScores = previousScores + summedScores val coordinate = coordinates(coordinateId) val newModel = trainOrFetchCoordinateModel( @@ -551,20 +483,19 @@ object CoordinateDescent { coordinate, coordinatesToTrain, initialModels.get(coordinateId), - Some(summedScores)) - - val scores = coordinate.score(newModel) - scores.persistRDD(StorageLevel.DISK_ONLY) + prevModelOpt) + // persist the new 
model + persistModel(newModel, coordinateId, 1) currentModels.put(coordinateId, newModel) - currentScores.put(coordinateId, scores) - previousScores = scores + prevModelOpt = Option.apply(newModel) val evaluationModel = new GameModel(currentModels.toMap) val evaluationResults = evaluateModel(evaluationModel, validationData, evaluationSuite) // Log warning if adding a coordinate reduces the overall model performance - if (evaluatorType.betterThan(bestEvaluationResults.primaryEvaluation, evaluationResults.primaryEvaluation)) { + if (bestEvaluationResults != null + && evaluatorType.betterThan(bestEvaluationResults.primaryEvaluation, evaluationResults.primaryEvaluation)) { logger.info(s"Warning: adding model for coordinate '$coordinateId' reduces overall model performance") } @@ -575,37 +506,28 @@ object CoordinateDescent { // Subsequent coordinates, subsequent iterations // - bestModels = currentModels.toMap + var bestModels: Map[CoordinateId, DatumScoringModel] = currentModels.toMap - while (i <= iterations) { + while (i < iterations) { coordinatesToTrain.foreach { coordinateId => - val oldScores = currentScores(coordinateId) - val newSummedScores = summedScores - oldScores + previousScores - persistSummedScores(newSummedScores) - summedScores.unpersistRDD() - oldScores.unpersistRDD() - summedScores = newSummedScores +// summedScores = summedScores - oldScores + previousScores val coordinate = coordinates(coordinateId) val oldModelOpt = currentModels.get(coordinateId) - val newModel = trainCoordinateModel(coordinateId, coordinate, i, oldModelOpt, Some(summedScores)) + val newModel = trainCoordinateModel(coordinateId, coordinate, i, oldModelOpt, prevModelOpt) persistModel(newModel, coordinateId, i) + currentModels.put(coordinateId, newModel) + prevModelOpt = Option.apply(newModel) + // If the best GAME model doesn't have a model for this coordinate or it does but it's not the old model, // unpersist the old model. 
if (bestModels.get(coordinateId).forall(!_.eq(oldModelOpt.get))) { unpersistModel(oldModelOpt.get) } - val scores = coordinate.score(newModel) - scores.persistRDD(StorageLevel.DISK_ONLY) - - currentModels.put(coordinateId, newModel) - currentScores.put(coordinateId, scores) - previousScores = scores - val evaluationModel = new GameModel(currentModels.toMap) val evaluationResults = evaluateModel(evaluationModel, validationData, evaluationSuite) if (evaluatorType.betterThan(evaluationResults.primaryEvaluation, bestEvaluationResults.primaryEvaluation)) { @@ -627,10 +549,6 @@ object CoordinateDescent { i += 1 } - summedScores.unpersistRDD() - currentScores.foreach { case (_, scores) => - scores.unpersistRDD() - } currentModels.foreach { case (coordinateId, model) => // If the best GAME model doesn't have a model for this coordinate or it does but they don't match, unpersist it if (bestModels.get(coordinateId).forall(!_.eq(model))) { @@ -660,14 +578,15 @@ object CoordinateDescent { evaluationSuiteOpt: Option[EvaluationSuite])( implicit logger: Logger): (GameModel, Option[EvaluationResults]) = { - val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, residualsOpt = None) + val newModel = trainCoordinateModel(coordinateId, coordinate, iteration = 1, initialModelOpt, prevModelOpt = None) persistModel(newModel, coordinateId, iteration = 1) + val gameModel = new GameModel(Map(coordinateId -> newModel)) val evaluationResultsOpt = validationOpt.map { case validationData => - evaluateModel(newModel, validationData, evaluationSuiteOpt.get) + evaluateModel(gameModel, validationData, evaluationSuiteOpt.get) } - (new GameModel(Map(coordinateId -> newModel)), evaluationResultsOpt) + (gameModel, evaluationResultsOpt) } } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala index bde880c8..d22e750f 100644 --- 
a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/ModelCoordinate.scala @@ -14,7 +14,6 @@ */ package com.linkedin.photon.ml.algorithm -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.model.DatumScoringModel import com.linkedin.photon.ml.optimization.OptimizationTracker @@ -24,9 +23,6 @@ import com.linkedin.photon.ml.optimization.OptimizationTracker */ abstract class ModelCoordinate extends Coordinate { - override protected[algorithm] def updateDataset(scores: CoordinateDataScores) = - throw new UnsupportedOperationException("Attempted to update model coordinate.") - /** * Compute an optimized model (i.e. run the coordinate optimizer) for the current dataset. * @@ -51,5 +47,5 @@ abstract class ModelCoordinate extends Coordinate { * @param model The input model * @return The output scores */ - override protected[algorithm] def score(model: DatumScoringModel): CoordinateDataScores + override protected[algorithm] def updateOffset(model: DatumScoringModel) } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala deleted file mode 100644 index d6edd2fe..00000000 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/CoordinateDataScores.scala +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data.scoring - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.functions.{col, udf} - -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.constants.{DataConst, MathConst} - -/** - * The class used to track scored data points throughout training. The score objects are scores only, with no additional - * information. - * - * @param scores The scores dataframe consist of (unique ID, score) pairs as explained above. - */ -protected[ml] class CoordinateDataScores(override val scores: DataFrame) - extends DataScores[CoordinateDataScores](scores) { - - /** - * Generic method to combine two [[CoordinateDataScores]] objects. - * - * @param op The operator to combine two [[CoordinateDataScores]] - * @param that The [[CoordinateDataScores]] instance to merge with this instance - * @return A merged [[CoordinateDataScores]] - */ - private def joinAndApply(op: UserDefinedFunction, that: CoordinateDataScores): CoordinateDataScores = - new CoordinateDataScores( - this - .scores - .withColumnRenamed(DataConst.SCORE, "s1") - // use fullOuterJoin: it's possible for some data to not be scored by a model - .join(that.scores.withColumnRenamed(DataConst.SCORE, "s2"), col(DataConst.ID), "fullouter") - .withColumn("newScore", op(col("s1"), col("s2"))) - .select(DataConst.ID, "newScore") - .withColumnRenamed("newScore", DataConst.SCORE)) - - /** - * The addition operation for [[CoordinateDataScores]]. - * - * @note This operation performs a full outer join. 
- * @param that The [[CoordinateDataScores]] instance to add to this instance - * @return A new [[CoordinateDataScores]] instance encapsulating the accumulated values - */ - override def +(that: CoordinateDataScores): CoordinateDataScores = { - - val op = udf((a1: Double, a2: Double) => { - val s1 = Option(a1) match { - case Some(v) => v - case _ => MathConst.DEFAULT_SCORE - } - val s2 = Option(a2) match { - case Some(v) => v - case _ => MathConst.DEFAULT_SCORE - } - s1 + s2 - }) - joinAndApply(op, that) - } - - /** - * The minus operation for [[CoordinateDataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[CoordinateDataScores]] instance to subtract from this instance - * @return A new [[CoordinateDataScores]] instance encapsulating the subtracted values - */ - override def -(that: CoordinateDataScores): CoordinateDataScores = { - - val op = udf((a1: Double, a2: Double) => { - val s1 = Option(a1) match { - case Some(v) => v - case _ => MathConst.DEFAULT_SCORE - } - val s2 = Option(a2) match { - case Some(v) => v - case _ => MathConst.DEFAULT_SCORE - } - - s1 - s2 - }) - joinAndApply(op, that) - } - - /** - * Method used to define equality on multiple class levels while conforming to equality contract. Defines under - * what circumstances this class can equal another class. - * - * @param other Some other object - * @return Whether this object can equal the other object - */ - override def canEqual(other: Any): Boolean = other.isInstanceOf[CoordinateDataScores] -} - -object CoordinateDataScores { - - /** - * A factory method to create a [[CoordinateDataScores]] object from an [[RDD]] of scores. - * - * @param scores The scores, consisting of (unique ID, score) pairs. - * @return A new [[CoordinateDataScores]] object - */ - def apply(scores: DataFrame): CoordinateDataScores = new CoordinateDataScores(scores) - - /** - * Convert a [[GameDatum]] and a raw score into a score object. 
For [[CoordinateDataScores]] this is the raw score. - * - * @param datum The datum which was scored - * @param score The raw score for the datum - * @return The score object - */ - protected[ml] def toScore(datum: GameDatum, score: Double): Double = score -} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala deleted file mode 100644 index ac3d4d9e..00000000 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/data/scoring/DataScores.scala +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright 2017 LinkedIn Corp. All rights reserved. - * Licensed under the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. You may obtain a - * copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - */ -package com.linkedin.photon.ml.data.scoring - -import org.apache.spark.SparkContext -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.functions.col -import org.apache.spark.storage.StorageLevel - -import com.linkedin.photon.ml.spark.RDDLike -import com.linkedin.photon.ml.constants.DataConst - -/** - * A base class for tracking scored data points, where the scores are stored in an [[DataFrame]] - * which associates the unique - * ID of a data point with a score object. - * - * @param scores Data point scores, as described above - */ -abstract protected[ml] class DataScores[D <: DataScores[D]]( - val scores: DataFrame) - extends RDDLike { - - /** - * The addition operation for [[DataScores]]. 
- * - * @note This operation performs a full outer join. - * @param that The [[DataScores]] instance to add to this instance - * @return A new [[DataScores]] instance encapsulating the accumulated values - */ - def +(that: D): D - - /** - * The minus operation for [[DataScores]]. - * - * @note This operation performs a full outer join. - * @param that The [[DataScores]] instance to subtract from this instance - * @return A new [[DataScores]] instance encapsulating the subtracted values - */ - def -(that: D): D - - /** - * Get the Spark context for the distributed scores. - * - * @return The Spark context - */ - override def sparkContext: SparkContext = SparkSession.builder.getOrCreate.sparkContext - - /* RDDLike methods */ - override def setName(name: String): RDDLike = { - - this - } - - /** - * Set the storage level of [[scores]]. - * - * @param storageLevel The storage level - * @return This object with the storage level of [[scores]] set - */ - override def persistRDD(storageLevel: StorageLevel): RDDLike = { - - scores.persist(storageLevel) - - this - } - - /** - * Mark [[scores]] as non-persistent, and remove all blocks for them from memory and disk. - * - * @return This object with [[scores]] marked non-persistent - */ - override def unpersistRDD(): RDDLike = { - - scores.unpersist() - - this - } - - /** - * Materialize [[scores]] (Spark data are lazy evaluated: this method forces them to be evaluated). - * - * @return This object with [[scores]] materialized - */ - override def materialize(): RDDLike = { - - scores.count() - - this - } - - /** - * Method used to define equality on multiple class levels while conforming to equality contract. Defines under - * what circumstances this class can equal another class. - * - * @param other Some other object - * @return Whether this object can equal the other object - */ - def canEqual(other: Any): Boolean = other.isInstanceOf[DataScores[D]] - - /** - * Compare two [[DataScores]]s objects. 
- * - * @param other Some other object - * @return True if the both [[DataScores]] objects have identical scores for each unique ID, false otherwise - */ - override def equals(other: Any): Boolean = other match { - - case that: DataScores[D] => - - val canEqual = this.canEqual(that) - lazy val areEqual = this - .scores - .withColumnRenamed(DataConst.SCORE, "s1") - .join(that.scores.withColumnRenamed(DataConst.SCORE, "s2"), col(DataConst.ID), "fullouter") - .filter("s1 is null or s2 is null or s1 != s2") - .head(1) - .isEmpty - - canEqual && areEqual - - case _ => - false - } - - /** - * Returns a hash code value for the object. - * - * @return An [[Int]] hash code - */ - override def hashCode: Int = scores.hashCode() - -} diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluationSuite.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluationSuite.scala index 847d5df2..cdc87b53 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluationSuite.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluationSuite.scala @@ -16,6 +16,7 @@ package com.linkedin.photon.ml.evaluation import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel import com.linkedin.photon.ml.Types.UniqueSampleId @@ -54,7 +55,7 @@ class EvaluationSuite( * @param scores The scores to evaluate * @return The evaluation metric values as [[EvaluationResults]] */ - protected[ml] def evaluate(scores: RDD[(UniqueSampleId, Double)]): EvaluationResults = { + protected[ml] def evaluate(scores: DataFrame /* RDD[(UniqueSampleId, Double)]*/): EvaluationResults = { // Possible for all models to be missing a score for some datum, meaning the score for a datum is missing even after // summing scores from all models. Thus, need a leftOuterJoin. 
diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala index 4fe2650c..8c52e0d8 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala @@ -17,7 +17,6 @@ package com.linkedin.photon.ml.model import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.data.scoring.{CoordinateDataScores} import com.linkedin.photon.ml.util.Summarizable /** @@ -37,6 +36,6 @@ trait DatumScoringModel extends Summarizable { * @param dataPoints The dataset to score * @return The computed scores */ - def score(dataPoints: DataFrame): CoordinateDataScores + def computeScore(dataPoints: DataFrame, scoreField: String): DataFrame } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala index f2d38d1c..b7342633 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala @@ -20,8 +20,6 @@ import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.Types.CoordinateId -import com.linkedin.photon.ml.data.GameDatum -import com.linkedin.photon.ml.data.scoring.CoordinateDataScores import com.linkedin.photon.ml.util.ClassUtils /** @@ -92,11 +90,10 @@ class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) e * Compute score, PRIOR to going through any link function, i.e. just compute a dot product of feature values * and model coefficients. 
* - * @param dataPoints The dataset to score (Note that the Long in the RDD is a unique identifier for the paired - * [[GameDatum]] object, referred to in the GAME code as the "unique id") + * @param dataPoints The dataset to score * @return The computed scores */ - override def score(dataPoints: DataFrame): CoordinateDataScores = + override def score(dataPoints: DataFrame): DataFrame = gameModels.values.map(_.score(dataPoints)).reduce(_ + _) /** diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala index e5b23550..413c76fc 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala @@ -18,7 +18,6 @@ import java.util.Random import org.apache.spark.rdd.RDD -import com.linkedin.photon.ml.Types.UniqueSampleId import com.linkedin.photon.ml.constants.MathConst import com.linkedin.photon.ml.data.LabeledPoint From ae98cc827b21f9ab0223f187f0e69f3132ba9191 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Thu, 20 Feb 2020 11:45:19 -0800 Subject: [PATCH 08/11] Fix game scoring codes; remove usage of RandomEffectOptimizationProblem --- .../FixedEffectModelCoordinate.scala | 9 +- .../ml/algorithm/RandomEffectCoordinate.scala | 41 +++---- .../RandomEffectModelCoordinate.scala | 12 +- .../photon/ml/estimators/GameEstimator.scala | 45 ++++--- .../ml/evaluation/EvaluatorFactory.scala | 18 +-- .../photon/ml/model/FixedEffectModel.scala | 69 +++++++++-- .../photon/ml/model/RandomEffectModel.scala | 53 +++++++- .../ml/transformers/GameTransformer.scala | 114 ++++++------------ .../cli/game/scoring/GameScoringDriver.scala | 38 +++--- .../ml/data/avro/ScoreProcessingUtils.scala | 3 +- .../ml/algorithm/CoordinateDescent.scala | 31 ++--- .../ml/evaluation/EvaluationSuite.scala | 3 +- .../photon/ml/model/DatumScoringModel.scala | 12 ++ 
.../linkedin/photon/ml/model/GameModel.scala | 20 ++- 14 files changed, 273 insertions(+), 195 deletions(-) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala index 7aab7369..21eb794a 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala @@ -17,6 +17,7 @@ package com.linkedin.photon.ml.algorithm import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.Types.FeatureShardId +import com.linkedin.photon.ml.data.InputColumnsNames import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} /** @@ -25,7 +26,10 @@ import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} * @param dataset The training dataset * @param featureShardId The ID of the feature shard for the training data */ -class FixedEffectModelCoordinate(dataset: DataFrame, featureShardId: FeatureShardId) extends ModelCoordinate { +class FixedEffectModelCoordinate( + dataset: DataFrame, + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames) extends ModelCoordinate { /** * Score the effect-specific dataset in the coordinate with the input model. 
@@ -34,9 +38,10 @@ class FixedEffectModelCoordinate(dataset: DataFrame, featureShardId: FeatureShar * @return The output scores */ override protected def updateOffset(model: DatumScoringModel) = { + model match { case fixedEffectModel: FixedEffectModel => - FixedEffectCoordinate.updateOffset(dataset, fixedEffectModel, featureShardId) + FixedEffectCoordinate.updateOffset(dataset, fixedEffectModel, featureShardId, inputColumnsNames) case _ => throw new UnsupportedOperationException( diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala index bf88d3f3..8eb0e237 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala @@ -28,9 +28,9 @@ import com.linkedin.photon.ml.model.{Coefficients, DatumScoringModel, RandomEffe import com.linkedin.photon.ml.normalization.NormalizationContext import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType import com.linkedin.photon.ml.optimization._ -import com.linkedin.photon.ml.optimization.game.{RandomEffectOptimizationConfiguration, RandomEffectOptimizationProblem} +import com.linkedin.photon.ml.optimization.game.RandomEffectOptimizationConfiguration import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel -import com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} +import com.linkedin.photon.ml.util.{ApiUtils, PhotonNonBroadcast, VectorUtils} /** * The optimization problem coordinate for a random effect model. 
@@ -38,13 +38,13 @@ import com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} * @tparam Objective The type of objective function used to solve individual random effect optimization problems * @param rEType The random effect type * @param rawData The raw training dataframe - * @param optimizationProblem The random effect optimization problem + * @param optimizationProblem The single node optimization problem * @param inputColumnsNames */ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunction]( rEType: REType, var rawData: DataFrame, - optimizationProblem: RandomEffectOptimizationProblem[Objective], + optimizationProblem: SingleNodeOptimizationProblem[Objective], featureShardId: FeatureShardId, inputColumnsNames: InputColumnsNames) extends Coordinate { @@ -169,17 +169,14 @@ object RandomEffectCoordinate { interceptIndexOpt: Option[Int] = None): RandomEffectCoordinate[RandomEffectObjective] = { // Generate parameters of ProjectedRandomEffectCoordinate - val randomEffectOptimizationProblem = RandomEffectOptimizationProblem( - data, - rEType, + val optimizationProblem = SingleNodeOptimizationProblem( configuration, - objectiveFunctionFactory, + objectiveFunctionFactory(interceptIndexOpt), glmConstructor, - normalizationContext, - varianceComputationType, - interceptIndexOpt) + PhotonNonBroadcast(normalizationContext), + varianceComputationType) - new RandomEffectCoordinate(rEType, data, randomEffectOptimizationProblem, featureShardId, inputColumnsNames) + new RandomEffectCoordinate(rEType, data, optimizationProblem, featureShardId, inputColumnsNames) } /** @@ -189,7 +186,7 @@ object RandomEffectCoordinate { * @param randomEffectDataset The training dataset * @param randomEffectType * @param featureShardId - * @param randomEffectOptimizationProblem The per-entity optimization problems + * @param optimizationProblem The per-entity optimization problems * @param initialRandomEffectModelOpt An optional existing [[RandomEffectModel]] to use as a 
starting point for * optimization * @return A (new [[RandomEffectModel]], optional optimization stats) tuple @@ -198,10 +195,10 @@ object RandomEffectCoordinate { randomEffectDataset: DataFrame, randomEffectType: REType, featureShardId: FeatureShardId, - randomEffectOptimizationProblem: RandomEffectOptimizationProblem[Function], + optimizationProblem: SingleNodeOptimizationProblem[Function], initialRandomEffectModelOpt: Option[RandomEffectModel]): (RandomEffectModel, RandomEffectOptimizationTracker) = { - val rdd = randomEffectDataset + val data = randomEffectDataset .rdd .map { row => val reid = row.getInt(0).toString @@ -228,18 +225,14 @@ object RandomEffectCoordinate { (reid, result.toArray) } - // TODO: remove pre-REID optimization problems - // All 3 RDDs involved in these joins use the same partitioner - val dataAndOptimizationProblems = rdd.join(randomEffectOptimizationProblem.optimizationProblems) - // Left join the models to data and optimization problems for cases where we have a prior model but no new data val (newModels, randomEffectOptimizationTracker) = initialRandomEffectModelOpt .map { randomEffectModel => val modelsRdd = randomEffectModel.toRDD() val modelsAndTrackers = modelsRdd - .leftOuterJoin(dataAndOptimizationProblems) + .leftOuterJoin(data) .mapValues { - case (localModel, Some((localDataset, optimizationProblem))) => + case (localModel, Some((localDataset))) => val trainingLabeledPoints = localDataset val (updatedModel, stateTrackers) = optimizationProblem.run(trainingLabeledPoints, localModel) @@ -256,11 +249,7 @@ object RandomEffectCoordinate { (models, optimizationTracker) } .getOrElse { - val modelsAndTrackers = dataAndOptimizationProblems - .mapValues { case (localDataset, optimizationProblem) => - val trainingLabeledPoints = localDataset - optimizationProblem.run(trainingLabeledPoints) - } + val modelsAndTrackers = data.mapValues (optimizationProblem.run(_)) modelsAndTrackers.persist(StorageLevel.MEMORY_AND_DISK_SER) val models = 
modelsAndTrackers.mapValues(_._1) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala index ec09da8e..7f915fb2 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala @@ -15,6 +15,9 @@ package com.linkedin.photon.ml.algorithm import org.apache.spark.sql.DataFrame + +import com.linkedin.photon.ml.Types.{FeatureShardId, REType} +import com.linkedin.photon.ml.data.InputColumnsNames import com.linkedin.photon.ml.model.{DatumScoringModel, RandomEffectModel} /** @@ -22,7 +25,11 @@ import com.linkedin.photon.ml.model.{DatumScoringModel, RandomEffectModel} * * @param dataset The training dataset */ -class RandomEffectModelCoordinate(dataset: DataFrame) +class RandomEffectModelCoordinate( + rEType: REType, + dataset: DataFrame, + featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames) extends ModelCoordinate { /** @@ -32,9 +39,10 @@ class RandomEffectModelCoordinate(dataset: DataFrame) * @return The output scores */ override protected def updateOffset(model: DatumScoringModel) = { + model match { case randomEffectModel: RandomEffectModel => - RandomEffectCoordinate.updateOffset(dataset, randomEffectModel) + RandomEffectCoordinate.updateOffset(dataset, randomEffectModel, featureShardId, rEType, inputColumnsNames) case _ => throw new UnsupportedOperationException( diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala index 8772f0e5..092dbf32 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/estimators/GameEstimator.scala @@ -29,8 +29,9 @@ import 
org.slf4j.Logger import com.linkedin.photon.ml.TaskType import com.linkedin.photon.ml.TaskType.TaskType -import com.linkedin.photon.ml.Types.CoordinateId +import com.linkedin.photon.ml.Types.{CoordinateId, UniqueSampleId} import com.linkedin.photon.ml.algorithm._ +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ import com.linkedin.photon.ml.evaluation._ import com.linkedin.photon.ml.function.ObjectiveFunctionHelper @@ -407,10 +408,10 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P val response = columnsNames(InputColumnsNames.RESPONSE) val offset = columnsNames(InputColumnsNames.OFFSET) val weight = columnsNames(InputColumnsNames.WEIGHT) - val validatingLabelsAndOffsetsAndWeights = dataset.select(response, offset, weight) + val validatingLabelsAndOffsetsAndWeights = dataset.select(DataConst.ID, response, offset, weight) val evaluators = get(validationEvaluators) - .map(_.map(EvaluatorFactory.buildEvaluator(_, dataset))) //TODO: fix the errors + .map(_.map(EvaluatorFactory.buildEvaluator(_, dataset))) .getOrElse { // Get default evaluators given the task type val taskType = getRequiredParam(trainingTask) @@ -423,18 +424,25 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P Seq(defaultEvaluator) } - val evaluationSuite = EvaluationSuite(evaluators, validatingLabelsAndOffsetsAndWeights) //TODO: fix the errors + + val validatingLabelsAndOffsetsAndWeightsRdd = validatingLabelsAndOffsetsAndWeights + .rdd.map(row => (row.getAs[UniqueSampleId](0), (row.getDouble(1), row.getDouble(2), row.getDouble(3)))) + val evaluationSuite = EvaluationSuite(evaluators, validatingLabelsAndOffsetsAndWeightsRdd) .setName(s"Evaluation: validation data labels, offsets, and weights") .persistRDD(StorageLevel.MEMORY_AND_DISK) if (logger.isDebugEnabled) { val randUdf = udf({() => Random.nextInt()}) - val randomScores = dataset.withColumn("score", randUdf()).select("score") + val 
randomScores = dataset.withColumn(DataConst.SCORE, randUdf()) + .select(DataConst.ID, DataConst.SCORE) + .rdd + .map(row => (row.getAs[UniqueSampleId](0), row.getDouble(1))) + randomScores.persist() evaluationSuite - .evaluate(randomScores) //TODO: fix the errors + .evaluate(randomScores) .evaluations .foreach { case (evaluator, evaluation) => logger.debug(s"Random guessing baseline for evaluation metric '${evaluator.name}': $evaluation") @@ -489,21 +497,30 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P val lockedCoordinates = get(partialRetrainLockedCoordinates).getOrElse(Set()) val interceptIndices = getOrDefault(coordinateInterceptIndices) + val columnsNames = getOrDefault(inputColumnNames) + // Create the optimization coordinates for each component model val coordinates: Map[CoordinateId, C forSome { type C <: Coordinate }] = updateSequence .map { coordinateId => - val coordinate: C forSome { type C <: Coordinate } = if (lockedCoordinates.contains(coordinateId)) { - dataConfigs(coordinateId) match { - case _: FixedEffectDataConfiguration => new FixedEffectModelCoordinate(data, dataConfigs(coordinateId).featureShardId) - case _: RandomEffectDataConfiguration => new RandomEffectModelCoordinate(data) - case oConfig => throw new UnsupportedOperationException(s"Unsupported coordinate type: ${oConfig.getClass}") + + val dataConfiguration: CoordinateDataConfiguration = dataConfigs(coordinateId) + val coordinate: C forSome {type C <: Coordinate} = if (lockedCoordinates.contains(coordinateId)) { + dataConfiguration match { + case fedc: FixedEffectDataConfiguration => new FixedEffectModelCoordinate( + data, fedc.featureShardId, + columnsNames) + case redc: RandomEffectDataConfiguration => new RandomEffectModelCoordinate( + redc.randomEffectType, data, + redc.featureShardId, columnsNames) + case oConfig => throw new UnsupportedOperationException( + s"Unsupported coordinate type: ${oConfig.getClass}") } } else { CoordinateFactory.build( 
data, - dataConfigs(coordinateId).featureShardId, - getOrDefault(inputColumnNames), + dataConfiguration.featureShardId, + columnsNames, configuration(coordinateId), lossFunctionFactoryFactory, glmConstructor, @@ -511,7 +528,7 @@ class GameEstimator(val sc: SparkContext, implicit val logger: Logger) extends P normalizationContexts.getOrElse(coordinateId, NoNormalization()), variance, interceptIndices.get(coordinateId), - dataConfigs(coordinateId) match { + dataConfiguration match { case redc: RandomEffectDataConfiguration => Some(redc.randomEffectType) case _: FixedEffectDataConfiguration => None }) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluatorFactory.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluatorFactory.scala index 740efe79..b7d83c40 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluatorFactory.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluatorFactory.scala @@ -14,10 +14,10 @@ */ package com.linkedin.photon.ml.evaluation -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.Types.UniqueSampleId -import com.linkedin.photon.ml.data.GameDatum +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.evaluation.EvaluatorType._ /** @@ -29,13 +29,13 @@ object EvaluatorFactory { * Construct [[Evaluator]] objects. 
* * @param evaluatorType The [[EvaluatorType]] - * @param gameDataset A [[RDD]] of (unique ID, GAME data point) which may be necessary to construct [[MultiEvaluator]] + * @param gameDataset A [[DataFrame]] of (unique ID, GAME data point, scores) which may be necessary to construct [[MultiEvaluator]] * objects * @return A new [[Evaluator]] */ protected[ml] def buildEvaluator( evaluatorType: EvaluatorType, - gameDataset: RDD[(UniqueSampleId, GameDatum)]): Evaluator = + gameDataset: DataFrame): Evaluator = evaluatorType match { case AUC => AreaUnderROCCurveEvaluator @@ -52,12 +52,14 @@ object EvaluatorFactory { case SquaredLoss => SquaredLossEvaluator case MultiPrecisionAtK(k, idTag) => - val ids = gameDataset.mapValues(_.idTagToValueMap(idTag)) - new PrecisionAtKMultiEvaluator(k, idTag, ids) + val idsRDD = gameDataset.select(DataConst.ID, idTag) + .rdd.map(row => (row.getAs[UniqueSampleId](0), row.getString(1))) + new PrecisionAtKMultiEvaluator(k, idTag, idsRDD) case MultiAUC(idTag) => - val ids = gameDataset.mapValues(_.idTagToValueMap(idTag)) - new AreaUnderROCCurveMultiEvaluator(idTag, ids) + val idsRDD = gameDataset.select(DataConst.ID, idTag) + .rdd.map(row => (row.getAs[UniqueSampleId](0), row.getString(1))) + new AreaUnderROCCurveMultiEvaluator(idTag, idsRDD) case _ => throw new UnsupportedOperationException(s"Unsupported evaluator type: $evaluatorType") diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala index 5d6cc9a4..ce3b7c1f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/FixedEffectModel.scala @@ -20,9 +20,10 @@ import org.apache.spark.sql.{DataFrame, SparkSession} import com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.Types.FeatureShardId +import com.linkedin.photon.ml.constants.DataConst import 
com.linkedin.photon.ml.spark.BroadcastLike import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel -import com.linkedin.photon.ml.util.VectorUtils +import com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} /** * Representation of a fixed effect model. @@ -31,10 +32,10 @@ import com.linkedin.photon.ml.util.VectorUtils * @param featureShardId The feature shard id */ class FixedEffectModel( - val modelBroadcast: Broadcast[GeneralizedLinearModel], - val featureShardId: String) + val modelBroadcast: Broadcast[GeneralizedLinearModel], + val featureShardId: String) extends DatumScoringModel - with BroadcastLike { + with BroadcastLike { override val modelType: TaskType = modelBroadcast.value.modelType @@ -54,10 +55,28 @@ class FixedEffectModel( * @return The computed scores */ override def computeScore(dataPoints: DataFrame, scoreField: String): DataFrame = { + FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId, scoreField) } - /** + /** + * Accumulatively compute the scores for the GAME dataset. + * + * @note "score" = sum(features * coefficients) (Before link function in the case of logistic regression, for example) + * @param dataPoints The dataset to score + * @param scoreField The field name of the score + * @param accumulativeScoreField The field name of the accumulativeScore + * @return The computed scores + */ + override def computeScore( + dataPoints: DataFrame, + scoreField: String, + accumulativeScoreField: String): DataFrame = { + + FixedEffectModel.score(dataPoints, modelBroadcast, featureShardId, scoreField, DataConst.SCORE) + } + + /** * Build a summary string for the coefficients. * * @return String representation @@ -69,6 +88,7 @@ class FixedEffectModel( * Clean up coefficient broadcast. 
*/ override protected[ml] def unpersistBroadcast(): BroadcastLike = { + modelBroadcast.unpersist() this } @@ -80,6 +100,7 @@ class FixedEffectModel( * @return True if both models have the same feature shard ID and underlying models, false otherwise */ override def equals(that: Any): Boolean = { + that match { case other: FixedEffectModel => val sameMetaData = this.featureShardId == other.featureShardId @@ -95,12 +116,12 @@ class FixedEffectModel( * @return An [[Int]] hash code */ override def hashCode: Int = featureShardId.hashCode + model.hashCode - } object FixedEffectModel { def apply(glm: GeneralizedLinearModel, featureShardId: FeatureShardId): FixedEffectModel = { + new FixedEffectModel(SparkSession.builder.getOrCreate.sparkContext.broadcast(glm), featureShardId) } @@ -113,13 +134,41 @@ object FixedEffectModel { * @return The scores */ private def score( - dataset: DataFrame, - modelBroadcast: Broadcast[GeneralizedLinearModel], - featureShardId: FeatureShardId, - scoreField: String): DataFrame = { + dataset: DataFrame, + modelBroadcast: Broadcast[GeneralizedLinearModel], + featureShardId: FeatureShardId, + scoreField: String): DataFrame = { val cofs = VectorUtils.breezeToMl(modelBroadcast.value.coefficients.means) dataset .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) } + + /** + * Compute the scores for the dataset. 
+ * + * @param dataset The dataset to score + * @param modelBroadcast The model to use for scoring + * @param featureShardId The feature shard id + * @return The scores + */ + private def score( + dataset: DataFrame, + modelBroadcast: Broadcast[GeneralizedLinearModel], + featureShardId: FeatureShardId, + scoreField: String, + accumulativeScoreField: String): DataFrame = { + + val cofs = VectorUtils.breezeToMl(modelBroadcast.value.coefficients.means) + + if (ApiUtils.hasColumn(dataset, DataConst.SCORE)) { + dataset + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) + .withColumn(DataConst.SCORE, col(DataConst.SCORE) + col(scoreField)) + } else { + dataset + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(cofs), col(featureShardId))) + .withColumn(DataConst.SCORE, col(scoreField)) + } + } } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala index d17d4f6f..994ed15f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala @@ -17,7 +17,7 @@ package com.linkedin.photon.ml.model import org.apache.spark.ml.linalg.{Vector => SparkVector} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.col +import org.apache.spark.sql.functions.{col, lit} import com.linkedin.photon.ml.TaskType import com.linkedin.photon.ml.TaskType.TaskType @@ -26,7 +26,7 @@ import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.supervised.classification.{LogisticRegressionModel, SmoothedHingeLossLinearSVMModel} import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.supervised.regression.{LinearRegressionModel, PoissonRegressionModel} -import com.linkedin.photon.ml.util.VectorUtils +import 
com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} /** * Representation of a random effect model. @@ -77,6 +77,23 @@ class RandomEffectModel( scoreField) } + /** + * Accumulatively compute the scores for the GAME dataset. + * + * @note "score" = sum(features * coefficients) (Before link function in the case of logistic regression, for example) + * @param dataPoints The dataset to score + * @param scoreField The field name of the score + * @param accumulativeScoreField The field name of the accumulativeScore + * @return The computed scores + */ + override def computeScore( + dataPoints: DataFrame, + scoreField: String, + accumulativeScoreField: String): DataFrame = { + + RandomEffectModel.score(dataPoints, models, randomEffectType, featureShardId, scoreField, DataConst.SCORE) + } + // // Summarizable functions // @@ -211,6 +228,38 @@ object RandomEffectModel { .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(col(DataConst.COEFFICIENTS), col(featureShardId))) } + /** + * Compute the scores for the dataset. 
+ * + * @param dataset The dataset to score + * @param models The individual random effect models to use for scoring + * @param randomEffectType The random effect type + * @param featureShardId The feature shard id + * @param scoreField The field name of the coordinate + * @param accumulativeScoreField The field name of the accumulative score + * @return The scores + */ + private def score( + dataset: DataFrame, + models: DataFrame, + randomEffectType: REType, + featureShardId: FeatureShardId, + scoreField: String, + accumulativeScoreField: String): DataFrame = { + + if (ApiUtils.hasColumn(dataset, DataConst.SCORE)) { + dataset + .join(models, randomEffectType) + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(DataConst.COEFFICIENTS), col(featureShardId))) + .withColumn(DataConst.SCORE, col(DataConst.SCORE) + col(scoreField)) + } else { + dataset + .join(models, randomEffectType) + .withColumn(scoreField, GeneralizedLinearModel.scoreUdf(lit(DataConst.COEFFICIENTS), col(featureShardId))) + .withColumn(DataConst.SCORE, col(scoreField)) + } + } + def toDataFrame(input: RDD[(REType, GeneralizedLinearModel)]): DataFrame = { null } diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala index 0efc4775..e474bff1 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala @@ -18,17 +18,19 @@ import org.apache.commons.cli.MissingArgumentException import org.apache.spark.SparkContext import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, count, monotonically_increasing_id} import org.apache.spark.storage.StorageLevel import org.slf4j.Logger -import 
com.linkedin.photon.ml.Types.{FeatureShardId, REType, UniqueSampleId} -import com.linkedin.photon.ml.data.{GameDatum, InputColumnsNames} +import com.linkedin.photon.ml.Types.{REType, UniqueSampleId} +import com.linkedin.photon.ml.constants.DataConst +import com.linkedin.photon.ml.data.InputColumnsNames import com.linkedin.photon.ml.evaluation._ import com.linkedin.photon.ml.model.{FixedEffectModel, GameModel, RandomEffectModel} import com.linkedin.photon.ml.util._ + /** * Scores input data using a [[GameModel]]. Plays a similar role to the [[org.apache.spark.ml.Model]]. * @@ -146,7 +148,7 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends * @param data Input [[DataFrame]] of samples * @return Scored data samples */ - def transform(data: DataFrame): ModelDataScores = { + def transform(data: DataFrame): DataFrame = { validateParams() @@ -168,7 +170,7 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends .toSet val gameDataset = Timed("Preparing GAME dataset") { - prepareGameDataset(data, randomEffectTypes, featureShards) + data.withColumn(DataConst.ID, monotonically_increasing_id) } if (getOrDefault(logDataAndModelStats)) { @@ -176,55 +178,27 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends logger.debug(s"GAME model summary:\n${getRequiredParam(model).toSummaryString}") } - val scores = Timed("Computing scores") { - scoreGameDataset(gameDataset) + val storageLevel = if (getOrDefault(spillScoresToDisk)) { + StorageLevel.MEMORY_AND_DISK + } else { + StorageLevel.MEMORY_ONLY } - - gameDataset.unpersist() + val gameDataWithScores = Timed("Computing scores") { + getRequiredParam(model).score(gameDataset) + } + gameDataWithScores.persist(storageLevel) Timed("Evaluating scores") { get(validationEvaluators).foreach( _.foreach { evaluatorType => - val evaluationMetricValue = evaluateScores(evaluatorType, gameDataset, scores) + val evaluationMetricValue = 
evaluateScores(evaluatorType, gameDataWithScores) logger.info(s"Evaluation metric value on scores with $evaluatorType: $evaluationMetricValue") }) } - // TODO: Instead, we should merge the scores back into the DataFrame in a new column (at least optionally) - - scores + gameDataWithScores } - /** - * Builds a GAME dataset according to input data configuration. - * - * @param dataFrame A [[DataFrame]] of raw input data - * @param randomEffectTypes The set of unique identifier fields used by the random effects of the model - * @param featureShards The set of feature shards used by the model - * @return The prepared GAME dataset - */ - protected def prepareGameDataset( - dataFrame: DataFrame, - randomEffectTypes: Set[REType], - featureShards: Set[FeatureShardId]): RDD[(UniqueSampleId, GameDatum)] = { - - val parallelism = sc.getConf.get("spark.default.parallelism", s"${sc.getExecutorStorageStatus.length * 3}").toInt - val partitioner = new LongHashPartitioner(parallelism) - val idTagSet = randomEffectTypes ++ - get(validationEvaluators).map(MultiEvaluatorType.getMultiEvaluatorIdTags).getOrElse(Seq()) - val gameDataset = GameConverters - .getGameDatasetFromDataFrame( - dataFrame, - featureShards, - idTagSet, - isResponseRequired = false, - getOrDefault(inputColumnNames)) - .partitionBy(partitioner) - .setName("Game dataset with UIDs for scoring") - .persist(StorageLevel.DISK_ONLY) - - gameDataset - } /** * Log some simple summary statistics for the GAME dataset. 
@@ -232,7 +206,7 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends * @param gameDataset The GAME dataset * @param randomEffectTypes The set of unique identifier fields used by the random effects of the model */ - private def logGameDataset(gameDataset: RDD[(UniqueSampleId, GameDatum)], randomEffectTypes: Set[REType]): Unit = { + private def logGameDataset(gameDataset: DataFrame, randomEffectTypes: Set[REType]): Unit = { val numSamples = gameDataset.count() @@ -241,63 +215,45 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends randomEffectTypes.foreach { idTag => val numSamplesStats = gameDataset - .map { case (_, gameData) => - val idValue = gameData.idTagToValueMap(idTag) - (idValue, 1) - } - .reduceByKey(_ + _) - .values - .stats() + .groupBy(idTag).agg(count("*").alias("cnt")) + .describe("cnt") + .collect() + .map(t => t.getString(0) + "\t" + t.getDouble(1) + "\t" + t.getDouble(2)) + .mkString("\n") logger.debug(s"numSamples for $idTag: $numSamplesStats") } } - /** - * Load the GAME model and score the GAME dataset. - * - * @param gameDataset The GAME dataset - * @return The scores - */ - protected def scoreGameDataset(gameDataset: RDD[(UniqueSampleId, GameDatum)]): ModelDataScores = { - - val storageLevel = if (getOrDefault(spillScoresToDisk)) { - StorageLevel.MEMORY_AND_DISK - } else { - StorageLevel.MEMORY_ONLY - } - // Need to split these calls to keep correct return type - val scores = getRequiredParam(model).score(gameDataset) - scores.persistRDD(storageLevel).materialize() - - scores - } /** * Evaluate the computed scores with the given evaluator type. 
* * @param evaluatorType The evaluator type - * @param scores The computed scores - * @param gameDataset The GAME dataset + * @param gameDatasetWithscores The GAME dataset * @return The evaluation metric */ protected def evaluateScores( evaluatorType: EvaluatorType, - gameDataset: RDD[(UniqueSampleId, GameDatum)], - scores: ModelDataScores): Double = { + gameDatasetWithscores: DataFrame): Double = { - val evaluator = EvaluatorFactory.buildEvaluator(evaluatorType, gameDataset) + val evaluator = EvaluatorFactory.buildEvaluator(evaluatorType, gameDatasetWithscores) + val offset = inputColumnNames(InputColumnsNames.OFFSET) + val response = inputColumnNames(InputColumnsNames.RESPONSE) + val weight = inputColumnNames(InputColumnsNames.WEIGHT) evaluator match { case se: SingleEvaluator => - val scoresRDD = scores.scoresRdd.map { case (_, sGD) => - (sGD.score + sGD.offset, sGD.response, sGD.weight) - } + val scoresRDD = gameDatasetWithscores + .select(col(DataConst.SCORE) + col(offset), response, weight) + .rdd.map (row => (row.getDouble(0), row.getDouble(1), row.getDouble(2))) se.evaluate(scoresRDD) case me: MultiEvaluator => - val scoresRDD = scores.scoresRdd.mapValues(sGD => (sGD.score + sGD.offset, sGD.response, sGD.weight)) + val scoresRDD = gameDatasetWithscores + .select(col(DataConst.ID), col(DataConst.SCORE) + col(offset), response, weight) + .rdd.map (row => (row.getAs[UniqueSampleId](0), (row.getDouble(1), row.getDouble(2), row.getDouble(3)))) me.evaluate(scoresRDD) diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala index fc339256..16eefab8 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala @@ -21,16 +21,16 @@ import org.apache.spark.ml.param.{Param, ParamMap, 
Params} import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.{Constants, DataValidationType, SparkSessionConfiguration, TaskType} import com.linkedin.photon.ml.Types.FeatureShardId import com.linkedin.photon.ml.cli.game.GameDriver +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data.avro._ import com.linkedin.photon.ml.data.{DataValidators, InputColumnsNames} import com.linkedin.photon.ml.index.IndexMapLoader import com.linkedin.photon.ml.io.scopt.game.ScoptGameScoringParametersParser -import com.linkedin.photon.ml.model.RandomEffectModel import com.linkedin.photon.ml.transformers.GameTransformer import com.linkedin.photon.ml.util._ +import com.linkedin.photon.ml.{Constants, DataValidationType, SparkSessionConfiguration, TaskType} /** * Driver for GAME full model scoring. @@ -181,18 +181,20 @@ object GameScoringDriver extends GameDriver { transformer } - val scores = Timed("Score data") { + val gameDataWithScores = Timed("Score data") { gameTransformer.transform(dataFrame) } - gameModel.toMap.foreach { - case (_, model: RandomEffectModel) => model.unpersistRDD() - case _ => - } +// gameModel.toMap.foreach { +// case (_, model: RandomEffectModel) => model.unpersistRDD() +// case _ => +// } Timed("Save scores") { - saveScoresToHDFS(scores) + saveScoresToHDFS(gameDataWithScores) } + + gameDataWithScores.unpersist() } /** @@ -223,29 +225,27 @@ object GameScoringDriver extends GameDriver { /** * Save the computed scores to HDFS with auxiliary info. 
* - * @param scores The computed scores + * @param data The game dataset with computed scores */ - protected def saveScoresToHDFS(scores: ModelDataScores): Unit = { + protected def saveScoresToHDFS(data: DataFrame): Unit = { // Take the offset information into account when writing the scores to HDFS - val scoredItems = scores.scoresRdd.map { case (_, scoredGameDatum) => - ScoredItem( - scoredGameDatum.score + scoredGameDatum.offset, - Some(scoredGameDatum.response), - Some(scoredGameDatum.weight), - scoredGameDatum.idTagToValueMap) - } + val scoredItems = data.select(DataConst.SCORE + inputColumnNames(InputColumnsNames.OFFSET), + inputColumnNames(InputColumnsNames.RESPONSE), + inputColumnNames(InputColumnsNames.WEIGHT), + // scoredGameDatum.idTagToValueMap) + ) if (getOrDefault(logDataAndModelStats)) { // Persist scored items here since we introduce multiple passes - scoredItems.setName("Scored items").persist(StorageLevel.MEMORY_AND_DISK) + scoredItems.persist(StorageLevel.MEMORY_AND_DISK) val numScoredItems = scoredItems.count() logger.info(s"Number of scored items to be written to HDFS: $numScoredItems \n") } val scoredItemsToBeSaved = get(outputFilesLimit) match { - case Some(limit) if limit < scoredItems.partitions.length => scoredItems.coalesce(getOrDefault(outputFilesLimit)) + case Some(limit) => scoredItems.limit(getOrDefault(outputFilesLimit)) case _ => scoredItems } val scoresDir = new Path(getRequiredParam(rootOutputDirectory), SCORES_DIR) diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala index d0df6c01..c3eb4d8d 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala @@ -18,6 +18,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import 
org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame import com.linkedin.photon.avro.generated.ScoringResultAvro import com.linkedin.photon.ml.cli.game.scoring.ScoredItem @@ -67,7 +68,7 @@ object ScoreProcessingUtils { * @param outputDir The given output directory */ protected[ml] def saveScoredItemsToHDFS( - scoredItems: RDD[ScoredItem], + scoredItems: DataFrame, //[ScoredItem], outputDir: String, modelId: Option[String]): Unit = { diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala index aee16d54..42dfb647 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/CoordinateDescent.scala @@ -20,7 +20,8 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel import org.slf4j.Logger -import com.linkedin.photon.ml.Types.CoordinateId +import com.linkedin.photon.ml.Types.{CoordinateId, UniqueSampleId} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.evaluation.{EvaluationResults, EvaluationSuite, EvaluatorType} import com.linkedin.photon.ml.model.{DatumScoringModel, GameModel} import com.linkedin.photon.ml.optimization.OptimizationTracker @@ -202,28 +203,6 @@ object CoordinateDescent { coordinate.trainModel() }) - /*(initialModelOpt, residualsOpt) match { - case (Some(initialModel), Some(residuals)) => - Timed(s"Train new model with residuals using existing model as starting point") { - coordinate.trainModel(initialModel, residuals) - } - - case (Some(initialModel), None) => - Timed(s"Train new model using existing model as starting point") { - coordinate.trainModel(initialModel) - } - - case (None, Some(residuals)) => - Timed(s"Train new model with residuals") { - coordinate.trainModel(residuals) - } - - case (None, None) => - Timed(s"Train new model") { - 
coordinate.trainModel() - } - }*/ - logOptimizationSummary(logger, coordinateId, model, tracker) model @@ -331,7 +310,11 @@ object CoordinateDescent { } Timed(s"Compute evaluation metrics") { - val results = evaluationSuite.evaluate(validatingScores) //todo: to fix it + val scoresRdd = validatingScores.select(DataConst.ID, DataConst.SCORE) + .rdd + .map(row => (row.getAs[UniqueSampleId](0), row.getDouble(1))) + + val results = evaluationSuite.evaluate(scoresRdd) results .evaluations diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluationSuite.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluationSuite.scala index cdc87b53..847d5df2 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluationSuite.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/evaluation/EvaluationSuite.scala @@ -16,7 +16,6 @@ package com.linkedin.photon.ml.evaluation import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame import org.apache.spark.storage.StorageLevel import com.linkedin.photon.ml.Types.UniqueSampleId @@ -55,7 +54,7 @@ class EvaluationSuite( * @param scores The scores to evaluate * @return The evaluation metric values as [[EvaluationResults]] */ - protected[ml] def evaluate(scores: DataFrame /* RDD[(UniqueSampleId, Double)]*/): EvaluationResults = { + protected[ml] def evaluate(scores: RDD[(UniqueSampleId, Double)]): EvaluationResults = { // Possible for all models to be missing a score for some datum, meaning the score for a datum is missing even after // summing scores from all models. Thus, need a leftOuterJoin. 
diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala index 8c52e0d8..e30d5029 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/DatumScoringModel.scala @@ -34,8 +34,20 @@ trait DatumScoringModel extends Summarizable { * * @note "score" = features * coefficients (Before link function in the case of logistic regression, for example) * @param dataPoints The dataset to score + * @param scoreField The field name of the score * @return The computed scores */ def computeScore(dataPoints: DataFrame, scoreField: String): DataFrame + /** + * Accumulatively compute the scores for the GAME dataset. + * + * @note "score" = sum(features * coefficients) (Before link function in the case of logistic regression, for example) + * @param dataPoints The dataset to score + * @param scoreField The field name of the score + * @param accumulativeScoreField The field name of the accumulativeScore + * @return The computed scores + */ + def computeScore(dataPoints: DataFrame, scoreField: String, accumulativeScoreField: String): DataFrame + } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala index b7342633..d30c0692 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/model/GameModel.scala @@ -20,6 +20,7 @@ import org.apache.spark.sql.DataFrame import com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.Types.CoordinateId +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.util.ClassUtils /** @@ -27,12 +28,12 @@ import com.linkedin.photon.ml.util.ClassUtils * * @param gameModels A (modelName -> model) map containing the sub-models that 
make up the complete GAME model */ -class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) extends DatumScoringModel { +class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) { // The model type should be consistent at construction time. However, copies of this object shouldn't need to call the // check again. Thus the value is lazy, so that anonymous classes can overwrite it without triggering a call to // determineModelType, but it's called immediately so that it's evaluated at construction time. - override lazy val modelType: TaskType = GameModel.determineModelType(gameModels) + lazy val modelType: TaskType = GameModel.determineModelType(gameModels) modelType /** @@ -93,15 +94,23 @@ class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) e * @param dataPoints The dataset to score * @return The computed scores */ - override def score(dataPoints: DataFrame): DataFrame = - gameModels.values.map(_.score(dataPoints)).reduce(_ + _) + def score(dataPoints: DataFrame): DataFrame = { + + gameModels.foreach { case (coordinateId: CoordinateId, coordinateModel: DatumScoringModel) => + val scoreName = s"${coordinateId}_score" + coordinateModel.computeScore(dataPoints, scoreName, DataConst.SCORE) + } + + dataPoints + } + /** * Summarize this GAME model. 
* * @return A summary of the object in string representation */ - override def toSummaryString: String = { + def toSummaryString: String = { gameModels.map { case (name, model) => s"Model name: $name, summary:\n${model.toSummaryString}\n" }.mkString("\n") } @@ -131,7 +140,6 @@ class GameModel (private val gameModels: Map[CoordinateId, DatumScoringModel]) e * @return An [[Int]] hash code */ override def hashCode(): Int = super.hashCode() - } object GameModel { From 1875588022661b11f307a5847882b5d4cbada3d1 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Fri, 21 Feb 2020 11:10:19 -0800 Subject: [PATCH 09/11] fix problems in RandomEffectCoordinate --- .../ml/algorithm/RandomEffectCoordinate.scala | 132 ++++++++++++------ .../photon/ml/model/RandomEffectModel.scala | 32 +---- 2 files changed, 90 insertions(+), 74 deletions(-) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala index 8eb0e237..790e8015 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala @@ -17,11 +17,14 @@ package com.linkedin.photon.ml.algorithm import scala.collection.mutable import org.apache.spark.ml.linalg.{Vector => SparkVector} -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.{DataFrame, functions} +import org.apache.spark.sql.functions.{col, collect_list} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Types.{FeatureShardId, REType} +import com.linkedin.photon.ml.TaskType +import com.linkedin.photon.ml.TaskType.TaskType +import com.linkedin.photon.ml.Types.{FeatureShardId, REId, REType} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ import 
com.linkedin.photon.ml.function.SingleNodeObjectiveFunction import com.linkedin.photon.ml.model.{Coefficients, DatumScoringModel, RandomEffectModel} @@ -29,7 +32,9 @@ import com.linkedin.photon.ml.normalization.NormalizationContext import com.linkedin.photon.ml.optimization.VarianceComputationType.VarianceComputationType import com.linkedin.photon.ml.optimization._ import com.linkedin.photon.ml.optimization.game.RandomEffectOptimizationConfiguration +import com.linkedin.photon.ml.supervised.classification.{LogisticRegressionModel, SmoothedHingeLossLinearSVMModel} import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel +import com.linkedin.photon.ml.supervised.regression.{LinearRegressionModel, PoissonRegressionModel} import com.linkedin.photon.ml.util.{ApiUtils, PhotonNonBroadcast, VectorUtils} /** @@ -62,10 +67,10 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct .select(rEType, featureShardId, label, offset, weight) .groupBy(rEType) .agg( - functions.collect_list(featureShardId), - functions.collect_list(label), - functions.collect_list(offset), - functions.collect_list(weight)) + collect_list(featureShardId).alias("features"), + collect_list(label).alias("labels"), + collect_list(offset).alias("offsets"), + collect_list(weight).alias("weights")) } // @@ -107,6 +112,7 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct rEType, featureShardId, optimizationProblem, + inputColumnsNames, Some(randomEffectModel)) (newModel, optimizationTracker) @@ -131,6 +137,7 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct rEType, featureShardId, optimizationProblem, + inputColumnsNames, None) (newModel, optimizationTracker) @@ -183,7 +190,7 @@ object RandomEffectCoordinate { * Train a new [[RandomEffectModel]] (i.e. run model optimization for each entity). 
* * @tparam Function The type of objective function used to solve individual random effect optimization problems - * @param randomEffectDataset The training dataset + * @param trainingData The training dataset * @param randomEffectType * @param featureShardId * @param optimizationProblem The per-entity optimization problems @@ -192,49 +199,27 @@ object RandomEffectCoordinate { * @return A (new [[RandomEffectModel]], optional optimization stats) tuple */ protected[algorithm] def trainModel[Function <: SingleNodeObjectiveFunction]( - randomEffectDataset: DataFrame, + trainingData: DataFrame, randomEffectType: REType, featureShardId: FeatureShardId, optimizationProblem: SingleNodeOptimizationProblem[Function], + inputColumnsNames: InputColumnsNames, initialRandomEffectModelOpt: Option[RandomEffectModel]): (RandomEffectModel, RandomEffectOptimizationTracker) = { - val data = randomEffectDataset - .rdd - .map { row => - val reid = row.getInt(0).toString - val features = row.getList[SparkVector](1) - val labels = row.getList[Double](2) - val offsets = row.getList[Double](3) - val weights = row.getList[Double](4) - - val fIter = features.iterator() - val lIter = labels.iterator() - val oIter = offsets.iterator() - val wIter = weights.iterator() - - require(features.size == labels.size) - require(features.size == offsets.size) - require(features.size == weights.size) - - val result = new mutable.ArrayBuffer[LabeledPoint](features.size) - - (0 until features.size).map { _ => - result += LabeledPoint(lIter.next(), VectorUtils.mlToBreeze(fIter.next()), oIter.next(), wIter.next()) - } - - (reid, result.toArray) - } - - // Left join the models to data and optimization problems for cases where we have a prior model but no new data val (newModels, randomEffectOptimizationTracker) = initialRandomEffectModelOpt .map { randomEffectModel => - val modelsRdd = randomEffectModel.toRDD() - val modelsAndTrackers = modelsRdd - .leftOuterJoin(data) + + val modelsAndTrackers = 
randomEffectModel.models.join(trainingData, col(randomEffectType), "left_outer") + .rdd + .map { row => + val reid = row.getAs[REId](randomEffectType) + val labeledPoints: Option[Array[LabeledPoint]] = getLabeledPoints(row) + val model = getModel(row) + (reid, (model, labeledPoints)) + } .mapValues { case (localModel, Some((localDataset))) => - val trainingLabeledPoints = localDataset - val (updatedModel, stateTrackers) = optimizationProblem.run(trainingLabeledPoints, localModel) + val (updatedModel, stateTrackers) = optimizationProblem.run(localDataset, localModel) (updatedModel, Some(stateTrackers)) @@ -249,7 +234,14 @@ object RandomEffectCoordinate { (models, optimizationTracker) } .getOrElse { - val modelsAndTrackers = data.mapValues (optimizationProblem.run(_)) + val modelsAndTrackers = trainingData + .rdd + .map( + row => { + val reid = row.getAs[REId](randomEffectType) + (reid, getLabeledPoints(row).get) + }) + .mapValues(optimizationProblem.run(_)) modelsAndTrackers.persist(StorageLevel.MEMORY_AND_DISK_SER) val models = modelsAndTrackers.mapValues(_._1) @@ -265,10 +257,64 @@ object RandomEffectCoordinate { (newRandomEffectModel, randomEffectOptimizationTracker) } + /** + * Get the score field name + * @param rEType Random effect type + * @return A field name + */ def getScoreFieldName(rEType: REType): String = { return s"${rEType}_score" } + /** + * Create a generalized linear model from an input row + * @param row An input row + * @return A generalized linear model + */ + def getModel(row: Row): GeneralizedLinearModel = { + val coefficients = Coefficients(VectorUtils.mlToBreeze(row.getAs[SparkVector](DataConst.COEFFICIENTS))) + val modelType: TaskType = TaskType.withName(row.getAs[String](DataConst.MODEL_TYPE)) + val model = modelType match { + case TaskType.LINEAR_REGRESSION => + LinearRegressionModel(coefficients) + case TaskType.LOGISTIC_REGRESSION => + LogisticRegressionModel(coefficients) + case TaskType.POISSON_REGRESSION => + 
PoissonRegressionModel(coefficients) + case TaskType.SMOOTHED_HINGE_LOSS_LINEAR_SVM => + SmoothedHingeLossLinearSVMModel(coefficients) + } + model + } + + /** + * Create an optional array of labeled points + * @param row An input row + * @return An optional array of labeled points + */ + def getLabeledPoints(row: Row): Option[Array[LabeledPoint]] = { + + val features = row.getAs[List[SparkVector]]("features") + val labels = row.getAs[List[Double]]("labels") + val offsets = row.getAs[List[Double]]("offsets") + val weights = row.getAs[List[Double]]("weights") + + if (features != null) { + require(features.size == labels.size) + require(features.size == offsets.size) + require(features.size == weights.size) + + val result = new mutable.ArrayBuffer[LabeledPoint](features.size) + + for (i <- features.indices) + result += LabeledPoint(labels(i), VectorUtils.mlToBreeze(features(i)), offsets(i), weights(i)) + + Option.apply(result.toArray) + } else { + None + } + } + /** * Score a dataset using a given [[RandomEffectModel]]. 
* diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala index 994ed15f..e167a12f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/model/RandomEffectModel.scala @@ -14,7 +14,6 @@ */ package com.linkedin.photon.ml.model -import org.apache.spark.ml.linalg.{Vector => SparkVector} import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, lit} @@ -23,10 +22,8 @@ import com.linkedin.photon.ml.TaskType import com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.Types.{FeatureShardId, REType} import com.linkedin.photon.ml.constants.DataConst -import com.linkedin.photon.ml.supervised.classification.{LogisticRegressionModel, SmoothedHingeLossLinearSVMModel} import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel -import com.linkedin.photon.ml.supervised.regression.{LinearRegressionModel, PoissonRegressionModel} -import com.linkedin.photon.ml.util.{ApiUtils, VectorUtils} +import com.linkedin.photon.ml.util.ApiUtils /** * Representation of a random effect model. 
@@ -145,33 +142,6 @@ class RandomEffectModel( false } - /** - * Convert models from dataframe to RDD - * @return - */ - def toRDD(): RDD[(REType, GeneralizedLinearModel)] = { - models - .select(randomEffectType, DataConst.MODEL_TYPE, DataConst.COEFFICIENTS) - .rdd - .map { row => - val reid = row.getInt(0).toString - val modelType: TaskType = TaskType.withName(row.getString(1)) - val coefficients = Coefficients(VectorUtils.mlToBreeze(row.getAs[SparkVector](2))) - - val model = modelType match { - case TaskType.LINEAR_REGRESSION => - LinearRegressionModel(coefficients) - case TaskType.LOGISTIC_REGRESSION => - LogisticRegressionModel(coefficients) - case TaskType.POISSON_REGRESSION => - PoissonRegressionModel(coefficients) - case TaskType.SMOOTHED_HINGE_LOSS_LINEAR_SVM => - SmoothedHingeLossLinearSVMModel(coefficients) - } - (reid, model) - } - } - /** * Returns a hash code value for the object. * From 1f36c783b0f9688bc2d2cc0000217dcde73596c6 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Fri, 21 Feb 2020 14:58:28 -0800 Subject: [PATCH 10/11] fix compilation errors: model loading/storing and score storing and down sampler --- .../ml/algorithm/FixedEffectCoordinate.scala | 18 +++--- .../FixedEffectModelCoordinate.scala | 2 +- .../ml/algorithm/RandomEffectCoordinate.scala | 2 +- .../RandomEffectModelCoordinate.scala | 2 +- .../DistributedOptimizationProblem.scala | 12 ++-- .../model/GeneralizedLinearModel.scala | 14 ++++- .../ml/transformers/GameTransformer.scala | 11 ++-- .../cli/game/scoring/GameScoringDriver.scala | 21 ++++--- .../photon/ml/data/avro/AvroUtils.scala | 37 ++++++++++- .../ml/data/avro/ModelProcessingUtils.scala | 63 ++++++++++++------- .../ml/data/avro/ScoreProcessingUtils.scala | 38 ++++++----- .../com/linkedin/photon/ml/util/Utils.scala | 10 ++- .../photon/ml/algorithm/Coordinate.scala | 2 +- .../photon/ml/constants/DataConst.scala | 2 + .../photon/ml/sampling/DownSampler.scala | 6 +- 15 files changed, 165 insertions(+), 75 deletions(-) diff --git 
a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala index 7c505279..7b2f157c 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectCoordinate.scala @@ -19,7 +19,8 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Types.FeatureShardId +import com.linkedin.photon.ml.Types.{FeatureShardId, UniqueSampleId} +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data._ import com.linkedin.photon.ml.function.DistributedObjectiveFunction import com.linkedin.photon.ml.model.{DatumScoringModel, FixedEffectModel} @@ -41,7 +42,7 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct inputColumnsNames: InputColumnsNames) extends Coordinate { - override protected def updateOffset(model: DatumScoringModel) = { + override protected[algorithm] def updateOffset(model: DatumScoringModel) = { model match { case fixedEffectModel: FixedEffectModel => dataset = FixedEffectCoordinate.updateOffset(dataset, fixedEffectModel, featureShardId, inputColumnsNames) @@ -64,6 +65,7 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct dataset, optimizationProblem, featureShardId, + inputColumnsNames, Some(fixedEffectModel)) case _ => @@ -77,8 +79,8 @@ protected[ml] class FixedEffectCoordinate[Objective <: DistributedObjectiveFunct * * @return A (updated model, optimization state tracking information) tuple */ - override protected def trainModel(): (DatumScoringModel, OptimizationTracker) = - FixedEffectCoordinate.trainModel(dataset, optimizationProblem, featureShardId, None) + override protected[algorithm] def trainModel(): (DatumScoringModel, OptimizationTracker) 
= + FixedEffectCoordinate.trainModel(dataset, optimizationProblem, featureShardId, inputColumnsNames, None) } @@ -100,15 +102,17 @@ object FixedEffectCoordinate { dataset: DataFrame, optimizationProblem: DistributedOptimizationProblem[Function], featureShardId: FeatureShardId, + inputColumnsNames: InputColumnsNames, initialFixedEffectModelOpt: Option[FixedEffectModel]): (FixedEffectModel, OptimizationTracker) = { val rdd = dataset .rdd .map { row => - val features = row.getAs[SparkVector](0) - val label = row.getDouble(1) + val uid = row.getAs[UniqueSampleId](DataConst.ID) + val features = row.getAs[SparkVector](featureShardId) + val label = row.getAs[Double](inputColumnsNames(InputColumnsNames.RESPONSE)) - LabeledPoint(label, VectorUtils.mlToBreeze(features)) + (uid, LabeledPoint(label, VectorUtils.mlToBreeze(features))) } rdd.persist(StorageLevel.MEMORY_ONLY) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala index 21eb794a..12f57417 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/FixedEffectModelCoordinate.scala @@ -37,7 +37,7 @@ class FixedEffectModelCoordinate( * @param model The input model * @return The output scores */ - override protected def updateOffset(model: DatumScoringModel) = { + override protected[algorithm] def updateOffset(model: DatumScoringModel) = { model match { case fixedEffectModel: FixedEffectModel => diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala index 790e8015..b69cbbd5 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala +++ 
b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectCoordinate.scala @@ -76,7 +76,7 @@ protected[ml] class RandomEffectCoordinate[Objective <: SingleNodeObjectiveFunct // // Coordinate functions // - override protected def updateOffset(model: DatumScoringModel) = { + override protected[algorithm] def updateOffset(model: DatumScoringModel) = { model match { case randomEffectModel: RandomEffectModel => diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala index 7f915fb2..a6ec1b29 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/algorithm/RandomEffectModelCoordinate.scala @@ -38,7 +38,7 @@ class RandomEffectModelCoordinate( * @param model The input model * @return The output scores */ - override protected def updateOffset(model: DatumScoringModel) = { + override protected[algorithm] def updateOffset(model: DatumScoringModel) = { model match { case randomEffectModel: RandomEffectModel => diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala index b1a5a788..d56aa475 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/optimization/DistributedOptimizationProblem.scala @@ -17,7 +17,7 @@ package com.linkedin.photon.ml.optimization import breeze.linalg.{Vector, cholesky, diag} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel - +import com.linkedin.photon.ml.Types.UniqueSampleId import com.linkedin.photon.ml.constants.MathConst import com.linkedin.photon.ml.data.LabeledPoint import 
com.linkedin.photon.ml.function.{DistributedObjectiveFunction, L2Regularization, TwiceDiffFunction} @@ -139,8 +139,8 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @param input The training data * @return The learned [[GeneralizedLinearModel]] */ - def runWithSampling(input: RDD[LabeledPoint]): (GeneralizedLinearModel, OptimizationStatesTracker) = - runWithSampling(input, initializeZeroModel(input.first.features.size)) + def runWithSampling(input: RDD[(UniqueSampleId, LabeledPoint)]): (GeneralizedLinearModel, OptimizationStatesTracker) = + runWithSampling(input, initializeZeroModel(input.values.first.features.size)) /** * Run the algorithm with the configured parameters, starting from the initial model provided, and down-sample the @@ -151,12 +151,12 @@ protected[ml] class DistributedOptimizationProblem[Objective <: DistributedObjec * @return The learned [[GeneralizedLinearModel]] */ def runWithSampling( - input: RDD[LabeledPoint], + input: RDD[(UniqueSampleId, LabeledPoint)], initialModel: GeneralizedLinearModel): (GeneralizedLinearModel, OptimizationStatesTracker) = { val data = (samplerOption match { - case Some(sampler) => sampler.downSample(input) - case None => input + case Some(sampler) => sampler.downSample(input).values + case None => input.values }) .setName("In memory fixed effect training dataset") .persist(StorageLevel.MEMORY_AND_DISK) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala index 57822bc4..ac711c30 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/supervised/model/GeneralizedLinearModel.scala @@ -16,11 +16,12 @@ package com.linkedin.photon.ml.supervised.model import breeze.linalg.Vector import org.apache.spark.ml.linalg.{DenseVector, 
SparseVector, Vector => SparkVector} - import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.udf - +import org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import com.linkedin.photon.ml.TaskType.TaskType +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.util.Summarizable @@ -120,6 +121,15 @@ abstract class GeneralizedLinearModel(val coefficients: Coefficients) extends Se } object GeneralizedLinearModel { + + // Schema for [[DataFrame]] + def schema: StructType = StructType(Array( + StructField(DataConst.MODEL_ID, StringType, false), + StructField(DataConst.MODEL_TYPE, StringType, false), + StructField(DataConst.COEFFICIENTS, VectorType , false), + StructField(DataConst.VARIANCES, VectorType, true) + )) + /** * Compute the value of the mean functions of the generalized linear model given a RDD of data points using the * estimated coefficients and intercept. 
diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala index e474bff1..a8fb9318 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/transformers/GameTransformer.scala @@ -239,20 +239,21 @@ class GameTransformer(val sc: SparkContext, implicit val logger: Logger) extends val evaluator = EvaluatorFactory.buildEvaluator(evaluatorType, gameDatasetWithscores) - val offset = inputColumnNames(InputColumnsNames.OFFSET) - val response = inputColumnNames(InputColumnsNames.RESPONSE) - val weight = inputColumnNames(InputColumnsNames.WEIGHT) + val columnsNames = getOrDefault(inputColumnNames) + val offset = columnsNames(InputColumnsNames.OFFSET) + val response = columnsNames(InputColumnsNames.RESPONSE) + val weight = columnsNames(InputColumnsNames.WEIGHT) evaluator match { case se: SingleEvaluator => val scoresRDD = gameDatasetWithscores - .select(col(DataConst.SCORE) + col(offset), response, weight) + .select(col(DataConst.SCORE) + col(offset), col(response), col(weight)) .rdd.map (row => (row.getDouble(0), row.getDouble(1), row.getDouble(2))) se.evaluate(scoresRDD) case me: MultiEvaluator => val scoresRDD = gameDatasetWithscores - .select(col(DataConst.ID), col(DataConst.SCORE) + col(offset), response, weight) + .select(col(DataConst.ID), col(DataConst.SCORE) + col(offset), col(response), col(weight)) .rdd.map (row => (row.getAs[UniqueSampleId](0), (row.getDouble(1), row.getDouble(2), row.getDouble(3)))) me.evaluate(scoresRDD) diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala index 16eefab8..040ac2c8 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala 
+++ b/photon-client/src/main/scala/com/linkedin/photon/ml/cli/game/scoring/GameScoringDriver.scala @@ -19,15 +19,17 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.ml.param.{Param, ParamMap, Params} import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.col import org.apache.spark.storage.StorageLevel -import com.linkedin.photon.ml.Types.FeatureShardId +import com.linkedin.photon.ml.Types.{CoordinateId, FeatureShardId, REType} import com.linkedin.photon.ml.cli.game.GameDriver import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.data.avro._ import com.linkedin.photon.ml.data.{DataValidators, InputColumnsNames} import com.linkedin.photon.ml.index.IndexMapLoader import com.linkedin.photon.ml.io.scopt.game.ScoptGameScoringParametersParser +import com.linkedin.photon.ml.model.RandomEffectModel import com.linkedin.photon.ml.transformers.GameTransformer import com.linkedin.photon.ml.util._ import com.linkedin.photon.ml.{Constants, DataValidationType, SparkSessionConfiguration, TaskType} @@ -191,7 +193,11 @@ object GameScoringDriver extends GameDriver { // } Timed("Save scores") { - saveScoresToHDFS(gameDataWithScores) + val reTypes = gameModel.toMap.values.collect { + case rem: RandomEffectModel => rem.randomEffectType + } + + saveScoresToHDFS(gameDataWithScores, reTypes) } gameDataWithScores.unpersist() @@ -227,14 +233,11 @@ object GameScoringDriver extends GameDriver { * * @param data The game dataset with computed scores */ - protected def saveScoresToHDFS(data: DataFrame): Unit = { + protected def saveScoresToHDFS(data: DataFrame, reTypes: Iterable[REType]): Unit = { // Take the offset information into account when writing the scores to HDFS - val scoredItems = data.select(DataConst.SCORE + inputColumnNames(InputColumnsNames.OFFSET), - inputColumnNames(InputColumnsNames.RESPONSE), - inputColumnNames(InputColumnsNames.WEIGHT), - // scoredGameDatum.idTagToValueMap) - ) + 
val columnsNames = getOrDefault(inputColumnNames) + val scoredItems = data.withColumn(DataConst.SCORE, col(DataConst.SCORE) + col(columnsNames(InputColumnsNames.OFFSET))) if (getOrDefault(logDataAndModelStats)) { // Persist scored items here since we introduce multiple passes @@ -250,7 +253,7 @@ object GameScoringDriver extends GameDriver { } val scoresDir = new Path(getRequiredParam(rootOutputDirectory), SCORES_DIR) - ScoreProcessingUtils.saveScoredItemsToHDFS(scoredItemsToBeSaved, scoresDir.toString, get(modelId)) + ScoreProcessingUtils.saveScoredItemsToHDFS(scoredItemsToBeSaved, reTypes, scoresDir.toString, get(modelId)) scoredItems.unpersist() } diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/AvroUtils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/AvroUtils.scala index a900ae23..897da1f4 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/AvroUtils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/AvroUtils.scala @@ -40,6 +40,7 @@ import com.linkedin.photon.ml.index.{DefaultIndexMap, DefaultIndexMapLoader, Ind import com.linkedin.photon.ml.model.Coefficients import com.linkedin.photon.ml.supervised.model.GeneralizedLinearModel import com.linkedin.photon.ml.util._ +import com.linkedin.photon.ml.TaskType.TaskType /** * Some basic functions to read/write Avro's [[GenericRecord]] from/to HDFS. 
@@ -325,7 +326,7 @@ object AvroUtils { * @param featureMap The map from feature index of type [[Int]] to feature name of type [[NameAndTerm]] * @param sparsityThreshold The model sparsity threshold, or the minimum absolute value considered nonzero * @return The Avro record that contains the information of the input coefficients - */ + protected[avro] def convertGLMModelToBayesianLinearModelAvro( model: GeneralizedLinearModel, modelId: String, @@ -349,6 +350,40 @@ object AvroUtils { avroFile.setVariances(variancesAvrosOption.get.toList) } + avroFile.build() + }*/ + + /** + * Convert the coefficients of type [[Coefficients]] to Avro record of type [[BayesianLinearModelAvro]]. + * + * @param modelId The model's id + * @param featureMap The map from feature index of type [[Int]] to feature name of type [[NameAndTerm]] + * @param sparsityThreshold The model sparsity threshold, or the minimum absolute value considered nonzero + * @return The Avro record that contains the information of the input coefficients + */ + protected[avro] def convertGLMModelToBayesianLinearModelAvro( + modelClassName: String, + modelCoefficients: Vector[Double], + variancesOption: Option[Vector[Double]], + modelId: String, + featureMap: IndexMap, + sparsityThreshold: Double = VectorUtils.DEFAULT_SPARSITY_THRESHOLD): BayesianLinearModelAvro = { + + val meansAvros = convertVectorAsArrayOfNameTermValueAvros(modelCoefficients, featureMap, sparsityThreshold) + val variancesAvrosOption = variancesOption + .map(convertVectorAsArrayOfNameTermValueAvros(_, featureMap, sparsityThreshold)) + // TODO: Output type of model. 
+ val avroFile = BayesianLinearModelAvro + .newBuilder() + .setModelId(modelId) + .setModelClass(modelClassName) + .setLossFunction("") + .setMeans(meansAvros.toList) + + if (variancesAvrosOption.isDefined) { + avroFile.setVariances(variancesAvrosOption.get.toList) + } + avroFile.build() } diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ModelProcessingUtils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ModelProcessingUtils.scala index 7c03c199..1699a333 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ModelProcessingUtils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ModelProcessingUtils.scala @@ -23,14 +23,16 @@ import scala.io.Source import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext +import org.apache.spark.ml.linalg.{Vector => SparkMLVector} import org.apache.spark.ml.param.ParamMap -import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.storage.StorageLevel import com.linkedin.photon.avro.generated.{BayesianLinearModelAvro, FeatureSummarizationResultAvro} import com.linkedin.photon.ml.TaskType.TaskType import com.linkedin.photon.ml.Types.{CoordinateId, FeatureShardId} import com.linkedin.photon.ml.cli.game.training.GameTrainingDriver +import com.linkedin.photon.ml.constants.DataConst import com.linkedin.photon.ml.estimators.GameEstimator import com.linkedin.photon.ml.index.{IndexMap, IndexMapLoader} import com.linkedin.photon.ml.model._ @@ -227,9 +229,9 @@ object ModelProcessingUtils { s"Missing feature shard definition for '$featureShardId' required by coordinate '$name' in loaded model") } val modelsRDDInputPath = new Path(innerPath, AvroConstants.COEFFICIENTS) - val modelsRDD = loadModelsRDDFromHDFS(modelsRDDInputPath.toString, indexMapLoader, sc) + val models = loadModelsDataFrameFromHDFS(modelsRDDInputPath.toString, 
indexMapLoader, sc) - (name, new RandomEffectModel(modelsRDD, randomEffectType, featureShardId).persistRDD(storageLevel)) + (name, new RandomEffectModel(models, randomEffectType, featureShardId)/*.persist(storageLevel)*/) } } else { @@ -276,18 +278,18 @@ object ModelProcessingUtils { //Write the coefficientsRDD val coefficientsRDDOutputDir = new Path(randomEffectModelOutputDir, AvroConstants.COEFFICIENTS).toString - val modelsRDD = randomEffectModelFileLimit match { + val models = randomEffectModelFileLimit match { case Some(fileLimit) => require(fileLimit > 0, "Attempt to coalesce random effect model RDD into fewer than 1 partitions") // Control the number of output files by re-partitioning the RDD. - randomEffectModel.modelsRDD.coalesce(fileLimit) + randomEffectModel.models.coalesce(fileLimit) case None => - randomEffectModel.modelsRDD + randomEffectModel.models } - saveModelsRDDToHDFS(modelsRDD, indexMapLoader, coefficientsRDDOutputDir, sparsityThreshold) + saveModelsDataFrameToHDFS(models, indexMapLoader, coefficientsRDDOutputDir, sparsityThreshold) } /** @@ -307,7 +309,9 @@ object ModelProcessingUtils { sparsityThreshold: Double): Unit = { val bayesianLinearModelAvro = AvroUtils.convertGLMModelToBayesianLinearModelAvro( - model, + model.getClass.getName, + model.coefficients.means, + model.coefficients.variancesOption, AvroConstants.FIXED_EFFECT, featureMap, sparsityThreshold) @@ -344,23 +348,39 @@ object ModelProcessingUtils { } /** - * Save an [[RDD]] of GLM to HDFS. + * Save an [[DataFrame]] of GLM to HDFS. 
* - * @param modelsRDD The models to save + * @param models The models to save * @param featureMapLoader A loader for the feature to index map * @param outputDir The directory to which to save the models * @param sparsityThreshold The model sparsity threshold, or the minimum absolute value considered nonzero */ - private def saveModelsRDDToHDFS( - modelsRDD: RDD[(String, GeneralizedLinearModel)], + private def saveModelsDataFrameToHDFS( + models: DataFrame, /*(RDD[(String, GeneralizedLinearModel)],*/ featureMapLoader: IndexMapLoader, outputDir: String, sparsityThreshold: Double): Unit = { + val modelsRDD = models + .rdd + .map(row => { + val id = row.getAs[String](DataConst.MODEL_ID) + val modelType = row.getAs[String](DataConst.MODEL_TYPE) + val coefficients = VectorUtils.mlToBreeze(row.getAs[SparkMLVector](DataConst.COEFFICIENTS)) + val variances = row.getAs[SparkMLVector](DataConst.VARIANCES) + val variancesOption = if (variances != null) { + Option.apply(VectorUtils.mlToBreeze(variances)) + } else { + None + } + (id, modelType, coefficients, variancesOption) + } + ) + val linearModelAvro = modelsRDD.mapPartitions { iter => val featureMap = featureMapLoader.indexMapForRDD() - iter.map { case (modelId, model) => - AvroUtils.convertGLMModelToBayesianLinearModelAvro(model, modelId, featureMap, sparsityThreshold) + iter.map { case (modelId, modelType, coefficients, variancesOption) => + AvroUtils.convertGLMModelToBayesianLinearModelAvro(modelType, coefficients, variancesOption, modelId, featureMap, sparsityThreshold) } } @@ -368,35 +388,36 @@ object ModelProcessingUtils { } /** - * Load multiple GLM into a [[RDD]]. + * Load multiple GLM into a [[DataFrame]]. 
* * TODO: Currently only the means of the coefficients are loaded, the variances are discarded * * @param coefficientsRDDInputDir The input directory from which to read models * @param indexMapLoader A loader for the feature to index map * @param sc The Spark context - * @return A [[RDD]] of GLMs loaded from HDFS and a loader for the feature to index map it uses + * @return A [[DataFrame]] of GLMs loaded from HDFS and a loader for the feature to index map it uses */ - private def loadModelsRDDFromHDFS( + private def loadModelsDataFrameFromHDFS( coefficientsRDDInputDir: String, indexMapLoader: IndexMapLoader, - sc: SparkContext): RDD[(String, GeneralizedLinearModel)] = { + sc: SparkContext): DataFrame = { val modelAvros = AvroUtils.readAvroFilesInDir[BayesianLinearModelAvro]( sc, coefficientsRDDInputDir, minNumPartitions = sc.defaultParallelism) - modelAvros.mapPartitions { iter => + val rdd = modelAvros.mapPartitions { iter => val indexMap = indexMapLoader.indexMapForRDD() iter.map { modelAvro => val modelId = modelAvro.getModelId.toString val glm = AvroUtils.convertBayesianLinearModelAvroToGLM(modelAvro, indexMap) - - (modelId, glm) + Row.fromTuple(modelId, glm.modelType, glm.coefficients.means, glm.coefficients.variancesOption.getOrElse(null)) } } + + SparkSession.builder().getOrCreate().createDataFrame(rdd, GeneralizedLinearModel.schema) } /** diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala index c3eb4d8d..df001cbd 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/data/avro/ScoreProcessingUtils.scala @@ -21,6 +21,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import com.linkedin.photon.avro.generated.ScoringResultAvro +import com.linkedin.photon.ml.Types.REType import 
com.linkedin.photon.ml.cli.game.scoring.ScoredItem /** @@ -63,27 +64,34 @@ object ScoreProcessingUtils { /** * Save the scored items of type [[ScoredItem]] to the given output directory on HDFS. * - * @param scoredItems An [[RDD]] of scored items of type [[ScoredItem]] + * @param scoredItems An [[DataFrame]] of scored items [score, label, weight] * @param modelId The model's id that used to compute the scores * @param outputDir The given output directory */ protected[ml] def saveScoredItemsToHDFS( - scoredItems: DataFrame, //[ScoredItem], - outputDir: String, - modelId: Option[String]): Unit = { + scoredItems: DataFrame, + reTypes: Iterable[REType], + outputDir: String, + modelId: Option[String]): Unit = { - val scoringResultAvros = scoredItems.map { case ScoredItem(predictionScore, labelOpt, weightOpt, ids) => - val metaDataMap = collection.mutable.Map(ids.toMap[CharSequence, CharSequence].toSeq: _*).asJava - val builder = ScoringResultAvro.newBuilder() - builder.setPredictionScore(predictionScore) - builder.setModelId(modelId.getOrElse(DEFAULT_MODEL_ID)) - ids.get(ResponsePredictionFieldNames.UID).foreach(builder.setUid(_)) - labelOpt.foreach(builder.setLabel(_)) - weightOpt.foreach(builder.setWeight(_)) - builder.setMetadataMap(metaDataMap) - builder.build() - } + val scoringResultAvros = scoredItems + .rdd + .map { row => + val predictionScore = row.getDouble(0) + val label = row.getDouble(1) // Nullable + val weight = row.getDouble(2) // Nullable + val ids = reTypes.map(reType => (reType, row.getAs[String](reType))).toMap + val metaDataMap = collection.mutable.Map(ids.toMap[CharSequence, CharSequence].toSeq: _*).asJava + val builder = ScoringResultAvro.newBuilder() + builder.setPredictionScore(predictionScore) + builder.setModelId(modelId.getOrElse(DEFAULT_MODEL_ID)) + ids.get(ResponsePredictionFieldNames.UID).foreach(builder.setUid(_)) + Option.apply(label).foreach(builder.setLabel(_)) + Option.apply(weight).foreach(builder.setWeight(_)) + 
builder.setMetadataMap(metaDataMap) + builder.build() + } AvroUtils.saveAsAvro(scoringResultAvros, outputDir, ScoringResultAvro.getClassSchema.toString) } } diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala index cd0486c7..89d269f7 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala @@ -14,14 +14,20 @@ */ package com.linkedin.photon.ml.util +import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Number => JNumber, Object => JObject, String => JString} + +import scala.collection.JavaConverters._ + import org.apache.avro.generic.GenericRecord import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import com.linkedin.photon.ml.Constants +import com.linkedin.photon.ml.Constants import com.linkedin.photon.ml.evaluation.EvaluatorType._ -import com.linkedin.photon.ml.evaluation.{EvaluatorType, MultiAUC, MultiPrecisionAtK} +import com.linkedin.photon.ml.evaluation.{MultiAUC, MultiPrecisionAtK, EvaluatorType} + +// TODO: Better documentation. 
/** * Some useful functions diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala index de1acae6..37e42de6 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/algorithm/Coordinate.scala @@ -45,6 +45,6 @@ protected[ml] abstract class Coordinate { * @param model The model of previous coordinate * @return A new dataset with the updated offsets */ - def updateOffset(model: DatumScoringModel) + protected[algorithm] def updateOffset(model: DatumScoringModel) } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala index 40b4a8c4..826c7363 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/constants/DataConst.scala @@ -18,6 +18,8 @@ object DataConst { val ID = "uniqueId" val SCORE = "score" + val MODEL_ID = "mId" val MODEL_TYPE = "modelType" val COEFFICIENTS = "coefficients" + val VARIANCES = "variances" } diff --git a/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala b/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala index 413c76fc..ace58073 100644 --- a/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala +++ b/photon-lib/src/main/scala/com/linkedin/photon/ml/sampling/DownSampler.scala @@ -15,7 +15,7 @@ package com.linkedin.photon.ml.sampling import java.util.Random - +import com.linkedin.photon.ml.Types.UniqueSampleId import org.apache.spark.rdd.RDD import com.linkedin.photon.ml.constants.MathConst @@ -42,8 +42,8 @@ protected[ml] trait DownSampler { * @return The down-sampled dataset */ def downSample( - labeledPoints: RDD[LabeledPoint], - seed: Long = getSeed): RDD[LabeledPoint] + 
labeledPoints: RDD[(UniqueSampleId, LabeledPoint)], + seed: Long = getSeed): RDD[(UniqueSampleId, LabeledPoint)] } object DownSampler { From 77a8dc4e29642b1145bc11141d7f1fb056eb2d4a Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Wed, 26 Feb 2020 14:07:49 -0800 Subject: [PATCH 11/11] Address comments --- .../linkedin/photon/ml/util/ApiUtils.scala | 9 ------- .../com/linkedin/photon/ml/Constants.scala | 1 + .../com/linkedin/photon/ml/util/Utils.scala | 26 ++++++++++++++----- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala b/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala index 6c396adc..cb14e65f 100644 --- a/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala +++ b/photon-api/src/main/scala/com/linkedin/photon/ml/util/ApiUtils.scala @@ -20,14 +20,5 @@ import org.apache.spark.sql.DataFrame object ApiUtils { - def randomString(length: Int): String = { - val r = new scala.util.Random - val sb = new StringBuilder - for (i <- 1 to length) { - sb.append(r.nextPrintableChar) - } - sb.toString - } - def hasColumn(df: DataFrame, path: String): Boolean = Try(df(path)).isSuccess } diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala index d8e504a0..c8bbf120 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala +++ b/photon-client/src/main/scala/com/linkedin/photon/ml/Constants.scala @@ -15,6 +15,7 @@ package com.linkedin.photon.ml import org.joda.time.DateTimeZone + import com.linkedin.photon.ml.util.Utils /** diff --git a/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala b/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala index 89d269f7..19b9b9d5 100644 --- a/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala +++ 
b/photon-client/src/main/scala/com/linkedin/photon/ml/util/Utils.scala @@ -42,6 +42,7 @@ object Utils { * @return The feature name */ def getFeatureKey(record: GenericRecord, nameKey: String, termKey: String, delimiter: String): String = { + val name = getStringAvro(record, nameKey) val term = getStringAvro(record, termKey, isNullOK = true) getFeatureKey(name, term, delimiter) @@ -66,6 +67,7 @@ object Utils { * @return The feature name */ def getFeatureNameFromKey(key: String, delimiter: String = Constants.DELIMITER): String = { + require(delimiter.r.findAllIn(key).length == 1, s"Provided input [$key] is not a valid feature key") key.split(delimiter).headOption.getOrElse("") } @@ -78,6 +80,7 @@ object Utils { * @return The feature term */ def getFeatureTermFromKey(key: String, delimiter: String = Constants.DELIMITER): String = { + require(delimiter.r.findAllIn(key).length == 1, s"Provided input [$key] is not a valid feature key") key.split(delimiter).lift(1).getOrElse("") } @@ -88,10 +91,11 @@ object Utils { * @param record The generic record * @param key The key of the field * @param isNullOK Whether null is accepted. If set to true, then an empty string will be returned if the - * corresponding field of the key is null, otherwise, exception will be thrown. + * corresponding field of the key is null, otherwise, exception will be thrown. 
* @return The String typed field */ def getStringAvro(record: GenericRecord, key: String, isNullOK: Boolean = false): String = { + record.get(key) match { case id@(_: Utf8 | _: JString) => id.toString case number: JNumber => number.toString @@ -108,6 +112,7 @@ object Utils { * @return The Double typed field */ def getDoubleAvro(record: GenericRecord, key: String): Double = { + record.get(key) match { case number: JNumber => number.doubleValue case id@(_: Utf8 | _: JString) => atod(id.toString) @@ -135,7 +140,7 @@ object Utils { // Need to convert Utf8 values to String here, because otherwise we get schema casting errors and misleading // equivalence failures downstream. case s@(_: Utf8 | _: JString) => s.toString - case x@(_: Number | _: JBoolean) => x + case x@(_: Number | _: JBoolean) => x case _ => null }) }.filter(_._2 != null).toMap @@ -152,6 +157,7 @@ object Utils { * @return The double parsed from the string, or an exception if string is empty or double is NaN or Infinity */ private def atod(string: String): Double = { + if (string.length() < 1) { throw new IllegalArgumentException("Can't convert empty string to double") } @@ -172,6 +178,7 @@ object Utils { * @return The Float typed field */ def getFloatAvro(record: GenericRecord, key: String): Float = { + record.get(key) match { case number: JNumber => number.floatValue case id@(_: Utf8 | _: JString) => atof(id.toString) @@ -187,6 +194,7 @@ object Utils { * @return A float parse from the string, or an exception if the string is empty or the flat is NaN or Infinity */ private def atof(string: String): Float = { + if (string.length() < 1) { throw new IllegalArgumentException("Can't convert empty string to float") } @@ -207,6 +215,7 @@ object Utils { * @return The Int typed field */ def getIntAvro(record: GenericRecord, key: String): Int = { + record.get(key) match { case number: JNumber => number.intValue case id@(_: Utf8 | _: JString) => id.toString.toInt @@ -223,6 +232,7 @@ object Utils { * @return The Long 
typed field */ def getLongAvro(record: GenericRecord, key: String): Long = { + record.get(key) match { case number: JNumber => number.longValue() case id@(_: Utf8 | _: JString) => id.toString.toLong @@ -239,6 +249,7 @@ object Utils { * @return The Boolean typed field */ def getBooleanAvro(record: GenericRecord, key: String): Boolean = { + record.get(key) match { case booleanValue: JBoolean => booleanValue.booleanValue // NOTE Scala String#toBoolean method is better than JBoolean#parseBoolean in the sense that it only accepts @@ -256,6 +267,7 @@ object Utils { * @param hadoopConf The Hadoop Configuration object */ def deleteHDFSDir(dir: Path, hadoopConf: Configuration): Unit = { + val fs = dir.getFileSystem(hadoopConf) if (fs.exists(dir)) fs.delete(dir, true) } @@ -267,6 +279,7 @@ object Utils { * @param hadoopConf The Hadoop Configuration object */ def createHDFSDir(dir: Path, hadoopConf: Configuration): Unit = { + val fs = dir.getFileSystem(hadoopConf) if (!fs.exists(dir)) fs.mkdirs(dir) } @@ -281,17 +294,18 @@ object Utils { * @param map Input map to look up * @param key The key to be looked up in the provided map * @param elseBranch If one wants to fail on not finding a value of type [[T]] in the map, an - * [[IllegalArgumentException]] will be thrown with the error message provided. If one wants to - * continue without failure, a default value is expected that will be returned + * [[IllegalArgumentException]] will be thrown with the error message provided. 
If one wants to + * continue without failure, a default value is expected that will be returned * @tparam T Intended return type of the method * @throws java.lang.IllegalArgumentException Exception thrown if a value of type [[T]] isn't found in the map and - * the error message is non-empty + * the error message is non-empty * @return A value of type [[T]] or throw an [[IllegalArgumentException]] */ @throws(classOf[IllegalArgumentException]) def getKeyFromMapOrElse[T](map: Map[String, Any], key: String, elseBranch: Either[String, T]): T = { + map.get(key) match { - case Some(x: T) => x // type erasure warning here + case Some(x: T) => x // type erasure warning here case _ => elseBranch match { case Left(errorMsg) => throw new IllegalArgumentException(errorMsg)