From 541b0c26fc27e6edd04aefd772426131041a829f Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Fri, 2 Feb 2018 21:53:18 +0100 Subject: [PATCH 01/32] Integrating TDigest for Median Calculation and removing streams to/from HBase --- pom.xml | 6 ++ smartgrid.iml | 2 +- .../base/SensorEventAveragingJobBase.scala | 102 +++++++++--------- .../smartgrid/model/MedianLoadWithKey.scala | 27 +++++ .../projects/smartgrid/model/Prediction.scala | 6 +- .../operators/MedianAggregateWithKey.scala | 15 +++ .../operators/PredictionFunction.scala | 8 +- .../smartgrid/util/TDigestMedian.scala | 21 ++++ 8 files changed, 129 insertions(+), 58 deletions(-) create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/model/MedianLoadWithKey.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/util/TDigestMedian.scala diff --git a/pom.xml b/pom.xml index 2eaa215..9c5fe3d 100644 --- a/pom.xml +++ b/pom.xml @@ -58,6 +58,7 @@ under the License. 5.1.2 0.11.0.2 + 3.2 @@ -159,6 +160,11 @@ under the License. flink-connector-elasticsearch5_${scala.binary.version} ${flink.version} + + com.tdunning + t-digest + ${tdigest.version} + diff --git a/smartgrid.iml b/smartgrid.iml index e8f7a49..94619dc 100644 --- a/smartgrid.iml +++ b/smartgrid.iml @@ -126,7 +126,6 @@ - @@ -150,6 +149,7 @@ + diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index a45104b..7be191b 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -1,16 +1,14 @@ package com.bhaskardivya.projects.smartgrid.base import java.io.File -import java.util.concurrent.TimeUnit import com.bhaskardivya.projects.smartgrid.model._ -import com.bhaskardivya.projects.smartgrid.operators.{AverageAggregateWithKey, HBaseAsyncFunction, PredictionFunction} +import com.bhaskardivya.projects.smartgrid.operators.{AverageAggregateWithKey, MedianAggregateWithKey, PredictionFunction} import com.bhaskardivya.projects.smartgrid.pipeline._ -import com.bhaskardivya.projects.smartgrid.sinks.{HBaseOutputFormatAverageWithKey, PredictionElasticSearchSink, SensorEventElasticSearchSink} +import com.bhaskardivya.projects.smartgrid.sinks.{PredictionElasticSearchSink, SensorEventElasticSearchSink} import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.core.fs.FileSystem import org.apache.flink.streaming.api.TimeCharacteristic -import org.apache.flink.streaming.api.functions.sink.OutputFormatSinkFunction import org.apache.flink.streaming.api.scala._ import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows import org.apache.flink.streaming.api.windowing.time.Time @@ -131,72 +129,76 @@ abstract class SensorEventAveragingJobBase extends Serializable { avg_windowed120min.writeAsCsv(LOG_DIR + "/output_avg/windowed120min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Avg windowed 120 Min") } - // Write to HBase for each window duration - avg_windowed1min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_1MIN, getTargetColumnFamily()))) - .name(getKeyName() + " Average - HBase - 1 Min Window") + val median_windowed1min = avg_windowed1min + .keyBy(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) + .aggregate(new MedianAggregateWithKey) + .name(getKeyName() + "Median for 1 min Window") - avg_windowed5min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_5MIN, getTargetColumnFamily()))) - .name(getKeyName() + " Average - HBase - 5 Min Window") + val median_windowed5min = median_windowed1min + .keyBy(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) + .reduce(MedianLoadWithKey.reducer) + .name(getKeyName() + "Median for 5 min Window") - avg_windowed15min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_15MIN, getTargetColumnFamily()))) - .name(getKeyName() + " Average - HBase - 15 Min Window") + val median_windowed15min = median_windowed5min + .keyBy(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) + .reduce(MedianLoadWithKey.reducer) + .name(getKeyName() + "Median for 15 min Window") - avg_windowed60min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_60MIN, getTargetColumnFamily()))) - .name(getKeyName() + " Average - HBase - 60 Min Window") + val median_windowed60min = median_windowed15min + .keyBy(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) + .reduce(MedianLoadWithKey.reducer) + .name(getKeyName() + "Median for 60 min Window") - avg_windowed120min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_120MIN, getTargetColumnFamily()))) - .name(getKeyName() + " Average - HBase - 120 Min Window") + val median_windowed120min = median_windowed60min + .keyBy(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) + .reduce(MedianLoadWithKey.reducer) + .name(getKeyName() + "Median for 120 min Window") val timeout = params.getLong("timeout", 3000000L) // 300 seconds timeout // Enrich the average calculation with the median value for each stream of different window duration - val windowed1min_enriched: DataStream[(AverageWithKey, MedianLoad)] = AsyncDataStream.orderedWait( - avg_windowed1min, - new HBaseAsyncFunction(Constants.TABLE_1MIN, getTargetColumnFamily()), - timeout, - TimeUnit.MILLISECONDS, - 1 - ) + val windowed1min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed1min + .join(median_windowed1min) + .where(_.key).equalTo(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) + .apply((_, _)) .startNewChain() .name(getKeyName() + " Enriched - 1 Min Window") - val windowed5min_enriched: DataStream[(AverageWithKey, MedianLoad)] = AsyncDataStream.orderedWait( - avg_windowed5min, - new HBaseAsyncFunction(Constants.TABLE_5MIN, getTargetColumnFamily()), - timeout, - TimeUnit.MILLISECONDS, - 1 - ) + val windowed5min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed5min + .join(median_windowed5min) + .where(_.key).equalTo(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) + .apply((_, _)) .startNewChain() .name(getKeyName() + " Enriched - 5 Min Window") - val windowed15min_enriched: DataStream[(AverageWithKey, MedianLoad)] = AsyncDataStream.orderedWait( - avg_windowed15min, - new HBaseAsyncFunction(Constants.TABLE_15MIN, getTargetColumnFamily()), - timeout, - TimeUnit.MILLISECONDS, - 1 - ) + val windowed15min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed15min + .join(median_windowed15min) + .where(_.key).equalTo(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) + .apply((_, _)) .startNewChain() .name(getKeyName() + " Enriched - 15 Min Window") - val windowed60min_enriched: DataStream[(AverageWithKey, MedianLoad)] = AsyncDataStream.orderedWait( - avg_windowed60min, - new HBaseAsyncFunction(Constants.TABLE_60MIN, getTargetColumnFamily()), - timeout, - TimeUnit.MILLISECONDS, - 1 - ) + val windowed60min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed60min + .join(median_windowed60min) + .where(_.key).equalTo(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) + .apply((_, _)) .startNewChain() .name(getKeyName() + " Enriched - 60 Min Window") - val windowed120min_enriched: DataStream[(AverageWithKey, MedianLoad)] = AsyncDataStream.orderedWait( - avg_windowed120min, - new HBaseAsyncFunction(Constants.TABLE_120MIN, getTargetColumnFamily()), - timeout, - TimeUnit.MILLISECONDS, - 1 - ) + val windowed120min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed120min + .join(median_windowed120min) + .where(_.key).equalTo(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) + .apply((_, _)) .startNewChain() .name(getKeyName() + " Enriched - 120 Min Window") diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/MedianLoadWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/MedianLoadWithKey.scala new file mode 100644 index 0000000..114a186 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/MedianLoadWithKey.scala @@ -0,0 +1,27 @@ +package com.bhaskardivya.projects.smartgrid.model + +import com.tdunning.math.stats.TDigest + +case class MedianLoadWithKey(var key: SensorKeyObject, totalDigest: TDigest){ + val medianLoad : Double = totalDigest.quantile(0.5) + def add(averageWithKey: AverageWithKey) = { + this.totalDigest.add(averageWithKey.averageValue) + this + } + + def add(other: MedianLoadWithKey) = { + this.totalDigest.add(other.totalDigest) + if(this.key.house_id < other.key.house_id){ + this.key = other.key + } + this + } + + def +(other: MedianLoadWithKey) = add(other) +} + +object MedianLoadWithKey { + def reducer = { + (a: MedianLoadWithKey, b: MedianLoadWithKey) => a+b + } +} \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala index cf49348..79ab66b 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala @@ -5,7 +5,7 @@ import java.util.Date import org.apache.sling.commons.json.JSONObject -case class Prediction(averageWithKey: AverageWithKey, medianLoad: MedianLoad, key: String, slidingWindow: Long, predictedValue: Double){ +case class Prediction(averageWithKey: AverageWithKey, medianLoad: MedianLoadWithKey, key: String, slidingWindow: Long, predictedValue: Double){ def toJSONString(): String = { toJSON().toString() @@ -23,9 +23,9 @@ case class Prediction(averageWithKey: AverageWithKey, medianLoad: MedianLoad, ke averageWithKeyJSON.put("eventTimestamp", averageWithKey.eventTimestamp) json.put("averageWithKey", averageWithKeyJSON) - //mediaLoad + //medianLoad val medianLoadJSON = new JSONObject() - medianLoadJSON.put("load", medianLoad.load) + medianLoadJSON.put("load", medianLoad.medianLoad) json.put("medianLoad", medianLoadJSON) //key or entity diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala new file mode 100644 index 0000000..d03c8c4 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala @@ -0,0 +1,15 @@ +package com.bhaskardivya.projects.smartgrid.operators + +import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoadWithKey, SensorKeyObject} +import com.tdunning.math.stats.TDigest +import org.apache.flink.api.common.functions.AggregateFunction + +class MedianAggregateWithKey extends AggregateFunction[AverageWithKey, MedianLoadWithKey, MedianLoadWithKey]{ + override def createAccumulator() = MedianLoadWithKey(SensorKeyObject(-1), TDigest.createDigest(100)) + + override def add(value: AverageWithKey, accumulator: MedianLoadWithKey) = accumulator.add(averageWithKey = value) + + override def getResult(accumulator: MedianLoadWithKey) = accumulator + + override def merge(a: MedianLoadWithKey, b: MedianLoadWithKey) = a+b +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/PredictionFunction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/PredictionFunction.scala index c19b537..51ce4dd 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/PredictionFunction.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/PredictionFunction.scala @@ -1,6 +1,6 @@ package com.bhaskardivya.projects.smartgrid.operators -import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoad, Prediction} +import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoad, MedianLoadWithKey, Prediction} import org.apache.flink.api.common.functions.MapFunction import org.apache.flink.streaming.api.windowing.time.Time @@ -9,10 +9,10 @@ import org.apache.flink.streaming.api.windowing.time.Time * @param entity The key for which prediction is made ie.House or Plug * @param slidingWindow */ -class PredictionFunction(entity: String, slidingWindow: Long) extends MapFunction[(AverageWithKey, MedianLoad), Prediction]{ - override def map(value: (AverageWithKey, MedianLoad)): Prediction = { +class PredictionFunction(entity: String, slidingWindow: Long) extends MapFunction[(AverageWithKey, MedianLoadWithKey), Prediction]{ + override def map(value: (AverageWithKey, MedianLoadWithKey)): Prediction = { val avg = value._1.averageValue - val median = value._2.load + val median = value._2.medianLoad val predictedValue: Double = (avg + median)/ 2.0 diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/util/TDigestMedian.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/util/TDigestMedian.scala new file mode 100644 index 0000000..2e09de1 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/util/TDigestMedian.scala @@ -0,0 +1,21 @@ +package com.bhaskardivya.projects.smartgrid.util + +import com.tdunning.math.stats.TDigest +import java.io.Serializable + +@SerialVersionUID(1L) +class TDigestMedian() extends Serializable { + private var totalDigest: TDigest = _ + + this.setTotalDigest(TDigest.createDigest(100)) + + def getTotalDigest: TDigest = totalDigest + + def setTotalDigest(totalDigest: TDigest) { + this.totalDigest = totalDigest + } + + def addDigest(digest: Double): Unit = this.totalDigest.add(digest) + + def getMedian: Double = this.totalDigest.quantile(0.5) +} \ No newline at end of file From 5374489a3f6fe876e4df7a5a414f06f3dde3a4e7 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Fri, 2 Feb 2018 21:54:13 +0100 Subject: [PATCH 02/32] Updating Docker-compose files --- docker/Docker-compose.yml | 13 +++++------ docker/Flink-Hbase-Kafka/Docker-compose.yml | 24 +++++++++++++++++++++ docker/kafka-manager/Docker-compose.yml | 2 +- 3 files changed, 30 insertions(+), 9 deletions(-) create mode 100755 docker/Flink-Hbase-Kafka/Docker-compose.yml diff --git a/docker/Docker-compose.yml b/docker/Docker-compose.yml index e07fbd7..08d853e 100755 --- a/docker/Docker-compose.yml +++ b/docker/Docker-compose.yml @@ -1,16 +1,13 @@ - version: '2.1' services: -#Flink + Kafka + Hbase + Zookeeper flink-kafka-hbase: - image: - container -# Elasticsearch + build: ./Flink-Hbase-Kafka + elasticsearch: + build: ./elasticsearch -# Kibana kibana: + build: ./kibana -# Kafka-manager kafka-manager: - \ No newline at end of file + build: ./kafka-manager diff --git a/docker/Flink-Hbase-Kafka/Docker-compose.yml b/docker/Flink-Hbase-Kafka/Docker-compose.yml new file mode 100755 index 0000000..6ee061f --- /dev/null +++ b/docker/Flink-Hbase-Kafka/Docker-compose.yml @@ -0,0 +1,24 @@ +services: + flink-kafka-hbase: + image: quay.io/koldbyte/flink-cluster + container_name: flink_to_hbase + hostname: hbase-docker + ports: + - 8080:8080 + - 8085:8085 + - 9090:9090 + - 9092:9092 + - 9095:9095 + - 2181:2181 + - 16000:16000 + - 16010:16010 + - 16020:16020 + - 16030:16030 + - 6123:6123 + - 7203:7203 + - 8081:8081 + - 8090:8090 + environment: + KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 + volumes: + - /data \ No newline at end of file diff --git a/docker/kafka-manager/Docker-compose.yml b/docker/kafka-manager/Docker-compose.yml index f6a0b69..212b254 100755 --- a/docker/kafka-manager/Docker-compose.yml +++ b/docker/kafka-manager/Docker-compose.yml @@ -4,6 +4,6 @@ services: ports: - "9000:9000" environment: - ZK_HOSTS: "zoo:2181" + ZK_HOSTS: "192.168.99.100:2181/kafka" APPLICATION_SECRET: "random-secret" command: -Dpidfile.path=/dev/null \ No newline at end of file From 87841ef1c57961116369aafce8efec8355008ffc Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Fri, 2 Feb 2018 22:12:29 +0100 Subject: [PATCH 03/32] Renaming docker-compose.yml in proper case for docker-compose --- ...{Docker-compose.yml => docker-compose.yml} | 0 ...{Docker-compose.yml => docker-compose.yml} | 26 +++++++++---------- ...{Docker-compose.yml => docker-compose.yml} | 16 ++++++------ ...{Docker-compose.yml => docker-compose.yml} | 22 ++++++++-------- 4 files changed, 32 insertions(+), 32 deletions(-) rename docker/Flink-Hbase-Kafka/{Docker-compose.yml => docker-compose.yml} (100%) rename docker/{Docker-compose.yml => docker-compose.yml} (93%) rename docker/kafka-manager/{Docker-compose.yml => docker-compose.yml} (96%) rename docker/kibana/{Docker-compose.yml => docker-compose.yml} (95%) diff --git a/docker/Flink-Hbase-Kafka/Docker-compose.yml b/docker/Flink-Hbase-Kafka/docker-compose.yml similarity index 100% rename from docker/Flink-Hbase-Kafka/Docker-compose.yml rename to docker/Flink-Hbase-Kafka/docker-compose.yml diff --git a/docker/Docker-compose.yml b/docker/docker-compose.yml similarity index 93% rename from docker/Docker-compose.yml rename to docker/docker-compose.yml index 08d853e..fa6d243 100755 --- a/docker/Docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,13 +1,13 @@ -version: '2.1' -services: - flink-kafka-hbase: - build: ./Flink-Hbase-Kafka - - elasticsearch: - build: ./elasticsearch - - kibana: - build: ./kibana - - kafka-manager: - build: ./kafka-manager +version: '2.1' +services: + flink-kafka-hbase: + build: ./Flink-Hbase-Kafka + + elasticsearch: + build: ./elasticsearch + + kibana: + build: ./kibana + + kafka-manager: + build: ./kafka-manager diff --git a/docker/kafka-manager/Docker-compose.yml b/docker/kafka-manager/docker-compose.yml similarity index 96% rename from docker/kafka-manager/Docker-compose.yml rename to docker/kafka-manager/docker-compose.yml index 212b254..7fa25dc 100755 --- a/docker/kafka-manager/Docker-compose.yml +++ b/docker/kafka-manager/docker-compose.yml @@ -1,9 +1,9 @@ -services: - kafka_manager: - image: hlebalbau/kafka-manager - ports: - - "9000:9000" - environment: - ZK_HOSTS: "192.168.99.100:2181/kafka" - APPLICATION_SECRET: "random-secret" +services: + kafka_manager: + image: hlebalbau/kafka-manager + ports: + - "9000:9000" + environment: + ZK_HOSTS: "192.168.99.100:2181/kafka" + APPLICATION_SECRET: "random-secret" command: -Dpidfile.path=/dev/null \ No newline at end of file diff --git a/docker/kibana/Docker-compose.yml b/docker/kibana/docker-compose.yml similarity index 95% rename from docker/kibana/Docker-compose.yml rename to docker/kibana/docker-compose.yml index 7214ea9..0533fec 100755 --- a/docker/kibana/Docker-compose.yml +++ b/docker/kibana/docker-compose.yml @@ -1,11 +1,11 @@ -version: '2' -services: - kibana: - image: kibana:5.1.2 - container_name: kibana - restart: always - network_mode: "bridge" - ports: - - "5601:5601" - external_links: - - elasticsearch:elasticsearch +version: '2' +services: + kibana: + image: kibana:5.1.2 + container_name: kibana + restart: always + network_mode: "bridge" + ports: + - "5601:5601" + external_links: + - elasticsearch:elasticsearch From a0fb6777d9264e4d6957334eca418b7384a800c5 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Fri, 2 Feb 2018 22:45:08 +0100 Subject: [PATCH 04/32] Fixing docker-compose errors --- docker/Flink-Hbase-Kafka/docker-compose.yml | 6 +++++- docker/docker-compose.yml | 19 ++++++++++++++++--- docker/kafka-manager/docker-compose.yml | 1 + 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/docker/Flink-Hbase-Kafka/docker-compose.yml b/docker/Flink-Hbase-Kafka/docker-compose.yml index 6ee061f..35c59c1 100755 --- a/docker/Flink-Hbase-Kafka/docker-compose.yml +++ b/docker/Flink-Hbase-Kafka/docker-compose.yml @@ -21,4 +21,8 @@ services: environment: KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 volumes: - - /data \ No newline at end of file + - /data + external_links: + - elasticsearch + - kibana + - kafka-manager diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index fa6d243..b795968 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,13 +1,26 @@ version: '2.1' services: flink-kafka-hbase: - build: ./Flink-Hbase-Kafka + extends: + service: flink-kafka-hbase + file: ./Flink-Hbase-Kafka elasticsearch: - build: ./elasticsearch + extends: + service: elasticsearch + file: ./elasticsearch + + elasticsearch2: + extends: + service: elasticsearch2 + file: ./elasticsearch kibana: + extends: + service: build: ./kibana kafka-manager: - build: ./kafka-manager + extends: + service: kafka-manager + file: ./kafka-manager \ No newline at end of file diff --git a/docker/kafka-manager/docker-compose.yml b/docker/kafka-manager/docker-compose.yml index 7fa25dc..bb8b8a1 100755 --- a/docker/kafka-manager/docker-compose.yml +++ b/docker/kafka-manager/docker-compose.yml @@ -1,6 +1,7 @@ services: kafka_manager: image: hlebalbau/kafka-manager + container_name: kafka-manager ports: - "9000:9000" environment: From 6cec3d6ca99cc812214b3a5646ade27248554d27 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Fri, 2 Feb 2018 22:46:45 +0100 Subject: [PATCH 05/32] Fixing docker-compose errors --- docker/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index b795968..e0a0cb1 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -17,8 +17,8 @@ services: kibana: extends: - service: - build: ./kibana + service: kibana + file: ./kibana kafka-manager: extends: From 10738c6a70d5696d7e842644fe5ecac94dfed0cc Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Fri, 2 Feb 2018 22:47:47 +0100 Subject: [PATCH 06/32] Fixing docker-compose errors --- docker/docker-compose.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index e0a0cb1..31f4476 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -3,24 +3,24 @@ services: flink-kafka-hbase: extends: service: flink-kafka-hbase - file: ./Flink-Hbase-Kafka + file: ./Flink-Hbase-Kafka/docker-compose.yml elasticsearch: extends: service: elasticsearch - file: ./elasticsearch + file: ./elasticsearch/docker-compose.yml elasticsearch2: extends: service: elasticsearch2 - file: ./elasticsearch + file: ./elasticsearch/docker-compose.yml kibana: extends: service: kibana - file: ./kibana + file: ./kibana/docker-compose.yml kafka-manager: extends: service: kafka-manager - file: ./kafka-manager \ No newline at end of file + file: ./kafka-manager/docker-compose.yml \ No newline at end of file From 76e7fb12e4594a9ee8987ddbe1e7d839acf097dd Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Fri, 2 Feb 2018 22:53:13 +0100 Subject: [PATCH 07/32] Fixing docker-compose errors --- docker/Flink-Hbase-Kafka/docker-compose.yml | 1 + docker/elasticsearch/docker-compose.yml | 108 ++++++++++---------- docker/kafka-manager/docker-compose.yml | 1 + docker/kibana/docker-compose.yml | 2 +- 4 files changed, 58 insertions(+), 54 deletions(-) diff --git a/docker/Flink-Hbase-Kafka/docker-compose.yml b/docker/Flink-Hbase-Kafka/docker-compose.yml index 35c59c1..f88e7c0 100755 --- a/docker/Flink-Hbase-Kafka/docker-compose.yml +++ b/docker/Flink-Hbase-Kafka/docker-compose.yml @@ -1,3 +1,4 @@ +version: '2.1' services: flink-kafka-hbase: image: quay.io/koldbyte/flink-cluster diff --git a/docker/elasticsearch/docker-compose.yml b/docker/elasticsearch/docker-compose.yml index 97f4364..65695e8 100755 --- a/docker/elasticsearch/docker-compose.yml +++ b/docker/elasticsearch/docker-compose.yml @@ -1,54 +1,56 @@ -elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:5.1.2 - ports: - - 9200:9200 - - 9300:9300 - container_name: elasticsearch - ulimits: - memlock: - soft: -1 - hard: -1 - mem_limit: 1g - environment: - - cluster.name=docker-cluster - - node.name=one - - bootstrap.memory_lock=false - - xpack.security.enabled=false - - "ES_JAVA_OPTS=-Xms512m -Xmx512m" - - network.publish_host=192.168.99.100 - - transport.publish_port=9300 - - transport.host=0.0.0.0 - - transport.tcp.port=9300 - - network.host=0.0.0.0 - - http.host=0.0.0.0 - - http.port=9200 - volumes: - - /usr/share/elasticsearch/data +version: '2.1' +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:5.1.2 + ports: + - 9200:9200 + - 9300:9300 + container_name: elasticsearch + ulimits: + memlock: + soft: -1 + hard: -1 + mem_limit: 1g + environment: + - cluster.name=docker-cluster + - node.name=one + - bootstrap.memory_lock=false + - xpack.security.enabled=false + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + - network.publish_host=192.168.99.100 + - transport.publish_port=9300 + - transport.host=0.0.0.0 + - transport.tcp.port=9300 + - network.host=0.0.0.0 + - http.host=0.0.0.0 + - http.port=9200 + volumes: + - /usr/share/elasticsearch/data -elasticsearch2: - image: docker.elastic.co/elasticsearch/elasticsearch:5.1.2 - ports: - - 9201:9200 - - 9301:9300 - container_name: elasticsearch2 - ulimits: - memlock: - soft: -1 - hard: -1 - mem_limit: 1g - environment: - - cluster.name=docker-cluster - - node.name=two - - bootstrap.memory_lock=false - - xpack.security.enabled=false - - "ES_JAVA_OPTS=-Xms512m -Xmx512m" - - network.publish_host=192.168.99.100 - - transport.publish_port=9301 - - "discovery.zen.ping.unicast.hosts=192.168.99.100" - - "discovery.zen.minimum_master_nodes=2" - - network.host=0.0.0.0 - - transport.host=0.0.0.0 - - transport.tcp.port=9300 - - http.host=0.0.0.0 - volumes: - - /usr/share/elasticsearch/data + elasticsearch2: + image: docker.elastic.co/elasticsearch/elasticsearch:5.1.2 + ports: + - 9201:9200 + - 9301:9300 + container_name: elasticsearch2 + ulimits: + memlock: + soft: -1 + hard: -1 + mem_limit: 1g + environment: + - cluster.name=docker-cluster + - node.name=two + - bootstrap.memory_lock=false + - xpack.security.enabled=false + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + - network.publish_host=192.168.99.100 + - transport.publish_port=9301 + - "discovery.zen.ping.unicast.hosts=192.168.99.100" + - "discovery.zen.minimum_master_nodes=2" + - network.host=0.0.0.0 + - transport.host=0.0.0.0 + - transport.tcp.port=9300 + - http.host=0.0.0.0 + volumes: + - /usr/share/elasticsearch/data diff --git a/docker/kafka-manager/docker-compose.yml b/docker/kafka-manager/docker-compose.yml index bb8b8a1..b560948 100755 --- a/docker/kafka-manager/docker-compose.yml +++ b/docker/kafka-manager/docker-compose.yml @@ -1,3 +1,4 @@ +version: '2.1' services: kafka_manager: image: hlebalbau/kafka-manager diff --git a/docker/kibana/docker-compose.yml b/docker/kibana/docker-compose.yml index 0533fec..62e912a 100755 --- a/docker/kibana/docker-compose.yml +++ b/docker/kibana/docker-compose.yml @@ -1,4 +1,4 @@ -version: '2' +version: '2.1' services: kibana: image: kibana:5.1.2 From 4543138de65f3bb566ca6113f217d344cc1be829 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Fri, 2 Feb 2018 22:54:27 +0100 Subject: [PATCH 08/32] Fixing docker-compose errors --- docker/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 31f4476..442785b 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -20,7 +20,7 @@ services: service: kibana file: ./kibana/docker-compose.yml - kafka-manager: + kafka_manager: extends: - service: kafka-manager + service: kafka_manager file: ./kafka-manager/docker-compose.yml \ No newline at end of file From 2b5de2ad560be72bd58ae1905a71385861ed0788 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Tue, 6 Feb 2018 18:20:46 +0100 Subject: [PATCH 09/32] Fixing docker-compose errors --- docker/kibana/docker-compose.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/kibana/docker-compose.yml b/docker/kibana/docker-compose.yml index 62e912a..74c7086 100755 --- a/docker/kibana/docker-compose.yml +++ b/docker/kibana/docker-compose.yml @@ -4,7 +4,6 @@ services: image: kibana:5.1.2 container_name: kibana restart: always - network_mode: "bridge" ports: - "5601:5601" external_links: From 06c0fa603d2cdbff7724e783bbbe8e6f994635df Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Tue, 6 Feb 2018 18:33:59 +0100 Subject: [PATCH 10/32] Fixing docker-compose errors Adding networks and volumes --- docker/docker-compose.yml | 21 ++++++++++++++++++++- docker/kafka-manager/docker-compose.yml | 2 +- docker/kibana/docker-compose.yml | 3 +++ 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 442785b..43d63c3 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -4,23 +4,42 @@ services: extends: service: flink-kafka-hbase file: ./Flink-Hbase-Kafka/docker-compose.yml + networks: + - smartgrid + volumes: + - "/data" : /app/smartgrid/data elasticsearch: extends: service: elasticsearch file: ./elasticsearch/docker-compose.yml + networks: + - smartgrid + volumes: + - "/usr/share/elasticsearch/data" : /app/smartgrid/es-data elasticsearch2: extends: service: elasticsearch2 file: ./elasticsearch/docker-compose.yml + networks: + - smartgrid + volumes: + - "/usr/share/elasticsearch/data" : /app/smartgrid/es-data-2 kibana: extends: service: kibana file: ./kibana/docker-compose.yml + networks: + - smartgrid kafka_manager: extends: service: kafka_manager - file: ./kafka-manager/docker-compose.yml \ No newline at end of file + file: ./kafka-manager/docker-compose.yml + networks: + - smartgrid + +networks: + smartgrid \ No newline at end of file diff --git a/docker/kafka-manager/docker-compose.yml b/docker/kafka-manager/docker-compose.yml index b560948..e062886 100755 --- a/docker/kafka-manager/docker-compose.yml +++ b/docker/kafka-manager/docker-compose.yml @@ -6,6 +6,6 @@ services: ports: - "9000:9000" environment: - ZK_HOSTS: "192.168.99.100:2181/kafka" + ZK_HOSTS: "localhost:2181/kafka" APPLICATION_SECRET: "random-secret" command: -Dpidfile.path=/dev/null \ No newline at end of file diff --git a/docker/kibana/docker-compose.yml b/docker/kibana/docker-compose.yml index 74c7086..97a930f 100755 --- a/docker/kibana/docker-compose.yml +++ b/docker/kibana/docker-compose.yml @@ -6,5 +6,8 @@ services: restart: always ports: - "5601:5601" + environment: + - ELASTICSEARCH_URL: http://localhost:9200 + - SERVER_NAME: localhost external_links: - elasticsearch:elasticsearch From 43a65b56dac7f47bbd40c18e017808d9131fc9a8 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Tue, 6 Feb 2018 18:40:17 +0100 Subject: [PATCH 11/32] Fixing docker-compose errors Adding networks and volumes --- docker/docker-compose.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 43d63c3..f67df9a 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -42,4 +42,6 @@ services: - smartgrid networks: - smartgrid \ No newline at end of file + smartgrid: + driver: bridge + enable_ipv6: true \ No newline at end of file From c2780acb00b2fc6c5da24aadf2ef2151890cdf33 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Tue, 6 Feb 2018 18:44:23 +0100 Subject: [PATCH 12/32] Fixing docker-compose errors Adding networks and volumes --- docker/docker-compose.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index f67df9a..359e0c0 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -7,7 +7,7 @@ services: networks: - smartgrid volumes: - - "/data" : /app/smartgrid/data + - /app/smartgrid/data:/data elasticsearch: extends: @@ -16,7 +16,7 @@ services: networks: - smartgrid volumes: - - "/usr/share/elasticsearch/data" : /app/smartgrid/es-data + - /app/smartgrid/es-data:/usr/share/elasticsearch/data elasticsearch2: extends: @@ -25,7 +25,7 @@ services: networks: - smartgrid volumes: - - "/usr/share/elasticsearch/data" : /app/smartgrid/es-data-2 + - /app/smartgrid/es-data-2:/usr/share/elasticsearch/data kibana: extends: From 9e2230396d80b510b53424dda98ddd84396ed834 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Tue, 6 Feb 2018 18:48:29 +0100 Subject: [PATCH 13/32] Fixing docker-compose errors Adding networks and volumes --- docker/kibana/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/kibana/docker-compose.yml b/docker/kibana/docker-compose.yml index 97a930f..cd450d2 100755 --- a/docker/kibana/docker-compose.yml +++ b/docker/kibana/docker-compose.yml @@ -7,7 +7,7 @@ services: ports: - "5601:5601" environment: - - ELASTICSEARCH_URL: http://localhost:9200 - - SERVER_NAME: localhost + - ELASTICSEARCH_URL="http://localhost:9200" + - SERVER_NAME=localhost external_links: - elasticsearch:elasticsearch From 43ce10ec8e657e004da827d8dff271a4b17b5bd1 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Tue, 6 Feb 2018 18:56:20 +0100 Subject: [PATCH 14/32] Fixing docker-compose errors Adding networks and volumes --- docker/Flink-Hbase-Kafka/docker-compose.yml | 2 +- docker/elasticsearch/docker-compose.yml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/Flink-Hbase-Kafka/docker-compose.yml b/docker/Flink-Hbase-Kafka/docker-compose.yml index f88e7c0..501e32d 100755 --- a/docker/Flink-Hbase-Kafka/docker-compose.yml +++ b/docker/Flink-Hbase-Kafka/docker-compose.yml @@ -20,7 +20,7 @@ services: - 8081:8081 - 8090:8090 environment: - KAFKA_ADVERTISED_HOST_NAME: 192.168.99.100 + KAFKA_ADVERTISED_HOST_NAME: localhost volumes: - /data external_links: diff --git a/docker/elasticsearch/docker-compose.yml b/docker/elasticsearch/docker-compose.yml index 65695e8..8563d06 100755 --- a/docker/elasticsearch/docker-compose.yml +++ b/docker/elasticsearch/docker-compose.yml @@ -17,7 +17,7 @@ services: - bootstrap.memory_lock=false - xpack.security.enabled=false - "ES_JAVA_OPTS=-Xms512m -Xmx512m" - - network.publish_host=192.168.99.100 + - network.publish_host=localhost - transport.publish_port=9300 - transport.host=0.0.0.0 - transport.tcp.port=9300 @@ -44,9 +44,9 @@ services: - bootstrap.memory_lock=false - xpack.security.enabled=false - "ES_JAVA_OPTS=-Xms512m -Xmx512m" - - network.publish_host=192.168.99.100 + - network.publish_host=localhost - transport.publish_port=9301 - - "discovery.zen.ping.unicast.hosts=192.168.99.100" + - "discovery.zen.ping.unicast.hosts=localhost" - "discovery.zen.minimum_master_nodes=2" - network.host=0.0.0.0 - transport.host=0.0.0.0 From a084d786d14f532c91300b7a92de75165d29ba0f Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Tue, 6 Feb 2018 19:56:16 +0100 Subject: [PATCH 15/32] Fixing docker-compose errors Adding networks and volumes --- docker/docker-compose.yml | 24 ++++++++++++++++-------- docker/elasticsearch/docker-compose.yml | 2 ++ docker/kafka-manager/docker-compose.yml | 1 + docker/kibana/docker-compose.yml | 3 ++- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 359e0c0..d505cc9 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,47 +1,55 @@ version: '2.1' services: flink-kafka-hbase: + domainname: smartgrid.com extends: service: flink-kafka-hbase file: ./Flink-Hbase-Kafka/docker-compose.yml networks: - - smartgrid + - smartgrid.com volumes: - /app/smartgrid/data:/data elasticsearch: + domainname: smartgrid.com extends: service: elasticsearch file: ./elasticsearch/docker-compose.yml networks: - - smartgrid + - smartgrid.com volumes: - /app/smartgrid/es-data:/usr/share/elasticsearch/data elasticsearch2: + domainname: smartgrid.com extends: service: elasticsearch2 file: ./elasticsearch/docker-compose.yml networks: - - smartgrid + - smartgrid.com volumes: - /app/smartgrid/es-data-2:/usr/share/elasticsearch/data kibana: + domainname: smartgrid.com extends: service: kibana file: ./kibana/docker-compose.yml networks: - - smartgrid + - smartgrid.com + depends_on: + - elasticsearch kafka_manager: + domainname: smartgrid.com extends: service: kafka_manager file: ./kafka-manager/docker-compose.yml networks: - - smartgrid + - smartgrid.com + depends_on: + - flink-kafka-hbase networks: - smartgrid: - driver: bridge - enable_ipv6: true \ No newline at end of file + smartgrid.com: + driver: bridge \ No newline at end of file diff --git a/docker/elasticsearch/docker-compose.yml b/docker/elasticsearch/docker-compose.yml index 8563d06..d6ceaab 100755 --- a/docker/elasticsearch/docker-compose.yml +++ b/docker/elasticsearch/docker-compose.yml @@ -2,6 +2,7 @@ version: '2.1' services: elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch:5.1.2 + hostname: elasticsearch ports: - 9200:9200 - 9300:9300 @@ -29,6 +30,7 @@ services: elasticsearch2: image: docker.elastic.co/elasticsearch/elasticsearch:5.1.2 + hostname: elasticsearch2 ports: - 9201:9200 - 9301:9300 diff --git a/docker/kafka-manager/docker-compose.yml b/docker/kafka-manager/docker-compose.yml index e062886..2c0e983 100755 --- a/docker/kafka-manager/docker-compose.yml +++ b/docker/kafka-manager/docker-compose.yml @@ -2,6 +2,7 @@ version: '2.1' services: kafka_manager: image: hlebalbau/kafka-manager + hostname: kafka-manager container_name: kafka-manager ports: - "9000:9000" diff --git a/docker/kibana/docker-compose.yml b/docker/kibana/docker-compose.yml index cd450d2..88f20c0 100755 --- a/docker/kibana/docker-compose.yml +++ b/docker/kibana/docker-compose.yml @@ -2,12 +2,13 @@ version: '2.1' services: kibana: image: kibana:5.1.2 + hostname: kibana container_name: kibana restart: always ports: - "5601:5601" environment: - - ELASTICSEARCH_URL="http://localhost:9200" + - ELASTICSEARCH_URL=http://localhost:9200 - SERVER_NAME=localhost external_links: - elasticsearch:elasticsearch From e2eb0442b620713122fa2dbcfe72a95318a43029 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Wed, 7 Feb 2018 10:45:41 +0100 Subject: [PATCH 16/32] Fixing docker-compose errors Adding networks and volumes --- docker/docker-compose.yml | 14 ++++++++++++-- docker/elasticsearch/docker-compose.yml | 6 +++--- docker/kibana/docker-compose.yml | 4 ++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index d505cc9..c547d10 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -9,6 +9,9 @@ services: - smartgrid.com volumes: - /app/smartgrid/data:/data + links: + - elasticsearch + - elasticsearch2 elasticsearch: domainname: smartgrid.com @@ -39,9 +42,12 @@ services: - smartgrid.com depends_on: - elasticsearch + links: + - elasticsearch + - elasticsearch2 kafka_manager: - domainname: smartgrid.com + domainname: smartgrid.com extends: service: kafka_manager file: ./kafka-manager/docker-compose.yml @@ -49,7 +55,11 @@ services: - smartgrid.com depends_on: - flink-kafka-hbase + links: + - flink-kafka-hbase + - "flink-kafka-hbase:hbase-docker" networks: smartgrid.com: - driver: bridge \ No newline at end of file + driver: bridge + name: smartgrid.com \ No newline at end of file diff --git a/docker/elasticsearch/docker-compose.yml b/docker/elasticsearch/docker-compose.yml index d6ceaab..41b72c5 100755 --- a/docker/elasticsearch/docker-compose.yml +++ b/docker/elasticsearch/docker-compose.yml @@ -18,7 +18,7 @@ services: - bootstrap.memory_lock=false - xpack.security.enabled=false - "ES_JAVA_OPTS=-Xms512m -Xmx512m" - - network.publish_host=localhost + - network.publish_host=elasticsearch - transport.publish_port=9300 - transport.host=0.0.0.0 - transport.tcp.port=9300 @@ -46,9 +46,9 @@ services: - bootstrap.memory_lock=false - xpack.security.enabled=false - "ES_JAVA_OPTS=-Xms512m -Xmx512m" - - network.publish_host=localhost + - network.publish_host=elasticsearch - transport.publish_port=9301 - - "discovery.zen.ping.unicast.hosts=localhost" + - "discovery.zen.ping.unicast.hosts=elasticsearch" - "discovery.zen.minimum_master_nodes=2" - network.host=0.0.0.0 - transport.host=0.0.0.0 diff --git a/docker/kibana/docker-compose.yml b/docker/kibana/docker-compose.yml index 88f20c0..5617be5 100755 --- a/docker/kibana/docker-compose.yml +++ b/docker/kibana/docker-compose.yml @@ -8,7 +8,7 @@ services: ports: - "5601:5601" environment: - - ELASTICSEARCH_URL=http://localhost:9200 - - SERVER_NAME=localhost + - ELASTICSEARCH_URL=http://elasticsearch:9200 + - SERVER_NAME=elasticsearch external_links: - elasticsearch:elasticsearch From 7d1ac824eb15738c219172eaf65c2a703c7416f9 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Wed, 7 Feb 2018 11:19:46 +0100 Subject: [PATCH 17/32] Fixing docker-compose errors Adding networks and volumes --- docker/Flink-Hbase-Kafka/docker-compose.yml | 2 +- docker/elasticsearch/docker-compose.yml | 5 +++-- docker/kafka-manager/docker-compose.yml | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/Flink-Hbase-Kafka/docker-compose.yml b/docker/Flink-Hbase-Kafka/docker-compose.yml index 501e32d..e97db32 100755 --- a/docker/Flink-Hbase-Kafka/docker-compose.yml +++ b/docker/Flink-Hbase-Kafka/docker-compose.yml @@ -20,7 +20,7 @@ services: - 8081:8081 - 8090:8090 environment: - KAFKA_ADVERTISED_HOST_NAME: localhost + KAFKA_ADVERTISED_HOST_NAME: hbase-docker volumes: - /data external_links: diff --git a/docker/elasticsearch/docker-compose.yml b/docker/elasticsearch/docker-compose.yml index 41b72c5..0b6dbd7 100755 --- a/docker/elasticsearch/docker-compose.yml +++ b/docker/elasticsearch/docker-compose.yml @@ -46,13 +46,14 @@ services: - bootstrap.memory_lock=false - xpack.security.enabled=false - "ES_JAVA_OPTS=-Xms512m -Xmx512m" - - network.publish_host=elasticsearch - - transport.publish_port=9301 + - network.publish_host=elasticsearch2 + - transport.publish_port=9300 - "discovery.zen.ping.unicast.hosts=elasticsearch" - "discovery.zen.minimum_master_nodes=2" - network.host=0.0.0.0 - transport.host=0.0.0.0 - transport.tcp.port=9300 - http.host=0.0.0.0 + - http.port=9200 volumes: - /usr/share/elasticsearch/data diff --git a/docker/kafka-manager/docker-compose.yml b/docker/kafka-manager/docker-compose.yml index 2c0e983..982f33a 100755 --- a/docker/kafka-manager/docker-compose.yml +++ b/docker/kafka-manager/docker-compose.yml @@ -7,6 +7,6 @@ services: ports: - "9000:9000" environment: - ZK_HOSTS: "localhost:2181/kafka" + ZK_HOSTS: "hbase-docker:2181/kafka" APPLICATION_SECRET: "random-secret" command: -Dpidfile.path=/dev/null \ No newline at end of file From f2a790208fba766bfb8b32af78537d0f93ebe523 Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Wed, 7 Feb 2018 19:57:07 +0100 Subject: [PATCH 18/32] Fixing docker-compose errors Adding networks and volumes --- docker/docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index c547d10..6b7b24a 100755 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -53,6 +53,8 @@ services: file: ./kafka-manager/docker-compose.yml networks: - smartgrid.com + volumes: + - /app/smartgrid/kafka-manager:/kafka-manager/configuration depends_on: - flink-kafka-hbase links: From fad824833f1bc37f9321c4a6324467df596138da Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Wed, 7 Feb 2018 19:58:51 +0100 Subject: [PATCH 19/32] Minor changes in names and conditional sink output of raw --- .../base/SensorEventAveragingJobBase.scala | 28 ++++++++++--------- .../operators/MedianAggregateWithKey.scala | 2 +- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index 7be191b..a7e1483 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -92,31 +92,31 @@ abstract class SensorEventAveragingJobBase extends Serializable { .keyBy(keyGetter(_)) .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) .aggregate(new AverageAggregateWithKey(keyGetter)) - .name(getKeyName() + "Average for 1 min Window") + .name(getKeyName() + " Average for 1 min Window") val avg_windowed5min = avg_windowed1min .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(AverageWithKey.reducer) - .name(getKeyName() + "Average for 5 min Window") + .name(getKeyName() + " Average for 5 min Window") val avg_windowed15min = avg_windowed5min .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(AverageWithKey.reducer) - .name(getKeyName() + "Average for 15 min Window") + .name(getKeyName() + " Average for 15 min Window") val avg_windowed60min = avg_windowed15min .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(AverageWithKey.reducer) - .name(getKeyName() + "Average for 60 min Window") + .name(getKeyName() + " Average for 60 min Window") val avg_windowed120min = avg_windowed60min .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(AverageWithKey.reducer) - .name(getKeyName() + "Average for 120 min Window") + .name(getKeyName() + " Average for 120 min Window") // Write to file for debug val debug = params.has("debug") @@ -133,31 +133,31 @@ abstract class SensorEventAveragingJobBase extends Serializable { .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) .aggregate(new MedianAggregateWithKey) - .name(getKeyName() + "Median for 1 min Window") + .name(getKeyName() + " Median for 1 min Window") val median_windowed5min = median_windowed1min .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(MedianLoadWithKey.reducer) - .name(getKeyName() + "Median for 5 min Window") + .name(getKeyName() + " Median for 5 min Window") val median_windowed15min = median_windowed5min .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(MedianLoadWithKey.reducer) - .name(getKeyName() + "Median for 15 min Window") + .name(getKeyName() + " Median for 15 min Window") val median_windowed60min = median_windowed15min .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(MedianLoadWithKey.reducer) - .name(getKeyName() + "Median for 60 min Window") + .name(getKeyName() + " Median for 60 min Window") val median_windowed120min = median_windowed60min .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(MedianLoadWithKey.reducer) - .name(getKeyName() + "Median for 120 min Window") + .name(getKeyName() + " Median for 120 min Window") val timeout = params.getLong("timeout", 3000000L) // 300 seconds timeout @@ -256,9 +256,11 @@ abstract class SensorEventAveragingJobBase extends Serializable { } //Sink the original feed into ES as well - initializedFlow - .addSink(SensorEventElasticSearchSink(params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) - .name("Sensor Raw to ES") + if(debug) { + initializedFlow + .addSink(SensorEventElasticSearchSink(params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) + .name("Sensor Raw to ES") + } env.execute("Sensor Event" + getKeyName() + " Prediction Job (Kafka to HBase Averages + Prediction to ES)") diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala index d03c8c4..3f984d2 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala @@ -5,7 +5,7 @@ import com.tdunning.math.stats.TDigest import org.apache.flink.api.common.functions.AggregateFunction class MedianAggregateWithKey extends AggregateFunction[AverageWithKey, MedianLoadWithKey, MedianLoadWithKey]{ - override def createAccumulator() = MedianLoadWithKey(SensorKeyObject(-1), TDigest.createDigest(100)) + override def createAccumulator() = new MedianLoadWithKey(SensorKeyObject(-1), TDigest.createDigest(100)) override def add(value: AverageWithKey, accumulator: MedianLoadWithKey) = accumulator.add(averageWithKey = value) From 8bf3322613bda3ce02d4e2318e1a7131a8fbd10c Mon Sep 17 00:00:00 2001 From: Bhaskar Divya Date: Tue, 13 Feb 2018 10:55:51 +0100 Subject: [PATCH 20/32] Adding slice model --- .../projects/smartgrid/model/Slice.scala | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala new file mode 100644 index 0000000..d3ca879 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala @@ -0,0 +1,32 @@ +package com.bhaskardivya.projects.smartgrid.model + +import org.apache.flink.streaming.api.windowing.time.Time + +/** + * Class to represent a slice index of a size (aka window size) + * + * @param size Window size + * @param timestamp Event timestamp from the record + */ +case class Slice(size: Time)(timestamp: Long) { + + lazy val size_in_seconds = size.toMilliseconds / 1000 + + // This is the base timestamp ... the epoch of the world with flink + val base = 0L // TODO: currently set 0 to align with Unix Epoch + + lazy val ts_start = timestamp - (timestamp % size_in_seconds) + + lazy val ts_stop = ts_start + size_in_seconds - 1 + + lazy val i = ( ts_start - base ) / size_in_seconds + + def num_slices_in(hr: Long) = (hr * 60 * 60) / size_in_seconds + + val num_slices_in_day = num_slices_in(24) + + lazy val j = { + val k = num_slices_in_day + (1L to ((i+2)/k)).map(n => (i+2 - n*k)) + } +} From a45ea1c152b04e5681d947e357f6642689da09b3 Mon Sep 17 00:00:00 2001 From: koldbyte Date: Thu, 15 Feb 2018 19:12:36 +0530 Subject: [PATCH 21/32] Adding initial version of pipeline with TDigest --- .idea/hydra.xml | 9 + smartgrid.iml | 20 ++ .../SensorEventHouseAveragingJob.scala | 7 +- .../base/SensorEventAveragingJobBase.scala | 214 ++++-------------- .../smartgrid/job/PlugAveragingJob.scala | 9 +- .../projects/smartgrid/model/Average.scala | 11 + .../smartgrid/model/AverageWithKey.scala | 27 ++- .../projects/smartgrid/model/Constants.scala | 4 + .../smartgrid/model/MedianLoadWithKey.scala | 19 +- .../projects/smartgrid/model/Prediction.scala | 3 +- .../smartgrid/model/Prediction2.scala | 48 ++++ .../smartgrid/model/SensorKeyObject.scala | 15 +- .../projects/smartgrid/model/Slice.scala | 20 +- .../operators/AverageAggregateWithKey.scala | 26 --- .../operators/AverageWithKeyReducer.scala | 11 + .../smartgrid/operators/EnrichMapper.scala | 47 ++++ .../operators/MedianAggregateWithKey.scala | 2 + .../operators/MedianWithKeyMapper.scala | 38 ++++ .../operators/PredictionFunction.scala | 2 + .../sinks/PredictionElasticSearchSink.scala | 14 +- .../PredictionElasticSearchSinkFunction.scala | 8 +- .../smartgrid/sources/HBaseMedianSource.scala | 8 +- .../smartgrid/sources/KafkaSource.scala | 11 +- 23 files changed, 321 insertions(+), 252 deletions(-) create mode 100644 .idea/hydra.xml create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala delete mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/operators/AverageAggregateWithKey.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/operators/AverageWithKeyReducer.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala diff --git a/.idea/hydra.xml b/.idea/hydra.xml new file mode 100644 index 0000000..123e89c --- /dev/null +++ b/.idea/hydra.xml @@ -0,0 +1,9 @@ + + + + + \ No newline at end of file diff --git a/smartgrid.iml b/smartgrid.iml index 94619dc..e75d68b 100644 --- a/smartgrid.iml +++ b/smartgrid.iml @@ -10,6 +10,26 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob.scala index 7d6a64f..4af224c 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob.scala @@ -1,8 +1,9 @@ +/* package com.bhaskardivya.projects.smartgrid.archived import com.bhaskardivya.projects.smartgrid.base.AbstractKeyGetter import com.bhaskardivya.projects.smartgrid.model._ -import com.bhaskardivya.projects.smartgrid.operators.AverageAggregateWithKey +import com.bhaskardivya.projects.smartgrid.operators.AverageWithKeyReducer import com.bhaskardivya.projects.smartgrid.pipeline._ import com.bhaskardivya.projects.smartgrid.sinks.HBaseOutputFormatAverageWithKey import org.apache.flink.api.java.utils.ParameterTool @@ -34,7 +35,7 @@ object SensorEventHouseAveragingJob { val windowed1min = withTimestamps .keyBy(keyGetter(_)) .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageAggregateWithKey(keyGetter)) + .aggregate(new AverageWithKeyReducer(keyGetter)) .name("Average for 1 min Window") val windowed5min = windowed1min @@ -95,4 +96,4 @@ object SensorEventHouseAveragingJob { } } -} \ No newline at end of file +}*/ diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index a7e1483..48eb00b 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -3,14 +3,15 @@ package com.bhaskardivya.projects.smartgrid.base import java.io.File import com.bhaskardivya.projects.smartgrid.model._ -import com.bhaskardivya.projects.smartgrid.operators.{AverageAggregateWithKey, MedianAggregateWithKey, PredictionFunction} +import com.bhaskardivya.projects.smartgrid.operators._ import com.bhaskardivya.projects.smartgrid.pipeline._ -import com.bhaskardivya.projects.smartgrid.sinks.{PredictionElasticSearchSink, SensorEventElasticSearchSink} +import com.bhaskardivya.projects.smartgrid.sinks.PredictionElasticSearchSink +import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.core.fs.FileSystem import org.apache.flink.streaming.api.TimeCharacteristic import org.apache.flink.streaming.api.scala._ -import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows +import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, TumblingEventTimeWindows} import org.apache.flink.streaming.api.windowing.time.Time /** @@ -62,7 +63,6 @@ abstract class SensorEventAveragingJobBase extends Serializable { new File(LOG_DIR).mkdir() new File(LOG_DIR + "/input/").mkdir() new File(LOG_DIR + "/output_avg/").mkdir() - new File(LOG_DIR + "/output_enriched/").mkdir() new File(LOG_DIR + "/output_prediction/").mkdir() } catch { case e: Exception => println("Directories already created") @@ -78,192 +78,54 @@ abstract class SensorEventAveragingJobBase extends Serializable { env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) // Get the stream according to params - val stream: DataStream[SensorEvent] = SourceChooser.from(env, params).name("Sensor Source") + val rawStream: DataStream[SensorEvent] = SourceChooser.from(env, params).name("Sensor Source with Timestamp") - val withTimestamps = stream - .assignTimestampsAndWatermarks(SensorEvent.tsAssigner()) - .name("Source with Timestamp") + //TODO: create a Global Window for work values which will output the missing load values // Create a stream with sum according to the key specified - val initializedFlow = initializeFlow(withTimestamps) + val initializedFlow = initializeFlow(rawStream) + + implicit val typeInfoAverageWithKey = TypeInformation.of(classOf[AverageWithKey]) // Streams for each window duration for the average val avg_windowed1min = initializedFlow - .keyBy(keyGetter(_)) - .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageAggregateWithKey(keyGetter)) - .name(getKeyName() + " Average for 1 min Window") - - val avg_windowed5min = avg_windowed1min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(AverageWithKey.reducer) - .name(getKeyName() + " Average for 5 min Window") - - val avg_windowed15min = avg_windowed5min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(AverageWithKey.reducer) - .name(getKeyName() + " Average for 15 min Window") - - val avg_windowed60min = avg_windowed15min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(AverageWithKey.reducer) - .name(getKeyName() + " Average for 60 min Window") - - val avg_windowed120min = avg_windowed60min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(AverageWithKey.reducer) - .name(getKeyName() + " Average for 120 min Window") - - // Write to file for debug - val debug = params.has("debug") - if (debug) { - initializedFlow.writeAsCsv(LOG_DIR + "/input/initializedFlow.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Initialized Flow") - avg_windowed1min.writeAsCsv(LOG_DIR + "/output_avg/windowed1min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Avg windowed 1 Min") - avg_windowed5min.writeAsCsv(LOG_DIR + "/output_avg/windowed5min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Avg windowed 5 Min") - avg_windowed15min.writeAsCsv(LOG_DIR + "/output_avg/windowed15min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Avg windowed 15 Min") - avg_windowed60min.writeAsCsv(LOG_DIR + "/output_avg/windowed60min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Avg windowed 60 Min") - avg_windowed120min.writeAsCsv(LOG_DIR + "/output_avg/windowed120min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Avg windowed 120 Min") - } - - val median_windowed1min = avg_windowed1min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new MedianAggregateWithKey) - .name(getKeyName() + " Median for 1 min Window") - - val median_windowed5min = median_windowed1min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(MedianLoadWithKey.reducer) - .name(getKeyName() + " Median for 5 min Window") - - val median_windowed15min = median_windowed5min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(MedianLoadWithKey.reducer) - .name(getKeyName() + " Median for 15 min Window") - - val median_windowed60min = median_windowed15min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(MedianLoadWithKey.reducer) - .name(getKeyName() + " Median for 60 min Window") - - val median_windowed120min = median_windowed60min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(MedianLoadWithKey.reducer) - .name(getKeyName() + " Median for 120 min Window") - - val timeout = params.getLong("timeout", 3000000L) // 300 seconds timeout - - // Enrich the average calculation with the median value for each stream of different window duration - val windowed1min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed1min - .join(median_windowed1min) - .where(_.key).equalTo(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) - .apply((_, _)) - .startNewChain() - .name(getKeyName() + " Enriched - 1 Min Window") - - val windowed5min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed5min - .join(median_windowed5min) - .where(_.key).equalTo(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) - .apply((_, _)) - .startNewChain() - .name(getKeyName() + " Enriched - 5 Min Window") - - val windowed15min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed15min - .join(median_windowed15min) - .where(_.key).equalTo(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) - .apply((_, _)) - .startNewChain() - .name(getKeyName() + " Enriched - 15 Min Window") - - val windowed60min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed60min - .join(median_windowed60min) - .where(_.key).equalTo(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) - .apply((_, _)) - .startNewChain() - .name(getKeyName() + " Enriched - 60 Min Window") - - val windowed120min_enriched: DataStream[(AverageWithKey, MedianLoadWithKey)] = avg_windowed120min - .join(median_windowed120min) - .where(_.key).equalTo(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) - .apply((_, _)) - .startNewChain() - .name(getKeyName() + " Enriched - 120 Min Window") - - if (debug) { - windowed1min_enriched.writeAsCsv(LOG_DIR + "/output_enriched/windowed1min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Enriched 1 Min Window") - windowed5min_enriched.writeAsCsv(LOG_DIR + "/output_enriched/windowed5min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Enriched 5 Min Window") - windowed15min_enriched.writeAsCsv(LOG_DIR + "/output_enriched/windowed15min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Enriched 15 Min Window") - windowed60min_enriched.writeAsCsv(LOG_DIR + "/output_enriched/windowed60min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Enriched 60 Min Window") - windowed120min_enriched.writeAsCsv(LOG_DIR + "/output_enriched/windowed120min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Enriched 120 Min Window") - } - - // Create the predicted value streams - val windowed1min_prediction = windowed1min_enriched - .map(new PredictionFunction(entity = getKeyName(), slidingWindow = Time.minutes(1).toMilliseconds)) - - val windowed5min_prediction = windowed5min_enriched - .map(new PredictionFunction(entity = getKeyName(), slidingWindow = Time.minutes(5).toMilliseconds)) - - val windowed15min_prediction = windowed15min_enriched - .map(new PredictionFunction(entity = getKeyName(), slidingWindow = Time.minutes(15).toMilliseconds)) - - val windowed60min_prediction = windowed60min_enriched - .map(new PredictionFunction(entity = getKeyName(), slidingWindow = Time.minutes(60).toMilliseconds)) - - val windowed120min_prediction = windowed120min_enriched - .map(new PredictionFunction(entity = getKeyName(), slidingWindow = Time.minutes(120).toMilliseconds)) + .map(e => AverageWithKey(keyGetter(e), Slice(Time.minutes(1))(e.timestamp), Average(e.value, 1))) + .keyBy(e => (e.key, e.slice.start_time_of_day)) + .window(TumblingEventTimeWindows.of(Time.minutes(1))) + .reduce(new AverageWithKeyReducer) + .name(getKeyName() + " Average for 1 min Tumbling Window") + + // Store median as operator state + avg_windowed1min + .keyBy(e => (e.key, e.slice.start_time_of_day)) + .flatMap(new MedianWithKeyMapper) + .name(getKeyName() + " Median state for 1 min Tumbling Window") + + implicit val typeInfoPrediction2 = TypeInformation.of(classOf[Prediction2]) + + val windowed1min_prediction = avg_windowed1min + .keyBy(e => (e.key, e.slice.start_time_of_day)) + .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(30))) + .reduce(new AverageWithKeyReducer) + .keyBy(e => (e.key, e.slice.start_time_of_day)) + .flatMap(new EnrichMapper) + .name(getKeyName() + " Prediction values for 1 min") // Sink the Predicted value streams to Elasticsearch windowed1min_prediction .addSink(PredictionElasticSearchSink(params,Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_1MIN)) .name(getKeyName() + " Prediction Sink - ES - 1 min Window") - windowed5min_prediction - .addSink(PredictionElasticSearchSink(params,Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_5MIN)) - .name(getKeyName() + " Prediction Sink - ES - 5 min Window") - - windowed15min_prediction - .addSink(PredictionElasticSearchSink(params,Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_15MIN)) - .name(getKeyName() + " Prediction Sink - ES - 15 min Window") - - windowed60min_prediction - .addSink(PredictionElasticSearchSink(params,Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_60MIN)) - .name(getKeyName() + " Prediction Sink - ES - 60 min Window") - - windowed120min_prediction - .addSink(PredictionElasticSearchSink(params,Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_120MIN)) - .name(getKeyName() + " Prediction Sink - ES - 120 min Window") - + // Write to file for debug + val debug = params.has("debug") if (debug) { - windowed1min_prediction.writeAsCsv(LOG_DIR + "/output_prediction/windowed1min.csv", FileSystem.WriteMode.OVERWRITE) - windowed5min_prediction.writeAsCsv(LOG_DIR + "/output_prediction/windowed5min.csv", FileSystem.WriteMode.OVERWRITE) - windowed15min_prediction.writeAsCsv(LOG_DIR + "/output_prediction/windowed15min.csv", FileSystem.WriteMode.OVERWRITE) - windowed60min_prediction.writeAsCsv(LOG_DIR + "/output_prediction/windowed60min.csv", FileSystem.WriteMode.OVERWRITE) - windowed120min_prediction.writeAsCsv(LOG_DIR + "/output_prediction/windowed120min.csv", FileSystem.WriteMode.OVERWRITE) + avg_windowed1min.writeAsText(LOG_DIR + "/output_avg/windowed1min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Avg 1 Min Window") + windowed1min_prediction.writeAsText(LOG_DIR + "/output_prediction/windowed1min.csv", FileSystem.WriteMode.OVERWRITE) + /*initializedFlow + .addSink(SensorEventElasticSearchSink(params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) + .name("Sensor Raw to ES")*/ } - //Sink the original feed into ES as well - if(debug) { - initializedFlow - .addSink(SensorEventElasticSearchSink(params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) - .name("Sensor Raw to ES") - } - - env.execute("Sensor Event" + getKeyName() + " Prediction Job (Kafka to HBase Averages + Prediction to ES)") - + env.execute("Sensor Event" + getKeyName() + " Prediction Job") } - } \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob.scala index 189ba10..57bd49f 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob.scala @@ -13,10 +13,11 @@ object PlugAveragingJob extends SensorEventAveragingJobBase with Serializable{ override def getTargetColumnFamily(): String = Constants.PLUG_CF override def initializeFlow(dataStream: DataStream[SensorEvent]) = { - // Sum all the multiple values with the same timestamp for a given plug of a given house + // De-duplicate values with the same timestamp for a given plug of a given house dataStream - .keyBy("house_id", "plug_id", "timestamp") - .sum("value") - .name("Aggregated by Plug Data") + .filter(_.property == Constants.PROPERTY_LOAD) + .keyBy("house_id", "household_id" , "plug_id", "timestamp") + .reduce((a,b) => b) + .name("De-duplicated Raw stream") } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Average.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Average.scala index 43edfa9..4c38f65 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Average.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Average.scala @@ -1,5 +1,12 @@ package com.bhaskardivya.projects.smartgrid.model +import org.apache.flink.streaming.api.scala._ + +/** + * Class to hold the sum and count + * @param sum Aggregation of load values + * @param count Number of load values + */ case class Average(var sum: Double, var count: Long){ def add(that: Average): Average = { Average(this.sum + that.sum, this.count + that.count) @@ -8,4 +15,8 @@ case class Average(var sum: Double, var count: Long){ def +(that: Average): Average = { this.add(that) } + + def avg: Double = { + sum/count + } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/AverageWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/AverageWithKey.scala index 154f69e..ef176be 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/AverageWithKey.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/AverageWithKey.scala @@ -1,27 +1,28 @@ package com.bhaskardivya.projects.smartgrid.model import com.gotometrics.orderly.DoubleWritableRowKey +import org.apache.flink.streaming.api.windowing.time.Time import org.apache.hadoop.io.DoubleWritable +import org.apache.flink.streaming.api.scala._ -case class AverageWithKey(var key: SensorKeyObject, var sum: Double, var count: Long, eventTimestamp: Long = Constants.KEY_NO_VALUE){ - val averageValue: Double = sum / count +case class AverageWithKey(var key: SensorKeyObject, slice: Slice, average: Average){ + def averageValue = average.avg def add(that: AverageWithKey): AverageWithKey = { - // To ensure that we do no propagate "-1" (Constants.KEY_NO_VALUE) (initial accumulator value) - if(this.key.house_id > that.key.house_id) - AverageWithKey(this.key, this.sum + that.sum, this.count + that.count, Math.max(this.eventTimestamp, that.eventTimestamp)) - else - AverageWithKey(that.key, this.sum + that.sum, this.count + that.count, Math.max(this.eventTimestamp, that.eventTimestamp)) + AverageWithKey(this.key, this.slice, this.average + that.average) } def +(that: AverageWithKey): AverageWithKey = { this.add(that) } + /* HBase related functions start */ + @deprecated def toHBaseColumnName(): String = { - key.toColumnString() + key.toColumnString } + @deprecated def bytesRowKey(): Array[Byte] = { // Original Object that will be serialized val rowkeyVal: DoubleWritable = new DoubleWritable(toHBaseColumnValue().asInstanceOf[java.lang.Double]) @@ -34,17 +35,19 @@ case class AverageWithKey(var key: SensorKeyObject, var sum: Double, var count: * Otherwise, DoubleColumnInterpreter will not be able to read column value * @return */ + @deprecated def toHBaseColumnValue(): Double = { //this.sum + Constants.DELIMITER + this.count averageValue } + @deprecated def toHBaseLongColumnValue(): Long = { //this.sum + Constants.DELIMITER + this.count (averageValue * 1000).toLong } - def fromHBaseColumnValue(column: String, value: String): AverageWithKey = { + /*def fromHBaseColumnValue(column: String, value: String): AverageWithKey = { //val fields = value.split(Constants.DELIMITER) //AverageWithKey(column, fields(0).toDouble, fields(2).toLong) var sumValue = 0.0 @@ -54,11 +57,15 @@ case class AverageWithKey(var key: SensorKeyObject, var sum: Double, var count: case e: Exception => sumValue = 0.0 } AverageWithKey(SensorKeyObject.fromColumnString(column), sumValue, 1, Constants.KEY_NO_VALUE) - } + }*/ } object AverageWithKey { def reducer = { (a: AverageWithKey, b: AverageWithKey) => a+b } + + def getInitialValue(): AverageWithKey = { + new AverageWithKey(SensorKeyObject(-1), Slice(Time.milliseconds(-1))(-1), Average(0.0, 0) ) + } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala index 31e3755..facda18 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala @@ -29,4 +29,8 @@ object Constants { // Sensor Key Object val KEY_NO_VALUE: Long = -1 + val PROPERTY_LOAD: Int = 1 + + // TDigest + val TDIGEST_COMPRESSION = 100 } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/MedianLoadWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/MedianLoadWithKey.scala index 114a186..953820b 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/MedianLoadWithKey.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/MedianLoadWithKey.scala @@ -2,18 +2,16 @@ package com.bhaskardivya.projects.smartgrid.model import com.tdunning.math.stats.TDigest -case class MedianLoadWithKey(var key: SensorKeyObject, totalDigest: TDigest){ - val medianLoad : Double = totalDigest.quantile(0.5) +case class MedianLoadWithKey(var key: SensorKeyObject, slice: Slice, digest: TDigest){ + def medianLoad : Double = digest.quantile(0.5) + def add(averageWithKey: AverageWithKey) = { - this.totalDigest.add(averageWithKey.averageValue) + this.digest.add(averageWithKey.averageValue) this } def add(other: MedianLoadWithKey) = { - this.totalDigest.add(other.totalDigest) - if(this.key.house_id < other.key.house_id){ - this.key = other.key - } + this.digest.add(other.digest) this } @@ -24,4 +22,11 @@ object MedianLoadWithKey { def reducer = { (a: MedianLoadWithKey, b: MedianLoadWithKey) => a+b } + + def fromAverageWithKey(averageWithKey: AverageWithKey) = { + val tDigest: TDigest = TDigest.createDigest(Constants.TDIGEST_COMPRESSION) + tDigest.add(averageWithKey.averageValue) + + MedianLoadWithKey(averageWithKey.key, averageWithKey.slice, tDigest) + } } \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala index 79ab66b..0617f03 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala @@ -1,3 +1,4 @@ +/* package com.bhaskardivya.projects.smartgrid.model import java.util.Date @@ -42,4 +43,4 @@ case class Prediction(averageWithKey: AverageWithKey, medianLoad: MedianLoadWith json } -} \ No newline at end of file +}*/ diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala new file mode 100644 index 0000000..3c10cdc --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala @@ -0,0 +1,48 @@ +package com.bhaskardivya.projects.smartgrid.model + +import java.util.Date + +import org.apache.sling.commons.json.JSONObject + +case class Prediction2(var averageWithKey: AverageWithKey, var medianLoad: MedianLoad, var predictedLoad: Double) { + + def toJSONString(): String = { + toJSON().toString() + } + + def toJSON(): JSONObject = { + //main object + val json = new JSONObject() + + //averageWithKey object + val averageWithKeyJSON = averageWithKey.key.toJSON + averageWithKeyJSON.put("sum", averageWithKey.average.sum) + averageWithKeyJSON.put("count", averageWithKey.average.count) + averageWithKeyJSON.put("avg", averageWithKey.averageValue) + averageWithKeyJSON.put("eventTimestamp", averageWithKey.slice.timestamp) + json.put("averageWithKey", averageWithKeyJSON) + + //medianLoad + val medianLoadJSON = new JSONObject() + medianLoadJSON.put("load", medianLoad.load.formatted("%.3f").toFloat) + json.put("medianLoad", medianLoadJSON) + + //key or entity + json.put("house_id", averageWithKey.key.house_id) + json.put("household_id", averageWithKey.key.household_id) + json.put("plug_id", averageWithKey.key.plug_id) + + //sliding window duration + json.put("slidingWindowDuration", averageWithKey.slice.size.toMilliseconds) + json.put("slice-start", averageWithKey.slice.ts_start) + json.put("slice-stop", averageWithKey.slice.ts_stop) + + //Predicted value + json.put("predictedValue", predictedLoad) + + // Current Time + json.put("current-timestamp", System.currentTimeMillis) + + json + } +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorKeyObject.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorKeyObject.scala index 1fbce6d..07bc3c9 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorKeyObject.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorKeyObject.scala @@ -1,12 +1,13 @@ package com.bhaskardivya.projects.smartgrid.model import org.apache.sling.commons.json.JSONObject +import org.apache.flink.streaming.api.scala._ class SensorKeyObject(var house_id: Long, var household_id: Long, var plug_id: Long) extends Serializable{ - override def toString: String = this.toColumnString() + override def toString: String = this.toColumnString - def toColumnString(): String = { + def toColumnString: String = { val str: StringBuilder = new StringBuilder if(this.house_id > Constants.KEY_NO_VALUE) @@ -23,11 +24,9 @@ class SensorKeyObject(var house_id: Long, var household_id: Long, var plug_id: L str.toString() } - def toJSONString(): String = { - toJSON().toString - } + def toJSONString: String = toJSON.toString - def toJSON(): JSONObject = { + def toJSON: JSONObject = { val json: JSONObject = new JSONObject() if(this.house_id > Constants.KEY_NO_VALUE) @@ -62,11 +61,11 @@ class SensorKeyObject(var house_id: Long, var household_id: Long, var plug_id: L object SensorKeyObject { def apply(house_id: Long): SensorKeyObject = { - new SensorKeyObject(house_id, Constants.KEY_NO_VALUE, Constants.KEY_NO_VALUE) + SensorKeyObject(house_id, Constants.KEY_NO_VALUE, Constants.KEY_NO_VALUE) } def apply(house_id: Long, household_id: Long, plug_id: Long): SensorKeyObject ={ - new SensorKeyObject(house_id, household_id, plug_id) + SensorKeyObject(house_id, household_id, plug_id) } def fromColumnString(columnValue: String): SensorKeyObject = { diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala index d3ca879..d37504b 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala @@ -1,14 +1,15 @@ package com.bhaskardivya.projects.smartgrid.model import org.apache.flink.streaming.api.windowing.time.Time +import org.apache.flink.streaming.api.scala._ /** * Class to represent a slice index of a size (aka window size) * * @param size Window size - * @param timestamp Event timestamp from the record + * @param timestamp Event timestamp from the record in seconds */ -case class Slice(size: Time)(timestamp: Long) { +case class Slice(val size: Time)(var timestamp: Long) { lazy val size_in_seconds = size.toMilliseconds / 1000 @@ -29,4 +30,19 @@ case class Slice(size: Time)(timestamp: Long) { val k = num_slices_in_day (1L to ((i+2)/k)).map(n => (i+2 - n*k)) } + + lazy val start_time_of_day = ts_start % (24*60*60) + lazy val stop_time_of_day = ts_stop % (24*60*60) + + override def toString : String = { + val str: StringBuilder = new StringBuilder + + str.append(size.toMilliseconds.toString) + str.append(Constants.DELIMITER) + str.append(ts_start.toString) + str.append(Constants.DELIMITER) + str.append(ts_stop.toString) + + str.toString() + } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/AverageAggregateWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/AverageAggregateWithKey.scala deleted file mode 100644 index dbace07..0000000 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/AverageAggregateWithKey.scala +++ /dev/null @@ -1,26 +0,0 @@ -package com.bhaskardivya.projects.smartgrid.operators - -import com.bhaskardivya.projects.smartgrid.base.AbstractKeyGetter -import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, Constants, SensorEvent, SensorKeyObject} -import org.apache.flink.api.common.functions.AggregateFunction - -/** - * The accumulator is used to keep a running sum and a count. The [getResult] method - * computes the average. - */ -class AverageAggregateWithKey(keyGetter: AbstractKeyGetter) extends AggregateFunction[SensorEvent, AverageWithKey, AverageWithKey]{ - - override def createAccumulator(): AverageWithKey = AverageWithKey(SensorKeyObject(-1), 0.0, 0L, Constants.KEY_NO_VALUE) - - override def add(value: SensorEvent, accumulator: AverageWithKey): AverageWithKey = - AverageWithKey( - keyGetter(value), - accumulator.sum + value.value, - accumulator.count + 1L, - value.timestamp - ) - - override def getResult(accumulator: AverageWithKey): AverageWithKey = accumulator - - override def merge(a: AverageWithKey, b: AverageWithKey): AverageWithKey = a + b -} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/AverageWithKeyReducer.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/AverageWithKeyReducer.scala new file mode 100644 index 0000000..0430f3e --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/AverageWithKeyReducer.scala @@ -0,0 +1,11 @@ +package com.bhaskardivya.projects.smartgrid.operators + +import com.bhaskardivya.projects.smartgrid.model._ +import org.apache.flink.api.common.functions.ReduceFunction + +/** + * The reducer function for AverageWithKey object having same keys + */ +class AverageWithKeyReducer extends ReduceFunction[AverageWithKey]{ + override def reduce(a: AverageWithKey, b: AverageWithKey): AverageWithKey = AverageWithKey.reducer(a,b) +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala new file mode 100644 index 0000000..d876142 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala @@ -0,0 +1,47 @@ +package com.bhaskardivya.projects.smartgrid.operators + +import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoad, MedianLoadWithKey, Prediction2} +import com.tdunning.math.stats.TDigest +import org.apache.flink.api.common.functions.{RichFlatMapFunction, RichReduceFunction} +import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor} +import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFields +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.scala.createTypeInformation +import org.apache.flink.util.Collector + +/** + * Rich Mapper function to get the predicted load values from the median state and Average Loads + */ +class EnrichMapper extends RichFlatMapFunction[AverageWithKey, Prediction2]{ + + private var digest: ValueState[TDigest] = _ + private var prediction2: Prediction2 = _ + + override def open(parameters: Configuration): Unit = { + val descriptor = new ValueStateDescriptor[TDigest]("median", createTypeInformation[TDigest]) + digest = getRuntimeContext.getState(descriptor) + } + + override def flatMap(value: AverageWithKey, out: Collector[Prediction2]): Unit = { + val currentDigest = digest.value() + + val medianLoad = + if(currentDigest == null) { + value.averageValue + }else{ + currentDigest.quantile(0.5) + } + + val prediction = (value.averageValue + medianLoad) / 2.0 + + if (prediction2 == null) { + prediction2 = Prediction2(value, MedianLoad(medianLoad), prediction) + } else { + prediction2.averageWithKey = value + prediction2.medianLoad = MedianLoad(medianLoad) + prediction2.predictedLoad = prediction + } + + out.collect(prediction2) + } +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala index 3f984d2..ebe47d3 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianAggregateWithKey.scala @@ -1,3 +1,4 @@ +/* package com.bhaskardivya.projects.smartgrid.operators import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoadWithKey, SensorKeyObject} @@ -13,3 +14,4 @@ class MedianAggregateWithKey extends AggregateFunction[AverageWithKey, MedianLoa override def merge(a: MedianLoadWithKey, b: MedianLoadWithKey) = a+b } +*/ diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala new file mode 100644 index 0000000..bfca9d5 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala @@ -0,0 +1,38 @@ +package com.bhaskardivya.projects.smartgrid.operators + +import com.bhaskardivya.projects.smartgrid.model._ +import com.tdunning.math.stats.TDigest +import org.apache.flink.api.common.functions.RichFlatMapFunction +import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor} +import org.apache.flink.configuration.Configuration +import org.apache.flink.streaming.api.scala._ +import org.apache.flink.util.Collector + +/** + * A Rich flatMap function to keep a running TDigest object for each key and each starting minutes of a day. + */ +class MedianWithKeyMapper extends RichFlatMapFunction[AverageWithKey, AverageWithKey]{ + + private var digest: ValueState[TDigest] = _ + + override def open(parameters: Configuration): Unit = { + val descriptor = new ValueStateDescriptor[TDigest]("median", createTypeInformation[TDigest]) + descriptor.setQueryable("median-query") + digest = getRuntimeContext.getState(descriptor) + } + + override def flatMap(value: AverageWithKey, out: Collector[AverageWithKey]): Unit = { + + var currentDigest = digest.value() + + if(currentDigest == null){ + currentDigest = TDigest.createDigest(Constants.TDIGEST_COMPRESSION) + } + + currentDigest.add(value.averageValue) + + digest.update(currentDigest) + + out.collect(value) + } +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/PredictionFunction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/PredictionFunction.scala index 51ce4dd..072c023 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/PredictionFunction.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/PredictionFunction.scala @@ -1,3 +1,4 @@ +/* package com.bhaskardivya.projects.smartgrid.operators import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoad, MedianLoadWithKey, Prediction} @@ -19,3 +20,4 @@ class PredictionFunction(entity: String, slidingWindow: Long) extends MapFunctio Prediction(value._1, value._2, entity, slidingWindow, predictedValue) } } +*/ diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSink.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSink.scala index 75f807b..b289464 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSink.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSink.scala @@ -2,29 +2,33 @@ package com.bhaskardivya.projects.smartgrid.sinks import java.net.{InetAddress, InetSocketAddress} -import com.bhaskardivya.projects.smartgrid.model.Prediction +import com.bhaskardivya.projects.smartgrid.model.Prediction2 import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink object PredictionElasticSearchSink { - def apply(params: ParameterTool, esIndex: String, esIndexType: String): ElasticsearchSink[Prediction] ={ + def apply(params: ParameterTool, esIndex: String, esIndexType: String): ElasticsearchSink[Prediction2] ={ //Initialize Elastic search configuration val esClusterLocationIP = params.get("es.cluster.ip", "192.168.99.100") val esClusterLocationPort = params.getInt("es.cluster.port", 9300) + val esFlushMaxActions = params.getInt("bulk.flush.max.actions", 100) + val config = new java.util.HashMap[String, String] config.put("cluster.name", params.get("es.cluster.name", "docker-cluster")) // This instructs the sink to emit after every element, otherwise they would be buffered - config.put("bulk.flush.max.actions", "1") + config.put("bulk.flush.max.actions", esFlushMaxActions.toString) val transportAddresses = new java.util.ArrayList[InetSocketAddress] -/* + + /* transportAddresses.add(new InetSocketAddress(InetAddress.getByName("127.0.0.1"), esClusterLocationPort)) transportAddresses.add(new InetSocketAddress(InetAddress.getByName("172.17.0.2"), esClusterLocationPort)) transportAddresses.add(new InetSocketAddress(InetAddress.getByName("0.0.0.0"), esClusterLocationPort)) transportAddresses.add(new InetSocketAddress(InetAddress.getByName("localhost"), esClusterLocationPort)) -*/ transportAddresses.add(new InetSocketAddress(InetAddress.getByName("192.168.99.100"), esClusterLocationPort)) + */ + transportAddresses.add(new InetSocketAddress(InetAddress.getByName(esClusterLocationIP), esClusterLocationPort)) new ElasticsearchSink(config, transportAddresses, new PredictionElasticSearchSinkFunction(esIndex, esIndexType)) diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSinkFunction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSinkFunction.scala index 70ffb92..f421604 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSinkFunction.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSinkFunction.scala @@ -1,6 +1,6 @@ package com.bhaskardivya.projects.smartgrid.sinks -import com.bhaskardivya.projects.smartgrid.model.Prediction +import com.bhaskardivya.projects.smartgrid.model.Prediction2 import org.apache.flink.api.common.functions.RuntimeContext import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer} import org.elasticsearch.action.ActionRequest @@ -11,9 +11,9 @@ import org.elasticsearch.client.Requests * @param esIndex ElasticSearch Index name * @param esType ElasticSearch Index type */ -class PredictionElasticSearchSinkFunction(esIndex: String, esType: String) extends ElasticsearchSinkFunction[Prediction]{ +class PredictionElasticSearchSinkFunction(esIndex: String, esType: String) extends ElasticsearchSinkFunction[Prediction2]{ - def createIndexRequest(element: Prediction): ActionRequest = { + def createIndexRequest(element: Prediction2): ActionRequest = { val json = element.toJSONString() Requests.indexRequest @@ -22,7 +22,7 @@ class PredictionElasticSearchSinkFunction(esIndex: String, esType: String) exten .source(json) } - override def process(element: Prediction, ctx: RuntimeContext, indexer: RequestIndexer): Unit = { + override def process(element: Prediction2, ctx: RuntimeContext, indexer: RequestIndexer): Unit = { indexer.add(createIndexRequest(element)) } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/HBaseMedianSource.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/HBaseMedianSource.scala index 2e90489..da25d4a 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/HBaseMedianSource.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/HBaseMedianSource.scala @@ -28,23 +28,23 @@ object HBaseMedianSource extends Serializable{ val scan = new Scan() if (avg != null && avg.key != null) { println("HBaseMedianSource | getMedian | ColumnQualifier is set") - scan.addColumn(columnFamily.getBytes(), avg.key.toColumnString().getBytes()) + scan.addColumn(columnFamily.getBytes(), avg.key.toColumnString.getBytes) } else { scan.addFamily(columnFamily.getBytes()) } - println("HBaseMedianSource | getMedian | Fetching Median " + table + " | " + columnFamily + " | " + avg.key.toColumnString()) + println("HBaseMedianSource | getMedian | Fetching Median " + table + " | " + columnFamily + " | " + avg.key.toColumnString) //println("BD | " + conf.toString) try { val aggregationClient: AggregationClient = new AggregationClient(conf) val median = aggregationClient.median(TableName.valueOf(table), new DoubleColumnInterpreter(), scan) - println("HBaseMedianSource | getMedian | Median Fetched " + table + " | " + columnFamily + " | " + avg.key.toColumnString()) + println("HBaseMedianSource | getMedian | Median Fetched " + table + " | " + columnFamily + " | " + avg.key.toColumnString) if (median == null) return 0.0 median / 1000.0 }catch { case e: Exception => println("Exception fetching median" + e) }finally { - println("GetMedian Took " + (System.currentTimeMillis() - startTime) + "ms for " + avg.key.toColumnString()) + println("GetMedian Took " + (System.currentTimeMillis() - startTime) + "ms for " + avg.key.toColumnString) } 0.0 } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/KafkaSource.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/KafkaSource.scala index 8daf230..ee2e9c4 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/KafkaSource.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/KafkaSource.scala @@ -19,6 +19,7 @@ class KafkaSource() { val topic = params.get("topic", "test") val server = params.get("bootstrap.servers", "localhost:9092") val groupId = params.get("group.id", "test-" + System.currentTimeMillis().toString) + val startFromEarliest = params.getBoolean("start-from-earliest", true) // Create properties map for Kafka val properties = new Properties() @@ -26,9 +27,15 @@ class KafkaSource() { properties.setProperty("group.id", groupId) val consumer = new FlinkKafkaConsumer011[SensorEvent](topic, SensorEvent.schema(env.getConfig), properties) - consumer.setStartFromEarliest() - LOG.info("Created Source from kafka - Topic: {}, Server: {}, Consumer Group: {}", topic, server, groupId); + // Always start from the earliest kafka offset + if(startFromEarliest) + consumer.setStartFromEarliest() + + // Assign the Timestamp and Watermark from the SensorEvent's timestamp field + consumer.assignTimestampsAndWatermarks(SensorEvent.tsAssigner()) + + LOG.info("Created Source from kafka - Topic: {}, Server: {}, Consumer Group: {}", topic, server, groupId) env.addSource(consumer) } From 32218252c0b826045d2c9f057bafc6821f29f0f7 Mon Sep 17 00:00:00 2001 From: koldbyte Date: Mon, 19 Feb 2018 12:10:48 +0530 Subject: [PATCH 22/32] Changed to use MapState + fixes --- .../base/SensorEventAveragingJobBase.scala | 13 +++++++--- .../smartgrid/model/AverageWithKey.scala | 2 +- .../smartgrid/model/Prediction2.scala | 18 ++++++++----- .../smartgrid/model/SensorKeyObject.scala | 4 +-- .../projects/smartgrid/model/Slice.scala | 26 ++++++++++++------- .../smartgrid/operators/EnrichMapper.scala | 17 ++++++++---- .../operators/MedianWithKeyMapper.scala | 16 ++++++------ 7 files changed, 60 insertions(+), 36 deletions(-) diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index 48eb00b..aa5aa20 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -9,6 +9,7 @@ import com.bhaskardivya.projects.smartgrid.sinks.PredictionElasticSearchSink import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.core.fs.FileSystem +import org.apache.flink.runtime.state.filesystem.FsStateBackend import org.apache.flink.streaming.api.TimeCharacteristic import org.apache.flink.streaming.api.scala._ import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, TumblingEventTimeWindows} @@ -61,6 +62,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { //Create the log dirs try { new File(LOG_DIR).mkdir() + new File(LOG_DIR + "/state/").mkdir() new File(LOG_DIR + "/input/").mkdir() new File(LOG_DIR + "/output_avg/").mkdir() new File(LOG_DIR + "/output_prediction/").mkdir() @@ -77,6 +79,9 @@ abstract class SensorEventAveragingJobBase extends Serializable { // will be using the timestamp from the records env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) + //env.enableCheckpointing(10000) // checkpoint every 10000 msecs + //env.setStateBackend(new FsStateBackend(LOG_DIR +"/state/")) + // Get the stream according to params val rawStream: DataStream[SensorEvent] = SourceChooser.from(env, params).name("Sensor Source with Timestamp") @@ -90,24 +95,24 @@ abstract class SensorEventAveragingJobBase extends Serializable { // Streams for each window duration for the average val avg_windowed1min = initializedFlow .map(e => AverageWithKey(keyGetter(e), Slice(Time.minutes(1))(e.timestamp), Average(e.value, 1))) - .keyBy(e => (e.key, e.slice.start_time_of_day)) + .keyBy(_.key) .window(TumblingEventTimeWindows.of(Time.minutes(1))) .reduce(new AverageWithKeyReducer) .name(getKeyName() + " Average for 1 min Tumbling Window") // Store median as operator state avg_windowed1min - .keyBy(e => (e.key, e.slice.start_time_of_day)) + .keyBy(_.key) .flatMap(new MedianWithKeyMapper) .name(getKeyName() + " Median state for 1 min Tumbling Window") implicit val typeInfoPrediction2 = TypeInformation.of(classOf[Prediction2]) val windowed1min_prediction = avg_windowed1min - .keyBy(e => (e.key, e.slice.start_time_of_day)) + .keyBy(_.key) .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(30))) .reduce(new AverageWithKeyReducer) - .keyBy(e => (e.key, e.slice.start_time_of_day)) + .keyBy(_.key) .flatMap(new EnrichMapper) .name(getKeyName() + " Prediction values for 1 min") diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/AverageWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/AverageWithKey.scala index ef176be..d3bcef7 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/AverageWithKey.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/AverageWithKey.scala @@ -5,7 +5,7 @@ import org.apache.flink.streaming.api.windowing.time.Time import org.apache.hadoop.io.DoubleWritable import org.apache.flink.streaming.api.scala._ -case class AverageWithKey(var key: SensorKeyObject, slice: Slice, average: Average){ +case class AverageWithKey(var key: SensorKeyObject, var slice: Slice, average: Average){ def averageValue = average.avg def add(that: AverageWithKey): AverageWithKey = { diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala index 3c10cdc..45f7181 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala @@ -1,7 +1,5 @@ package com.bhaskardivya.projects.smartgrid.model -import java.util.Date - import org.apache.sling.commons.json.JSONObject case class Prediction2(var averageWithKey: AverageWithKey, var medianLoad: MedianLoad, var predictedLoad: Double) { @@ -16,15 +14,15 @@ case class Prediction2(var averageWithKey: AverageWithKey, var medianLoad: Media //averageWithKey object val averageWithKeyJSON = averageWithKey.key.toJSON - averageWithKeyJSON.put("sum", averageWithKey.average.sum) + averageWithKeyJSON.put("sum", normalise_double(averageWithKey.average.sum)) averageWithKeyJSON.put("count", averageWithKey.average.count) - averageWithKeyJSON.put("avg", averageWithKey.averageValue) + averageWithKeyJSON.put("avg", normalise_double(averageWithKey.averageValue)) averageWithKeyJSON.put("eventTimestamp", averageWithKey.slice.timestamp) json.put("averageWithKey", averageWithKeyJSON) //medianLoad val medianLoadJSON = new JSONObject() - medianLoadJSON.put("load", medianLoad.load.formatted("%.3f").toFloat) + medianLoadJSON.put("load", normalise_double(medianLoad.load)) json.put("medianLoad", medianLoadJSON) //key or entity @@ -38,11 +36,19 @@ case class Prediction2(var averageWithKey: AverageWithKey, var medianLoad: Media json.put("slice-stop", averageWithKey.slice.ts_stop) //Predicted value - json.put("predictedValue", predictedLoad) + json.put("predictedValue", normalise_double(predictedLoad)) // Current Time json.put("current-timestamp", System.currentTimeMillis) json } + + def normalise_double(dbl: Double): Double = { + if(dbl < 1e-6) { + 0.000001 + }else{ + dbl + } + } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorKeyObject.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorKeyObject.scala index 07bc3c9..ffed820 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorKeyObject.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorKeyObject.scala @@ -61,11 +61,11 @@ class SensorKeyObject(var house_id: Long, var household_id: Long, var plug_id: L object SensorKeyObject { def apply(house_id: Long): SensorKeyObject = { - SensorKeyObject(house_id, Constants.KEY_NO_VALUE, Constants.KEY_NO_VALUE) + new SensorKeyObject(house_id, Constants.KEY_NO_VALUE, Constants.KEY_NO_VALUE) } def apply(house_id: Long, household_id: Long, plug_id: Long): SensorKeyObject ={ - SensorKeyObject(house_id, household_id, plug_id) + new SensorKeyObject(house_id, household_id, plug_id) } def fromColumnString(columnValue: String): SensorKeyObject = { diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala index d37504b..84782c5 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala @@ -3,36 +3,42 @@ package com.bhaskardivya.projects.smartgrid.model import org.apache.flink.streaming.api.windowing.time.Time import org.apache.flink.streaming.api.scala._ +import scala.collection.immutable + /** * Class to represent a slice index of a size (aka window size) * * @param size Window size * @param timestamp Event timestamp from the record in seconds */ -case class Slice(val size: Time)(var timestamp: Long) { +case class Slice(size: Time)(var timestamp: Long) { + private val seconds_in_Day = (24*60*60) - lazy val size_in_seconds = size.toMilliseconds / 1000 + def size_in_seconds: Long = size.toMilliseconds / 1000 // This is the base timestamp ... the epoch of the world with flink val base = 0L // TODO: currently set 0 to align with Unix Epoch - lazy val ts_start = timestamp - (timestamp % size_in_seconds) + def ts_start: Long = timestamp - (timestamp % size_in_seconds) - lazy val ts_stop = ts_start + size_in_seconds - 1 + def ts_stop: Long = ts_start + size_in_seconds - 1 - lazy val i = ( ts_start - base ) / size_in_seconds + def i: Long = ( ts_start - base ) / size_in_seconds - def num_slices_in(hr: Long) = (hr * 60 * 60) / size_in_seconds + def num_slices_in(hr: Long): Long = (hr * 60 * 60) / size_in_seconds - val num_slices_in_day = num_slices_in(24) + def num_slices_in_day: Long = num_slices_in(24) - lazy val j = { + def j: immutable.IndexedSeq[Long] = { val k = num_slices_in_day (1L to ((i+2)/k)).map(n => (i+2 - n*k)) } - lazy val start_time_of_day = ts_start % (24*60*60) - lazy val stop_time_of_day = ts_stop % (24*60*60) + def start_time_of_day: Long = ts_start % seconds_in_Day + def stop_time_of_day: Long = ts_stop % seconds_in_Day + + def predicting_for_time_of_day: Long = (start_time_of_day + 2*size_in_seconds) % seconds_in_Day + def predicting_for_slice: Slice = Slice(size)(ts_start + 2*size_in_seconds) override def toString : String = { val str: StringBuilder = new StringBuilder diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala index d876142..f971c45 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala @@ -3,7 +3,7 @@ package com.bhaskardivya.projects.smartgrid.operators import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoad, MedianLoadWithKey, Prediction2} import com.tdunning.math.stats.TDigest import org.apache.flink.api.common.functions.{RichFlatMapFunction, RichReduceFunction} -import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor} +import org.apache.flink.api.common.state.{MapState, MapStateDescriptor, ValueState, ValueStateDescriptor} import org.apache.flink.api.java.functions.FunctionAnnotation.ForwardedFields import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.scala.createTypeInformation @@ -14,17 +14,19 @@ import org.apache.flink.util.Collector */ class EnrichMapper extends RichFlatMapFunction[AverageWithKey, Prediction2]{ - private var digest: ValueState[TDigest] = _ + private var digest: MapState[Long, TDigest] = _ private var prediction2: Prediction2 = _ override def open(parameters: Configuration): Unit = { - val descriptor = new ValueStateDescriptor[TDigest]("median", createTypeInformation[TDigest]) - digest = getRuntimeContext.getState(descriptor) + val descriptor = new MapStateDescriptor[Long, TDigest]("median", createTypeInformation[Long], createTypeInformation[TDigest]) + digest = getRuntimeContext.getMapState(descriptor) } override def flatMap(value: AverageWithKey, out: Collector[Prediction2]): Unit = { - val currentDigest = digest.value() + // Get the TDigest Object for the SensorKey and slice predicting for + val currentDigest = digest.get(value.slice.predicting_for_time_of_day) + // Get the Median Load Value val medianLoad = if(currentDigest == null) { value.averageValue @@ -32,8 +34,13 @@ class EnrichMapper extends RichFlatMapFunction[AverageWithKey, Prediction2]{ currentDigest.quantile(0.5) } + // Calculate the load prediction val prediction = (value.averageValue + medianLoad) / 2.0 + // Update the Slice index for prediction + value.slice = value.slice.predicting_for_slice + + // Create the final Prediction Object to be collected if (prediction2 == null) { prediction2 = Prediction2(value, MedianLoad(medianLoad), prediction) } else { diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala index bfca9d5..7c3d990 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala @@ -3,7 +3,7 @@ package com.bhaskardivya.projects.smartgrid.operators import com.bhaskardivya.projects.smartgrid.model._ import com.tdunning.math.stats.TDigest import org.apache.flink.api.common.functions.RichFlatMapFunction -import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor} +import org.apache.flink.api.common.state.{MapState, MapStateDescriptor, ValueState, ValueStateDescriptor} import org.apache.flink.configuration.Configuration import org.apache.flink.streaming.api.scala._ import org.apache.flink.util.Collector @@ -13,25 +13,25 @@ import org.apache.flink.util.Collector */ class MedianWithKeyMapper extends RichFlatMapFunction[AverageWithKey, AverageWithKey]{ - private var digest: ValueState[TDigest] = _ + private var digest: MapState[Long, TDigest] = _ override def open(parameters: Configuration): Unit = { - val descriptor = new ValueStateDescriptor[TDigest]("median", createTypeInformation[TDigest]) + val descriptor = new MapStateDescriptor[Long, TDigest]("median", createTypeInformation[Long], createTypeInformation[TDigest]) descriptor.setQueryable("median-query") - digest = getRuntimeContext.getState(descriptor) + digest = getRuntimeContext.getMapState(descriptor) } override def flatMap(value: AverageWithKey, out: Collector[AverageWithKey]): Unit = { - - var currentDigest = digest.value() + val key = value.slice.start_time_of_day + var currentDigest = digest.get(key) if(currentDigest == null){ currentDigest = TDigest.createDigest(Constants.TDIGEST_COMPRESSION) } - currentDigest.add(value.averageValue) + currentDigest.add(key) - digest.update(currentDigest) + digest.put(key, currentDigest) out.collect(value) } From ccac03cefe748da1a0596e8dd8ddfae53af9f53b Mon Sep 17 00:00:00 2001 From: koldbyte Date: Mon, 19 Feb 2018 16:14:05 +0530 Subject: [PATCH 23/32] Added streams for more window durations + Refactorings + Cleanups --- .../SensorEventHouseAveragingJob.scala | 99 ------------------- .../SensorEventHouseAveragingJob2.scala | 79 --------------- .../SensorEventPlugAveragingJob.scala | 90 ----------------- .../base/SensorEventAveragingJobBase.scala | 97 ++++++++++++------ .../projects/smartgrid/model/Prediction.scala | 39 +++++--- .../smartgrid/model/Prediction2.scala | 54 ---------- .../smartgrid/model/SensorEvent.scala | 3 +- .../smartgrid/operators/EnrichMapper.scala | 12 +-- .../operators/MedianWithKeyMapper.scala | 6 +- ...archSink.scala => ElasticSearchSink.scala} | 14 +-- ....scala => ElasticSearchSinkFunction.scala} | 9 +- .../projects/smartgrid/sinks/HBaseSink.scala | 7 -- .../sinks/PredictionElasticSearchSink.scala | 37 ------- ...SensorEventElasticSearchSinkFunction.scala | 29 ------ .../projects/smartgrid/util/JSONTrait.scala | 7 ++ 15 files changed, 122 insertions(+), 460 deletions(-) delete mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob.scala delete mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob2.scala delete mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventPlugAveragingJob.scala delete mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala rename src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/{SensorEventElasticSearchSink.scala => ElasticSearchSink.scala} (65%) rename src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/{PredictionElasticSearchSinkFunction.scala => ElasticSearchSinkFunction.scala} (62%) delete mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/HBaseSink.scala delete mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSink.scala delete mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/SensorEventElasticSearchSinkFunction.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/util/JSONTrait.scala diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob.scala deleted file mode 100644 index 4af224c..0000000 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob.scala +++ /dev/null @@ -1,99 +0,0 @@ -/* -package com.bhaskardivya.projects.smartgrid.archived - -import com.bhaskardivya.projects.smartgrid.base.AbstractKeyGetter -import com.bhaskardivya.projects.smartgrid.model._ -import com.bhaskardivya.projects.smartgrid.operators.AverageWithKeyReducer -import com.bhaskardivya.projects.smartgrid.pipeline._ -import com.bhaskardivya.projects.smartgrid.sinks.HBaseOutputFormatAverageWithKey -import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.core.fs.FileSystem -import org.apache.flink.streaming.api.TimeCharacteristic -import org.apache.flink.streaming.api.functions.sink.OutputFormatSinkFunction -import org.apache.flink.streaming.api.scala._ -import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows -import org.apache.flink.streaming.api.windowing.time.Time - -object SensorEventHouseAveragingJob { - def main(args: Array[String]): Unit = { - // parse parameters - val params = ParameterTool.fromArgs(args) - - // Initialise the environment for flink - val env = StreamExecutionEnvironment.getExecutionEnvironment - - // will be using the timestamp from the records - env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) - - // Get the stream according to params - val stream: DataStream[SensorEvent] = SourceChooser.from(env, params).name("Sensor Parsed Data") - - val withTimestamps = stream - .assignTimestampsAndWatermarks(SensorEvent.tsAssigner()) - .name("Kafka Source with TS") - - val windowed1min = withTimestamps - .keyBy(keyGetter(_)) - .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageWithKeyReducer(keyGetter)) - .name("Average for 1 min Window") - - val windowed5min = windowed1min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(AverageWithKey.reducer) - .name("Average for 5 min Window") - - val windowed15min = windowed5min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(AverageWithKey.reducer) - .name("Average for 15 min Window") - - val windowed60min = windowed15min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(AverageWithKey.reducer) - .name("Average for 60 min Window") - - val windowed120min = windowed60min - .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce(AverageWithKey.reducer) - .name("Average for 120 min Window") - - val debug = params.has("debug") - if(debug){ - windowed1min.writeAsCsv("/data/output.windowed1min.csv", FileSystem.WriteMode.OVERWRITE) - windowed5min.writeAsCsv("/data/output.windowed5min.csv", FileSystem.WriteMode.OVERWRITE) - windowed15min.writeAsCsv("/data/output.windowed15min.csv", FileSystem.WriteMode.OVERWRITE) - windowed60min.writeAsCsv("/data/output.windowed60min.csv", FileSystem.WriteMode.OVERWRITE) - windowed120min.writeAsCsv("/data/output.windowed120min.csv", FileSystem.WriteMode.OVERWRITE) - } - - windowed1min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_1MIN, Constants.HOUSE_CF))) - .name("House - 1 Min Window") - - windowed5min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_5MIN, Constants.HOUSE_CF))) - .name("House - 5 Min Window") - - windowed15min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_15MIN, Constants.HOUSE_CF))) - .name("House - 15 Min Window") - - windowed60min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_60MIN, Constants.HOUSE_CF))) - .name("House - 60 Min Window") - - windowed120min.addSink(new OutputFormatSinkFunction[AverageWithKey](new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_120MIN, Constants.HOUSE_CF))) - .name("House - 120 Min Window") - - env.execute("Sensor Event House Averaging Job (Kafka to HBase Averages) ") - - } - - object keyGetter extends AbstractKeyGetter{ - def apply(element: SensorEvent): SensorKeyObject = { - SensorKeyObject(element.house_id) - } - } - -}*/ diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob2.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob2.scala deleted file mode 100644 index 506667d..0000000 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventHouseAveragingJob2.scala +++ /dev/null @@ -1,79 +0,0 @@ -package com.bhaskardivya.projects.smartgrid.archived - -import com.bhaskardivya.projects.smartgrid.model.{Constants, SensorEvent} -import com.bhaskardivya.projects.smartgrid.operators.AverageAggregate -import com.bhaskardivya.projects.smartgrid.pipeline._ -import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.core.fs.FileSystem -import org.apache.flink.streaming.api.TimeCharacteristic -import org.apache.flink.streaming.api.scala._ -import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows -import org.apache.flink.streaming.api.windowing.time.Time - -object SensorEventHouseAveragingJob2 { - def main(args: Array[String]): Unit = { - // parse parameters - val params = ParameterTool.fromArgs(args) - - // Initialise the environment for flink - val env = StreamExecutionEnvironment.getExecutionEnvironment - - // will be using the timestamp from the records - env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) - - // Get the stream according to params - val stream: DataStream[SensorEvent] = SourceChooser.from(env, params).name("Sensor Raw") - - val withTimestampsKeyed = stream - //id, timestamp, value, property, plug_id, household_id, house_id - //.map(x => SensorEvent(x(0).toLong, x(1).toLong, x(2).toDouble, x(3).toInt, x(4).toLong, x(5).toLong, x(6).toLong)) - .assignTimestampsAndWatermarks(SensorEvent.tsAssigner()) - .name("Kafka Source with TS") - .keyBy(_.house_id) - - val windowed1min = withTimestampsKeyed - .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageAggregate())//, new AverageWindowFunction()) - .name("Average for 1 min Window") - - windowed1min.writeAsCsv("/data/window_1min_out.csv", FileSystem.WriteMode.OVERWRITE) - - val windowed5min = withTimestampsKeyed - .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageAggregate())//, new AverageWindowFunction()) - .name("Average for 5 min Window") - - val windowed15min = withTimestampsKeyed - .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageAggregate())//, new AverageWindowFunction()) - .name("Average for 15 min Window") - - val windowed60min = withTimestampsKeyed - .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageAggregate())//, new AverageWindowFunction()) - .name("Average for 60 min Window") - - val windowed120min = withTimestampsKeyed - .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageAggregate())//, new AverageWindowFunction()) - .name("Average for 120 min Window") - - /* windowed1min.addSink(new OutputFormatSinkFunction[(Long, Average)](new HBaseOutputFormat().of(Constants.TABLE_1MIN, Constants.HOUSE_CF))) - .name("House - 1 Min Window") - - windowed5min.addSink(new OutputFormatSinkFunction[(Long, Average)](new HBaseOutputFormat().of(Constants.TABLE_5MIN, Constants.HOUSE_CF))) - .name("House - 5 Min Window") - - windowed15min.addSink(new OutputFormatSinkFunction[(Long, Average)](new HBaseOutputFormat().of(Constants.TABLE_15MIN, Constants.HOUSE_CF))) - .name("House - 15 Min Window") - - windowed60min.addSink(new OutputFormatSinkFunction[(Long, Average)](new HBaseOutputFormat().of(Constants.TABLE_60MIN, Constants.HOUSE_CF))) - .name("House - 60 Min Window") - - windowed120min.addSink(new OutputFormatSinkFunction[(Long, Average)](new HBaseOutputFormat().of(Constants.TABLE_120MIN, Constants.HOUSE_CF))) - .name("House - 120 Min Window")*/ - - env.execute("Sensor Event House Averaging Job (Kafka to HBase Averages) ") - - } -} \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventPlugAveragingJob.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventPlugAveragingJob.scala deleted file mode 100644 index 74caacb..0000000 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/archived/SensorEventPlugAveragingJob.scala +++ /dev/null @@ -1,90 +0,0 @@ -package com.bhaskardivya.projects.smartgrid.archived - -import com.bhaskardivya.projects.smartgrid.model.{Average, Constants, SensorEvent} -import com.bhaskardivya.projects.smartgrid.operators.{AverageAggregate, AverageWindowFunction} -import com.bhaskardivya.projects.smartgrid.pipeline._ -import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.streaming.api.TimeCharacteristic -import org.apache.flink.streaming.api.scala._ -import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows -import org.apache.flink.streaming.api.windowing.time.Time - -object SensorEventPlugAveragingJob { - def main(args: Array[String]): Unit = { - // parse parameters - val params = ParameterTool.fromArgs(args) - - // Initialise the environment for flink - val env = StreamExecutionEnvironment.getExecutionEnvironment - - // will be using the timestamp from the records - env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) - - // Get the stream according to params - val stream: DataStream[SensorEvent] = SourceChooser.from(env, params).name("Sensor Raw") - - //no need to add timestamps as kafka source already has it - //val withTimestamps = stream - //id, timestamp, value, property, plug_id, household_id, house_id - //.map(x => SensorEvent(x(0).toLong, x(1).toLong, x(2).toDouble, x(3).toInt, x(4).toLong, x(5).toLong, x(6).toLong)) - //.assignTimestampsAndWatermarks(new PunctuatedAssigner()) - - val windowed1min = stream - .keyBy(_.plug_id) - .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(Constants.SLIDING_INTERVAL))) - .aggregate(new AverageAggregate(), new AverageWindowFunction()) - .name("Average for 1 min Window") - - val windowed5min = windowed1min - .keyBy(_._1) - .window(SlidingEventTimeWindows.of(Time.minutes(5), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce( - (a: (Long,Average), b: (Long, Average)) => (a._1, a._2+b._2) - ) - .name("Average for 5 min Window") - - val windowed15min = windowed5min - .keyBy(_._1) - .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce( - (a: (Long,Average), b: (Long, Average)) => (a._1, a._2+b._2) - ) - .name("Average for 15 min Window") - - val windowed60min = windowed15min - .keyBy(_._1) - .window(SlidingEventTimeWindows.of(Time.minutes(60), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce( - (a: (Long,Average), b: (Long, Average)) => (a._1, a._2+b._2) - ) - .name("Average for 60 min Window") - - val windowed120min = windowed60min - .keyBy(_._1) - .window(SlidingEventTimeWindows.of(Time.minutes(120), Time.seconds(Constants.SLIDING_INTERVAL))) - .reduce( - (a: (Long,Average), b: (Long, Average)) => (a._1, a._2+b._2) - ) - .name("Average for 120 min Window") -/* - - windowed1min.writeUsingOutputFormat(new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_1MIN, Constants.HOUSE_CF)) - .name("Plug - 1 Min Window") - - windowed5min.writeUsingOutputFormat(new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_5MIN, Constants.HOUSE_CF)) - .name("Plug - 5 Min Window") - - windowed15min.writeUsingOutputFormat(new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_15MIN, Constants.HOUSE_CF)) - .name("Plug - 15 Min Window") - - windowed60min.writeUsingOutputFormat(new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_60MIN, Constants.HOUSE_CF)) - .name("Plug - 60 Min Window") - - windowed120min.writeUsingOutputFormat(new HBaseOutputFormatAverageWithKey().of(Constants.TABLE_120MIN, Constants.HOUSE_CF)) - .name("Plug - 120 Min Window") -*/ - - env.execute("Sensor Event Plug Averaging Job (Kafka to HBase Averages) ") - - } -} \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index aa5aa20..caf8d15 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -5,13 +5,13 @@ import java.io.File import com.bhaskardivya.projects.smartgrid.model._ import com.bhaskardivya.projects.smartgrid.operators._ import com.bhaskardivya.projects.smartgrid.pipeline._ -import com.bhaskardivya.projects.smartgrid.sinks.PredictionElasticSearchSink +import com.bhaskardivya.projects.smartgrid.sinks.ElasticSearchSink import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.core.fs.FileSystem import org.apache.flink.runtime.state.filesystem.FsStateBackend import org.apache.flink.streaming.api.TimeCharacteristic -import org.apache.flink.streaming.api.scala._ +import org.apache.flink.streaming.api.scala.{DataStream, _} import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, TumblingEventTimeWindows} import org.apache.flink.streaming.api.windowing.time.Time @@ -22,6 +22,10 @@ import org.apache.flink.streaming.api.windowing.time.Time */ abstract class SensorEventAveragingJobBase extends Serializable { + //register implicits for types + implicit val typeInfoAverageWithKey = TypeInformation.of(classOf[AverageWithKey]) + implicit val typeInfoPrediction2 = TypeInformation.of(classOf[Prediction]) + private val LOG_DIR = "/data/" + getKeyName() /** @@ -58,7 +62,11 @@ abstract class SensorEventAveragingJobBase extends Serializable { } } + // Sliding window durations in minutes + val windowDurations = List(1, 5, 15, 60, 120) + def main(args: Array[String]): Unit = { + //Create the log dirs try { new File(LOG_DIR).mkdir() @@ -89,48 +97,77 @@ abstract class SensorEventAveragingJobBase extends Serializable { // Create a stream with sum according to the key specified val initializedFlow = initializeFlow(rawStream) + val averageWithKeys = initializedFlow + .map(e => AverageWithKey(keyGetter(e), Slice(Time.minutes(1))(e.timestamp), Average(e.value, 1))) + + //create median states for various window duration + createMedianState(averageWithKeys) + + val windowed_average_1min = createPredictionStream(params, 1, averageWithKeys) + val windowed_average_5min = createPredictionStream(params, 5, windowed_average_1min) + val windowed_average_15min = createPredictionStream(params, 15, windowed_average_5min) + val windowed_average_60min = createPredictionStream(params, 60, windowed_average_15min) + val windowed_average_120min = createPredictionStream(params, 120, windowed_average_60min) + + initializedFlow + .addSink(ElasticSearchSink[SensorEvent](params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) + .name("Sensor Raw to ES") - implicit val typeInfoAverageWithKey = TypeInformation.of(classOf[AverageWithKey]) + env.execute("Sensor Event" + getKeyName() + " Prediction Job") + } + + /** + * Helper function to create median MapState for each window duration + * @param averageWithKeyStream DataStream of AverageWithKey + */ + def createMedianState(averageWithKeyStream: DataStream[AverageWithKey]) = { // Streams for each window duration for the average - val avg_windowed1min = initializedFlow - .map(e => AverageWithKey(keyGetter(e), Slice(Time.minutes(1))(e.timestamp), Average(e.value, 1))) - .keyBy(_.key) - .window(TumblingEventTimeWindows.of(Time.minutes(1))) - .reduce(new AverageWithKeyReducer) - .name(getKeyName() + " Average for 1 min Tumbling Window") + windowDurations.foreach(duration => { - // Store median as operator state - avg_windowed1min - .keyBy(_.key) - .flatMap(new MedianWithKeyMapper) - .name(getKeyName() + " Median state for 1 min Tumbling Window") + val stateName = getStateName(duration) + + val avg_windowed = averageWithKeyStream + .keyBy(_.key) + .window(TumblingEventTimeWindows.of(Time.minutes(duration))) + .reduce(new AverageWithKeyReducer) + .name(getKeyName() + " Average for " + duration + " min Tumbling Window") + + // Store median as operator state + avg_windowed + .keyBy(_.key) + .flatMap(new MedianWithKeyMapper(stateName)) + .name(getKeyName() + " Median state for " + duration + " min Tumbling Window") - implicit val typeInfoPrediction2 = TypeInformation.of(classOf[Prediction2]) + }) + } + + def createPredictionStream(params: ParameterTool, duration: Int, sourceStream: DataStream[AverageWithKey]) = { - val windowed1min_prediction = avg_windowed1min + val windowed_average = sourceStream + // Map the Correct Slice duration + .map(e => AverageWithKey(e.key, Slice(Time.minutes(duration))(e.slice.timestamp), e.average)) .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(1), Time.seconds(30))) + .window(SlidingEventTimeWindows.of(Time.minutes(duration), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(new AverageWithKeyReducer) + + val windowed_prediction = windowed_average .keyBy(_.key) - .flatMap(new EnrichMapper) - .name(getKeyName() + " Prediction values for 1 min") + .flatMap(new EnrichMapper(getStateName(duration))) + .name(getKeyName() + " Prediction values for " + duration + " min") // Sink the Predicted value streams to Elasticsearch - windowed1min_prediction - .addSink(PredictionElasticSearchSink(params,Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_1MIN)) - .name(getKeyName() + " Prediction Sink - ES - 1 min Window") + windowed_prediction + .addSink(ElasticSearchSink[Prediction](params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_1MIN)) + .name(getKeyName() + " Prediction Sink - ES - " + duration + " min Window") // Write to file for debug - val debug = params.has("debug") - if (debug) { - avg_windowed1min.writeAsText(LOG_DIR + "/output_avg/windowed1min.csv", FileSystem.WriteMode.OVERWRITE).name("Debug Avg 1 Min Window") - windowed1min_prediction.writeAsText(LOG_DIR + "/output_prediction/windowed1min.csv", FileSystem.WriteMode.OVERWRITE) - /*initializedFlow - .addSink(SensorEventElasticSearchSink(params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) - .name("Sensor Raw to ES")*/ + if(params.has("debug")){ + windowed_prediction.writeAsText(LOG_DIR + "/output_prediction/windowed"+ duration +"min.csv", FileSystem.WriteMode.OVERWRITE) } - env.execute("Sensor Event" + getKeyName() + " Prediction Job") + windowed_average } + + def getStateName(duration: Int) = "median-" + duration + "min" } \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala index 0617f03..d166812 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala @@ -1,12 +1,9 @@ -/* package com.bhaskardivya.projects.smartgrid.model -import java.util.Date - +import com.bhaskardivya.projects.smartgrid.util.JSONTrait import org.apache.sling.commons.json.JSONObject - -case class Prediction(averageWithKey: AverageWithKey, medianLoad: MedianLoadWithKey, key: String, slidingWindow: Long, predictedValue: Double){ +case class Prediction(var averageWithKey: AverageWithKey, var medianLoad: MedianLoad, var predictedLoad: Double) extends JSONTrait { def toJSONString(): String = { toJSON().toString() @@ -18,29 +15,41 @@ case class Prediction(averageWithKey: AverageWithKey, medianLoad: MedianLoadWith //averageWithKey object val averageWithKeyJSON = averageWithKey.key.toJSON - averageWithKeyJSON.put("sum", averageWithKey.sum) - averageWithKeyJSON.put("count", averageWithKey.count) - averageWithKeyJSON.put("avg", averageWithKey.averageValue) - averageWithKeyJSON.put("eventTimestamp", averageWithKey.eventTimestamp) + averageWithKeyJSON.put("sum", normalise_double(averageWithKey.average.sum)) + averageWithKeyJSON.put("count", averageWithKey.average.count) + averageWithKeyJSON.put("avg", normalise_double(averageWithKey.averageValue)) + averageWithKeyJSON.put("eventTimestamp", averageWithKey.slice.timestamp) json.put("averageWithKey", averageWithKeyJSON) //medianLoad val medianLoadJSON = new JSONObject() - medianLoadJSON.put("load", medianLoad.medianLoad) + medianLoadJSON.put("load", normalise_double(medianLoad.load)) json.put("medianLoad", medianLoadJSON) //key or entity - json.put("key", key) + json.put("house_id", averageWithKey.key.house_id) + json.put("household_id", averageWithKey.key.household_id) + json.put("plug_id", averageWithKey.key.plug_id) //sliding window duration - json.put("slidingWindowDuration", slidingWindow) + json.put("slidingWindowDuration", averageWithKey.slice.size.toMilliseconds) + json.put("slice-start", averageWithKey.slice.ts_start) + json.put("slice-stop", averageWithKey.slice.ts_stop) //Predicted value - json.put("predictedValue", predictedValue) + json.put("predictedValue", normalise_double(predictedLoad)) // Current Time - json.put("timestamp", new Date().getTime) + json.put("current-timestamp", System.currentTimeMillis) json } -}*/ + + def normalise_double(dbl: Double): Double = { + if(dbl < 1e-6) { + 0.000001 + }else{ + dbl + } + } +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala deleted file mode 100644 index 45f7181..0000000 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction2.scala +++ /dev/null @@ -1,54 +0,0 @@ -package com.bhaskardivya.projects.smartgrid.model - -import org.apache.sling.commons.json.JSONObject - -case class Prediction2(var averageWithKey: AverageWithKey, var medianLoad: MedianLoad, var predictedLoad: Double) { - - def toJSONString(): String = { - toJSON().toString() - } - - def toJSON(): JSONObject = { - //main object - val json = new JSONObject() - - //averageWithKey object - val averageWithKeyJSON = averageWithKey.key.toJSON - averageWithKeyJSON.put("sum", normalise_double(averageWithKey.average.sum)) - averageWithKeyJSON.put("count", averageWithKey.average.count) - averageWithKeyJSON.put("avg", normalise_double(averageWithKey.averageValue)) - averageWithKeyJSON.put("eventTimestamp", averageWithKey.slice.timestamp) - json.put("averageWithKey", averageWithKeyJSON) - - //medianLoad - val medianLoadJSON = new JSONObject() - medianLoadJSON.put("load", normalise_double(medianLoad.load)) - json.put("medianLoad", medianLoadJSON) - - //key or entity - json.put("house_id", averageWithKey.key.house_id) - json.put("household_id", averageWithKey.key.household_id) - json.put("plug_id", averageWithKey.key.plug_id) - - //sliding window duration - json.put("slidingWindowDuration", averageWithKey.slice.size.toMilliseconds) - json.put("slice-start", averageWithKey.slice.ts_start) - json.put("slice-stop", averageWithKey.slice.ts_stop) - - //Predicted value - json.put("predictedValue", normalise_double(predictedLoad)) - - // Current Time - json.put("current-timestamp", System.currentTimeMillis) - - json - } - - def normalise_double(dbl: Double): Double = { - if(dbl < 1e-6) { - 0.000001 - }else{ - dbl - } - } -} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala index bcc7fab..a27520b 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala @@ -1,5 +1,6 @@ package com.bhaskardivya.projects.smartgrid.model +import com.bhaskardivya.projects.smartgrid.util.JSONTrait import org.apache.commons.lang3.builder.ToStringBuilder import org.apache.flink.api.common.ExecutionConfig import org.apache.flink.api.common.serialization.TypeInformationSerializationSchema @@ -13,7 +14,7 @@ import org.apache.sling.commons.json.JSONObject /** * id, timestamp, value, property, plug_id, household_id, house_id */ -case class SensorEvent(var id: Long,var timestamp: Long, var value: Double, var property: Int, var plug_id: Long, var household_id: Long, var house_id: Long){ +case class SensorEvent(var id: Long,var timestamp: Long, var value: Double, var property: Int, var plug_id: Long, var household_id: Long, var house_id: Long) extends JSONTrait { def adjustEventTimestamp(millis: Long) = { this.timestamp = this.timestamp + (millis / 1000) diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala index f971c45..59c7755 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala @@ -1,6 +1,6 @@ package com.bhaskardivya.projects.smartgrid.operators -import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoad, MedianLoadWithKey, Prediction2} +import com.bhaskardivya.projects.smartgrid.model.{AverageWithKey, MedianLoad, MedianLoadWithKey, Prediction} import com.tdunning.math.stats.TDigest import org.apache.flink.api.common.functions.{RichFlatMapFunction, RichReduceFunction} import org.apache.flink.api.common.state.{MapState, MapStateDescriptor, ValueState, ValueStateDescriptor} @@ -12,17 +12,17 @@ import org.apache.flink.util.Collector /** * Rich Mapper function to get the predicted load values from the median state and Average Loads */ -class EnrichMapper extends RichFlatMapFunction[AverageWithKey, Prediction2]{ +class EnrichMapper(stateName: String) extends RichFlatMapFunction[AverageWithKey, Prediction]{ private var digest: MapState[Long, TDigest] = _ - private var prediction2: Prediction2 = _ + private var prediction2: Prediction = _ override def open(parameters: Configuration): Unit = { - val descriptor = new MapStateDescriptor[Long, TDigest]("median", createTypeInformation[Long], createTypeInformation[TDigest]) + val descriptor = new MapStateDescriptor[Long, TDigest](stateName, createTypeInformation[Long], createTypeInformation[TDigest]) digest = getRuntimeContext.getMapState(descriptor) } - override def flatMap(value: AverageWithKey, out: Collector[Prediction2]): Unit = { + override def flatMap(value: AverageWithKey, out: Collector[Prediction]): Unit = { // Get the TDigest Object for the SensorKey and slice predicting for val currentDigest = digest.get(value.slice.predicting_for_time_of_day) @@ -42,7 +42,7 @@ class EnrichMapper extends RichFlatMapFunction[AverageWithKey, Prediction2]{ // Create the final Prediction Object to be collected if (prediction2 == null) { - prediction2 = Prediction2(value, MedianLoad(medianLoad), prediction) + prediction2 = Prediction(value, MedianLoad(medianLoad), prediction) } else { prediction2.averageWithKey = value prediction2.medianLoad = MedianLoad(medianLoad) diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala index 7c3d990..240cb9f 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala @@ -11,13 +11,13 @@ import org.apache.flink.util.Collector /** * A Rich flatMap function to keep a running TDigest object for each key and each starting minutes of a day. */ -class MedianWithKeyMapper extends RichFlatMapFunction[AverageWithKey, AverageWithKey]{ +class MedianWithKeyMapper(stateName: String) extends RichFlatMapFunction[AverageWithKey, AverageWithKey]{ private var digest: MapState[Long, TDigest] = _ override def open(parameters: Configuration): Unit = { - val descriptor = new MapStateDescriptor[Long, TDigest]("median", createTypeInformation[Long], createTypeInformation[TDigest]) - descriptor.setQueryable("median-query") + val descriptor = new MapStateDescriptor[Long, TDigest](stateName, createTypeInformation[Long], createTypeInformation[TDigest]) + descriptor.setQueryable(stateName+"-query") digest = getRuntimeContext.getMapState(descriptor) } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/SensorEventElasticSearchSink.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/ElasticSearchSink.scala similarity index 65% rename from src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/SensorEventElasticSearchSink.scala rename to src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/ElasticSearchSink.scala index 29b9724..7b06aa3 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/SensorEventElasticSearchSink.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/ElasticSearchSink.scala @@ -2,25 +2,27 @@ package com.bhaskardivya.projects.smartgrid.sinks import java.net.{InetAddress, InetSocketAddress} -import com.bhaskardivya.projects.smartgrid.model.SensorEvent +import com.bhaskardivya.projects.smartgrid.util.JSONTrait import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink -object SensorEventElasticSearchSink { +object ElasticSearchSink { - def apply(params: ParameterTool, esIndex: String, esIndexType: String): ElasticsearchSink[SensorEvent] ={ + def apply[T <: JSONTrait](params: ParameterTool, esIndex: String, esIndexType: String): ElasticsearchSink[T] ={ //Initialize Elastic search configuration val esClusterLocationIP = params.get("es.cluster.ip", "192.168.99.100") val esClusterLocationPort = params.getInt("es.cluster.port", 9300) + val esFlushMaxActions = params.getInt("bulk.flush.max.actions", 100) + val config = new java.util.HashMap[String, String] config.put("cluster.name", params.get("es.cluster.name", "docker-cluster")) // This instructs the sink to emit after every element, otherwise they would be buffered - config.put("bulk.flush.max.actions", "1") + config.put("bulk.flush.max.actions", esFlushMaxActions.toString) val transportAddresses = new java.util.ArrayList[InetSocketAddress] transportAddresses.add(new InetSocketAddress(InetAddress.getByName(esClusterLocationIP), esClusterLocationPort)) - new ElasticsearchSink(config, transportAddresses, new SensorEventElasticSearchSinkFunction(esIndex, esIndexType)) + new ElasticsearchSink(config, transportAddresses, new ElasticSearchSinkFunction[T](esIndex, esIndexType)) } -} +} \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSinkFunction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/ElasticSearchSinkFunction.scala similarity index 62% rename from src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSinkFunction.scala rename to src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/ElasticSearchSinkFunction.scala index f421604..d43a0d7 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSinkFunction.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/ElasticSearchSinkFunction.scala @@ -1,6 +1,7 @@ package com.bhaskardivya.projects.smartgrid.sinks -import com.bhaskardivya.projects.smartgrid.model.Prediction2 +import com.bhaskardivya.projects.smartgrid.model.Prediction +import com.bhaskardivya.projects.smartgrid.util.JSONTrait import org.apache.flink.api.common.functions.RuntimeContext import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer} import org.elasticsearch.action.ActionRequest @@ -11,9 +12,9 @@ import org.elasticsearch.client.Requests * @param esIndex ElasticSearch Index name * @param esType ElasticSearch Index type */ -class PredictionElasticSearchSinkFunction(esIndex: String, esType: String) extends ElasticsearchSinkFunction[Prediction2]{ +class ElasticSearchSinkFunction[T <: JSONTrait](esIndex: String, esType: String) extends ElasticsearchSinkFunction[T]{ - def createIndexRequest(element: Prediction2): ActionRequest = { + def createIndexRequest(element: T): ActionRequest = { val json = element.toJSONString() Requests.indexRequest @@ -22,7 +23,7 @@ class PredictionElasticSearchSinkFunction(esIndex: String, esType: String) exten .source(json) } - override def process(element: Prediction2, ctx: RuntimeContext, indexer: RequestIndexer): Unit = { + override def process(element: T, ctx: RuntimeContext, indexer: RequestIndexer): Unit = { indexer.add(createIndexRequest(element)) } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/HBaseSink.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/HBaseSink.scala deleted file mode 100644 index 920f0aa..0000000 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/HBaseSink.scala +++ /dev/null @@ -1,7 +0,0 @@ -package com.bhaskardivya.projects.smartgrid.sinks - -class HBaseSink[IN] { - def getSink(): Unit ={ - - } -} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSink.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSink.scala deleted file mode 100644 index b289464..0000000 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/PredictionElasticSearchSink.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.bhaskardivya.projects.smartgrid.sinks - -import java.net.{InetAddress, InetSocketAddress} - -import com.bhaskardivya.projects.smartgrid.model.Prediction2 -import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink - -object PredictionElasticSearchSink { - - def apply(params: ParameterTool, esIndex: String, esIndexType: String): ElasticsearchSink[Prediction2] ={ - //Initialize Elastic search configuration - val esClusterLocationIP = params.get("es.cluster.ip", "192.168.99.100") - val esClusterLocationPort = params.getInt("es.cluster.port", 9300) - val esFlushMaxActions = params.getInt("bulk.flush.max.actions", 100) - - val config = new java.util.HashMap[String, String] - config.put("cluster.name", params.get("es.cluster.name", "docker-cluster")) - // This instructs the sink to emit after every element, otherwise they would be buffered - config.put("bulk.flush.max.actions", esFlushMaxActions.toString) - - val transportAddresses = new java.util.ArrayList[InetSocketAddress] - - /* - transportAddresses.add(new InetSocketAddress(InetAddress.getByName("127.0.0.1"), esClusterLocationPort)) - transportAddresses.add(new InetSocketAddress(InetAddress.getByName("172.17.0.2"), esClusterLocationPort)) - transportAddresses.add(new InetSocketAddress(InetAddress.getByName("0.0.0.0"), esClusterLocationPort)) - transportAddresses.add(new InetSocketAddress(InetAddress.getByName("localhost"), esClusterLocationPort)) - transportAddresses.add(new InetSocketAddress(InetAddress.getByName("192.168.99.100"), esClusterLocationPort)) - */ - - transportAddresses.add(new InetSocketAddress(InetAddress.getByName(esClusterLocationIP), esClusterLocationPort)) - - new ElasticsearchSink(config, transportAddresses, new PredictionElasticSearchSinkFunction(esIndex, esIndexType)) - } - -} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/SensorEventElasticSearchSinkFunction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/SensorEventElasticSearchSinkFunction.scala deleted file mode 100644 index aca041c..0000000 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/SensorEventElasticSearchSinkFunction.scala +++ /dev/null @@ -1,29 +0,0 @@ -package com.bhaskardivya.projects.smartgrid.sinks - -import com.bhaskardivya.projects.smartgrid.model.SensorEvent -import org.apache.flink.api.common.functions.RuntimeContext -import org.apache.flink.streaming.connectors.elasticsearch.{ElasticsearchSinkFunction, RequestIndexer} -import org.elasticsearch.action.ActionRequest -import org.elasticsearch.client.Requests - -/** - * Sink to ElasticSearch to store the prediction values - * @param esIndex ElasticSearch Index name - * @param esType ElasticSearch Index type - */ -class SensorEventElasticSearchSinkFunction(esIndex: String, esType: String) extends ElasticsearchSinkFunction[SensorEvent]{ - - def createIndexRequest(element: SensorEvent): ActionRequest = { - val json = element.toJSONString() - - Requests.indexRequest - .index(esIndex) - .`type`(esType) - .source(json) - } - - override def process(element: SensorEvent, ctx: RuntimeContext, indexer: RequestIndexer): Unit = { - indexer.add(createIndexRequest(element)) - } - -} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/util/JSONTrait.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/util/JSONTrait.scala new file mode 100644 index 0000000..1e09279 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/util/JSONTrait.scala @@ -0,0 +1,7 @@ +package com.bhaskardivya.projects.smartgrid.util + +trait JSONTrait { + + def toJSONString() : String + +} From fabb26700ed571db5361b07dc15a207b248b2ef1 Mon Sep 17 00:00:00 2001 From: koldbyte Date: Tue, 20 Feb 2018 14:28:05 +0530 Subject: [PATCH 24/32] Added stream for handling work values and generating the average values stream which is union with the main stream of averages + Refactorings + Cleanups --- .../base/SensorEventAveragingJobBase.scala | 53 ++++++++++++----- .../projects/smartgrid/model/Constants.scala | 5 ++ .../projects/smartgrid/model/Slice.scala | 19 ++++-- .../smartgrid/model/TwoWorkEvents.scala | 5 ++ .../smartgrid/model/WorkValueFlatMap.scala | 58 +++++++++++++++++++ .../operators/WorkValueProcessWindow.scala | 15 +++++ 6 files changed, 136 insertions(+), 19 deletions(-) create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/model/TwoWorkEvents.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/operators/WorkValueProcessWindow.scala diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index caf8d15..ec5440d 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -23,8 +23,8 @@ import org.apache.flink.streaming.api.windowing.time.Time abstract class SensorEventAveragingJobBase extends Serializable { //register implicits for types - implicit val typeInfoAverageWithKey = TypeInformation.of(classOf[AverageWithKey]) - implicit val typeInfoPrediction2 = TypeInformation.of(classOf[Prediction]) + implicit val typeInfoAverageWithKey: TypeInformation[AverageWithKey] = TypeInformation.of(classOf[AverageWithKey]) + implicit val typeInfoPrediction2: TypeInformation[Prediction] = TypeInformation.of(classOf[Prediction]) private val LOG_DIR = "/data/" + getKeyName() @@ -37,7 +37,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { /** * Method to prepare the raw events properly aggregated(sum) based on the key - * @param dataStream + * @param dataStream source raw data stream * @return */ def initializeFlow(dataStream: DataStream[SensorEvent]): DataStream[SensorEvent] @@ -45,8 +45,8 @@ abstract class SensorEventAveragingJobBase extends Serializable { /** * Method that returns the value of the key in the source datum * - * @param element - * @return Long Key value + * @param element SensorEvent record + * @return Long Key value */ def getKey(element: SensorEvent): SensorKeyObject @@ -75,7 +75,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { new File(LOG_DIR + "/output_avg/").mkdir() new File(LOG_DIR + "/output_prediction/").mkdir() } catch { - case e: Exception => println("Directories already created") + case e: Exception => println("Directories already created" + e.getMessage) } // parse parameters @@ -93,21 +93,40 @@ abstract class SensorEventAveragingJobBase extends Serializable { // Get the stream according to params val rawStream: DataStream[SensorEvent] = SourceChooser.from(env, params).name("Sensor Source with Timestamp") - //TODO: create a Global Window for work values which will output the missing load values + // Create a Global Window for work values which will output the missing load values + val averageUsingWorkValues: DataStream[AverageWithKey] = rawStream + .filter(_.property == Constants.PROPERTY_WORK) + .keyBy(e => getKey(e)) + .countWindow(2,1) + .process(new WorkValueProcessWindow) + .flatMap(new WorkValueFlatMap(Time.minutes(1))(keyGetter)) // Create a stream with sum according to the key specified val initializedFlow = initializeFlow(rawStream) val averageWithKeys = initializedFlow .map(e => AverageWithKey(keyGetter(e), Slice(Time.minutes(1))(e.timestamp), Average(e.value, 1))) + // Assumption: If both the Load values and work Values are available in a slice, + // Average of them will still be closer to the real measurement + .union(averageUsingWorkValues) //create median states for various window duration createMedianState(averageWithKeys) - val windowed_average_1min = createPredictionStream(params, 1, averageWithKeys) - val windowed_average_5min = createPredictionStream(params, 5, windowed_average_1min) - val windowed_average_15min = createPredictionStream(params, 15, windowed_average_5min) - val windowed_average_60min = createPredictionStream(params, 60, windowed_average_15min) - val windowed_average_120min = createPredictionStream(params, 120, windowed_average_60min) + // Create the average sliding window stream and corresponding Prediction Stream + val windowed_average_1min = createAverageStream(params, 1, averageWithKeys) + createPredictionStream(params, 1, windowed_average_1min) + + val windowed_average_5min = createAverageStream(params, 5, windowed_average_1min) + createPredictionStream(params, 5, windowed_average_5min) + + val windowed_average_15min = createAverageStream(params, 15, windowed_average_5min) + createPredictionStream(params, 15, windowed_average_15min) + + val windowed_average_60min = createAverageStream(params, 60, windowed_average_15min) + createPredictionStream(params, 60, windowed_average_60min) + + val windowed_average_120min = createAverageStream(params, 120, windowed_average_60min) + createPredictionStream(params, 120, windowed_average_120min) initializedFlow .addSink(ElasticSearchSink[SensorEvent](params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) @@ -142,7 +161,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { }) } - def createPredictionStream(params: ParameterTool, duration: Int, sourceStream: DataStream[AverageWithKey]) = { + def createAverageStream(params: ParameterTool, duration: Int, sourceStream: DataStream[AverageWithKey]) = { val windowed_average = sourceStream // Map the Correct Slice duration @@ -151,7 +170,11 @@ abstract class SensorEventAveragingJobBase extends Serializable { .window(SlidingEventTimeWindows.of(Time.minutes(duration), Time.seconds(Constants.SLIDING_INTERVAL))) .reduce(new AverageWithKeyReducer) - val windowed_prediction = windowed_average + windowed_average + } + + def createPredictionStream(params: ParameterTool, duration: Int, averageStream: DataStream[AverageWithKey]) = { + val windowed_prediction = averageStream .keyBy(_.key) .flatMap(new EnrichMapper(getStateName(duration))) .name(getKeyName() + " Prediction values for " + duration + " min") @@ -166,7 +189,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { windowed_prediction.writeAsText(LOG_DIR + "/output_prediction/windowed"+ duration +"min.csv", FileSystem.WriteMode.OVERWRITE) } - windowed_average + windowed_prediction } def getStateName(duration: Int) = "median-" + duration + "min" diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala index facda18..83531d4 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala @@ -30,7 +30,12 @@ object Constants { // Sensor Key Object val KEY_NO_VALUE: Long = -1 val PROPERTY_LOAD: Int = 1 + val PROPERTY_WORK: Int = 0 // TDigest val TDIGEST_COMPRESSION = 100 + + // This is the base timestamp ... the epoch of the world with flink + val BASE_TIMESTAMP = 0L // TODO: currently set 0 to align with Unix Epoch + } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala index 84782c5..5e79b17 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala @@ -14,16 +14,14 @@ import scala.collection.immutable case class Slice(size: Time)(var timestamp: Long) { private val seconds_in_Day = (24*60*60) - def size_in_seconds: Long = size.toMilliseconds / 1000 - // This is the base timestamp ... the epoch of the world with flink - val base = 0L // TODO: currently set 0 to align with Unix Epoch + def size_in_seconds: Long = size.toMilliseconds / 1000 def ts_start: Long = timestamp - (timestamp % size_in_seconds) def ts_stop: Long = ts_start + size_in_seconds - 1 - def i: Long = ( ts_start - base ) / size_in_seconds + def i: Long = ( ts_start - Constants.BASE_TIMESTAMP ) / size_in_seconds def num_slices_in(hr: Long): Long = (hr * 60 * 60) / size_in_seconds @@ -52,3 +50,16 @@ case class Slice(size: Time)(var timestamp: Long) { str.toString() } } + +object Slice { + def from(size: Time)(sliceIndex: Long): Slice = { + val size_in_seconds = size.toMilliseconds / 1000 + + val ts_start = sliceIndex * size_in_seconds + Constants.BASE_TIMESTAMP + + //lets keep the event occurring just right in the middle of the slice index + val event_timestamp = ts_start + (size_in_seconds / 2) + + Slice(size)(event_timestamp) + } +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/TwoWorkEvents.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/TwoWorkEvents.scala new file mode 100644 index 0000000..456cbb2 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/TwoWorkEvents.scala @@ -0,0 +1,5 @@ +package com.bhaskardivya.projects.smartgrid.model + +case class TwoWorkEvents(sensorEvent1: SensorEvent, sensorEvent2: SensorEvent) { + def isValid: Boolean = sensorEvent1 != null && sensorEvent2 != null +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala new file mode 100644 index 0000000..6df5609 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala @@ -0,0 +1,58 @@ +package com.bhaskardivya.projects.smartgrid.model + +import com.bhaskardivya.projects.smartgrid.base.AbstractKeyGetter +import org.apache.flink.api.common.functions.FlatMapFunction +import org.apache.flink.streaming.api.windowing.time.Time +import org.apache.flink.util.Collector + +class WorkValueFlatMap(size: Time)(keyGetter: AbstractKeyGetter) extends FlatMapFunction[TwoWorkEvents, AverageWithKey]{ + override def flatMap(value: TwoWorkEvents, out: Collector[AverageWithKey]): Unit = { + + if(!value.isValid) + return + + // TwoWorkEvents have records with the same key + // Find the slices between the two WorkEvents + // Generate AverageWithKey records for each of the slices + + val slice_range: Seq[Long] = getSliceRange(value) + + val averageLoad = getAverageLoad(value) + + // let's pre-create the objects so that we dont need to create it again for each collect call + val sensorKeyObject = keyGetter(value.sensorEvent1) + val average = Average(averageLoad, 1) + + + slice_range.foreach ( + slice_index => { + val averageWithKey = AverageWithKey(sensorKeyObject, Slice.from(size)(slice_index), average) + out.collect(averageWithKey) + } + ) + + } + + def getSliceRange(value: TwoWorkEvents) = { + val slice_range_a = Slice(size)(value.sensorEvent1.timestamp).i + val slice_range_b = Slice(size)(value.sensorEvent2.timestamp).i + + if(slice_range_a > slice_range_b){ + slice_range_b to slice_range_a + } else { + slice_range_a to slice_range_b + } + } + + def getAverageLoad(value: TwoWorkEvents) = { + //in kWH + val work_diff = Math.abs(value.sensorEvent1.value - value.sensorEvent2.value) + + //in seconds + val time_diff = Math.abs(value.sensorEvent1.timestamp - value.sensorEvent2.timestamp) + + val averageLoad = (work_diff.toDouble * 60 * 60) / time_diff + + averageLoad + } +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/WorkValueProcessWindow.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/WorkValueProcessWindow.scala new file mode 100644 index 0000000..a0ba403 --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/WorkValueProcessWindow.scala @@ -0,0 +1,15 @@ +package com.bhaskardivya.projects.smartgrid.operators + +import com.bhaskardivya.projects.smartgrid.model.{SensorEvent, SensorKeyObject, TwoWorkEvents} +import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction +import org.apache.flink.streaming.api.windowing.windows.GlobalWindow +import org.apache.flink.util.Collector + +class WorkValueProcessWindow extends ProcessWindowFunction[SensorEvent, TwoWorkEvents, SensorKeyObject, GlobalWindow] { + override def process(key: SensorKeyObject, context: Context, elements: Iterable[SensorEvent], out: Collector[TwoWorkEvents]): Unit = { + + elements match { + case x: Iterable[SensorEvent] if x.size == 2 => out.collect(TwoWorkEvents(x.toList(0), x.toList(1))) + } + } +} From d00092be37fd67d6845264de0d764b5465b085a0 Mon Sep 17 00:00:00 2001 From: koldbyte Date: Tue, 20 Feb 2018 14:50:59 +0530 Subject: [PATCH 25/32] Adding CircleCI integration on repo --- .circleci/config.yml | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000..b1c056d --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,42 @@ +# Java Maven CircleCI 2.0 configuration file +# +# Check https://circleci.com/docs/2.0/language-java/ for more details +# +version: 2 +jobs: + build: + docker: + # specify the version you desire here + - image: circleci/openjdk:8-jdk + + # Specify service dependencies here if necessary + # CircleCI maintains a library of pre-built images + # documented at https://circleci.com/docs/2.0/circleci-images/ + # - image: circleci/postgres:9.4 + + working_directory: ~/repo + + environment: + # Customize the JVM maximum heap limit + MAVEN_OPTS: -Xmx3200m + + steps: + - checkout + + # Download and cache dependencies + - restore_cache: + keys: + - v1-dependencies-{{ checksum "pom.xml" }} + # fallback to using the latest cache if no exact match is found + - v1-dependencies- + + - run: mvn package -P build-jar dependency:go-offline + + - save_cache: + paths: + - ~/.m2 + key: v1-dependencies-{{ checksum "pom.xml" }} + + # run tests! + #- run: mvn integration-test + From 16c4af6a04ff340162418643e905a26e8d50a382 Mon Sep 17 00:00:00 2001 From: koldbyte Date: Tue, 20 Feb 2018 15:03:53 +0530 Subject: [PATCH 26/32] Adding CircleCI integration on repo --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index b1c056d..84a9de8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,6 +2,7 @@ # # Check https://circleci.com/docs/2.0/language-java/ for more details # +# Continuous Integration! version: 2 jobs: build: From 3119310cbd62ce2894bcbec136f9b3b02b2a1b10 Mon Sep 17 00:00:00 2001 From: koldbyte Date: Tue, 20 Feb 2018 15:12:20 +0530 Subject: [PATCH 27/32] Adding CircleCI integration on repo + Artifacts handling --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 84a9de8..0e4d98f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -40,4 +40,5 @@ jobs: # run tests! #- run: mvn integration-test + - run: mv target/smartgrid-*.jar $CIRCLE_ARTIFACTS/ From 949b0715fb0a8201b4fd37597dc1fb39be36d4dd Mon Sep 17 00:00:00 2001 From: koldbyte Date: Tue, 20 Feb 2018 15:15:48 +0530 Subject: [PATCH 28/32] Adding CircleCI integration on repo + Artifacts handling --- .circleci/config.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0e4d98f..98a194b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,6 +4,9 @@ # # Continuous Integration! version: 2 +general: + artifacts: + - $CIRCLE_ARTIFACTS/jar jobs: build: docker: @@ -40,5 +43,5 @@ jobs: # run tests! #- run: mvn integration-test - - run: mv target/smartgrid-*.jar $CIRCLE_ARTIFACTS/ + - run: mkdir $CIRCLE_ARTIFACTS/jar && mv target/smartgrid-*.jar $CIRCLE_ARTIFACTS/jar/ From c18a9999bd8ca06d7af6f489a5ec6292e53d49de Mon Sep 17 00:00:00 2001 From: koldbyte Date: Tue, 20 Feb 2018 15:23:47 +0530 Subject: [PATCH 29/32] Adding CircleCI integration on repo + Artifacts handling --- .circleci/config.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 98a194b..44ee543 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,9 +4,6 @@ # # Continuous Integration! version: 2 -general: - artifacts: - - $CIRCLE_ARTIFACTS/jar jobs: build: docker: @@ -43,5 +40,16 @@ jobs: # run tests! #- run: mvn integration-test - - run: mkdir $CIRCLE_ARTIFACTS/jar && mv target/smartgrid-*.jar $CIRCLE_ARTIFACTS/jar/ + + - run: | + set -xu + mkdir -p /tmp/artifacts + cp target/smartgrid*.jar /tmp/artifacts + + # Save artifacts + - store_artifacts: + path: /tmp/artifacts + destination: build + + From 3b01e087ca34f8be40c7c70b2f668f5c0c47468a Mon Sep 17 00:00:00 2001 From: koldbyte Date: Tue, 3 Apr 2018 00:00:24 +0530 Subject: [PATCH 30/32] Fixes 1. _type correct use for ES sink 2. Work unit conversion --- .../base/SensorEventAveragingJobBase.scala | 30 ++++++++++++------- .../smartgrid/model/WorkValueFlatMap.scala | 2 +- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index ec5440d..486c073 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -114,19 +114,24 @@ abstract class SensorEventAveragingJobBase extends Serializable { // Create the average sliding window stream and corresponding Prediction Stream val windowed_average_1min = createAverageStream(params, 1, averageWithKeys) - createPredictionStream(params, 1, windowed_average_1min) + val prediction_1min = createPredictionStream(params, 1, windowed_average_1min) + createPredictionSink(params, prediction_1min, Constants.ES_INDEX_TYPE_1MIN) val windowed_average_5min = createAverageStream(params, 5, windowed_average_1min) - createPredictionStream(params, 5, windowed_average_5min) + val prediction_5min = createPredictionStream(params, 5, windowed_average_5min) + createPredictionSink(params, prediction_5min, Constants.ES_INDEX_TYPE_5MIN) val windowed_average_15min = createAverageStream(params, 15, windowed_average_5min) - createPredictionStream(params, 15, windowed_average_15min) + val prediction_15min = createPredictionStream(params, 15, windowed_average_15min) + createPredictionSink(params, prediction_15min, Constants.ES_INDEX_TYPE_15MIN) val windowed_average_60min = createAverageStream(params, 60, windowed_average_15min) - createPredictionStream(params, 60, windowed_average_60min) + val prediction_60min = createPredictionStream(params, 60, windowed_average_60min) + createPredictionSink(params, prediction_60min, Constants.ES_INDEX_TYPE_60MIN) val windowed_average_120min = createAverageStream(params, 120, windowed_average_60min) - createPredictionStream(params, 120, windowed_average_120min) + val prediction_120min = createPredictionStream(params, 120, windowed_average_120min) + createPredictionSink(params, prediction_120min, Constants.ES_INDEX_TYPE_120MIN) initializedFlow .addSink(ElasticSearchSink[SensorEvent](params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) @@ -164,6 +169,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { def createAverageStream(params: ParameterTool, duration: Int, sourceStream: DataStream[AverageWithKey]) = { val windowed_average = sourceStream + .keyBy(_.key) // Map the Correct Slice duration .map(e => AverageWithKey(e.key, Slice(Time.minutes(duration))(e.slice.timestamp), e.average)) .keyBy(_.key) @@ -174,22 +180,24 @@ abstract class SensorEventAveragingJobBase extends Serializable { } def createPredictionStream(params: ParameterTool, duration: Int, averageStream: DataStream[AverageWithKey]) = { - val windowed_prediction = averageStream + val windowed_prediction: DataStream[Prediction] = averageStream .keyBy(_.key) .flatMap(new EnrichMapper(getStateName(duration))) .name(getKeyName() + " Prediction values for " + duration + " min") + windowed_prediction + } + + def createPredictionSink(params: ParameterTool, windowed_prediction: DataStream[Prediction], indexType: String) = { // Sink the Predicted value streams to Elasticsearch windowed_prediction - .addSink(ElasticSearchSink[Prediction](params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_1MIN)) - .name(getKeyName() + " Prediction Sink - ES - " + duration + " min Window") + .addSink(ElasticSearchSink[Prediction](params, Constants.ES_INDEX_NAME, indexType)) + .name(getKeyName() + " Prediction Sink - ES - " + indexType + " min Window") // Write to file for debug if(params.has("debug")){ - windowed_prediction.writeAsText(LOG_DIR + "/output_prediction/windowed"+ duration +"min.csv", FileSystem.WriteMode.OVERWRITE) + windowed_prediction.writeAsText(LOG_DIR + "/output_prediction/windowed"+ indexType +".csv", FileSystem.WriteMode.OVERWRITE) } - - windowed_prediction } def getStateName(duration: Int) = "median-" + duration + "min" diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala index 6df5609..a70eac7 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala @@ -46,7 +46,7 @@ class WorkValueFlatMap(size: Time)(keyGetter: AbstractKeyGetter) extends FlatMap def getAverageLoad(value: TwoWorkEvents) = { //in kWH - val work_diff = Math.abs(value.sensorEvent1.value - value.sensorEvent2.value) + val work_diff = Math.abs(value.sensorEvent1.value - value.sensorEvent2.value) * 1000 //in seconds val time_diff = Math.abs(value.sensorEvent1.timestamp - value.sensorEvent2.timestamp) From b9ee6e9c7fb131b6447ae32b211a7bbc30246175 Mon Sep 17 00:00:00 2001 From: koldbyte Date: Tue, 3 Apr 2018 01:14:09 +0530 Subject: [PATCH 31/32] Added File sources Cleanups using static code checker --- src/main/resources/META-INF/MANIFEST.MF | 3 +- .../base/SensorEventAveragingJobBase.scala | 8 +++--- .../smartgrid/job/HouseAveragingJob.scala | 2 +- .../smartgrid/job/PlugAveragingJob.scala | 4 +-- .../smartgrid/model/SensorEvent.scala | 6 ++-- .../projects/smartgrid/model/Slice.scala | 2 +- .../smartgrid/pipeline/DataCleanser.scala | 2 +- .../smartgrid/pipeline/SourceChooser.scala | 7 +++-- .../HBaseOutputFormatAverageWithKey.scala | 2 +- .../smartgrid/sources/FileSource.scala | 28 ++++++++++++++----- 10 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/main/resources/META-INF/MANIFEST.MF b/src/main/resources/META-INF/MANIFEST.MF index f7055b0..c89a621 100644 --- a/src/main/resources/META-INF/MANIFEST.MF +++ b/src/main/resources/META-INF/MANIFEST.MF @@ -1,4 +1,3 @@ Manifest-Version: 1.0 -Main-Class: com.bhaskardivya.projects.smartgrid.SmartGridProcessorFrom - File +Main-Class: com.bhaskardivya.projects.smartgrid.job.PlugAveragingJob diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index 486c073..8d9497b 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -144,7 +144,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { * Helper function to create median MapState for each window duration * @param averageWithKeyStream DataStream of AverageWithKey */ - def createMedianState(averageWithKeyStream: DataStream[AverageWithKey]) = { + def createMedianState(averageWithKeyStream: DataStream[AverageWithKey]): Unit = { // Streams for each window duration for the average windowDurations.foreach(duration => { @@ -166,7 +166,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { }) } - def createAverageStream(params: ParameterTool, duration: Int, sourceStream: DataStream[AverageWithKey]) = { + def createAverageStream(params: ParameterTool, duration: Int, sourceStream: DataStream[AverageWithKey]): DataStream[AverageWithKey] = { val windowed_average = sourceStream .keyBy(_.key) @@ -179,7 +179,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { windowed_average } - def createPredictionStream(params: ParameterTool, duration: Int, averageStream: DataStream[AverageWithKey]) = { + def createPredictionStream(params: ParameterTool, duration: Int, averageStream: DataStream[AverageWithKey]): DataStream[Prediction] = { val windowed_prediction: DataStream[Prediction] = averageStream .keyBy(_.key) .flatMap(new EnrichMapper(getStateName(duration))) @@ -200,5 +200,5 @@ abstract class SensorEventAveragingJobBase extends Serializable { } } - def getStateName(duration: Int) = "median-" + duration + "min" + def getStateName(duration: Int): String = "median-" + duration + "min" } \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/job/HouseAveragingJob.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/job/HouseAveragingJob.scala index 9254f60..f2a83cf 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/job/HouseAveragingJob.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/job/HouseAveragingJob.scala @@ -12,7 +12,7 @@ object HouseAveragingJob extends SensorEventAveragingJobBase with Serializable{ override def getTargetColumnFamily(): String = Constants.HOUSE_CF - override def initializeFlow(dataStream: DataStream[SensorEvent]) = { + override def initializeFlow(dataStream: DataStream[SensorEvent]): DataStream[SensorEvent] = { // Sum the values of all the plugs in a house with the same time stamp dataStream .keyBy("house_id", "timestamp") diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob.scala index 57bd49f..8e7671e 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob.scala @@ -12,12 +12,12 @@ object PlugAveragingJob extends SensorEventAveragingJobBase with Serializable{ override def getTargetColumnFamily(): String = Constants.PLUG_CF - override def initializeFlow(dataStream: DataStream[SensorEvent]) = { + override def initializeFlow(dataStream: DataStream[SensorEvent]): DataStream[SensorEvent] = { // De-duplicate values with the same timestamp for a given plug of a given house dataStream .filter(_.property == Constants.PROPERTY_LOAD) .keyBy("house_id", "household_id" , "plug_id", "timestamp") - .reduce((a,b) => b) + .reduce((_,b) => b) .name("De-duplicated Raw stream") } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala index a27520b..2a8a6fe 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala @@ -16,7 +16,7 @@ import org.apache.sling.commons.json.JSONObject */ case class SensorEvent(var id: Long,var timestamp: Long, var value: Double, var property: Int, var plug_id: Long, var household_id: Long, var house_id: Long) extends JSONTrait { - def adjustEventTimestamp(millis: Long) = { + def adjustEventTimestamp(millis: Long): Unit = { this.timestamp = this.timestamp + (millis / 1000) } @@ -57,7 +57,7 @@ object SensorEvent { new SensorEvent(id, timestamp, value, property, plug_id, household_id, house_id) } catch { - case e: Exception => null + case _ => null } } @@ -70,7 +70,7 @@ object SensorEvent { def tsAssigner(): BoundedOutOfOrdernessTimestampExtractor[SensorEvent] = { new BoundedOutOfOrdernessTimestampExtractor[SensorEvent](MAX_DELAY) { - override def extractTimestamp(element: SensorEvent) = element.getTimeMillis() + override def extractTimestamp(element: SensorEvent): Long = element.getTimeMillis() } } } \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala index 5e79b17..160e568 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala @@ -12,7 +12,7 @@ import scala.collection.immutable * @param timestamp Event timestamp from the record in seconds */ case class Slice(size: Time)(var timestamp: Long) { - private val seconds_in_Day = (24*60*60) + private val seconds_in_Day = 24 * 60 * 60 def size_in_seconds: Long = size.toMilliseconds / 1000 diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/pipeline/DataCleanser.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/pipeline/DataCleanser.scala index 498c698..6d94286 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/pipeline/DataCleanser.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/pipeline/DataCleanser.scala @@ -5,7 +5,7 @@ import org.apache.flink.streaming.api.scala._ object DataCleanser { def clean(stream: DataStream[List[String]]): DataStream[List[String]] = { stream - .filter(_.length == 7) // There should be 7 fields in a record + .filter(_.lengthCompare(7) == 0) // There should be 7 fields in a record .filter(row => row(3).toInt == 0 && row(3).toInt == 1) // 'property' can have either 0 or 1 value } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/pipeline/SourceChooser.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/pipeline/SourceChooser.scala index 386be00..9c385b1 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/pipeline/SourceChooser.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/pipeline/SourceChooser.scala @@ -1,7 +1,7 @@ package com.bhaskardivya.projects.smartgrid.pipeline import com.bhaskardivya.projects.smartgrid.model.SensorEvent -import com.bhaskardivya.projects.smartgrid.sources.KafkaSource +import com.bhaskardivya.projects.smartgrid.sources.{FileSource, KafkaSource} import org.apache.flink.api.java.utils.ParameterTool import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment} @@ -14,8 +14,9 @@ object SourceChooser { source match { case "kafka" => new KafkaSource().getSource(env, params) - /*case "file" => new FileSource().getSource(env, params) - case _ => new FileSource().getSource(env, params)*/ + case "file" => new FileSource().getSource(env, params) + case "simulated" => new FileSource().getSimulatedCSVSource(env, params) + case _ => new FileSource().getSource(env, params) } } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/HBaseOutputFormatAverageWithKey.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/HBaseOutputFormatAverageWithKey.scala index 09d529a..193ff10 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/HBaseOutputFormatAverageWithKey.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/sinks/HBaseOutputFormatAverageWithKey.scala @@ -48,7 +48,7 @@ class HBaseOutputFormatAverageWithKey extends OutputFormat[AverageWithKey] { @throws[IOException] override def writeRecord(record: AverageWithKey): Unit = { - val startTime = System.currentTimeMillis(); + val startTime = System.currentTimeMillis() // Make sure that the rowkey is sorted by the average values val put = new Put(Bytes.toBytes(taskNumber + rowNumber) ++ record.bytesRowKey()) put.setDurability(Durability.SKIP_WAL) diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/FileSource.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/FileSource.scala index 2938a47..e3c4140 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/FileSource.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/sources/FileSource.scala @@ -1,18 +1,32 @@ package com.bhaskardivya.projects.smartgrid.sources -import org.apache.flink.api.java.io.TextInputFormat +import com.bhaskardivya.projects.smartgrid.model.SensorEvent import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.core.fs.Path -import org.apache.flink.streaming.api.functions.source.FileProcessingMode import org.apache.flink.streaming.api.scala._ class FileSource { - def getSource(env: StreamExecutionEnvironment, params: ParameterTool): DataStream[String] = { - val filePath: String = params.get("filePath", "/run/media/osboxes/Data/data/sorted100M.1s.hsum.csv") - val fileInterval: Long = params.getLong("fileInterval", 100) + def getSimulatedCSVSource(env: StreamExecutionEnvironment, params: ParameterTool): DataStream[SensorEvent] = { + val data = params.get("input", "/data/data.gz") + val maxServingDelay = params.getInt("maxServingDelay", 0) + val servingSpeedFactor = params.getFloat("servingSpeedFactor", 1f) + val offsetEventTimestamp = params.has("offsetEventTimestamp") - env.readFile(new TextInputFormat(new Path(filePath)), filePath, FileProcessingMode.PROCESS_ONCE, fileInterval) + val events = env.addSource(new CSVFileSource(data, maxServingDelay, servingSpeedFactor, offsetEventTimestamp)) + .name("CSV GZ File") + + events + } + + def getSource(env: StreamExecutionEnvironment, params: ParameterTool): DataStream[SensorEvent] = { + val filePath: String = params.get("input", "/data/data.gz") + + // read the CSV GZ and assign Timestamp + val csv: DataStream[SensorEvent] = env.readTextFile(filePath) + .map[SensorEvent](line => SensorEvent.fromString(line)) + .assignTimestampsAndWatermarks(SensorEvent.tsAssigner()) + + csv } } From a6a04b6d0db2377d0b1f0e7969a5323db0a7ff66 Mon Sep 17 00:00:00 2001 From: koldbyte Date: Wed, 4 Apr 2018 23:32:19 +0530 Subject: [PATCH 32/32] Cleanups + Refactorings + Bug Fixes Added a new job for just one window duration --- .../base/PredictionJobSingleWindowBase.scala | 188 ++++++++++++++++++ .../base/SensorEventAveragingJobBase.scala | 57 ++++-- .../smartgrid/job/PlugAveragingJob5Min.scala | 21 ++ .../projects/smartgrid/model/Constants.scala | 2 +- .../projects/smartgrid/model/Prediction.scala | 34 +++- .../smartgrid/model/SensorEvent.scala | 12 +- .../projects/smartgrid/model/Slice.scala | 6 + .../smartgrid/model/WorkValueFlatMap.scala | 8 +- .../smartgrid/operators/EnrichMapper.scala | 11 +- .../operators/MedianWithKeyMapper.scala | 9 +- .../operators/WorkValueProcessWindow.scala | 1 + 11 files changed, 313 insertions(+), 36 deletions(-) create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/base/PredictionJobSingleWindowBase.scala create mode 100644 src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob5Min.scala diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/PredictionJobSingleWindowBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/PredictionJobSingleWindowBase.scala new file mode 100644 index 0000000..f41399c --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/PredictionJobSingleWindowBase.scala @@ -0,0 +1,188 @@ +package com.bhaskardivya.projects.smartgrid.base + +import java.io.File + +import com.bhaskardivya.projects.smartgrid.model._ +import com.bhaskardivya.projects.smartgrid.operators._ +import com.bhaskardivya.projects.smartgrid.pipeline._ +import com.bhaskardivya.projects.smartgrid.sinks.ElasticSearchSink +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.api.java.utils.ParameterTool +import org.apache.flink.core.fs.FileSystem +import org.apache.flink.streaming.api.TimeCharacteristic +import org.apache.flink.streaming.api.scala.{DataStream, _} +import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, TumblingEventTimeWindows} +import org.apache.flink.streaming.api.windowing.time.Time + +/** + * Abstract class that represents the Main Job that calculates the + * averages and medians which are being used here itself to predict + * the load forecast + */ +abstract class PredictionJobSingleWindowBase extends Serializable { + + //register implicits for types + implicit val typeInfoAverageWithKey: TypeInformation[AverageWithKey] = TypeInformation.of(classOf[AverageWithKey]) + implicit val typeInfoPrediction2: TypeInformation[Prediction] = TypeInformation.of(classOf[Prediction]) + implicit val typeInfoTime: TypeInformation[Time] = TypeInformation.of(classOf[Time]) + + private val LOG_DIR = "/data/" + getKeyName() + + /** + * Method that returns the name of the key in the source datum + * + * @return String key name + */ + def getKeyName(): String + + /** + * Method that returns the value of the key in the source datum + * + * @param element SensorEvent record + * @return Long Key value + */ + def getKey(element: SensorEvent): SensorKeyObject + + object keyGetter extends AbstractKeyGetter { + def apply(element: SensorEvent): SensorKeyObject = { + getKey(element) + } + } + + /** + * Method to prepare the raw events properly aggregated(sum) based on the key + * @param dataStream source raw data stream + * @return + */ + def initializeFlow(dataStream: DataStream[SensorEvent]): DataStream[SensorEvent] + + // Sliding window durations in minutes + val windowDurations = List(5) + + def main(args: Array[String]): Unit = { + + //Create the log dirs + try { + new File(LOG_DIR).mkdir() + new File(LOG_DIR + "/state/").mkdir() + new File(LOG_DIR + "/input/").mkdir() + new File(LOG_DIR + "/output_avg/").mkdir() + new File(LOG_DIR + "/output_prediction/").mkdir() + } catch { + case e: Exception => println("Directories already created" + e.getMessage) + } + + // parse parameters + val params = ParameterTool.fromArgs(args) + + // Initialise the environment for flink + val env = StreamExecutionEnvironment.getExecutionEnvironment + + // will be using the timestamp from the records + env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) + + //env.enableCheckpointing(10000) // checkpoint every 10000 msecs + //env.setStateBackend(new FsStateBackend(LOG_DIR +"/state/")) + + // Get the stream according to params + val rawStream: DataStream[SensorEvent] = SourceChooser.from(env, params).name("Sensor Source with Timestamp") + + // Create a stream with sum according to the key specified + val initializedFlow = if (params.has("deduplicate")) initializeFlow(rawStream) else rawStream + + if(params.getBoolean("sink.raw", false)) + initializedFlow + .addSink(ElasticSearchSink[SensorEvent](params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) + .name("Sensor Raw to ES") + + // Create a Global Window for work values which will output the missing load values + val averageUsingWorkValues: DataStream[AverageWithKey] = rawStream + .filter(_.property == Constants.PROPERTY_WORK) + .keyBy(e => getKey(e)) + .countWindow(2, 1) + .process(new WorkValueProcessWindow) + .flatMap(new WorkValueFlatMap(60)(keyGetter)) //60 seconds + + val averageWithKeys = initializedFlow + .map(e => AverageWithKey(keyGetter(e), Slice(Time.minutes(1))(e.timestamp), Average(e.value, 1))) + // Assumption: If both the Load values and work Values are available in a slice, + // Average of them will still be closer to the real measurement + .union(averageUsingWorkValues) + + //create median states for various window duration + createMedianState(averageWithKeys) + + // Create the average sliding window stream and corresponding Prediction Stream + val windowed_average_5min = createAverageStream(params, 5, averageWithKeys) + if(params.getBoolean("sink.5min", false)) { + val prediction_5min = createPredictionStream(params, 5, windowed_average_5min) + createPredictionSink(params, prediction_5min, Constants.ES_INDEX_TYPE_5MIN) + } + + env.execute("Sensor Event" + getKeyName() + " Prediction Job") + } + + /** + * Helper function to create median MapState for each window duration + * @param averageWithKeyStream DataStream of AverageWithKey + */ + def createMedianState(averageWithKeyStream: DataStream[AverageWithKey]): Unit = { + + // Streams for each window duration for the average + windowDurations.foreach(duration => { + + val stateName = getStateName(duration) + + val avg_windowed = averageWithKeyStream + .keyBy(_.key) + .window(TumblingEventTimeWindows.of(Time.minutes(duration))) + .reduce(new AverageWithKeyReducer) + .name(getKeyName() + " Average for " + duration + " min Tumbling Window") + + // Store median as operator state + avg_windowed + .keyBy(_.key) + .flatMap(new MedianWithKeyMapper(stateName)) + .name(getKeyName() + " Median state for " + duration + " min Tumbling Window") + + }) + } + + def createAverageStream(params: ParameterTool, duration: Int, sourceStream: DataStream[AverageWithKey]): DataStream[AverageWithKey] = { + + val slidingInterval = params.getInt("sliding.interval", Constants.SLIDING_INTERVAL) + + val windowed_average = sourceStream + .keyBy(_.key) + // Map the Correct Slice duration + .map(e => AverageWithKey(e.key, Slice(Time.minutes(duration))(e.slice.timestamp), e.average)) + .keyBy(_.key) + .window(SlidingEventTimeWindows.of(Time.minutes(duration), Time.seconds(slidingInterval))) + .reduce(new AverageWithKeyReducer) + + windowed_average + } + + def createPredictionStream(params: ParameterTool, duration: Int, averageStream: DataStream[AverageWithKey]): DataStream[Prediction] = { + val windowed_prediction: DataStream[Prediction] = averageStream + .keyBy(_.key) + .flatMap(new EnrichMapper(getStateName(duration))) + .name(getKeyName() + " Prediction values for " + duration + " min") + + windowed_prediction + } + + def createPredictionSink(params: ParameterTool, windowed_prediction: DataStream[Prediction], indexType: String) = { + // Sink the Predicted value streams to Elasticsearch + windowed_prediction + .addSink(ElasticSearchSink[Prediction](params, Constants.ES_INDEX_NAME, indexType)) + .name(getKeyName() + " Prediction Sink - ES - " + indexType + " min Window") + + // Write to file for debug + if(params.has("debug")){ + windowed_prediction.writeAsText(LOG_DIR + "/output_prediction/windowed"+ indexType +".csv", FileSystem.WriteMode.OVERWRITE) + } + } + + def getStateName(duration: Int): String = "median-" + duration + "min" +} \ No newline at end of file diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala index 8d9497b..1277850 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/base/SensorEventAveragingJobBase.scala @@ -25,6 +25,7 @@ abstract class SensorEventAveragingJobBase extends Serializable { //register implicits for types implicit val typeInfoAverageWithKey: TypeInformation[AverageWithKey] = TypeInformation.of(classOf[AverageWithKey]) implicit val typeInfoPrediction2: TypeInformation[Prediction] = TypeInformation.of(classOf[Prediction]) + implicit val typeInfoTime: TypeInformation[Time] = TypeInformation.of(classOf[Time]) private val LOG_DIR = "/data/" + getKeyName() @@ -97,45 +98,57 @@ abstract class SensorEventAveragingJobBase extends Serializable { val averageUsingWorkValues: DataStream[AverageWithKey] = rawStream .filter(_.property == Constants.PROPERTY_WORK) .keyBy(e => getKey(e)) - .countWindow(2,1) + .countWindow(2, 1) .process(new WorkValueProcessWindow) - .flatMap(new WorkValueFlatMap(Time.minutes(1))(keyGetter)) + .flatMap(new WorkValueFlatMap(60)(keyGetter)) //60 seconds // Create a stream with sum according to the key specified - val initializedFlow = initializeFlow(rawStream) + val initializedFlow = if (params.has("deduplicate")) initializeFlow(rawStream) else rawStream + val averageWithKeys = initializedFlow - .map(e => AverageWithKey(keyGetter(e), Slice(Time.minutes(1))(e.timestamp), Average(e.value, 1))) - // Assumption: If both the Load values and work Values are available in a slice, - // Average of them will still be closer to the real measurement - .union(averageUsingWorkValues) + .map(e => AverageWithKey(keyGetter(e), Slice(Time.minutes(1))(e.timestamp), Average(e.value, 1))) + // Assumption: If both the Load values and work Values are available in a slice, + // Average of them will still be closer to the real measurement + .union(averageUsingWorkValues) //create median states for various window duration createMedianState(averageWithKeys) // Create the average sliding window stream and corresponding Prediction Stream val windowed_average_1min = createAverageStream(params, 1, averageWithKeys) - val prediction_1min = createPredictionStream(params, 1, windowed_average_1min) - createPredictionSink(params, prediction_1min, Constants.ES_INDEX_TYPE_1MIN) + if (params.getBoolean("sink.1min", false)) { + val prediction_1min = createPredictionStream(params, 1, windowed_average_1min) + createPredictionSink(params, prediction_1min, Constants.ES_INDEX_TYPE_1MIN) + } val windowed_average_5min = createAverageStream(params, 5, windowed_average_1min) - val prediction_5min = createPredictionStream(params, 5, windowed_average_5min) - createPredictionSink(params, prediction_5min, Constants.ES_INDEX_TYPE_5MIN) + if(params.getBoolean("sink.5min", false)) { + val prediction_5min = createPredictionStream(params, 5, windowed_average_5min) + createPredictionSink(params, prediction_5min, Constants.ES_INDEX_TYPE_5MIN) + } val windowed_average_15min = createAverageStream(params, 15, windowed_average_5min) - val prediction_15min = createPredictionStream(params, 15, windowed_average_15min) - createPredictionSink(params, prediction_15min, Constants.ES_INDEX_TYPE_15MIN) + if(params.getBoolean("sink.15min", false)) { + val prediction_15min = createPredictionStream(params, 15, windowed_average_15min) + createPredictionSink(params, prediction_15min, Constants.ES_INDEX_TYPE_15MIN) + } val windowed_average_60min = createAverageStream(params, 60, windowed_average_15min) - val prediction_60min = createPredictionStream(params, 60, windowed_average_60min) - createPredictionSink(params, prediction_60min, Constants.ES_INDEX_TYPE_60MIN) + if(params.getBoolean("sink.60min", false)) { + val prediction_60min = createPredictionStream(params, 60, windowed_average_60min) + createPredictionSink(params, prediction_60min, Constants.ES_INDEX_TYPE_60MIN) + } val windowed_average_120min = createAverageStream(params, 120, windowed_average_60min) - val prediction_120min = createPredictionStream(params, 120, windowed_average_120min) - createPredictionSink(params, prediction_120min, Constants.ES_INDEX_TYPE_120MIN) + if(params.getBoolean("sink.120min", false)) { + val prediction_120min = createPredictionStream(params, 120, windowed_average_120min) + createPredictionSink(params, prediction_120min, Constants.ES_INDEX_TYPE_120MIN) + } - initializedFlow - .addSink(ElasticSearchSink[SensorEvent](params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) - .name("Sensor Raw to ES") + if(params.getBoolean("sink.raw", false)) + initializedFlow + .addSink(ElasticSearchSink[SensorEvent](params, Constants.ES_INDEX_NAME, Constants.ES_INDEX_TYPE_RAW)) + .name("Sensor Raw to ES") env.execute("Sensor Event" + getKeyName() + " Prediction Job") } @@ -168,12 +181,14 @@ abstract class SensorEventAveragingJobBase extends Serializable { def createAverageStream(params: ParameterTool, duration: Int, sourceStream: DataStream[AverageWithKey]): DataStream[AverageWithKey] = { + val slidingInterval = params.getInt("sliding.interval", Constants.SLIDING_INTERVAL) + val windowed_average = sourceStream .keyBy(_.key) // Map the Correct Slice duration .map(e => AverageWithKey(e.key, Slice(Time.minutes(duration))(e.slice.timestamp), e.average)) .keyBy(_.key) - .window(SlidingEventTimeWindows.of(Time.minutes(duration), Time.seconds(Constants.SLIDING_INTERVAL))) + .window(SlidingEventTimeWindows.of(Time.minutes(duration), Time.seconds(slidingInterval))) .reduce(new AverageWithKeyReducer) windowed_average diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob5Min.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob5Min.scala new file mode 100644 index 0000000..378715c --- /dev/null +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/job/PlugAveragingJob5Min.scala @@ -0,0 +1,21 @@ +package com.bhaskardivya.projects.smartgrid.job + +import com.bhaskardivya.projects.smartgrid.base.PredictionJobSingleWindowBase +import com.bhaskardivya.projects.smartgrid.model.{Constants, SensorEvent, SensorKeyObject} +import org.apache.flink.streaming.api.scala.DataStream + +object PlugAveragingJob5Min extends PredictionJobSingleWindowBase with Serializable{ + + override def getKeyName(): String = "Plug" + + override def getKey(element: SensorEvent): SensorKeyObject = SensorKeyObject(element.house_id, element.household_id, element.plug_id) + + override def initializeFlow(dataStream: DataStream[SensorEvent]): DataStream[SensorEvent] = { + // De-duplicate values with the same timestamp for a given plug of a given house + dataStream + .filter(_.property == Constants.PROPERTY_LOAD) + .keyBy("house_id", "household_id" , "plug_id", "timestamp") + .reduce((_,b) => b) + .name("De-duplicated Raw stream") + } +} diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala index 83531d4..4a36a72 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Constants.scala @@ -36,6 +36,6 @@ object Constants { val TDIGEST_COMPRESSION = 100 // This is the base timestamp ... the epoch of the world with flink - val BASE_TIMESTAMP = 0L // TODO: currently set 0 to align with Unix Epoch + val BASE_TIMESTAMP = 1377900000L // TODO: currently set 0 to align with Unix Epoch } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala index d166812..9cddc00 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Prediction.scala @@ -6,7 +6,7 @@ import org.apache.sling.commons.json.JSONObject case class Prediction(var averageWithKey: AverageWithKey, var medianLoad: MedianLoad, var predictedLoad: Double) extends JSONTrait { def toJSONString(): String = { - toJSON().toString() + toJSON2().toString() } def toJSON(): JSONObject = { @@ -52,4 +52,36 @@ case class Prediction(var averageWithKey: AverageWithKey, var medianLoad: Median dbl } } + + def toJSON2(): JSONObject = { + //main object + val json = new JSONObject() + + //averageWithKey object + val averageWithKeyJSON = averageWithKey.key.toJSON + averageWithKeyJSON.put("sum", normalise_double(averageWithKey.average.sum)) + averageWithKeyJSON.put("count", averageWithKey.average.count) + averageWithKeyJSON.put("avg", normalise_double(averageWithKey.averageValue)) + averageWithKeyJSON.put("eventTimestamp", averageWithKey.slice.timestamp) + json.put("averageWithKey", averageWithKeyJSON) + + //medianLoad + val medianLoadJSON = new JSONObject() + medianLoadJSON.put("load", normalise_double(medianLoad.load)) + json.put("medianLoad", medianLoadJSON) + + //sliding window duration + json.put("slidingWindowDuration", averageWithKey.slice.size.toMilliseconds) + json.put("slice-start", averageWithKey.slice.ts_start) + json.put("slice-stop", averageWithKey.slice.ts_stop) + + //Predicted value + json.put("predictedValue", normalise_double(predictedLoad)) + json.put("predicted-slice-start", averageWithKey.slice.predicting_for_slice.ts_start) + + // Current Time + json.put("current-timestamp", System.currentTimeMillis) + + json + } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala index 2a8a6fe..b839a40 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/SensorEvent.scala @@ -31,7 +31,7 @@ case class SensorEvent(var id: Long,var timestamp: Long, var value: Double, var val json: JSONObject = new JSONObject() json.put("id", this.id) json.put("timestamp", this.timestamp) - json.put("value", this.value) + json.put("value", normalise_double(this.value)) json.put("property", this.property) json.put("plug_id", this.plug_id) json.put("household_id", this.household_id) @@ -39,6 +39,14 @@ case class SensorEvent(var id: Long,var timestamp: Long, var value: Double, var json } + + def normalise_double(dbl: Double): Double = { + if(dbl < 1e-6) { + 0.000001 + }else{ + dbl + } + } } object SensorEvent { @@ -57,7 +65,7 @@ object SensorEvent { new SensorEvent(id, timestamp, value, property, plug_id, household_id, house_id) } catch { - case _ => null + case _ : Throwable => null } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala index 160e568..ef991bd 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/Slice.scala @@ -36,6 +36,7 @@ case class Slice(size: Time)(var timestamp: Long) { def stop_time_of_day: Long = ts_stop % seconds_in_Day def predicting_for_time_of_day: Long = (start_time_of_day + 2*size_in_seconds) % seconds_in_Day + def predicting_previous_slice: Long = (start_time_of_day - 2*size_in_seconds + seconds_in_Day) % seconds_in_Day def predicting_for_slice: Slice = Slice(size)(ts_start + 2*size_in_seconds) override def toString : String = { @@ -62,4 +63,9 @@ object Slice { Slice(size)(event_timestamp) } + + def from(size_in_seconds: Long)(sliceIndex: Long) : Slice = { + from(Time.seconds(size_in_seconds))(sliceIndex) + } + } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala index a70eac7..bcae404 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/model/WorkValueFlatMap.scala @@ -5,7 +5,7 @@ import org.apache.flink.api.common.functions.FlatMapFunction import org.apache.flink.streaming.api.windowing.time.Time import org.apache.flink.util.Collector -class WorkValueFlatMap(size: Time)(keyGetter: AbstractKeyGetter) extends FlatMapFunction[TwoWorkEvents, AverageWithKey]{ +class WorkValueFlatMap(size_in_seconds: Long)(keyGetter: AbstractKeyGetter) extends FlatMapFunction[TwoWorkEvents, AverageWithKey]{ override def flatMap(value: TwoWorkEvents, out: Collector[AverageWithKey]): Unit = { if(!value.isValid) @@ -26,7 +26,7 @@ class WorkValueFlatMap(size: Time)(keyGetter: AbstractKeyGetter) extends FlatMap slice_range.foreach ( slice_index => { - val averageWithKey = AverageWithKey(sensorKeyObject, Slice.from(size)(slice_index), average) + val averageWithKey = AverageWithKey(sensorKeyObject, Slice.from(size_in_seconds)(slice_index), average) out.collect(averageWithKey) } ) @@ -34,8 +34,8 @@ class WorkValueFlatMap(size: Time)(keyGetter: AbstractKeyGetter) extends FlatMap } def getSliceRange(value: TwoWorkEvents) = { - val slice_range_a = Slice(size)(value.sensorEvent1.timestamp).i - val slice_range_b = Slice(size)(value.sensorEvent2.timestamp).i + val slice_range_a = Slice(Time.seconds(size_in_seconds))(value.sensorEvent1.timestamp).i + val slice_range_b = Slice(Time.seconds(size_in_seconds))(value.sensorEvent2.timestamp).i if(slice_range_a > slice_range_b){ slice_range_b to slice_range_a diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala index 59c7755..ae24aaa 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/EnrichMapper.scala @@ -29,7 +29,11 @@ class EnrichMapper(stateName: String) extends RichFlatMapFunction[AverageWithKey // Get the Median Load Value val medianLoad = if(currentDigest == null) { - value.averageValue + val previousDigest = digest.get(value.slice.predicting_previous_slice) + if(previousDigest == null) + value.averageValue + else + previousDigest.quantile(0.5) }else{ currentDigest.quantile(0.5) } @@ -38,7 +42,7 @@ class EnrichMapper(stateName: String) extends RichFlatMapFunction[AverageWithKey val prediction = (value.averageValue + medianLoad) / 2.0 // Update the Slice index for prediction - value.slice = value.slice.predicting_for_slice + //TODO: Testing //value.slice = value.slice.predicting_for_slice // Create the final Prediction Object to be collected if (prediction2 == null) { @@ -49,6 +53,7 @@ class EnrichMapper(stateName: String) extends RichFlatMapFunction[AverageWithKey prediction2.predictedLoad = prediction } - out.collect(prediction2) + if(prediction > 1e-6) // + out.collect(prediction2) } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala index 240cb9f..5adbd02 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/MedianWithKeyMapper.scala @@ -29,10 +29,11 @@ class MedianWithKeyMapper(stateName: String) extends RichFlatMapFunction[Average currentDigest = TDigest.createDigest(Constants.TDIGEST_COMPRESSION) } - currentDigest.add(key) - - digest.put(key, currentDigest) + if(!value.average.avg.isNaN) { + currentDigest.add(value.average.avg) + digest.put(key, currentDigest) + } - out.collect(value) + //out.collect(value) //Dont emit as no further processing is required on this stream } } diff --git a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/WorkValueProcessWindow.scala b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/WorkValueProcessWindow.scala index a0ba403..92dd193 100644 --- a/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/WorkValueProcessWindow.scala +++ b/src/main/scala/com/bhaskardivya/projects/smartgrid/operators/WorkValueProcessWindow.scala @@ -10,6 +10,7 @@ class WorkValueProcessWindow extends ProcessWindowFunction[SensorEvent, TwoWorkE elements match { case x: Iterable[SensorEvent] if x.size == 2 => out.collect(TwoWorkEvents(x.toList(0), x.toList(1))) + case _ => println("Encountered non-pair CountWindow Work") } } }