
Commit cb4842f

Author: cuidapeng
Message: first commit
0 parents · commit cb4842f

8 files changed: +142 -0 lines changed

.dockerignore

Lines changed: 2 additions & 0 deletions

```
.idea
.target
```

.gitignore

Lines changed: 4 additions & 0 deletions

```
.idea
src
project

```
Dockerfile

Lines changed: 4 additions & 0 deletions

```dockerfile
FROM gettyimages/spark:2.3.0-hadoop-2.8
RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
ADD jars /usr/spark-2.3.0/jars
ADD target/scala-2.11/ /usr/spark-2.3.0/
```

README.md

Lines changed: 51 additions & 0 deletions

### Use Spark SQL to load/store offsets with MySQL

Less complicated than a hand-rolled implementation of the SQL operations.

#### Offset store

```sql
mysql> select * from kfk_offset where datetime>'2019-01-09' and topic='task-response' and `group`='extract';
+----+---------------+---------+------+-----------+---------+---------+-------+---------------------+
| id | topic         | group   | step | partition | from    | until   | count | datetime            |
+----+---------------+---------+------+-----------+---------+---------+-------+---------------------+
|  1 | task-response | extract |    1 |         0 | 1959008 | 1995008 | 36000 | 2019-01-09 00:01:19 |
|  2 | task-response | extract |    1 |         1 | 1897546 | 1933546 | 36000 | 2019-01-09 00:01:19 |
|  0 | task-response | extract |    1 |         2 | 1876072 | 1912072 | 36000 | 2019-01-09 00:01:19 |
|  5 | task-response | extract |    2 |         0 | 1995008 | 2031008 | 36000 | 2019-01-09 00:05:05 |
|  7 | task-response | extract |    2 |         1 | 1933546 | 1969546 | 36000 | 2019-01-09 00:05:05 |
|  6 | task-response | extract |    2 |         2 | 1912072 | 1948072 | 36000 | 2019-01-09 00:05:05 |
+----+---------------+---------+------+-----------+---------+---------+-------+---------------------+
```

Each batch writes one row per partition at an incremented `step`; note how each partition's `until` at step 1 becomes its `from` at step 2.
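As a rough illustration of the load side, here is a minimal Scala sketch of reading these rows back and seeding a direct stream with them. It is not this repo's `com.github.cclient.spark.Stream` code: the JDBC URL, `ssc`, and `kafkaParams` are assumptions, and it uses plain JDBC against the `kfk_offset` table from doc/kfk_offset.sql.

```scala
import java.sql.DriverManager
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

// Hypothetical connection string; adjust to your environment.
val jdbcUrl = "jdbc:mysql://mysql:3306/offsets?user=spark&password=secret"

// Load each partition's latest stored `until` offset for a topic/group,
// i.e. the rows at the highest `step`.
def loadOffsets(topic: String, group: String): Map[TopicPartition, Long] = {
  val conn = DriverManager.getConnection(jdbcUrl)
  try {
    val ps = conn.prepareStatement(
      """SELECT `partition`, `until` FROM kfk_offset
        |WHERE `topic` = ? AND `group` = ?
        |  AND `step` = (SELECT MAX(`step`) FROM kfk_offset
        |                WHERE `topic` = ? AND `group` = ?)""".stripMargin)
    Seq(1, 3).foreach(ps.setString(_, topic))
    Seq(2, 4).foreach(ps.setString(_, group))
    val rs = ps.executeQuery()
    val acc = scala.collection.mutable.Map.empty[TopicPartition, Long]
    while (rs.next()) acc += new TopicPartition(topic, rs.getInt(1)) -> rs.getLong(2)
    acc.toMap
  } finally conn.close()
}

// Seed the direct stream so a restart (or a rollback that deleted later
// steps) resumes exactly where the table says. `ssc` is an existing
// StreamingContext and `kafkaParams` the usual consumer config (assumed).
val stream = KafkaUtils.createDirectStream[String, String](
  ssc,
  LocationStrategies.PreferConsistent,
  ConsumerStrategies.Subscribe[String, String](
    Seq("task-response"), kafkaParams, loadOffsets("task-response", "extract")))
```

Seeding `Subscribe` with an offset map is standard spark-streaming-kafka-0-10 usage; the only project-specific piece is where the map comes from.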
For my use case (extracting DOM/JSON from crawler responses), I need to roll back to the problem datetime and re-consume the records after it.

### Rollback

1. Kill the Spark consumer process.

2. Pinpoint the problem datetime, then delete the offset rows by datetime or step:

   ```sql
   delete from kfk_offset where `step`>1 and `topic`='task-response' and `group`='extract';
   ```

3. Restart the Spark consumer process. It resumes from the highest `step` left in the table, so consumption rewinds to the end of the last kept batch.
### Use

#### Develop

Copy the source code into your project, or copy spark-streaming-kafka-offset-mysql_2.11-0.1.jar into {project}/lib/.

#### Deploy

Run `sbt package`, then copy spark-streaming-kafka-offset-mysql_2.11-0.1.jar into $SPARK_HOME/jars/.

### Other

Upload the jar to a Maven repository to consume it as a regular dependency.
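Once published, consuming it would be a single sbt line. The coordinates below are hypothetical, guessed from the artifact name and the `com.github.cclient` package in deployment.yml; the README does not specify a group id.

```scala
// Hypothetical coordinates; not yet published to any public repository.
libraryDependencies += "com.github.cclient" %% "spark-streaming-kafka-offset-mysql" % "0.1"
```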

build.sbt

Lines changed: 17 additions & 0 deletions

```scala
name := "spark-streaming-kafka-offset-mysql"

version := "0.1"

scalaVersion := "2.11.8"

resolvers += "Local Maven Repository" at "file://" + Path.userHome.absolutePath + "/.m2/repository"

libraryDependencies ++= Seq(
  "org.scalactic" %% "scalactic" % "3.0.4" % "test",
  "org.scalatest" %% "scalatest" % "3.0.4" % "test",
  "mysql" % "mysql-connector-java" % "5.1.44",
  "org.apache.spark" % "spark-core_2.11" % "2.3.0",
  "org.apache.spark" % "spark-sql_2.11" % "2.3.0",
  "org.apache.spark" % "spark-streaming_2.11" % "2.3.0",
  "org.apache.spark" % "spark-streaming-kafka-0-10_2.11" % "2.3.0"
)
```

build.sh

Lines changed: 4 additions & 0 deletions

```sh
#!/bin/sh
sbt package
docker build -t cuidapeng/spark-offset:$1 ./
docker push cuidapeng/spark-offset:$1
```

deployment.yml

Lines changed: 47 additions & 0 deletions

```yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: spark-offset
  namespace: default
  labels:
    app: spark-offset
spec:
  replicas: 1
  selector:
    matchLabels:
      app: spark-offset
  template:
    metadata:
      labels:
        app: spark-offset
    spec:
      containers:
        - name: spark-offset
          imagePullPolicy: Always
          image: cuidapeng/spark-offset:v0.1
          command:
            - spark-submit
          args:
            - --driver-java-options
            - "-Duser.timezone=Asia/Shanghai"
            - --repositories
            - http://maven.aliyun.com/nexus/content/groups/public/
            - --packages
            - mysql:mysql-connector-java:5.1.27,org.apache.spark:spark-streaming_2.11:2.3.0,org.apache.spark:spark-streaming-kafka-0-10_2.11:2.3.0
            - --executor-memory
            - 2G
            - --driver-memory
            - 2G
            - --conf
            - "spark.streaming.kafka.maxRatePerPartition=100"
            - --class
            - com.github.cclient.spark.Stream
            - spark-streaming-kafka-offset-mysql_2.11-0.1.jar
            - prod
            - task-response
            - extract
          resources:
            requests:
              memory: "2Gi"
            limits:
              memory: "3Gi"
```

doc/kfk_offset.sql

Lines changed: 13 additions & 0 deletions

```sql
CREATE TABLE `kfk_offset` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `topic` varchar(45) NOT NULL,
  `group` varchar(45) NOT NULL,
  `step` int(11) NOT NULL DEFAULT '0',
  `partition` int(11) NOT NULL,
  `from` bigint(10) NOT NULL,
  `until` bigint(10) NOT NULL,
  `count` bigint(10) NOT NULL DEFAULT '0',
  `datetime` datetime NOT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `unique` (`topic`,`group`,`step`,`partition`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
```
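For the write side, here is a matching hedged sketch that persists one row per partition after each batch. The `step` counter semantics, JDBC URL, and `stream` (the DStream from the README sketch) are assumptions based on this schema and the README's sample output, not the repository's actual implementation.

```scala
import java.sql.DriverManager
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, OffsetRange}

val jdbcUrl = "jdbc:mysql://mysql:3306/offsets?user=spark&password=secret" // hypothetical

// Insert one kfk_offset row per partition for this batch; the unique key
// (topic, group, step, partition) rejects accidental double-writes.
def storeOffsets(topic: String, group: String, step: Int,
                 ranges: Array[OffsetRange]): Unit = {
  val conn = DriverManager.getConnection(jdbcUrl)
  try {
    val ps = conn.prepareStatement(
      """INSERT INTO kfk_offset
        |  (`topic`, `group`, `step`, `partition`, `from`, `until`, `count`, `datetime`)
        |VALUES (?, ?, ?, ?, ?, ?, ?, NOW())""".stripMargin)
    ranges.foreach { r =>
      ps.setString(1, topic); ps.setString(2, group)
      ps.setInt(3, step); ps.setInt(4, r.partition)
      ps.setLong(5, r.fromOffset); ps.setLong(6, r.untilOffset)
      ps.setLong(7, r.count) // untilOffset - fromOffset
      ps.addBatch()
    }
    ps.executeBatch()
  } finally conn.close()
}

// Typical wiring: read the ranges before any transformation, process,
// then record the new step. foreachRDD runs on the driver, so plain
// JDBC and the mutable counter are safe here.
var step = 1 // a real run would resume from MAX(`step`) + 1 after restart (assumption)
stream.foreachRDD { rdd =>
  val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... extract / process the batch here ...
  storeOffsets("task-response", "extract", step, ranges)
  step += 1
}
```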
