# Makefile for Spark FollowerCount project.
# Customize these paths for your environment.
# -----------------------------------------------------------
spark.root=/home/prdx/spark-2.4.0-bin-without-hadoop
hadoop.root=/home/prdx/hadoop-2.9.2
app.name=Project
jar.name=project.jar
maven.jar.name=project-1.0.jar
job.name=project.ProjectImpl
job.min.support=0.9
local.master=local[4]
local.input=input
local.output=output
# Pseudo-Cluster Execution
hdfs.user.name=prdx
hdfs.input=input
hdfs.output=output
# AWS EMR Execution
aws.emr.release=emr-5.21.0
aws.bucket.name=mr-prj-son
aws.subnet.id=subnet-6356553a
aws.input=input
aws.output=output
aws.log.dir=log
aws.min.support=0.0001
aws.num.nodes=10
aws.instance.type=m4.large
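# Any of the settings above can be overridden on the make command line
# without editing this file, e.g.:
#   make local job.min.support=0.5 local.input=sample-input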
# -----------------------------------------------------------
# Compiles code and builds jar (with dependencies).
jar:
mvn clean package
cp target/${maven.jar.name} ${jar.name}
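# Optional sanity check (assumes the JDK's jar tool is on the PATH):
# confirm the driver class made it into the jar before submitting, e.g.:
#   jar tf project.jar | grep ProjectImpl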
# Removes local output directory.
clean-local-output:
rm -rf ${local.output}
# Runs standalone
local: jar clean-local-output
spark-submit --class ${job.name} --master ${local.master} --name "${app.name}" ${jar.name} ${local.input} ${local.output} ${job.min.support}
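# The jar takes three positional arguments: input dir, output dir, and the
# minimum support threshold. Example run with a different threshold:
#   make local job.min.support=0.5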
# Start HDFS
start-hdfs:
${hadoop.root}/sbin/start-dfs.sh
# Stop HDFS
stop-hdfs:
${hadoop.root}/sbin/stop-dfs.sh
# Start YARN
start-yarn: stop-yarn
${hadoop.root}/sbin/start-yarn.sh
# Stop YARN
stop-yarn:
${hadoop.root}/sbin/stop-yarn.sh
# Reformats & initializes HDFS.
format-hdfs: stop-hdfs
rm -rf /tmp/hadoop*
${hadoop.root}/bin/hdfs namenode -format
# Initializes user & input directories of HDFS.
init-hdfs: start-hdfs
${hadoop.root}/bin/hdfs dfs -rm -r -f /user
${hadoop.root}/bin/hdfs dfs -mkdir /user
${hadoop.root}/bin/hdfs dfs -mkdir /user/${hdfs.user.name}
${hadoop.root}/bin/hdfs dfs -mkdir /user/${hdfs.user.name}/${hdfs.input}
# Load data to HDFS
upload-input-hdfs: start-hdfs
${hadoop.root}/bin/hdfs dfs -put ${local.input}/* /user/${hdfs.user.name}/${hdfs.input}
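# Optional check that the input landed in HDFS (same path used by -put above):
#   ${hadoop.root}/bin/hdfs dfs -ls /user/${hdfs.user.name}/${hdfs.input}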
# Removes hdfs output directory.
clean-hdfs-output:
${hadoop.root}/bin/hdfs dfs -rm -r -f ${hdfs.output}*
# Download output from HDFS to local.
download-output-hdfs:
mkdir ${local.output}
${hadoop.root}/bin/hdfs dfs -get ${hdfs.output}/* ${local.output}
# Runs pseudo-clustered (ALL). ONLY RUN THIS ONCE, THEN USE: make pseudoq
# Make sure Hadoop is set up (in /etc/hadoop files) for pseudo-clustered operation (not standalone).
# https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation
pseudo: jar stop-yarn format-hdfs init-hdfs upload-input-hdfs start-yarn clean-local-output
	spark-submit --class ${job.name} --master yarn --deploy-mode cluster ${jar.name} ${local.input} ${local.output} ${job.min.support}
make download-output-hdfs
# Runs pseudo-clustered (quickie).
pseudoq: jar clean-local-output clean-hdfs-output
	spark-submit --class ${job.name} --master yarn --deploy-mode cluster ${jar.name} ${local.input} ${local.output} ${job.min.support}
make download-output-hdfs
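# While a pseudo-clustered job is running, its progress can be checked with,
# e.g.:
#   ${hadoop.root}/bin/yarn application -list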
# Create S3 bucket.
make-bucket:
aws s3 mb s3://${aws.bucket.name}/${aws.num.nodes}
# Upload data to S3 input dir.
upload-input-aws: make-bucket
aws s3 sync ${local.input} s3://${aws.bucket.name}/${aws.num.nodes}/${aws.input}
# Delete S3 output dir.
delete-output-aws:
aws s3 rm s3://${aws.bucket.name}/${aws.num.nodes}/ --recursive --exclude "*" --include "${aws.output}*"
# Upload application to S3 bucket.
upload-app-aws:
aws s3 cp ${jar.name} s3://${aws.bucket.name}/${aws.num.nodes}/code/
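# Optional check that the jar reached the bucket, e.g.:
#   aws s3 ls s3://${aws.bucket.name}/${aws.num.nodes}/code/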
# Main EMR launch.
aws: jar upload-app-aws delete-output-aws
aws emr create-cluster \
--name "large SON sp.0001 RunAttempt" \
--release-label ${aws.emr.release} \
--instance-groups '[{"InstanceCount":${aws.num.nodes},"InstanceGroupType":"CORE","InstanceType":"${aws.instance.type}"},{"InstanceCount":1,"InstanceGroupType":"MASTER","InstanceType":"${aws.instance.type}"}]' \
--applications Name=Hadoop Name=Spark \
	--steps Type=CUSTOM_JAR,Name="${app.name}",Jar="command-runner.jar",ActionOnFailure=TERMINATE_CLUSTER,Args=["spark-submit","--deploy-mode","cluster","--class","${job.name}","s3://${aws.bucket.name}/${aws.num.nodes}/code/${jar.name}","s3://${aws.bucket.name}/${aws.num.nodes}/${aws.input}","s3://${aws.bucket.name}/${aws.num.nodes}/${aws.output}","${aws.min.support}"] \
--log-uri s3://${aws.bucket.name}/${aws.num.nodes}/${aws.log.dir} \
--use-default-roles \
--enable-debugging \
--auto-terminate
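# The cluster launches asynchronously; progress can be followed in the EMR
# console or with the AWS CLI, e.g.:
#   aws emr list-clusters --active
# Step and driver logs end up under the --log-uri prefix given above.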
# Download output from S3.
download-output-aws: clean-local-output
mkdir ${local.output}
aws s3 sync s3://${aws.bucket.name}/${aws.num.nodes}/${aws.output} ${local.output}
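# Spark writes its result as part-* files; they can be concatenated into a
# single local file if desired, e.g.:
#   cat output/part-* > results.txt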
# Change to standalone mode.
switch-standalone:
cp config/standalone/*.xml ${hadoop.root}/etc/hadoop
# Change to pseudo-cluster mode.
switch-pseudo:
cp config/pseudo/*.xml ${hadoop.root}/etc/hadoop
# Package for release.
distro:
rm -f Spark-Demo.tar.gz
rm -f Spark-Demo.zip
rm -rf build
mkdir -p build/deliv/Spark-Demo
cp -r src build/deliv/Spark-Demo
cp -r config build/deliv/Spark-Demo
cp -r input build/deliv/Spark-Demo
cp pom.xml build/deliv/Spark-Demo
cp Makefile build/deliv/Spark-Demo
cp README.txt build/deliv/Spark-Demo
tar -czf Spark-Demo.tar.gz -C build/deliv Spark-Demo
cd build/deliv && zip -rq ../../Spark-Demo.zip Spark-Demo