Commit: add verification options: Spark, YARN. Share run configs through VCS
@@ -0,0 +1,48 @@
""" | ||
This is the DAG that will show you one interesting specific of how Livy works | ||
in local mode vs in YARN mode (job running in cluster mode). | ||
This DAG runs an intentionally failing Livy batch. | ||
TODO doc | ||
""" | ||
from datetime import datetime

from airflow import DAG

try:
    # Import statement for Airflow when it loads new operators into airflow.operators
    from airflow.operators import LivyBatchOperator
except ImportError:
    # Import statement for IDE with the local folder structure
    from airflow_home.plugins.livy_batch_plugin import LivyBatchOperator

dag = DAG(
    "04_batch_example_failing",
    description="Running Spark jobs via Livy Batches, intentionally failing the job",
    schedule_interval=None,
    start_date=datetime(1970, 1, 1),
    catchup=False,
)

t1 = LivyBatchOperator(
    name="batch_example_failing_{{ run_id }}",
    file="file:///data/batches/join_2_files.py",
    py_files=["file:///data/batches/join_2_files.py"],
    arguments=[
        "file:///data/grades.csv",
        "file:///data/ssn-address.tsv",
        "-file1_sep=,",
        "-file1_header=true",
        "-file1_schema=`Last name` STRING, `First name` STRING, SSN STRING, "
        "Test1 INT, Test2 INT, Test3 INT, Test4 INT, Final INT, Grade STRING",
        "-file1_join_column=SSN",
        "-file2_header=false",
        "-file2_schema=`Last name` STRING, `First name` STRING, SSN STRING, "
        "Address1 STRING, Address2 STRING",
        "-file2_join_column=SSN",
        "-output_header=true",
        # Selecting a column that does not exist in either schema makes the job fail.
        "-output_columns=file1.Inexistent",
    ],
    conf={"spark.submit.deployMode": "cluster"},
    task_id="livy_batch_example_failing",
    dag=dag,
)
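The quirk the docstring hints at is, presumably, that Livy's batch state tracks the spark-submit process rather than the Spark job itself, so in cluster mode a failed job can still surface as a "successful" batch; the verify_in options used in the next two DAGs exist to close that gap. A minimal sketch of reading Livy's own batch state over its REST API (the endpoint URL and helper name are assumptions, not part of this commit):

    import requests

    LIVY_URL = "http://localhost:8998"  # assumed Livy endpoint, not from this commit

    def get_livy_batch_state(batch_id: int) -> str:
        """Return Livy's view of a batch, e.g. 'running', 'success' or 'dead'."""
        response = requests.get(f"{LIVY_URL}/batches/{batch_id}/state")
        response.raise_for_status()
        return response.json()["state"]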
@@ -0,0 +1,45 @@
from datetime import datetime

from airflow import DAG

try:
    # Import statement for Airflow when it loads new operators into airflow.operators
    from airflow.operators import LivyBatchOperator
except ImportError:
    # Import statement for IDE with the local folder structure
    from airflow_home.plugins.livy_batch_plugin import LivyBatchOperator

dag = DAG(
    "05_batch_example_verify_in_spark",
    description="Running Spark job via Livy Batches "
    "+ verifying status in Spark REST API",
    schedule_interval=None,
    start_date=datetime(1970, 1, 1),
    catchup=False,
)

t1 = LivyBatchOperator(
    name="batch_example_verify_in_spark_{{ run_id }}",
    file="file:///data/batches/join_2_files.py",
    py_files=["file:///data/batches/join_2_files.py"],
    arguments=[
        "file:///data/grades.csv",
        "file:///data/ssn-address.tsv",
        "-file1_sep=,",
        "-file1_header=true",
        "-file1_schema=`Last name` STRING, `First name` STRING, SSN STRING, "
        "Test1 INT, Test2 INT, Test3 INT, Test4 INT, Final INT, Grade STRING",
        "-file1_join_column=SSN",
        "-file2_header=false",
        "-file2_schema=`Last name` STRING, `First name` STRING, SSN STRING, "
        "Address1 STRING, Address2 STRING",
        "-file2_join_column=SSN",
        "-output_header=true",
        "-output_columns=file1.`Last name`, file1.`First name`, file1.SSN, "
        "file2.Address1, file2.Address2",
        ## TODO MAKE IT FAIL
    ],
    verify_in="spark",
    task_id="livy_batch_example_verify_in_spark",
    dag=dag,
)
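For reference, verify_in="spark" asks the operator to cross-check the batch against Spark's monitoring REST API instead of trusting Livy's state alone. A minimal sketch of that kind of check, assuming a Spark History Server at its default port and a known application id (both assumptions, not part of this commit):

    import requests

    SPARK_API_URL = "http://localhost:18080/api/v1"  # assumed History Server, not from this commit

    def spark_jobs_all_succeeded(app_id: str) -> bool:
        """True only if every job in the Spark application reports SUCCEEDED."""
        response = requests.get(f"{SPARK_API_URL}/applications/{app_id}/jobs")
        response.raise_for_status()
        return all(job["status"] == "SUCCEEDED" for job in response.json())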
@@ -0,0 +1,45 @@
from datetime import datetime

from airflow import DAG

try:
    # Import statement for Airflow when it loads new operators into airflow.operators
    from airflow.operators import LivyBatchOperator
except ImportError:
    # Import statement for IDE with the local folder structure
    from airflow_home.plugins.livy_batch_plugin import LivyBatchOperator

dag = DAG(
    "06_batch_example_verify_in_yarn",
    description="Running Spark job via Livy Batches + "
    "verifying job status in YARN Resource Manager REST API",
    schedule_interval=None,
    start_date=datetime(1970, 1, 1),
    catchup=False,
)

t1 = LivyBatchOperator(
    name="batch_example_verify_in_yarn_{{ run_id }}",
    file="file:///data/batches/join_2_files.py",
    py_files=["file:///data/batches/join_2_files.py"],
    arguments=[
        "file:///data/grades.csv",
        "file:///data/ssn-address.tsv",
        "-file1_sep=,",
        "-file1_header=true",
        "-file1_schema=`Last name` STRING, `First name` STRING, SSN STRING, "
        "Test1 INT, Test2 INT, Test3 INT, Test4 INT, Final INT, Grade STRING",
        "-file1_join_column=SSN",
        "-file2_header=false",
        "-file2_schema=`Last name` STRING, `First name` STRING, SSN STRING, "
        "Address1 STRING, Address2 STRING",
        "-file2_join_column=SSN",
        "-output_header=true",
        "-output_columns=file1.`Last name`, file1.`First name`, file1.SSN, "
        "file2.Address1, file2.Address2",
        ## TODO MAKE IT FAIL
    ],
    task_id="livy_batch_example_verify_in_yarn",
    verify_in="yarn",
    dag=dag,
)
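Similarly, verify_in="yarn" cross-checks the application's final status in the YARN ResourceManager REST API. A minimal sketch, assuming the ResourceManager webapp at its default port (an assumption, not part of this commit):

    import requests

    YARN_RM_URL = "http://localhost:8088/ws/v1/cluster"  # assumed ResourceManager, not from this commit

    def yarn_final_status(app_id: str) -> str:
        """Return the application's finalStatus, e.g. 'SUCCEEDED' or 'FAILED'."""
        response = requests.get(f"{YARN_RM_URL}/apps/{app_id}")
        response.raise_for_status()
        return response.json()["app"]["finalStatus"]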