158 changes: 158 additions & 0 deletions DatabricksDemo.py
@@ -0,0 +1,158 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # Running SDLB on Databricks
# MAGIC This is a Databricks notebook to install and demonstrate SDLB on Databricks.
# MAGIC See the recent blog post at https://smartdatalake.ch/blog for a detailed explanation.
# MAGIC
# MAGIC Run the cell below to create the widgets above, which configure the most important parameters.

# COMMAND ----------

# Create widgets
dbutils.widgets.text("REPODIR", "", "Repository Directory")
dbutils.widgets.text("TMPDIR", "", "Temporary Directory")
dbutils.widgets.text("VOLDIR", "", "Volume Directory")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Install Maven
# MAGIC Maven is needed to compile the Scala source files and create a JAR archive containing all libraries used by this project (see also the dependencies in pom.xml).

# COMMAND ----------

import os
# write the widget parameters into a shell script
#params = dbutils.widgets.getAll() # error: No module named 'delta.exceptions.captured'; 'delta.exceptions' is not a package
params = {param: dbutils.widgets.get(param) for param in ["REPODIR", "TMPDIR", "VOLDIR"]}
with open("/tmp/getting-started-env.sh", "w") as f:
    for key, value in params.items():
        f.write(f"export {key}={value}\n")
os.chmod("/tmp/getting-started-env.sh", 0o775)  # make the script executable

# COMMAND ----------

# MAGIC %sh
# MAGIC # The Java version should be 17; otherwise set the cluster environment variable JNAME=zulu17-ca-amd64
# MAGIC java --version
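# MAGIC # If another JDK is reported, JNAME is typically set under the cluster's Advanced options > Spark > Environment variables,
# MAGIC # followed by a cluster restart (this assumes a Zulu 17 build is available on the cluster nodes).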

# COMMAND ----------

# MAGIC %sh
# MAGIC # install maven
# MAGIC apt update
# MAGIC apt install -y maven

# COMMAND ----------

# MAGIC %md
# MAGIC ## Prepare Getting Started Config and JAR File
# MAGIC Let's copy the config files, source code and pom.xml to a temporary directory, then start the Maven build to create the JAR archive.
# MAGIC Finally, copy the JAR archive to a Unity Catalog Volume so that it is accessible to the job and cluster.
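# MAGIC
# MAGIC The environment config (envConfig/dev.conf, copied from envConfig/databricks.conf.template below) defines the substitution variables such as `${env.catalog}`, `${env.database}`, `${env.basePathWithId}` and `${env.tablePathWithId}` referenced by the pipeline configs. A minimal, purely hypothetical sketch of its structure (the real values live in the template and are not reproduced here):
# MAGIC ```
# MAGIC env {
# MAGIC   catalog = "main"                                                # hypothetical Unity Catalog name
# MAGIC   database = "getting_started"                                    # hypothetical schema/database
# MAGIC   basePathWithId = "/Volumes/main/getting_started/files/~{id}"    # hypothetical base path for file data objects
# MAGIC   tablePathWithId = "/Volumes/main/getting_started/tables/~{id}"  # hypothetical path for Delta table data objects
# MAGIC }
# MAGIC ```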

# COMMAND ----------

# MAGIC %sh
# MAGIC . /tmp/getting-started-env.sh # set env variables prepared above
# MAGIC cat /tmp/getting-started-env.sh
# MAGIC if [ -z "${TMPDIR}" ]; then echo "variable TMPDIR not defined!"; exit -1; fi
# MAGIC
# MAGIC # copy latest config files to workspace folder
# MAGIC cd $REPODIR
# MAGIC cp ./config/airports.conf.part-3-solution ./config/airports.conf
# MAGIC cp ./config/departures.conf.part-3-solution ./config/departures.conf
# MAGIC cp ./config/btl.conf.part-3-solution ./config/btl.conf
# MAGIC cp ./envConfig/databricks.conf.template ./envConfig/dev.conf
# MAGIC
# MAGIC # prepare temporary build folder on cluster
# MAGIC mkdir -p $TMPDIR
# MAGIC cd $TMPDIR
# MAGIC
# MAGIC # copy scala code to build folder
# MAGIC mkdir -p $TMPDIR/src/main/scala/com/sample/
# MAGIC cp $REPODIR/src/main/scala/com/sample/*.scala ./src/main/scala/com/sample/
# MAGIC cp $REPODIR/src/main/scala/com/sample/CustomWebserviceDataObject.scala.part-3-solution ./src/main/scala/com/sample/CustomWebserviceDataObject.scala
# MAGIC
# MAGIC # copy maven pom to build folder
# MAGIC cp $REPODIR/pom.xml .
# MAGIC
# MAGIC # copy config to build folder
# MAGIC cp -r $REPODIR/config .
# MAGIC cp -r $REPODIR/envConfig .

# COMMAND ----------

# MAGIC %sh
# MAGIC . /tmp/getting-started-env.sh # set env variables prepared above
# MAGIC cd $TMPDIR
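# MAGIC # -Pgenerate-catalog presumably generates the typed DataObjectCatalog/ActionCatalog classes (io.smartdatalake.generated)
# MAGIC # used by the Lab interface further below; -Pfat-jar builds the jar-with-dependencies copied to the Volume (see pom.xml).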
# MAGIC mvn package -B -Pgenerate-catalog -Pfat-jar
# MAGIC cp target/getting-started-1.0.jar $VOLDIR/getting-started.jar
# MAGIC cp target/getting-started-1.0-jar-with-dependencies.jar $VOLDIR/getting-started-with-dependencies.jar

# COMMAND ----------

# MAGIC %md
# MAGIC ## Upload Configuration to UI

# COMMAND ----------

# MAGIC %scala
# MAGIC // Check the entries of the secret scope
# MAGIC import com.databricks.sdk.scala.dbutils.DBUtils
# MAGIC val dbutils = DBUtils.getDBUtils()
# MAGIC dbutils.secrets.list(scope = "my_sec").foreach(println)
# MAGIC // Upload config
# MAGIC import io.smartdatalake.meta.configexporter._
# MAGIC val repodir = dbutils.widgets.get("REPODIR")
# MAGIC ConfigJsonExporter.main(Array("--config", s"file://$repodir/config,file://$repodir/envConfig/dev.conf", "--target", "uiBackend"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Upload Schema and Statistics to UI

# COMMAND ----------

# MAGIC %scala
# MAGIC // ATTENTION: run this only after the first SDLB run!
# MAGIC // Upload schemas and statistics
# MAGIC // Statistics export is currently broken on Databricks; there seems to be a problem with DeltaSQLConf
# MAGIC val t = org.apache.spark.sql.delta.sources.DeltaSQLConf.LOAD_FILE_SYSTEM_CONFIGS_FROM_DATAFRAME_OPTIONS
# MAGIC //import io.smartdatalake.meta.configexporter._
# MAGIC //val repodir = dbutils.widgets.get("REPODIR")
# MAGIC //DataObjectSchemaExporter.main(Array("--config", s"file://$repodir/config,file://$repodir/envConfig/dev.conf", "--target", "uiBackend"))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Try SDLB Lab interface

# COMMAND ----------

# MAGIC %scala
# MAGIC // load SDLB Lab interface
# MAGIC import io.smartdatalake.generated._
# MAGIC import io.smartdatalake.lab.SmartDataLakeBuilderLab
# MAGIC val sdlb = SmartDataLakeBuilderLab[DataObjectCatalog, ActionCatalog](spark,Seq("file:///Workspace/Repos/sdlb/getting-started/config", "file:///Workspace/Repos/sdlb/getting-started/envConfig/dev.conf"), DataObjectCatalog(_, _), ActionCatalog(_, _))
# MAGIC implicit val context = sdlb.context

# COMMAND ----------

# MAGIC %scala
# MAGIC // access dataObjects via SDLB interface with Code Completion
# MAGIC // get DataFrame, schema, or drop table...
# MAGIC sdlb.dataObjects.btlDistances.printSchema
# MAGIC sdlb.dataObjects.btlDistances.get
# MAGIC .where($"could_be_done_by_rail"===true).show

# COMMAND ----------

# MAGIC %scala
# MAGIC // access actions via SDLB interface with Code Completion.
# MAGIC // play with & manipulate transformations, get resulting DataFrames.
# MAGIC sdlb.actions.computeDistances.buildDataFrames.withFilterEquals("estdepartureairport","EDDF").getOne.show

# COMMAND ----------


3 changes: 2 additions & 1 deletion README.md
@@ -3,6 +3,7 @@
## Introduction
This is the code used in the Step-By-Step Walk-Through Guide to get started with Smart Data Lake Builder (SDLB).

There are multiple ways to run the example code.
[Go to the getting started guide](https://smartdatalake.ch/docs/getting-started/setup) for detailed ways to run SmartDataLakeBuilder.

To see SDLB running on Databricks, check out our recent [Blog Post](https://smartdatalake.ch/blog).
5 changes: 3 additions & 2 deletions config/airports.conf.part-2-solution
@@ -17,13 +17,14 @@ dataObjects {

stg-airports {
type = CsvFileDataObject
path = ${env.basePath}"~{id}"
path = ${env.basePathWithId}
}

int-airports {
type = DeltaLakeTableDataObject
path = ${env.basePath}"~{id}"
path = ${env.tablePathWithId}
table = {
catalog = ${env.catalog}
db = ${env.database}
name = "int_airports"
primaryKey = [ident]
5 changes: 3 additions & 2 deletions config/airports.conf.part-3-solution
@@ -23,7 +23,7 @@ dataObjects {

stg-airports {
type = CsvFileDataObject
path = ${env.basePath}"~{id}"
path = ${env.basePathWithId}
schema = "id string, ident string, type string, name string, latitude_deg string, longitude_deg string, elevation_ft string, continent string, iso_country string, iso_region string, municipality string, scheduled_service string, gps_code string, iata_code string, local_code string, home_link string, wikipedia_link string, keywords string"
metadata {
name = "Staging file of Airport location data"
Expand All @@ -36,8 +36,9 @@ dataObjects {

int-airports {
type = DeltaLakeTableDataObject
path = ${env.basePath}"~{id}"
path = ${env.tablePathWithId}
table = {
catalog = ${env.catalog}
db = ${env.database}
name = "int_airports"
primaryKey = [ident]
7 changes: 4 additions & 3 deletions config/airports.conf.part-3b-solution
@@ -1,5 +1,5 @@

# This is the airports.conf file as it should look after completing part-3
# This is the airports.conf file as it should look after completing part-3b

dataObjects {

Expand All @@ -17,14 +17,15 @@ dataObjects {

stg-airports {
type = CsvFileDataObject
path = ${env.basePath}"~{id}"
path = ${env.basePathWithId}
schema = "id string, ident string, type string, name string, latitude_deg string, longitude_deg string, elevation_ft string, continent string, iso_country string, iso_region string, municipality string, scheduled_service string, gps_code string, iata_code string, local_code string, home_link string, wikipedia_link string, keywords string"
}

int-airports {
type = DeltaLakeTableDataObject
path = ${env.basePath}"~{id}"
path = ${env.tablePathWithId}
table = {
catalog = ${env.catalog}
db = ${env.database}
name = "int_airports"
primaryKey = [ident]
11 changes: 7 additions & 4 deletions config/btl.conf.part-2-solution
@@ -4,17 +4,19 @@ dataObjects {

btl-departures-arrivals-airports {
type = DeltaLakeTableDataObject
path = ${env.basePath}"~{id}"
path = ${env.tablePathWithId}
table {
catalog = ${env.catalog}
db = ${env.database}
name = "btl_departures_arrivals_airports"
}
}

btl-distances {
type = DeltaLakeTableDataObject
path = ${env.basePath}"~{id}"
path = ${env.tablePathWithId}
table {
catalog = ${env.catalog}
db = ${env.database}
name = "btl_distances"
}
Expand All @@ -28,12 +30,13 @@ actions {
type = CustomDataFrameAction
inputIds = [int-departures, int-airports]
outputIds = [btl-departures-arrivals-airports]
breakDataFrameLineage = true
transformers = [{
type = SQLDfsTransformer
code = {
btl-connected-airports = """
select int_departures.estdepartureairport, int_departures.estarrivalairport, airports.*
from int_departures join int_airports airports on int_departures.estArrivalAirport = airports.ident
from int_departures join int_airports airports on int_departures.estArrivalAirport = airports.ident and current_timestamp between airports.dl_ts_captured and airports.dl_ts_delimited
"""
}
},{
Expand All @@ -43,7 +46,7 @@ actions {
select btl_connected_airports.estdepartureairport, btl_connected_airports.estarrivalairport,
btl_connected_airports.name as arr_name, btl_connected_airports.latitude_deg as arr_latitude_deg, btl_connected_airports.longitude_deg as arr_longitude_deg,
airports.name as dep_name, airports.latitude_deg as dep_latitude_deg, airports.longitude_deg as dep_longitude_deg
from btl_connected_airports join int_airports airports on btl_connected_airports.estdepartureairport = airports.ident
from btl_connected_airports join int_airports airports on btl_connected_airports.estdepartureairport = airports.ident and current_timestamp between airports.dl_ts_captured and airports.dl_ts_delimited
"""
}
description = "Get the name and coordinates of the departures airport"
70 changes: 70 additions & 0 deletions config/btl.conf.part-2b-solution
@@ -0,0 +1,70 @@
# This is the btl.conf file as it should look after completing part-2b

dataObjects {

btl-departures-arrivals-airports {
type = DeltaLakeTableDataObject
path = "~{id}"
table {
db = "default"
name = "btl_departures_arrivals_airports"
}
}

btl-distances {
type = DeltaLakeTableDataObject
path = "~{id}"
table {
db = "default"
name = "btl_distances"
}
}

}

actions {

join-departures-airports {
type = CustomDataFrameAction
inputIds = [int-departures, int-airports]
outputIds = [btl-departures-arrivals-airports]
breakDataFrameLineage = true
transformers = [{
type = SQLDfsTransformer
code = {
btl-connected-airports = """
select int_departures.estdepartureairport, int_departures.estarrivalairport, airports.*
from int_departures join int_airports airports on int_departures.estArrivalAirport = airports.ident and current_timestamp between airports.dl_ts_captured and airports.dl_ts_delimited
"""
}
},{
type = SQLDfsTransformer
code = {
btl-departures-arrivals-airports = """
select btl_connected_airports.estdepartureairport, btl_connected_airports.estarrivalairport,
btl_connected_airports.name as arr_name, btl_connected_airports.latitude_deg as arr_latitude_deg, btl_connected_airports.longitude_deg as arr_longitude_deg,
airports.name as dep_name, airports.latitude_deg as dep_latitude_deg, airports.longitude_deg as dep_longitude_deg
from btl_connected_airports join int_airports airports on btl_connected_airports.estdepartureairport = airports.ident and current_timestamp between airports.dl_ts_captured and airports.dl_ts_delimited
"""
}
description = "Get the name and coordinates of the departures airport"
}]
metadata {
feed = compute
}
}

compute-distances {
type = CopyAction
inputId = btl-departures-arrivals-airports
outputId = btl-distances
transformers = [{
type = ScalaClassSparkDfTransformer
className = com.sample.ComputeDistanceTransformer
}]
metadata {
feed = compute
}
}

}
11 changes: 7 additions & 4 deletions config/btl.conf.part-3-solution
@@ -4,8 +4,9 @@ dataObjects {

btl-departures-arrivals-airports {
type = DeltaLakeTableDataObject
path = ${env.basePath}"~{id}"
path = ${env.tablePathWithId}
table {
catalog = ${env.catalog}
db = ${env.database}
name = "btl_departures_arrivals_airports"
}
Expand All @@ -18,8 +19,9 @@ dataObjects {

btl-distances {
type = DeltaLakeTableDataObject
path = ${env.basePath}"~{id}"
path = ${env.tablePathWithId}
table {
catalog = ${env.catalog}
db = ${env.database}
name = "btl_distances"
}
Expand All @@ -38,12 +40,13 @@ actions {
type = CustomDataFrameAction
inputIds = [int-departures, int-airports]
outputIds = [btl-departures-arrivals-airports]
breakDataFrameLineage = true
transformers = [{
type = SQLDfsTransformer
code = {
btl-connected-airports = """
select int_departures.estdepartureairport, int_departures.estarrivalairport, airports.*
from int_departures join int_airports airports on int_departures.estArrivalAirport = airports.ident
from int_departures join int_airports airports on int_departures.estArrivalAirport = airports.ident and current_timestamp between airports.dl_ts_captured and airports.dl_ts_delimited
"""
}
},{
Expand All @@ -53,7 +56,7 @@ actions {
select btl_connected_airports.estdepartureairport, btl_connected_airports.estarrivalairport,
btl_connected_airports.name as arr_name, btl_connected_airports.latitude_deg as arr_latitude_deg, btl_connected_airports.longitude_deg as arr_longitude_deg,
airports.name as dep_name, airports.latitude_deg as dep_latitude_deg, airports.longitude_deg as dep_longitude_deg
from btl_connected_airports join int_airports airports on btl_connected_airports.estdepartureairport = airports.ident
from btl_connected_airports join int_airports airports on btl_connected_airports.estdepartureairport = airports.ident and current_timestamp between airports.dl_ts_captured and airports.dl_ts_delimited
"""
}
description = "Get the name and coordinates of the departures airport"